1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "common/intrin.h"
5
#include "common/types.h"
6
7
#include <algorithm>
8
#include <cmath>
9
#include <cstdint>
10
11
#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
12
#define GSVECTOR_HAS_SRLV 1
13
14
#ifdef CPU_ARCH_ARM64
15
// tbl2 with 128-bit vectors is not available in A32.
16
#define GSVECTOR_HAS_TBL2 1
17
#endif
18
19
class GSVector2;
20
class GSVector2i;
21
class GSVector4;
22
class GSVector4i;
23
24
class alignas(16) GSVector2i
25
{
26
struct cxpr_init_tag
27
{
28
};
29
static constexpr cxpr_init_tag cxpr_init{};
30
31
constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}
32
33
constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
34
35
constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
36
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
37
{
38
}
39
40
public:
41
union
42
{
43
struct
44
{
45
s32 x, y;
46
};
47
struct
48
{
49
s32 r, g;
50
};
51
float F32[2];
52
s8 S8[8];
53
s16 S16[4];
54
s32 S32[2];
55
s64 S64[1];
56
u8 U8[8];
57
u16 U16[4];
58
u32 U32[2];
59
u64 U64[1];
60
int32x2_t v2s;
61
};
62
63
GSVector2i() = default;
64
65
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
66
67
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
68
69
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
70
71
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
72
{
73
return GSVector2i(cxpr_init, s0, s1, s2, s3);
74
}
75
76
ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
77
{
78
return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
79
}
80
81
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }
82
83
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
84
85
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
86
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
87
{
88
}
89
90
ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }
91
92
ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
93
94
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
95
96
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
97
98
ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }
99
100
ALWAYS_INLINE operator int32x2_t() const { return v2s; }
101
102
ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
103
{
104
return max_s8(min).min_s8(max);
105
}
106
ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
107
{
108
return max_s16(min).min_s16(max);
109
}
110
ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
111
{
112
return max_s32(min).min_s32(max);
113
}
114
115
ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
116
{
117
return max_u8(min).min_u8(max);
118
}
119
ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
120
{
121
return max_u16(min).min_u16(max);
122
}
123
ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
124
{
125
return max_u32(min).min_u32(max);
126
}
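// Illustrative usage sketch (not part of the original header): sat_s32 clamps each lane
// to [min, max] by chaining the element-wise max/min helpers defined below, e.g.
//   const GSVector2i p(300, -20);
//   const GSVector2i clamped = p.sat_s32(GSVector2i::cxpr(0), GSVector2i::cxpr(255)); // -> (255, 0)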
127
128
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const
129
{
130
return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
131
}
132
133
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const
134
{
135
return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
136
}
137
138
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const
139
{
140
return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
141
}
142
143
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const
144
{
145
return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
146
}
147
148
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); }
149
150
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); }
151
152
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const
153
{
154
return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
155
}
156
157
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const
158
{
159
return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
160
}
161
162
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const
163
{
164
return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
165
}
166
167
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const
168
{
169
return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
170
}
171
172
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const
173
{
174
return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
175
}
176
177
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
178
{
179
return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
180
}
181
182
ALWAYS_INLINE s32 addv_s32() const
183
{
184
#ifdef CPU_ARCH_ARM64
185
return vaddv_s32(v2s);
186
#else
187
return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1);
188
#endif
189
}
190
191
#ifdef CPU_ARCH_ARM64
192
193
ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
194
195
ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
196
197
ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }
198
199
ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }
200
201
ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }
202
203
ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }
204
205
ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }
206
207
ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }
208
209
#else
210
211
ALWAYS_INLINE u8 minv_u8() const
212
{
213
uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
214
return static_cast<u8>(
215
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
216
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
217
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
218
}
219
220
ALWAYS_INLINE u16 maxv_u8() const
221
{
222
uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
223
return static_cast<u8>(
224
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
225
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
226
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
227
}
228
229
ALWAYS_INLINE u16 minv_u16() const
230
{
231
uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
232
return static_cast<u16>(
233
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
234
}
235
236
ALWAYS_INLINE u16 maxv_u16() const
237
{
238
uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
239
return static_cast<u16>(
240
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
241
}
242
243
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
244
245
ALWAYS_INLINE u32 minv_u32() const
246
{
247
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
248
}
249
250
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
251
252
ALWAYS_INLINE u32 maxv_u32() const
253
{
254
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
255
}
256
257
#endif
258
259
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
260
261
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
262
{
263
uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
264
return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
265
}
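// Usage sketch (illustrative only): blend8 selects per byte on the sign bit of the
// corresponding mask byte -- bytes where the mask byte is negative come from 'a', the
// rest from *this, roughly matching SSE's pblendvb:
//   dst = src.blend8(other, mask); // mask bytes 0x80..0xFF pick 'other', 0x00..0x7F keep 'src'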
266
267
template<int mask>
268
ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
269
{
270
static constexpr const uint16_t _mask[4] = {
271
((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
272
((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
273
return GSVector2i(
274
vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
275
}
276
277
template<int mask>
278
ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
279
{
280
constexpr int bit1 = ((mask & 2) * 3) << 1;
281
constexpr int bit0 = (mask & 1) * 3;
282
return blend16<bit1 | bit0>(v);
283
}
284
285
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
286
{
287
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
288
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
289
}
290
291
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
292
{
293
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s))));
294
}
295
296
ALWAYS_INLINE GSVector2i ps16() const
297
{
298
return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
299
}
300
301
ALWAYS_INLINE GSVector2i pu16() const
302
{
303
return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
304
}
305
306
ALWAYS_INLINE GSVector2i ps32() const
307
{
308
return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
309
}
310
311
ALWAYS_INLINE GSVector2i pu32() const
312
{
313
return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
314
}
315
316
#ifdef CPU_ARCH_ARM64
317
318
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
319
{
320
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
321
}
322
323
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
324
{
325
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
326
}
327
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }
328
329
ALWAYS_INLINE GSVector2i upl8() const
330
{
331
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
332
}
333
334
ALWAYS_INLINE GSVector2i upl16() const
335
{
336
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
337
}
338
339
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
340
341
#else
342
343
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
344
{
345
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
346
}
347
348
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
349
{
350
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
351
}
352
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
353
354
ALWAYS_INLINE GSVector2i upl8() const
355
{
356
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
357
}
358
359
ALWAYS_INLINE GSVector2i upl16() const
360
{
361
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
362
}
363
364
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
365
366
#endif
367
368
ALWAYS_INLINE GSVector2i s8to16() const
369
{
370
return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
371
}
372
373
ALWAYS_INLINE GSVector2i u8to16() const
374
{
375
return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
376
}
377
378
ALWAYS_INLINE GSVector2i s8to32() const
379
{
380
return GSVector2i(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s))))));
381
}
382
383
ALWAYS_INLINE GSVector2i u8to32() const
384
{
385
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))))));
386
}
387
388
ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(vget_low_s32(vmovl_s16(vreinterpret_s16_s32(v2s)))); }
389
390
ALWAYS_INLINE GSVector2i u16to32() const
391
{
392
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s32(v2s)))));
393
}
394
395
template<int i>
396
ALWAYS_INLINE GSVector2i srl() const
397
{
398
return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
399
}
400
401
template<int i>
402
ALWAYS_INLINE GSVector2i sll() const
403
{
404
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
405
}
406
407
template<int i>
408
ALWAYS_INLINE GSVector2i sll16() const
409
{
410
return GSVector2i(vreinterpret_s32_u16(vshl_n_u16(vreinterpret_u16_s32(v2s), i)));
411
}
412
413
ALWAYS_INLINE GSVector2i sll16(s32 i) const
414
{
415
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(i))));
416
}
417
418
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
419
{
420
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
421
}
422
423
template<int i>
424
ALWAYS_INLINE GSVector2i srl16() const
425
{
426
return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
427
}
428
429
ALWAYS_INLINE GSVector2i srl16(s32 i) const
430
{
431
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
432
}
433
434
ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
435
{
436
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
437
}
438
439
template<int i>
440
ALWAYS_INLINE GSVector2i sra16() const
441
{
442
constexpr int count = (i & ~15) ? 15 : i;
443
return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
444
}
445
446
ALWAYS_INLINE GSVector2i sra16(s32 i) const
447
{
448
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
449
}
450
451
ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
452
{
453
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
454
}
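// Worked example (illustrative): the non-immediate shifts are built on vshl_* with a
// negated count, since NEON only provides a "shift by signed amount" form. For a lane
// holding 0xF000: srl16(4) treats it as unsigned -> 0x0F00, while sra16(4)/srav16
// replicate the sign bit -> 0xFF00.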
455
456
template<int i>
457
ALWAYS_INLINE GSVector2i sll32() const
458
{
459
return GSVector2i(vshl_n_s32(v2s, i));
460
}
461
462
ALWAYS_INLINE GSVector2i sll32(s32 i) const
463
{
464
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(i))));
465
}
466
467
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const
468
{
469
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), v.v2s)));
470
}
471
472
template<int i>
473
ALWAYS_INLINE GSVector2i srl32() const
474
{
475
return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
476
}
477
478
ALWAYS_INLINE GSVector2i srl32(s32 i) const
479
{
480
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
481
}
482
483
ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
484
{
485
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
486
}
487
488
template<int i>
489
ALWAYS_INLINE GSVector2i sra32() const
490
{
491
return GSVector2i(vshr_n_s32(v2s, i));
492
}
493
494
ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }
495
496
ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s))); }
497
498
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
499
{
500
return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
501
}
502
503
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
504
{
505
return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
506
}
507
508
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }
509
510
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
511
{
512
return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
513
}
514
515
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
516
{
517
return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
518
}
519
520
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
521
{
522
return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
523
}
524
525
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
526
{
527
return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
528
}
529
530
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
531
{
532
return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
533
}
534
535
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
536
{
537
return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
538
}
539
540
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }
541
542
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
543
{
544
return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
545
}
546
547
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
548
{
549
return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
550
}
551
552
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
553
{
554
return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
555
}
556
557
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
558
{
559
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
560
}
561
562
ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
563
{
564
return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
565
}
566
567
ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
568
{
569
return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
570
}
571
572
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
573
{
574
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
575
}
576
577
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }
578
579
ALWAYS_INLINE bool eq(const GSVector2i& v) const
580
{
581
return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
582
}
583
584
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
585
{
586
return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
587
}
588
589
ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
590
{
591
return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
592
}
593
594
ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
595
{
596
return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
597
}
598
599
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
600
601
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
602
603
ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
604
605
ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
606
{
607
return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
608
}
609
610
ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
611
{
612
return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
613
}
614
615
ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const
616
{
617
return GSVector2i(vreinterpret_s32_u32(vcgt_s32(v2s, v.v2s)));
618
}
619
620
ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
621
{
622
return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
623
}
624
ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
625
{
626
return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
627
}
628
ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const
629
{
630
return GSVector2i(vreinterpret_s32_u32(vcge_s32(v2s, v.v2s)));
631
}
632
633
ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
634
{
635
return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
636
}
637
638
ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
639
{
640
return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
641
}
642
643
ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const
644
{
645
return GSVector2i(vreinterpret_s32_u32(vclt_s32(v2s, v.v2s)));
646
}
647
648
ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
649
{
650
return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
651
}
652
ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
653
{
654
return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
655
}
656
ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const
657
{
658
return GSVector2i(vreinterpret_s32_u32(vcle_s32(v2s, v.v2s)));
659
}
660
661
ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }
662
663
ALWAYS_INLINE int mask() const
664
{
665
// borrowed from sse2neon
666
const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
667
const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
668
const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
669
const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
670
return static_cast<int>(vget_lane_u8(paired64, 0));
671
}
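// How the packing above works (explanatory note): each byte's sign bit is first shifted
// down to bit 0, then the shift-and-accumulate (vsra) steps fold neighbouring lanes so
// the sign bits of bytes 0..7 land in bits 0..7 of the lowest byte -- the 8-byte
// equivalent of SSE2's _mm_movemask_epi8.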
672
673
ALWAYS_INLINE bool alltrue() const
674
{
675
return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
676
}
677
678
ALWAYS_INLINE bool anytrue() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) != UINT64_C(0)); }
679
680
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); }
681
682
template<int i>
683
ALWAYS_INLINE GSVector2i insert8(int a) const
684
{
685
return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(a, vreinterpret_u8_s32(v2s), static_cast<uint8_t>(i))));
686
}
687
688
template<int i>
689
ALWAYS_INLINE int extract8() const
690
{
691
return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
692
}
693
694
template<int i>
695
ALWAYS_INLINE GSVector2i insert16(int a) const
696
{
697
return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(a, vreinterpret_u16_s32(v2s), static_cast<uint16_t>(i))));
698
}
699
700
template<int i>
701
ALWAYS_INLINE int extract16() const
702
{
703
return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
704
}
705
706
template<int i>
707
ALWAYS_INLINE GSVector2i insert32(int a) const
708
{
709
return GSVector2i(vset_lane_s32(a, v2s, i));
710
}
711
712
template<int i>
713
ALWAYS_INLINE int extract32() const
714
{
715
return vget_lane_s32(v2s, i);
716
}
717
718
ALWAYS_INLINE static GSVector2i load32(const void* p)
719
{
720
// should be ldr s0, [x0]
721
s32 val;
722
std::memcpy(&val, p, sizeof(s32));
723
return GSVector2i(vset_lane_s32(val, vdup_n_s32(0), 0));
724
}
725
726
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
727
728
template<bool aligned>
729
ALWAYS_INLINE static GSVector2i load(const void* p)
730
{
731
#ifdef CPU_ARCH_ARM32
732
if constexpr (!aligned)
733
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
734
#endif
735
736
return GSVector2i(vld1_s32((const int32_t*)p));
737
}
738
739
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
740
{
741
s32 val = vget_lane_s32(v.v2s, 0);
742
std::memcpy(p, &val, sizeof(s32));
743
}
744
745
template<bool aligned>
746
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
747
{
748
#ifdef CPU_ARCH_ARM32
749
if constexpr (!aligned)
750
{
751
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
752
return;
753
}
754
#endif
755
756
vst1_s32((int32_t*)p, v.v2s);
757
}
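// Usage sketch (illustrative): load/store round-trip through a plain buffer. The
// 'aligned' flag only matters for the A32 path above, where loading/storing as bytes
// sidesteps the 32-bit element alignment expected by vld1_s32/vst1_s32.
//   alignas(8) s32 buf[2] = {1, 2};
//   GSVector2i v = GSVector2i::load<true>(buf);
//   GSVector2i::store<true>(buf, v.add32(v)); // buf is now {2, 4}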
758
759
ALWAYS_INLINE void operator&=(const GSVector2i& v)
760
{
761
v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
762
}
763
764
ALWAYS_INLINE void operator|=(const GSVector2i& v)
765
{
766
v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
767
}
768
769
ALWAYS_INLINE void operator^=(const GSVector2i& v)
770
{
771
v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
772
}
773
774
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
775
{
776
return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
777
}
778
779
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
780
{
781
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
782
}
783
784
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
785
{
786
return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
787
}
788
789
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); }
790
791
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); }
792
793
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); }
794
795
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); }
796
797
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); }
798
799
ALWAYS_INLINE GSVector2i xy() const { return *this; }
800
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); }
801
ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); }
802
ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); }
803
};
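// Usage sketch for GSVector2i (illustrative, not part of the original header):
//   GSVector2i a(10, 20);
//   GSVector2i b(3, 40);
//   GSVector2i lo = a.min_s32(b);  // (3, 20)
//   GSVector2i sum = a.add32(b);   // (13, 60)
//   bool same = a.eq(b);           // false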
804
805
class alignas(16) GSVector2
806
{
807
struct cxpr_init_tag
808
{
809
};
810
static constexpr cxpr_init_tag cxpr_init{};
811
812
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
813
814
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
815
816
public:
817
union
818
{
819
struct
820
{
821
float x, y;
822
};
823
struct
824
{
825
float r, g;
826
};
827
float F32[2];
828
double F64[1];
829
s8 I8[8];
830
s16 I16[4];
831
s32 I32[2];
832
s64 I64[1];
833
u8 U8[8];
834
u16 U16[4];
835
u32 U32[2];
836
u64 U64[1];
837
float32x2_t v2s;
838
};
839
840
GSVector2() = default;
841
842
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
843
844
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
845
846
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
847
848
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
849
850
ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {}
851
852
ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {}
853
854
ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {}
855
856
ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); }
857
858
ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); }
859
860
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
861
862
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
863
864
ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); }
865
866
ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; }
867
868
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
869
870
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
871
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
872
873
#ifdef CPU_ARCH_ARM64
874
875
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
876
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
877
878
#else
879
880
ALWAYS_INLINE GSVector2 floor() const
881
{
882
return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
883
}
884
885
ALWAYS_INLINE GSVector2 ceil() const
886
{
887
return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
888
}
889
890
#endif
891
892
ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
893
894
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
895
896
ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
897
898
ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }
899
900
ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }
901
902
template<int mask>
903
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
904
{
905
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
906
}
907
908
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
909
{
910
// duplicate sign bit across and bit select
911
const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
912
return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
913
}
914
915
ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
916
{
917
return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
918
}
919
920
ALWAYS_INLINE int mask() const
921
{
922
const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
923
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
924
}
925
926
ALWAYS_INLINE bool alltrue() const
927
{
928
return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
929
}
930
931
ALWAYS_INLINE bool anytrue() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) != UINT64_C(0)); }
932
933
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); }
934
935
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
936
937
template<int src, int dst>
938
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
939
{
940
#ifdef CPU_ARCH_ARM64
941
return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
942
#else
943
return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
944
#endif
945
}
946
947
template<int i>
948
ALWAYS_INLINE int extract32() const
949
{
950
return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
951
}
952
953
ALWAYS_INLINE float dot(const GSVector2& v) const
954
{
955
#ifdef CPU_ARCH_ARM64
956
return vaddv_f32(vmul_f32(v2s, v.v2s));
957
#else
958
const float32x2_t dp = vmul_f32(v2s, v.v2s);
959
return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
960
#endif
961
}
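// Worked example (illustrative): dot((1,2),(3,4)) multiplies lane-wise to (3,8); the
// A64 path sums horizontally with vaddv_f32, while the A32 fallback adds lane 0 to a
// broadcast copy of lane 1 (vdup_lane_f32), giving 11.0f either way.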
962
963
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
964
965
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
966
967
template<bool aligned>
968
ALWAYS_INLINE static GSVector2 load(const void* p)
969
{
970
#ifdef CPU_ARCH_ARM32
971
if constexpr (!aligned)
972
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
973
#endif
974
975
return GSVector2(vld1_f32(static_cast<const float*>(p)));
976
}
977
978
template<bool aligned>
979
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
980
{
981
#ifdef CPU_ARCH_ARM32
982
if constexpr (!aligned)
983
{
984
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
985
return;
986
}
987
#endif
988
989
vst1_f32(static_cast<float*>(p), v.v2s);
990
}
991
992
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
993
994
ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
995
ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
996
ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
997
ALWAYS_INLINE void operator/=(const GSVector2& v)
998
{
999
#ifdef CPU_ARCH_ARM64
1000
v2s = vdiv_f32(v2s, v.v2s);
1001
#else
1002
*this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
1003
#endif
1004
}
1005
1006
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
1007
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
1008
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
1009
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
1010
1011
ALWAYS_INLINE void operator&=(const GSVector2& v)
1012
{
1013
v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
1014
}
1015
1016
ALWAYS_INLINE void operator|=(const GSVector2& v)
1017
{
1018
v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
1019
}
1020
1021
ALWAYS_INLINE void operator^=(const GSVector2& v)
1022
{
1023
v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
1024
}
1025
1026
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
1027
{
1028
return GSVector2(vadd_f32(v1.v2s, v2.v2s));
1029
}
1030
1031
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
1032
{
1033
return GSVector2(vsub_f32(v1.v2s, v2.v2s));
1034
}
1035
1036
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
1037
{
1038
return GSVector2(vmul_f32(v1.v2s, v2.v2s));
1039
}
1040
1041
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
1042
{
1043
#ifdef CPU_ARCH_ARM64
1044
return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
1045
#else
1046
return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
1047
vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
1048
#endif
1049
}
1050
1051
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
1052
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
1053
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
1054
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
1055
1056
ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
1057
{
1058
return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1059
}
1060
1061
ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
1062
{
1063
return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1064
}
1065
1066
ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
1067
{
1068
return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1069
}
1070
1071
ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
1072
{
1073
return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s)));
1074
}
1075
1076
ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
1077
{
1078
// NEON has no != comparison, so invert the == mask.
1079
return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s))));
1080
}
1081
1082
ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
1083
{
1084
return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s)));
1085
}
1086
1087
ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
1088
{
1089
return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s)));
1090
}
1091
1092
ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
1093
{
1094
return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s)));
1095
}
1096
1097
ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
1098
{
1099
return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s)));
1100
}
1101
1102
ALWAYS_INLINE GSVector2 xy() const { return *this; }
1103
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); }
1104
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); }
1105
ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); }
1106
};
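// Usage sketch for GSVector2 (illustrative, not part of the original header):
//   GSVector2 uv(0.25f, 1.5f);
//   GSVector2 clamped = uv.sat(GSVector2::zero(), GSVector2(1.0f)); // (0.25f, 1.0f)
//   float len2 = uv.dot(uv);                                        // squared length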
1107
1108
class alignas(16) GSVector4i
1109
{
1110
struct cxpr_init_tag
1111
{
1112
};
1113
static constexpr cxpr_init_tag cxpr_init{};
1114
1115
constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
1116
1117
constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1118
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1119
{
1120
}
1121
1122
constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
1123
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1124
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1125
{
1126
}
1127
1128
public:
1129
union
1130
{
1131
struct
1132
{
1133
int x, y, z, w;
1134
};
1135
struct
1136
{
1137
int r, g, b, a;
1138
};
1139
struct
1140
{
1141
int left, top, right, bottom;
1142
};
1143
float F32[4];
1144
s8 S8[16];
1145
s16 S16[8];
1146
s32 S32[4];
1147
s64 S64[2];
1148
u8 U8[16];
1149
u16 U16[8];
1150
u32 U32[4];
1151
u64 U64[2];
1152
int32x4_t v4s;
1153
};
1154
1155
GSVector4i() = default;
1156
1157
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
1158
{
1159
return GSVector4i(cxpr_init, x, y, z, w);
1160
}
1161
1162
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
1163
1164
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
1165
1166
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1167
{
1168
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
1169
}
1170
1171
ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
1172
s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1173
{
1174
return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
1175
}
1176
1177
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
1178
: v4s(vsetq_lane_s32(w, vsetq_lane_s32(z, vsetq_lane_s32(y, vdupq_n_s32(x), 1), 2), 3))
1179
{
1180
}
1181
1182
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1183
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1184
{
1185
}
1186
1187
constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12,
1188
s8 b13, s8 b14, s8 b15)
1189
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1190
{
1191
}
1192
1193
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : v4s(vcombine_s32(v.v2s, vcreate_s32(0))) {}
1194
1195
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
1196
1197
ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
1198
1199
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v) : v4s(vcombine_s32(vcvt_s32_f32(v.v2s), vcreate_s32(0))) {}
1200
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
1201
1202
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
1203
1204
ALWAYS_INLINE void operator=(s32 i) { v4s = vdupq_n_s32(i); }
1205
1206
ALWAYS_INLINE operator int32x4_t() const { return v4s; }
1207
1208
// rect
1209
1210
ALWAYS_INLINE s32 width() const { return right - left; }
1211
ALWAYS_INLINE s32 height() const { return bottom - top; }
1212
1213
ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
1214
1215
ALWAYS_INLINE bool rempty() const
1216
{
1217
// !all((x, y) < (z, w)) i.e. !not_empty
1218
return (vget_lane_u64(vreinterpret_u64_u32(vclt_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) !=
1219
0xFFFFFFFFFFFFFFFFULL);
1220
}
1221
1222
ALWAYS_INLINE bool rvalid() const
1223
{
1224
// !any((x, y) >= (z, w))
1225
return (vget_lane_u64(vreinterpret_u64_u32(vcge_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == 0);
1226
}
1227
1228
ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_s32(a).upl64(max_s32(a).srl<8>()); }
1229
1230
ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_s32(a); }
1231
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
1232
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
1233
1234
ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }
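// Worked example (illustrative): when used as a rectangle the lanes are
// (left, top, right, bottom), so for r1 = (0, 0, 100, 100) and r2 = (50, 50, 200, 200)
// r1.rintersect(r2) clamps each corner into the other rect and yields (50, 50, 100, 100),
// while r1.rintersect(GSVector4i::cxpr(150, 150, 200, 200)) produces an empty rect
// (rempty() == true) because right/bottom end up <= left/top.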
1235
1236
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
1237
{
1238
return max_s8(min).min_s8(max);
1239
}
1240
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
1241
{
1242
return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
1243
}
1244
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
1245
{
1246
return max_s16(min).min_s16(max);
1247
}
1248
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
1249
{
1250
return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
1251
}
1252
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
1253
{
1254
return max_s32(min).min_s32(max);
1255
}
1256
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
1257
{
1258
return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
1259
}
1260
1261
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
1262
{
1263
return max_u8(min).min_u8(max);
1264
}
1265
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
1266
{
1267
return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
1268
}
1269
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
1270
{
1271
return max_u16(min).min_u16(max);
1272
}
1273
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
1274
{
1275
return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
1276
}
1277
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
1278
{
1279
return max_u32(min).min_u32(max);
1280
}
1281
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
1282
{
1283
return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
1284
}
1285
1286
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const
1287
{
1288
return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1289
}
1290
1291
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const
1292
{
1293
return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1294
}
1295
1296
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const
1297
{
1298
return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1299
}
1300
1301
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const
1302
{
1303
return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1304
}
1305
1306
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }
1307
1308
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }
1309
1310
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
1311
{
1312
return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1313
}
1314
1315
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
1316
{
1317
return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1318
}
1319
1320
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
1321
{
1322
return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1323
}
1324
1325
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
1326
{
1327
return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1328
}
1329
1330
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
1331
{
1332
return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1333
}
1334
1335
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
1336
{
1337
return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1338
}
1339
1340
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
1341
{
1342
#ifdef CPU_ARCH_ARM64
1343
const int32x4_t low =
1344
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1345
const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
1346
return GSVector4i(vpaddq_s32(low, high));
1347
#else
1348
// borrowed from sse2neon
1349
const int32x4_t low =
1350
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1351
const int32x4_t high =
1352
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1353
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
1354
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
1355
#endif
1356
}
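// Worked example (illustrative): madd_s16 mirrors SSE2's _mm_madd_epi16 -- adjacent
// signed 16-bit products are summed into 32-bit lanes. With this = (1, 2, 3, 4, ...)
// and v = (10, 20, 30, 40, ...), the first two 32-bit results are 1*10 + 2*20 = 50 and
// 3*30 + 4*40 = 250.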
1357
1358
ALWAYS_INLINE GSVector4i addp_s32() const
1359
{
1360
#ifdef CPU_ARCH_ARM64
1361
return GSVector4i(vpaddq_s32(v4s, v4s));
1362
#else
1363
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1364
return GSVector4i(vcombine_s32(res, res));
1365
#endif
1366
}
1367
1368
ALWAYS_INLINE s32 addv_s32() const
1369
{
1370
#ifdef CPU_ARCH_ARM64
1371
return vaddvq_s32(v4s);
1372
#else
1373
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1374
return vget_lane_s32(res, 0) + vget_lane_s32(res, 1);
1375
#endif
1376
}
1377
1378
#ifdef CPU_ARCH_ARM64
1379
1380
ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
1381
1382
ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); }
1383
1384
ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); }
1385
1386
ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); }
1387
1388
ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }
1389
1390
ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }
1391
1392
ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }
1393
1394
ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }
1395
1396
#else
1397
1398
ALWAYS_INLINE u8 minv_u8() const
1399
{
1400
uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1401
vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
1402
return static_cast<u8>(
1403
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
1404
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
1405
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
1406
}
1407
1408
ALWAYS_INLINE u16 maxv_u8() const
1409
{
1410
uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1411
vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
1412
return static_cast<u8>(
1413
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
1414
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
1415
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
1416
}
1417
1418
ALWAYS_INLINE u16 minv_u16() const
1419
{
1420
uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1421
vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
1422
return static_cast<u16>(
1423
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
1424
}
1425
1426
ALWAYS_INLINE u16 maxv_u16() const
1427
{
1428
uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1429
vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
1430
return static_cast<u16>(
1431
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
1432
}
1433
1434
ALWAYS_INLINE s32 minv_s32() const
1435
{
1436
int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1437
return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
1438
}
1439
1440
ALWAYS_INLINE u32 minv_u32() const
1441
{
1442
uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1443
return std::min<u32>(vget_lane_u32(vmin, 0), vget_lane_u32(vmin, 1));
1444
}
1445
1446
ALWAYS_INLINE s32 maxv_s32() const
1447
{
1448
int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1449
return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
1450
}
1451
1452
ALWAYS_INLINE u32 maxv_u32() const
1453
{
1454
uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1455
return std::max<u32>(vget_lane_u32(vmax, 0), vget_lane_u32(vmax, 1));
1456
}
1457
1458
#endif
1459
1460
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
1461
1462
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
1463
{
1464
uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
1465
return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
1466
}
1467
1468
template<int mask>
1469
ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
1470
{
1471
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(
1472
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s), ((mask & 0x01) == 0) ? 0 : 8,
1473
((mask & 0x02) == 0) ? 1 : 9, ((mask & 0x04) == 0) ? 2 : 10, ((mask & 0x08) == 0) ? 3 : 11,
1474
((mask & 0x10) == 0) ? 4 : 12, ((mask & 0x20) == 0) ? 5 : 13, ((mask & 0x40) == 0) ? 6 : 14,
1475
((mask & 0x80) == 0) ? 7 : 15)));
1476
}
1477
1478
template<int mask>
1479
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1480
{
1481
return GSVector4i(__builtin_shufflevector(v4s, v.v4s, ((mask & 1) == 0) ? 0 : 4, ((mask & 2) == 0) ? 1 : 5,
1482
((mask & 4) == 0) ? 2 : 6, ((mask & 8) == 0) ? 3 : 7));
1483
}
1484
1485
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1486
{
1487
return GSVector4i(
1488
vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
1489
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
1490
}
1491
1492
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
1493
{
1494
#ifdef CPU_ARCH_ARM64
1495
return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
1496
#else
1497
int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
1498
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
1499
vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
1500
#endif
1501
}
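// Note (explanatory, not from the original source): vqtbl1q_s8/vtbl2_s8 return 0 for
// out-of-range indices, whereas SSSE3's pshufb zeroes a byte only when the mask's high
// bit is set and otherwise uses the low four index bits; callers are assumed to pass
// masks that behave identically under both conventions.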
1502
1503
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
1504
{
1505
return GSVector4i(vreinterpretq_s32_s8(
1506
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
1507
}
1508
1509
ALWAYS_INLINE GSVector4i ps16() const
1510
{
1511
return GSVector4i(vreinterpretq_s32_s8(
1512
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
1513
}
1514
1515
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
1516
{
1517
return GSVector4i(vreinterpretq_s32_u8(
1518
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
1519
}
1520
1521
ALWAYS_INLINE GSVector4i pu16() const
1522
{
1523
return GSVector4i(vreinterpretq_s32_u8(
1524
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
1525
}
1526
1527
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
1528
{
1529
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
1530
}
1531
1532
ALWAYS_INLINE GSVector4i ps32() const
1533
{
1534
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
1535
}
1536
1537
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1538
{
1539
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
1540
}
1541
1542
ALWAYS_INLINE GSVector4i pu32() const
1543
{
1544
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
1545
}
1546
1547
#ifdef CPU_ARCH_ARM64
1548
1549
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1550
{
1551
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1552
}
1553
1554
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1555
{
1556
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1557
}
1558
1559
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1560
{
1561
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1562
}
1563
1564
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1565
{
1566
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1567
}
1568
1569
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
1570
1571
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
1572
1573
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1574
{
1575
return GSVector4i(vreinterpretq_s32_s64(
1576
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1577
}
1578
1579
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1580
{
1581
return GSVector4i(vreinterpretq_s32_s64(
1582
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1583
}
1584
1585
ALWAYS_INLINE GSVector4i upl8() const
1586
{
1587
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1588
}
1589
1590
ALWAYS_INLINE GSVector4i uph8() const
1591
{
1592
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1593
}
1594
1595
ALWAYS_INLINE GSVector4i upl16() const
1596
{
1597
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1598
}
1599
1600
ALWAYS_INLINE GSVector4i uph16() const
1601
{
1602
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1603
}
1604
1605
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
1606
1607
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
1608
1609
ALWAYS_INLINE GSVector4i upl64() const
1610
{
1611
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1612
}
1613
1614
ALWAYS_INLINE GSVector4i uph64() const
1615
{
1616
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1617
}
1618
1619
#else
1620
1621
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1622
{
1623
const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
1624
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1625
}
1626
1627
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1628
{
1629
const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
1630
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1631
}
1632
1633
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1634
{
1635
const int16x4x2_t res =
1636
vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1637
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1638
}
1639
1640
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1641
{
1642
const int16x4x2_t res =
1643
vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1644
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1645
}
1646
1647
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
1648
{
1649
const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
1650
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1651
}
1652
1653
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
1654
{
1655
const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
1656
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1657
}
1658
1659
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1660
{
1661
return GSVector4i(vreinterpretq_s32_s64(
1662
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1663
}
1664
1665
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1666
{
1667
return GSVector4i(vreinterpretq_s32_s64(
1668
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1669
}
1670
1671
ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
1672
1673
ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
1674
1675
ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
1676
1677
ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
1678
1679
ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
1680
1681
ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
1682
1683
ALWAYS_INLINE GSVector4i upl64() const
1684
{
1685
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1686
}
1687
1688
ALWAYS_INLINE GSVector4i uph64() const
1689
{
1690
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1691
}
1692
#endif
1693
1694
ALWAYS_INLINE GSVector4i s8to16() const
1695
{
1696
return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
1697
}
1698
1699
ALWAYS_INLINE GSVector4i u8to16() const
1700
{
1701
return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
1702
}
1703
1704
ALWAYS_INLINE GSVector4i s8to32() const
1705
{
1706
return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
1707
}
1708
1709
ALWAYS_INLINE GSVector4i u8to32() const
1710
{
1711
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
1712
}
1713
1714
ALWAYS_INLINE GSVector4i s8to64() const
1715
{
1716
return GSVector4i(vreinterpretq_s32_s64(
1717
vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
1718
}
1719
1720
ALWAYS_INLINE GSVector4i u8to64() const
1721
{
1722
return GSVector4i(vreinterpretq_s32_u64(
1723
vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
1724
}
1725
1726
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
1727
1728
ALWAYS_INLINE GSVector4i u16to32() const
1729
{
1730
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
1731
}
1732
1733
ALWAYS_INLINE GSVector4i s16to64() const
1734
{
1735
return GSVector4i(
1736
vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
1737
}
1738
1739
ALWAYS_INLINE GSVector4i u16to64() const
1740
{
1741
return GSVector4i(
1742
vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
1743
}
1744
1745
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
1746
1747
ALWAYS_INLINE GSVector4i u32to64() const
1748
{
1749
return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
1750
}
1751
1752
template<int i>
1753
ALWAYS_INLINE GSVector4i srl() const
1754
{
1755
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
1756
}
1757
1758
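// treats [v : this] as one 256-bit value (v in the upper half), shifts it right by i bytes and returns the low
// 128 bits (cf. _mm_alignr_epi8).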
template<int i>
1759
ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
1760
{
1761
if constexpr (i >= 16)
1762
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
1763
else
1764
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
1765
}
1766
1767
template<int i>
1768
ALWAYS_INLINE GSVector4i sll() const
1769
{
1770
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
1771
}
1772
1773
template<int i>
1774
ALWAYS_INLINE GSVector4i sll16() const
1775
{
1776
return GSVector4i(vreinterpretq_s32_u16(vshlq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1777
}
1778
1779
ALWAYS_INLINE GSVector4i sll16(s32 i) const
1780
{
1781
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(i))));
1782
}
1783
1784
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
1785
{
1786
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1787
}
1788
1789
template<int i>
1790
ALWAYS_INLINE GSVector4i srl16() const
1791
{
1792
return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1793
}
1794
1795
ALWAYS_INLINE GSVector4i srl16(s32 i) const
1796
{
1797
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
1798
}
1799
1800
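// NEON has no dedicated per-lane right-shift; vshl with a negated (negative) count shifts right instead.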
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
1801
{
1802
return GSVector4i(
1803
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1804
}
1805
1806
template<int i>
1807
ALWAYS_INLINE GSVector4i sra16() const
1808
{
1809
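// arithmetic shifts by 16 or more only replicate the sign bit, so clamp the count to 15 to keep the NEON immediate in range.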
constexpr int count = (i & ~15) ? 15 : i;
1810
return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
1811
}
1812
1813
ALWAYS_INLINE GSVector4i sra16(s32 i) const
1814
{
1815
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
1816
}
1817
1818
ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
1819
{
1820
return GSVector4i(
1821
vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1822
}
1823
1824
template<int i>
1825
ALWAYS_INLINE GSVector4i sll32() const
1826
{
1827
return GSVector4i(vshlq_n_s32(v4s, i));
1828
}
1829
1830
ALWAYS_INLINE GSVector4i sll32(s32 i) const
1831
{
1832
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(i))));
1833
}
1834
1835
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const
1836
{
1837
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), v.v4s)));
1838
}
1839
1840
template<int i>
1841
ALWAYS_INLINE GSVector4i srl32() const
1842
{
1843
return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
1844
}
1845
1846
ALWAYS_INLINE GSVector4i srl32(s32 i) const
1847
{
1848
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
1849
}
1850
1851
ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
1852
{
1853
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
1854
}
1855
1856
template<int i>
1857
ALWAYS_INLINE GSVector4i sra32() const
1858
{
1859
return GSVector4i(vshrq_n_s32(v4s, i));
1860
}
1861
1862
ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
1863
1864
ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s))); }
1865
1866
template<int i>
1867
ALWAYS_INLINE GSVector4i sll64() const
1868
{
1869
return GSVector4i(vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1870
}
1871
1872
ALWAYS_INLINE GSVector4i sll64(s32 i) const
1873
{
1874
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(i))));
1875
}
1876
1877
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
1878
{
1879
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1880
}
1881
1882
template<int i>
1883
ALWAYS_INLINE GSVector4i srl64() const
1884
{
1885
return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1886
}
1887
1888
ALWAYS_INLINE GSVector4i srl64(s32 i) const
1889
{
1890
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
1891
}
1892
1893
#ifdef CPU_ARCH_ARM64
1894
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
1895
{
1896
return GSVector4i(
1897
vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
1898
}
1899
#endif
1900
1901
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
1902
{
1903
return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1904
}
1905
1906
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
1907
{
1908
return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1909
}
1910
1911
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
1912
1913
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
1914
{
1915
return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1916
}
1917
1918
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
1919
{
1920
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1921
}
1922
1923
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
1924
{
1925
// can't use vpaddq_s16() here, because we need saturation.
1926
// return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1927
#ifdef CPU_ARCH_ARM64
1928
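// uzp1/uzp2 gather the even- and odd-indexed 16-bit lanes of (a, b); their saturating sum is the horizontal pairwise add.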
const int16x8_t a = vreinterpretq_s16_s32(v4s);
1929
const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
1930
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
1931
#else
1932
// sse2neon again
1933
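// vmovn keeps the even-indexed 16-bit lanes, vshrn #16 the odd-indexed ones; the saturating add of the two gives the adjacent-pair sums.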
int16x8_t ab0246 = vcombine_s16(vmovn_s32(v4s), vmovn_s32(v.v4s));
1934
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(v4s, 16), vshrn_n_s32(v.v4s, 16));
1935
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
1936
#endif
1937
}
1938
1939
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
1940
{
1941
return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1942
}
1943
1944
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
1945
{
1946
return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1947
}
1948
1949
ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
1950
{
1951
return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1952
}
1953
1954
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
1955
{
1956
return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1957
}
1958
1959
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
1960
1961
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
1962
{
1963
return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1964
}
1965
1966
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
1967
{
1968
return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1969
}
1970
1971
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
1972
{
1973
return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1974
}
1975
1976
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
1977
{
1978
return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1979
}
1980
1981
ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
1982
{
1983
return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1984
}
1985
1986
ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
1987
{
1988
return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1989
}
1990
1991
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
1992
{
1993
// from sse2neon
1994
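// high half of the signed 16x16 multiply (PMULHW): widen each product to 32 bits, then keep the upper 16 bits via vuzp.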
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
1995
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
1996
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
1997
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
1998
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
1999
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
2000
uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
2001
return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
2002
}
2003
2004
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
2005
{
2006
return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2007
}
2008
2009
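// rounding high multiply of signed 16-bit lanes: (a * b + 0x4000) >> 15, as done by PMULHRSW.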
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
2010
{
2011
int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
2012
int32x4_t mul_hi =
2013
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
2014
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
2015
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
2016
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
2017
}
2018
2019
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
2020
2021
ALWAYS_INLINE bool eq(const GSVector4i& v) const
2022
{
2023
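// the vectors are equal iff every lane of the XOR is zero.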
const int32x4_t res = veorq_s32(v4s, v.v4s);
2024
#ifdef CPU_ARCH_ARM64
2025
return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
2026
#else
2027
const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
2028
return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
2029
#endif
2030
}
2031
2032
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
2033
{
2034
return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2035
}
2036
2037
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
2038
{
2039
return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2040
}
2041
2042
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
2043
{
2044
return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
2045
}
2046
2047
#ifdef CPU_ARCH_ARM64
2048
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
2049
{
2050
return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
2051
}
2052
#endif
2053
2054
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
2055
2056
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
2057
2058
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
2059
2060
ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
2061
{
2062
return GSVector4i(vreinterpretq_s32_u8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2063
}
2064
2065
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
2066
{
2067
return GSVector4i(vreinterpretq_s32_u16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2068
}
2069
2070
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const
2071
{
2072
return GSVector4i(vreinterpretq_s32_u32(vcgtq_s32(v4s, v.v4s)));
2073
}
2074
2075
ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
2076
{
2077
return GSVector4i(vreinterpretq_s32_u8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2078
}
2079
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
2080
{
2081
return GSVector4i(vreinterpretq_s32_u16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2082
}
2083
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const
2084
{
2085
return GSVector4i(vreinterpretq_s32_u32(vcgeq_s32(v4s, v.v4s)));
2086
}
2087
2088
ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
2089
{
2090
return GSVector4i(vreinterpretq_s32_u8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2091
}
2092
2093
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
2094
{
2095
return GSVector4i(vreinterpretq_s32_u16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2096
}
2097
2098
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const
2099
{
2100
return GSVector4i(vreinterpretq_s32_u32(vcltq_s32(v4s, v.v4s)));
2101
}
2102
2103
ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
2104
{
2105
return GSVector4i(vreinterpretq_s32_u8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2106
}
2107
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
2108
{
2109
return GSVector4i(vreinterpretq_s32_u16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2110
}
2111
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const
2112
{
2113
return GSVector4i(vreinterpretq_s32_u32(vcleq_s32(v4s, v.v4s)));
2114
}
2115
2116
ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
2117
2118
ALWAYS_INLINE int mask() const
2119
{
2120
// borrowed from sse2neon
2121
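// emulates _mm_movemask_epi8: shift-and-accumulate folds the per-byte sign bits into one byte per 64-bit half,
// which then form the 16-bit mask.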
const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
2122
const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2123
const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2124
const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2125
return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
2126
}
2127
2128
ALWAYS_INLINE bool alltrue() const
2129
{
2130
#ifdef CPU_ARCH_ARM64
2131
return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
2132
#else
2133
return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
2134
UINT64_C(0xFFFFFFFFFFFFFFFF));
2135
#endif
2136
}
2137
2138
ALWAYS_INLINE bool anytrue() const
2139
{
2140
#ifdef CPU_ARCH_ARM64
2141
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) != UINT32_C(0));
2142
#else
2143
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) != UINT64_C(0));
2144
#endif
2145
}
2146
2147
ALWAYS_INLINE bool allfalse() const
2148
{
2149
#ifdef CPU_ARCH_ARM64
2150
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
2151
#else
2152
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0));
2153
#endif
2154
}
2155
2156
template<int i>
2157
ALWAYS_INLINE GSVector4i insert8(int a) const
2158
{
2159
return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i))));
2160
}
2161
2162
template<int i>
2163
ALWAYS_INLINE int extract8() const
2164
{
2165
return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
2166
}
2167
2168
template<int i>
2169
ALWAYS_INLINE GSVector4i insert16(int a) const
2170
{
2171
return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i))));
2172
}
2173
2174
template<int i>
2175
ALWAYS_INLINE int extract16() const
2176
{
2177
return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
2178
}
2179
2180
template<int i>
2181
ALWAYS_INLINE GSVector4i insert32(int a) const
2182
{
2183
return GSVector4i(vsetq_lane_s32(a, v4s, i));
2184
}
2185
2186
template<int i>
2187
ALWAYS_INLINE int extract32() const
2188
{
2189
return vgetq_lane_s32(v4s, i);
2190
}
2191
2192
template<int i>
2193
ALWAYS_INLINE GSVector4i insert64(s64 a) const
2194
{
2195
return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
2196
}
2197
2198
template<int i>
2199
ALWAYS_INLINE s64 extract64() const
2200
{
2201
return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
2202
}
2203
2204
#ifdef CPU_ARCH_ARM64
2205
ALWAYS_INLINE GSVector4i tbl2(const GSVector4i& a, const GSVector4i& b, const GSVector4i& idx)
2206
{
2207
return GSVector4i(vreinterpretq_s32_u8(
2208
vqtbx2q_u8(vreinterpretq_u8_s32(v4s), uint8x16x2_t{vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(b.v4s)},
2209
vreinterpretq_u8_s32(idx.v4s))));
2210
}
2211
#endif
2212
2213
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
2214
{
2215
#if __has_builtin(__builtin_nontemporal_load)
2216
return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
2217
#else
2218
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2219
#endif
2220
}
2221
2222
ALWAYS_INLINE static GSVector4i load32(const void* p)
2223
{
2224
// should be ldr s0, [x0]
2225
s32 val;
2226
std::memcpy(&val, p, sizeof(s32));
2227
return GSVector4i(vsetq_lane_s32(val, vdupq_n_s32(0), 0));
2228
}
2229
2230
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
2231
2232
template<bool aligned>
2233
ALWAYS_INLINE static GSVector4i loadl(const void* p)
2234
{
2235
#ifdef CPU_ARCH_ARM32
2236
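// byte-element loads carry no alignment requirement, so use them when the pointer may be unaligned.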
if constexpr (!aligned)
2237
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
2238
#endif
2239
2240
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
2241
}
2242
2243
ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v) { return GSVector4i(vcombine_s32(v.v2s, vcreate_s32(0))); }
2244
2245
template<bool aligned>
2246
ALWAYS_INLINE static GSVector4i loadh(const void* p)
2247
{
2248
#ifdef CPU_ARCH_ARM32
2249
if constexpr (!aligned)
2250
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2251
#endif
2252
2253
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2254
}
2255
2256
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); }
2257
2258
template<bool aligned>
2259
ALWAYS_INLINE static GSVector4i load(const void* p)
2260
{
2261
#ifdef CPU_ARCH_ARM32
2262
if constexpr (!aligned)
2263
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
2264
#endif
2265
2266
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2267
}
2268
2269
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
2270
{
2271
#if __has_builtin(__builtin_nontemporal_store)
2272
__builtin_nontemporal_store(v.v4s, static_cast<int32x4_t*>(p));
2273
#else
2274
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2275
#endif
2276
}
2277
2278
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
2279
{
2280
u32 val = vgetq_lane_s32(v, 0);
2281
std::memcpy(p, &val, sizeof(u32));
2282
}
2283
2284
template<bool aligned>
2285
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
2286
{
2287
#ifdef CPU_ARCH_ARM32
2288
if constexpr (!aligned)
2289
{
2290
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
2291
return;
2292
}
2293
#endif
2294
2295
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
2296
}
2297
2298
template<bool aligned>
2299
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
2300
{
2301
#ifdef CPU_ARCH_ARM32
2302
if constexpr (!aligned)
2303
{
2304
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
2305
return;
2306
}
2307
#endif
2308
2309
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
2310
}
2311
2312
template<bool aligned>
2313
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
2314
{
2315
#ifdef CPU_ARCH_ARM32
2316
if constexpr (!aligned)
2317
{
2318
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
2319
return;
2320
}
2321
#endif
2322
2323
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2324
}
2325
2326
ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }
2327
2328
template<bool aligned>
2329
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
2330
{
2331
return load<aligned>(v);
2332
}
2333
2334
ALWAYS_INLINE void operator&=(const GSVector4i& v)
2335
{
2336
v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2337
}
2338
2339
ALWAYS_INLINE void operator|=(const GSVector4i& v)
2340
{
2341
v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2342
}
2343
2344
ALWAYS_INLINE void operator^=(const GSVector4i& v)
2345
{
2346
v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2347
}
2348
2349
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
2350
{
2351
return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2352
}
2353
2354
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
2355
{
2356
return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2357
}
2358
2359
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
2360
{
2361
return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2362
}
2363
2364
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
2365
2366
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
2367
2368
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
2369
2370
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
2371
2372
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
2373
2374
ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
2375
2376
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
2377
2378
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
2379
{
2380
return GSVector4i(vcombine_s32(xy.v2s, zw.v2s));
2381
}
2382
2383
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw) { return GSVector4i(vcombine_s32(xyzw.v2s, xyzw.v2s)); }
2384
2385
static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);
2386
2387
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); }
2388
2389
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
2390
2391
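// The VECTOR4i_SHUFFLE_* macros expand to the full set of 32-bit swizzle accessors (xxxx() .. wwww()), plus 'l'/'h'
// variants that swizzle only the low or high four 16-bit lanes, all built on __builtin_shufflevector.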
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
2392
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
2393
{ \
2394
return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
2395
} \
2396
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
2397
{ \
2398
return GSVector4i(vreinterpretq_s32_s16( \
2399
__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), xn, yn, zn, wn, 4, 5, 6, 7))); \
2400
} \
2401
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
2402
{ \
2403
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector( \
2404
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 4 + xn, 4 + yn, 4 + zn, 4 + wn))); \
2405
}
2406
2407
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2408
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
2409
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
2410
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
2411
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
2412
2413
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
2414
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
2415
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
2416
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
2417
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
2418
2419
#define VECTOR4i_SHUFFLE_1(xs, xn) \
2420
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
2421
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
2422
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
2423
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
2424
2425
VECTOR4i_SHUFFLE_1(x, 0);
2426
VECTOR4i_SHUFFLE_1(y, 1);
2427
VECTOR4i_SHUFFLE_1(z, 2);
2428
VECTOR4i_SHUFFLE_1(w, 3);
2429
2430
#undef VECTOR4i_SHUFFLE_1
2431
#undef VECTOR4i_SHUFFLE_2
2432
#undef VECTOR4i_SHUFFLE_3
2433
#undef VECTOR4i_SHUFFLE_4
2434
};
2435
2436
class alignas(16) GSVector4
2437
{
2438
struct cxpr_init_tag
2439
{
2440
};
2441
static constexpr cxpr_init_tag cxpr_init{};
2442
2443
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
2444
2445
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
2446
2447
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
2448
2449
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
2450
2451
public:
2452
union
2453
{
2454
struct
2455
{
2456
float x, y, z, w;
2457
};
2458
struct
2459
{
2460
float r, g, b, a;
2461
};
2462
struct
2463
{
2464
float left, top, right, bottom;
2465
};
2466
float F32[4];
2467
double F64[2];
2468
s8 I8[16];
2469
s16 I16[8];
2470
s32 I32[4];
2471
s64 I64[2];
2472
u8 U8[16];
2473
u16 U16[8];
2474
u32 U32[4];
2475
u64 U64[2];
2476
float32x4_t v4s;
2477
};
2478
2479
GSVector4() = default;
2480
2481
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
2482
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
2483
2484
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
2485
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
2486
2487
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
2488
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
2489
2490
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
2491
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
2492
2493
constexpr static GSVector4 cxpr_rgba32(u32 rgba)
2494
{
2495
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff), static_cast<float>((rgba >> 8) & 0xff),
2496
static_cast<float>((rgba >> 16) & 0xff), static_cast<float>((rgba >> 24) & 0xff));
2497
}
2498
2499
constexpr static GSVector4 cxpr_unorm8(u32 rgba)
2500
{
2501
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff) / 255.0f,
2502
static_cast<float>((rgba >> 8) & 0xff) / 255.0f, static_cast<float>((rgba >> 16) & 0xff) / 255.0f,
2503
static_cast<float>((rgba >> 24) & 0xff) / 255.0f);
2504
}
2505
2506
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
2507
{
2508
const float arr[4] = {x, y, z, w};
2509
v4s = vld1q_f32(arr);
2510
}
2511
2512
ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
2513
2514
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
2515
{
2516
const int arr[4] = {x, y, z, w};
2517
v4s = vcvtq_f32_s32(vld1q_s32(arr));
2518
}
2519
2520
ALWAYS_INLINE GSVector4(int x, int y)
2521
{
2522
v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
2523
}
2524
2525
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
2526
2527
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
2528
2529
ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
2530
2531
ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
2532
2533
ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
2534
2535
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
2536
2537
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
2538
2539
ALWAYS_INLINE static GSVector4 f64(double x, double y)
2540
{
2541
#ifdef CPU_ARCH_ARM64
2542
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
2543
#else
2544
GSVector4 ret;
2545
ret.F64[0] = x;
2546
ret.F64[1] = y;
2547
return ret;
2548
#endif
2549
}
2550
2551
ALWAYS_INLINE static GSVector4 f64(double x)
2552
{
2553
#ifdef CPU_ARCH_ARM64
2554
return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
2555
#else
2556
GSVector4 ret;
2557
ret.F64[0] = ret.F64[1] = x;
2558
return ret;
2559
#endif
2560
}
2561
2562
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
2563
2564
ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
2565
2566
ALWAYS_INLINE operator float32x4_t() const { return v4s; }
2567
2568
ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
2569
2570
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
2571
{
2572
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
2573
}
2574
2575
ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
2576
2577
ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
2578
2579
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
2580
2581
#ifdef CPU_ARCH_ARM64
2582
2583
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
2584
2585
ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
2586
2587
#else
2588
2589
ALWAYS_INLINE GSVector4 floor() const
2590
{
2591
return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
2592
std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
2593
}
2594
2595
ALWAYS_INLINE GSVector4 ceil() const
2596
{
2597
return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
2598
std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
2599
}
2600
2601
#endif
2602
2603
#ifdef CPU_ARCH_ARM64
2604
2605
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
2606
2607
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
2608
2609
ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
2610
2611
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2612
{
2613
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
2614
}
2615
2616
#else
2617
2618
ALWAYS_INLINE GSVector4 hadd() const
2619
{
2620
const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2621
return GSVector4(vcombine_f32(res, res));
2622
}
2623
2624
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
2625
{
2626
const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2627
const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
2628
return GSVector4(vcombine_f32(res1, res2));
2629
}
2630
2631
ALWAYS_INLINE GSVector4 hsub() const
2632
{
2633
const float32x4x2_t res = vuzpq_f32(v4s, v4s);
2634
return GSVector4(vsubq_f32(res.val[0], res.val[0]));
2635
}
2636
2637
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2638
{
2639
const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
2640
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2641
}
2642
2643
#endif
2644
2645
ALWAYS_INLINE float dot(const GSVector4& v) const
2646
{
2647
#ifdef CPU_ARCH_ARM64
2648
return vaddvq_f32(vmulq_f32(v4s, v.v4s));
2649
#else
2650
const float32x4_t dp = vmulq_f32(v4s, v.v4s);
2651
float32x2_t tmp = vadd_f32(vget_low_f32(dp), vget_high_f32(dp)); // (x+z, y+w)
2652
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2653
#endif
2654
}
2655
2656
ALWAYS_INLINE float addv() const
2657
{
2658
#ifdef CPU_ARCH_ARM64
2659
return vaddvq_f32(v4s);
2660
#else
2661
float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
2662
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2663
#endif
2664
}
2665
2666
ALWAYS_INLINE float minv() const
2667
{
2668
#ifdef CPU_ARCH_ARM64
2669
return vminvq_f32(v4s);
2670
#else
2671
float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
2672
return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2673
#endif
2674
}
2675
2676
ALWAYS_INLINE float maxv() const
2677
{
2678
#ifdef CPU_ARCH_ARM64
2679
return vmaxvq_f32(v4s);
2680
#else
2681
float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
2682
return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2683
#endif
2684
}
2685
2686
ALWAYS_INLINE float width() const { return right - left; }
2687
ALWAYS_INLINE float height() const { return bottom - top; }
2688
2689
ALWAYS_INLINE GSVector2 rsize() const { return (zw() - xy()); }
2690
2691
ALWAYS_INLINE bool rempty() const
2692
{
2693
// !all((x, y) < (z, w)) i.e. !not_empty
2694
return (vget_lane_u64(vreinterpret_u64_f32(vclt_f32(vget_low_f32(v4s), vget_high_f32(v4s))), 0) !=
2695
0xFFFFFFFFFFFFFFFFULL);
2696
}
2697
2698
ALWAYS_INLINE bool rvalid() const
2699
{
2700
// !any((x, y) >= (z, w))
2701
return (vget_lane_u64(vreinterpret_u64_f32(vcge_f32(vget_low_f32(v4s), vget_high_f32(v4s))), 0) == 0);
2702
}
2703
2704
ALWAYS_INLINE GSVector4 runion(const GSVector4 v) const { return min(v).blend32<0xc>(max(v)); }
2705
2706
ALWAYS_INLINE GSVector4 rintersect(const GSVector4& a) const { return sat(a); }
2707
ALWAYS_INLINE bool rintersects(const GSVector4& v) const { return rintersect(v).rvalid(); }
2708
ALWAYS_INLINE bool rcontains(const GSVector4& v) const { return rintersect(v).eq(v); }
2709
2710
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
2711
2712
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
2713
{
2714
#ifdef CPU_ARCH_ARM64
2715
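// broadcast a.xy and a.zw (as 64-bit lanes) across the whole vector to form the per-component min and max bounds.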
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
2716
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
2717
#else
2718
const GSVector4 minv(a.xyxy());
2719
const GSVector4 maxv(a.zwzw());
2720
#endif
2721
return sat(minv, maxv);
2722
}
2723
2724
ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2725
2726
ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2727
2728
ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
2729
2730
ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
2731
2732
template<int mask>
2733
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
2734
{
2735
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
2736
(mask & 8) ? 7 : 3));
2737
}
2738
2739
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
2740
{
2741
// duplicate sign bit across and bit select
2742
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
2743
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
2744
}
2745
2746
#ifdef CPU_ARCH_ARM64
2747
2748
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
2749
2750
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
2751
2752
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2753
{
2754
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2755
}
2756
2757
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2758
{
2759
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2760
}
2761
2762
#else
2763
2764
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
2765
{
2766
const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
2767
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2768
}
2769
2770
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
2771
{
2772
const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
2773
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2774
}
2775
2776
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2777
{
2778
return GSVector4(vreinterpretq_f32_s64(
2779
vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
2780
}
2781
2782
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2783
{
2784
return GSVector4(vreinterpretq_f32_s64(
2785
vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
2786
}
2787
2788
#endif
2789
2790
ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
2791
{
2792
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
2793
}
2794
2795
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
2796
{
2797
return GSVector4(vcombine_f32(vget_high_f32(a.v4s), vget_high_f32(v4s)));
2798
}
2799
2800
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
2801
{
2802
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
2803
}
2804
2805
ALWAYS_INLINE int mask() const
2806
{
2807
#ifdef CPU_ARCH_ARM64
2808
static constexpr const int32_t shifts[] = {0, 1, 2, 3};
2809
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
2810
#else
2811
// sse2neon again
2812
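// emulates _mm_movemask_ps: pack the four float sign bits into bits 0-3 of the result.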
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
2813
uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2814
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2815
#endif
2816
}
2817
2818
ALWAYS_INLINE bool alltrue() const
2819
{
2820
#ifdef CPU_ARCH_ARM64
2821
return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
2822
#else
2823
2824
return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2825
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2826
0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
2827
#endif
2828
}
2829
2830
ALWAYS_INLINE bool anytrue() const
2831
{
2832
#ifdef CPU_ARCH_ARM64
2833
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) != UINT32_C(0));
2834
#else
2835
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2836
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2837
0) != UINT64_C(0));
2838
#endif
2839
}
2840
2841
ALWAYS_INLINE bool allfalse() const
2842
{
2843
#ifdef CPU_ARCH_ARM64
2844
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
2845
#else
2846
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2847
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2848
0) == UINT64_C(0));
2849
#endif
2850
}
2851
2852
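// lanes where *this is NaN fail the self-equality test and are taken from v instead.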
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
2853
2854
template<int src, int dst>
2855
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
2856
{
2857
#ifdef CPU_ARCH_ARM64
2858
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
2859
#else
2860
return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
2861
#endif
2862
}
2863
2864
template<int i>
2865
ALWAYS_INLINE GSVector4 insert32(float v) const
2866
{
2867
return GSVector4(vsetq_lane_f32(v, v4s, i));
2868
}
2869
2870
template<int i>
2871
ALWAYS_INLINE float extract32() const
2872
{
2873
return vgetq_lane_f32(v4s, i);
2874
}
2875
2876
template<int dst>
2877
ALWAYS_INLINE GSVector4 insert64(double v) const
2878
{
2879
#ifdef CPU_ARCH_ARM64
2880
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
2881
#else
2882
GSVector4 ret;
2883
ret.F64[dst] = v;
2884
return ret;
2885
#endif
2886
}
2887
2888
template<int src>
2889
ALWAYS_INLINE double extract64() const
2890
{
2891
#ifdef CPU_ARCH_ARM64
2892
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
2893
#else
2894
return F64[src];
2895
#endif
2896
}
2897
2898
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
2899
2900
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
2901
2902
template<bool aligned>
2903
ALWAYS_INLINE static GSVector4 loadl(const void* p)
2904
{
2905
#ifdef CPU_ARCH_ARM32
2906
if constexpr (!aligned)
2907
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
2908
#endif
2909
2910
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
2911
}
2912
2913
template<bool aligned>
2914
ALWAYS_INLINE static GSVector4 loadh(const void* p)
2915
{
2916
#ifdef CPU_ARCH_ARM32
2917
if constexpr (!aligned)
2918
return GSVector4(vreinterpretq_f32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2919
#endif
2920
2921
return GSVector4(vreinterpretq_f32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2922
}
2923
2924
ALWAYS_INLINE static GSVector4 loadh(const GSVector2& v) { return GSVector4(vcombine_f32(vcreate_f32(0), v.v2s)); }
2925
2926
ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
2927
2928
template<bool aligned>
2929
ALWAYS_INLINE static GSVector4 load(const void* p)
2930
{
2931
#ifdef CPU_ARCH_ARM32
2932
if constexpr (!aligned)
2933
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
2934
#endif
2935
2936
return GSVector4(vld1q_f32((const float*)p));
2937
}
2938
2939
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
2940
2941
template<bool aligned>
2942
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
2943
{
2944
#ifdef CPU_ARCH_ARM32
2945
if constexpr (!aligned)
2946
{
2947
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
2948
return;
2949
}
2950
#endif
2951
2952
vst1_f32((float*)p, vget_low_f32(v.v4s));
2953
}
2954
2955
template<bool aligned>
2956
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
2957
{
2958
#ifdef CPU_ARCH_ARM32
2959
if constexpr (!aligned)
2960
{
2961
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
2962
return;
2963
}
2964
#endif
2965
2966
vst1_f32((float*)p, vget_high_f32(v.v4s));
2967
}
2968
2969
template<bool aligned>
2970
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
2971
{
2972
#ifdef CPU_ARCH_ARM32
2973
if constexpr (!aligned)
2974
{
2975
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
2976
return;
2977
}
2978
#endif
2979
2980
vst1q_f32((float*)p, v.v4s);
2981
}
2982
2983
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
2984
2985
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
2986
2987
ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
2988
ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
2989
ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
2990
ALWAYS_INLINE void operator/=(const GSVector4& v)
2991
{
2992
#ifdef CPU_ARCH_ARM64
2993
v4s = vdivq_f32(v4s, v.v4s);
2994
#else
2995
*this =
2996
GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
2997
vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
2998
#endif
2999
}
3000
3001
ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
3002
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
3003
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
3004
ALWAYS_INLINE void operator/=(float f)
3005
{
3006
#ifdef CPU_ARCH_ARM64
3007
*this /= GSVector4(f);
3008
#else
3009
*this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
3010
vgetq_lane_f32(v4s, 3) / f);
3011
#endif
3012
}
3013
3014
ALWAYS_INLINE void operator&=(const GSVector4& v)
3015
{
3016
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
3017
}
3018
3019
ALWAYS_INLINE void operator|=(const GSVector4& v)
3020
{
3021
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
3022
}
3023
3024
ALWAYS_INLINE void operator^=(const GSVector4& v)
3025
{
3026
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
3027
}
3028
3029
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
3030
{
3031
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
3032
}
3033
3034
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
3035
{
3036
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
3037
}
3038
3039
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
3040
{
3041
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
3042
}
3043
3044
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
3045
{
3046
#ifdef CPU_ARCH_ARM64
3047
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
3048
#else
3049
return GSVector4(
3050
vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
3051
vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
3052
#endif
3053
}
3054
3055
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
3056
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
3057
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
3058
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
3059
{
3060
#ifdef CPU_ARCH_ARM64
3061
return v / GSVector4(f);
3062
#else
3063
return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
3064
vgetq_lane_f32(v.v4s, 3) / f);
3065
#endif
3066
}
3067
3068
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
3069
{
3070
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
3071
}
3072
3073
ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
3074
{
3075
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
3076
}
3077
3078
ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
3079
{
3080
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
3081
}
3082
3083
ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
3084
{
3085
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
3086
}
3087
3088
ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
3089
{
3090
// NEON has no !=
3091
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
3092
}
3093
3094
ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
3095
{
3096
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
3097
}
3098
3099
ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
3100
{
3101
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
3102
}
3103
3104
ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
3105
{
3106
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
3107
}
3108
3109
ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
3110
{
3111
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
3112
}
3113
3114
ALWAYS_INLINE bool eq(const GSVector4& v) const
3115
{
3116
#ifdef CPU_ARCH_ARM64
3117
const uint32x4_t res = vceqq_f32(v4s, v.v4s);
3118
return (vminvq_u32(res) != 0);
3119
#else
3120
const uint32x4_t res = vmvnq_u32(vceqq_f32(v4s, v.v4s));
3121
const uint32x2_t paired = vorr_u32(vget_low_u32(res), vget_high_u32(res));
3122
return (vget_lane_u64(paired, 0) == 0);
3123
#endif
3124
}
3125
3126
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
3127
{
3128
#ifdef CPU_ARCH_ARM64
3129
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3130
#else
3131
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
3132
#endif
3133
}
3134
3135
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
3136
{
3137
#ifdef CPU_ARCH_ARM64
3138
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3139
#else
3140
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
3141
#endif
3142
}
3143
3144
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
3145
{
3146
#ifdef CPU_ARCH_ARM64
3147
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3148
#else
3149
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
3150
#endif
3151
}
3152
3153
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
3154
{
3155
#ifdef CPU_ARCH_ARM64
3156
return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3157
#else
3158
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
3159
#endif
3160
}

ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}
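
// The 64-bit comparisons follow the same mask convention as the float
// operators: each double lane becomes 0xFFFFFFFFFFFFFFFF when the comparison
// holds and 0 otherwise; the scalar fallback writes the masks by hand.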

ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
#endif
}

ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
#endif
}

ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }

ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
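
// abs64() clears the sign bit of each packed double and neg64() flips it;
// both are pure bit operations on the 64-bit lanes.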

ALWAYS_INLINE GSVector4 sqrt64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
#endif
}

ALWAYS_INLINE GSVector4 sqr64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
#endif
}

ALWAYS_INLINE GSVector4 floor64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
#endif
}
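
// floor64() rounds each double toward minus infinity (vrndmq_f64 on ARM64,
// std::floor otherwise).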

ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
#else
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
#endif
}

ALWAYS_INLINE static GSVector4 f32to64(const void* p)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
#else
const float* fp = static_cast<const float*>(p);
return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
#endif
}
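
// f32to64() widens two floats (the low half of a vector, or a pair loaded from
// memory) into two packed doubles.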

ALWAYS_INLINE GSVector4i f64toi32() const
{
#ifdef CPU_ARCH_ARM64
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
#else
const s32 low = static_cast<s32>(F64[0]);
const s32 high = static_cast<s32>(F64[1]);
#endif
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}
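
// f64toi32() truncates each packed double toward zero (static_cast semantics)
// and stores the results in the low two s32 lanes, zeroing the upper two.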

ALWAYS_INLINE GSVector2 xy() const { return GSVector2(vget_low_f32(v4s)); }

ALWAYS_INLINE GSVector2 zw() const { return GSVector2(vget_high_f32(v4s)); }

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
{
return GSVector4(vcombine_f32(l.v2s, h.v2s));
}

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(vcombine_f32(l.v2s, l.v2s)); }
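
// xy()/zw() split the vector into its low/high GSVector2 halves; xyxy()
// recombines two halves (or duplicates one) into a full GSVector4.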

#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
} \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4_SHUFFLE_2(xs, xn, w, 3);

VECTOR4_SHUFFLE_1(x, 0);
VECTOR4_SHUFFLE_1(y, 1);
VECTOR4_SHUFFLE_1(z, 2);
VECTOR4_SHUFFLE_1(w, 3);

#undef VECTOR4_SHUFFLE_1
#undef VECTOR4_SHUFFLE_2
#undef VECTOR4_SHUFFLE_3
#undef VECTOR4_SHUFFLE_4
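
// The macros above generate every four-component swizzle accessor, e.g.
// a.zwxy() == { a.z, a.w, a.x, a.y }. The one-argument overloads take the
// first two named components from *this and the last two from the argument,
// so a.xyxy(b) == { a.x, a.y, b.x, b.y }.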

ALWAYS_INLINE GSVector4 broadcast32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v4s, 0));
#else
return xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
#else
return v.xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }

ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
#else
return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
#endif
}
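
// broadcast32() splats the x lane across the whole vector; the memory
// overloads load a single 32-bit or 64-bit value and duplicate it into
// every lane (or both halves, for broadcast64).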
};
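
// Illustrative sketch (not part of the original header) of how the packed-f64
// helpers above compose; GSVector4::f64() builds a vector from two doubles, as
// used by the scalar fallbacks:
//
//   const GSVector4 a = GSVector4::f64(-2.25, 9.0);
//   const GSVector4 b = a.abs64().sqrt64();   // { 1.5, 3.0 } as packed doubles
//   const GSVector4i c = b.f64toi32();        // { 1, 3, 0, 0 } as s32 lanes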

ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
v2s = vcvt_s32_f32(v.v2s);
}

ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
{
v2s = vcvt_f32_s32(v.v2s);
}

ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
return GSVector2i(vreinterpret_s32_f32(v.v2s));
}

ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
return GSVector2(vreinterpret_f32_s32(v.v2s));
}

ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
v4s = vcvtq_s32_f32(v.v4s);
}

ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
{
v4s = vcvtq_f32_s32(v.v4s);
}

ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
}

ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(vreinterpretq_f32_s32(v.v4s));
}
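
// The converting constructors above perform value conversion (vcvt*: float to
// s32 truncates toward zero, s32 to float is the usual int-to-float
// conversion), whereas cast() reinterprets the same bits under the other
// element type without changing them.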