GitHub Repository: stenzek/duckstation
Path: blob/master/src/common/gsvector_neon.h
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "common/intrin.h"
5
#include "common/types.h"
6
7
#include <algorithm>
8
#include <cmath>
9
#include <cstdint>
10
11
#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
12
#define GSVECTOR_HAS_SRLV 1
13
14
#ifdef CPU_ARCH_ARM64
15
// tbl2 with 128-bit vectors is not in A32.
16
#define GSVECTOR_HAS_TBL2 1
17
#endif
18
19
class GSVector2;
20
class GSVector2i;
21
class GSVector4;
22
class GSVector4i;
23
24
class alignas(16) GSVector2i
25
{
26
struct cxpr_init_tag
27
{
28
};
29
static constexpr cxpr_init_tag cxpr_init{};
30
31
constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}
32
33
constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
34
35
constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
36
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
37
{
38
}
39
40
public:
41
union
42
{
43
struct
44
{
45
s32 x, y;
46
};
47
struct
48
{
49
s32 r, g;
50
};
51
float F32[2];
52
s8 S8[8];
53
s16 S16[4];
54
s32 S32[2];
55
s64 S64[1];
56
u8 U8[8];
57
u16 U16[4];
58
u32 U32[2];
59
u64 U64[1];
60
int32x2_t v2s;
61
};
62
63
GSVector2i() = default;
64
65
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
66
67
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
68
69
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
70
71
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
72
{
73
return GSVector2i(cxpr_init, s0, s1, s2, s3);
74
}
75
76
ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
77
{
78
return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
79
}
80
81
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }
82
83
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
84
85
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
86
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
87
{
88
}
89
90
ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }
91
92
ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
93
94
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
95
96
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
97
98
ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }
99
100
ALWAYS_INLINE operator int32x2_t() const { return v2s; }
101
102
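// sat_* clamp each lane to [min, max] via max_*().min_*().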
ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
103
{
104
return max_s8(min).min_s8(max);
105
}
106
ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
107
{
108
return max_s16(min).min_s16(max);
109
}
110
ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
111
{
112
return max_s32(min).min_s32(max);
113
}
114
115
ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
116
{
117
return max_u8(min).min_u8(max);
118
}
119
ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
120
{
121
return max_u16(min).min_u16(max);
122
}
123
ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
124
{
125
return max_u32(min).min_u32(max);
126
}
127
128
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const
129
{
130
return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
131
}
132
133
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const
134
{
135
return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
136
}
137
138
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const
139
{
140
return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
141
}
142
143
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const
144
{
145
return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
146
}
147
148
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); }
149
150
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); }
151
152
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const
153
{
154
return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
155
}
156
157
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const
158
{
159
return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
160
}
161
162
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const
163
{
164
return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
165
}
166
167
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const
168
{
169
return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
170
}
171
172
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const
173
{
174
return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
175
}
176
177
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
178
{
179
return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
180
}
181
182
ALWAYS_INLINE s32 addv_s32() const
183
{
184
#ifdef CPU_ARCH_ARM64
185
return vaddv_s32(v2s);
186
#else
187
return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1);
188
#endif
189
}
190
191
#ifdef CPU_ARCH_ARM64
192
193
ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
194
195
ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
196
197
ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }
198
199
ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }
200
201
ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }
202
203
ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }
204
205
ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }
206
207
ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }
208
209
#else
210
211
ALWAYS_INLINE u8 minv_u8() const
212
{
213
uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
214
return static_cast<u8>(
215
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
216
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
217
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
218
}
219
220
ALWAYS_INLINE u16 maxv_u8() const
221
{
222
uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
223
return static_cast<u8>(
224
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
225
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
226
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
227
}
228
229
ALWAYS_INLINE u16 minv_u16() const
230
{
231
uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
232
return static_cast<u16>(
233
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
234
}
235
236
ALWAYS_INLINE u16 maxv_u16() const
237
{
238
uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
239
return static_cast<u16>(
240
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
241
}
242
243
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
244
245
ALWAYS_INLINE u32 minv_u32() const
246
{
247
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
248
}
249
250
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
251
252
ALWAYS_INLINE u32 maxv_u32() const
253
{
254
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
255
}
256
257
#endif
258
259
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
260
261
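// per-byte select: lanes whose mask byte has the sign bit set are taken from a.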
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
262
{
263
uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
264
return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
265
}
266
267
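// selects 16-bit lane i from a when bit i of mask is set.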
template<int mask>
268
ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
269
{
270
static constexpr const uint16_t _mask[4] = {
271
((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
272
((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
273
return GSVector2i(
274
vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
275
}
276
277
template<int mask>
278
ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
279
{
280
constexpr int bit1 = ((mask & 2) * 3) << 1;
281
constexpr int bit0 = (mask & 1) * 3;
282
return blend16 < bit1 | bit0 > (v);
283
}
284
285
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
286
{
287
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
288
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
289
}
290
291
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
292
{
293
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s))));
294
}
295
296
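// saturating packs (PACKSSWB/PACKUSWB/PACKSSDW/PACKUSDW equivalents); the upper half packs from zero.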
ALWAYS_INLINE GSVector2i ps16() const
297
{
298
return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
299
}
300
301
ALWAYS_INLINE GSVector2i pu16() const
302
{
303
return GSVector2i(vreinterpret_s32_u8(vqmovun_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
304
}
305
306
ALWAYS_INLINE GSVector2i ps32() const
307
{
308
return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
309
}
310
311
ALWAYS_INLINE GSVector2i pu32() const
312
{
313
return GSVector2i(vreinterpret_s32_u16(vqmovun_s32(vcombine_s32(v2s, vcreate_s32(0)))));
314
}
315
316
#ifdef CPU_ARCH_ARM64
317
318
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
319
{
320
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
321
}
322
323
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
324
{
325
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
326
}
327
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }
328
329
ALWAYS_INLINE GSVector2i upl8() const
330
{
331
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
332
}
333
334
ALWAYS_INLINE GSVector2i upl16() const
335
{
336
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
337
}
338
339
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
340
341
#else
342
343
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
344
{
345
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
346
}
347
348
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
349
{
350
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
351
}
352
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
353
354
ALWAYS_INLINE GSVector2i upl8() const
355
{
356
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
357
}
358
359
ALWAYS_INLINE GSVector2i upl16() const
360
{
361
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
362
}
363
364
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
365
366
#endif
367
368
ALWAYS_INLINE GSVector2i s8to16() const
369
{
370
return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
371
}
372
373
ALWAYS_INLINE GSVector2i u8to16() const
374
{
375
return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
376
}
377
378
ALWAYS_INLINE GSVector2i s8to32() const
379
{
380
return GSVector2i(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s))))));
381
}
382
383
ALWAYS_INLINE GSVector2i u8to32() const
384
{
385
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))))));
386
}
387
388
ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(vget_low_s32(vmovl_s16(vreinterpret_s16_s32(v2s)))); }
389
390
ALWAYS_INLINE GSVector2i u16to32() const
391
{
392
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s32(v2s)))));
393
}
394
395
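// srl<i>/sll<i> shift the whole 64-bit register right/left by i bytes (PSRLDQ/PSLLDQ analogues).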
template<int i>
396
ALWAYS_INLINE GSVector2i srl() const
397
{
398
return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
399
}
400
401
template<int i>
402
ALWAYS_INLINE GSVector2i sll() const
403
{
404
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
405
}
406
407
template<int i>
408
ALWAYS_INLINE GSVector2i sll16() const
409
{
410
return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
411
}
412
413
ALWAYS_INLINE GSVector2i sll16(s32 i) const
414
{
415
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
416
}
417
418
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
419
{
420
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
421
}
422
423
template<int i>
424
ALWAYS_INLINE GSVector2i srl16() const
425
{
426
return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
427
}
428
429
ALWAYS_INLINE GSVector2i srl16(s32 i) const
430
{
431
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
432
}
433
434
ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
435
{
436
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
437
}
438
439
template<int i>
440
ALWAYS_INLINE GSVector2i sra16() const
441
{
442
constexpr int count = (i & ~15) ? 15 : i;
443
return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
444
}
445
446
ALWAYS_INLINE GSVector2i sra16(s32 i) const
447
{
448
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
449
}
450
451
ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
452
{
453
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
454
}
455
456
template<int i>
457
ALWAYS_INLINE GSVector2i sll32() const
458
{
459
return GSVector2i(vshl_n_s32(v2s, i));
460
}
461
462
ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
463
464
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
465
466
template<int i>
467
ALWAYS_INLINE GSVector2i srl32() const
468
{
469
return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
470
}
471
472
ALWAYS_INLINE GSVector2i srl32(s32 i) const
473
{
474
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
475
}
476
477
ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
478
{
479
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
480
}
481
482
template<int i>
483
ALWAYS_INLINE GSVector2i sra32() const
484
{
485
return GSVector2i(vshr_n_s32(v2s, i));
486
}
487
488
ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }
489
490
ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const
491
{
492
return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s)));
493
}
494
495
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
496
{
497
return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
498
}
499
500
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
501
{
502
return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
503
}
504
505
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }
506
507
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
508
{
509
return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
510
}
511
512
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
513
{
514
return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
515
}
516
517
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
518
{
519
return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
520
}
521
522
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
523
{
524
return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
525
}
526
527
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
528
{
529
return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
530
}
531
532
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
533
{
534
return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
535
}
536
537
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }
538
539
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
540
{
541
return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
542
}
543
544
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
545
{
546
return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
547
}
548
549
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
550
{
551
return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
552
}
553
554
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
555
{
556
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
557
}
558
559
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
560
{
561
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
562
}
563
564
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }
565
566
ALWAYS_INLINE bool eq(const GSVector2i& v) const
567
{
568
return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
569
}
570
571
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
572
{
573
return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
574
}
575
576
ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
577
{
578
return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
579
}
580
581
ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
582
{
583
return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
584
}
585
586
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
587
588
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
589
590
ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
591
592
ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
593
{
594
return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
595
}
596
597
ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
598
{
599
return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
600
}
601
602
ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(vcgt_s32(v2s, v.v2s)); }
603
604
ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
605
{
606
return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
607
}
608
ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
609
{
610
return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
611
}
612
ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return GSVector2i(vcge_s32(v2s, v.v2s)); }
613
614
ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
615
{
616
return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
617
}
618
619
ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
620
{
621
return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
622
}
623
624
ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(vclt_s32(v2s, v.v2s)); }
625
626
ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
627
{
628
return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
629
}
630
ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
631
{
632
return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
633
}
634
ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return GSVector2i(vcle_s32(v2s, v.v2s)); }
635
636
ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }
637
638
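// builds a bitmask from the sign bit of each of the eight bytes (movemask equivalent).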
ALWAYS_INLINE int mask() const
639
{
640
// borrowed from sse2neon
641
const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
642
const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
643
const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
644
const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
645
return static_cast<int>(vget_lane_u8(paired64, 0));
646
}
647
648
ALWAYS_INLINE bool alltrue() const
649
{
650
return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
651
}
652
653
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); }
654
655
template<int i>
656
ALWAYS_INLINE GSVector2i insert8(int a) const
657
{
658
return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(a, vreinterpret_u8_s32(v2s), static_cast<uint8_t>(i))));
659
}
660
661
template<int i>
662
ALWAYS_INLINE int extract8() const
663
{
664
return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
665
}
666
667
template<int i>
668
ALWAYS_INLINE GSVector2i insert16(int a) const
669
{
670
return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(a, vreinterpret_u16_s32(v2s), static_cast<uint16_t>(i))));
671
}
672
673
template<int i>
674
ALWAYS_INLINE int extract16() const
675
{
676
return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
677
}
678
679
template<int i>
680
ALWAYS_INLINE GSVector2i insert32(int a) const
681
{
682
return GSVector2i(vset_lane_s32(a, v2s, i));
683
}
684
685
template<int i>
686
ALWAYS_INLINE int extract32() const
687
{
688
return vget_lane_s32(v2s, i);
689
}
690
691
ALWAYS_INLINE static GSVector2i load32(const void* p)
692
{
693
// should be ldr s0, [x0]
694
u32 val;
695
std::memcpy(&val, p, sizeof(u32));
696
return GSVector2i(vset_lane_u32(val, vdup_n_u32(0), 0));
697
}
698
699
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
700
701
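// on ARM32, unaligned data is loaded through the byte-wise vld1_s8 path.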
template<bool aligned>
702
ALWAYS_INLINE static GSVector2i load(const void* p)
703
{
704
#ifdef CPU_ARCH_ARM32
705
if constexpr (!aligned)
706
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
707
#endif
708
709
return GSVector2i(vld1_s32((const int32_t*)p));
710
}
711
712
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
713
{
714
s32 val = vget_lane_s32(v.v2s, 0);
715
std::memcpy(p, &val, sizeof(s32));
716
}
717
718
template<bool aligned>
719
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
720
{
721
#ifdef CPU_ARCH_ARM32
722
if constexpr (!aligned)
723
{
724
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
725
return;
726
}
727
#endif
728
729
vst1_s32((int32_t*)p, v.v2s);
730
}
731
732
ALWAYS_INLINE void operator&=(const GSVector2i& v)
733
{
734
v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
735
}
736
737
ALWAYS_INLINE void operator|=(const GSVector2i& v)
738
{
739
v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
740
}
741
742
ALWAYS_INLINE void operator^=(const GSVector2i& v)
743
{
744
v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
745
}
746
747
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
748
{
749
return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
750
}
751
752
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
753
{
754
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
755
}
756
757
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
758
{
759
return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
760
}
761
762
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); }
763
764
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); }
765
766
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); }
767
768
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); }
769
770
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); }
771
772
ALWAYS_INLINE GSVector2i xy() const { return *this; }
773
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); }
774
ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); }
775
ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); }
776
};
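// example usage (hypothetical coords value): clamp packed s16 coordinates to 0..255.
//   const GSVector2i clamped = coords.sat_s16(GSVector2i::cxpr16(0), GSVector2i::cxpr16(255));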
777
778
class alignas(16) GSVector2
779
{
780
struct cxpr_init_tag
781
{
782
};
783
static constexpr cxpr_init_tag cxpr_init{};
784
785
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
786
787
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
788
789
public:
790
union
791
{
792
struct
793
{
794
float x, y;
795
};
796
struct
797
{
798
float r, g;
799
};
800
float F32[2];
801
double F64[1];
802
s8 I8[8];
803
s16 I16[4];
804
s32 I32[2];
805
s64 I64[1];
806
u8 U8[8];
807
u16 U16[4];
808
u32 U32[2];
809
u64 U64[1];
810
float32x2_t v2s;
811
};
812
813
GSVector2() = default;
814
815
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
816
817
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
818
819
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
820
821
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
822
823
ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {}
824
825
ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {}
826
827
ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {}
828
829
ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); }
830
831
ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); }
832
833
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
834
835
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
836
837
ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); }
838
839
ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; }
840
841
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
842
843
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
844
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
845
846
#ifdef CPU_ARCH_ARM64
847
848
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
849
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
850
851
#else
852
853
ALWAYS_INLINE GSVector2 floor() const
854
{
855
return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
856
}
857
858
ALWAYS_INLINE GSVector2 ceil() const
859
{
860
return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
861
}
862
863
#endif
864
865
ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
866
867
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
868
869
ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
870
871
ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }
872
873
ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }
874
875
template<int mask>
876
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
877
{
878
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
879
}
880
881
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
882
{
883
// duplicate sign bit across and bit select
884
const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
885
return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
886
}
887
888
ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
889
{
890
return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
891
}
892
893
ALWAYS_INLINE int mask() const
894
{
895
const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
896
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
897
}
898
899
ALWAYS_INLINE bool alltrue() const
900
{
901
return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
902
}
903
904
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); }
905
906
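// replaces NaN lanes with the corresponding lane from v (x == x is false only for NaN).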
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
907
908
template<int src, int dst>
909
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
910
{
911
#ifdef CPU_ARCH_ARM64
912
return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
913
#else
914
return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
915
#endif
916
}
917
918
template<int i>
919
ALWAYS_INLINE int extract32() const
920
{
921
return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
922
}
923
924
ALWAYS_INLINE float dot(const GSVector2& v) const
925
{
926
#ifdef CPU_ARCH_ARM64
927
return vaddv_f32(vmul_f32(v2s, v.v2s));
928
#else
929
const float32x2_t dp = vmul_f32(v2s, v.v2s);
930
return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
931
#endif
932
}
933
934
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
935
936
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
937
938
template<bool aligned>
939
ALWAYS_INLINE static GSVector2 load(const void* p)
940
{
941
#ifdef CPU_ARCH_ARM32
942
if constexpr (!aligned)
943
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
944
#endif
945
946
return GSVector2(vld1_f32(static_cast<const float*>(p)));
947
}
948
949
template<bool aligned>
950
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
951
{
952
#ifdef CPU_ARCH_ARM32
953
if constexpr (!aligned)
954
{
955
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
956
return;
957
}
958
#endif
959
960
vst1_f32(static_cast<float*>(p), v.v2s);
961
}
962
963
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
964
965
ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
966
ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
967
ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
968
ALWAYS_INLINE void operator/=(const GSVector2& v)
969
{
970
#ifdef CPU_ARCH_ARM64
971
v2s = vdiv_f32(v2s, v.v2s);
972
#else
973
*this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
974
#endif
975
}
976
977
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
978
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
979
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
980
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
981
982
ALWAYS_INLINE void operator&=(const GSVector2& v)
983
{
984
v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
985
}
986
987
ALWAYS_INLINE void operator|=(const GSVector2& v)
988
{
989
v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
990
}
991
992
ALWAYS_INLINE void operator^=(const GSVector2& v)
993
{
994
v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
995
}
996
997
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
998
{
999
return GSVector2(vadd_f32(v1.v2s, v2.v2s));
1000
}
1001
1002
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
1003
{
1004
return GSVector2(vsub_f32(v1.v2s, v2.v2s));
1005
}
1006
1007
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
1008
{
1009
return GSVector2(vmul_f32(v1.v2s, v2.v2s));
1010
}
1011
1012
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
1013
{
1014
#ifdef CPU_ARCH_ARM64
1015
return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
1016
#else
1017
return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
1018
vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
1019
#endif
1020
}
1021
1022
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
1023
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
1024
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
1025
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
1026
1027
ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
1028
{
1029
return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1030
}
1031
1032
ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
1033
{
1034
return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1035
}
1036
1037
ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
1038
{
1039
return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1040
}
1041
1042
ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
1043
{
1044
return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s)));
1045
}
1046
1047
ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
1048
{
1049
// NEON has no !=
1050
return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s))));
1051
}
1052
1053
ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
1054
{
1055
return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s)));
1056
}
1057
1058
ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
1059
{
1060
return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s)));
1061
}
1062
1063
ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
1064
{
1065
return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s)));
1066
}
1067
1068
ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
1069
{
1070
return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s)));
1071
}
1072
1073
ALWAYS_INLINE GSVector2 xy() const { return *this; }
1074
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); }
1075
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); }
1076
ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); }
1077
};
1078
1079
class alignas(16) GSVector4i
1080
{
1081
struct cxpr_init_tag
1082
{
1083
};
1084
static constexpr cxpr_init_tag cxpr_init{};
1085
1086
constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
1087
1088
constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1089
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1090
{
1091
}
1092
1093
constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
1094
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1095
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1096
{
1097
}
1098
1099
public:
1100
union
1101
{
1102
struct
1103
{
1104
int x, y, z, w;
1105
};
1106
struct
1107
{
1108
int r, g, b, a;
1109
};
1110
struct
1111
{
1112
int left, top, right, bottom;
1113
};
1114
float F32[4];
1115
s8 S8[16];
1116
s16 S16[8];
1117
s32 S32[4];
1118
s64 S64[2];
1119
u8 U8[16];
1120
u16 U16[8];
1121
u32 U32[4];
1122
u64 U64[2];
1123
int32x4_t v4s;
1124
};
1125
1126
GSVector4i() = default;
1127
1128
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
1129
{
1130
return GSVector4i(cxpr_init, x, y, z, w);
1131
}
1132
1133
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
1134
1135
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
1136
1137
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1138
{
1139
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
1140
}
1141
1142
ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
1143
s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1144
{
1145
return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
1146
}
1147
1148
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
1149
: v4s(vsetq_lane_s32(w, vsetq_lane_s32(z, vsetq_lane_s32(y, vdupq_n_s32(x), 1), 2), 3))
1150
{
1151
}
1152
1153
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1154
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1155
{
1156
}
1157
1158
constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12,
1159
s8 b13, s8 b14, s8 b15)
1160
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1161
{
1162
}
1163
1164
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : v4s(vcombine_s32(v.v2s, vcreate_s32(0))) {}
1165
1166
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
1167
1168
ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
1169
1170
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v) : v4s(vcombine_s32(vcvt_s32_f32(v.v2s), vcreate_s32(0))) {}
1171
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
1172
1173
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
1174
1175
ALWAYS_INLINE void operator=(s32 i) { v4s = vdupq_n_s32(i); }
1176
1177
ALWAYS_INLINE operator int32x4_t() const { return v4s; }
1178
1179
// rect
1180
1181
ALWAYS_INLINE s32 width() const { return right - left; }
1182
ALWAYS_INLINE s32 height() const { return bottom - top; }
1183
1184
ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
1185
1186
ALWAYS_INLINE bool rempty() const
1187
{
1188
// !any((x, y) < (z, w)) i.e. !not_empty
1189
return (vget_lane_u64(vreinterpret_u64_u32(vclt_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) !=
1190
0xFFFFFFFFFFFFFFFFULL);
1191
}
1192
1193
ALWAYS_INLINE bool rvalid() const
1194
{
1195
// !all((x, y) >= (z, w))
1196
return (vget_lane_u64(vreinterpret_u64_u32(vcge_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == 0);
1197
}
1198
1199
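// union of two rects: (min.x, min.y, max.z, max.w).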
ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_s32(a).upl64(max_s32(a).srl<8>()); }
1200
1201
ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_s32(a); }
1202
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
1203
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
1204
1205
ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }
1206
1207
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
1208
{
1209
return max_s8(min).min_s8(max);
1210
}
1211
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
1212
{
1213
return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
1214
}
1215
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
1216
{
1217
return max_s16(min).min_s16(max);
1218
}
1219
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
1220
{
1221
return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
1222
}
1223
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
1224
{
1225
return max_s32(min).min_s32(max);
1226
}
1227
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
1228
{
1229
return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
1230
}
1231
1232
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
1233
{
1234
return max_u8(min).min_u8(max);
1235
}
1236
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
1237
{
1238
return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
1239
}
1240
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
1241
{
1242
return max_u16(min).min_u16(max);
1243
}
1244
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
1245
{
1246
return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
1247
}
1248
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
1249
{
1250
return max_u32(min).min_u32(max);
1251
}
1252
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
1253
{
1254
return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
1255
}
1256
1257
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const
1258
{
1259
return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1260
}
1261
1262
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const
1263
{
1264
return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1265
}
1266
1267
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const
1268
{
1269
return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1270
}
1271
1272
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const
1273
{
1274
return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1275
}
1276
1277
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }
1278
1279
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }
1280
1281
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
1282
{
1283
return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1284
}
1285
1286
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
1287
{
1288
return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1289
}
1290
1291
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
1292
{
1293
return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1294
}
1295
1296
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
1297
{
1298
return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1299
}
1300
1301
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
1302
{
1303
return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1304
}
1305
1306
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
1307
{
1308
return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1309
}
1310
1311
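// multiplies corresponding s16 lanes and sums adjacent pairs into s32 lanes (PMADDWD equivalent).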
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
1312
{
1313
#ifdef CPU_ARCH_ARM64
1314
const int32x4_t lo =
1315
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1316
const int32x4_t hi =
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
return GSVector4i(vpaddq_s32(lo, hi));
1317
#else
1318
// borrowed from sse2neon
1319
const int32x4_t low =
1320
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1321
const int32x4_t high =
1322
vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1323
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
1324
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
1325
#endif
1326
}
1327
1328
ALWAYS_INLINE GSVector4i addp_s32() const
1329
{
1330
#ifdef CPU_ARCH_ARM64
1331
return GSVector4i(vpaddq_s32(v4s, v4s));
1332
#else
1333
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1334
return GSVector4i(vcombine_s32(res, res));
1335
#endif
1336
}
1337
1338
ALWAYS_INLINE s32 addv_s32() const
1339
{
1340
#ifdef CPU_ARCH_ARM64
1341
return vaddvq_s32(v4s);
1342
#else
1343
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1344
return vget_lane_s32(res, 0) + vget_lane_s32(res, 1);
1345
#endif
1346
}
1347
1348
#ifdef CPU_ARCH_ARM64
1349
1350
ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
1351
1352
ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); }
1353
1354
ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); }
1355
1356
ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); }
1357
1358
ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }
1359
1360
ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }
1361
1362
ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }
1363
1364
ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }
1365
1366
#else
1367
1368
ALWAYS_INLINE u8 minv_u8() const
1369
{
1370
uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1371
vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
1372
return static_cast<u8>(
1373
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
1374
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
1375
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
1376
}
1377
1378
ALWAYS_INLINE u16 maxv_u8() const
1379
{
1380
uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1381
vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
1382
return static_cast<u8>(
1383
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
1384
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
1385
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
1386
}
1387
1388
ALWAYS_INLINE u16 minv_u16() const
1389
{
1390
uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1391
vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
1392
return static_cast<u16>(
1393
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
1394
}
1395
1396
ALWAYS_INLINE u16 maxv_u16() const
1397
{
1398
uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1399
vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
1400
return static_cast<u16>(
1401
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
1402
}
1403
1404
ALWAYS_INLINE s32 minv_s32() const
1405
{
1406
int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1407
return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
1408
}
1409
1410
ALWAYS_INLINE u32 minv_u32() const
1411
{
1412
uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1413
return std::min<u32>(vget_lane_u32(vmin, 0), vget_lane_u32(vmin, 1));
1414
}
1415
1416
ALWAYS_INLINE s32 maxv_s32() const
1417
{
1418
int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1419
return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
1420
}
1421
1422
ALWAYS_INLINE u32 maxv_u32() const
1423
{
1424
uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1425
return std::max<u32>(vget_lane_u32(vmax, 0), vget_lane_u32(vmax, 1));
1426
}
1427
1428
#endif
1429
1430
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
1431
1432
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
1433
{
1434
uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
1435
return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
1436
}
1437
1438
template<int mask>
1439
ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
1440
{
1441
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(
1442
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s), ((mask & 0x01) == 0) ? 0 : 8,
1443
((mask & 0x02) == 0) ? 1 : 9, ((mask & 0x04) == 0) ? 2 : 10, ((mask & 0x08) == 0) ? 3 : 11,
1444
((mask & 0x10) == 0) ? 4 : 12, ((mask & 0x20) == 0) ? 5 : 13, ((mask & 0x40) == 0) ? 6 : 14,
1445
((mask & 0x80) == 0) ? 7 : 15)));
1446
}
1447
1448
template<int mask>
1449
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1450
{
1451
return GSVector4i(__builtin_shufflevector(v4s, v.v4s, ((mask & 1) == 0) ? 0 : 4, ((mask & 2) == 0) ? 1 : 5,
1452
((mask & 4) == 0) ? 2 : 6, ((mask & 8) == 0) ? 3 : 7));
1453
}
1454
1455
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1456
{
1457
return GSVector4i(
1458
vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
1459
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
1460
}
1461
1462
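// per-byte table lookup (PSHUFB equivalent); A32 has no 128-bit tbl, so two vtbl2 lookups are used instead.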
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
1463
{
1464
#ifdef CPU_ARCH_ARM64
1465
return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
1466
#else
1467
int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
1468
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
1469
vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
1470
#endif
1471
}
1472
1473
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
1474
{
1475
return GSVector4i(vreinterpretq_s32_s8(
1476
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
1477
}
1478
1479
ALWAYS_INLINE GSVector4i ps16() const
1480
{
1481
return GSVector4i(vreinterpretq_s32_s8(
1482
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
1483
}
1484
1485
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
1486
{
1487
return GSVector4i(vreinterpretq_s32_u8(
1488
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
1489
}
1490
1491
ALWAYS_INLINE GSVector4i pu16() const
1492
{
1493
return GSVector4i(vreinterpretq_s32_u8(
1494
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
1495
}
1496
1497
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
1498
{
1499
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
1500
}
1501
1502
ALWAYS_INLINE GSVector4i ps32() const
1503
{
1504
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
1505
}
1506
1507
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1508
{
1509
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
1510
}
1511
1512
ALWAYS_INLINE GSVector4i pu32() const
1513
{
1514
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
1515
}
1516
1517
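// upl*/uph* interleave the low/high halves with v (PUNPCKL*/PUNPCKH* analogues); the no-argument forms interleave with zero.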
#ifdef CPU_ARCH_ARM64
1518
1519
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1520
{
1521
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1522
}
1523
1524
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1525
{
1526
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1527
}
1528
1529
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1530
{
1531
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1532
}
1533
1534
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1535
{
1536
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1537
}
1538
1539
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
1540
1541
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
1542
1543
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1544
{
1545
return GSVector4i(vreinterpretq_s32_s64(
1546
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1547
}
1548
1549
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1550
{
1551
return GSVector4i(vreinterpretq_s32_s64(
1552
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1553
}
1554
1555
ALWAYS_INLINE GSVector4i upl8() const
1556
{
1557
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1558
}
1559
1560
ALWAYS_INLINE GSVector4i uph8() const
1561
{
1562
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1563
}
1564
1565
ALWAYS_INLINE GSVector4i upl16() const
1566
{
1567
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1568
}
1569
1570
ALWAYS_INLINE GSVector4i uph16() const
1571
{
1572
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1573
}
1574
1575
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
1576
1577
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
1578
1579
ALWAYS_INLINE GSVector4i upl64() const
1580
{
1581
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1582
}
1583
1584
ALWAYS_INLINE GSVector4i uph64() const
1585
{
1586
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1587
}
1588
1589
#else
1590
1591
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1592
{
1593
const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
1594
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1595
}
1596
1597
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1598
{
1599
const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
1600
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1601
}
1602
1603
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1604
{
1605
const int16x4x2_t res =
1606
vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1607
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1608
}
1609
1610
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1611
{
1612
const int16x4x2_t res =
1613
vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1614
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1615
}
1616
1617
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
1618
{
1619
const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
1620
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1621
}
1622
1623
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
1624
{
1625
const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
1626
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1627
}
1628
1629
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1630
{
1631
return GSVector4i(vreinterpretq_s32_s64(
1632
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1633
}
1634
1635
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1636
{
1637
return GSVector4i(vreinterpretq_s32_s64(
1638
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1639
}
1640
1641
ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
1642
1643
ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
1644
1645
ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
1646
1647
ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
1648
1649
ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
1650
1651
ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
1652
1653
ALWAYS_INLINE GSVector4i upl64() const
1654
{
1655
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1656
}
1657
1658
ALWAYS_INLINE GSVector4i uph64() const
1659
{
1660
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1661
}
1662
#endif
1663
1664
ALWAYS_INLINE GSVector4i s8to16() const
1665
{
1666
return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
1667
}
1668
1669
ALWAYS_INLINE GSVector4i u8to16() const
1670
{
1671
return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
1672
}
1673
1674
ALWAYS_INLINE GSVector4i s8to32() const
1675
{
1676
return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
1677
}
1678
1679
ALWAYS_INLINE GSVector4i u8to32() const
1680
{
1681
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
1682
}
1683
1684
ALWAYS_INLINE GSVector4i s8to64() const
1685
{
1686
return GSVector4i(vreinterpretq_s32_s64(
1687
vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
1688
}
1689
1690
ALWAYS_INLINE GSVector4i u8to64() const
1691
{
1692
return GSVector4i(vreinterpretq_s32_u64(
1693
vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
1694
}
1695
1696
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
1697
1698
ALWAYS_INLINE GSVector4i u16to32() const
1699
{
1700
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
1701
}
1702
1703
ALWAYS_INLINE GSVector4i s16to64() const
1704
{
1705
return GSVector4i(
1706
vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
1707
}
1708
1709
ALWAYS_INLINE GSVector4i u16to64() const
1710
{
1711
return GSVector4i(
1712
vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
1713
}
1714
1715
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
1716
1717
ALWAYS_INLINE GSVector4i u32to64() const
1718
{
1719
return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
1720
}
1721
1722
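// 128-bit whole-register byte shifts (SSE pslldq/psrldq equivalents): NEON has no single
// instruction for this, so vextq_s8 against a zero vector slides the byte window instead.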
template<int i>
1723
ALWAYS_INLINE GSVector4i srl() const
1724
{
1725
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
1726
}
1727
1728
template<int i>
1729
ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
1730
{
1731
if constexpr (i >= 16)
1732
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
1733
else
1734
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
1735
}
1736
1737
template<int i>
1738
ALWAYS_INLINE GSVector4i sll() const
1739
{
1740
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
1741
}
1742
1743
template<int i>
1744
ALWAYS_INLINE GSVector4i sll16() const
1745
{
1746
return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
1747
}
1748
1749
ALWAYS_INLINE GSVector4i sll16(s32 i) const
1750
{
1751
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
1752
}
1753
1754
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
1755
{
1756
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1757
}
1758
1759
template<int i>
1760
ALWAYS_INLINE GSVector4i srl16() const
1761
{
1762
return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1763
}
1764
1765
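// NEON only has left-shift-by-register; a negative per-lane count in vshlq_* shifts right,
// so the run-time and per-lane right shifts below negate the count.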
ALWAYS_INLINE GSVector4i srl16(s32 i) const
{
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
}
1769
1770
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
{
return GSVector4i(
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
}
1775
1776
template<int i>
1777
ALWAYS_INLINE GSVector4i sra16() const
1778
{
1779
constexpr int count = (i & ~15) ? 15 : i;
1780
return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
1781
}
1782
1783
ALWAYS_INLINE GSVector4i sra16(s32 i) const
1784
{
1785
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
1786
}
1787
1788
ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
{
return GSVector4i(
vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
}
1793
1794
template<int i>
1795
ALWAYS_INLINE GSVector4i sll32() const
1796
{
1797
return GSVector4i(vshlq_n_s32(v4s, i));
1798
}
1799
1800
ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
1801
1802
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
1803
1804
template<int i>
1805
ALWAYS_INLINE GSVector4i srl32() const
1806
{
1807
return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
1808
}
1809
1810
ALWAYS_INLINE GSVector4i srl32(s32 i) const
1811
{
1812
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
1813
}
1814
1815
ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
1816
{
1817
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
1818
}
1819
1820
template<int i>
1821
ALWAYS_INLINE GSVector4i sra32() const
1822
{
1823
return GSVector4i(vshrq_n_s32(v4s, i));
1824
}
1825
1826
ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
1827
1828
ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const
{
return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s)));
}
1832
1833
template<int i>
1834
ALWAYS_INLINE GSVector4i sll64() const
1835
{
1836
return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
1837
}
1838
1839
ALWAYS_INLINE GSVector4i sll64(s32 i) const
{
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
}
1843
1844
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
1845
{
1846
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1847
}
1848
1849
template<int i>
1850
ALWAYS_INLINE GSVector4i srl64() const
1851
{
1852
return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1853
}
1854
1855
ALWAYS_INLINE GSVector4i srl64(s32 i) const
{
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
}
1859
1860
#ifdef CPU_ARCH_ARM64
1861
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
1862
{
1863
return GSVector4i(
1864
vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
1865
}
1866
#endif
1867
1868
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
1869
{
1870
return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1871
}
1872
1873
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
1874
{
1875
return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1876
}
1877
1878
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
1879
1880
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
1881
{
1882
return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1883
}
1884
1885
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
1886
{
1887
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1888
}
1889
1890
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
1891
{
1892
// can't use vpaddq_s16() here, because we need saturation.
1893
// return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1894
const int16x8_t a = vreinterpretq_s16_s32(v4s);
1895
const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
1896
#ifdef CPU_ARCH_ARM64
1897
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
1898
#else
1899
// sse2neon again
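// gather the even (0,2,4,6) and odd (1,3,5,7) 16-bit elements of both vectors, then
// saturating-add the two to form the horizontal pairwise sums.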
1900
int16x8_t ab0246 = vcombine_s16(vmovn_s32(vreinterpretq_s32_s16(a)), vmovn_s32(vreinterpretq_s32_s16(b)));
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(vreinterpretq_s32_s16(a), 16), vshrn_n_s32(vreinterpretq_s32_s16(b), 16));
1902
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
1903
#endif
1904
}
1905
1906
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
1907
{
1908
return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1909
}
1910
1911
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
1912
{
1913
return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1914
}
1915
1916
ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
1917
{
1918
return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1919
}
1920
1921
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
1922
{
1923
return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1924
}
1925
1926
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
1927
1928
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
1929
{
1930
return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1931
}
1932
1933
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
1934
{
1935
return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1936
}
1937
1938
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
1939
{
1940
return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1941
}
1942
1943
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
1944
{
1945
return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1946
}
1947
1948
ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
1949
{
1950
return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1951
}
1952
1953
ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
1954
{
1955
return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1956
}
1957
1958
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
1959
{
1960
// from sse2neon
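// widen to 32-bit products, then de-interleave the 16-bit halves; val[1] keeps the upper
// 16 bits of each product, matching _mm_mulhi_epi16.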
1961
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
1962
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
1963
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
1964
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
1965
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
1966
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
1967
uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
1968
return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
1969
}
1970
1971
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
1972
{
1973
return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1974
}
1975
1976
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
1977
{
1978
int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1979
int32x4_t mul_hi =
1980
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1981
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
1982
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
1983
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
1984
}
1985
1986
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
1987
1988
ALWAYS_INLINE bool eq(const GSVector4i& v) const
1989
{
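// the two vectors are equal iff the XOR of every lane is zero.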
1990
const int32x4_t res = veorq_s32(v4s, v.v4s);
1991
#ifdef CPU_ARCH_ARM64
1992
return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
1993
#else
1994
const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
1995
return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
1996
#endif
1997
}
1998
1999
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
2000
{
2001
return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2002
}
2003
2004
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
2005
{
2006
return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2007
}
2008
2009
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
2010
{
2011
return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
2012
}
2013
2014
#ifdef CPU_ARCH_ARM64
2015
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
2016
{
2017
return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
2018
}
2019
#endif
2020
2021
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
2022
2023
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
2024
2025
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
2026
2027
ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
2028
{
2029
return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2030
}
2031
2032
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
2033
{
2034
return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2035
}
2036
2037
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); }
2038
2039
ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
2040
{
2041
return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2042
}
2043
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
2044
{
2045
return GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2046
}
2047
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); }
2048
2049
ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
2050
{
2051
return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2052
}
2053
2054
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
2055
{
2056
return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2057
}
2058
2059
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); }
2060
2061
ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
2062
{
2063
return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2064
}
2065
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
2066
{
2067
return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2068
}
2069
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); }
2070
2071
ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
2072
2073
ALWAYS_INLINE int mask() const
2074
{
2075
// borrowed from sse2neon
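// each vsra step folds the per-byte sign bits into progressively wider groups (16/32/64-bit),
// leaving one mask byte per 64-bit half; the two bytes form the 16-bit _mm_movemask_epi8 result.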
2076
const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
2077
const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2078
const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2079
const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2080
return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
2081
}
2082
2083
ALWAYS_INLINE bool alltrue() const
2084
{
2085
#ifdef CPU_ARCH_ARM64
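// all lanes are 0xFFFFFFFF iff the unsigned minimum across lanes is 0xFFFFFFFF.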
2086
return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
2087
#else
2088
return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
2089
UINT64_C(0xFFFFFFFFFFFFFFFF));
2090
#endif
2091
}
2092
2093
ALWAYS_INLINE bool allfalse() const
2094
{
2095
#ifdef CPU_ARCH_ARM64
2096
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
2097
#else
2098
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0));
2099
#endif
2100
}
2101
2102
template<int i>
2103
ALWAYS_INLINE GSVector4i insert8(int a) const
2104
{
2105
return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i))));
2106
}
2107
2108
template<int i>
2109
ALWAYS_INLINE int extract8() const
2110
{
2111
return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
2112
}
2113
2114
template<int i>
2115
ALWAYS_INLINE GSVector4i insert16(int a) const
2116
{
2117
return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i))));
2118
}
2119
2120
template<int i>
2121
ALWAYS_INLINE int extract16() const
2122
{
2123
return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
2124
}
2125
2126
template<int i>
2127
ALWAYS_INLINE GSVector4i insert32(int a) const
2128
{
2129
return GSVector4i(vsetq_lane_s32(a, v4s, i));
2130
}
2131
2132
template<int i>
2133
ALWAYS_INLINE int extract32() const
2134
{
2135
return vgetq_lane_s32(v4s, i);
2136
}
2137
2138
template<int i>
2139
ALWAYS_INLINE GSVector4i insert64(s64 a) const
2140
{
2141
return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
2142
}
2143
2144
template<int i>
2145
ALWAYS_INLINE s64 extract64() const
2146
{
2147
return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
2148
}
2149
2150
#ifdef CPU_ARCH_ARM64
2151
ALWAYS_INLINE GSVector4i tbl2(const GSVector4i& a, const GSVector4i& b, const GSVector4i& idx)
2152
{
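// vqtbx2q_u8: index bytes 0..15 select from a, 16..31 from b, and out-of-range indices
// leave the corresponding byte of *this unchanged.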
2153
return GSVector4i(vreinterpretq_s32_u8(
2154
vqtbx2q_u8(vreinterpretq_u8_s32(v4s), uint8x16x2_t{vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(b.v4s)},
2155
vreinterpretq_u8_s32(idx.v4s))));
2156
}
2157
#endif
2158
2159
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
2160
{
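// non-temporal hint only; on ARM this is expected to lower to an ordinary 128-bit load.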
2161
#if __has_builtin(__builtin_nontemporal_load)
2162
return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
2163
#else
2164
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2165
#endif
2166
}
2167
2168
ALWAYS_INLINE static GSVector4i load32(const void* p)
2169
{
2170
// should be ldr s0, [x0]
2171
u32 val;
2172
std::memcpy(&val, p, sizeof(u32));
2173
return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0));
2174
}
2175
2176
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
2177
2178
template<bool aligned>
2179
ALWAYS_INLINE static GSVector4i loadl(const void* p)
2180
{
2181
#ifdef CPU_ARCH_ARM32
2182
if constexpr (!aligned)
2183
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
2184
#endif
2185
2186
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
2187
}
2188
2189
ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v) { return GSVector4i(vcombine_s32(v.v2s, vcreate_s32(0))); }
2190
2191
template<bool aligned>
2192
ALWAYS_INLINE static GSVector4i loadh(const void* p)
2193
{
2194
#ifdef CPU_ARCH_ARM32
2195
if constexpr (!aligned)
2196
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2197
#endif
2198
2199
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2200
}
2201
2202
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); }
2203
2204
template<bool aligned>
2205
ALWAYS_INLINE static GSVector4i load(const void* p)
2206
{
2207
#ifdef CPU_ARCH_ARM32
2208
if constexpr (!aligned)
2209
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
2210
#endif
2211
2212
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2213
}
2214
2215
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
2216
{
2217
#if __has_builtin(__builtin_nontemporal_store)
2218
__builtin_nontemporal_store(v.v4s, static_cast<int32x4_t*>(p));
2219
#else
2220
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2221
#endif
2222
}
2223
2224
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
2225
{
2226
u32 val = vgetq_lane_s32(v, 0);
2227
std::memcpy(p, &val, sizeof(u32));
2228
}
2229
2230
template<bool aligned>
2231
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
2232
{
2233
#ifdef CPU_ARCH_ARM32
2234
if constexpr (!aligned)
2235
{
2236
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
2237
return;
2238
}
2239
#endif
2240
2241
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
2242
}
2243
2244
template<bool aligned>
2245
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
2246
{
2247
#ifdef CPU_ARCH_ARM32
2248
if constexpr (!aligned)
2249
{
2250
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
2251
return;
2252
}
2253
#endif
2254
2255
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
2256
}
2257
2258
template<bool aligned>
2259
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
2260
{
2261
#ifdef CPU_ARCH_ARM32
2262
if constexpr (!aligned)
2263
{
2264
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
2265
return;
2266
}
2267
#endif
2268
2269
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2270
}
2271
2272
ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }
2273
2274
template<bool aligned>
2275
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
2276
{
2277
return load<aligned>(v);
2278
}
2279
2280
ALWAYS_INLINE void operator&=(const GSVector4i& v)
2281
{
2282
v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2283
}
2284
2285
ALWAYS_INLINE void operator|=(const GSVector4i& v)
2286
{
2287
v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2288
}
2289
2290
ALWAYS_INLINE void operator^=(const GSVector4i& v)
2291
{
2292
v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2293
}
2294
2295
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
2296
{
2297
return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2298
}
2299
2300
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
2301
{
2302
return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2303
}
2304
2305
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
2306
{
2307
return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2308
}
2309
2310
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
2311
2312
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
2313
2314
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
2315
2316
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
2317
2318
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
2319
2320
ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
2321
2322
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
2323
2324
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
2325
{
2326
return GSVector4i(vcombine_s32(xy.v2s, zw.v2s));
2327
}
2328
2329
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw) { return GSVector4i(vcombine_s32(xyzw.v2s, xyzw.v2s)); }
2330
2331
static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);
2332
2333
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); }
2334
2335
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
2336
2337
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
2338
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
2339
{ \
2340
return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
2341
} \
2342
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
2343
{ \
2344
return GSVector4i(vreinterpretq_s32_s16( \
2345
__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), xn, yn, zn, wn, 4, 5, 6, 7))); \
2346
} \
2347
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
2348
{ \
2349
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector( \
2350
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 4 + xn, 4 + yn, 4 + zn, 4 + wn))); \
2351
}
2352
2353
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2354
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
2355
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
2356
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
2357
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
2358
2359
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
2360
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
2361
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
2362
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
2363
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
2364
2365
#define VECTOR4i_SHUFFLE_1(xs, xn) \
2366
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
2367
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
2368
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
2369
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
2370
2371
VECTOR4i_SHUFFLE_1(x, 0);
2372
VECTOR4i_SHUFFLE_1(y, 1);
2373
VECTOR4i_SHUFFLE_1(z, 2);
2374
VECTOR4i_SHUFFLE_1(w, 3);
2375
2376
#undef VECTOR4i_SHUFFLE_1
2377
#undef VECTOR4i_SHUFFLE_2
2378
#undef VECTOR4i_SHUFFLE_3
2379
#undef VECTOR4i_SHUFFLE_4
2380
};
2381
2382
class alignas(16) GSVector4
2383
{
2384
struct cxpr_init_tag
2385
{
2386
};
2387
static constexpr cxpr_init_tag cxpr_init{};
2388
2389
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
2390
2391
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
2392
2393
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
2394
2395
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
2396
2397
public:
2398
union
2399
{
2400
struct
2401
{
2402
float x, y, z, w;
2403
};
2404
struct
2405
{
2406
float r, g, b, a;
2407
};
2408
struct
2409
{
2410
float left, top, right, bottom;
2411
};
2412
float F32[4];
2413
double F64[2];
2414
s8 I8[16];
2415
s16 I16[8];
2416
s32 I32[4];
2417
s64 I64[2];
2418
u8 U8[16];
2419
u16 U16[8];
2420
u32 U32[4];
2421
u64 U64[2];
2422
float32x4_t v4s;
2423
};
2424
2425
GSVector4() = default;
2426
2427
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
2428
2429
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
2430
2431
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
2432
2433
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
2434
2435
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
2436
2437
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
2438
2439
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
2440
2441
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
2442
2443
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
2444
{
2445
const float arr[4] = {x, y, z, w};
2446
v4s = vld1q_f32(arr);
2447
}
2448
2449
ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
2450
2451
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
2452
{
2453
const int arr[4] = {x, y, z, w};
2454
v4s = vcvtq_f32_s32(vld1q_s32(arr));
2455
}
2456
2457
ALWAYS_INLINE GSVector4(int x, int y)
{
v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
}
2461
2462
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
2463
2464
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
2465
2466
ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
2467
2468
ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
2469
2470
ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
2471
2472
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
2473
2474
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
2475
2476
ALWAYS_INLINE static GSVector4 f64(double x, double y)
2477
{
2478
#ifdef CPU_ARCH_ARM64
2479
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
2480
#else
2481
GSVector4 ret;
2482
ret.F64[0] = x;
2483
ret.F64[1] = y;
2484
return ret;
2485
#endif
2486
}
2487
2488
ALWAYS_INLINE static GSVector4 f64(double x)
2489
{
2490
#ifdef CPU_ARCH_ARM64
2491
return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
2492
#else
2493
GSVector4 ret;
2494
ret.F64[0] = ret.F64[1] = x;
2495
return ret;
2496
#endif
2497
}
2498
2499
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
2500
2501
ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
2502
2503
ALWAYS_INLINE operator float32x4_t() const { return v4s; }
2504
2505
ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
2506
2507
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
2508
{
2509
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
2510
}
2511
2512
ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
2513
2514
ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
2515
2516
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
2517
2518
#ifdef CPU_ARCH_ARM64
2519
2520
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
2521
2522
ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
2523
2524
#else
2525
2526
ALWAYS_INLINE GSVector4 floor() const
2527
{
2528
return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
2529
std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
2530
}
2531
2532
ALWAYS_INLINE GSVector4 ceil() const
2533
{
2534
return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
2535
std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
2536
}
2537
2538
#endif
2539
2540
#ifdef CPU_ARCH_ARM64
2541
2542
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
2543
2544
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
2545
2546
ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
2547
2548
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2549
{
2550
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
2551
}
2552
2553
#else
2554
2555
ALWAYS_INLINE GSVector4 hadd() const
2556
{
2557
const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2558
return GSVector4(vcombine_f32(res, res));
2559
}
2560
2561
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
2562
{
2563
const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2564
const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
2565
return GSVector4(vcombine_f32(res1, res2));
2566
}
2567
2568
ALWAYS_INLINE GSVector4 hsub() const
{
const float32x4x2_t res = vuzpq_f32(v4s, v4s);
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
}
2573
2574
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2575
{
2576
const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
2577
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2578
}
2579
2580
#endif
2581
2582
ALWAYS_INLINE float dot(const GSVector4& v) const
2583
{
2584
#ifdef CPU_ARCH_ARM64
2585
return vaddvq_f32(vmulq_f32(v4s, v.v4s));
2586
#else
2587
const float32x4_t dp = vmulq_f32(v4s, v.v4s);
2588
float32x2_t tmp = vadd_f32(vget_low_f32(dp), vget_high_f32(dp)); // (x+z, y+w)
2589
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2590
#endif
2591
}
2592
2593
ALWAYS_INLINE float addv() const
2594
{
2595
#ifdef CPU_ARCH_ARM64
2596
return vaddvq_f32(v4s);
2597
#else
2598
float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
2599
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2600
#endif
2601
}
2602
2603
ALWAYS_INLINE float minv() const
2604
{
2605
#ifdef CPU_ARCH_ARM64
2606
return vminvq_f32(v4s);
2607
#else
2608
float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
2609
return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2610
#endif
2611
}
2612
2613
ALWAYS_INLINE float maxv() const
2614
{
2615
#ifdef CPU_ARCH_ARM64
2616
return vmaxvq_f32(v4s);
2617
#else
2618
float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
2619
return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2620
#endif
2621
}
2622
2623
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
2624
2625
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
2626
{
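// a packs the bounds as (min.x, min.y, max.x, max.y); broadcast its low/high 64-bit halves
// to form the per-component clamp vectors.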
2627
#ifdef CPU_ARCH_ARM64
2628
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
2629
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
2630
#else
2631
const GSVector4 minv(a.xyxy());
2632
const GSVector4 maxv(a.zwzw());
2633
#endif
2634
return sat(minv, maxv);
2635
}
2636
2637
ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2638
2639
ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2640
2641
ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
2642
2643
ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
2644
2645
template<int mask>
2646
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
2647
{
2648
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
2649
(mask & 8) ? 7 : 3));
2650
}
2651
2652
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
2653
{
2654
// duplicate sign bit across and bit select
2655
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
2656
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
2657
}
2658
2659
#ifdef CPU_ARCH_ARM64
2660
2661
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
2662
2663
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
2664
2665
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2666
{
2667
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2668
}
2669
2670
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2671
{
2672
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2673
}
2674
2675
#else
2676
2677
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
2678
{
2679
const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
2680
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2681
}
2682
2683
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
2684
{
2685
const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
2686
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2687
}
2688
2689
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2690
{
2691
return GSVector4(vreinterpretq_f32_s64(
2692
vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
2693
}
2694
2695
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2696
{
2697
return GSVector4(vreinterpretq_f32_s64(
2698
vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
2699
}
2700
2701
#endif
2702
2703
ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
2704
{
2705
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
2706
}
2707
2708
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
2709
{
2710
return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
2711
}
2712
2713
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
2714
{
2715
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
2716
}
2717
2718
ALWAYS_INLINE int mask() const
2719
{
2720
#ifdef CPU_ARCH_ARM64
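// move each lane's sign bit to bit 0, shift lane n left by n, then horizontal-add to build
// the 4-bit sign mask (_mm_movemask_ps equivalent).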
2721
static constexpr const int32_t shifts[] = {0, 1, 2, 3};
2722
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
2723
#else
2724
// sse2neon again
2725
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
2726
uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2727
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2728
#endif
2729
}
2730
2731
ALWAYS_INLINE bool alltrue() const
2732
{
2733
#ifdef CPU_ARCH_ARM64
2734
return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
2735
#else
2736
2737
return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2738
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2739
0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
2740
#endif
2741
}
2742
2743
ALWAYS_INLINE bool allfalse() const
2744
{
2745
#ifdef CPU_ARCH_ARM64
2746
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
2747
#else
2748
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2749
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2750
0) == UINT64_C(0));
2751
#endif
2752
}
2753
2754
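// comparing a value with itself is false only for NaN, so NaN lanes of *this are replaced
// with the corresponding lanes of v.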
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
2755
2756
template<int src, int dst>
2757
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
2758
{
2759
#ifdef CPU_ARCH_ARM64
2760
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
2761
#else
2762
return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
2763
#endif
2764
}
2765
2766
template<int i>
2767
ALWAYS_INLINE GSVector4 insert32(float v) const
2768
{
2769
return GSVector4(vsetq_lane_f32(v, v4s, i));
2770
}
2771
2772
template<int i>
2773
ALWAYS_INLINE float extract32() const
2774
{
2775
return vgetq_lane_f32(v4s, i);
2776
}
2777
2778
template<int dst>
2779
ALWAYS_INLINE GSVector4 insert64(double v) const
2780
{
2781
#ifdef CPU_ARCH_ARM64
2782
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
2783
#else
2784
GSVector4 ret(*this);
2785
ret.F64[dst] = v;
2786
return ret;
2787
#endif
2788
}
2789
2790
template<int src>
2791
ALWAYS_INLINE double extract64() const
2792
{
2793
#ifdef CPU_ARCH_ARM64
2794
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
2795
#else
2796
return F64[src];
2797
#endif
2798
}
2799
2800
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
2801
2802
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
2803
2804
template<bool aligned>
2805
ALWAYS_INLINE static GSVector4 loadl(const void* p)
2806
{
2807
#ifdef CPU_ARCH_ARM32
2808
if constexpr (!aligned)
2809
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
2810
#endif
2811
2812
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
2813
}
2814
2815
ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
2816
2817
template<bool aligned>
2818
ALWAYS_INLINE static GSVector4 load(const void* p)
2819
{
2820
#ifdef CPU_ARCH_ARM32
2821
if constexpr (!aligned)
2822
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
2823
#endif
2824
2825
return GSVector4(vld1q_f32((const float*)p));
2826
}
2827
2828
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
2829
2830
template<bool aligned>
2831
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
2832
{
2833
#ifdef CPU_ARCH_ARM32
2834
if constexpr (!aligned)
2835
{
2836
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
2837
return;
2838
}
2839
#endif
2840
2841
vst1_f32((float*)p, vget_low_f32(v.v4s));
2842
}
2843
2844
template<bool aligned>
2845
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
2846
{
2847
#ifdef CPU_ARCH_ARM32
2848
if constexpr (!aligned)
2849
{
2850
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
2851
return;
2852
}
2853
#endif
2854
2855
vst1_f32((float*)p, vget_high_f32(v.v4s));
2856
}
2857
2858
template<bool aligned>
2859
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
2860
{
2861
#ifdef CPU_ARCH_ARM32
2862
if constexpr (!aligned)
2863
{
2864
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
2865
return;
2866
}
2867
#endif
2868
2869
vst1q_f32((float*)p, v.v4s);
2870
}
2871
2872
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
2873
2874
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
2875
2876
ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
2877
ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
2878
ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
2879
ALWAYS_INLINE void operator/=(const GSVector4& v)
2880
{
2881
#ifdef CPU_ARCH_ARM64
2882
v4s = vdivq_f32(v4s, v.v4s);
2883
#else
2884
*this =
2885
GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
2886
vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
2887
#endif
2888
}
2889
2890
ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
2891
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
2892
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
2893
ALWAYS_INLINE void operator/=(float f)
2894
{
2895
#ifdef CPU_ARCH_ARM64
2896
*this /= GSVector4(f);
2897
#else
2898
*this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
2899
vgetq_lane_f32(v4s, 3) / f);
2900
#endif
2901
}
2902
2903
ALWAYS_INLINE void operator&=(const GSVector4& v)
2904
{
2905
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2906
}
2907
2908
ALWAYS_INLINE void operator|=(const GSVector4& v)
2909
{
2910
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2911
}
2912
2913
ALWAYS_INLINE void operator^=(const GSVector4& v)
2914
{
2915
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2916
}
2917
2918
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
2919
{
2920
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
2921
}
2922
2923
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
2924
{
2925
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
2926
}
2927
2928
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
2929
{
2930
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
2931
}
2932
2933
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
2934
{
2935
#ifdef CPU_ARCH_ARM64
2936
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
2937
#else
2938
return GSVector4(
2939
vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
2940
vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
2941
#endif
2942
}
2943
2944
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
2945
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
2946
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
2947
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
2948
{
2949
#ifdef CPU_ARCH_ARM64
2950
return v / GSVector4(f);
2951
#else
2952
return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
2953
vgetq_lane_f32(v.v4s, 3) / f);
2954
#endif
2955
}
2956
2957
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
2958
{
2959
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2960
}
2961
2962
ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
2963
{
2964
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2965
}
2966
2967
ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
2968
{
2969
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2970
}
2971
2972
ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
2973
{
2974
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
2975
}
2976
2977
ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
2978
{
2979
// NEON has no !=
2980
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
2981
}
2982
2983
ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
2984
{
2985
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
2986
}
2987
2988
ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
2989
{
2990
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
2991
}
2992
2993
ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
2994
{
2995
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
2996
}
2997
2998
ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
2999
{
3000
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
3001
}
3002
3003
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
3004
{
3005
#ifdef CPU_ARCH_ARM64
3006
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3007
#else
3008
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
3009
#endif
3010
}
3011
3012
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
3013
{
3014
#ifdef CPU_ARCH_ARM64
3015
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3016
#else
3017
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
3018
#endif
3019
}
3020
3021
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
3022
{
3023
#ifdef CPU_ARCH_ARM64
3024
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3025
#else
3026
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
3027
#endif
3028
}
3029
3030
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
3031
{
3032
#ifdef CPU_ARCH_ARM64
3033
return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3034
#else
3035
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
3036
#endif
3037
}
3038
3039
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
3040
{
3041
#ifdef CPU_ARCH_ARM64
3042
return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3043
#else
3044
GSVector4 ret;
3045
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3046
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3047
return ret;
3048
#endif
3049
}
3050
3051
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
3052
{
3053
#ifdef CPU_ARCH_ARM64
3054
return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3055
#else
3056
GSVector4 ret;
3057
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3058
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3059
return ret;
3060
#endif
3061
}
3062
3063
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
3064
{
3065
#ifdef CPU_ARCH_ARM64
3066
return GSVector4(vreinterpretq_f32_f64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3067
#else
3068
GSVector4 ret;
3069
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3070
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3071
return ret;
3072
#endif
3073
}
3074
3075
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
3076
{
3077
#ifdef CPU_ARCH_ARM64
3078
return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3079
#else
3080
GSVector4 ret;
3081
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3082
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3083
return ret;
3084
#endif
3085
}
3086
3087
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
3088
{
3089
#ifdef CPU_ARCH_ARM64
3090
return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3091
#else
3092
GSVector4 ret;
3093
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3094
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3095
return ret;
3096
#endif
3097
}
3098
3099
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
3100
{
3101
#ifdef CPU_ARCH_ARM64
3102
return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3103
#else
3104
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
3105
#endif
3106
}
3107
3108
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
3109
{
3110
#ifdef CPU_ARCH_ARM64
3111
return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3112
#else
3113
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
3114
#endif
3115
}
3116
3117
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
3118
3119
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
3120
3121
ALWAYS_INLINE GSVector4 sqrt64() const
3122
{
3123
#ifdef CPU_ARCH_ARM64
3124
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
3125
#else
3126
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
3127
#endif
3128
}
3129
3130
ALWAYS_INLINE GSVector4 sqr64() const
3131
{
3132
#ifdef CPU_ARCH_ARM64
3133
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
3134
#else
3135
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
3136
#endif
3137
}
3138
3139
ALWAYS_INLINE GSVector4 floor64() const
3140
{
3141
#ifdef CPU_ARCH_ARM64
3142
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
3143
#else
3144
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
3145
#endif
3146
}
3147
3148
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
3149
{
3150
#ifdef CPU_ARCH_ARM64
3151
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
3152
#else
3153
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
3154
#endif
3155
}
3156
3157
ALWAYS_INLINE static GSVector4 f32to64(const void* p)
3158
{
3159
#ifdef CPU_ARCH_ARM64
3160
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
3161
#else
3162
const float* fp = static_cast<const float*>(p);
3163
return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
3164
#endif
3165
}
3166
3167
ALWAYS_INLINE GSVector4i f64toi32() const
{
#ifdef CPU_ARCH_ARM64
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
#else
const s32 low = static_cast<s32>(F64[0]);
const s32 high = static_cast<s32>(F64[1]);
#endif
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}

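// xy()/zw() extract the low/high pair of float lanes as a GSVector2.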
ALWAYS_INLINE GSVector2 xy() const { return GSVector2(vget_low_f32(v4s)); }

ALWAYS_INLINE GSVector2 zw() const { return GSVector2(vget_high_f32(v4s)); }

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
{
return GSVector4(vcombine_f32(l.v2s, h.v2s));
}

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(vcombine_f32(l.v2s, l.v2s)); }

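// The VECTOR4_SHUFFLE_* macros below expand to all 4^4 = 256 four-component swizzles (e.g. xxxx(), zwxy()), each with
// a two-operand overload that takes its first two components from *this and its last two from the argument. For
// example (illustrative), a.zwxy() swaps a's low and high pairs, and a.xyxy(b) yields {a.x, a.y, b.x, b.y}.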
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
} \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4_SHUFFLE_2(xs, xn, w, 3);

VECTOR4_SHUFFLE_1(x, 0);
VECTOR4_SHUFFLE_1(y, 1);
VECTOR4_SHUFFLE_1(z, 2);
VECTOR4_SHUFFLE_1(w, 3);

#undef VECTOR4_SHUFFLE_1
#undef VECTOR4_SHUFFLE_2
#undef VECTOR4_SHUFFLE_3
#undef VECTOR4_SHUFFLE_4

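// broadcast32() splats the first float lane across all four lanes; broadcast64() splats a 64-bit element across both
// halves of the vector.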
ALWAYS_INLINE GSVector4 broadcast32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v4s, 0));
#else
return xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
#else
return v.xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }

ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
#else
return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
#endif
}
};

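// Out-of-line definitions of the conversion constructors and bit-cast helpers declared in the classes above; they can
// only be defined once both the integer and floating-point vector types are complete.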
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
v2s = vcvt_s32_f32(v.v2s);
}

ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
{
v2s = vcvt_f32_s32(v.v2s);
}

ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
return GSVector2i(vreinterpret_s32_f32(v.v2s));
}

ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
return GSVector2(vreinterpret_f32_s32(v.v2s));
}

ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
v4s = vcvtq_s32_f32(v.v4s);
}

ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
{
v4s = vcvtq_f32_s32(v.v4s);
}

ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
}

ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(vreinterpretq_f32_s32(v.v4s));
}