1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "common/intrin.h"
5
#include "common/types.h"
6
7
#include <algorithm>
8
#include <cmath>
9
#include <cstdint>
10
11
#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
12
#define GSVECTOR_HAS_SRLV 1
13
14
#ifdef CPU_ARCH_ARM64
15
// tbl2 with 128-bit vectors is not available in A32.
16
#define GSVECTOR_HAS_TBL2 1
17
#endif
18
19
class GSVector2;
20
class GSVector2i;
21
class GSVector4;
22
class GSVector4i;
23
24
class alignas(16) GSVector2i
25
{
26
struct cxpr_init_tag
27
{
28
};
29
static constexpr cxpr_init_tag cxpr_init{};
30
31
constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}
32
33
constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
34
35
constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
36
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
37
{
38
}
39
40
public:
41
union
42
{
43
struct
44
{
45
s32 x, y;
46
};
47
struct
48
{
49
s32 r, g;
50
};
51
float F32[2];
52
s8 S8[8];
53
s16 S16[4];
54
s32 S32[2];
55
s64 S64[1];
56
u8 U8[8];
57
u16 U16[4];
58
u32 U32[2];
59
u64 U64[1];
60
int32x2_t v2s;
61
};
62
63
GSVector2i() = default;
64
65
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
66
67
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
68
69
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
70
71
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
72
{
73
return GSVector2i(cxpr_init, s0, s1, s2, s3);
74
}
75
76
ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
77
{
78
return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
79
}
80
81
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }
82
83
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
84
85
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
86
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
87
{
88
}
89
90
ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }
91
92
ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
93
94
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
95
96
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
97
98
ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }
99
100
ALWAYS_INLINE operator int32x2_t() const { return v2s; }
101
102
ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
103
{
104
return max_s8(min).min_s8(max);
105
}
106
ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
107
{
108
return max_s16(min).min_s16(max);
109
}
110
ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
111
{
112
return max_s32(min).min_s32(max);
113
}
114
115
ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
116
{
117
return max_u8(min).min_u8(max);
118
}
119
ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
120
{
121
return max_u16(min).min_u16(max);
122
}
123
ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
124
{
125
return max_u32(min).min_u32(max);
126
}
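// Illustrative usage sketch (not part of the original header): sat_s32 clamps each lane
// to [min, max] by chaining the element-wise max/min helpers defined below, e.g.
//   const GSVector2i p(300, -20);
//   const GSVector2i clamped = p.sat_s32(GSVector2i::cxpr(0), GSVector2i::cxpr(255)); // -> (255, 0)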
127
128
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const
129
{
130
return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
131
}
132
133
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const
134
{
135
return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
136
}
137
138
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const
139
{
140
return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
141
}
142
143
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const
144
{
145
return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
146
}
147
148
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); }
149
150
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); }
151
152
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const
153
{
154
return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
155
}
156
157
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const
158
{
159
return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
160
}
161
162
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const
163
{
164
return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
165
}
166
167
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const
168
{
169
return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
170
}
171
172
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const
173
{
174
return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
175
}
176
177
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
178
{
179
return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
180
}
181
182
ALWAYS_INLINE s32 addv_s32() const
183
{
184
#ifdef CPU_ARCH_ARM64
185
return vaddv_s32(v2s);
186
#else
187
return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1);
188
#endif
189
}
190
191
#ifdef CPU_ARCH_ARM64
192
193
ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
194
195
ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
196
197
ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }
198
199
ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }
200
201
ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }
202
203
ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }
204
205
ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }
206
207
ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }
208
209
#else
210
211
ALWAYS_INLINE u8 minv_u8() const
212
{
213
uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
214
return static_cast<u8>(
215
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
216
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
217
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
218
}
219
220
ALWAYS_INLINE u16 maxv_u8() const
221
{
222
uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
223
return static_cast<u8>(
224
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
225
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
226
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
227
}
228
229
ALWAYS_INLINE u16 minv_u16() const
230
{
231
uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
232
return static_cast<u16>(
233
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
234
}
235
236
ALWAYS_INLINE u16 maxv_u16() const
237
{
238
uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
239
return static_cast<u16>(
240
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
241
}
242
243
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
244
245
ALWAYS_INLINE u32 minv_u32() const
246
{
247
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
248
}
249
250
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
251
252
ALWAYS_INLINE u32 maxv_u32() const
253
{
254
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
255
}
256
257
#endif
258
259
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
260
261
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
262
{
263
uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
264
return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
265
}
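// Usage sketch (illustrative only): blend8 selects per byte on the sign bit of the
// corresponding mask byte -- bytes where the mask byte is negative come from 'a', the
// rest from *this, roughly matching SSE's pblendvb:
//   dst = src.blend8(other, mask); // mask bytes 0x80..0xFF pick 'other', 0x00..0x7F keep 'src'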
266
267
template<int mask>
268
ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
269
{
270
static constexpr const uint16_t _mask[4] = {
271
((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
272
((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
273
return GSVector2i(
274
vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
275
}
276
277
template<int mask>
278
ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
279
{
280
constexpr int bit1 = ((mask & 2) * 3) << 1;
281
constexpr int bit0 = (mask & 1) * 3;
282
return blend16<bit1 | bit0>(v);
283
}
284
285
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
286
{
287
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
288
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
289
}
290
291
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
292
{
293
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s))));
294
}
295
296
ALWAYS_INLINE GSVector2i ps16() const
297
{
298
return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
299
}
300
301
ALWAYS_INLINE GSVector2i pu16() const
302
{
303
return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
304
}
305
306
ALWAYS_INLINE GSVector2i ps32() const
307
{
308
return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
309
}
310
311
ALWAYS_INLINE GSVector2i pu32() const
312
{
313
return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
314
}
315
316
#ifdef CPU_ARCH_ARM64
317
318
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
319
{
320
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
321
}
322
323
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
324
{
325
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
326
}
327
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }
328
329
ALWAYS_INLINE GSVector2i upl8() const
330
{
331
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
332
}
333
334
ALWAYS_INLINE GSVector2i upl16() const
335
{
336
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
337
}
338
339
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
340
341
#else
342
343
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
344
{
345
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
346
}
347
348
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
349
{
350
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
351
}
352
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
353
354
ALWAYS_INLINE GSVector2i upl8() const
355
{
356
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
357
}
358
359
ALWAYS_INLINE GSVector2i upl16() const
360
{
361
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
362
}
363
364
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
365
366
#endif
367
368
ALWAYS_INLINE GSVector2i s8to16() const
369
{
370
return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
371
}
372
373
ALWAYS_INLINE GSVector2i u8to16() const
374
{
375
return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
376
}
377
378
ALWAYS_INLINE GSVector2i s8to32() const
379
{
380
return GSVector2i(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s))))));
381
}
382
383
ALWAYS_INLINE GSVector2i u8to32() const
384
{
385
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))))));
386
}
387
388
ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(vget_low_s32(vmovl_s16(vreinterpret_s16_s32(v2s)))); }
389
390
ALWAYS_INLINE GSVector2i u16to32() const
391
{
392
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s32(v2s)))));
393
}
394
395
template<int i>
396
ALWAYS_INLINE GSVector2i srl() const
397
{
398
return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
399
}
400
401
template<int i>
402
ALWAYS_INLINE GSVector2i sll() const
403
{
404
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
405
}
406
407
template<int i>
408
ALWAYS_INLINE GSVector2i sll16() const
409
{
410
return GSVector2i(vreinterpret_s32_u16(vshl_n_u16(vreinterpret_u16_s32(v2s), i)));
411
}
412
413
ALWAYS_INLINE GSVector2i sll16(s32 i) const
414
{
415
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(i))));
416
}
417
418
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
419
{
420
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
421
}
422
423
template<int i>
424
ALWAYS_INLINE GSVector2i srl16() const
425
{
426
return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
427
}
428
429
ALWAYS_INLINE GSVector2i srl16(s32 i) const
430
{
431
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
432
}
433
434
ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
435
{
436
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
437
}
438
439
template<int i>
440
ALWAYS_INLINE GSVector2i sra16() const
441
{
442
constexpr int count = (i & ~15) ? 15 : i;
443
return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
444
}
445
446
ALWAYS_INLINE GSVector2i sra16(s32 i) const
447
{
448
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
449
}
450
451
ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
452
{
453
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
454
}
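// Worked example (illustrative): the non-immediate shifts are built on vshl_* with a
// negated count, since NEON only provides a "shift by signed amount" form. For a lane
// holding 0xF000: srl16(4) treats it as unsigned -> 0x0F00, while sra16(4)/srav16
// replicate the sign bit -> 0xFF00.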
455
456
template<int i>
457
ALWAYS_INLINE GSVector2i sll32() const
458
{
459
return GSVector2i(vshl_n_s32(v2s, i));
460
}
461
462
ALWAYS_INLINE GSVector2i sll32(s32 i) const
463
{
464
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(i))));
465
}
466
467
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const
468
{
469
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), v.v2s)));
470
}
471
472
template<int i>
473
ALWAYS_INLINE GSVector2i srl32() const
474
{
475
return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
476
}
477
478
ALWAYS_INLINE GSVector2i srl32(s32 i) const
479
{
480
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
481
}
482
483
ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
484
{
485
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
486
}
487
488
template<int i>
489
ALWAYS_INLINE GSVector2i sra32() const
490
{
491
return GSVector2i(vshr_n_s32(v2s, i));
492
}
493
494
ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }
495
496
ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s))); }
497
498
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
499
{
500
return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
501
}
502
503
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
504
{
505
return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
506
}
507
508
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }
509
510
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
511
{
512
return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
513
}
514
515
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
516
{
517
return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
518
}
519
520
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
521
{
522
return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
523
}
524
525
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
526
{
527
return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
528
}
529
530
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
531
{
532
return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
533
}
534
535
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
536
{
537
return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
538
}
539
540
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }
541
542
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
543
{
544
return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
545
}
546
547
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
548
{
549
return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
550
}
551
552
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
553
{
554
return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
555
}
556
557
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
558
{
559
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
560
}
561
562
ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
563
{
564
return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
565
}
566
567
ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
568
{
569
return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
570
}
571
572
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
573
{
574
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
575
}
576
577
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }
578
579
ALWAYS_INLINE bool eq(const GSVector2i& v) const
580
{
581
return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
582
}
583
584
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
585
{
586
return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
587
}
588
589
ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
590
{
591
return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
592
}
593
594
ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
595
{
596
return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
597
}
598
599
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
600
601
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
602
603
ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
604
605
ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
606
{
607
return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
608
}
609
610
ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
611
{
612
return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
613
}
614
615
ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const
616
{
617
return GSVector2i(vreinterpret_s32_u32(vcgt_s32(v2s, v.v2s)));
618
}
619
620
ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
621
{
622
return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
623
}
624
ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
625
{
626
return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
627
}
628
ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const
629
{
630
return GSVector2i(vreinterpret_s32_u32(vcge_s32(v2s, v.v2s)));
631
}
632
633
ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
634
{
635
return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
636
}
637
638
ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
639
{
640
return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
641
}
642
643
ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const
644
{
645
return GSVector2i(vreinterpret_s32_u32(vclt_s32(v2s, v.v2s)));
646
}
647
648
ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
649
{
650
return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
651
}
652
ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
653
{
654
return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
655
}
656
ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const
657
{
658
return GSVector2i(vreinterpret_s32_u32(vcle_s32(v2s, v.v2s)));
659
}
660
661
ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }
662
663
ALWAYS_INLINE int mask() const
664
{
665
// borrowed from sse2neon
666
const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
667
const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
668
const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
669
const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
670
return static_cast<int>(vget_lane_u8(paired64, 0));
671
}
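// How the packing above works (explanatory note): each byte's sign bit is first shifted
// down to bit 0, then the shift-and-accumulate (vsra) steps fold neighbouring lanes so
// the sign bits of bytes 0..7 land in bits 0..7 of the lowest byte -- the 8-byte
// equivalent of SSE2's _mm_movemask_epi8.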
672
673
ALWAYS_INLINE bool alltrue() const
674
{
675
return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
676
}
677
678
ALWAYS_INLINE bool anytrue() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) != UINT64_C(0)); }
679
680
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); }
681
682
template<int i>
683
ALWAYS_INLINE GSVector2i insert8(int a) const
684
{
685
return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(a, vreinterpret_u8_s32(v2s), static_cast<uint8_t>(i))));
686
}
687
688
template<int i>
689
ALWAYS_INLINE int extract8() const
690
{
691
return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
692
}
693
694
template<int i>
695
ALWAYS_INLINE GSVector2i insert16(int a) const
696
{
697
return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(a, vreinterpret_u16_s32(v2s), static_cast<uint16_t>(i))));
698
}
699
700
template<int i>
701
ALWAYS_INLINE int extract16() const
702
{
703
return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
704
}
705
706
template<int i>
707
ALWAYS_INLINE GSVector2i insert32(int a) const
708
{
709
return GSVector2i(vset_lane_s32(a, v2s, i));
710
}
711
712
template<int i>
713
ALWAYS_INLINE int extract32() const
714
{
715
return vget_lane_s32(v2s, i);
716
}
717
718
ALWAYS_INLINE static GSVector2i load32(const void* p)
719
{
720
// should be ldr s0, [x0]
721
s32 val;
722
std::memcpy(&val, p, sizeof(s32));
723
return GSVector2i(vset_lane_s32(val, vdup_n_s32(0), 0));
724
}
725
726
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
727
728
template<bool aligned>
729
ALWAYS_INLINE static GSVector2i load(const void* p)
730
{
731
#ifdef CPU_ARCH_ARM32
732
if constexpr (!aligned)
733
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
734
#endif
735
736
return GSVector2i(vld1_s32((const int32_t*)p));
737
}
738
739
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
740
{
741
s32 val = vget_lane_s32(v.v2s, 0);
742
std::memcpy(p, &val, sizeof(s32));
743
}
744
745
template<bool aligned>
746
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
747
{
748
#ifdef CPU_ARCH_ARM32
749
if constexpr (!aligned)
750
{
751
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
752
return;
753
}
754
#endif
755
756
vst1_s32((int32_t*)p, v.v2s);
757
}
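// Usage sketch (illustrative): load/store round-trip through a plain buffer. The
// 'aligned' flag only matters for the A32 path above, where loading/storing as bytes
// sidesteps the 32-bit element alignment expected by vld1_s32/vst1_s32.
//   alignas(8) s32 buf[2] = {1, 2};
//   GSVector2i v = GSVector2i::load<true>(buf);
//   GSVector2i::store<true>(buf, v.add32(v)); // buf is now {2, 4}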
758
759
ALWAYS_INLINE void operator&=(const GSVector2i& v)
760
{
761
v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
762
}
763
764
ALWAYS_INLINE void operator|=(const GSVector2i& v)
765
{
766
v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
767
}
768
769
ALWAYS_INLINE void operator^=(const GSVector2i& v)
770
{
771
v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
772
}
773
774
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
775
{
776
return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
777
}
778
779
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
780
{
781
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
782
}
783
784
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
785
{
786
return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
787
}
788
789
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); }
790
791
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); }
792
793
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); }
794
795
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); }
796
797
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); }
798
799
ALWAYS_INLINE GSVector2i xy() const { return *this; }
800
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); }
801
ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); }
802
ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); }
803
};
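// Usage sketch for GSVector2i (illustrative, not part of the original header):
//   GSVector2i a(10, 20);
//   GSVector2i b(3, 40);
//   GSVector2i lo = a.min_s32(b);  // (3, 20)
//   GSVector2i sum = a.add32(b);   // (13, 60)
//   bool same = a.eq(b);           // false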
804
805
class alignas(16) GSVector2
806
{
807
struct cxpr_init_tag
808
{
809
};
810
static constexpr cxpr_init_tag cxpr_init{};
811
812
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
813
814
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
815
816
public:
817
union
818
{
819
struct
820
{
821
float x, y;
822
};
823
struct
824
{
825
float r, g;
826
};
827
float F32[2];
828
double F64[1];
829
s8 I8[8];
830
s16 I16[4];
831
s32 I32[2];
832
s64 I64[1];
833
u8 U8[8];
834
u16 U16[4];
835
u32 U32[2];
836
u64 U64[1];
837
float32x2_t v2s;
838
};
839
840
GSVector2() = default;
841
842
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
843
844
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
845
846
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
847
848
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
849
850
ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {}
851
852
ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {}
853
854
ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {}
855
856
ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); }
857
858
ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); }
859
860
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
861
862
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
863
864
ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); }
865
866
ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; }
867
868
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
869
870
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
871
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
872
873
#ifdef CPU_ARCH_ARM64
874
875
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
876
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
877
878
#else
879
880
ALWAYS_INLINE GSVector2 floor() const
881
{
882
return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
883
}
884
885
ALWAYS_INLINE GSVector2 ceil() const
886
{
887
return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
888
}
889
890
#endif
891
892
ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
893
894
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
895
896
ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
897
898
ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }
899
900
ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }
901
902
template<int mask>
903
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
904
{
905
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
906
}
907
908
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
909
{
910
// duplicate sign bit across and bit select
911
const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
912
return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
913
}
914
915
ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
916
{
917
return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
918
}
919
920
ALWAYS_INLINE int mask() const
921
{
922
const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
923
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
924
}
925
926
ALWAYS_INLINE bool alltrue() const
927
{
928
return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
929
}
930
931
ALWAYS_INLINE bool anytrue() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) != UINT64_C(0)); }
932
933
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); }
934
935
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
936
937
template<int src, int dst>
938
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
939
{
940
#ifdef CPU_ARCH_ARM64
941
return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
942
#else
943
return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
944
#endif
945
}
946
947
template<int i>
948
ALWAYS_INLINE int extract32() const
949
{
950
return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
951
}
952
953
ALWAYS_INLINE float dot(const GSVector2& v) const
954
{
955
#ifdef CPU_ARCH_ARM64
956
return vaddv_f32(vmul_f32(v2s, v.v2s));
957
#else
958
const float32x2_t dp = vmul_f32(v2s, v.v2s);
959
return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
960
#endif
961
}
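// Worked example (illustrative): dot((1,2),(3,4)) multiplies lane-wise to (3,8); the
// A64 path sums horizontally with vaddv_f32, while the A32 fallback adds lane 0 to a
// broadcast copy of lane 1 (vdup_lane_f32), giving 11.0f either way.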
962
963
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
964
965
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
966
967
template<bool aligned>
968
ALWAYS_INLINE static GSVector2 load(const void* p)
969
{
970
#ifdef CPU_ARCH_ARM32
971
if constexpr (!aligned)
972
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
973
#endif
974
975
return GSVector2(vld1_f32(static_cast<const float*>(p)));
976
}
977
978
template<bool aligned>
979
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
980
{
981
#ifdef CPU_ARCH_ARM32
982
if constexpr (!aligned)
983
{
984
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
985
return;
986
}
987
#endif
988
989
vst1_f32(static_cast<float*>(p), v.v2s);
990
}
991
992
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
993
994
ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
995
ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
996
ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
997
ALWAYS_INLINE void operator/=(const GSVector2& v)
998
{
999
#ifdef CPU_ARCH_ARM64
1000
v2s = vdiv_f32(v2s, v.v2s);
1001
#else
1002
*this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
1003
#endif
1004
}
1005
1006
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
1007
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
1008
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
1009
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
1010
1011
ALWAYS_INLINE void operator&=(const GSVector2& v)
1012
{
1013
v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
1014
}
1015
1016
ALWAYS_INLINE void operator|=(const GSVector2& v)
1017
{
1018
v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
1019
}
1020
1021
ALWAYS_INLINE void operator^=(const GSVector2& v)
1022
{
1023
v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
1024
}
1025
1026
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
1027
{
1028
return GSVector2(vadd_f32(v1.v2s, v2.v2s));
1029
}
1030
1031
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
1032
{
1033
return GSVector2(vsub_f32(v1.v2s, v2.v2s));
1034
}
1035
1036
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
1037
{
1038
return GSVector2(vmul_f32(v1.v2s, v2.v2s));
1039
}
1040
1041
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
1042
{
1043
#ifdef CPU_ARCH_ARM64
1044
return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
1045
#else
1046
return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
1047
vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
1048
#endif
1049
}
1050
1051
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
1052
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
1053
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
1054
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
1055
1056
ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
1057
{
1058
return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1059
}
1060
1061
ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
1062
{
1063
return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1064
}
1065
1066
ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
1067
{
1068
return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1069
}
1070
1071
ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
1072
{
1073
return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s)));
1074
}
1075
1076
ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
1077
{
1078
// NEON has no != comparison, so invert the == mask.
1079
return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s))));
1080
}
1081
1082
ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
1083
{
1084
return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s)));
1085
}
1086
1087
ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
1088
{
1089
return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s)));
1090
}
1091
1092
ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
1093
{
1094
return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s)));
1095
}
1096
1097
ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
1098
{
1099
return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s)));
1100
}
1101
1102
ALWAYS_INLINE GSVector2 xy() const { return *this; }
1103
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); }
1104
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); }
1105
ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); }
1106
};
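// Usage sketch for GSVector2 (illustrative, not part of the original header):
//   GSVector2 uv(0.25f, 1.5f);
//   GSVector2 clamped = uv.sat(GSVector2::zero(), GSVector2(1.0f)); // (0.25f, 1.0f)
//   float len2 = uv.dot(uv);                                        // squared length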
1107
1108
class alignas(16) GSVector4i
1109
{
1110
struct cxpr_init_tag
1111
{
1112
};
1113
static constexpr cxpr_init_tag cxpr_init{};
1114
1115
constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
1116
1117
constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1118
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1119
{
1120
}
1121
1122
constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
1123
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1124
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1125
{
1126
}
1127
1128
public:
1129
union
1130
{
1131
struct
1132
{
1133
int x, y, z, w;
1134
};
1135
struct
1136
{
1137
int r, g, b, a;
1138
};
1139
struct
1140
{
1141
int left, top, right, bottom;
1142
};
1143
float F32[4];
1144
s8 S8[16];
1145
s16 S16[8];
1146
s32 S32[4];
1147
s64 S64[2];
1148
u8 U8[16];
1149
u16 U16[8];
1150
u32 U32[4];
1151
u64 U64[2];
1152
int32x4_t v4s;
1153
};
1154
1155
GSVector4i() = default;
1156
1157
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
1158
{
1159
return GSVector4i(cxpr_init, x, y, z, w);
1160
}
1161
1162
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
1163
1164
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
1165
1166
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1167
{
1168
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
1169
}
1170
1171
ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
1172
s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1173
{
1174
return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
1175
}
1176
1177
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
1178
: v4s(vsetq_lane_s32(w, vsetq_lane_s32(z, vsetq_lane_s32(y, vdupq_n_s32(x), 1), 2), 3))
1179
{
1180
}
1181
1182
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1183
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1184
{
1185
}
1186
1187
constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12,
1188
s8 b13, s8 b14, s8 b15)
1189
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1190
{
1191
}
1192
1193
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : v4s(vcombine_s32(v.v2s, vcreate_s32(0))) {}
1194
1195
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
1196
1197
ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
1198
1199
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v) : v4s(vcombine_s32(vcvt_s32_f32(v.v2s), vcreate_s32(0))) {}
1200
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
1201
1202
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
1203
1204
ALWAYS_INLINE void operator=(s32 i) { v4s = vdupq_n_s32(i); }
1205
1206
ALWAYS_INLINE operator int32x4_t() const { return v4s; }
1207
1208
// rect
1209
1210
ALWAYS_INLINE s32 width() const { return right - left; }
1211
ALWAYS_INLINE s32 height() const { return bottom - top; }
1212
1213
ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
1214
1215
ALWAYS_INLINE bool rempty() const
1216
{
1217
// !all((x, y) < (z, w)) i.e. !not_empty
1218
return (vget_lane_u64(vreinterpret_u64_u32(vclt_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) !=
1219
0xFFFFFFFFFFFFFFFFULL);
1220
}
1221
1222
ALWAYS_INLINE bool rvalid() const
1223
{
1224
// !any((x, y) >= (z, w))
1225
return (vget_lane_u64(vreinterpret_u64_u32(vcge_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == 0);
1226
}
1227
1228
ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_s32(a).upl64(max_s32(a).srl<8>()); }
1229
1230
ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_s32(a); }
1231
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
1232
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
1233
1234
ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }
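// Worked example (illustrative): when used as a rectangle the lanes are
// (left, top, right, bottom), so for r1 = (0, 0, 100, 100) and r2 = (50, 50, 200, 200)
// r1.rintersect(r2) clamps each corner into the other rect and yields (50, 50, 100, 100),
// while r1.rintersect(GSVector4i::cxpr(150, 150, 200, 200)) produces an empty rect
// (rempty() == true) because right/bottom end up <= left/top.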
1235
1236
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
1237
{
1238
return max_s8(min).min_s8(max);
1239
}
1240
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
1241
{
1242
return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
1243
}
1244
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
1245
{
1246
return max_s16(min).min_s16(max);
1247
}
1248
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
1249
{
1250
return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
1251
}
1252
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
1253
{
1254
return max_s32(min).min_s32(max);
1255
}
1256
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
1257
{
1258
return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
1259
}
1260
1261
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
1262
{
1263
return max_u8(min).min_u8(max);
1264
}
1265
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
1266
{
1267
return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
1268
}
1269
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
1270
{
1271
return max_u16(min).min_u16(max);
1272
}
1273
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
1274
{
1275
return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
1276
}
1277
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
1278
{
1279
return max_u32(min).min_u32(max);
1280
}
1281
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
1282
{
1283
return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
1284
}
1285
1286
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const
1287
{
1288
return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1289
}
1290
1291
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const
1292
{
1293
return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1294
}
1295
1296
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const
1297
{
1298
return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1299
}
1300
1301
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const
1302
{
1303
return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1304
}
1305
1306
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }
1307
1308
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }
1309
1310
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
1311
{
1312
return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1313
}
1314
1315
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
1316
{
1317
return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1318
}
1319
1320
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
1321
{
1322
return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1323
}
1324
1325
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
1326
{
1327
return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1328
}
1329
1330
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
1331
{
1332
return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1333
}
1334
1335
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
1336
{
1337
return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1338
}
1339
1340
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
1341
{
1342
#ifdef CPU_ARCH_ARM64
1343
const int32x4_t low =
1344
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1345
const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
1346
return GSVector4i(vpaddq_s32(low, high));
1347
#else
1348
// borrowed from sse2neon
1349
const int32x4_t low =
1350
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1351
const int32x4_t high =
1352
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1353
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
1354
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
1355
#endif
1356
}
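// Worked example (illustrative): madd_s16 mirrors SSE2's _mm_madd_epi16 -- adjacent
// signed 16-bit products are summed into 32-bit lanes. With this = (1, 2, 3, 4, ...)
// and v = (10, 20, 30, 40, ...), the first two 32-bit results are 1*10 + 2*20 = 50 and
// 3*30 + 4*40 = 250.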
1357
1358
ALWAYS_INLINE GSVector4i addp_s32() const
1359
{
1360
#ifdef CPU_ARCH_ARM64
1361
return GSVector4i(vpaddq_s32(v4s, v4s));
1362
#else
1363
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1364
return GSVector4i(vcombine_s32(res, res));
1365
#endif
1366
}
1367
1368
ALWAYS_INLINE s32 addv_s32() const
1369
{
1370
#ifdef CPU_ARCH_ARM64
1371
return vaddvq_s32(v4s);
1372
#else
1373
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1374
return vget_lane_s32(res, 0) + vget_lane_s32(res, 1);
1375
#endif
1376
}
1377
1378
#ifdef CPU_ARCH_ARM64
1379
1380
ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
1381
1382
ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); }
1383
1384
ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); }
1385
1386
ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); }
1387
1388
ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }
1389
1390
ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }
1391
1392
ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }
1393
1394
ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }
1395
1396
#else
1397
1398
ALWAYS_INLINE u8 minv_u8() const
1399
{
1400
uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1401
vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
1402
return static_cast<u8>(
1403
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
1404
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
1405
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
1406
}
1407
1408
ALWAYS_INLINE u16 maxv_u8() const
1409
{
1410
uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1411
vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
1412
return static_cast<u8>(
1413
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
1414
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
1415
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
1416
}
1417
1418
ALWAYS_INLINE u16 minv_u16() const
1419
{
1420
uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1421
vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
1422
return static_cast<u16>(
1423
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
1424
}
1425
1426
ALWAYS_INLINE u16 maxv_u16() const
1427
{
1428
uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1429
vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
1430
return static_cast<u16>(
1431
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
1432
}
1433
1434
ALWAYS_INLINE s32 minv_s32() const
1435
{
1436
int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1437
return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
1438
}
1439
1440
ALWAYS_INLINE u32 minv_u32() const
1441
{
1442
uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1443
return std::min<u32>(vget_lane_u32(vmin, 0), vget_lane_u32(vmin, 1));
1444
}
1445
1446
ALWAYS_INLINE s32 maxv_s32() const
1447
{
1448
int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1449
return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
1450
}
1451
1452
ALWAYS_INLINE u32 maxv_u32() const
1453
{
1454
uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1455
return std::max<u32>(vget_lane_u32(vmax, 0), vget_lane_u32(vmax, 1));
1456
}
1457
1458
#endif
1459
1460
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
1461
1462
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
1463
{
1464
uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
1465
return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
1466
}
1467
1468
template<int mask>
1469
ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
1470
{
1471
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(
1472
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s), ((mask & 0x01) == 0) ? 0 : 8,
1473
((mask & 0x02) == 0) ? 1 : 9, ((mask & 0x04) == 0) ? 2 : 10, ((mask & 0x08) == 0) ? 3 : 11,
1474
((mask & 0x10) == 0) ? 4 : 12, ((mask & 0x20) == 0) ? 5 : 13, ((mask & 0x40) == 0) ? 6 : 14,
1475
((mask & 0x80) == 0) ? 7 : 15)));
1476
}
1477
1478
template<int mask>
1479
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1480
{
1481
return GSVector4i(__builtin_shufflevector(v4s, v.v4s, ((mask & 1) == 0) ? 0 : 4, ((mask & 2) == 0) ? 1 : 5,
1482
((mask & 4) == 0) ? 2 : 6, ((mask & 8) == 0) ? 3 : 7));
1483
}
1484
1485
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1486
{
1487
return GSVector4i(
1488
vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
1489
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
1490
}
1491
1492
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
1493
{
1494
#ifdef CPU_ARCH_ARM64
1495
return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
1496
#else
1497
int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
1498
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
1499
vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
1500
#endif
1501
}
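// Note (explanatory, not from the original source): vqtbl1q_s8/vtbl2_s8 return 0 for
// out-of-range indices, whereas SSSE3's pshufb zeroes a byte only when the mask's high
// bit is set and otherwise uses the low four index bits; callers are assumed to pass
// masks that behave identically under both conventions.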
1502
1503
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
1504
{
1505
return GSVector4i(vreinterpretq_s32_s8(
1506
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
1507
}
1508
1509
ALWAYS_INLINE GSVector4i ps16() const
1510
{
1511
return GSVector4i(vreinterpretq_s32_s8(
1512
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
1513
}
1514
1515
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
1516
{
1517
return GSVector4i(vreinterpretq_s32_u8(
1518
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
1519
}
1520
1521
ALWAYS_INLINE GSVector4i pu16() const
1522
{
1523
return GSVector4i(vreinterpretq_s32_u8(
1524
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
1525
}
1526
1527
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
1528
{
1529
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
1530
}
1531
1532
ALWAYS_INLINE GSVector4i ps32() const
1533
{
1534
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
1535
}
1536
1537
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1538
{
1539
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
1540
}
1541
1542
ALWAYS_INLINE GSVector4i pu32() const
1543
{
1544
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
1545
}
1546
1547
#ifdef CPU_ARCH_ARM64
1548
1549
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1550
{
1551
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1552
}
1553
1554
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1555
{
1556
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1557
}
1558
1559
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1560
{
1561
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1562
}
1563
1564
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1565
{
1566
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1567
}
1568
1569
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
1570
1571
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
1572
1573
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1574
{
1575
return GSVector4i(vreinterpretq_s32_s64(
1576
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1577
}
1578
1579
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1580
{
1581
return GSVector4i(vreinterpretq_s32_s64(
1582
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1583
}
1584
1585
ALWAYS_INLINE GSVector4i upl8() const
1586
{
1587
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1588
}
1589
1590
ALWAYS_INLINE GSVector4i uph8() const
1591
{
1592
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1593
}
1594
1595
ALWAYS_INLINE GSVector4i upl16() const
1596
{
1597
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1598
}
1599
1600
ALWAYS_INLINE GSVector4i uph16() const
1601
{
1602
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1603
}
1604
1605
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
1606
1607
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
1608
1609
ALWAYS_INLINE GSVector4i upl64() const
1610
{
1611
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1612
}
1613
1614
ALWAYS_INLINE GSVector4i uph64() const
1615
{
1616
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1617
}
1618
1619
#else
1620
1621
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1622
{
1623
const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
1624
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1625
}
1626
1627
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1628
{
1629
const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
1630
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1631
}
1632
1633
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1634
{
1635
const int16x4x2_t res =
1636
vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1637
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1638
}
1639
1640
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1641
{
1642
const int16x4x2_t res =
1643
vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1644
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1645
}
1646
1647
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
1648
{
1649
const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
1650
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1651
}
1652
1653
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
1654
{
1655
const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
1656
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1657
}
1658
1659
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1660
{
1661
return GSVector4i(vreinterpretq_s32_s64(
1662
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1663
}
1664
1665
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1666
{
1667
return GSVector4i(vreinterpretq_s32_s64(
1668
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1669
}
1670
1671
ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
1672
1673
ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
1674
1675
ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
1676
1677
ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
1678
1679
ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
1680
1681
ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
1682
1683
ALWAYS_INLINE GSVector4i upl64() const
1684
{
1685
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1686
}
1687
1688
ALWAYS_INLINE GSVector4i uph64() const
1689
{
1690
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1691
}
1692
#endif
1693
1694
ALWAYS_INLINE GSVector4i s8to16() const
1695
{
1696
return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
1697
}
1698
1699
ALWAYS_INLINE GSVector4i u8to16() const
1700
{
1701
return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
1702
}
1703
1704
ALWAYS_INLINE GSVector4i s8to32() const
1705
{
1706
return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
1707
}
1708
1709
ALWAYS_INLINE GSVector4i u8to32() const
1710
{
1711
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
1712
}
1713
1714
ALWAYS_INLINE GSVector4i s8to64() const
1715
{
1716
return GSVector4i(vreinterpretq_s32_s64(
1717
vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
1718
}
1719
1720
ALWAYS_INLINE GSVector4i u8to64() const
1721
{
1722
return GSVector4i(vreinterpretq_s32_u64(
1723
vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
1724
}
1725
1726
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
1727
1728
ALWAYS_INLINE GSVector4i u16to32() const
1729
{
1730
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
1731
}
1732
1733
ALWAYS_INLINE GSVector4i s16to64() const
1734
{
1735
return GSVector4i(
1736
vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
1737
}
1738
1739
ALWAYS_INLINE GSVector4i u16to64() const
1740
{
1741
return GSVector4i(
1742
vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
1743
}
1744
1745
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
1746
1747
ALWAYS_INLINE GSVector4i u32to64() const
1748
{
1749
return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
1750
}
1751
1752
template<int i>
1753
ALWAYS_INLINE GSVector4i srl() const
1754
{
1755
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
1756
}
1757
1758
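// treats [v : this] as one 256-bit value (v in the upper half), shifts it right by i bytes and returns the low
// 128 bits (cf. _mm_alignr_epi8).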
template<int i>
1759
ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
1760
{
1761
if constexpr (i >= 16)
1762
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
1763
else
1764
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
1765
}
1766
1767
template<int i>
1768
ALWAYS_INLINE GSVector4i sll() const
1769
{
1770
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
1771
}
1772
1773
template<int i>
1774
ALWAYS_INLINE GSVector4i sll16() const
1775
{
1776
return GSVector4i(vreinterpretq_s32_u16(vshlq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1777
}
1778
1779
ALWAYS_INLINE GSVector4i sll16(s32 i) const
1780
{
1781
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(i))));
1782
}
1783
1784
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
1785
{
1786
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1787
}
1788
1789
template<int i>
1790
ALWAYS_INLINE GSVector4i srl16() const
1791
{
1792
return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1793
}
1794
1795
ALWAYS_INLINE GSVector4i srl16(s32 i) const
1796
{
1797
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
1798
}
1799
1800
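// NEON has no dedicated per-lane right-shift; vshl with a negated (negative) count shifts right instead.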
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
1801
{
1802
return GSVector4i(
1803
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1804
}
1805
1806
template<int i>
1807
ALWAYS_INLINE GSVector4i sra16() const
1808
{
1809
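// arithmetic shifts by 16 or more only replicate the sign bit, so clamp the count to 15 to keep the NEON immediate in range.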
constexpr int count = (i & ~15) ? 15 : i;
1810
return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
1811
}
1812
1813
ALWAYS_INLINE GSVector4i sra16(s32 i) const
1814
{
1815
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
1816
}
1817
1818
ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
1819
{
1820
return GSVector4i(
1821
vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1822
}
1823
1824
template<int i>
1825
ALWAYS_INLINE GSVector4i sll32() const
1826
{
1827
return GSVector4i(vshlq_n_s32(v4s, i));
1828
}
1829
1830
ALWAYS_INLINE GSVector4i sll32(s32 i) const
1831
{
1832
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(i))));
1833
}
1834
1835
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const
1836
{
1837
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), v.v4s)));
1838
}
1839
1840
template<int i>
1841
ALWAYS_INLINE GSVector4i srl32() const
1842
{
1843
return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
1844
}
1845
1846
ALWAYS_INLINE GSVector4i srl32(s32 i) const
1847
{
1848
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
1849
}
1850
1851
ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
1852
{
1853
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
1854
}
1855
1856
template<int i>
1857
ALWAYS_INLINE GSVector4i sra32() const
1858
{
1859
return GSVector4i(vshrq_n_s32(v4s, i));
1860
}
1861
1862
ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
1863
1864
ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s))); }
1865
1866
template<int i>
1867
ALWAYS_INLINE GSVector4i sll64() const
1868
{
1869
return GSVector4i(vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1870
}
1871
1872
ALWAYS_INLINE GSVector4i sll64(s32 i) const
1873
{
1874
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(i))));
1875
}
1876
1877
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
1878
{
1879
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1880
}
1881
1882
template<int i>
1883
ALWAYS_INLINE GSVector4i srl64() const
1884
{
1885
return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1886
}
1887
1888
ALWAYS_INLINE GSVector4i srl64(s32 i) const
1889
{
1890
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
1891
}
1892
1893
#ifdef CPU_ARCH_ARM64
1894
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
1895
{
1896
return GSVector4i(
1897
vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
1898
}
1899
#endif
1900
1901
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
1902
{
1903
return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1904
}
1905
1906
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
1907
{
1908
return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1909
}
1910
1911
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
1912
1913
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
1914
{
1915
return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1916
}
1917
1918
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
1919
{
1920
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1921
}
1922
1923
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
1924
{
1925
// can't use vpaddq_s16() here, because we need saturation.
1926
// return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1927
#ifdef CPU_ARCH_ARM64
1928
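// uzp1/uzp2 gather the even- and odd-indexed 16-bit lanes of (a, b); their saturating sum is the horizontal pairwise add.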
const int16x8_t a = vreinterpretq_s16_s32(v4s);
1929
const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
1930
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
1931
#else
1932
// sse2neon again
1933
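// vmovn keeps the even-indexed 16-bit lanes, vshrn #16 the odd-indexed ones; the saturating add of the two gives the adjacent-pair sums.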
int16x8_t ab0246 = vcombine_s16(vmovn_s32(v4s), vmovn_s32(v.v4s));
1934
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(v4s, 16), vshrn_n_s32(v.v4s, 16));
1935
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
1936
#endif
1937
}
1938
1939
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
1940
{
1941
return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1942
}
1943
1944
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
1945
{
1946
return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1947
}
1948
1949
ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
1950
{
1951
return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1952
}
1953
1954
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
1955
{
1956
return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1957
}
1958
1959
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
1960
1961
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
1962
{
1963
return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1964
}
1965
1966
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
1967
{
1968
return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1969
}
1970
1971
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
1972
{
1973
return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1974
}
1975
1976
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
1977
{
1978
return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1979
}
1980
1981
ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
1982
{
1983
return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1984
}
1985
1986
ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
1987
{
1988
return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1989
}
1990
1991
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
1992
{
1993
// from sse2neon
1994
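// high half of the signed 16x16 multiply (PMULHW): widen each product to 32 bits, then keep the upper 16 bits via vuzp.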
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
1995
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
1996
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
1997
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
1998
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
1999
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
2000
uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
2001
return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
2002
}
2003
2004
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
2005
{
2006
return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2007
}
2008
2009
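// rounding high multiply of signed 16-bit lanes: (a * b + 0x4000) >> 15, as done by PMULHRSW.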
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
2010
{
2011
int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
2012
int32x4_t mul_hi =
2013
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
2014
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
2015
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
2016
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
2017
}
2018
2019
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
2020
2021
ALWAYS_INLINE bool eq(const GSVector4i& v) const
2022
{
2023
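// the vectors are equal iff every lane of the XOR is zero.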
const int32x4_t res = veorq_s32(v4s, v.v4s);
2024
#ifdef CPU_ARCH_ARM64
2025
return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
2026
#else
2027
const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
2028
return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
2029
#endif
2030
}
2031
2032
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
2033
{
2034
return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2035
}
2036
2037
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
2038
{
2039
return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2040
}
2041
2042
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
2043
{
2044
return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
2045
}
2046
2047
#ifdef CPU_ARCH_ARM64
2048
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
2049
{
2050
return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
2051
}
2052
#endif
2053
2054
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
2055
2056
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
2057
2058
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
2059
2060
ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
2061
{
2062
return GSVector4i(vreinterpretq_s32_u8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2063
}
2064
2065
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
2066
{
2067
return GSVector4i(vreinterpretq_s32_u16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2068
}
2069
2070
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const
2071
{
2072
return GSVector4i(vreinterpretq_s32_u32(vcgtq_s32(v4s, v.v4s)));
2073
}
2074
2075
ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
2076
{
2077
return GSVector4i(vreinterpretq_s32_u8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2078
}
2079
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
2080
{
2081
return GSVector4i(vreinterpretq_s32_u16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2082
}
2083
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const
2084
{
2085
return GSVector4i(vreinterpretq_s32_u32(vcgeq_s32(v4s, v.v4s)));
2086
}
2087
2088
ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
2089
{
2090
return GSVector4i(vreinterpretq_s32_u8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2091
}
2092
2093
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
2094
{
2095
return GSVector4i(vreinterpretq_s32_u16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2096
}
2097
2098
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const
2099
{
2100
return GSVector4i(vreinterpretq_s32_u32(vcltq_s32(v4s, v.v4s)));
2101
}
2102
2103
ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
2104
{
2105
return GSVector4i(vreinterpretq_s32_u8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2106
}
2107
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
2108
{
2109
return GSVector4i(vreinterpretq_s32_u16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2110
}
2111
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const
2112
{
2113
return GSVector4i(vreinterpretq_s32_u32(vcleq_s32(v4s, v.v4s)));
2114
}
2115
2116
ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
2117
2118
ALWAYS_INLINE int mask() const
2119
{
2120
// borrowed from sse2neon
2121
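// emulates _mm_movemask_epi8: shift-and-accumulate folds the per-byte sign bits into one byte per 64-bit half,
// which then form the 16-bit mask.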
const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
2122
const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2123
const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2124
const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2125
return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
2126
}
2127
2128
ALWAYS_INLINE bool alltrue() const
2129
{
2130
#ifdef CPU_ARCH_ARM64
2131
return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
2132
#else
2133
return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
2134
UINT64_C(0xFFFFFFFFFFFFFFFF));
2135
#endif
2136
}
2137
2138
ALWAYS_INLINE bool anytrue() const
2139
{
2140
#ifdef CPU_ARCH_ARM64
2141
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) != UINT32_C(0));
2142
#else
2143
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) != UINT64_C(0));
2144
#endif
2145
}
2146
2147
ALWAYS_INLINE bool allfalse() const
2148
{
2149
#ifdef CPU_ARCH_ARM64
2150
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
2151
#else
2152
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0));
2153
#endif
2154
}
2155
2156
template<int i>
2157
ALWAYS_INLINE GSVector4i insert8(int a) const
2158
{
2159
return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i))));
2160
}
2161
2162
template<int i>
2163
ALWAYS_INLINE int extract8() const
2164
{
2165
return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
2166
}
2167
2168
template<int i>
2169
ALWAYS_INLINE GSVector4i insert16(int a) const
2170
{
2171
return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i))));
2172
}
2173
2174
template<int i>
2175
ALWAYS_INLINE int extract16() const
2176
{
2177
return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
2178
}
2179
2180
template<int i>
2181
ALWAYS_INLINE GSVector4i insert32(int a) const
2182
{
2183
return GSVector4i(vsetq_lane_s32(a, v4s, i));
2184
}
2185
2186
template<int i>
2187
ALWAYS_INLINE int extract32() const
2188
{
2189
return vgetq_lane_s32(v4s, i);
2190
}
2191
2192
template<int i>
2193
ALWAYS_INLINE GSVector4i insert64(s64 a) const
2194
{
2195
return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
2196
}
2197
2198
template<int i>
2199
ALWAYS_INLINE s64 extract64() const
2200
{
2201
return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
2202
}
2203
2204
#ifdef CPU_ARCH_ARM64
2205
ALWAYS_INLINE GSVector4i tbl2(const GSVector4i& a, const GSVector4i& b, const GSVector4i& idx)
2206
{
2207
return GSVector4i(vreinterpretq_s32_u8(
2208
vqtbx2q_u8(vreinterpretq_u8_s32(v4s), uint8x16x2_t{vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(b.v4s)},
2209
vreinterpretq_u8_s32(idx.v4s))));
2210
}
2211
#endif
2212
2213
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
2214
{
2215
#if __has_builtin(__builtin_nontemporal_load)
2216
return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
2217
#else
2218
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2219
#endif
2220
}
2221
2222
ALWAYS_INLINE static GSVector4i load32(const void* p)
2223
{
2224
// should be ldr s0, [x0]
2225
s32 val;
2226
std::memcpy(&val, p, sizeof(s32));
2227
return GSVector4i(vsetq_lane_s32(val, vdupq_n_s32(0), 0));
2228
}
2229
2230
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
2231
2232
template<bool aligned>
2233
ALWAYS_INLINE static GSVector4i loadl(const void* p)
2234
{
2235
#ifdef CPU_ARCH_ARM32
2236
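// byte-element loads carry no alignment requirement, so use them when the pointer may be unaligned.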
if constexpr (!aligned)
2237
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
2238
#endif
2239
2240
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
2241
}
2242
2243
ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v) { return GSVector4i(vcombine_s32(v.v2s, vcreate_s32(0))); }
2244
2245
template<bool aligned>
2246
ALWAYS_INLINE static GSVector4i loadh(const void* p)
2247
{
2248
#ifdef CPU_ARCH_ARM32
2249
if constexpr (!aligned)
2250
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2251
#endif
2252
2253
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2254
}
2255
2256
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); }
2257
2258
template<bool aligned>
2259
ALWAYS_INLINE static GSVector4i load(const void* p)
2260
{
2261
#ifdef CPU_ARCH_ARM32
2262
if constexpr (!aligned)
2263
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
2264
#endif
2265
2266
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2267
}
2268
2269
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
2270
{
2271
#if __has_builtin(__builtin_nontemporal_store)
2272
__builtin_nontemporal_store(v.v4s, static_cast<int32x4_t*>(p));
2273
#else
2274
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2275
#endif
2276
}
2277
2278
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
2279
{
2280
u32 val = vgetq_lane_s32(v, 0);
2281
std::memcpy(p, &val, sizeof(u32));
2282
}
2283
2284
template<bool aligned>
2285
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
2286
{
2287
#ifdef CPU_ARCH_ARM32
2288
if constexpr (!aligned)
2289
{
2290
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
2291
return;
2292
}
2293
#endif
2294
2295
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
2296
}
2297
2298
template<bool aligned>
2299
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
2300
{
2301
#ifdef CPU_ARCH_ARM32
2302
if constexpr (!aligned)
2303
{
2304
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
2305
return;
2306
}
2307
#endif
2308
2309
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
2310
}
2311
2312
template<bool aligned>
2313
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
2314
{
2315
#ifdef CPU_ARCH_ARM32
2316
if constexpr (!aligned)
2317
{
2318
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
2319
return;
2320
}
2321
#endif
2322
2323
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2324
}
2325
2326
ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }
2327
2328
template<bool aligned>
2329
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
2330
{
2331
return load<aligned>(v);
2332
}
2333
2334
ALWAYS_INLINE void operator&=(const GSVector4i& v)
2335
{
2336
v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2337
}
2338
2339
ALWAYS_INLINE void operator|=(const GSVector4i& v)
2340
{
2341
v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2342
}
2343
2344
ALWAYS_INLINE void operator^=(const GSVector4i& v)
2345
{
2346
v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2347
}
2348
2349
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
2350
{
2351
return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2352
}
2353
2354
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
2355
{
2356
return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2357
}
2358
2359
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
2360
{
2361
return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2362
}
2363
2364
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
2365
2366
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
2367
2368
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
2369
2370
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
2371
2372
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
2373
2374
ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
2375
2376
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
2377
2378
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
2379
{
2380
return GSVector4i(vcombine_s32(xy.v2s, zw.v2s));
2381
}
2382
2383
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw) { return GSVector4i(vcombine_s32(xyzw.v2s, xyzw.v2s)); }
2384
2385
static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);
2386
2387
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); }
2388
2389
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
2390
2391
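// The VECTOR4i_SHUFFLE_* macros expand to the full set of 32-bit swizzle accessors (xxxx() .. wwww()), plus 'l'/'h'
// variants that swizzle only the low or high four 16-bit lanes, all built on __builtin_shufflevector.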
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
2392
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
2393
{ \
2394
return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
2395
} \
2396
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
2397
{ \
2398
return GSVector4i(vreinterpretq_s32_s16( \
2399
__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), xn, yn, zn, wn, 4, 5, 6, 7))); \
2400
} \
2401
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
2402
{ \
2403
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector( \
2404
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 4 + xn, 4 + yn, 4 + zn, 4 + wn))); \
2405
}
2406
2407
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2408
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
2409
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
2410
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
2411
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
2412
2413
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
2414
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
2415
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
2416
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
2417
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
2418
2419
#define VECTOR4i_SHUFFLE_1(xs, xn) \
2420
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
2421
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
2422
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
2423
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
2424
2425
VECTOR4i_SHUFFLE_1(x, 0);
2426
VECTOR4i_SHUFFLE_1(y, 1);
2427
VECTOR4i_SHUFFLE_1(z, 2);
2428
VECTOR4i_SHUFFLE_1(w, 3);
2429
2430
#undef VECTOR4i_SHUFFLE_1
2431
#undef VECTOR4i_SHUFFLE_2
2432
#undef VECTOR4i_SHUFFLE_3
2433
#undef VECTOR4i_SHUFFLE_4
2434
};
2435
2436
class alignas(16) GSVector4
2437
{
2438
struct cxpr_init_tag
2439
{
2440
};
2441
static constexpr cxpr_init_tag cxpr_init{};
2442
2443
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
2444
2445
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
2446
2447
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
2448
2449
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
2450
2451
public:
2452
union
2453
{
2454
struct
2455
{
2456
float x, y, z, w;
2457
};
2458
struct
2459
{
2460
float r, g, b, a;
2461
};
2462
struct
2463
{
2464
float left, top, right, bottom;
2465
};
2466
float F32[4];
2467
double F64[2];
2468
s8 I8[16];
2469
s16 I16[8];
2470
s32 I32[4];
2471
s64 I64[2];
2472
u8 U8[16];
2473
u16 U16[8];
2474
u32 U32[4];
2475
u64 U64[2];
2476
float32x4_t v4s;
2477
};
2478
2479
GSVector4() = default;
2480
2481
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
2482
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
2483
2484
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
2485
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
2486
2487
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
2488
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
2489
2490
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
2491
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
2492
2493
constexpr static GSVector4 cxpr_rgba32(u32 rgba)
2494
{
2495
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff), static_cast<float>((rgba >> 8) & 0xff),
2496
static_cast<float>((rgba >> 16) & 0xff), static_cast<float>((rgba >> 24) & 0xff));
2497
}
2498
2499
constexpr static GSVector4 cxpr_unorm8(u32 rgba)
2500
{
2501
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff) / 255.0f,
2502
static_cast<float>((rgba >> 8) & 0xff) / 255.0f, static_cast<float>((rgba >> 16) & 0xff) / 255.0f,
2503
static_cast<float>((rgba >> 24) & 0xff) / 255.0f);
2504
}
2505
2506
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
2507
{
2508
const float arr[4] = {x, y, z, w};
2509
v4s = vld1q_f32(arr);
2510
}
2511
2512
ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
2513
2514
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
2515
{
2516
const int arr[4] = {x, y, z, w};
2517
v4s = vcvtq_f32_s32(vld1q_s32(arr));
2518
}
2519
2520
ALWAYS_INLINE GSVector4(int x, int y)
2521
{
2522
v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
2523
}
2524
2525
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
2526
2527
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
2528
2529
ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
2530
2531
ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
2532
2533
ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
2534
2535
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
2536
2537
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
2538
2539
ALWAYS_INLINE static GSVector4 f64(double x, double y)
2540
{
2541
#ifdef CPU_ARCH_ARM64
2542
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
2543
#else
2544
GSVector4 ret;
2545
ret.F64[0] = x;
2546
ret.F64[1] = y;
2547
return ret;
2548
#endif
2549
}
2550
2551
ALWAYS_INLINE static GSVector4 f64(double x)
2552
{
2553
#ifdef CPU_ARCH_ARM64
2554
return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
2555
#else
2556
GSVector4 ret;
2557
ret.F64[0] = ret.F64[1] = x;
2558
return ret;
2559
#endif
2560
}
2561
2562
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
2563
2564
ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
2565
2566
ALWAYS_INLINE operator float32x4_t() const { return v4s; }
2567
2568
ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
2569
2570
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
2571
{
2572
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
2573
}
2574
2575
ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
2576
2577
ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
2578
2579
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
2580
2581
#ifdef CPU_ARCH_ARM64
2582
2583
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
2584
2585
ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
2586
2587
#else
2588
2589
ALWAYS_INLINE GSVector4 floor() const
2590
{
2591
return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
2592
std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
2593
}
2594
2595
ALWAYS_INLINE GSVector4 ceil() const
2596
{
2597
return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
2598
std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
2599
}
2600
2601
#endif
2602
2603
#ifdef CPU_ARCH_ARM64
2604
2605
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
2606
2607
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
2608
2609
ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
2610
2611
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2612
{
2613
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
2614
}
2615
2616
#else
2617
2618
ALWAYS_INLINE GSVector4 hadd() const
2619
{
2620
const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2621
return GSVector4(vcombine_f32(res, res));
2622
}
2623
2624
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
2625
{
2626
const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2627
const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
2628
return GSVector4(vcombine_f32(res1, res2));
2629
}
2630
2631
ALWAYS_INLINE GSVector4 hsub() const
2632
{
2633
const float32x4x2_t res = vuzpq_f32(v4s, v4s);
2634
return GSVector4(vsubq_f32(res.val[0], res.val[0]));
2635
}
2636
2637
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2638
{
2639
const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
2640
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2641
}
2642
2643
#endif
2644
2645
ALWAYS_INLINE float dot(const GSVector4& v) const
2646
{
2647
#ifdef CPU_ARCH_ARM64
2648
return vaddvq_f32(vmulq_f32(v4s, v.v4s));
2649
#else
2650
const float32x4_t dp = vmulq_f32(v4s, v.v4s);
2651
float32x2_t tmp = vadd_f32(vget_low_f32(dp), vget_high_f32(dp)); // (x+z, y+w)
2652
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2653
#endif
2654
}
2655
2656
ALWAYS_INLINE float addv() const
2657
{
2658
#ifdef CPU_ARCH_ARM64
2659
return vaddvq_f32(v4s);
2660
#else
2661
float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
2662
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2663
#endif
2664
}
2665
2666
ALWAYS_INLINE float minv() const
2667
{
2668
#ifdef CPU_ARCH_ARM64
2669
return vminvq_f32(v4s);
2670
#else
2671
float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
2672
return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2673
#endif
2674
}
2675
2676
ALWAYS_INLINE float maxv() const
2677
{
2678
#ifdef CPU_ARCH_ARM64
2679
return vmaxvq_f32(v4s);
2680
#else
2681
float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
2682
return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2683
#endif
2684
}
2685
2686
ALWAYS_INLINE float width() const { return right - left; }
2687
ALWAYS_INLINE float height() const { return bottom - top; }
2688
2689
ALWAYS_INLINE GSVector2 rsize() const { return (zw() - xy()); }
2690
2691
ALWAYS_INLINE bool rempty() const
2692
{
2693
// !all((x, y) < (z, w)) i.e. !not_empty
2694
return (vget_lane_u64(vreinterpret_u64_f32(vclt_f32(vget_low_f32(v4s), vget_high_f32(v4s))), 0) !=
2695
0xFFFFFFFFFFFFFFFFULL);
2696
}
2697
2698
ALWAYS_INLINE bool rvalid() const
2699
{
2700
// !any((x, y) >= (z, w))
2701
return (vget_lane_u64(vreinterpret_u64_f32(vcge_f32(vget_low_f32(v4s), vget_high_f32(v4s))), 0) == 0);
2702
}
2703
2704
ALWAYS_INLINE GSVector4 runion(const GSVector4 v) const { return min(v).blend32<0xc>(max(v)); }
2705
2706
ALWAYS_INLINE GSVector4 rintersect(const GSVector4& a) const { return sat(a); }
2707
ALWAYS_INLINE bool rintersects(const GSVector4& v) const { return rintersect(v).rvalid(); }
2708
ALWAYS_INLINE bool rcontains(const GSVector4& v) const { return rintersect(v).eq(v); }
2709
2710
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
2711
2712
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
2713
{
2714
#ifdef CPU_ARCH_ARM64
2715
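// broadcast a.xy and a.zw (as 64-bit lanes) across the whole vector to form the per-component min and max bounds.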
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
2716
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
2717
#else
2718
const GSVector4 minv(a.xyxy());
2719
const GSVector4 maxv(a.zwzw());
2720
#endif
2721
return sat(minv, maxv);
2722
}
2723
2724
ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2725
2726
ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2727
2728
ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
2729
2730
ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
2731
2732
template<int mask>
2733
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
2734
{
2735
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
2736
(mask & 8) ? 7 : 3));
2737
}
2738
2739
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
2740
{
2741
// duplicate sign bit across and bit select
2742
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
2743
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
2744
}
2745
2746
#ifdef CPU_ARCH_ARM64
2747
2748
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
2749
2750
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
2751
2752
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2753
{
2754
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2755
}
2756
2757
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2758
{
2759
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2760
}
2761
2762
#else
2763
2764
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
2765
{
2766
const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
2767
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2768
}
2769
2770
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
2771
{
2772
const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
2773
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2774
}
2775
2776
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2777
{
2778
return GSVector4(vreinterpretq_f32_s64(
2779
vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
2780
}
2781
2782
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2783
{
2784
return GSVector4(vreinterpretq_f32_s64(
2785
vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
2786
}
2787
2788
#endif
2789
2790
ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
2791
{
2792
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
2793
}
2794
2795
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
2796
{
2797
return GSVector4(vcombine_f32(vget_high_f32(a.v4s), vget_high_f32(v4s)));
2798
}
2799
2800
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
2801
{
2802
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
2803
}
2804
2805
ALWAYS_INLINE int mask() const
2806
{
2807
#ifdef CPU_ARCH_ARM64
2808
static constexpr const int32_t shifts[] = {0, 1, 2, 3};
2809
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
2810
#else
2811
// sse2neon again
2812
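// emulates _mm_movemask_ps: pack the four float sign bits into bits 0-3 of the result.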
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
2813
uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2814
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2815
#endif
2816
}
2817
2818
ALWAYS_INLINE bool alltrue() const
2819
{
2820
#ifdef CPU_ARCH_ARM64
2821
return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
2822
#else
2823
2824
return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2825
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2826
0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
2827
#endif
2828
}
2829
2830
ALWAYS_INLINE bool anytrue() const
2831
{
2832
#ifdef CPU_ARCH_ARM64
2833
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) != UINT32_C(0));
2834
#else
2835
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2836
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2837
0) != UINT64_C(0));
2838
#endif
2839
}
2840
2841
ALWAYS_INLINE bool allfalse() const
2842
{
2843
#ifdef CPU_ARCH_ARM64
2844
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
2845
#else
2846
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2847
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2848
0) == UINT64_C(0));
2849
#endif
2850
}
2851
2852
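// lanes where *this is NaN fail the self-equality test and are taken from v instead.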
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
2853
2854
template<int src, int dst>
2855
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
2856
{
2857
#ifdef CPU_ARCH_ARM64
2858
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
2859
#else
2860
return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
2861
#endif
2862
}
2863
2864
template<int i>
2865
ALWAYS_INLINE GSVector4 insert32(float v) const
2866
{
2867
return GSVector4(vsetq_lane_f32(v, v4s, i));
2868
}
2869
2870
template<int i>
2871
ALWAYS_INLINE float extract32() const
2872
{
2873
return vgetq_lane_f32(v4s, i);
2874
}
2875
2876
template<int dst>
2877
ALWAYS_INLINE GSVector4 insert64(double v) const
2878
{
2879
#ifdef CPU_ARCH_ARM64
2880
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
2881
#else
2882
GSVector4 ret;
2883
ret.F64[dst] = v;
2884
return ret;
2885
#endif
2886
}
2887
2888
template<int src>
2889
ALWAYS_INLINE double extract64() const
2890
{
2891
#ifdef CPU_ARCH_ARM64
2892
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
2893
#else
2894
return F64[src];
2895
#endif
2896
}
2897
2898
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
2899
2900
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
2901
2902
template<bool aligned>
2903
ALWAYS_INLINE static GSVector4 loadl(const void* p)
2904
{
2905
#ifdef CPU_ARCH_ARM32
2906
if constexpr (!aligned)
2907
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
2908
#endif
2909
2910
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
2911
}
2912
2913
template<bool aligned>
2914
ALWAYS_INLINE static GSVector4 loadh(const void* p)
2915
{
2916
#ifdef CPU_ARCH_ARM32
2917
if constexpr (!aligned)
2918
return GSVector4(vreinterpretq_f32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2919
#endif
2920
2921
return GSVector4(vreinterpretq_f32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2922
}
2923
2924
ALWAYS_INLINE static GSVector4 loadh(const GSVector2& v) { return GSVector4(vcombine_f32(vcreate_f32(0), v.v2s)); }
2925
2926
ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
2927
2928
template<bool aligned>
2929
ALWAYS_INLINE static GSVector4 load(const void* p)
2930
{
2931
#ifdef CPU_ARCH_ARM32
2932
if constexpr (!aligned)
2933
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
2934
#endif
2935
2936
return GSVector4(vld1q_f32((const float*)p));
2937
}
2938
2939
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
2940
2941
template<bool aligned>
2942
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
2943
{
2944
#ifdef CPU_ARCH_ARM32
2945
if constexpr (!aligned)
2946
{
2947
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
2948
return;
2949
}
2950
#endif
2951
2952
vst1_f32((float*)p, vget_low_f32(v.v4s));
2953
}
2954
2955
template<bool aligned>
2956
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
2957
{
2958
#ifdef CPU_ARCH_ARM32
2959
if constexpr (!aligned)
2960
{
2961
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
2962
return;
2963
}
2964
#endif
2965
2966
vst1_f32((float*)p, vget_high_f32(v.v4s));
2967
}
2968
2969
template<bool aligned>
2970
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
2971
{
2972
#ifdef CPU_ARCH_ARM32
2973
if constexpr (!aligned)
2974
{
2975
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
2976
return;
2977
}
2978
#endif
2979
2980
vst1q_f32((float*)p, v.v4s);
2981
}
2982
2983
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
2984
2985
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
2986
2987
ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
2988
ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
2989
ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
2990
ALWAYS_INLINE void operator/=(const GSVector4& v)
2991
{
2992
#ifdef CPU_ARCH_ARM64
2993
v4s = vdivq_f32(v4s, v.v4s);
2994
#else
2995
*this =
2996
GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
2997
vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
2998
#endif
2999
}
3000
3001
ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
3002
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
3003
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
3004
ALWAYS_INLINE void operator/=(float f)
3005
{
3006
#ifdef CPU_ARCH_ARM64
3007
*this /= GSVector4(f);
3008
#else
3009
*this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
3010
vgetq_lane_f32(v4s, 3) / f);
3011
#endif
3012
}
3013
3014
ALWAYS_INLINE void operator&=(const GSVector4& v)
3015
{
3016
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
3017
}
3018
3019
ALWAYS_INLINE void operator|=(const GSVector4& v)
3020
{
3021
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
3022
}
3023
3024
ALWAYS_INLINE void operator^=(const GSVector4& v)
3025
{
3026
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
3027
}
3028
3029
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
3030
{
3031
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
3032
}
3033
3034
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
3035
{
3036
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
3037
}
3038
3039
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
3040
{
3041
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
3042
}
3043
3044
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
3045
{
3046
#ifdef CPU_ARCH_ARM64
3047
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
3048
#else
3049
return GSVector4(
3050
vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
3051
vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
3052
#endif
3053
}
3054
3055
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
3056
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
3057
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
3058
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
3059
{
3060
#ifdef CPU_ARCH_ARM64
3061
return v / GSVector4(f);
3062
#else
3063
return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
3064
vgetq_lane_f32(v.v4s, 3) / f);
3065
#endif
3066
}
3067
3068
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
3069
{
3070
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
3071
}
3072
3073
ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
3074
{
3075
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
3076
}
3077
3078
ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
3079
{
3080
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
3081
}
3082
3083
ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
3084
{
3085
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
3086
}
3087
3088
ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
3089
{
3090
// NEON has no !=
3091
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
3092
}
3093
3094
ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
3095
{
3096
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
3097
}
3098
3099
ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
3100
{
3101
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
3102
}
3103
3104
ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
3105
{
3106
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
3107
}
3108
3109
ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
3110
{
3111
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
3112
}
3113
3114
ALWAYS_INLINE bool eq(const GSVector4& v) const
3115
{
3116
#ifdef CPU_ARCH_ARM64
3117
const uint32x4_t res = vceqq_f32(v4s, v.v4s);
3118
return (vminvq_u32(res) != 0);
3119
#else
3120
const uint32x4_t res = vmvnq_u32(vceqq_f32(v4s, v.v4s));
3121
const uint32x2_t paired = vorr_u32(vget_low_u32(res), vget_high_u32(res));
3122
return (vget_lane_u64(paired, 0) == 0);
3123
#endif
3124
}
3125
3126
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
3127
{
3128
#ifdef CPU_ARCH_ARM64
3129
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3130
#else
3131
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
3132
#endif
3133
}
3134
3135
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
3136
{
3137
#ifdef CPU_ARCH_ARM64
3138
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3139
#else
3140
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
3141
#endif
3142
}
3143
3144
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
3145
{
3146
#ifdef CPU_ARCH_ARM64
3147
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3148
#else
3149
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
3150
#endif
3151
}
3152
3153
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
3154
{
3155
#ifdef CPU_ARCH_ARM64
3156
return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3157
#else
3158
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
3159
#endif
3160
}

ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}

ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_u64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}
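
// The 64-bit comparisons follow the same mask convention as the float
// operators: each double lane becomes 0xFFFFFFFFFFFFFFFF when the comparison
// holds and 0 otherwise; the scalar fallback writes the masks by hand.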

ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
#endif
}

ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
#endif
}

ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }

ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
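
// abs64() clears the sign bit of each packed double and neg64() flips it;
// both are pure bit operations on the 64-bit lanes.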

ALWAYS_INLINE GSVector4 sqrt64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
#endif
}

ALWAYS_INLINE GSVector4 sqr64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
#endif
}

ALWAYS_INLINE GSVector4 floor64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
#endif
}
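
// floor64() rounds each double toward minus infinity (vrndmq_f64 on ARM64,
// std::floor otherwise).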

ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
#else
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
#endif
}

ALWAYS_INLINE static GSVector4 f32to64(const void* p)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
#else
const float* fp = static_cast<const float*>(p);
return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
#endif
}
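
// f32to64() widens two floats (the low half of a vector, or a pair loaded from
// memory) into two packed doubles.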

ALWAYS_INLINE GSVector4i f64toi32() const
{
#ifdef CPU_ARCH_ARM64
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
#else
const s32 low = static_cast<s32>(F64[0]);
const s32 high = static_cast<s32>(F64[1]);
#endif
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}
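
// f64toi32() truncates each packed double toward zero (static_cast semantics)
// and stores the results in the low two s32 lanes, zeroing the upper two.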

ALWAYS_INLINE GSVector2 xy() const { return GSVector2(vget_low_f32(v4s)); }

ALWAYS_INLINE GSVector2 zw() const { return GSVector2(vget_high_f32(v4s)); }

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
{
return GSVector4(vcombine_f32(l.v2s, h.v2s));
}

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(vcombine_f32(l.v2s, l.v2s)); }
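
// xy()/zw() split the vector into its low/high GSVector2 halves; xyxy()
// recombines two halves (or duplicates one) into a full GSVector4.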

#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
} \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4_SHUFFLE_2(xs, xn, w, 3);

VECTOR4_SHUFFLE_1(x, 0);
VECTOR4_SHUFFLE_1(y, 1);
VECTOR4_SHUFFLE_1(z, 2);
VECTOR4_SHUFFLE_1(w, 3);

#undef VECTOR4_SHUFFLE_1
#undef VECTOR4_SHUFFLE_2
#undef VECTOR4_SHUFFLE_3
#undef VECTOR4_SHUFFLE_4
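
// The macros above generate every four-component swizzle accessor, e.g.
// a.zwxy() == { a.z, a.w, a.x, a.y }. The one-argument overloads take the
// first two named components from *this and the last two from the argument,
// so a.xyxy(b) == { a.x, a.y, b.x, b.y }.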

ALWAYS_INLINE GSVector4 broadcast32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v4s, 0));
#else
return xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
#else
return v.xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }

ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
#else
return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
#endif
}
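
// broadcast32() splats the x lane across the whole vector; the memory
// overloads load a single 32-bit or 64-bit value and duplicate it into
// every lane (or both halves, for broadcast64).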
};
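
// Illustrative sketch (not part of the original header) of how the packed-f64
// helpers above compose; GSVector4::f64() builds a vector from two doubles, as
// used by the scalar fallbacks:
//
//   const GSVector4 a = GSVector4::f64(-2.25, 9.0);
//   const GSVector4 b = a.abs64().sqrt64();   // { 1.5, 3.0 } as packed doubles
//   const GSVector4i c = b.f64toi32();        // { 1, 3, 0, 0 } as s32 lanes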

ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
v2s = vcvt_s32_f32(v.v2s);
}

ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
{
v2s = vcvt_f32_s32(v.v2s);
}

ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
return GSVector2i(vreinterpret_s32_f32(v.v2s));
}

ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
return GSVector2(vreinterpret_f32_s32(v.v2s));
}

ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
v4s = vcvtq_s32_f32(v.v4s);
}

ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
{
v4s = vcvtq_f32_s32(v.v4s);
}

ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
}

ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(vreinterpretq_f32_s32(v.v4s));
}
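
// The converting constructors above perform value conversion (vcvt*: float to
// s32 truncates toward zero, s32 to float is the usual int-to-float
// conversion), whereas cast() reinterprets the same bits under the other
// element type without changing them.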