Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
stenzek
GitHub Repository: stenzek/duckstation
Path: blob/master/src/common/gsvector_sse.h
4211 views
1
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
//
4
// Lightweight wrapper over native SIMD types for cross-platform vector code.
5
// Rewritten and NEON+No-SIMD variants added for DuckStation.
6
//
7
8
#pragma once
9
10
#include "common/intrin.h"
11
#include "common/types.h"
12
13
#include <algorithm>
14
15
#ifdef CPU_ARCH_SSE41
16
#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
17
#endif
18
19
#ifdef CPU_ARCH_AVX2
20
#define GSVECTOR_HAS_SRLV 1
21
#define GSVECTOR_HAS_256 1
22
#endif
23
24
class GSVector2;
25
class GSVector2i;
26
class GSVector4;
27
class GSVector4i;
28
29
#ifndef CPU_ARCH_SSE41
30
31
// Thank LLVM for these.
32
ALWAYS_INLINE static __m128i sse2_min_s8(const __m128i m, const __m128i v)
33
{
34
const __m128i temp = _mm_cmpgt_epi8(m, v);
35
return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
36
}
37
38
ALWAYS_INLINE static __m128i sse2_max_s8(const __m128i m, const __m128i v)
39
{
40
const __m128i temp = _mm_cmpgt_epi8(v, m);
41
return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
42
}
43
44
ALWAYS_INLINE static __m128i sse2_min_s32(const __m128i m, const __m128i v)
45
{
46
const __m128i temp = _mm_cmpgt_epi32(m, v);
47
return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
48
}
49
50
ALWAYS_INLINE static __m128i sse2_max_s32(const __m128i m, const __m128i v)
51
{
52
const __m128i temp = _mm_cmpgt_epi32(v, m);
53
return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
54
}
55
56
ALWAYS_INLINE static __m128i sse2_min_u16(const __m128i m, const __m128i v)
57
{
58
return _mm_sub_epi16(m, _mm_subs_epu16(m, v));
59
}
60
61
ALWAYS_INLINE static __m128i sse2_max_u16(const __m128i m, const __m128i v)
62
{
63
return _mm_add_epi16(m, _mm_subs_epu16(v, m));
64
}
65
66
ALWAYS_INLINE static __m128i sse2_min_u32(const __m128i m, const __m128i v)
67
{
68
const __m128i msb = _mm_set1_epi32(0x80000000);
69
const __m128i temp = _mm_cmpgt_epi32(_mm_xor_si128(msb, v), _mm_xor_si128(m, msb));
70
return _mm_or_si128(_mm_andnot_si128(temp, v), _mm_and_si128(m, temp));
71
}
72
73
ALWAYS_INLINE static __m128i sse2_max_u32(const __m128i m, const __m128i v)
74
{
75
const __m128i msb = _mm_set1_epi32(0x80000000);
76
const __m128i temp = _mm_cmpgt_epi32(_mm_xor_si128(msb, m), _mm_xor_si128(v, msb));
77
return _mm_or_si128(_mm_andnot_si128(temp, v), _mm_and_si128(m, temp));
78
}
79
80
#endif
81
82
class alignas(16) GSVector2i
83
{
84
struct cxpr_init_tag
85
{
86
};
87
static constexpr cxpr_init_tag cxpr_init{};
88
89
constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y, 0, 0} {}
90
91
constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3, 0, 0, 0, 0} {}
92
93
constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
94
: S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
95
{
96
}
97
98
public:
99
union
100
{
101
struct
102
{
103
s32 x, y;
104
};
105
struct
106
{
107
s32 r, g;
108
};
109
float F32[4];
110
s8 S8[16];
111
s16 S16[8];
112
s32 S32[4];
113
s64 S64[2];
114
u8 U8[16];
115
u16 U16[8];
116
u32 U32[4];
117
u64 U64[2];
118
__m128i m;
119
};
120
121
GSVector2i() = default;
122
123
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
124
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
125
126
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
127
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
128
{
129
return GSVector2i(cxpr_init, s0, s1, s2, s3);
130
}
131
132
ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
133
{
134
return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
135
}
136
137
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); }
138
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); }
139
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
140
: S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
141
{
142
}
143
ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
144
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
145
ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {}
146
147
ALWAYS_INLINE GSVector2i& operator=(s32 i)
148
{
149
m = _mm_set1_epi32(i);
150
return *this;
151
}
152
153
ALWAYS_INLINE GSVector2i& operator=(__m128i m_)
154
{
155
m = m_;
156
return *this;
157
}
158
159
ALWAYS_INLINE operator __m128i() const { return m; }
160
161
ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
162
{
163
return max_s8(min).min_s8(max);
164
}
165
ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
166
{
167
return max_s16(min).min_s16(max);
168
}
169
ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
170
{
171
return max_s32(min).min_s32(max);
172
}
173
174
ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
175
{
176
return max_u8(min).min_u8(max);
177
}
178
ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
179
{
180
return max_u16(min).min_u16(max);
181
}
182
ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
183
{
184
return max_u32(min).min_u32(max);
185
}
186
187
#ifdef CPU_ARCH_SSE41
188
189
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const { return GSVector2i(_mm_min_epi8(m, v)); }
190
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const { return GSVector2i(_mm_max_epi8(m, v)); }
191
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); }
192
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); }
193
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(_mm_min_epi32(m, v)); }
194
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(_mm_max_epi32(m, v)); }
195
196
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); }
197
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); }
198
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(_mm_min_epu16(m, v)); }
199
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(_mm_max_epu16(m, v)); }
200
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); }
201
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); }
202
203
ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); }
204
205
#define VECTOR2i_REDUCE_8(name, func, ret) \
206
ALWAYS_INLINE ret name() const \
207
{ \
208
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
209
v = func(v, _mm_srli_epi32(v, 16)); \
210
v = func(v, _mm_srli_epi16(v, 8)); \
211
return static_cast<ret>(_mm_extract_epi8(v, 0)); \
212
}
213
214
#define VECTOR2i_REDUCE_16(name, func, ret) \
215
ALWAYS_INLINE ret name() const \
216
{ \
217
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
218
v = func(v, _mm_srli_epi32(v, 16)); \
219
return static_cast<ret>(_mm_extract_epi16(v, 0)); \
220
}
221
222
#define VECTOR2i_REDUCE_32(name, func, ret) \
223
ALWAYS_INLINE ret name() const \
224
{ \
225
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
226
return static_cast<ret>(_mm_extract_epi32(v, 0)); \
227
}
228
229
VECTOR2i_REDUCE_8(minv_s8, _mm_min_epi8, s8);
230
VECTOR2i_REDUCE_8(maxv_s8, _mm_max_epi8, s8);
231
VECTOR2i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
232
VECTOR2i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
233
VECTOR2i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
234
VECTOR2i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
235
VECTOR2i_REDUCE_16(minv_u16, _mm_min_epu16, u16);
236
VECTOR2i_REDUCE_16(maxv_u16, _mm_max_epu16, u16);
237
VECTOR2i_REDUCE_32(minv_s32, _mm_min_epi32, s32);
238
VECTOR2i_REDUCE_32(maxv_s32, _mm_max_epi32, s32);
239
VECTOR2i_REDUCE_32(minv_u32, _mm_min_epu32, u32);
240
VECTOR2i_REDUCE_32(maxv_u32, _mm_max_epu32, u32);
241
242
#undef VECTOR2i_REDUCE_32
243
#undef VECTOR2i_REDUCE_16
244
#undef VECTOR2i_REDUCE_8
245
246
#else
247
248
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const { return GSVector2i(sse2_min_s8(m, v)); }
249
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const { return GSVector2i(sse2_max_s8(m, v)); }
250
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); }
251
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); }
252
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(sse2_min_s32(m, v)); }
253
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(sse2_max_s32(m, v)); }
254
255
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); }
256
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); }
257
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(sse2_min_u16(m, v)); }
258
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(sse2_max_u16(m, v)); }
259
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(sse2_min_u32(m, v)); }
260
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(sse2_max_u32(m, v)); }
261
262
s32 addv_s32() const { return (x + y); }
263
264
#define VECTOR2i_REDUCE_8(name, func, ret) \
265
ALWAYS_INLINE ret name() const \
266
{ \
267
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
268
v = func(v, _mm_srli_epi32(v, 16)); \
269
v = func(v, _mm_srli_epi16(v, 8)); \
270
return static_cast<ret>(_mm_cvtsi128_si32(v)); \
271
}
272
273
#define VECTOR2i_REDUCE_16(name, func, ret) \
274
ALWAYS_INLINE ret name() const \
275
{ \
276
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
277
v = func(v, _mm_srli_epi32(v, 16)); \
278
return static_cast<ret>(_mm_cvtsi128_si32(v)); \
279
}
280
281
#define VECTOR2i_REDUCE_32(name, func, ret) \
282
ALWAYS_INLINE ret name() const \
283
{ \
284
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
285
return static_cast<ret>(_mm_cvtsi128_si32(v)); \
286
}
287
288
VECTOR2i_REDUCE_8(minv_s8, sse2_min_s8, s8);
289
VECTOR2i_REDUCE_8(maxv_s8, sse2_max_s8, s8);
290
VECTOR2i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
291
VECTOR2i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
292
VECTOR2i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
293
VECTOR2i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
294
VECTOR2i_REDUCE_16(minv_u16, sse2_min_u16, u16);
295
VECTOR2i_REDUCE_16(maxv_u16, sse2_max_u16, u16);
296
VECTOR2i_REDUCE_32(minv_s32, sse2_min_s32, s32);
297
VECTOR2i_REDUCE_32(maxv_s32, sse2_max_s32, s32);
298
VECTOR2i_REDUCE_32(minv_u32, sse2_min_u32, u32);
299
VECTOR2i_REDUCE_32(maxv_u32, sse2_max_u32, u32);
300
301
#undef VECTOR2i_REDUCE_32
302
#undef VECTOR2i_REDUCE_16
303
#undef VECTOR2i_REDUCE_8
304
305
#endif
306
307
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
308
309
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const
310
{
311
return GSVector2i(_mm_blendv_epi8(m, v, mask));
312
}
313
314
template<s32 mask>
315
ALWAYS_INLINE GSVector2i blend16(const GSVector2i& v) const
316
{
317
return GSVector2i(_mm_blend_epi16(m, v, mask));
318
}
319
320
template<s32 mask>
321
ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
322
{
323
#if defined(CPU_ARCH_AVX2)
324
return GSVector2i(_mm_blend_epi32(m, v.m, mask));
325
#else
326
constexpr s32 bit1 = ((mask & 2) * 3) << 1;
327
constexpr s32 bit0 = (mask & 1) * 3;
328
return blend16 < bit1 | bit0 > (v);
329
#endif
330
}
331
332
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
333
{
334
return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
335
}
336
337
#ifdef CPU_ARCH_SSE41
338
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); }
339
#else
340
GSVector2i shuffle8(const GSVector2i& mask) const
341
{
342
GSVector2i ret;
343
for (size_t i = 0; i < 8; i++)
344
ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]);
345
return ret;
346
}
347
#endif
348
349
ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); }
350
ALWAYS_INLINE GSVector2i pu16() const { return GSVector2i(_mm_packus_epi16(m, m)); }
351
ALWAYS_INLINE GSVector2i ps32() const { return GSVector2i(_mm_packs_epi32(m, m)); }
352
#ifdef CPU_ARCH_SSE41
353
ALWAYS_INLINE GSVector2i pu32() const { return GSVector2i(_mm_packus_epi32(m, m)); }
354
#endif
355
356
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi8(m, v)); }
357
ALWAYS_INLINE GSVector2i uph8(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi8(m, v)); }
358
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi16(m, v)); }
359
ALWAYS_INLINE GSVector2i uph16(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi16(m, v)); }
360
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi32(m, v)); }
361
ALWAYS_INLINE GSVector2i uph32(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi32(m, v)); }
362
363
ALWAYS_INLINE GSVector2i upl8() const { return GSVector2i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
364
ALWAYS_INLINE GSVector2i uph8() const { return GSVector2i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }
365
366
ALWAYS_INLINE GSVector2i upl16() const { return GSVector2i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
367
ALWAYS_INLINE GSVector2i uph16() const { return GSVector2i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }
368
369
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }
370
ALWAYS_INLINE GSVector2i uph32() const { return GSVector2i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }
371
372
#ifdef CPU_ARCH_SSE41
373
ALWAYS_INLINE GSVector2i u8to16() const { return GSVector2i(_mm_cvtepu8_epi16(m)); }
374
ALWAYS_INLINE GSVector2i u8to32() const { return GSVector2i(_mm_cvtepu8_epi32(m)); }
375
ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(_mm_cvtepi16_epi32(m)); }
376
ALWAYS_INLINE GSVector2i u16to32() const { return GSVector2i(_mm_cvtepu16_epi32(m)); }
377
#else
378
// These are a pain, adding only as needed...
379
ALWAYS_INLINE GSVector2i u8to16() const { return upl8(); }
380
ALWAYS_INLINE GSVector2i u8to32() const
381
{
382
return GSVector2i(_mm_unpacklo_epi16(_mm_unpacklo_epi8(m, _mm_setzero_si128()), _mm_setzero_si128()));
383
}
384
385
ALWAYS_INLINE GSVector2i s16to32() const { return upl16().sll32<16>().sra32<16>(); }
386
ALWAYS_INLINE GSVector2i u16to32() const { return upl16(); }
387
#endif
388
389
template<s32 i>
390
ALWAYS_INLINE GSVector2i srl() const
391
{
392
return GSVector2i(_mm_srli_si128(m, i));
393
}
394
395
template<s32 i>
396
ALWAYS_INLINE GSVector2i sll() const
397
{
398
return GSVector2i(_mm_slli_si128(m, i));
399
}
400
401
template<s32 i>
402
ALWAYS_INLINE GSVector2i sll16() const
403
{
404
return GSVector2i(_mm_slli_epi16(m, i));
405
}
406
407
ALWAYS_INLINE GSVector2i sll16(s32 i) const { return GSVector2i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }
408
409
#ifdef CPU_ARCH_AVX2
410
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi16(m, v.m)); }
411
#endif
412
413
template<s32 i>
414
ALWAYS_INLINE GSVector2i srl16() const
415
{
416
return GSVector2i(_mm_srli_epi16(m, i));
417
}
418
419
ALWAYS_INLINE GSVector2i srl16(s32 i) const { return GSVector2i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }
420
421
#ifdef CPU_ARCH_AVX2
422
ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi16(m, v.m)); }
423
#endif
424
425
template<s32 i>
426
ALWAYS_INLINE GSVector2i sra16() const
427
{
428
return GSVector2i(_mm_srai_epi16(m, i));
429
}
430
431
ALWAYS_INLINE GSVector2i sra16(s32 i) const { return GSVector2i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }
432
433
#ifdef CPU_ARCH_AVX2
434
ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi16(m, v.m)); }
435
#endif
436
437
template<s32 i>
438
ALWAYS_INLINE GSVector2i sll32() const
439
{
440
return GSVector2i(_mm_slli_epi32(m, i));
441
}
442
443
ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }
444
445
#ifdef CPU_ARCH_AVX2
446
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi32(m, v.m)); }
447
#endif
448
449
template<s32 i>
450
ALWAYS_INLINE GSVector2i srl32() const
451
{
452
return GSVector2i(_mm_srli_epi32(m, i));
453
}
454
455
ALWAYS_INLINE GSVector2i srl32(s32 i) const { return GSVector2i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }
456
457
#ifdef CPU_ARCH_AVX2
458
ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi32(m, v.m)); }
459
#endif
460
461
template<s32 i>
462
ALWAYS_INLINE GSVector2i sra32() const
463
{
464
return GSVector2i(_mm_srai_epi32(m, i));
465
}
466
467
ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }
468
469
#ifdef CPU_ARCH_AVX2
470
ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi32(m, v.m)); }
471
#endif
472
473
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); }
474
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); }
475
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); }
476
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); }
477
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); }
478
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); }
479
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); }
480
481
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); }
482
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); }
483
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); }
484
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); }
485
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); }
486
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); }
487
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); }
488
489
ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); }
490
ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); }
491
492
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); }
493
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); }
494
495
ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); }
496
497
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi8(m, v.m)); }
498
ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi16(m, v.m)); }
499
ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi32(m, v.m)); }
500
501
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
502
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
503
ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
504
505
ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi8(m, v.m)); }
506
ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi16(m, v.m)); }
507
ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi32(m, v.m)); }
508
509
ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi8(m, v.m)); }
510
ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi16(m, v.m)); }
511
ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi32(m, v.m)); }
512
513
ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi8(m, v.m)); }
514
ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi16(m, v.m)); }
515
ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi32(m, v.m)); }
516
517
ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi8(m, v.m)); }
518
ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi16(m, v.m)); }
519
ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi32(m, v.m)); }
520
521
ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(_mm_andnot_si128(v.m, m)); }
522
523
ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); }
524
525
ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); }
526
ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); }
527
528
template<s32 i>
529
ALWAYS_INLINE GSVector2i insert8(s32 a) const
530
{
531
#ifdef CPU_ARCH_SSE41
532
return GSVector2i(_mm_insert_epi8(m, a, i));
533
#else
534
GSVector2i ret(*this);
535
ret.S8[i] = static_cast<s8>(a);
536
return ret;
537
#endif
538
}
539
540
template<s32 i>
541
ALWAYS_INLINE s32 extract8() const
542
{
543
#ifdef CPU_ARCH_SSE41
544
return _mm_extract_epi8(m, i);
545
#else
546
return S8[i];
547
#endif
548
}
549
550
template<s32 i>
551
ALWAYS_INLINE GSVector2i insert16(s32 a) const
552
{
553
#ifdef CPU_ARCH_SSE41
554
return GSVector2i(_mm_insert_epi16(m, a, i));
555
#else
556
GSVector2i ret(*this);
557
ret.S16[i] = static_cast<s16>(a);
558
return ret;
559
#endif
560
}
561
562
template<s32 i>
563
ALWAYS_INLINE s32 extract16() const
564
{
565
#ifdef CPU_ARCH_SSE41
566
return _mm_extract_epi16(m, i);
567
#else
568
return S16[i];
569
#endif
570
}
571
572
template<s32 i>
573
ALWAYS_INLINE GSVector2i insert32(s32 a) const
574
{
575
#ifdef CPU_ARCH_SSE41
576
return GSVector2i(_mm_insert_epi32(m, a, i));
577
#else
578
GSVector2i ret(*this);
579
ret.S32[i] = a;
580
return ret;
581
#endif
582
}
583
584
template<s32 i>
585
ALWAYS_INLINE s32 extract32() const
586
{
587
#ifdef CPU_ARCH_SSE41
588
return _mm_extract_epi32(m, i);
589
#else
590
if constexpr (i == 0)
591
return _mm_cvtsi128_si32(m);
592
else
593
return S32[i];
594
#endif
595
}
596
597
ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
598
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); }
599
600
template<bool aligned>
601
ALWAYS_INLINE static GSVector2i load(const void* p)
602
{
603
return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
604
}
605
606
template<bool aligned>
607
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
608
{
609
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
610
}
611
612
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }
613
614
ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
615
{
616
m = _mm_and_si128(m, v);
617
return *this;
618
}
619
620
ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v)
621
{
622
m = _mm_or_si128(m, v);
623
return *this;
624
}
625
626
ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v)
627
{
628
m = _mm_xor_si128(m, v);
629
return *this;
630
}
631
632
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
633
{
634
return GSVector2i(_mm_and_si128(v1, v2));
635
}
636
637
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
638
{
639
return GSVector2i(_mm_or_si128(v1, v2));
640
}
641
642
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
643
{
644
return GSVector2i(_mm_xor_si128(v1, v2));
645
}
646
647
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); }
648
649
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); }
650
651
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); }
652
653
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); }
654
655
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); }
656
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
657
658
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }
659
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); }
660
ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 1))); }
661
ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 1, 1))); }
662
};
663
664
class alignas(16) GSVector2
665
{
666
struct cxpr_init_tag
667
{
668
};
669
static constexpr cxpr_init_tag cxpr_init{};
670
671
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
672
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
673
674
public:
675
union
676
{
677
struct
678
{
679
float x, y;
680
};
681
struct
682
{
683
float r, g;
684
};
685
float F32[4];
686
double F64[2];
687
s8 I8[16];
688
s16 I16[8];
689
s32 I32[4];
690
s64 I64[2];
691
u8 U8[16];
692
u16 U16[8];
693
u32 U32[4];
694
u64 U64[2];
695
__m128 m;
696
};
697
698
GSVector2() = default;
699
700
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
701
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
702
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
703
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
704
705
ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); }
706
ALWAYS_INLINE GSVector2(int x, int y)
707
{
708
GSVector2i v_(x, y);
709
m = _mm_cvtepi32_ps(v_.m);
710
}
711
712
ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {}
713
ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {}
714
ALWAYS_INLINE explicit GSVector2(float f) { *this = f; }
715
ALWAYS_INLINE explicit GSVector2(int i)
716
{
717
#ifdef CPU_ARCH_AVX2
718
m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
719
#else
720
*this = GSVector2(GSVector2i(i));
721
#endif
722
}
723
724
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v) : m(_mm_cvtepi32_ps(v)) {}
725
726
ALWAYS_INLINE GSVector2& operator=(float f)
727
{
728
m = _mm_set1_ps(f);
729
return *this;
730
}
731
732
ALWAYS_INLINE GSVector2& operator=(__m128 m_)
733
{
734
m = m_;
735
return *this;
736
}
737
738
ALWAYS_INLINE operator __m128() const { return m; }
739
740
ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); }
741
ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); }
742
ALWAYS_INLINE GSVector2 floor() const
743
{
744
return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
745
}
746
747
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
748
749
ALWAYS_INLINE GSVector2 sat(const GSVector2& min, const GSVector2& max) const
750
{
751
return GSVector2(_mm_min_ps(_mm_max_ps(m, min), max));
752
}
753
754
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
755
756
ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
757
758
ALWAYS_INLINE GSVector2 min(const GSVector2& v) const { return GSVector2(_mm_min_ps(m, v)); }
759
760
ALWAYS_INLINE GSVector2 max(const GSVector2& v) const { return GSVector2(_mm_max_ps(m, v)); }
761
762
template<int mask>
763
ALWAYS_INLINE GSVector2 blend32(const GSVector2& v) const
764
{
765
return GSVector2(_mm_blend_ps(m, v, mask));
766
}
767
768
ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const
769
{
770
return GSVector2(_mm_blendv_ps(m, v, mask));
771
}
772
773
ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const { return GSVector2(_mm_andnot_ps(v.m, m)); }
774
775
ALWAYS_INLINE int mask() const { return (_mm_movemask_ps(m) & 0x3); }
776
777
ALWAYS_INLINE bool alltrue() const { return (mask() == 0x3); }
778
779
ALWAYS_INLINE bool allfalse() const { return (mask() == 0x0); }
780
781
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
782
783
template<int src, int dst>
784
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
785
{
786
#ifdef CPU_ARCH_SSE41
787
if constexpr (src == dst)
788
return GSVector2(_mm_blend_ps(m, v.m, 1 << src));
789
else
790
return GSVector2(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
791
#else
792
GSVector2 ret(*this);
793
ret.F32[dst] = v.F32[src];
794
return ret;
795
#endif
796
}
797
798
template<int i>
799
ALWAYS_INLINE int extract32() const
800
{
801
#ifdef CPU_ARCH_SSE41
802
return _mm_extract_ps(m, i);
803
#else
804
if constexpr (i == 0)
805
return _mm_cvtsi128_si32(_mm_castps_si128(m));
806
else
807
return F32[i];
808
#endif
809
}
810
811
#ifdef CPU_ARCH_SSE41
812
ALWAYS_INLINE float dot(const GSVector2& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0x31)); }
813
#else
814
float dot(const GSVector2& v) const
815
{
816
const __m128 tmp = _mm_mul_ps(m, v.m);
817
float ret;
818
_mm_store_ss(&ret, _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1))));
819
return ret;
820
}
821
#endif
822
823
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(_mm_setzero_ps()); }
824
825
ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }
826
827
template<bool aligned>
828
ALWAYS_INLINE static GSVector2 load(const void* p)
829
{
830
return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
831
}
832
833
template<bool aligned>
834
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
835
{
836
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
837
}
838
839
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
840
841
ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_)
842
{
843
m = _mm_add_ps(m, v_);
844
return *this;
845
}
846
ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_)
847
{
848
m = _mm_sub_ps(m, v_);
849
return *this;
850
}
851
ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_)
852
{
853
m = _mm_mul_ps(m, v_);
854
return *this;
855
}
856
ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_)
857
{
858
m = _mm_div_ps(m, v_);
859
return *this;
860
}
861
862
ALWAYS_INLINE GSVector2& operator+=(float f)
863
{
864
*this += GSVector2(f);
865
return *this;
866
}
867
ALWAYS_INLINE GSVector2& operator-=(float f)
868
{
869
*this -= GSVector2(f);
870
return *this;
871
}
872
ALWAYS_INLINE GSVector2& operator*=(float f)
873
{
874
*this *= GSVector2(f);
875
return *this;
876
}
877
ALWAYS_INLINE GSVector2& operator/=(float f)
878
{
879
*this /= GSVector2(f);
880
return *this;
881
}
882
883
ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_)
884
{
885
m = _mm_and_ps(m, v_);
886
return *this;
887
}
888
ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_)
889
{
890
m = _mm_or_ps(m, v_);
891
return *this;
892
}
893
ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_)
894
{
895
m = _mm_xor_ps(m, v_);
896
return *this;
897
}
898
899
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
900
{
901
return GSVector2(_mm_add_ps(v1, v2));
902
}
903
904
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
905
{
906
return GSVector2(_mm_sub_ps(v1, v2));
907
}
908
909
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
910
{
911
return GSVector2(_mm_mul_ps(v1, v2));
912
}
913
914
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
915
{
916
return GSVector2(_mm_div_ps(v1, v2));
917
}
918
919
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
920
921
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
922
923
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
924
925
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
926
927
ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
928
{
929
return GSVector2(_mm_and_ps(v1, v2));
930
}
931
932
ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
933
{
934
return GSVector2(_mm_or_ps(v1, v2));
935
}
936
937
ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
938
{
939
return GSVector2(_mm_xor_ps(v1, v2));
940
}
941
942
ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
943
{
944
return GSVector2(_mm_cmpeq_ps(v1, v2));
945
}
946
947
ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
948
{
949
return GSVector2(_mm_cmpneq_ps(v1, v2));
950
}
951
952
ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
953
{
954
return GSVector2(_mm_cmpgt_ps(v1, v2));
955
}
956
957
ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
958
{
959
return GSVector2(_mm_cmplt_ps(v1, v2));
960
}
961
962
ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
963
{
964
return GSVector2(_mm_cmpge_ps(v1, v2));
965
}
966
967
ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
968
{
969
return GSVector2(_mm_cmple_ps(v1, v2));
970
}
971
972
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
973
974
ALWAYS_INLINE GSVector2 xy() const { return *this; }
975
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); }
976
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); }
977
ALWAYS_INLINE GSVector2 yy() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 1, 1))); }
978
};
979
980
class alignas(16) GSVector4i
981
{
982
struct cxpr_init_tag
983
{
984
};
985
static constexpr cxpr_init_tag cxpr_init{};
986
987
constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
988
989
constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
990
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
991
{
992
}
993
994
constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
995
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
996
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
997
{
998
}
999
1000
public:
1001
union
1002
{
1003
struct
1004
{
1005
s32 x, y, z, w;
1006
};
1007
struct
1008
{
1009
s32 r, g, b, a;
1010
};
1011
struct
1012
{
1013
s32 left, top, right, bottom;
1014
};
1015
float F32[4];
1016
s8 S8[16];
1017
s16 S16[8];
1018
s32 S32[4];
1019
s64 S64[2];
1020
u8 U8[16];
1021
u16 U16[8];
1022
u32 U32[4];
1023
u64 U64[2];
1024
__m128i m;
1025
};
1026
1027
GSVector4i() = default;
1028
1029
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
1030
{
1031
return GSVector4i(cxpr_init, x, y, z, w);
1032
}
1033
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
1034
1035
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
1036
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1037
{
1038
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
1039
}
1040
1041
ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
1042
s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1043
{
1044
return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
1045
}
1046
1047
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); }
1048
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1049
{
1050
m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
1051
}
1052
1053
ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
1054
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1055
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1056
{
1057
}
1058
1059
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = _mm_unpacklo_epi64(v.m, _mm_setzero_si128()); }
1060
1061
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v)
1062
: m(_mm_unpacklo_epi64(_mm_cvttps_epi32(v), _mm_setzero_si128()))
1063
{
1064
}
1065
1066
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
1067
1068
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
1069
1070
ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {}
1071
1072
ALWAYS_INLINE GSVector4i& operator=(s32 i)
1073
{
1074
m = _mm_set1_epi32(i);
1075
return *this;
1076
}
1077
ALWAYS_INLINE GSVector4i& operator=(__m128i m_)
1078
{
1079
m = m_;
1080
return *this;
1081
}
1082
1083
ALWAYS_INLINE operator __m128i() const { return m; }
1084
1085
ALWAYS_INLINE s32 width() const { return right - left; }
1086
ALWAYS_INLINE s32 height() const { return bottom - top; }
1087
1088
ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
1089
ALWAYS_INLINE bool rempty() const { return (lt32(zwzw()).mask() != 0x00ff); }
1090
ALWAYS_INLINE bool rvalid() const { return ((ge32(zwzw()).mask() & 0xff) == 0); }
1091
1092
ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_s32(v).blend32<0xc>(max_s32(v)); }
1093
1094
ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_s32(v); }
1095
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
1096
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
1097
1098
ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }
1099
1100
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
1101
{
1102
return max_s8(min).min_s8(max);
1103
}
1104
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
1105
{
1106
return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
1107
}
1108
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
1109
{
1110
return max_s16(min).min_s16(max);
1111
}
1112
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
1113
{
1114
return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
1115
}
1116
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
1117
{
1118
return max_s32(min).min_s32(max);
1119
}
1120
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
1121
{
1122
return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
1123
}
1124
1125
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
1126
{
1127
return max_u8(min).min_u8(max);
1128
}
1129
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
1130
{
1131
return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
1132
}
1133
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
1134
{
1135
return max_u16(min).min_u16(max);
1136
}
1137
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
1138
{
1139
return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
1140
}
1141
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
1142
{
1143
return max_u32(min).min_u32(max);
1144
}
1145
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
1146
{
1147
return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
1148
}
1149
1150
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); }
1151
1152
#ifdef CPU_ARCH_SSE41
1153
1154
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); }
1155
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); }
1156
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); }
1157
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); }
1158
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); }
1159
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); }
1160
1161
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); }
1162
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); }
1163
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); }
1164
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); }
1165
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); }
1166
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); }
1167
1168
ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); }
1169
1170
ALWAYS_INLINE s32 addv_s32() const
1171
{
1172
const __m128i pairs = _mm_hadd_epi32(m, m);
1173
return _mm_cvtsi128_si32(_mm_hadd_epi32(pairs, pairs));
1174
}
1175
1176
#define VECTOR4i_REDUCE_8(name, func, ret) \
1177
ALWAYS_INLINE ret name() const \
1178
{ \
1179
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
1180
v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
1181
v = func(v, _mm_srli_epi32(v, 16)); \
1182
v = func(v, _mm_srli_epi16(v, 8)); \
1183
return static_cast<ret>(_mm_extract_epi8(v, 0)); \
1184
}
1185
1186
#define VECTOR4i_REDUCE_16(name, func, ret) \
1187
ALWAYS_INLINE ret name() const \
1188
{ \
1189
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
1190
v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
1191
v = func(v, _mm_srli_epi32(v, 16)); \
1192
return static_cast<ret>(_mm_extract_epi16(v, 0)); \
1193
}
1194
1195
#define VECTOR4i_REDUCE_32(name, func, ret) \
1196
ALWAYS_INLINE ret name() const \
1197
{ \
1198
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
1199
v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
1200
return static_cast<ret>(_mm_extract_epi32(v, 0)); \
1201
}
1202
1203
VECTOR4i_REDUCE_8(minv_s8, _mm_min_epi8, s8);
1204
VECTOR4i_REDUCE_8(maxv_s8, _mm_max_epi8, s8);
1205
VECTOR4i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
1206
VECTOR4i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
1207
VECTOR4i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
1208
VECTOR4i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
1209
VECTOR4i_REDUCE_16(minv_u16, _mm_min_epu16, u16);
1210
VECTOR4i_REDUCE_16(maxv_u16, _mm_max_epu16, u16);
1211
VECTOR4i_REDUCE_32(minv_s32, _mm_min_epi32, s32);
1212
VECTOR4i_REDUCE_32(maxv_s32, _mm_max_epi32, s32);
1213
VECTOR4i_REDUCE_32(minv_u32, _mm_min_epu32, u32);
1214
VECTOR4i_REDUCE_32(maxv_u32, _mm_max_epu32, u32);
1215
1216
#undef VECTOR4i_REDUCE_32
1217
#undef VECTOR4i_REDUCE_16
1218
#undef VECTOR4i_REDUCE_8
1219
1220
#else
1221
1222
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const { return GSVector4i(sse2_min_s8(m, v)); }
1223
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const { return GSVector4i(sse2_max_s8(m, v)); }
1224
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); }
1225
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); }
1226
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(sse2_min_s32(m, v)); }
1227
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(sse2_max_s32(m, v)); }
1228
1229
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); }
1230
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); }
1231
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(sse2_min_u16(m, v)); }
1232
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(sse2_max_u16(m, v)); }
1233
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(sse2_min_u32(m, v)); }
1234
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(sse2_max_u32(m, v)); }
1235
1236
GSVector4i addp_s32() const
1237
{
1238
return GSVector4i(
1239
_mm_shuffle_epi32(_mm_add_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 3, 1, 1))), _MM_SHUFFLE(3, 2, 2, 0)));
1240
}
1241
1242
ALWAYS_INLINE s32 addv_s32() const
1243
{
1244
const __m128i pair1 = _mm_add_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 3, 1, 1))); // 0+1,1+1,2+3,3+3
1245
const __m128i pair2 = _mm_add_epi32(pair1, _mm_shuffle_epi32(pair1, _MM_SHUFFLE(3, 2, 1, 2)));
1246
return _mm_cvtsi128_si32(pair2);
1247
}
1248
1249
#define VECTOR4i_REDUCE_8(name, func, ret) \
1250
ALWAYS_INLINE ret name() const \
1251
{ \
1252
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
1253
v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
1254
v = func(v, _mm_srli_epi32(v, 16)); \
1255
v = func(v, _mm_srli_epi16(v, 8)); \
1256
return static_cast<ret>(_mm_cvtsi128_si32(v)); \
1257
}
1258
1259
#define VECTOR4i_REDUCE_16(name, func, ret) \
1260
ALWAYS_INLINE ret name() const \
1261
{ \
1262
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
1263
v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
1264
v = func(v, _mm_srli_epi32(v, 16)); \
1265
return static_cast<ret>(_mm_cvtsi128_si32(v)); \
1266
}
1267
1268
#define VECTOR4i_REDUCE_32(name, func, ret) \
1269
ALWAYS_INLINE ret name() const \
1270
{ \
1271
__m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
1272
v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
1273
return static_cast<ret>(_mm_cvtsi128_si32(v)); \
1274
}
1275
1276
VECTOR4i_REDUCE_8(minv_s8, sse2_min_s8, s8);
1277
VECTOR4i_REDUCE_8(maxv_s8, sse2_max_s8, s8);
1278
VECTOR4i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
1279
VECTOR4i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
1280
VECTOR4i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
1281
VECTOR4i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
1282
VECTOR4i_REDUCE_16(minv_u16, sse2_min_u16, u16);
1283
VECTOR4i_REDUCE_16(maxv_u16, sse2_max_u16, u16);
1284
VECTOR4i_REDUCE_32(minv_s32, sse2_min_s32, s32);
1285
VECTOR4i_REDUCE_32(maxv_s32, sse2_max_s32, s32);
1286
VECTOR4i_REDUCE_32(minv_u32, sse2_min_u32, u32);
1287
VECTOR4i_REDUCE_32(maxv_u32, sse2_max_u32, u32);
1288
1289
#undef VECTOR4i_REDUCE_32
1290
#undef VECTOR4i_REDUCE_16
1291
#undef VECTOR4i_REDUCE_8
1292
1293
#endif
1294
1295
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
1296
1297
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const
1298
{
1299
#ifdef CPU_ARCH_SSE41
1300
return GSVector4i(_mm_blendv_epi8(m, v, mask));
1301
#else
1302
// NOTE: Assumes the entire lane is set with 1s or 0s.
1303
return (v & mask) | andnot(mask);
1304
#endif
1305
}
1306
1307
template<s32 mask>
1308
ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const
1309
{
1310
#ifdef CPU_ARCH_SSE41
1311
return GSVector4i(_mm_blend_epi16(m, v, mask));
1312
#else
1313
static constexpr GSVector4i vmask =
1314
GSVector4i::cxpr16(((mask) & (1 << 0)) ? -1 : 0x0, ((mask) & (1 << 1)) ? -1 : 0x0, ((mask) & (1 << 2)) ? -1 : 0x0,
1315
((mask) & (1 << 3)) ? -1 : 0x0, ((mask) & (1 << 4)) ? -1 : 0x0, ((mask) & (1 << 5)) ? -1 : 0x0,
1316
((mask) & (1 << 6)) ? -1 : 0x0, ((mask) & (1 << 7)) ? -1 : 0x0);
1317
return (v & vmask) | andnot(vmask);
1318
#endif
1319
}
1320
1321
template<s32 mask>
1322
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1323
{
1324
#ifdef CPU_ARCH_AVX2
1325
return GSVector4i(_mm_blend_epi32(m, v.m, mask));
1326
#else
1327
#ifndef CPU_ARCH_SSE41
1328
// we can do this with a movsd if 0,1 are from a, and 2,3 from b
1329
if constexpr ((mask & 15) == 12)
1330
return GSVector4i(_mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(v.m), _mm_castsi128_pd(m))));
1331
#endif
1332
1333
constexpr s32 bit3 = ((mask & 8) * 3) << 3;
1334
constexpr s32 bit2 = ((mask & 4) * 3) << 2;
1335
constexpr s32 bit1 = ((mask & 2) * 3) << 1;
1336
constexpr s32 bit0 = (mask & 1) * 3;
1337
return blend16 < bit3 | bit2 | bit1 | bit0 > (v);
1338
#endif
1339
}
1340
1341
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1342
{
1343
return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
1344
}
1345
1346
#ifdef CPU_ARCH_SSE41
1347
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); }
1348
#else
1349
GSVector4i shuffle8(const GSVector4i& mask) const
1350
{
1351
GSVector4i ret;
1352
for (size_t i = 0; i < 16; i++)
1353
ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]);
1354
return ret;
1355
}
1356
#endif
1357
1358
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); }
1359
ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); }
1360
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); }
1361
ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); }
1362
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); }
1363
ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); }
1364
#ifdef CPU_ARCH_SSE41
1365
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); }
1366
ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); }
1367
#else
1368
// sign extend so it matches
1369
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1370
{
1371
return GSVector4i(_mm_packs_epi32(sll32<16>().sra32<16>(), v.sll32<16>().sra32<16>()));
1372
}
1373
ALWAYS_INLINE GSVector4i pu32() const
1374
{
1375
const GSVector4i tmp = sll32<16>().sra32<16>();
1376
return GSVector4i(_mm_packs_epi32(tmp.m, tmp.m));
1377
}
1378
#endif
1379
1380
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); }
1381
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); }
1382
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); }
1383
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); }
1384
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); }
1385
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); }
1386
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); }
1387
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); }
1388
1389
ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
1390
ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }
1391
1392
ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
1393
ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }
1394
1395
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }
1396
1397
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }
1398
ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); }
1399
ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); }
1400
1401
ALWAYS_INLINE GSVector4i s8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); }
1402
ALWAYS_INLINE GSVector4i s8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); }
1403
ALWAYS_INLINE GSVector4i s8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); }
1404
1405
#ifdef CPU_ARCH_SSE41
1406
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); }
1407
ALWAYS_INLINE GSVector4i s16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); }
1408
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); }
1409
ALWAYS_INLINE GSVector4i u8to16() const { return GSVector4i(_mm_cvtepu8_epi16(m)); }
1410
ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); }
1411
ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); }
1412
ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); }
1413
ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); }
1414
ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); }
1415
#else
1416
// These are a pain, adding only as needed...
1417
ALWAYS_INLINE GSVector4i u8to32() const
1418
{
1419
return GSVector4i(_mm_unpacklo_epi16(_mm_unpacklo_epi8(m, _mm_setzero_si128()), _mm_setzero_si128()));
1420
}
1421
1422
ALWAYS_INLINE GSVector4i u16to32() const { return upl16(); }
1423
ALWAYS_INLINE GSVector4i s16to32() const { return upl16().sll32<16>().sra32<16>(); }
1424
ALWAYS_INLINE GSVector4i u8to16() const { return upl8(); }
1425
#endif

template<s32 i>
ALWAYS_INLINE GSVector4i srl() const
{
return GSVector4i(_mm_srli_si128(m, i));
}

template<s32 i>
ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
{
return GSVector4i(_mm_alignr_epi8(v.m, m, i));
}

template<s32 i>
ALWAYS_INLINE GSVector4i sll() const
{
return GSVector4i(_mm_slli_si128(m, i));
}

template<s32 i>
ALWAYS_INLINE GSVector4i sll16() const
{
return GSVector4i(_mm_slli_epi16(m, i));
}

ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); }
#endif

template<s32 i>
ALWAYS_INLINE GSVector4i srl16() const
{
return GSVector4i(_mm_srli_epi16(m, i));
}

ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); }
#endif

template<s32 i>
ALWAYS_INLINE GSVector4i sra16() const
{
return GSVector4i(_mm_srai_epi16(m, i));
}

ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); }
#endif

template<s32 i>
ALWAYS_INLINE GSVector4i sll32() const
{
return GSVector4i(_mm_slli_epi32(m, i));
}

ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); }
#endif

template<s32 i>
ALWAYS_INLINE GSVector4i srl32() const
{
return GSVector4i(_mm_srli_epi32(m, i));
}

ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); }
#endif

template<s32 i>
ALWAYS_INLINE GSVector4i sra32() const
{
return GSVector4i(_mm_srai_epi32(m, i));
}

ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); }
#endif

template<s64 i>
ALWAYS_INLINE GSVector4i sll64() const
{
return GSVector4i(_mm_slli_epi64(m, i));
}

ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); }
#endif

template<s64 i>
ALWAYS_INLINE GSVector4i srl64() const
{
return GSVector4i(_mm_srli_epi64(m, i));
}

ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); }
#endif

ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); }
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); }
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); }

ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); }
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); }
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); }

ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); }

#ifdef CPU_ARCH_SSE41
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); }
#else
// We can abuse the fact that signed and unsigned multiplies are the same.
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const
{
return GSVector4i(_mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(_mm_mul_epu32(_mm_unpacklo_epi32(m, _mm_setzero_si128()),
_mm_unpacklo_epi32(v.m, _mm_setzero_si128()))), // x,y
_mm_castsi128_ps(_mm_mul_epu32(_mm_unpackhi_epi32(m, _mm_setzero_si128()),
_mm_unpackhi_epi32(v.m, _mm_setzero_si128()))), // z,w
_MM_SHUFFLE(2, 0, 2, 0))));
}
#endif
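// The SSE2 mul32l() fallback is valid because only the low 32 bits of each product are kept, and those are
// identical for signed and unsigned operands: _mm_mul_epu32 forms full 64-bit products of the even lanes of each
// zero-interleaved half, and _MM_SHUFFLE(2, 0, 2, 0) gathers the low dword of every product back into one vector.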

ALWAYS_INLINE bool eq(const GSVector4i& v) const
{
#ifdef CPU_ARCH_SSE41
const GSVector4i t = *this ^ v;
return _mm_testz_si128(t, t) != 0;
#else
return eq8(v).alltrue();
#endif
}
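// eq() avoids a movemask on SSE4.1: XORing the operands yields all-zero bits only when they are identical, and
// _mm_testz_si128(t, t) returns nonzero exactly when every bit of t is zero.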

ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); }
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); }

ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }

ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); }

ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); }

ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; }

ALWAYS_INLINE bool allfalse() const
{
#ifdef CPU_ARCH_SSE41
return _mm_testz_si128(m, m) != 0;
#else
return mask() == 0;
#endif
}

template<s32 i>
ALWAYS_INLINE GSVector4i insert8(s32 a) const
{
#ifdef CPU_ARCH_SSE41
return GSVector4i(_mm_insert_epi8(m, a, i));
#else
GSVector4i ret(*this);
ret.S8[i] = static_cast<s8>(a);
return ret;
#endif
}

template<s32 i>
ALWAYS_INLINE s32 extract8() const
{
#ifdef CPU_ARCH_SSE41
return _mm_extract_epi8(m, i);
#else
return S8[i];
#endif
}

template<s32 i>
ALWAYS_INLINE GSVector4i insert16(s32 a) const
{
#ifdef CPU_ARCH_SSE41
return GSVector4i(_mm_insert_epi16(m, a, i));
#else
GSVector4i ret(*this);
ret.S16[i] = static_cast<s16>(a);
return ret;
#endif
}

template<s32 i>
ALWAYS_INLINE s32 extract16() const
{
#ifdef CPU_ARCH_SSE41
return _mm_extract_epi16(m, i);
#else
return S16[i];
#endif
}

template<s32 i>
ALWAYS_INLINE GSVector4i insert32(s32 a) const
{
#ifdef CPU_ARCH_SSE41
return GSVector4i(_mm_insert_epi32(m, a, i));
#else
GSVector4i ret(*this);
ret.S32[i] = a;
return ret;
#endif
}

template<s32 i>
ALWAYS_INLINE s32 extract32() const
{
#ifdef CPU_ARCH_SSE41
return _mm_extract_epi32(m, i);
#else
if constexpr (i == 0)
return _mm_cvtsi128_si32(m);
else
return S32[i];
#endif
}

template<s32 i>
ALWAYS_INLINE GSVector4i insert64(s64 a) const
{
#ifdef CPU_ARCH_SSE41
return GSVector4i(_mm_insert_epi64(m, a, i));
#else
GSVector4i ret(*this);
ret.S64[i] = a;
return ret;
#endif
}

template<s32 i>
ALWAYS_INLINE s64 extract64() const
{
#ifdef CPU_ARCH_SSE41
return _mm_extract_epi64(m, i);
#else
return S64[i];
#endif
}

ALWAYS_INLINE static GSVector4i loadnt(const void* p)
{
// Should be const, but isn't...
return GSVector4i(_mm_stream_load_si128(const_cast<__m128i*>(static_cast<const __m128i*>(p))));
}

ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); }

template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}

ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v)
{
return GSVector4i(_mm_unpacklo_epi64(v.m, _mm_setzero_si128()));
}

template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
}

ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
{
return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));
}

template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
{
return GSVector4i(aligned ? _mm_load_si128(static_cast<const __m128i*>(p)) :
_mm_loadu_si128(static_cast<const __m128i*>(p)));
}

ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }

template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
}

template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
_mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
}

template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
{
if constexpr (aligned)
_mm_store_si128(static_cast<__m128i*>(p), v.m);
else
_mm_storeu_si128(static_cast<__m128i*>(p), v.m);
}

ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); }

ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v)
{
m = _mm_and_si128(m, v);
return *this;
}
ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v)
{
m = _mm_or_si128(m, v);
return *this;
}
ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v)
{
m = _mm_xor_si128(m, v);
return *this;
}

ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
{
return GSVector4i(_mm_and_si128(v1, v2));
}

ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
{
return GSVector4i(_mm_or_si128(v1, v2));
}

ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
{
return GSVector4i(_mm_xor_si128(v1, v2));
}

ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }

ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); }
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);

ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }

template<bool aligned>
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
{
return load<aligned>(v);
}

ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }

ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw)
{
return GSVector4i(_mm_unpacklo_epi64(xyzw.m, xyzw.m));
}

ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
{
return GSVector4i(_mm_unpacklo_epi64(xy.m, zw.m));
}

static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);

ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }

ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); }

#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
{ \
return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn))); \
} \
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
{ \
return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \
} \
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
{ \
return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \
}

#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4i_SHUFFLE_1(xs, xn) \
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4i_SHUFFLE_2(xs, xn, w, 3)

VECTOR4i_SHUFFLE_1(x, 0);
VECTOR4i_SHUFFLE_1(y, 1);
VECTOR4i_SHUFFLE_1(z, 2);
VECTOR4i_SHUFFLE_1(w, 3)

#undef VECTOR4i_SHUFFLE_1
#undef VECTOR4i_SHUFFLE_2
#undef VECTOR4i_SHUFFLE_3
#undef VECTOR4i_SHUFFLE_4
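// The VECTOR4i_SHUFFLE_* macros above generate every xyzw permutation: xxxx() .. wwww() shuffle 32-bit lanes with
// _mm_shuffle_epi32, while the ...l() and ...h() variants apply the same pattern to the low/high four 16-bit words
// via _mm_shufflelo_epi16/_mm_shufflehi_epi16.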
};

class alignas(16) GSVector4
{
struct cxpr_init_tag
{
};
static constexpr cxpr_init_tag cxpr_init{};

constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}

constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}

constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}

constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}

public:
union
{
struct
{
float x, y, z, w;
};
struct
{
float r, g, b, a;
};
struct
{
float left, top, right, bottom;
};
float F32[4];
double F64[2];
s8 I8[16];
s16 I16[8];
s32 I32[4];
s64 I64[2];
u8 U8[16];
u16 U16[8];
u32 U32[4];
u64 U64[2];
__m128 m;
};

GSVector4() = default;

constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }

constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }

constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }

ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
{
GSVector4i v_(x, y, z, w);
m = _mm_cvtepi32_ps(v_.m);
}
ALWAYS_INLINE GSVector4(int x, int y)
{
m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
}

ALWAYS_INLINE explicit GSVector4(const GSVector2& v)
: m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(v.m), _mm_setzero_pd())))
{
}
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
: m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd())))
{
}

ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {}

ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {}

ALWAYS_INLINE explicit GSVector4(float f) { *this = f; }

ALWAYS_INLINE explicit GSVector4(int i)
{
#ifdef CPU_ARCH_AVX2
m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
#else
*this = GSVector4(GSVector4i(i));
#endif
}

ALWAYS_INLINE explicit GSVector4(const GSVector4i& v) : m(_mm_cvtepi32_ps(v)) {}

ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); }

ALWAYS_INLINE GSVector4& operator=(float f)
{
m = _mm_set1_ps(f);
return *this;
}

ALWAYS_INLINE GSVector4& operator=(__m128 m_)
{
this->m = m_;
return *this;
}

ALWAYS_INLINE operator __m128() const { return m; }

u32 rgba32() const { return GSVector4i(*this).rgba32(); }

ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
{
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
}

ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }

ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); }

ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); }

ALWAYS_INLINE GSVector4 floor() const
{
return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
}

ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); }

ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); }

ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); }

ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); }

#ifdef CPU_ARCH_SSE41
ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); }
ALWAYS_INLINE float addv() const
{
const __m128 pairs = _mm_hadd_ps(m, m);
return _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
}
#else
float dot(const GSVector4& v) const
{
__m128 tmp = _mm_mul_ps(m, v.m);
tmp = _mm_add_ps(tmp, _mm_movehl_ps(tmp, tmp)); // (x+z, y+w, ..., ...)
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
return _mm_cvtss_f32(tmp);
}
float addv() const
{
__m128 tmp = _mm_add_ps(m, _mm_movehl_ps(m, m)); // (x+z, y+w, ..., ...)
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
return _mm_cvtss_f32(tmp);
}
#endif
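// The pre-SSE4.1 dot()/addv() reduce horizontally by hand: _mm_movehl_ps folds the upper pair onto the lower pair
// (giving x+z and y+w), and the final add_ss with lane 1 shuffled into lane 0 leaves the full sum in lane 0.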

ALWAYS_INLINE float minv() const
{
__m128 v = _mm_min_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)));
v = _mm_min_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(v);
}

ALWAYS_INLINE float maxv() const
{
__m128 v = _mm_max_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)));
v = _mm_max_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(v);
}

ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
{
return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
}

ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const
{
return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw()));
}

ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }

ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }

ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); }

ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); }

template<int mask>
ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const
{
return GSVector4(_mm_blend_ps(m, v, mask));
}

ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const
{
#ifdef CPU_ARCH_SSE41
return GSVector4(_mm_blendv_ps(m, v, mask));
#else
// NOTE: Assumes the entire lane is set with 1s or 0s.
return (v & mask) | andnot(mask);
#endif
}

ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); }

ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); }

ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const
{
return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
}

ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const
{
return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
}

ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); }

ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); }

ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); }

ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); }

ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; }

ALWAYS_INLINE bool allfalse() const
{
#ifdef CPU_ARCH_AVX2
return _mm_testz_ps(m, m) != 0;
#else
const __m128i ii = _mm_castps_si128(m);
return _mm_testz_si128(ii, ii) != 0;
#endif
}

ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
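// replace_nan() relies on NaN comparing unequal to itself: (*this == *this) is all-ones for non-NaN lanes and
// all-zeros for NaN lanes, so the blend keeps this vector's valid lanes and takes v's lanes where a NaN was found.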

template<int src, int dst>
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
{
#ifdef CPU_ARCH_SSE41
if constexpr (src == dst)
return GSVector4(_mm_blend_ps(m, v.m, 1 << src));
else
return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
#else
GSVector4 ret(*this);
ret.F32[dst] = v.F32[src];
return ret;
#endif
}

template<int i>
ALWAYS_INLINE GSVector4 insert32(float v) const
{
#ifdef CPU_ARCH_SSE41
if constexpr (i == 0)
return GSVector4(_mm_move_ss(m, _mm_load_ss(&v)));
else
return GSVector4(_mm_insert_ps(m, _mm_load_ss(&v), _MM_MK_INSERTPS_NDX(0, i, 0)));
#else
GSVector4 ret(*this);
ret.F32[i] = v;
return ret;
#endif
}

template<int i>
ALWAYS_INLINE float extract32() const
{
#ifdef CPU_ARCH_SSE41
if constexpr (i == 0)
return _mm_cvtss_f32(m);
else
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i)));
#else
return F32[i];
#endif
}

template<int dst>
ALWAYS_INLINE GSVector4 insert64(double v) const
{
#ifdef CPU_ARCH_SSE41
if constexpr (dst == 0)
return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_sd(&v)));
else
return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_sd(&v), 0));
#else
GSVector4 ret(*this);
ret.F64[dst] = v;
return ret;
#endif
}

template<int src>
ALWAYS_INLINE double extract64() const
{
double ret;
if constexpr (src == 0)
_mm_storel_pd(&ret, _mm_castps_pd(m));
else
_mm_storeh_pd(&ret, _mm_castps_pd(m));
return ret;
}

ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);

ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
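// xffffffff() builds an all-ones mask without loading a constant: comparing zero() with itself makes every lane
// compare equal, so each lane ends up as 0xFFFFFFFF.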

template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
}

template<bool aligned>
ALWAYS_INLINE static GSVector4 load(const void* p)
{
return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p)));
}

ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }

template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
}

template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
_mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
}

template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
{
if constexpr (aligned)
_mm_store_ps(static_cast<float*>(p), v.m);
else
_mm_storeu_ps(static_cast<float*>(p), v.m);
}

ALWAYS_INLINE static void store32(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }

ALWAYS_INLINE GSVector4 operator-() const { return neg(); }

ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_)
{
m = _mm_add_ps(m, v_);
return *this;
}

ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_)
{
m = _mm_sub_ps(m, v_);
return *this;
}

ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_)
{
m = _mm_mul_ps(m, v_);
return *this;
}

ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_)
{
m = _mm_div_ps(m, v_);
return *this;
}

ALWAYS_INLINE GSVector4& operator+=(float f)
{
*this += GSVector4(f);
return *this;
}

ALWAYS_INLINE GSVector4& operator-=(float f)
{
*this -= GSVector4(f);
return *this;
}

ALWAYS_INLINE GSVector4& operator*=(float f)
{
*this *= GSVector4(f);
return *this;
}

ALWAYS_INLINE GSVector4& operator/=(float f)
{
*this /= GSVector4(f);
return *this;
}

ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_)
{
m = _mm_and_ps(m, v_);
return *this;
}

ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_)
{
m = _mm_or_ps(m, v_);
return *this;
}

ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_)
{
m = _mm_xor_ps(m, v_);
return *this;
}

ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_add_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_sub_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_mul_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_div_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }

ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }

ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }

ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }

ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_and_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_or_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_xor_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_cmpeq_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_cmpneq_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_cmpgt_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_cmplt_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_cmpge_ps(v1, v2));
}

ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(_mm_cmple_ps(v1, v2));
}

ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const
{
return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
}

ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const
{
return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
}

ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const
{
return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
}

ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const
{
return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
}

ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const
{
return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const
{
return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const
{
return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const
{
return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const
{
return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const
{
return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const
{
return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}

ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }

ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
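// abs64()/neg64() treat the two packed doubles purely as bit patterns: clearing bit 63 of each lane drops the sign
// (absolute value), while XORing bit 63 flips it (negation).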

ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); }

ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); }

ALWAYS_INLINE GSVector4 floor64() const
{
return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
}

ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }

ALWAYS_INLINE static GSVector4 f32to64(const void* p)
{
return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
}

ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); }

ALWAYS_INLINE GSVector2 xy() const { return GSVector2(m); }

ALWAYS_INLINE GSVector2 zw() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2))); }

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
{
return GSVector4(_mm_movelh_ps(l.m, h.m));
}

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(_mm_movelh_ps(l.m, l.m)); }

#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
{ \
return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); \
}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4_SHUFFLE_2(xs, xn, w, 3);

VECTOR4_SHUFFLE_1(x, 0);
VECTOR4_SHUFFLE_1(y, 1);
VECTOR4_SHUFFLE_1(z, 2);
VECTOR4_SHUFFLE_1(w, 3);

#undef VECTOR4_SHUFFLE_1
#undef VECTOR4_SHUFFLE_2
#undef VECTOR4_SHUFFLE_3
#undef VECTOR4_SHUFFLE_4

#if CPU_ARCH_AVX2

ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); }

ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); }

ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
{
return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
}

#else

ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0))); }
ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
return GSVector4(_mm_shuffle_ps(v.m, v.m, _MM_SHUFFLE(0, 0, 0, 0)));
}
ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
{
const __m128 v = _mm_load_ss(static_cast<const float*>(f));
return GSVector4(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)));
}

#endif

ALWAYS_INLINE static GSVector4 broadcast64(const void* d)
{
return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
}
};

ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
m = _mm_cvttps_epi32(v);
}

ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
return GSVector2i(_mm_castps_si128(v.m));
}

ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
return GSVector2(_mm_castsi128_ps(v.m));
}

ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
m = _mm_cvttps_epi32(v);
}

ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(_mm_castps_si128(v.m));
}

ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(_mm_castsi128_ps(v.m));
}

#ifdef GSVECTOR_HAS_256

class alignas(32) GSVector8i
{
struct cxpr_init_tag
{
};
static constexpr cxpr_init_tag cxpr_init{};

constexpr GSVector8i(cxpr_init_tag, s32 x0, s32 y0, s32 z0, s32 w0, s32 x1, s32 y1, s32 z1, s32 w1)
: S32{x0, y0, z0, w0, x1, y1, z1, w1}
{
}

constexpr GSVector8i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7, s16 s8, s16 s9,
s16 s10, s16 s11, s16 s12, s16 s13, s16 s14, s16 s15)
: S16{s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15}
{
}

public:
union
{
struct
{
s32 x0, y0, z0, w0, x1, y1, z1, w1;
};
struct
{
s32 r0, g0, b0, a0, r1, g1, b1, a1;
};

float F32[8];
s8 S8[32];
s16 S16[16];
s32 S32[8];
s64 S64[4];
u8 U8[32];
u16 U16[16];
u32 U32[8];
u64 U64[4];
__m256i m;
};

GSVector8i() = default;

ALWAYS_INLINE constexpr static GSVector8i cxpr(s32 x0, s32 y0, s32 z0, s32 w0, s32 x1, s32 y1, s32 z1, s32 w1)
{
return GSVector8i(cxpr_init, x0, y0, z0, w0, x1, y1, z1, w1);
}
ALWAYS_INLINE constexpr static GSVector8i cxpr(s32 x) { return GSVector8i(cxpr_init, x, x, x, x, x, x, x, x); }

ALWAYS_INLINE constexpr static GSVector8i cxpr16(s16 x)
{
return GSVector8i(cxpr_init, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
}
ALWAYS_INLINE constexpr static GSVector8i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7,
s16 s8, s16 s9, s16 s10, s16 s11, s16 s12, s16 s13, s16 s14, s16 s15)
{
return GSVector8i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15);
}

ALWAYS_INLINE explicit GSVector8i(s32 i) { *this = i; }

ALWAYS_INLINE constexpr explicit GSVector8i(__m256i m) : m(m) {}

ALWAYS_INLINE GSVector8i& operator=(s32 i)
{
m = _mm256_set1_epi32(i);
return *this;
}
ALWAYS_INLINE GSVector8i& operator=(__m256i m_)
{
m = m_;
return *this;
}

ALWAYS_INLINE operator __m256i() const { return m; }

ALWAYS_INLINE GSVector8i min_s8(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi8(m, v)); }
ALWAYS_INLINE GSVector8i max_s8(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi8(m, v)); }
ALWAYS_INLINE GSVector8i min_s16(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi16(m, v)); }
ALWAYS_INLINE GSVector8i max_s16(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi16(m, v)); }
ALWAYS_INLINE GSVector8i min_s32(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi32(m, v)); }
ALWAYS_INLINE GSVector8i max_s32(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi32(m, v)); }

ALWAYS_INLINE GSVector8i min_u8(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu8(m, v)); }
ALWAYS_INLINE GSVector8i max_u8(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu8(m, v)); }
ALWAYS_INLINE GSVector8i min_u16(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu16(m, v)); }
ALWAYS_INLINE GSVector8i max_u16(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu16(m, v)); }
ALWAYS_INLINE GSVector8i min_u32(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu32(m, v)); }
ALWAYS_INLINE GSVector8i max_u32(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu32(m, v)); }

ALWAYS_INLINE GSVector8i madd_s16(const GSVector8i& v) const { return GSVector8i(_mm256_madd_epi16(m, v.m)); }

ALWAYS_INLINE GSVector8i clamp8() const { return pu16().upl8(); }
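// clamp8() clamps each signed 16-bit lane to [0, 255]: pu16() saturates to unsigned bytes with _mm256_packus_epi16,
// and the zero-extending upl8() widens the low eight bytes of each 128-bit half back to 16 bits.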

ALWAYS_INLINE GSVector8i blend8(const GSVector8i& v, const GSVector8i& mask) const
{
return GSVector8i(_mm256_blendv_epi8(m, v, mask));
}

template<s32 mask>
ALWAYS_INLINE GSVector8i blend16(const GSVector8i& v) const
{
return GSVector8i(_mm256_blend_epi16(m, v, mask));
}

template<s32 mask>
ALWAYS_INLINE GSVector8i blend32(const GSVector8i& v) const
{
return GSVector8i(_mm256_blend_epi32(m, v.m, mask));
}

ALWAYS_INLINE GSVector8i blend(const GSVector8i& v, const GSVector8i& mask) const
{
return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, v)));
}

ALWAYS_INLINE GSVector8i shuffle8(const GSVector8i& mask) const { return GSVector8i(_mm256_shuffle_epi8(m, mask)); }

ALWAYS_INLINE GSVector8i ps16(const GSVector8i& v) const { return GSVector8i(_mm256_packs_epi16(m, v)); }
ALWAYS_INLINE GSVector8i ps16() const { return GSVector8i(_mm256_packs_epi16(m, m)); }
ALWAYS_INLINE GSVector8i pu16(const GSVector8i& v) const { return GSVector8i(_mm256_packus_epi16(m, v)); }
ALWAYS_INLINE GSVector8i pu16() const { return GSVector8i(_mm256_packus_epi16(m, m)); }
ALWAYS_INLINE GSVector8i ps32(const GSVector8i& v) const { return GSVector8i(_mm256_packs_epi32(m, v)); }
ALWAYS_INLINE GSVector8i ps32() const { return GSVector8i(_mm256_packs_epi32(m, m)); }
ALWAYS_INLINE GSVector8i pu32(const GSVector8i& v) const { return GSVector8i(_mm256_packus_epi32(m, v)); }
ALWAYS_INLINE GSVector8i pu32() const { return GSVector8i(_mm256_packus_epi32(m, m)); }

ALWAYS_INLINE GSVector8i upl8(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi8(m, v)); }
ALWAYS_INLINE GSVector8i uph8(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi8(m, v)); }
ALWAYS_INLINE GSVector8i upl16(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi16(m, v)); }
ALWAYS_INLINE GSVector8i uph16(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi16(m, v)); }
ALWAYS_INLINE GSVector8i upl32(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi32(m, v)); }
ALWAYS_INLINE GSVector8i uph32(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi32(m, v)); }
ALWAYS_INLINE GSVector8i upl64(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi64(m, v)); }
ALWAYS_INLINE GSVector8i uph64(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi64(m, v)); }

ALWAYS_INLINE GSVector8i upl8() const { return GSVector8i(_mm256_unpacklo_epi8(m, _mm256_setzero_si256())); }
ALWAYS_INLINE GSVector8i uph8() const { return GSVector8i(_mm256_unpackhi_epi8(m, _mm256_setzero_si256())); }

ALWAYS_INLINE GSVector8i upl16() const { return GSVector8i(_mm256_unpacklo_epi16(m, _mm256_setzero_si256())); }
ALWAYS_INLINE GSVector8i uph16() const { return GSVector8i(_mm256_unpackhi_epi16(m, _mm256_setzero_si256())); }

ALWAYS_INLINE GSVector8i upl32() const { return GSVector8i(_mm256_unpacklo_epi32(m, _mm256_setzero_si256())); }

ALWAYS_INLINE GSVector8i uph32() const { return GSVector8i(_mm256_unpackhi_epi32(m, _mm256_setzero_si256())); }
ALWAYS_INLINE GSVector8i upl64() const { return GSVector8i(_mm256_unpacklo_epi64(m, _mm256_setzero_si256())); }
ALWAYS_INLINE GSVector8i uph64() const { return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256())); }

ALWAYS_INLINE GSVector8i s8to16() const { return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i s8to32() const { return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i s8to64() const { return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m))); }

ALWAYS_INLINE GSVector8i s16to32() const { return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i s16to64() const { return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i s32to64() const { return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i u8to16() const { return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i u8to32() const { return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i u8to64() const { return GSVector8i(_mm256_cvtepu8_epi64(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i u16to32() const { return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i u16to64() const { return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m))); }
ALWAYS_INLINE GSVector8i u32to64() const { return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m))); }

ALWAYS_INLINE static GSVector8i s8to16(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi16(v.m)); }
ALWAYS_INLINE static GSVector8i s8to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi32(v.m)); }
ALWAYS_INLINE static GSVector8i s8to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi64(v.m)); }

ALWAYS_INLINE static GSVector8i s16to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi16_epi32(v.m)); }
ALWAYS_INLINE static GSVector8i s16to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi16_epi64(v.m)); }
ALWAYS_INLINE static GSVector8i s32to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi32_epi64(v.m)); }
ALWAYS_INLINE static GSVector8i u8to16(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi16(v.m)); }
ALWAYS_INLINE static GSVector8i u8to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi32(v.m)); }
ALWAYS_INLINE static GSVector8i u8to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi64(v.m)); }
ALWAYS_INLINE static GSVector8i u16to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi32(v.m)); }
ALWAYS_INLINE static GSVector8i u16to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi64(v.m)); }
ALWAYS_INLINE static GSVector8i u32to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu32_epi64(v.m)); }

template<s32 i>
ALWAYS_INLINE GSVector8i srl() const
{
return GSVector8i(_mm256_srli_si256(m, i));
}

template<s32 i>
ALWAYS_INLINE GSVector8i srl(const GSVector8i& v)
{
return GSVector8i(_mm256_alignr_epi8(v.m, m, i));
}

template<s32 i>
ALWAYS_INLINE GSVector8i sll() const
{
return GSVector8i(_mm256_slli_si256(m, i));
}

template<s32 i>
ALWAYS_INLINE GSVector8i sll16() const
{
return GSVector8i(_mm256_slli_epi16(m, i));
}

ALWAYS_INLINE GSVector8i sll16(s32 i) const { return GSVector8i(_mm256_sll_epi16(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i sllv16(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi16(m, v.m)); }

template<s32 i>
ALWAYS_INLINE GSVector8i srl16() const
{
return GSVector8i(_mm256_srli_epi16(m, i));
}

ALWAYS_INLINE GSVector8i srl16(s32 i) const { return GSVector8i(_mm256_srl_epi16(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i srlv16(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi16(m, v.m)); }

template<s32 i>
ALWAYS_INLINE GSVector8i sra16() const
{
return GSVector8i(_mm256_srai_epi16(m, i));
}

ALWAYS_INLINE GSVector8i sra16(s32 i) const { return GSVector8i(_mm256_sra_epi16(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i srav16(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi16(m, v.m)); }

template<s32 i>
ALWAYS_INLINE GSVector8i sll32() const
{
return GSVector8i(_mm256_slli_epi32(m, i));
}

ALWAYS_INLINE GSVector8i sll32(s32 i) const { return GSVector8i(_mm256_sll_epi32(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i sllv32(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi32(m, v.m)); }

template<s32 i>
ALWAYS_INLINE GSVector8i srl32() const
{
return GSVector8i(_mm256_srli_epi32(m, i));
}

ALWAYS_INLINE GSVector8i srl32(s32 i) const { return GSVector8i(_mm256_srl_epi32(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i srlv32(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi32(m, v.m)); }

template<s32 i>
ALWAYS_INLINE GSVector8i sra32() const
{
return GSVector8i(_mm256_srai_epi32(m, i));
}

ALWAYS_INLINE GSVector8i sra32(s32 i) const { return GSVector8i(_mm256_sra_epi32(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i srav32(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi32(m, v.m)); }

template<s64 i>
ALWAYS_INLINE GSVector8i sll64() const
{
return GSVector8i(_mm256_slli_epi64(m, i));
}

ALWAYS_INLINE GSVector8i sll64(s32 i) const { return GSVector8i(_mm256_sll_epi64(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i sllv64(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi64(m, v.m)); }

template<s64 i>
ALWAYS_INLINE GSVector8i srl64() const
{
return GSVector8i(_mm256_srli_epi64(m, i));
}

ALWAYS_INLINE GSVector8i srl64(s32 i) const { return GSVector8i(_mm256_srl_epi64(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i srlv64(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi64(m, v.m)); }

template<s64 i>
ALWAYS_INLINE GSVector8i sra64() const
{
return GSVector8i(_mm256_srai_epi64(m, i));
}

ALWAYS_INLINE GSVector8i sra64(s32 i) const { return GSVector8i(_mm256_sra_epi64(m, _mm_cvtsi32_si128(i))); }
ALWAYS_INLINE GSVector8i srav64(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi64(m, v.m)); }

ALWAYS_INLINE GSVector8i add8(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i add16(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i add32(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi32(m, v.m)); }
ALWAYS_INLINE GSVector8i adds8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i adds16(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i hadds16(const GSVector8i& v) const { return GSVector8i(_mm256_hadds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i addus8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu8(m, v.m)); }
ALWAYS_INLINE GSVector8i addus16(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu16(m, v.m)); }

ALWAYS_INLINE GSVector8i sub8(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i sub16(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i sub32(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi32(m, v.m)); }
ALWAYS_INLINE GSVector8i subs8(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i subs16(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i subus8(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epu8(m, v.m)); }
ALWAYS_INLINE GSVector8i subus16(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epu16(m, v.m)); }

ALWAYS_INLINE GSVector8i mul16hs(const GSVector8i& v) const { return GSVector8i(_mm256_mulhi_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i mul16l(const GSVector8i& v) const { return GSVector8i(_mm256_mullo_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i mul16hrs(const GSVector8i& v) const { return GSVector8i(_mm256_mulhrs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i mul32l(const GSVector8i& v) const { return GSVector8i(_mm256_mullo_epi32(m, v.m)); }

ALWAYS_INLINE bool eq(const GSVector8i& v) const
{
const GSVector8i t = *this ^ v;
return _mm256_testz_si256(t, t) != 0;
}

ALWAYS_INLINE GSVector8i eq8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i eq16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i eq32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi32(m, v.m)); }
ALWAYS_INLINE GSVector8i eq64(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi64(m, v.m)); }

ALWAYS_INLINE GSVector8i neq8(const GSVector8i& v) const { return ~eq8(v); }
ALWAYS_INLINE GSVector8i neq16(const GSVector8i& v) const { return ~eq16(v); }
ALWAYS_INLINE GSVector8i neq32(const GSVector8i& v) const { return ~eq32(v); }

ALWAYS_INLINE GSVector8i gt8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i gt16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i gt32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector8i ge8(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi8(v.m, m)); }
ALWAYS_INLINE GSVector8i ge16(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi16(v.m, m)); }
ALWAYS_INLINE GSVector8i ge32(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi32(v.m, m)); }

ALWAYS_INLINE GSVector8i lt8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi8(v.m, m)); }
ALWAYS_INLINE GSVector8i lt16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi16(v.m, m)); }
ALWAYS_INLINE GSVector8i lt32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi32(v.m, m)); }

ALWAYS_INLINE GSVector8i le8(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i le16(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i le32(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector8i andnot(const GSVector8i& v) const { return GSVector8i(_mm256_andnot_si256(v.m, m)); }

ALWAYS_INLINE u32 mask() const { return static_cast<u32>(_mm256_movemask_epi8(m)); }

ALWAYS_INLINE bool alltrue() const { return mask() == 0xFFFFFFFFu; }

ALWAYS_INLINE bool allfalse() const { return _mm256_testz_si256(m, m) != 0; }

template<s32 i>
ALWAYS_INLINE GSVector8i insert8(s32 a) const
{
return GSVector8i(_mm256_insert_epi8(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s32 extract8() const
{
return _mm256_extract_epi8(m, i);
}

template<s32 i>
ALWAYS_INLINE GSVector8i insert16(s32 a) const
{
return GSVector8i(_mm256_insert_epi16(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s32 extract16() const
{
return _mm256_extract_epi16(m, i);
}

template<s32 i>
ALWAYS_INLINE GSVector8i insert32(s32 a) const
{
return GSVector8i(_mm256_insert_epi32(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s32 extract32() const
{
return _mm256_extract_epi32(m, i);
}

template<s32 i>
ALWAYS_INLINE GSVector8i insert64(s64 a) const
{
return GSVector8i(_mm256_insert_epi64(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s64 extract64() const
{
return _mm256_extract_epi64(m, i);
}

ALWAYS_INLINE static GSVector8i zext32(s32 v) { return GSVector8i(_mm256_zextsi128_si256(GSVector4i::zext32(v))); }

ALWAYS_INLINE static GSVector8i loadnt(const void* p)
{
// Should be const, but isn't...
return GSVector8i(_mm256_stream_load_si256(const_cast<__m256i*>(static_cast<const __m256i*>(p))));
}

template<bool aligned>
ALWAYS_INLINE static GSVector8i load(const void* p)
{
return GSVector8i(aligned ? _mm256_load_si256(static_cast<const __m256i*>(p)) :
_mm256_loadu_si256(static_cast<const __m256i*>(p)));
}

ALWAYS_INLINE static void storent(void* p, const GSVector8i& v)
{
_mm256_stream_si256(static_cast<__m256i*>(p), v.m);
}

template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector8i& v)
{
if constexpr (aligned)
_mm256_store_si256(static_cast<__m256i*>(p), v.m);
else
_mm256_storeu_si256(static_cast<__m256i*>(p), v.m);
}

template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector8i& v)
{
if constexpr (aligned)
_mm_store_si128(static_cast<__m128i*>(p), _mm256_castsi256_si128(v.m));
else
_mm_storeu_si128(static_cast<__m128i*>(p), _mm256_castsi256_si128(v.m));
}

ALWAYS_INLINE GSVector8i& operator&=(const GSVector8i& v)
{
m = _mm256_and_si256(m, v);
return *this;
}
ALWAYS_INLINE GSVector8i& operator|=(const GSVector8i& v)
{
m = _mm256_or_si256(m, v);
return *this;
}
ALWAYS_INLINE GSVector8i& operator^=(const GSVector8i& v)
{
m = _mm256_xor_si256(m, v);
return *this;
}

ALWAYS_INLINE friend GSVector8i operator&(const GSVector8i& v1, const GSVector8i& v2)
{
return GSVector8i(_mm256_and_si256(v1, v2));
}

ALWAYS_INLINE friend GSVector8i operator|(const GSVector8i& v1, const GSVector8i& v2)
{
return GSVector8i(_mm256_or_si256(v1, v2));
}

ALWAYS_INLINE friend GSVector8i operator^(const GSVector8i& v1, const GSVector8i& v2)
{
return GSVector8i(_mm256_xor_si256(v1, v2));
}

ALWAYS_INLINE friend GSVector8i operator&(const GSVector8i& v, s32 i) { return v & GSVector8i(i); }
ALWAYS_INLINE friend GSVector8i operator|(const GSVector8i& v, s32 i) { return v | GSVector8i(i); }
ALWAYS_INLINE friend GSVector8i operator^(const GSVector8i& v, s32 i) { return v ^ GSVector8i(i); }
ALWAYS_INLINE friend GSVector8i operator~(const GSVector8i& v) { return v ^ v.eq32(v); }

ALWAYS_INLINE static GSVector8i zero() { return GSVector8i(_mm256_setzero_si256()); }

ALWAYS_INLINE static GSVector8i broadcast128(const GSVector4i& v)
{
return GSVector8i(_mm256_broadcastsi128_si256(v.m));
}

template<bool aligned>
ALWAYS_INLINE static GSVector8i broadcast128(const void* v)
{
return broadcast128(GSVector4i::load<aligned>(v));
}

ALWAYS_INLINE GSVector4i low128() const { return GSVector4i(_mm256_castsi256_si128(m)); }
ALWAYS_INLINE GSVector4i high128() const { return GSVector4i(_mm256_extracti128_si256(m, 1)); }
};

#endif