// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
//
// Lightweight wrapper over native SIMD types for cross-platform vector code.
// Rewritten and NEON+No-SIMD variants added for DuckStation.
//

#pragma once

#include "common/intrin.h"
#include "common/types.h"

#include <algorithm>

#ifndef CPU_ARCH_SSE41
#include <cmath>
#endif

#ifdef CPU_ARCH_SSE41
#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
#endif

#ifdef CPU_ARCH_AVX2
#define GSVECTOR_HAS_SRLV 1
#define GSVECTOR_HAS_256 1
#endif

class GSVector2;
class GSVector2i;
class GSVector4;
class GSVector4i;

#ifndef CPU_ARCH_SSE41

// Thank LLVM for these.
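// These SSE2 fallbacks emulate the missing SSE4.1 min/max instructions: a per-lane compare yields an
// all-ones/all-zeros mask, and andnot/and/or then select the smaller (or larger) element from each lane.
// The unsigned 32-bit variants first XOR both operands with 0x80000000 so that a signed compare produces
// the correct unsigned ordering.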
ALWAYS_INLINE static __m128i sse2_min_s8(const __m128i m, const __m128i v)
{
  const __m128i temp = _mm_cmpgt_epi8(m, v);
  return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
}

ALWAYS_INLINE static __m128i sse2_max_s8(const __m128i m, const __m128i v)
{
  const __m128i temp = _mm_cmpgt_epi8(v, m);
  return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
}

ALWAYS_INLINE static __m128i sse2_min_s32(const __m128i m, const __m128i v)
{
  const __m128i temp = _mm_cmpgt_epi32(m, v);
  return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
}

ALWAYS_INLINE static __m128i sse2_max_s32(const __m128i m, const __m128i v)
{
  const __m128i temp = _mm_cmpgt_epi32(v, m);
  return _mm_or_si128(_mm_andnot_si128(temp, m), _mm_and_si128(v, temp));
}

ALWAYS_INLINE static __m128i sse2_min_u16(const __m128i m, const __m128i v)
{
  return _mm_sub_epi16(m, _mm_subs_epu16(m, v));
}

ALWAYS_INLINE static __m128i sse2_max_u16(const __m128i m, const __m128i v)
{
  return _mm_add_epi16(m, _mm_subs_epu16(v, m));
}

ALWAYS_INLINE static __m128i sse2_min_u32(const __m128i m, const __m128i v)
{
  const __m128i msb = _mm_set1_epi32(0x80000000);
  const __m128i temp = _mm_cmpgt_epi32(_mm_xor_si128(msb, v), _mm_xor_si128(m, msb));
  return _mm_or_si128(_mm_andnot_si128(temp, v), _mm_and_si128(m, temp));
}

ALWAYS_INLINE static __m128i sse2_max_u32(const __m128i m, const __m128i v)
{
  const __m128i msb = _mm_set1_epi32(0x80000000);
  const __m128i temp = _mm_cmpgt_epi32(_mm_xor_si128(msb, m), _mm_xor_si128(v, msb));
  return _mm_or_si128(_mm_andnot_si128(temp, v), _mm_and_si128(m, temp));
}

#endif

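// Two-lane 32-bit integer vector, stored in the low 64 bits of an __m128i; the element-wise
// constructors zero the upper half of the register.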
class alignas(16) GSVector2i
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y, 0, 0} {}

  constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3, 0, 0, 0, 0} {}

  constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
  {
  }

public:
  union
  {
    struct
    {
      s32 x, y;
    };
    struct
    {
      s32 r, g;
    };
    float F32[4];
    s8 S8[16];
    s16 S16[8];
    s32 S32[4];
    s64 S64[2];
    u8 U8[16];
    u16 U16[8];
    u32 U32[4];
    u64 U64[2];
    __m128i m;
  };

  GSVector2i() = default;

  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
  {
    return GSVector2i(cxpr_init, s0, s1, s2, s3);
  }

  ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
  {
    return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
  }

  ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); }
  ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); }
  ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
  {
  }
  ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
  ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {}

  ALWAYS_INLINE GSVector2i& operator=(s32 i)
  {
    m = _mm_set1_epi32(i);
    return *this;
  }

  ALWAYS_INLINE GSVector2i& operator=(__m128i m_)
  {
    m = m_;
    return *this;
  }

  ALWAYS_INLINE operator __m128i() const { return m; }

  ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_s8(min).min_s8(max);
  }
  ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_s16(min).min_s16(max);
  }
  ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_s32(min).min_s32(max);
  }

  ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u32(min).min_u32(max);
  }

#ifdef CPU_ARCH_SSE41

  ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const { return GSVector2i(_mm_min_epi8(m, v)); }
  ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const { return GSVector2i(_mm_max_epi8(m, v)); }
  ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); }
  ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); }
  ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(_mm_min_epi32(m, v)); }
  ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(_mm_max_epi32(m, v)); }

  ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); }
  ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); }
  ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(_mm_min_epu16(m, v)); }
  ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(_mm_max_epu16(m, v)); }
  ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); }
  ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); }

  ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); }

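// Horizontal min/max reductions: fold the two 32-bit lanes together, then (for the narrower element
// widths) keep folding within the surviving lane by shifting down 16 and 8 bits, so the result ends
// up in the lowest element.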
#define VECTOR2i_REDUCE_8(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    v = func(v, _mm_srli_epi16(v, 8)); \
    return static_cast<ret>(_mm_extract_epi8(v, 0)); \
  }

#define VECTOR2i_REDUCE_16(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    return static_cast<ret>(_mm_extract_epi16(v, 0)); \
  }

#define VECTOR2i_REDUCE_32(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
    return static_cast<ret>(_mm_extract_epi32(v, 0)); \
  }

  VECTOR2i_REDUCE_8(minv_s8, _mm_min_epi8, s8);
  VECTOR2i_REDUCE_8(maxv_s8, _mm_max_epi8, s8);
  VECTOR2i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
  VECTOR2i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
  VECTOR2i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
  VECTOR2i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
  VECTOR2i_REDUCE_16(minv_u16, _mm_min_epu16, u16);
  VECTOR2i_REDUCE_16(maxv_u16, _mm_max_epu16, u16);
  VECTOR2i_REDUCE_32(minv_s32, _mm_min_epi32, s32);
  VECTOR2i_REDUCE_32(maxv_s32, _mm_max_epi32, s32);
  VECTOR2i_REDUCE_32(minv_u32, _mm_min_epu32, u32);
  VECTOR2i_REDUCE_32(maxv_u32, _mm_max_epu32, u32);

#undef VECTOR2i_REDUCE_32
#undef VECTOR2i_REDUCE_16
#undef VECTOR2i_REDUCE_8

#else

  ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const { return GSVector2i(sse2_min_s8(m, v)); }
  ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const { return GSVector2i(sse2_max_s8(m, v)); }
  ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); }
  ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); }
  ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(sse2_min_s32(m, v)); }
  ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(sse2_max_s32(m, v)); }

  ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); }
  ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); }
  ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(sse2_min_u16(m, v)); }
  ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(sse2_max_u16(m, v)); }
  ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(sse2_min_u32(m, v)); }
  ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(sse2_max_u32(m, v)); }

  s32 addv_s32() const { return (x + y); }

#define VECTOR2i_REDUCE_8(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    v = func(v, _mm_srli_epi16(v, 8)); \
    return static_cast<ret>(_mm_cvtsi128_si32(v)); \
  }

#define VECTOR2i_REDUCE_16(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    return static_cast<ret>(_mm_cvtsi128_si32(v)); \
  }

#define VECTOR2i_REDUCE_32(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); \
    return static_cast<ret>(_mm_cvtsi128_si32(v)); \
  }

  VECTOR2i_REDUCE_8(minv_s8, sse2_min_s8, s8);
  VECTOR2i_REDUCE_8(maxv_s8, sse2_max_s8, s8);
  VECTOR2i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
  VECTOR2i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
  VECTOR2i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
  VECTOR2i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
  VECTOR2i_REDUCE_16(minv_u16, sse2_min_u16, u16);
  VECTOR2i_REDUCE_16(maxv_u16, sse2_max_u16, u16);
  VECTOR2i_REDUCE_32(minv_s32, sse2_min_s32, s32);
  VECTOR2i_REDUCE_32(maxv_s32, sse2_max_s32, s32);
  VECTOR2i_REDUCE_32(minv_u32, sse2_min_u32, u32);
  VECTOR2i_REDUCE_32(maxv_u32, sse2_max_u32, u32);

#undef VECTOR2i_REDUCE_32
#undef VECTOR2i_REDUCE_16
#undef VECTOR2i_REDUCE_8

#endif

  ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }

  ALWAYS_INLINE GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const
  {
    return GSVector2i(_mm_blendv_epi8(m, v, mask));
  }

  template<s32 mask>
  ALWAYS_INLINE GSVector2i blend16(const GSVector2i& v) const
  {
    return GSVector2i(_mm_blend_epi16(m, v, mask));
  }

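  // Without AVX2, the two-bit dword mask is widened so that each selected 32-bit lane maps onto the two
  // corresponding 16-bit lanes of blend16: (mask & 1) * 3 sets bits 0-1, ((mask & 2) * 3) << 1 sets bits 2-3.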
  template<s32 mask>
  ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
  {
#if defined(CPU_ARCH_AVX2)
    return GSVector2i(_mm_blend_epi32(m, v.m, mask));
#else
    constexpr s32 bit1 = ((mask & 2) * 3) << 1;
    constexpr s32 bit0 = (mask & 1) * 3;
    return blend16<bit1 | bit0>(v);
#endif
  }

  ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
  {
    return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
  }

#ifdef CPU_ARCH_SSE41
  ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); }
#else
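  // Scalar emulation of pshufb: a set high bit in the mask byte zeroes the destination byte, otherwise
  // the low four bits of the mask select the source byte.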
  GSVector2i shuffle8(const GSVector2i& mask) const
  {
    GSVector2i ret;
    for (size_t i = 0; i < 8; i++)
      ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]);
    return ret;
  }
#endif

  ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); }
  ALWAYS_INLINE GSVector2i pu16() const { return GSVector2i(_mm_packus_epi16(m, m)); }
  ALWAYS_INLINE GSVector2i ps32() const { return GSVector2i(_mm_packs_epi32(m, m)); }
#ifdef CPU_ARCH_SSE41
  ALWAYS_INLINE GSVector2i pu32() const { return GSVector2i(_mm_packus_epi32(m, m)); }
#endif

  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi8(m, v)); }
  ALWAYS_INLINE GSVector2i uph8(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi8(m, v)); }
  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi16(m, v)); }
  ALWAYS_INLINE GSVector2i uph16(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi16(m, v)); }
  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi32(m, v)); }
  ALWAYS_INLINE GSVector2i uph32(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi32(m, v)); }

  ALWAYS_INLINE GSVector2i upl8() const { return GSVector2i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector2i uph8() const { return GSVector2i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }

  ALWAYS_INLINE GSVector2i upl16() const { return GSVector2i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector2i uph16() const { return GSVector2i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }

  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector2i uph32() const { return GSVector2i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }

#ifdef CPU_ARCH_SSE41
  ALWAYS_INLINE GSVector2i u8to16() const { return GSVector2i(_mm_cvtepu8_epi16(m)); }
  ALWAYS_INLINE GSVector2i u8to32() const { return GSVector2i(_mm_cvtepu8_epi32(m)); }
  ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(_mm_cvtepi16_epi32(m)); }
  ALWAYS_INLINE GSVector2i u16to32() const { return GSVector2i(_mm_cvtepu16_epi32(m)); }
#else
  // These are a pain, adding only as needed...
  ALWAYS_INLINE GSVector2i u8to16() const { return upl8(); }
  ALWAYS_INLINE GSVector2i u8to32() const
  {
    return GSVector2i(_mm_unpacklo_epi16(_mm_unpacklo_epi8(m, _mm_setzero_si128()), _mm_setzero_si128()));
  }

  ALWAYS_INLINE GSVector2i s16to32() const { return upl16().sll32<16>().sra32<16>(); }
  ALWAYS_INLINE GSVector2i u16to32() const { return upl16(); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector2i srl() const
  {
    return GSVector2i(_mm_srli_si128(m, i));
  }

  template<s32 i>
  ALWAYS_INLINE GSVector2i sll() const
  {
    return GSVector2i(_mm_slli_si128(m, i));
  }

  template<s32 i>
  ALWAYS_INLINE GSVector2i sll16() const
  {
    return GSVector2i(_mm_slli_epi16(m, i));
  }

  ALWAYS_INLINE GSVector2i sll16(s32 i) const { return GSVector2i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector2i srl16() const
  {
    return GSVector2i(_mm_srli_epi16(m, i));
  }

  ALWAYS_INLINE GSVector2i srl16(s32 i) const { return GSVector2i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector2i sra16() const
  {
    return GSVector2i(_mm_srai_epi16(m, i));
  }

  ALWAYS_INLINE GSVector2i sra16(s32 i) const { return GSVector2i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector2i sll32() const
  {
    return GSVector2i(_mm_slli_epi32(m, i));
  }

  ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi32(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector2i srl32() const
  {
    return GSVector2i(_mm_srli_epi32(m, i));
  }

  ALWAYS_INLINE GSVector2i srl32(s32 i) const { return GSVector2i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi32(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector2i sra32() const
  {
    return GSVector2i(_mm_srai_epi32(m, i));
  }

  ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi32(m, v.m)); }
#endif

  ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); }
  ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); }

  ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); }
  ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); }

  ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); }

  ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); }

  ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); }

  ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
  ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
  ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }

  ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(_mm_andnot_si128(v.m, m)); }

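  // Only the low eight bytes of the register are meaningful for a GSVector2i, so mask() keeps just the
  // low eight movemask bits and alltrue() tests against 0xff.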
  ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); }

  ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); }
  ALWAYS_INLINE bool anytrue() const { return (mask() != 0x00); }
  ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); }

  template<s32 i>
  ALWAYS_INLINE GSVector2i insert8(s32 a) const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector2i(_mm_insert_epi8(m, a, i));
#else
    GSVector2i ret(*this);
    ret.S8[i] = static_cast<s8>(a);
    return ret;
#endif
  }

  template<s32 i>
  ALWAYS_INLINE s32 extract8() const
  {
#ifdef CPU_ARCH_SSE41
    return _mm_extract_epi8(m, i);
#else
    return S8[i];
#endif
  }

  template<s32 i>
  ALWAYS_INLINE GSVector2i insert16(s32 a) const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector2i(_mm_insert_epi16(m, a, i));
#else
    GSVector2i ret(*this);
    ret.S16[i] = static_cast<s16>(a);
    return ret;
#endif
  }

  template<s32 i>
  ALWAYS_INLINE s32 extract16() const
  {
#ifdef CPU_ARCH_SSE41
    return _mm_extract_epi16(m, i);
#else
    return S16[i];
#endif
  }

  template<s32 i>
  ALWAYS_INLINE GSVector2i insert32(s32 a) const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector2i(_mm_insert_epi32(m, a, i));
#else
    GSVector2i ret(*this);
    ret.S32[i] = a;
    return ret;
#endif
  }

  template<s32 i>
  ALWAYS_INLINE s32 extract32() const
  {
#ifdef CPU_ARCH_SSE41
    return _mm_extract_epi32(m, i);
#else
    if constexpr (i == 0)
      return _mm_cvtsi128_si32(m);
    else
      return S32[i];
#endif
  }

  ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
  ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); }

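  // Loads and stores move only the low 64 bits; the `aligned` template parameter is ignored here, both
  // paths perform the same 64-bit access.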
  template<bool aligned>
  ALWAYS_INLINE static GSVector2i load(const void* p)
  {
    return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
  }

  template<bool aligned>
  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
  {
    _mm_storel_epi64(static_cast<__m128i*>(p), v.m);
  }

  ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }

  ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
  {
    m = _mm_and_si128(m, v);
    return *this;
  }

  ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v)
  {
    m = _mm_or_si128(m, v);
    return *this;
  }

  ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v)
  {
    m = _mm_xor_si128(m, v);
    return *this;
  }

  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
  {
    return GSVector2i(_mm_and_si128(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
  {
    return GSVector2i(_mm_or_si128(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
  {
    return GSVector2i(_mm_xor_si128(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); }

  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); }

  ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); }

  ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); }

  ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); }
  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

  ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }
  ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); }
  ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 1))); }
  ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 1, 1))); }
};

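// Two-lane single-precision float vector, the floating-point counterpart of GSVector2i above.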
class alignas(16) GSVector2
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
  constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}

public:
  union
  {
    struct
    {
      float x, y;
    };
    struct
    {
      float r, g;
    };
    float F32[4];
    double F64[2];
    s8 I8[16];
    s16 I16[8];
    s32 I32[4];
    s64 I64[2];
    u8 U8[16];
    u16 U16[8];
    u32 U32[4];
    u64 U64[2];
    __m128 m;
  };

  GSVector2() = default;

  constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
  constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
  constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
  constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }

  ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); }
  ALWAYS_INLINE GSVector2(int x, int y)
  {
    GSVector2i v_(x, y);
    m = _mm_cvtepi32_ps(v_.m);
  }

  ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {}
  ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {}
  ALWAYS_INLINE explicit GSVector2(float f) { *this = f; }
  ALWAYS_INLINE explicit GSVector2(int i)
  {
#ifdef CPU_ARCH_AVX2
    m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
#else
    *this = GSVector2(GSVector2i(i));
#endif
  }

  ALWAYS_INLINE explicit GSVector2(const GSVector2i& v) : m(_mm_cvtepi32_ps(v)) {}

  ALWAYS_INLINE GSVector2& operator=(float f)
  {
    m = _mm_set1_ps(f);
    return *this;
  }

  ALWAYS_INLINE GSVector2& operator=(__m128 m_)
  {
    m = m_;
    return *this;
  }

  ALWAYS_INLINE operator __m128() const { return m; }

  ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); }
  ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); }
  ALWAYS_INLINE GSVector2 floor() const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
#else
    return GSVector2(std::floor(x), std::floor(y));
#endif
  }

  ALWAYS_INLINE GSVector2 ceil() const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
#else
    return GSVector2(std::ceil(x), std::ceil(y));
#endif
  }

  ALWAYS_INLINE GSVector2 sat(const GSVector2& min, const GSVector2& max) const
  {
    return GSVector2(_mm_min_ps(_mm_max_ps(m, min), max));
  }

  ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }

  ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }

  ALWAYS_INLINE GSVector2 min(const GSVector2& v) const { return GSVector2(_mm_min_ps(m, v)); }

  ALWAYS_INLINE GSVector2 max(const GSVector2& v) const { return GSVector2(_mm_max_ps(m, v)); }

  template<int mask>
  ALWAYS_INLINE GSVector2 blend32(const GSVector2& v) const
  {
    return GSVector2(_mm_blend_ps(m, v, mask));
  }

  ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const
  {
    return GSVector2(_mm_blendv_ps(m, v, mask));
  }

  ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const { return GSVector2(_mm_andnot_ps(v.m, m)); }

  ALWAYS_INLINE int mask() const { return (_mm_movemask_ps(m) & 0x3); }

  ALWAYS_INLINE bool alltrue() const { return (mask() == 0x3); }
  ALWAYS_INLINE bool anytrue() const { return (mask() != 0x0); }
  ALWAYS_INLINE bool allfalse() const { return (mask() == 0x0); }

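  // NaN never compares equal to itself, so the self-comparison mask is false only in the NaN lanes;
  // the blend keeps *this where the mask is true and takes v's lanes where it is false.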
  ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }

  template<int src, int dst>
  ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
  {
#ifdef CPU_ARCH_SSE41
    if constexpr (src == dst)
      return GSVector2(_mm_blend_ps(m, v.m, 1 << src));
    else
      return GSVector2(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
#else
    GSVector2 ret(*this);
    ret.F32[dst] = v.F32[src];
    return ret;
#endif
  }

  template<int i>
  ALWAYS_INLINE int extract32() const
  {
#ifdef CPU_ARCH_SSE41
    return _mm_extract_ps(m, i);
#else
    if constexpr (i == 0)
      return _mm_cvtsi128_si32(_mm_castps_si128(m));
    else
      return F32[i];
#endif
  }

#ifdef CPU_ARCH_SSE41
  ALWAYS_INLINE float dot(const GSVector2& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0x31)); }
#else
  float dot(const GSVector2& v) const
  {
    const __m128 tmp = _mm_mul_ps(m, v.m);
    float ret;
    _mm_store_ss(&ret, _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1))));
    return ret;
  }
#endif

  ALWAYS_INLINE static GSVector2 zero() { return GSVector2(_mm_setzero_ps()); }

  ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }

  template<bool aligned>
  ALWAYS_INLINE static GSVector2 load(const void* p)
  {
    return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
  }

  template<bool aligned>
  ALWAYS_INLINE static void store(void* p, const GSVector2& v)
  {
    _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
  }

  ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

  ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_)
  {
    m = _mm_add_ps(m, v_);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_)
  {
    m = _mm_sub_ps(m, v_);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_)
  {
    m = _mm_mul_ps(m, v_);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_)
  {
    m = _mm_div_ps(m, v_);
    return *this;
  }

  ALWAYS_INLINE GSVector2& operator+=(float f)
  {
    *this += GSVector2(f);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator-=(float f)
  {
    *this -= GSVector2(f);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator*=(float f)
  {
    *this *= GSVector2(f);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator/=(float f)
  {
    *this /= GSVector2(f);
    return *this;
  }

  ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_)
  {
    m = _mm_and_ps(m, v_);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_)
  {
    m = _mm_or_ps(m, v_);
    return *this;
  }
  ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_)
  {
    m = _mm_xor_ps(m, v_);
    return *this;
  }

  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_add_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_sub_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_mul_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_div_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_and_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_or_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_xor_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpeq_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpneq_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpgt_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmplt_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpge_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmple_ps(v1, v2));
  }

  ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);

  ALWAYS_INLINE GSVector2 xy() const { return *this; }
  ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); }
  ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); }
  ALWAYS_INLINE GSVector2 yy() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 1, 1))); }
};

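// Four-lane 32-bit integer vector. The left/top/right/bottom aliases below also let it double as an
// integer rectangle for the r*() helpers.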
class alignas(16) GSVector4i
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}

  constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    : S16{s0, s1, s2, s3, s4, s5, s6, s7}
  {
  }

  constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
                       s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
  {
  }

public:
  union
  {
    struct
    {
      s32 x, y, z, w;
    };
    struct
    {
      s32 r, g, b, a;
    };
    struct
    {
      s32 left, top, right, bottom;
    };
    float F32[4];
    s8 S8[16];
    s16 S16[8];
    s32 S32[4];
    s64 S64[2];
    u8 U8[16];
    u16 U16[8];
    u32 U32[4];
    u64 U64[2];
    __m128i m;
  };

  GSVector4i() = default;

  ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
  {
    return GSVector4i(cxpr_init, x, y, z, w);
  }
  ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }

  ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
  ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
  {
    return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
  }

  ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
                                                  s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
  {
    return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
  }

  ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); }
  ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
  {
    m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
  }

  ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
                                     s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
  {
  }

  ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = _mm_unpacklo_epi64(v.m, _mm_setzero_si128()); }

  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v)
    : m(_mm_unpacklo_epi64(_mm_cvttps_epi32(v), _mm_setzero_si128()))
  {
  }

  ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }

  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);

  ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {}

  ALWAYS_INLINE GSVector4i& operator=(s32 i)
  {
    m = _mm_set1_epi32(i);
    return *this;
  }
  ALWAYS_INLINE GSVector4i& operator=(__m128i m_)
  {
    m = m_;
    return *this;
  }

  ALWAYS_INLINE operator __m128i() const { return m; }

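  // Rectangle helpers: the lanes are interpreted as (left, top, right, bottom); a rectangle is valid /
  // non-empty when left < right and top < bottom.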
  ALWAYS_INLINE s32 width() const { return right - left; }
  ALWAYS_INLINE s32 height() const { return bottom - top; }

  ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
  ALWAYS_INLINE bool rempty() const { return (lt32(zwzw()).mask() != 0x00ff); }
  ALWAYS_INLINE bool rvalid() const { return ((ge32(zwzw()).mask() & 0xff) == 0); }

  ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_s32(v).blend32<0xc>(max_s32(v)); }

  ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_s32(v); }
  ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
  ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }

  ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }

  ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_s8(min).min_s8(max);
  }
  ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
  {
    return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_s16(min).min_s16(max);
  }
  ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
  {
    return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_s32(min).min_s32(max);
  }
  ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
  {
    return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
  }

  ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
  {
    return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
  {
    return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u32(min).min_u32(max);
  }
  ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
  {
    return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
  }

  ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); }

#ifdef CPU_ARCH_SSE41

  ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); }
  ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); }
  ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); }
  ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); }

  ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); }
  ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); }
  ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); }
  ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); }
  ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); }
  ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); }

  ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); }

  ALWAYS_INLINE s32 addv_s32() const
  {
    const __m128i pairs = _mm_hadd_epi32(m, m);
    return _mm_cvtsi128_si32(_mm_hadd_epi32(pairs, pairs));
  }

#define VECTOR4i_REDUCE_8(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
    v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    v = func(v, _mm_srli_epi16(v, 8)); \
    return static_cast<ret>(_mm_extract_epi8(v, 0)); \
  }

#define VECTOR4i_REDUCE_16(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
    v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    return static_cast<ret>(_mm_extract_epi16(v, 0)); \
  }

#define VECTOR4i_REDUCE_32(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
    v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
    return static_cast<ret>(_mm_extract_epi32(v, 0)); \
  }

  VECTOR4i_REDUCE_8(minv_s8, _mm_min_epi8, s8);
  VECTOR4i_REDUCE_8(maxv_s8, _mm_max_epi8, s8);
  VECTOR4i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
  VECTOR4i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
  VECTOR4i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
  VECTOR4i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
  VECTOR4i_REDUCE_16(minv_u16, _mm_min_epu16, u16);
  VECTOR4i_REDUCE_16(maxv_u16, _mm_max_epu16, u16);
  VECTOR4i_REDUCE_32(minv_s32, _mm_min_epi32, s32);
  VECTOR4i_REDUCE_32(maxv_s32, _mm_max_epi32, s32);
  VECTOR4i_REDUCE_32(minv_u32, _mm_min_epu32, u32);
  VECTOR4i_REDUCE_32(maxv_u32, _mm_max_epu32, u32);

#undef VECTOR4i_REDUCE_32
#undef VECTOR4i_REDUCE_16
#undef VECTOR4i_REDUCE_8

#else

  ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const { return GSVector4i(sse2_min_s8(m, v)); }
  ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const { return GSVector4i(sse2_max_s8(m, v)); }
  ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(sse2_min_s32(m, v)); }
  ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(sse2_max_s32(m, v)); }

  ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); }
  ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); }
  ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(sse2_min_u16(m, v)); }
  ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(sse2_max_u16(m, v)); }
  ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(sse2_min_u32(m, v)); }
  ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(sse2_max_u32(m, v)); }

  GSVector4i addp_s32() const
  {
    return GSVector4i(
      _mm_shuffle_epi32(_mm_add_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 3, 1, 1))), _MM_SHUFFLE(3, 2, 2, 0)));
  }

  ALWAYS_INLINE s32 addv_s32() const
  {
    const __m128i pair1 = _mm_add_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 3, 1, 1))); // 0+1,1+1,2+3,3+3
    const __m128i pair2 = _mm_add_epi32(pair1, _mm_shuffle_epi32(pair1, _MM_SHUFFLE(3, 2, 1, 2)));
    return _mm_cvtsi128_si32(pair2);
  }

#define VECTOR4i_REDUCE_8(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
    v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    v = func(v, _mm_srli_epi16(v, 8)); \
    return static_cast<ret>(_mm_cvtsi128_si32(v)); \
  }

#define VECTOR4i_REDUCE_16(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
    v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
    v = func(v, _mm_srli_epi32(v, 16)); \
    return static_cast<ret>(_mm_cvtsi128_si32(v)); \
  }

#define VECTOR4i_REDUCE_32(name, func, ret) \
  ALWAYS_INLINE ret name() const \
  { \
    __m128i v = func(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); \
    v = func(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1))); \
    return static_cast<ret>(_mm_cvtsi128_si32(v)); \
  }

  VECTOR4i_REDUCE_8(minv_s8, sse2_min_s8, s8);
  VECTOR4i_REDUCE_8(maxv_s8, sse2_max_s8, s8);
  VECTOR4i_REDUCE_8(minv_u8, _mm_min_epu8, u8);
  VECTOR4i_REDUCE_8(maxv_u8, _mm_max_epu8, u8);
  VECTOR4i_REDUCE_16(minv_s16, _mm_min_epi16, s16);
  VECTOR4i_REDUCE_16(maxv_s16, _mm_max_epi16, s16);
  VECTOR4i_REDUCE_16(minv_u16, sse2_min_u16, u16);
  VECTOR4i_REDUCE_16(maxv_u16, sse2_max_u16, u16);
  VECTOR4i_REDUCE_32(minv_s32, sse2_min_s32, s32);
  VECTOR4i_REDUCE_32(maxv_s32, sse2_max_s32, s32);
  VECTOR4i_REDUCE_32(minv_u32, sse2_min_u32, u32);
  VECTOR4i_REDUCE_32(maxv_u32, sse2_max_u32, u32);

#undef VECTOR4i_REDUCE_32
#undef VECTOR4i_REDUCE_16
#undef VECTOR4i_REDUCE_8

#endif

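  // clamp8() clamps the signed 16-bit lanes to [0, 255]: pack with unsigned saturation, then unpack
  // back to 16-bit.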
  ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }

  ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector4i(_mm_blendv_epi8(m, v, mask));
#else
    // NOTE: Assumes the entire lane is set with 1s or 0s.
    return (v & mask) | andnot(mask);
#endif
  }

  template<s32 mask>
  ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const
  {
#ifdef CPU_ARCH_SSE41
    return GSVector4i(_mm_blend_epi16(m, v, mask));
#else
1330
GSVector4i::cxpr16(((mask) & (1 << 0)) ? -1 : 0x0, ((mask) & (1 << 1)) ? -1 : 0x0, ((mask) & (1 << 2)) ? -1 : 0x0,
1331
((mask) & (1 << 3)) ? -1 : 0x0, ((mask) & (1 << 4)) ? -1 : 0x0, ((mask) & (1 << 5)) ? -1 : 0x0,
1332
((mask) & (1 << 6)) ? -1 : 0x0, ((mask) & (1 << 7)) ? -1 : 0x0);
1333
return (v & vmask) | andnot(vmask);
1334
#endif
1335
}
1336
1337
template<s32 mask>
1338
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1339
{
1340
#ifdef CPU_ARCH_AVX2
1341
return GSVector4i(_mm_blend_epi32(m, v.m, mask));
1342
#else
1343
#ifndef CPU_ARCH_SSE41
1344
// we can do this with a movsd if 0,1 are from a, and 2,3 from b
1345
if constexpr ((mask & 15) == 12)
1346
return GSVector4i(_mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(v.m), _mm_castsi128_pd(m))));
1347
#endif
1348
1349
constexpr s32 bit3 = ((mask & 8) * 3) << 3;
1350
constexpr s32 bit2 = ((mask & 4) * 3) << 2;
1351
constexpr s32 bit1 = ((mask & 2) * 3) << 1;
1352
constexpr s32 bit0 = (mask & 1) * 3;
1353
return blend16<bit3 | bit2 | bit1 | bit0>(v);
1354
#endif
1355
}
1356
1357
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1358
{
1359
return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
1360
}
1361
1362
#ifdef CPU_ARCH_SSE41
1363
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); }
1364
#else
1365
GSVector4i shuffle8(const GSVector4i& mask) const
1366
{
1367
GSVector4i ret;
1368
for (size_t i = 0; i < 16; i++)
1369
ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]);
1370
return ret;
1371
}
1372
#endif
1373
1374
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); }
1375
ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); }
1376
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); }
1377
ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); }
1378
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); }
1379
ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); }
1380
#ifdef CPU_ARCH_SSE41
1381
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); }
1382
ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); }
1383
#else
1384
// sign extend so it matches
1385
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1386
{
1387
return GSVector4i(_mm_packs_epi32(sll32<16>().sra32<16>(), v.sll32<16>().sra32<16>()));
1388
}
1389
ALWAYS_INLINE GSVector4i pu32() const
1390
{
1391
const GSVector4i tmp = sll32<16>().sra32<16>();
1392
return GSVector4i(_mm_packs_epi32(tmp.m, tmp.m));
1393
}
1394
#endif
1395
1396
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); }
1397
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); }
1398
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); }
1399
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); }
1400
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); }
1401
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); }
1402
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); }
1403
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); }
1404
1405
ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
1406
ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }
1407
1408
ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
1409
ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }
1410
1411
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }
1412
1413
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }
1414
ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); }
1415
ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); }
1416
1417
ALWAYS_INLINE GSVector4i s8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); }
1418
ALWAYS_INLINE GSVector4i s8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); }
1419
ALWAYS_INLINE GSVector4i s8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); }
1420
1421
#ifdef CPU_ARCH_SSE41
1422
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); }
1423
ALWAYS_INLINE GSVector4i s16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); }
1424
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); }
1425
ALWAYS_INLINE GSVector4i u8to16() const { return GSVector4i(_mm_cvtepu8_epi16(m)); }
1426
ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); }
1427
  ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu8_epi64(m)); }
  ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); }
  ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); }
  ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); }
#else
  // These are a pain, adding only as needed...
  ALWAYS_INLINE GSVector4i u8to32() const
  {
    return GSVector4i(_mm_unpacklo_epi16(_mm_unpacklo_epi8(m, _mm_setzero_si128()), _mm_setzero_si128()));
  }

  ALWAYS_INLINE GSVector4i u16to32() const { return upl16(); }
  ALWAYS_INLINE GSVector4i s16to32() const { return upl16().sll32<16>().sra32<16>(); }
  ALWAYS_INLINE GSVector4i u8to16() const { return upl8(); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i srl() const
  {
    return GSVector4i(_mm_srli_si128(m, i));
  }

  template<s32 i>
  ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
  {
    return GSVector4i(_mm_alignr_epi8(v.m, m, i));
  }

  template<s32 i>
  ALWAYS_INLINE GSVector4i sll() const
  {
    return GSVector4i(_mm_slli_si128(m, i));
  }

  template<s32 i>
  ALWAYS_INLINE GSVector4i sll16() const
  {
    return GSVector4i(_mm_slli_epi16(m, i));
  }

  ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i srl16() const
  {
    return GSVector4i(_mm_srli_epi16(m, i));
  }

  ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i sra16() const
  {
    return GSVector4i(_mm_srai_epi16(m, i));
  }

  ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i sll32() const
  {
    return GSVector4i(_mm_slli_epi32(m, i));
  }

  ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i srl32() const
  {
    return GSVector4i(_mm_srli_epi32(m, i));
  }

  ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i sra32() const
  {
    return GSVector4i(_mm_srai_epi32(m, i));
  }

  ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); }
#endif

  template<s64 i>
  ALWAYS_INLINE GSVector4i sll64() const
  {
    return GSVector4i(_mm_slli_epi64(m, i));
  }

  ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); }
#endif

  template<s64 i>
  ALWAYS_INLINE GSVector4i srl64() const
  {
    return GSVector4i(_mm_srli_epi64(m, i));
  }

  ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); }
#endif

  ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); }
  ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); }

  ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); }
1569
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); }
1570
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); }
1571
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); }
1572
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); }
1573
1574
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); }
1575
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); }
1576
1577
#ifdef CPU_ARCH_SSE41
1578
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); }
1579
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); }
1580
#else
1581
// We can abuse the fact that signed and unsigned multiplies are the same.
1582
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const
1583
{
1584
return GSVector4i(_mm_castps_si128(
1585
_mm_shuffle_ps(_mm_castsi128_ps(_mm_mul_epu32(_mm_unpacklo_epi32(m, _mm_setzero_si128()),
1586
_mm_unpacklo_epi32(v.m, _mm_setzero_si128()))), // x,y
1587
_mm_castsi128_ps(_mm_mul_epu32(_mm_unpackhi_epi32(m, _mm_setzero_si128()),
1588
_mm_unpackhi_epi32(v.m, _mm_setzero_si128()))), // z,w
1589
_MM_SHUFFLE(2, 0, 2, 0))));
1590
}
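// The trick above: _mm_mul_epu32 multiplies only the even 32-bit lanes into 64-bit products, so
// each half is widened, multiplied, and the low 32 bits of every product are shuffled back
// together with _MM_SHUFFLE(2, 0, 2, 0); the low 32 bits of a 32x32 multiply do not depend on
// signedness, which is why the unsigned multiply is valid here.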
1591
#endif
1592
1593
ALWAYS_INLINE bool eq(const GSVector4i& v) const
1594
{
1595
#ifdef CPU_ARCH_SSE41
1596
const GSVector4i t = *this ^ v;
1597
return _mm_testz_si128(t, t) != 0;
1598
#else
1599
return eq8(v).alltrue();
1600
#endif
1601
}
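// eq() XORs the two vectors and uses PTEST (SSE4.1) to check that every bit cancelled out; the
// SSE2 path instead compares bytewise and requires a full 0xffff movemask.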
1602
1603
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); }
1604
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); }
1605
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); }
1606
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); }
1607
1608
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
1609
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
1610
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
1611
1612
ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
1613
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
1614
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); }
1615
1616
ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); }
1617
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); }
1618
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); }
1619
1620
ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); }
1621
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); }
1622
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); }
1623
1624
ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
1625
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
1626
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); }
1627
1628
ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); }
1629
1630
ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); }
1631
1632
ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; }
1633
1634
ALWAYS_INLINE bool anytrue() const
1635
{
1636
#ifdef CPU_ARCH_SSE41
1637
return (_mm_testz_si128(m, m) == 0);
1638
#else
1639
return (mask() != 0);
1640
#endif
1641
}
1642
1643
ALWAYS_INLINE bool allfalse() const
1644
{
1645
#ifdef CPU_ARCH_SSE41
1646
return (_mm_testz_si128(m, m) != 0);
1647
#else
1648
return (mask() == 0);
1649
#endif
1650
}
1651
1652
template<s32 i>
1653
ALWAYS_INLINE GSVector4i insert8(s32 a) const
1654
{
1655
#ifdef CPU_ARCH_SSE41
1656
return GSVector4i(_mm_insert_epi8(m, a, i));
1657
#else
1658
GSVector4i ret(*this);
1659
ret.S8[i] = static_cast<s8>(a);
1660
return ret;
1661
#endif
1662
}
1663
1664
template<s32 i>
1665
ALWAYS_INLINE s32 extract8() const
1666
{
1667
#ifdef CPU_ARCH_SSE41
1668
return _mm_extract_epi8(m, i);
1669
#else
1670
return S8[i];
1671
#endif
1672
}
1673
1674
template<s32 i>
1675
ALWAYS_INLINE GSVector4i insert16(s32 a) const
1676
{
1677
#ifdef CPU_ARCH_SSE41
1678
return GSVector4i(_mm_insert_epi16(m, a, i));
1679
#else
1680
GSVector4i ret(*this);
1681
ret.S16[i] = static_cast<s16>(a);
1682
return ret;
1683
#endif
1684
}
1685
1686
template<s32 i>
1687
ALWAYS_INLINE s32 extract16() const
1688
{
1689
#ifdef CPU_ARCH_SSE41
1690
return _mm_extract_epi16(m, i);
1691
#else
1692
return S16[i];
1693
#endif
1694
}
1695
1696
template<s32 i>
1697
ALWAYS_INLINE GSVector4i insert32(s32 a) const
1698
{
1699
#ifdef CPU_ARCH_SSE41
1700
return GSVector4i(_mm_insert_epi32(m, a, i));
1701
#else
1702
GSVector4i ret(*this);
1703
ret.S32[i] = a;
1704
return ret;
1705
#endif
1706
}
1707
1708
template<s32 i>
1709
ALWAYS_INLINE s32 extract32() const
1710
{
1711
#ifdef CPU_ARCH_SSE41
1712
return _mm_extract_epi32(m, i);
1713
#else
1714
if constexpr (i == 0)
1715
return _mm_cvtsi128_si32(m);
1716
else
1717
return S32[i];
1718
#endif
1719
}
1720
1721
template<s32 i>
1722
ALWAYS_INLINE GSVector4i insert64(s64 a) const
1723
{
1724
#ifdef CPU_ARCH_SSE41
1725
return GSVector4i(_mm_insert_epi64(m, a, i));
1726
#else
1727
GSVector4i ret(*this);
1728
ret.S64[i] = a;
1729
return ret;
1730
#endif
1731
}
1732
1733
template<s32 i>
1734
ALWAYS_INLINE s64 extract64() const
1735
{
1736
#ifdef CPU_ARCH_SSE41
1737
return _mm_extract_epi64(m, i);
1738
#else
1739
return S64[i];
1740
#endif
1741
}
1742
1743
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
1744
{
1745
// Should be const, but isn't...
1746
return GSVector4i(_mm_stream_load_si128(const_cast<__m128i*>(static_cast<const __m128i*>(p))));
1747
}
1748
1749
ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
1750
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); }
1751
1752
template<bool aligned>
1753
ALWAYS_INLINE static GSVector4i loadl(const void* p)
1754
{
1755
return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
1756
}
1757
1758
ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v)
1759
{
1760
return GSVector4i(_mm_unpacklo_epi64(v.m, _mm_setzero_si128()));
1761
}
1762
1763
template<bool aligned>
1764
ALWAYS_INLINE static GSVector4i loadh(const void* p)
1765
{
1766
return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
1767
}
1768
1769
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
1770
{
1771
return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));
1772
}
1773
1774
template<bool aligned>
1775
ALWAYS_INLINE static GSVector4i load(const void* p)
1776
{
1777
return GSVector4i(aligned ? _mm_load_si128(static_cast<const __m128i*>(p)) :
1778
_mm_loadu_si128(static_cast<const __m128i*>(p)));
1779
}
1780
1781
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
1782
1783
template<bool aligned>
1784
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
1785
{
1786
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
1787
}
1788
1789
template<bool aligned>
1790
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
1791
{
1792
_mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
1793
}
1794
1795
template<bool aligned>
1796
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
1797
{
1798
if constexpr (aligned)
1799
_mm_store_si128(static_cast<__m128i*>(p), v.m);
1800
else
1801
_mm_storeu_si128(static_cast<__m128i*>(p), v.m);
1802
}
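// Illustrative usage (pointer names are hypothetical): the `aligned` template parameter selects
// between the aligned and unaligned forms, e.g.
//   GSVector4i v = GSVector4i::load<true>(aligned_src);  // movdqa, source must be 16-byte aligned
//   GSVector4i::store<false>(unaligned_dst, v);          // movdqu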
1803
1804
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); }
1805
1806
ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v)
1807
{
1808
m = _mm_and_si128(m, v);
1809
return *this;
1810
}
1811
ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v)
1812
{
1813
m = _mm_or_si128(m, v);
1814
return *this;
1815
}
1816
ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v)
1817
{
1818
m = _mm_xor_si128(m, v);
1819
return *this;
1820
}
1821
1822
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
1823
{
1824
return GSVector4i(_mm_and_si128(v1, v2));
1825
}
1826
1827
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
1828
{
1829
return GSVector4i(_mm_or_si128(v1, v2));
1830
}
1831
1832
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
1833
{
1834
return GSVector4i(_mm_xor_si128(v1, v2));
1835
}
1836
1837
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
1838
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
1839
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
1840
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }
1841
1842
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); }
1843
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
1844
1845
ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }
1846
1847
template<bool aligned>
1848
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
1849
{
1850
return load<aligned>(v);
1851
}
1852
1853
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
1854
1855
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw)
1856
{
1857
return GSVector4i(_mm_unpacklo_epi64(xyzw.m, xyzw.m));
1858
}
1859
1860
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
1861
{
1862
return GSVector4i(_mm_unpacklo_epi64(xy.m, zw.m));
1863
}
1864
1865
static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);
1866
1867
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }
1868
1869
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); }
1870
1871
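// The VECTOR4i_SHUFFLE_* macros below expand to every four-letter swizzle accessor: e.g.
// v.yxwz() permutes the 32-bit lanes, while v.yxwzl() and v.yxwzh() apply the same permutation
// to the low and high four 16-bit lanes respectively.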
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
1872
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
1873
{ \
1874
return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn))); \
1875
} \
1876
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
1877
{ \
1878
return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \
1879
} \
1880
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
1881
{ \
1882
return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \
1883
}
1884
1885
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
1886
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
1887
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
1888
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
1889
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
1890
1891
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
1892
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
1893
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
1894
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
1895
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
1896
1897
#define VECTOR4i_SHUFFLE_1(xs, xn) \
1898
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
1899
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
1900
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
1901
VECTOR4i_SHUFFLE_2(xs, xn, w, 3)
1902
1903
VECTOR4i_SHUFFLE_1(x, 0);
1904
VECTOR4i_SHUFFLE_1(y, 1);
1905
VECTOR4i_SHUFFLE_1(z, 2);
1906
VECTOR4i_SHUFFLE_1(w, 3)
1907
1908
#undef VECTOR4i_SHUFFLE_1
1909
#undef VECTOR4i_SHUFFLE_2
1910
#undef VECTOR4i_SHUFFLE_3
1911
#undef VECTOR4i_SHUFFLE_4
1912
};
1913
1914
class alignas(16) GSVector4
1915
{
1916
struct cxpr_init_tag
1917
{
1918
};
1919
static constexpr cxpr_init_tag cxpr_init{};
1920
1921
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
1922
1923
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
1924
1925
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
1926
1927
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
1928
1929
public:
1930
union
1931
{
1932
struct
1933
{
1934
float x, y, z, w;
1935
};
1936
struct
1937
{
1938
float r, g, b, a;
1939
};
1940
struct
1941
{
1942
float left, top, right, bottom;
1943
};
1944
float F32[4];
1945
double F64[2];
1946
s8 I8[16];
1947
s16 I16[8];
1948
s32 I32[4];
1949
s64 I64[2];
1950
u8 U8[16];
1951
u16 U16[8];
1952
u32 U32[4];
1953
u64 U64[2];
1954
__m128 m;
1955
};
1956
1957
GSVector4() = default;
1958
1959
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
1960
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
1961
1962
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
1963
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
1964
1965
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
1966
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
1967
1968
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
1969
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
1970
1971
constexpr static GSVector4 cxpr_rgba32(u32 rgba)
1972
{
1973
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff), static_cast<float>((rgba >> 8) & 0xff),
1974
static_cast<float>((rgba >> 16) & 0xff), static_cast<float>((rgba >> 24) & 0xff));
1975
}
1976
1977
constexpr static GSVector4 cxpr_unorm8(u32 rgba)
1978
{
1979
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff) / 255.0f,
1980
static_cast<float>((rgba >> 8) & 0xff) / 255.0f, static_cast<float>((rgba >> 16) & 0xff) / 255.0f,
1981
static_cast<float>((rgba >> 24) & 0xff) / 255.0f);
1982
}
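// Illustrative example: cxpr_unorm8(0x80FF40C0u) yields { 0xC0/255.0f, 0x40/255.0f, 0xFF/255.0f,
// 0x80/255.0f }, i.e. the packed byte order is x = bits 0-7 through w = bits 24-31.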
1983
1984
ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
1985
ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
1986
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
1987
{
1988
GSVector4i v_(x, y, z, w);
1989
m = _mm_cvtepi32_ps(v_.m);
1990
}
1991
ALWAYS_INLINE GSVector4(int x, int y)
1992
{
1993
m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
1994
}
1995
1996
ALWAYS_INLINE explicit GSVector4(const GSVector2& v)
1997
: m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(v.m), _mm_setzero_pd())))
1998
{
1999
}
2000
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
2001
: m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd())))
2002
{
2003
}
2004
2005
ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {}
2006
2007
ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {}
2008
2009
ALWAYS_INLINE explicit GSVector4(float f) { *this = f; }
2010
2011
ALWAYS_INLINE explicit GSVector4(int i)
2012
{
2013
#ifdef CPU_ARCH_AVX2
2014
m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
2015
#else
2016
*this = GSVector4(GSVector4i(i));
2017
#endif
2018
}
2019
2020
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v) : m(_mm_cvtepi32_ps(v)) {}
2021
2022
ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
2023
ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); }
2024
2025
ALWAYS_INLINE GSVector4& operator=(float f)
2026
{
2027
m = _mm_set1_ps(f);
2028
return *this;
2029
}
2030
2031
ALWAYS_INLINE GSVector4& operator=(__m128 m_)
2032
{
2033
this->m = m_;
2034
return *this;
2035
}
2036
2037
ALWAYS_INLINE operator __m128() const { return m; }
2038
2039
u32 rgba32() const { return GSVector4i(*this).rgba32(); }
2040
2041
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
2042
{
2043
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
2044
}
2045
2046
ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
2047
2048
ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); }
2049
2050
ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); }
2051
2052
ALWAYS_INLINE GSVector4 floor() const
2053
{
2054
#ifdef CPU_ARCH_SSE41
2055
return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
2056
#else
2057
return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w));
2058
#endif
2059
}
2060
2061
ALWAYS_INLINE GSVector4 ceil() const
2062
{
2063
#ifdef CPU_ARCH_SSE41
2064
return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
2065
#else
2066
return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w));
2067
#endif
2068
}
2069
2070
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); }
2071
2072
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); }
2073
2074
ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); }
2075
2076
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); }
2077
2078
#ifdef CPU_ARCH_SSE41
2079
ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); }
2080
ALWAYS_INLINE float addv() const
2081
{
2082
const __m128 pairs = _mm_hadd_ps(m, m);
2083
return _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
2084
}
2085
#else
2086
float dot(const GSVector4& v) const
2087
{
2088
__m128 tmp = _mm_mul_ps(m, v.m);
2089
tmp = _mm_add_ps(tmp, _mm_movehl_ps(tmp, tmp)); // (x+z, y+w, ..., ...)
2090
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
2091
return _mm_cvtss_f32(tmp);
2092
}
2093
float addv() const
2094
{
2095
__m128 tmp = _mm_add_ps(m, _mm_movehl_ps(m, m)); // (x+z, y+w, ..., ...)
2096
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
2097
return _mm_cvtss_f32(tmp);
2098
}
2099
#endif
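// The non-SSE4.1 dot()/addv() above reduce horizontally in two steps: fold the upper half onto
// the lower half with _mm_movehl_ps, then add lane 1 into lane 0 and read the scalar back with
// _mm_cvtss_f32.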
2100
2101
ALWAYS_INLINE float minv() const
2102
{
2103
__m128 v = _mm_min_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)));
2104
v = _mm_min_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
2105
return _mm_cvtss_f32(v);
2106
}
2107
2108
ALWAYS_INLINE float maxv() const
2109
{
2110
__m128 v = _mm_max_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)));
2111
v = _mm_max_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
2112
return _mm_cvtss_f32(v);
2113
}
2114
2115
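// The rectangle helpers below treat the vector as { left, top, right, bottom }: rsize() returns
// the width/height pair, rempty() reports a degenerate (zero or negative extent) rectangle,
// rvalid() checks that left < right and top < bottom, and runion()/rintersect() build the
// bounding/overlapping rectangle of two such vectors.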
ALWAYS_INLINE float width() const { return right - left; }
2116
ALWAYS_INLINE float height() const { return bottom - top; }
2117
2118
ALWAYS_INLINE GSVector2 rsize() const { return (zwzw() - xyxy()).xy(); }
2119
ALWAYS_INLINE bool rempty() const { return ((*this < zwzw()).mask() != 0x3); }
2120
ALWAYS_INLINE bool rvalid() const { return ((((*this >= zwzw()).mask()) & 0x03) == 0); }
2121
2122
ALWAYS_INLINE GSVector4 runion(const GSVector4 v) const { return min(v).blend32<0xc>(max(v)); }
2123
2124
ALWAYS_INLINE GSVector4 rintersect(const GSVector4& v) const { return sat(v); }
2125
ALWAYS_INLINE bool rintersects(const GSVector4& v) const { return rintersect(v).rvalid(); }
2126
ALWAYS_INLINE bool rcontains(const GSVector4& v) const { return rintersect(v).eq(v); }
2127
2128
ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
2129
{
2130
return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
2131
}
2132
2133
ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const
2134
{
2135
return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw()));
2136
}
2137
2138
ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2139
2140
ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2141
2142
ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); }
2143
2144
ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); }
2145
2146
template<int mask>
2147
ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const
2148
{
2149
#ifdef CPU_ARCH_SSE41
2150
return GSVector4(_mm_blend_ps(m, v, mask));
2151
#else
2152
// horrible, just horrible
2153
static_assert(mask >= 0 && mask < 16);
2154
__m128 ret;
2155
if constexpr (mask == 0)
2156
{
2157
ret = m;
2158
}
2159
else if constexpr (mask == 1)
2160
{
2161
ret = _mm_move_ss(m, v.m);
2162
}
2163
else if constexpr (mask == 2)
2164
{
2165
ret = _mm_shuffle_ps(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(1, 1, 0, 0)), m, _MM_SHUFFLE(3, 2, 2, 0));
2166
}
2167
else if constexpr (mask == 3)
2168
{
2169
ret = _mm_castpd_ps(_mm_move_sd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
2170
}
2171
else if constexpr (mask == 4)
2172
{
2173
ret = _mm_shuffle_ps(m, _mm_shuffle_ps(v.m, m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 1, 0));
2174
}
2175
else if constexpr (mask == 5)
2176
{
2177
ret =
2178
_mm_shuffle_ps(_mm_move_ss(m, v.m), _mm_shuffle_ps(v.m, m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 1, 0));
2179
}
2180
else if constexpr (mask == 6)
2181
{
2182
ret = _mm_shuffle_ps(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(1, 1, 0, 0)),
2183
_mm_shuffle_ps(v.m, m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 2, 0));
2184
}
2185
else if constexpr (mask == 7)
2186
{
2187
ret = _mm_shuffle_ps(_mm_castpd_ps(_mm_move_sd(_mm_castps_pd(m), _mm_castps_pd(v.m))),
2188
_mm_shuffle_ps(v.m, m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 1, 0));
2189
}
2190
else if constexpr (mask == 8)
2191
{
2192
ret = _mm_shuffle_ps(m, _mm_shuffle_ps(m, v.m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 1, 0));
2193
}
2194
else if constexpr (mask == 9)
2195
{
2196
ret =
2197
_mm_shuffle_ps(_mm_move_ss(m, v.m), _mm_shuffle_ps(m, v.m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 1, 0));
2198
}
2199
else if constexpr (mask == 10)
2200
{
2201
ret = _mm_shuffle_ps(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(1, 1, 0, 0)),
2202
_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 2, 0));
2203
}
2204
else if constexpr (mask == 11)
2205
{
2206
ret = _mm_shuffle_ps(_mm_castpd_ps(_mm_move_sd(_mm_castps_pd(m), _mm_castps_pd(v.m))),
2207
_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(3, 3, 2, 2)), _MM_SHUFFLE(2, 0, 1, 0));
2208
}
2209
else if constexpr (mask == 12)
2210
{
2211
ret = _mm_shuffle_ps(m, v.m, _MM_SHUFFLE(3, 2, 1, 0));
2212
}
2213
else if constexpr (mask == 13)
2214
{
2215
ret = _mm_shuffle_ps(_mm_move_ss(m, v.m), v.m, _MM_SHUFFLE(3, 2, 1, 0));
2216
}
2217
else if constexpr (mask == 14)
2218
{
2219
ret = _mm_shuffle_ps(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(1, 1, 0, 0)), v.m, _MM_SHUFFLE(3, 2, 2, 0));
2220
}
2221
else if constexpr (mask == 15)
2222
{
2223
ret = v.m;
2224
}
2225
2226
return GSVector4(ret);
2227
#endif
2228
}
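// Illustrative usage (variable names are hypothetical): bit i of `mask` selects 32-bit lane i
// from `v` instead of this vector, so lo.blend32<0xc>(hi) yields { lo.x, lo.y, hi.z, hi.w },
// which is exactly how runion() combines a min and a max vector into a bounding rectangle.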
2229
2230
ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const
2231
{
2232
#ifdef CPU_ARCH_SSE41
2233
return GSVector4(_mm_blendv_ps(m, v, mask));
2234
#else
2235
// NOTE: Assumes the entire lane is set with 1s or 0s.
2236
return (v & mask) | andnot(mask);
2237
#endif
2238
}
2239
2240
ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); }
2241
2242
ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); }
2243
2244
ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const
2245
{
2246
return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
2247
}
2248
2249
ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const
2250
{
2251
return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
2252
}
2253
2254
ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); }
2255
2256
ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); }
2257
2258
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); }
2259
2260
ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); }
2261
2262
ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; }
2263
2264
ALWAYS_INLINE bool anytrue() const
2265
{
2266
#ifdef CPU_ARCH_AVX2
2267
return (_mm_testz_ps(m, m) == 0);
2268
#else
2269
const __m128i ii = _mm_castps_si128(m);
2270
return (_mm_testz_si128(ii, ii) == 0);
2271
#endif
2272
}
2273
2274
ALWAYS_INLINE bool allfalse() const
2275
{
2276
#ifdef CPU_ARCH_AVX2
2277
return (_mm_testz_ps(m, m) != 0);
2278
#else
2279
const __m128i ii = _mm_castps_si128(m);
2280
return (_mm_testz_si128(ii, ii) != 0);
2281
#endif
2282
}
2283
2284
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
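// replace_nan() relies on NaN != NaN: (*this == *this) is all-ones in lanes holding real values
// and all-zeros in NaN lanes, so the blend keeps this vector where it is valid and substitutes
// the corresponding lane of v where it is NaN.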
2285
2286
template<int src, int dst>
2287
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
2288
{
2289
#ifdef CPU_ARCH_SSE41
2290
if constexpr (src == dst)
2291
return GSVector4(_mm_blend_ps(m, v.m, 1 << src));
2292
else
2293
return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
2294
#else
2295
GSVector4 ret(*this);
2296
ret.F32[dst] = v.F32[src];
2297
return ret;
2298
#endif
2299
}
2300
2301
template<int i>
2302
ALWAYS_INLINE GSVector4 insert32(float v) const
2303
{
2304
#ifdef CPU_ARCH_SSE41
2305
if constexpr (i == 0)
2306
return GSVector4(_mm_move_ss(m, _mm_load_ss(&v)));
2307
else
2308
return GSVector4(_mm_insert_ps(m, _mm_load_ss(&v), _MM_MK_INSERTPS_NDX(0, i, 0)));
2309
#else
2310
GSVector4 ret(*this);
2311
ret.F32[i] = v;
2312
return ret;
2313
#endif
2314
}
2315
2316
template<int i>
2317
ALWAYS_INLINE float extract32() const
2318
{
2319
#ifdef CPU_ARCH_SSE41
2320
if constexpr (i == 0)
2321
return _mm_cvtss_f32(m);
2322
else
2323
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i)));
2324
#else
2325
return F32[i];
2326
#endif
2327
}
2328
2329
template<int dst>
2330
ALWAYS_INLINE GSVector4 insert64(double v) const
2331
{
2332
#ifdef CPU_ARCH_SSE41
2333
if constexpr (dst == 0)
2334
return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_sd(&v)));
2335
else
2336
return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_sd(&v), 0));
2337
#else
2338
GSVector4 ret(*this);
2339
ret.F64[dst] = v;
2340
return ret;
2341
#endif
2342
}
2343
2344
template<int src>
2345
ALWAYS_INLINE double extract64() const
2346
{
2347
double ret;
2348
if constexpr (src == 0)
2349
_mm_storel_pd(&ret, _mm_castps_pd(m));
2350
else
2351
_mm_storeh_pd(&ret, _mm_castps_pd(m));
2352
return ret;
2353
}
2354
2355
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
2356
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
2357
2358
ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
2359
2360
template<bool aligned>
2361
ALWAYS_INLINE static GSVector4 loadl(const void* p)
2362
{
2363
return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
2364
}
2365
2366
template<bool aligned>
2367
ALWAYS_INLINE static GSVector4 loadh(const void* p)
2368
{
2369
return GSVector4(_mm_castpd_ps(_mm_loadh_pd(_mm_setzero_pd(), static_cast<const double*>(p))));
2370
}
2371
2372
ALWAYS_INLINE static GSVector4 loadh(const GSVector2& v)
2373
{
2374
return GSVector4(_mm_unpacklo_pd(_mm_setzero_pd(), _mm_castps_pd(v.m)));
2375
}
2376
2377
template<bool aligned>
2378
ALWAYS_INLINE static GSVector4 load(const void* p)
2379
{
2380
return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p)));
2381
}
2382
2383
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
2384
2385
template<bool aligned>
2386
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
2387
{
2388
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
2389
}
2390
2391
template<bool aligned>
2392
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
2393
{
2394
_mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
2395
}
2396
2397
template<bool aligned>
2398
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
2399
{
2400
if constexpr (aligned)
2401
_mm_store_ps(static_cast<float*>(p), v.m);
2402
else
2403
_mm_storeu_ps(static_cast<float*>(p), v.m);
2404
}
2405
2406
ALWAYS_INLINE static void store32(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }
2407
2408
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
2409
2410
ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_)
2411
{
2412
m = _mm_add_ps(m, v_);
2413
return *this;
2414
}
2415
2416
ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_)
2417
{
2418
m = _mm_sub_ps(m, v_);
2419
return *this;
2420
}
2421
2422
ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_)
2423
{
2424
m = _mm_mul_ps(m, v_);
2425
return *this;
2426
}
2427
2428
ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_)
2429
{
2430
m = _mm_div_ps(m, v_);
2431
return *this;
2432
}
2433
2434
ALWAYS_INLINE GSVector4& operator+=(float f)
2435
{
2436
*this += GSVector4(f);
2437
return *this;
2438
}
2439
2440
ALWAYS_INLINE GSVector4& operator-=(float f)
2441
{
2442
*this -= GSVector4(f);
2443
return *this;
2444
}
2445
2446
ALWAYS_INLINE GSVector4& operator*=(float f)
2447
{
2448
*this *= GSVector4(f);
2449
return *this;
2450
}
2451
2452
ALWAYS_INLINE GSVector4& operator/=(float f)
2453
{
2454
*this /= GSVector4(f);
2455
return *this;
2456
}
2457
2458
ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_)
2459
{
2460
m = _mm_and_ps(m, v_);
2461
return *this;
2462
}
2463
2464
ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_)
2465
{
2466
m = _mm_or_ps(m, v_);
2467
return *this;
2468
}
2469
2470
ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_)
2471
{
2472
m = _mm_xor_ps(m, v_);
2473
return *this;
2474
}
2475
2476
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
2477
{
2478
return GSVector4(_mm_add_ps(v1, v2));
2479
}
2480
2481
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
2482
{
2483
return GSVector4(_mm_sub_ps(v1, v2));
2484
}
2485
2486
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
2487
{
2488
return GSVector4(_mm_mul_ps(v1, v2));
2489
}
2490
2491
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
2492
{
2493
return GSVector4(_mm_div_ps(v1, v2));
2494
}
2495
2496
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
2497
2498
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
2499
2500
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
2501
2502
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
2503
2504
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
2505
{
2506
return GSVector4(_mm_and_ps(v1, v2));
2507
}
2508
2509
ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
2510
{
2511
return GSVector4(_mm_or_ps(v1, v2));
2512
}
2513
2514
ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
2515
{
2516
return GSVector4(_mm_xor_ps(v1, v2));
2517
}
2518
2519
ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
2520
{
2521
return GSVector4(_mm_cmpeq_ps(v1, v2));
2522
}
2523
2524
ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
2525
{
2526
return GSVector4(_mm_cmpneq_ps(v1, v2));
2527
}
2528
2529
ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
2530
{
2531
return GSVector4(_mm_cmpgt_ps(v1, v2));
2532
}
2533
2534
ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
2535
{
2536
return GSVector4(_mm_cmplt_ps(v1, v2));
2537
}
2538
2539
ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
2540
{
2541
return GSVector4(_mm_cmpge_ps(v1, v2));
2542
}
2543
2544
ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
2545
{
2546
return GSVector4(_mm_cmple_ps(v1, v2));
2547
}
2548
2549
ALWAYS_INLINE bool eq(const GSVector4& v) const
2550
{
2551
#ifdef CPU_ARCH_SSE41
2552
const __m128i t = _mm_castps_si128(_mm_xor_ps(m, v.m));
2553
return _mm_testz_si128(t, t) != 0;
2554
#else
2555
return (*this == v).alltrue();
2556
#endif
2557
}
2558
2559
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const
2560
{
2561
return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
2562
}
2563
2564
ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const
2565
{
2566
return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
2567
}
2568
2569
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const
2570
{
2571
return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
2572
}
2573
2574
ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const
2575
{
2576
return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
2577
}
2578
2579
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const
2580
{
2581
return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2582
}
2583
2584
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const
2585
{
2586
return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2587
}
2588
2589
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const
2590
{
2591
return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2592
}
2593
2594
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const
2595
{
2596
return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2597
}
2598
2599
ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const
2600
{
2601
return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2602
}
2603
2604
ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const
2605
{
2606
return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2607
}
2608
2609
ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const
2610
{
2611
return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
2612
}
2613
2614
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
2615
2616
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
2617
2618
ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); }
2619
2620
ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); }
2621
2622
ALWAYS_INLINE GSVector4 floor64() const
2623
{
2624
return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
2625
}
2626
2627
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }
2628
2629
ALWAYS_INLINE static GSVector4 f32to64(const void* p)
2630
{
2631
return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
2632
}
2633
2634
ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); }
2635
2636
ALWAYS_INLINE GSVector2 xy() const { return GSVector2(m); }
2637
2638
ALWAYS_INLINE GSVector2 zw() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2))); }
2639
2640
ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
2641
{
2642
return GSVector4(_mm_movelh_ps(l.m, h.m));
2643
}
2644
2645
ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(_mm_movelh_ps(l.m, l.m)); }
2646
2647
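// As with GSVector4i, the VECTOR4_SHUFFLE_* macros below generate the full set of float swizzle
// accessors, e.g. v.zwxy() or v.xxxx().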
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
2648
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
2649
{ \
2650
return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); \
2651
}
2652
2653
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2654
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
2655
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
2656
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
2657
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
2658
2659
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
2660
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
2661
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
2662
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
2663
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);
2664
2665
#define VECTOR4_SHUFFLE_1(xs, xn) \
2666
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
2667
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
2668
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
2669
VECTOR4_SHUFFLE_2(xs, xn, w, 3);
2670
2671
VECTOR4_SHUFFLE_1(x, 0);
2672
VECTOR4_SHUFFLE_1(y, 1);
2673
VECTOR4_SHUFFLE_1(z, 2);
2674
VECTOR4_SHUFFLE_1(w, 3);
2675
2676
#undef VECTOR4_SHUFFLE_1
2677
#undef VECTOR4_SHUFFLE_2
2678
#undef VECTOR4_SHUFFLE_3
2679
#undef VECTOR4_SHUFFLE_4
2680
2681
#ifdef CPU_ARCH_AVX2
2682
2683
ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); }
2684
2685
ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); }
2686
2687
ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
2688
{
2689
return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
2690
}
2691
2692
#else
2693
2694
ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0))); }
2695
ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
2696
{
2697
return GSVector4(_mm_shuffle_ps(v.m, v.m, _MM_SHUFFLE(0, 0, 0, 0)));
2698
}
2699
ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
2700
{
2701
const __m128 v = _mm_load_ss(static_cast<const float*>(f));
2702
return GSVector4(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)));
2703
}
2704
2705
#endif
2706
2707
ALWAYS_INLINE static GSVector4 broadcast64(const void* d)
2708
{
2709
return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
2710
}
2711
};
2712
2713
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
2714
{
2715
m = _mm_cvttps_epi32(v);
2716
}
2717
2718
ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
2719
{
2720
return GSVector2i(_mm_castps_si128(v.m));
2721
}
2722
2723
ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
2724
{
2725
return GSVector2(_mm_castsi128_ps(v.m));
2726
}
2727
2728
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
2729
{
2730
m = _mm_cvttps_epi32(v);
2731
}
2732
2733
ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
2734
{
2735
return GSVector4i(_mm_castps_si128(v.m));
2736
}
2737
2738
ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
2739
{
2740
return GSVector4(_mm_castsi128_ps(v.m));
2741
}
2742
2743
#ifdef GSVECTOR_HAS_256
2744
2745
class alignas(32) GSVector8i
2746
{
2747
struct cxpr_init_tag
2748
{
2749
};
2750
static constexpr cxpr_init_tag cxpr_init{};
2751
2752
constexpr GSVector8i(cxpr_init_tag, s32 x0, s32 y0, s32 z0, s32 w0, s32 x1, s32 y1, s32 z1, s32 w1)
2753
: S32{x0, y0, z0, w0, x1, y1, z1, w1}
2754
{
2755
}
2756
2757
constexpr GSVector8i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7, s16 s8, s16 s9,
2758
s16 s10, s16 s11, s16 s12, s16 s13, s16 s14, s16 s15)
2759
: S16{s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15}
2760
{
2761
}
2762
2763
public:
2764
union
2765
{
2766
struct
2767
{
2768
s32 x0, y0, z0, w0, x1, y1, z1, w1;
2769
};
2770
struct
2771
{
2772
s32 r0, g0, b0, a0, r1, g1, b1, a1;
2773
};
2774
2775
float F32[8];
2776
s8 S8[32];
2777
s16 S16[16];
2778
s32 S32[8];
2779
s64 S64[4];
2780
u8 U8[32];
2781
u16 U16[16];
2782
u32 U32[8];
2783
u64 U64[4];
2784
__m256i m;
2785
};
2786
2787
GSVector8i() = default;
2788
2789
ALWAYS_INLINE constexpr static GSVector8i cxpr(s32 x0, s32 y0, s32 z0, s32 w0, s32 x1, s32 y1, s32 z1, s32 w1)
2790
{
2791
return GSVector8i(cxpr_init, x0, y0, z0, w0, x1, y1, z1, w1);
2792
}
2793
ALWAYS_INLINE constexpr static GSVector8i cxpr(s32 x) { return GSVector8i(cxpr_init, x, x, x, x, x, x, x, x); }
2794
2795
ALWAYS_INLINE constexpr static GSVector8i cxpr16(s16 x)
2796
{
2797
return GSVector8i(cxpr_init, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2798
}
2799
ALWAYS_INLINE constexpr static GSVector8i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7,
2800
s16 s8, s16 s9, s16 s10, s16 s11, s16 s12, s16 s13, s16 s14, s16 s15)
2801
{
2802
return GSVector8i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15);
2803
}
2804
2805
ALWAYS_INLINE explicit GSVector8i(s32 i) { *this = i; }
2806
2807
ALWAYS_INLINE constexpr explicit GSVector8i(__m256i m) : m(m) {}
2808
2809
ALWAYS_INLINE GSVector8i& operator=(s32 i)
2810
{
2811
m = _mm256_set1_epi32(i);
2812
return *this;
2813
}
2814
ALWAYS_INLINE GSVector8i& operator=(__m256i m_)
2815
{
2816
m = m_;
2817
return *this;
2818
}
2819
2820
ALWAYS_INLINE operator __m256i() const { return m; }
2821
2822
ALWAYS_INLINE GSVector8i min_s8(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi8(m, v)); }
2823
ALWAYS_INLINE GSVector8i max_s8(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi8(m, v)); }
2824
ALWAYS_INLINE GSVector8i min_s16(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi16(m, v)); }
2825
ALWAYS_INLINE GSVector8i max_s16(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi16(m, v)); }
2826
ALWAYS_INLINE GSVector8i min_s32(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi32(m, v)); }
2827
ALWAYS_INLINE GSVector8i max_s32(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi32(m, v)); }
2828
2829
ALWAYS_INLINE GSVector8i min_u8(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu8(m, v)); }
2830
ALWAYS_INLINE GSVector8i max_u8(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu8(m, v)); }
2831
ALWAYS_INLINE GSVector8i min_u16(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu16(m, v)); }
2832
ALWAYS_INLINE GSVector8i max_u16(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu16(m, v)); }
2833
ALWAYS_INLINE GSVector8i min_u32(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu32(m, v)); }
2834
ALWAYS_INLINE GSVector8i max_u32(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu32(m, v)); }
2835
2836
ALWAYS_INLINE GSVector8i madd_s16(const GSVector8i& v) const { return GSVector8i(_mm256_madd_epi16(m, v.m)); }
2837
2838
ALWAYS_INLINE GSVector8i clamp8() const { return pu16().upl8(); }
2839
2840
ALWAYS_INLINE GSVector8i blend8(const GSVector8i& v, const GSVector8i& mask) const
2841
{
2842
return GSVector8i(_mm256_blendv_epi8(m, v, mask));
2843
}
2844
2845
template<s32 mask>
2846
ALWAYS_INLINE GSVector8i blend16(const GSVector8i& v) const
2847
{
2848
return GSVector8i(_mm256_blend_epi16(m, v, mask));
2849
}
2850
2851
template<s32 mask>
2852
ALWAYS_INLINE GSVector8i blend32(const GSVector8i& v) const
2853
{
2854
return GSVector8i(_mm256_blend_epi32(m, v.m, mask));
2855
}
2856
2857
ALWAYS_INLINE GSVector8i blend(const GSVector8i& v, const GSVector8i& mask) const
2858
{
2859
return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, v)));
2860
}
2861
2862
ALWAYS_INLINE GSVector8i shuffle8(const GSVector8i& mask) const { return GSVector8i(_mm256_shuffle_epi8(m, mask)); }
2863
2864
ALWAYS_INLINE GSVector8i ps16(const GSVector8i& v) const { return GSVector8i(_mm256_packs_epi16(m, v)); }
2865
ALWAYS_INLINE GSVector8i ps16() const { return GSVector8i(_mm256_packs_epi16(m, m)); }
2866
ALWAYS_INLINE GSVector8i pu16(const GSVector8i& v) const { return GSVector8i(_mm256_packus_epi16(m, v)); }
2867
ALWAYS_INLINE GSVector8i pu16() const { return GSVector8i(_mm256_packus_epi16(m, m)); }
2868
ALWAYS_INLINE GSVector8i ps32(const GSVector8i& v) const { return GSVector8i(_mm256_packs_epi32(m, v)); }
2869
ALWAYS_INLINE GSVector8i ps32() const { return GSVector8i(_mm256_packs_epi32(m, m)); }
2870
ALWAYS_INLINE GSVector8i pu32(const GSVector8i& v) const { return GSVector8i(_mm256_packus_epi32(m, v)); }
2871
ALWAYS_INLINE GSVector8i pu32() const { return GSVector8i(_mm256_packus_epi32(m, m)); }
2872
2873
ALWAYS_INLINE GSVector8i upl8(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi8(m, v)); }
2874
ALWAYS_INLINE GSVector8i uph8(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi8(m, v)); }
2875
ALWAYS_INLINE GSVector8i upl16(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi16(m, v)); }
2876
ALWAYS_INLINE GSVector8i uph16(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi16(m, v)); }
2877
ALWAYS_INLINE GSVector8i upl32(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi32(m, v)); }
2878
ALWAYS_INLINE GSVector8i uph32(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi32(m, v)); }
2879
ALWAYS_INLINE GSVector8i upl64(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi64(m, v)); }
2880
ALWAYS_INLINE GSVector8i uph64(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi64(m, v)); }
2881
2882
ALWAYS_INLINE GSVector8i upl8() const { return GSVector8i(_mm256_unpacklo_epi8(m, _mm256_setzero_si256())); }
2883
ALWAYS_INLINE GSVector8i uph8() const { return GSVector8i(_mm256_unpackhi_epi8(m, _mm256_setzero_si256())); }
2884
2885
ALWAYS_INLINE GSVector8i upl16() const { return GSVector8i(_mm256_unpacklo_epi16(m, _mm256_setzero_si256())); }
2886
ALWAYS_INLINE GSVector8i uph16() const { return GSVector8i(_mm256_unpackhi_epi16(m, _mm256_setzero_si256())); }
2887
2888
ALWAYS_INLINE GSVector8i upl32() const { return GSVector8i(_mm256_unpacklo_epi32(m, _mm256_setzero_si256())); }
2889
2890
ALWAYS_INLINE GSVector8i uph32() const { return GSVector8i(_mm256_unpackhi_epi32(m, _mm256_setzero_si256())); }
2891
ALWAYS_INLINE GSVector8i upl64() const { return GSVector8i(_mm256_unpacklo_epi64(m, _mm256_setzero_si256())); }
2892
ALWAYS_INLINE GSVector8i uph64() const { return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256())); }
2893
2894
ALWAYS_INLINE GSVector8i s8to16() const { return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m))); }
2895
ALWAYS_INLINE GSVector8i s8to32() const { return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m))); }
2896
ALWAYS_INLINE GSVector8i s8to64() const { return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m))); }
2897
2898
ALWAYS_INLINE GSVector8i s16to32() const { return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m))); }
2899
ALWAYS_INLINE GSVector8i s16to64() const { return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m))); }
2900
ALWAYS_INLINE GSVector8i s32to64() const { return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m))); }
2901
ALWAYS_INLINE GSVector8i u8to16() const { return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m))); }
2902
ALWAYS_INLINE GSVector8i u8to32() const { return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m))); }
2903
ALWAYS_INLINE GSVector8i u8to64() const { return GSVector8i(_mm256_cvtepu8_epi64(_mm256_castsi256_si128(m))); }
2904
ALWAYS_INLINE GSVector8i u16to32() const { return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m))); }
2905
ALWAYS_INLINE GSVector8i u16to64() const { return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m))); }
2906
ALWAYS_INLINE GSVector8i u32to64() const { return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m))); }
2907
2908
ALWAYS_INLINE static GSVector8i s8to16(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi16(v.m)); }
2909
ALWAYS_INLINE static GSVector8i s8to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi32(v.m)); }
2910
ALWAYS_INLINE static GSVector8i s8to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi64(v.m)); }
2911
2912
ALWAYS_INLINE static GSVector8i s16to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi16_epi32(v.m)); }
2913
ALWAYS_INLINE static GSVector8i s16to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi16_epi64(v.m)); }
2914
ALWAYS_INLINE static GSVector8i s32to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi32_epi64(v.m)); }
2915
ALWAYS_INLINE static GSVector8i u8to16(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi16(v.m)); }
2916
ALWAYS_INLINE static GSVector8i u8to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi32(v.m)); }
2917
ALWAYS_INLINE static GSVector8i u8to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi64(v.m)); }
2918
ALWAYS_INLINE static GSVector8i u16to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi32(v.m)); }
2919
ALWAYS_INLINE static GSVector8i u16to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi64(v.m)); }
2920
ALWAYS_INLINE static GSVector8i u32to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu32_epi64(v.m)); }
2921
2922
template<s32 i>
2923
ALWAYS_INLINE GSVector8i srl() const
2924
{
2925
return GSVector8i(_mm256_srli_si256(m, i));
2926
}
2927
2928
template<s32 i>
2929
ALWAYS_INLINE GSVector8i srl(const GSVector8i& v)
2930
{
2931
return GSVector8i(_mm256_alignr_epi8(v.m, m, i));
2932
}
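// Note: the 256-bit byte shifts (_mm256_slli_si256, _mm256_srli_si256, _mm256_alignr_epi8)
// operate on each 128-bit lane independently rather than across the whole register.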
2933
2934
template<s32 i>
2935
ALWAYS_INLINE GSVector8i sll() const
2936
{
2937
return GSVector8i(_mm256_slli_si256(m, i));
2938
}
2939
2940
template<s32 i>
2941
ALWAYS_INLINE GSVector8i sll16() const
2942
{
2943
return GSVector8i(_mm256_slli_epi16(m, i));
2944
}
2945
2946
ALWAYS_INLINE GSVector8i sll16(s32 i) const { return GSVector8i(_mm256_sll_epi16(m, _mm_cvtsi32_si128(i))); }
2947
ALWAYS_INLINE GSVector8i sllv16(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi16(m, v.m)); }
2948
2949
template<s32 i>
2950
ALWAYS_INLINE GSVector8i srl16() const
2951
{
2952
return GSVector8i(_mm256_srli_epi16(m, i));
2953
}
2954
2955
ALWAYS_INLINE GSVector8i srl16(s32 i) const { return GSVector8i(_mm256_srl_epi16(m, _mm_cvtsi32_si128(i))); }
2956
ALWAYS_INLINE GSVector8i srlv16(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi16(m, v.m)); }
2957
2958
template<s32 i>
2959
ALWAYS_INLINE GSVector8i sra16() const
2960
{
2961
return GSVector8i(_mm256_srai_epi16(m, i));
2962
}
2963
2964
ALWAYS_INLINE GSVector8i sra16(s32 i) const { return GSVector8i(_mm256_sra_epi16(m, _mm_cvtsi32_si128(i))); }
2965
ALWAYS_INLINE GSVector8i srav16(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi16(m, v.m)); }
2966
2967
template<s32 i>
2968
ALWAYS_INLINE GSVector8i sll32() const
2969
{
2970
return GSVector8i(_mm256_slli_epi32(m, i));
2971
}
2972
2973
ALWAYS_INLINE GSVector8i sll32(s32 i) const { return GSVector8i(_mm256_sll_epi32(m, _mm_cvtsi32_si128(i))); }
2974
ALWAYS_INLINE GSVector8i sllv32(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi32(m, v.m)); }
2975
2976
template<s32 i>
2977
ALWAYS_INLINE GSVector8i srl32() const
2978
{
2979
return GSVector8i(_mm256_srli_epi32(m, i));
2980
}
2981
2982
ALWAYS_INLINE GSVector8i srl32(s32 i) const { return GSVector8i(_mm256_srl_epi32(m, _mm_cvtsi32_si128(i))); }
2983
ALWAYS_INLINE GSVector8i srlv32(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi32(m, v.m)); }
2984
2985
template<s32 i>
2986
ALWAYS_INLINE GSVector8i sra32() const
2987
{
2988
return GSVector8i(_mm256_srai_epi32(m, i));
2989
}
2990
2991
ALWAYS_INLINE GSVector8i sra32(s32 i) const { return GSVector8i(_mm256_sra_epi32(m, _mm_cvtsi32_si128(i))); }
2992
ALWAYS_INLINE GSVector8i srav32(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi32(m, v.m)); }
2993
2994
template<s64 i>
2995
ALWAYS_INLINE GSVector8i sll64() const
2996
{
2997
return GSVector8i(_mm256_slli_epi64(m, i));
2998
}
2999
3000
ALWAYS_INLINE GSVector8i sll64(s32 i) const { return GSVector8i(_mm256_sll_epi64(m, _mm_cvtsi32_si128(i))); }
3001
ALWAYS_INLINE GSVector8i sllv64(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi64(m, v.m)); }
3002
3003
template<s64 i>
3004
ALWAYS_INLINE GSVector8i srl64() const
3005
{
3006
return GSVector8i(_mm256_srli_epi64(m, i));
3007
}
3008
3009
ALWAYS_INLINE GSVector8i srl64(s32 i) const { return GSVector8i(_mm256_srl_epi64(m, _mm_cvtsi32_si128(i))); }
3010
ALWAYS_INLINE GSVector8i srlv64(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi64(m, v.m)); }
3011
3012
template<s64 i>
3013
ALWAYS_INLINE GSVector8i sra64() const
3014
{
3015
return GSVector8i(_mm256_srai_epi64(m, i));
3016
}
3017
3018
ALWAYS_INLINE GSVector8i sra64(s32 i) const { return GSVector8i(_mm256_sra_epi64(m, _mm_cvtsi32_si128(i))); }
3019
ALWAYS_INLINE GSVector8i srav64(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi64(m, v.m)); }
ALWAYS_INLINE GSVector8i add8(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i add16(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i add32(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi32(m, v.m)); }
ALWAYS_INLINE GSVector8i adds8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i adds16(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i hadds16(const GSVector8i& v) const { return GSVector8i(_mm256_hadds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i addus8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu8(m, v.m)); }
ALWAYS_INLINE GSVector8i addus16(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu16(m, v.m)); }

ALWAYS_INLINE GSVector8i sub8(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i sub16(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i sub32(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi32(m, v.m)); }
ALWAYS_INLINE GSVector8i subs8(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i subs16(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i subus8(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epu8(m, v.m)); }
ALWAYS_INLINE GSVector8i subus16(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epu16(m, v.m)); }
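
// 16/32-bit multiplies: mul16hs and mul16l return the high and low halves of the signed 16-bit
// product, mul16hrs the rounded high bits (pmulhrsw semantics), and mul32l the low 32 bits.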
ALWAYS_INLINE GSVector8i mul16hs(const GSVector8i& v) const { return GSVector8i(_mm256_mulhi_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i mul16l(const GSVector8i& v) const { return GSVector8i(_mm256_mullo_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i mul16hrs(const GSVector8i& v) const { return GSVector8i(_mm256_mulhrs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i mul32l(const GSVector8i& v) const { return GSVector8i(_mm256_mullo_epi32(m, v.m)); }
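
// Comparisons: eq() tests whole-vector equality; the width-suffixed forms return per-lane masks
// (all ones for true, all zeros for false).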
ALWAYS_INLINE bool eq(const GSVector8i& v) const
{
const GSVector8i t = *this ^ v;
return _mm256_testz_si256(t, t) != 0;
}

ALWAYS_INLINE GSVector8i eq8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i eq16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i eq32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi32(m, v.m)); }
ALWAYS_INLINE GSVector8i eq64(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi64(m, v.m)); }

ALWAYS_INLINE GSVector8i neq8(const GSVector8i& v) const { return ~eq8(v); }
ALWAYS_INLINE GSVector8i neq16(const GSVector8i& v) const { return ~eq16(v); }
ALWAYS_INLINE GSVector8i neq32(const GSVector8i& v) const { return ~eq32(v); }

ALWAYS_INLINE GSVector8i gt8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i gt16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i gt32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi32(m, v.m)); }

ALWAYS_INLINE GSVector8i ge8(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi8(v.m, m)); }
ALWAYS_INLINE GSVector8i ge16(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi16(v.m, m)); }
ALWAYS_INLINE GSVector8i ge32(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi32(v.m, m)); }

ALWAYS_INLINE GSVector8i lt8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi8(v.m, m)); }
ALWAYS_INLINE GSVector8i lt16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi16(v.m, m)); }
ALWAYS_INLINE GSVector8i lt32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi32(v.m, m)); }

ALWAYS_INLINE GSVector8i le8(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi8(m, v.m)); }
ALWAYS_INLINE GSVector8i le16(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi16(m, v.m)); }
ALWAYS_INLINE GSVector8i le32(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi32(m, v.m)); }
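
// andnot(v) computes this & ~v; mask() packs the sign bit of each byte into a 32-bit mask.
// alltrue() requires every byte's sign bit to be set, while anytrue()/allfalse() test whether
// any bit of the vector is set at all.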
ALWAYS_INLINE GSVector8i andnot(const GSVector8i& v) const { return GSVector8i(_mm256_andnot_si256(v.m, m)); }

ALWAYS_INLINE u32 mask() const { return static_cast<u32>(_mm256_movemask_epi8(m)); }

ALWAYS_INLINE bool alltrue() const { return mask() == 0xFFFFFFFFu; }
ALWAYS_INLINE bool anytrue() const { return (_mm256_testz_si256(m, m) == 0); }
ALWAYS_INLINE bool allfalse() const { return (_mm256_testz_si256(m, m) != 0); }
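
// Scalar lane access: insert*/extract* read or write the 8/16/32/64-bit lane selected by the
// template index, returning a new vector (insert) or the scalar value (extract).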
template<s32 i>
ALWAYS_INLINE GSVector8i insert8(s32 a) const
{
return GSVector8i(_mm256_insert_epi8(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s32 extract8() const
{
return _mm256_extract_epi8(m, i);
}

template<s32 i>
ALWAYS_INLINE GSVector8i insert16(s32 a) const
{
return GSVector8i(_mm256_insert_epi16(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s32 extract16() const
{
return _mm256_extract_epi16(m, i);
}

template<s32 i>
ALWAYS_INLINE GSVector8i insert32(s32 a) const
{
return GSVector8i(_mm256_insert_epi32(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s32 extract32() const
{
return _mm256_extract_epi32(m, i);
}

template<s32 i>
ALWAYS_INLINE GSVector8i insert64(s64 a) const
{
return GSVector8i(_mm256_insert_epi64(m, a, i));
}

template<s32 i>
ALWAYS_INLINE s64 extract64() const
{
return _mm256_extract_epi64(m, i);
}
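
// Loads and stores: zext32() zero-extends a scalar into the low lane, the aligned template
// parameter selects aligned vs. unaligned accesses, loadnt/storent use non-temporal (streaming)
// hints, and storel() writes only the low 128 bits.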
ALWAYS_INLINE static GSVector8i zext32(s32 v) { return GSVector8i(_mm256_zextsi128_si256(GSVector4i::zext32(v))); }

ALWAYS_INLINE static GSVector8i loadnt(const void* p)
{
// Should be const, but isn't...
return GSVector8i(_mm256_stream_load_si256(const_cast<__m256i*>(static_cast<const __m256i*>(p))));
}

template<bool aligned>
ALWAYS_INLINE static GSVector8i load(const void* p)
{
return GSVector8i(aligned ? _mm256_load_si256(static_cast<const __m256i*>(p)) :
_mm256_loadu_si256(static_cast<const __m256i*>(p)));
}

ALWAYS_INLINE static void storent(void* p, const GSVector8i& v)
{
_mm256_stream_si256(static_cast<__m256i*>(p), v.m);
}

template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector8i& v)
{
if constexpr (aligned)
_mm256_store_si256(static_cast<__m256i*>(p), v.m);
else
_mm256_storeu_si256(static_cast<__m256i*>(p), v.m);
}

template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector8i& v)
{
if constexpr (aligned)
_mm_store_si128(static_cast<__m128i*>(p), _mm256_castsi256_si128(v.m));
else
_mm_storeu_si128(static_cast<__m128i*>(p), _mm256_castsi256_si128(v.m));
}
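
// Bitwise operators; operator~ is implemented as XOR against an all-ones mask produced by
// comparing the vector to itself.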
ALWAYS_INLINE GSVector8i& operator&=(const GSVector8i& v)
{
m = _mm256_and_si256(m, v);
return *this;
}
ALWAYS_INLINE GSVector8i& operator|=(const GSVector8i& v)
{
m = _mm256_or_si256(m, v);
return *this;
}
ALWAYS_INLINE GSVector8i& operator^=(const GSVector8i& v)
{
m = _mm256_xor_si256(m, v);
return *this;
}

ALWAYS_INLINE friend GSVector8i operator&(const GSVector8i& v1, const GSVector8i& v2)
{
return GSVector8i(_mm256_and_si256(v1, v2));
}

ALWAYS_INLINE friend GSVector8i operator|(const GSVector8i& v1, const GSVector8i& v2)
{
return GSVector8i(_mm256_or_si256(v1, v2));
}

ALWAYS_INLINE friend GSVector8i operator^(const GSVector8i& v1, const GSVector8i& v2)
{
return GSVector8i(_mm256_xor_si256(v1, v2));
}

ALWAYS_INLINE friend GSVector8i operator&(const GSVector8i& v, s32 i) { return v & GSVector8i(i); }
ALWAYS_INLINE friend GSVector8i operator|(const GSVector8i& v, s32 i) { return v | GSVector8i(i); }
ALWAYS_INLINE friend GSVector8i operator^(const GSVector8i& v, s32 i) { return v ^ GSVector8i(i); }
ALWAYS_INLINE friend GSVector8i operator~(const GSVector8i& v) { return v ^ v.eq32(v); }
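
// Zero constant, 128-bit broadcast, and access to the low/high 128-bit halves.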
ALWAYS_INLINE static GSVector8i zero() { return GSVector8i(_mm256_setzero_si256()); }

ALWAYS_INLINE static GSVector8i broadcast128(const GSVector4i& v)
{
return GSVector8i(_mm256_broadcastsi128_si256(v.m));
}

template<bool aligned>
ALWAYS_INLINE static GSVector8i broadcast128(const void* v)
{
return broadcast128(GSVector4i::load<aligned>(v));
}

ALWAYS_INLINE GSVector4i low128() const { return GSVector4i(_mm256_castsi256_si128(m)); }
ALWAYS_INLINE GSVector4i high128() const { return GSVector4i(_mm256_extracti128_si256(m, 1)); }
};

#endif