// cppspmd_sse.h
// Copyright 2020-2022 Binomial LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Notes for Basis Universal:
// All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. If BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation.
// The techniques used in this code were originally demonstrated for AVX2 by Nicolas Guillemot and Jefferson Amstutz in their "CppSPMD" project.
// This is new code for use in Basis Universal, although it uses the same general SPMD techniques in SSE 2/4.
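//
// Quick orientation: this header implements a 4-lane SPMD (single program, multiple data) model on top of
// SSE2/SSE4.1. Kernels derive from spmd_kernel, operate on the varying types vbool/vint/vfloat under an
// execution mask, and the whole header is compiled once per architecture via the CPPSPMD namespace macros
// below. A minimal illustrative sketch (the kernel name and buffers are placeholders, not part of this
// header; see the companion cppspmd flow-control/math headers for the full API and how kernels are invoked
// through spmd_call<>):
//
//   struct my_kernel : spmd_kernel
//   {
//      void _call(float* pDst, float* pSrc)
//      {
//         vfloat x = load(program_index[pSrc]);        // loads 4 consecutive floats, one per lane
//         spmd_ifelse(x < 0.0f,
//            [&] { store(program_index[pDst], -x); },  // runs with only the lanes where x <  0 active
//            [&] { store(program_index[pDst],  x); }); // runs with only the lanes where x >= 0 active
//      }
//   };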

#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <utility>
#include <algorithm>

#if CPPSPMD_SSE2
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#else
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4.1
//#include <nmmintrin.h> // SSE4.2
#endif

#undef CPPSPMD_SSE
#undef CPPSPMD_AVX1
#undef CPPSPMD_AVX2
#undef CPPSPMD_AVX
#undef CPPSPMD_FLOAT4
#undef CPPSPMD_INT16

#define CPPSPMD_SSE 1
#define CPPSPMD_AVX 0
#define CPPSPMD_AVX1 0
#define CPPSPMD_AVX2 0
#define CPPSPMD_FLOAT4 0
#define CPPSPMD_INT16 0

#ifdef _MSC_VER
#ifndef CPPSPMD_DECL
#define CPPSPMD_DECL(type, name) __declspec(align(16)) type name
#endif

#ifndef CPPSPMD_ALIGN
#define CPPSPMD_ALIGN(v) __declspec(align(v))
#endif

#define _mm_undefined_si128 _mm_setzero_si128
#define _mm_undefined_ps _mm_setzero_ps
#else
#ifndef CPPSPMD_DECL
#define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32)))
#endif

#ifndef CPPSPMD_ALIGN
#define CPPSPMD_ALIGN(v) __attribute__((aligned(v)))
#endif
#endif

#ifndef CPPSPMD_FORCE_INLINE
#ifdef _DEBUG
#define CPPSPMD_FORCE_INLINE inline
#else
#ifdef _MSC_VER
#define CPPSPMD_FORCE_INLINE __forceinline
#else
#define CPPSPMD_FORCE_INLINE inline
#endif
#endif
#endif

#undef CPPSPMD
#undef CPPSPMD_ARCH

#if CPPSPMD_SSE2
#define CPPSPMD_SSE41 0
#define CPPSPMD cppspmd_sse2
#define CPPSPMD_ARCH _sse2
#else
#define CPPSPMD_SSE41 1
#define CPPSPMD cppspmd_sse41
#define CPPSPMD_ARCH _sse41
#endif

#ifndef CPPSPMD_GLUER
#define CPPSPMD_GLUER(a, b) a##b
#endif

#ifndef CPPSPMD_GLUER2
#define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b)
#endif

#ifndef CPPSPMD_NAME
#define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH)
#endif

#undef VASSERT
#define VCOND(cond) ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask())
#define VASSERT(cond) assert( VCOND(cond) )

#define CPPSPMD_ALIGNMENT (16)

#define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))

namespace CPPSPMD
{

const int PROGRAM_COUNT_SHIFT = 2;
const int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT;

template <typename N> inline N* aligned_new() { void* p = _mm_malloc(sizeof(N), 64); new (p) N; return static_cast<N*>(p); }
template <typename N> void aligned_delete(N* p) { if (p) { p->~N(); _mm_free(p); } }

CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 };
CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f };
CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 };

CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) =
{
    { UINT32_MAX, 0, 0, 0 },
    { 0, UINT32_MAX, 0, 0 },
    { 0, 0, UINT32_MAX, 0 },
    { 0, 0, 0, UINT32_MAX },
};

#if CPPSPMD_SSE41
CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(c))); }
#endif

CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask)
{
#if CPPSPMD_SSE2
    return _mm_castps_si128(_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(b)), _mm_andnot_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(a))));
#else
    return _mm_blendv_epi8(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask)
{
#if CPPSPMD_SSE2
    // We know it's a mask, so we can just emulate the blend.
    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
#else
    return _mm_blendv_ps(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if CPPSPMD_SSE2
    // Input is not a mask, but MSB bits - so emulate _mm_blendv_ps() by replicating bit 31.
    mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mask), 31));
    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
#else
    return _mm_blendv_ps(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask)
{
    return _mm_castps_si128(blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
}

CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask)
{
    return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
}
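
// Note on the blendv helpers: the "_mask" variants assume every 32-bit lane of 'mask' is already all-ones or
// all-zeros, so an and/andnot/or sequence is enough on SSE2. blendv_ps()/blendv_epi32() only require the sign
// (MSB) of each lane to be meaningful - matching the SSE4.1 _mm_blendv_ps semantics - so the SSE2 path first
// replicates bit 31 across the lane with an arithmetic shift.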

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); }
CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); }
CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); }
CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); }

// Returns float bits as int, to emulate _mm_extract_ps()
CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { float f = _mm_cvtss_f32(vec); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); return *(const int*)&f; }

// Returns floats
CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
#else
CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); }
CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); }
CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); }
CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); }

// Returns float bits as int
CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); }
CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_extract_ps(vec, 1); }
CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); }
CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); }

// Returns floats
CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { int v = extract_ps_x(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { int v = extract_ps_y(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { int v = extract_ps_z(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { int v = extract_ps_w(vec); return *(const float*)&v; }
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 0), (uint32_t)v >> 16U, 1); }
CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 2), (uint32_t)v >> 16U, 3); }
CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 4), (uint32_t)v >> 16U, 5); }
CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 6), (uint32_t)v >> 16U, 7); }
#else
CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); }
CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); }
CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); }
CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); }
#endif

#if CPPSPMD_SSE2
inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
{
    // Just emulate _mm_shuffle_epi8. This is very slow, but what else can we do?
    CPPSPMD_ALIGN(16) uint8_t av[16];
    _mm_store_si128((__m128i*)av, a);

    CPPSPMD_ALIGN(16) uint8_t bvi[16];
    _mm_store_ps((float*)bvi, _mm_and_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(_mm_set1_epi32(0x0F0F0F0F))));

    CPPSPMD_ALIGN(16) uint8_t result[16];

    result[0] = av[bvi[0]];
    result[1] = av[bvi[1]];
    result[2] = av[bvi[2]];
    result[3] = av[bvi[3]];

    result[4] = av[bvi[4]];
    result[5] = av[bvi[5]];
    result[6] = av[bvi[6]];
    result[7] = av[bvi[7]];

    result[8] = av[bvi[8]];
    result[9] = av[bvi[9]];
    result[10] = av[bvi[10]];
    result[11] = av[bvi[11]];

    result[12] = av[bvi[12]];
    result[13] = av[bvi[13]];
    result[14] = av[bvi[14]];
    result[15] = av[bvi[15]];

    return _mm_andnot_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), _mm_load_si128((__m128i*)result));
}
#else
CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
{
    return _mm_shuffle_epi8(a, b);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
{
    return blendv_mask_epi32(b, a, _mm_cmplt_epi32(a, b));
}
CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
{
    return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(a, b));
}
CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
{
    __m128i n = _mm_set1_epi32(0x80000000);
    __m128i ac = _mm_add_epi32(a, n);
    __m128i bc = _mm_add_epi32(b, n);
    return blendv_mask_epi32(b, a, _mm_cmplt_epi32(ac, bc));
}
CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
{
    __m128i n = _mm_set1_epi32(0x80000000);
    __m128i ac = _mm_add_epi32(a, n);
    __m128i bc = _mm_add_epi32(b, n);
    return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(ac, bc));
}
#else
CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
{
    return _mm_min_epi32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
{
    return _mm_max_epi32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
{
    return _mm_min_epu32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
{
    return _mm_max_epu32(a, b);
}
#endif
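
// The SSE2 fallbacks above emulate the SSE4.1 integer min/max: signed min/max are a compare-and-blend, while
// the unsigned variants first bias both operands by 0x80000000, which maps unsigned ordering onto signed
// ordering so _mm_cmplt_epi32/_mm_cmpgt_epi32 can be reused.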

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
{
    __m128i sign_mask = _mm_srai_epi32(a, 31);
    return _mm_sub_epi32(_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(sign_mask))), sign_mask);
}
#else
CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
{
    return _mm_abs_epi32(a);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
{
    __m128i tmp1 = _mm_mul_epu32(a, b);
    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
}
#else
CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
{
    return _mm_mullo_epi32(a, b);
}
#endif

CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b)
{
    __m128i tmp1 = _mm_mul_epu32(a, b);
    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 3, 1)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 3, 1)));
}
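
// The SSE2 abs_epi32() uses the classic (x ^ (x >> 31)) - (x >> 31) identity. mullo_epi32()/mulhi_epu32()
// build 32x32-bit products from _mm_mul_epu32, which only multiplies the even lanes into 64-bit results: the
// inputs are multiplied once as-is and once shifted right by 4 bytes to expose the odd lanes, then the low
// (mullo) or high (mulhi) 32-bit halves of the four products are shuffled back into lane order.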

#if CPPSPMD_SSE2
inline __m128i load_rgba32(const void* p)
{
    __m128i xmm = _mm_cvtsi32_si128(*(const int*)p);
    xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
    xmm = _mm_unpacklo_epi16(xmm, _mm_setzero_si128());
    return xmm;
}
#else
inline __m128i load_rgba32(const void* p)
{
    return _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((const float*)p)));
}
#endif
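
// load_rgba32() reads one packed RGBA8 pixel (4 bytes) and widens it to four 32-bit integer lanes - two
// zero-extending unpacks on SSE2, or a single _mm_cvtepu8_epi32 on SSE4.1.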

inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3)
{
    __m128i t0 = _mm_unpacklo_epi32(r0, r1);
    __m128i t1 = _mm_unpacklo_epi32(r2, r3);
    __m128i t2 = _mm_unpackhi_epi32(r0, r1);
    __m128i t3 = _mm_unpackhi_epi32(r2, r3);
    x = _mm_unpacklo_epi64(t0, t1);
    y = _mm_unpackhi_epi64(t0, t1);
    z = _mm_unpacklo_epi64(t2, t3);
    w = _mm_unpackhi_epi64(t2, t3);
}
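
// transpose4x4() is the standard two-pass SSE transpose: 32-bit unpacks interleave pairs of rows, then 64-bit
// unpacks recombine them, so x/y/z/w end up holding the columns of the 4x4 matrix formed by rows r0..r3.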

const uint32_t ALL_ON_MOVEMASK = 0xF;

struct spmd_kernel
{
    struct vint;
    struct lint;
    struct vbool;
    struct vfloat;

    typedef int int_t;
    typedef vint vint_t;
    typedef lint lint_t;

    // Exec mask
    struct exec_mask
    {
        __m128i m_mask;

        exec_mask() = default;

        CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b);
        CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { }

        CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane) { m_mask = _mm_load_si128((const __m128i *)&g_lane_masks_128[lane][0]); }

        static CPPSPMD_FORCE_INLINE exec_mask all_on() { return exec_mask{ _mm_load_si128((const __m128i*)g_allones_128) }; }
        static CPPSPMD_FORCE_INLINE exec_mask all_off() { return exec_mask{ _mm_setzero_si128() }; }

        CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(m_mask)); }
    };
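
    // The execution mask holds one all-ones/all-zeros 32-bit lane per SPMD lane (PROGRAM_COUNT == 4).
    // get_movemask() compresses it to a 4-bit scalar via _mm_movemask_ps, so ALL_ON_MOVEMASK (0xF) means all
    // lanes are active; the masked loads/stores and spmd_if/spmd_while helpers below are built on top of it.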

    friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e);
    friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e);

    CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); }
    CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); }
    CPPSPMD_FORCE_INLINE bool spmd_none() { return !any(m_exec); }

    // true if cond is true for all active lanes - false if no active lanes
    CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); }
    // true if cond is true for any active lanes
    CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) { return (exec_mask(e) & m_exec).get_movemask() != 0; }
    CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) { return !spmd_any(e); }

    friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b);
    friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b);
    friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b);

    exec_mask m_exec;
    exec_mask m_kernel_exec;
    exec_mask m_continue_mask;
#ifdef _DEBUG
    bool m_in_loop;
#endif

    CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); }

    void init(const exec_mask& kernel_exec);

    // Varying bool

    struct vbool
    {
        __m128i m_value;

        vbool() = default;

        CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(_mm_set1_epi32(value ? UINT32_MAX : 0)) { }

        CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { }

        CPPSPMD_FORCE_INLINE explicit operator vfloat() const;
        CPPSPMD_FORCE_INLINE explicit operator vint() const;

    private:
        //vbool& operator=(const vbool&);
    };

    friend vbool operator!(const vbool& v);

    CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src)
    {
        dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
        return dst;
    }

    CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src)
    {
        dst.m_value = src.m_value;
        return dst;
    }

    // Varying float
    struct vfloat
    {
        __m128 m_value;

        vfloat() = default;

        CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { }

        CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { }

        CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { }

    private:
        //vfloat& operator=(const vfloat&);
    };

    CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
    {
        dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
        return dst;
    }

    CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src)
    {
        dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
        return dst;
    }

    CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src)
    {
        dst.m_value = src.m_value;
        return dst;
    }

    CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src)
    {
        dst.m_value = src.m_value;
        return dst;
    }

    // Linear ref to floats
    struct float_lref
    {
        float* m_pValue;

    private:
        //float_lref& operator=(const float_lref&);
    };

    CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        if (mask == ALL_ON_MOVEMASK)
            _mm_storeu_ps(dst.m_pValue, src.m_value);
        else
            _mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
        return dst;
    }

    CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        if (mask == ALL_ON_MOVEMASK)
            _mm_storeu_ps(dst.m_pValue, src.m_value);
        else
            _mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
        return dst;
    }

    CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src)
    {
        _mm_storeu_ps(dst.m_pValue, src.m_value);
        return dst;
    }

    CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src)
    {
        _mm_storeu_ps(dst.m_pValue, src.m_value);
        return dst;
    }

    CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src)
    {
        return vfloat{ _mm_and_ps(_mm_loadu_ps(src.m_pValue), _mm_castsi128_ps(m_exec.m_mask)) };
    }

    // Varying ref to floats
    struct float_vref
    {
        __m128i m_vindex;
        float* m_pValue;

    private:
        //float_vref& operator=(const float_vref&);
    };

    // Varying ref to varying float
    struct vfloat_vref
    {
        __m128i m_vindex;
        vfloat* m_pValue;

    private:
        //vfloat_vref& operator=(const vfloat_vref&);
    };

    // Varying ref to varying int
    struct vint_vref
    {
        __m128i m_vindex;
        vint* m_pValue;

    private:
        //vint_vref& operator=(const vint_vref&);
    };

    CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
    CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src);

    CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src);
    CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src);

    CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src)
    {
        CPPSPMD_ALIGN(16) int vindex[4];
        _mm_store_si128((__m128i *)vindex, src.m_vindex);

        CPPSPMD_ALIGN(16) float loaded[4];

        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        for (int i = 0; i < 4; i++)
        {
            if (mask & (1 << i))
                loaded[i] = src.m_pValue[vindex[i]];
        }
        return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) };
    }

    CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src)
    {
        CPPSPMD_ALIGN(16) int vindex[4];
        _mm_store_si128((__m128i *)vindex, src.m_vindex);

        CPPSPMD_ALIGN(16) float loaded[4];

        for (int i = 0; i < 4; i++)
            loaded[i] = src.m_pValue[vindex[i]];
        return vfloat{ _mm_load_ps((const float*)loaded) };
    }
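
    // SSE has no gather/scatter instructions, so the *_vref loads and stores spill the per-lane indices to a
    // small aligned array and move one element at a time with scalar code, honoring the execution mask lane
    // by lane.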

    // Linear ref to ints
    struct int_lref
    {
        int* m_pValue;

    private:
        //int_lref& operator=(const int_lref&);
    };

    CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        if (mask == ALL_ON_MOVEMASK)
        {
            _mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value);
        }
        else
        {
            CPPSPMD_ALIGN(16) int stored[4];
            _mm_store_si128((__m128i *)stored, src.m_value);

            for (int i = 0; i < 4; i++)
            {
                if (mask & (1 << i))
                    dst.m_pValue[i] = stored[i];
            }
        }
        return dst;
    }

    CPPSPMD_FORCE_INLINE vint load(const int_lref& src)
    {
        __m128i v = _mm_loadu_si128((const __m128i*)src.m_pValue);

        v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));

        return vint{ v };
    }

    // Linear ref to int16's
    struct int16_lref
    {
        int16_t* m_pValue;

    private:
        //int16_lref& operator=(const int16_lref&);
    };

    CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
    {
        CPPSPMD_ALIGN(16) int stored[4];
        _mm_store_si128((__m128i *)stored, src.m_value);

        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        for (int i = 0; i < 4; i++)
        {
            if (mask & (1 << i))
                dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
        }
        return dst;
    }

    CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src)
    {
        CPPSPMD_ALIGN(16) int stored[4];
        _mm_store_si128((__m128i *)stored, src.m_value);

        for (int i = 0; i < 4; i++)
            dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
        return dst;
    }

    CPPSPMD_FORCE_INLINE vint load(const int16_lref& src)
    {
        CPPSPMD_ALIGN(16) int values[4];

        for (int i = 0; i < 4; i++)
            values[i] = static_cast<int16_t>(src.m_pValue[i]);

        __m128i t = _mm_load_si128( (const __m128i *)values );

        return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps( t ), _mm_castsi128_ps(m_exec.m_mask))) };
    }

    CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src)
    {
        CPPSPMD_ALIGN(16) int values[4];

        for (int i = 0; i < 4; i++)
            values[i] = static_cast<int16_t>(src.m_pValue[i]);

        __m128i t = _mm_load_si128( (const __m128i *)values );

        return vint{ t };
    }

    // Linear ref to constant ints
    struct cint_lref
    {
        const int* m_pValue;

    private:
        //cint_lref& operator=(const cint_lref&);
    };

    CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
    {
        __m128i v = _mm_loadu_si128((const __m128i *)src.m_pValue);
        v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
        return vint{ v };
    }

    CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src)
    {
        return vint{ _mm_loadu_si128((const __m128i *)src.m_pValue) };
    }

    // Varying ref to ints
    struct int_vref
    {
        __m128i m_vindex;
        int* m_pValue;

    private:
        //int_vref& operator=(const int_vref&);
    };

    // Varying ref to constant ints
    struct cint_vref
    {
        __m128i m_vindex;
        const int* m_pValue;

    private:
        //cint_vref& operator=(const cint_vref&);
    };

    // Varying int
    struct vint
    {
        __m128i m_value;

        vint() = default;

        CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value) { }

        CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { }

        CPPSPMD_FORCE_INLINE vint& operator=(const lint& other) { m_value = other.m_value; return *this; }

        CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { }

        CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32((int)value)) { }

        CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { }

        CPPSPMD_FORCE_INLINE explicit operator vbool() const
        {
            return vbool{ _mm_xor_si128( _mm_load_si128((const __m128i*)g_allones_128), _mm_cmpeq_epi32(m_value, _mm_setzero_si128())) };
        }

        CPPSPMD_FORCE_INLINE explicit operator vfloat() const
        {
            return vfloat{ _mm_cvtepi32_ps(m_value) };
        }

        CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const
        {
            return int_vref{ m_value, ptr };
        }

        CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const
        {
            return cint_vref{ m_value, ptr };
        }

        CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const
        {
            return float_vref{ m_value, ptr };
        }

        CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const
        {
            return vfloat_vref{ m_value, ptr };
        }

        CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const
        {
            return vint_vref{ m_value, ptr };
        }

    private:
        //vint& operator=(const vint&);
    };
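
    // Indexing a varying int with a pointer (v[ptr]) doesn't dereference anything by itself - it just packages
    // the base pointer together with the per-lane indices into a *_vref object, and the actual gather or
    // scatter happens later in the matching load()/store() overload.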

    // Load/store linear int
    CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        if (mask == ALL_ON_MOVEMASK)
            _mm_storeu_si128((__m128i *)pDst, src.m_value);
        else
        {
            if (mask & 1) pDst[0] = extract_x(src.m_value);
            if (mask & 2) pDst[1] = extract_y(src.m_value);
            if (mask & 4) pDst[2] = extract_z(src.m_value);
            if (mask & 8) pDst[3] = extract_w(src.m_value);
        }
    }

    CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src)
    {
        _mm_storeu_si128((__m128i*)pDst, src.m_value);
    }

    CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src)
    {
        _mm_store_si128((__m128i*)pDst, src.m_value);
    }

    CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc)
    {
        __m128i v = _mm_loadu_si128((const __m128i*)pSrc);

        v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));

        return vint{ v };
    }

    CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc)
    {
        return vint{ _mm_loadu_si128((__m128i*)pSrc) };
    }

    CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc)
    {
        return vint{ _mm_load_si128((__m128i*)pSrc) };
    }

    // Load/store linear float
    CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        if (mask == ALL_ON_MOVEMASK)
            _mm_storeu_ps((float*)pDst, src.m_value);
        else
        {
            int *pDstI = (int *)pDst;
            if (mask & 1) pDstI[0] = extract_ps_x(src.m_value);
            if (mask & 2) pDstI[1] = extract_ps_y(src.m_value);
            if (mask & 4) pDstI[2] = extract_ps_z(src.m_value);
            if (mask & 8) pDstI[3] = extract_ps_w(src.m_value);
        }
    }

    CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src)
    {
        _mm_storeu_ps((float*)pDst, src.m_value);
    }

    CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src)
    {
        _mm_store_ps((float*)pDst, src.m_value);
    }

    CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc)
    {
        __m128 v = _mm_loadu_ps((const float*)pSrc);

        v = _mm_and_ps(v, _mm_castsi128_ps(m_exec.m_mask));

        return vfloat{ v };
    }

    CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc)
    {
        return vfloat{ _mm_loadu_ps((float*)pSrc) };
    }

    CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc)
    {
        return vfloat{ _mm_load_ps((float*)pSrc) };
    }

    CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src)
    {
        dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
        return dst;
    }

    CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src)
    {
        CPPSPMD_ALIGN(16) int vindex[4];
        _mm_store_si128((__m128i*)vindex, dst.m_vindex);

        CPPSPMD_ALIGN(16) int stored[4];
        _mm_store_si128((__m128i*)stored, src.m_value);

        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        for (int i = 0; i < 4; i++)
        {
            if (mask & (1 << i))
                dst.m_pValue[vindex[i]] = stored[i];
        }
        return dst;
    }

    CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src)
    {
        dst.m_value = src.m_value;
        return dst;
    }

    CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src)
    {
        CPPSPMD_ALIGN(16) int vindex[4];
        _mm_store_si128((__m128i*)vindex, dst.m_vindex);

        CPPSPMD_ALIGN(16) int stored[4];
        _mm_store_si128((__m128i*)stored, src.m_value);

        for (int i = 0; i < 4; i++)
            dst.m_pValue[vindex[i]] = stored[i];

        return dst;
    }

    CPPSPMD_FORCE_INLINE vint load(const int_vref& src)
    {
        CPPSPMD_ALIGN(16) int values[4];

        CPPSPMD_ALIGN(16) int indices[4];
        _mm_store_si128((__m128i *)indices, src.m_vindex);

        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        for (int i = 0; i < 4; i++)
        {
            if (mask & (1 << i))
                values[i] = src.m_pValue[indices[i]];
        }

        return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
    }

    CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src)
    {
        CPPSPMD_ALIGN(16) int values[4];

        CPPSPMD_ALIGN(16) int indices[4];
        _mm_store_si128((__m128i *)indices, src.m_vindex);

        for (int i = 0; i < 4; i++)
            values[i] = src.m_pValue[indices[i]];

        return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
    }

    CPPSPMD_FORCE_INLINE vint load(const cint_vref& src)
    {
        CPPSPMD_ALIGN(16) int values[4];

        CPPSPMD_ALIGN(16) int indices[4];
        _mm_store_si128((__m128i *)indices, src.m_vindex);

        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
        for (int i = 0; i < 4; i++)
        {
            if (mask & (1 << i))
                values[i] = src.m_pValue[indices[i]];
        }

        return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
    }

    CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src)
    {
        CPPSPMD_ALIGN(16) int values[4];

        CPPSPMD_ALIGN(16) int indices[4];
        _mm_store_si128((__m128i *)indices, src.m_vindex);

        for (int i = 0; i < 4; i++)
            values[i] = src.m_pValue[indices[i]];

        return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
    }

    CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src)
    {
        __m128i v0_l;

        const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
        v0_l = insert_x(_mm_undefined_si128(), ((int*)(pSrc + extract_x(src.m_vindex)))[0]);
        v0_l = insert_y(v0_l, ((int*)(pSrc + extract_y(src.m_vindex)))[0]);
        v0_l = insert_z(v0_l, ((int*)(pSrc + extract_z(src.m_vindex)))[0]);
        v0_l = insert_w(v0_l, ((int*)(pSrc + extract_w(src.m_vindex)))[0]);

        return vint{ v0_l };
    }

    CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src)
    {
        __m128i v0_l;

        const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
        v0_l = insert_x(_mm_undefined_si128(), ((int16_t*)(pSrc + 2 * extract_x(src.m_vindex)))[0]);
        v0_l = insert_y(v0_l, ((int16_t*)(pSrc + 2 * extract_y(src.m_vindex)))[0]);
        v0_l = insert_z(v0_l, ((int16_t*)(pSrc + 2 * extract_z(src.m_vindex)))[0]);
        v0_l = insert_w(v0_l, ((int16_t*)(pSrc + 2 * extract_w(src.m_vindex)))[0]);

        return vint{ v0_l };
    }

    CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

        if (mask & 1) pDst[0] = extract_x(v.m_value);
        if (mask & 2) pDst[stride] = extract_y(v.m_value);
        if (mask & 4) pDst[stride*2] = extract_z(v.m_value);
        if (mask & 8) pDst[stride*3] = extract_w(v.m_value);
    }

    CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

        if (mask & 1) ((int *)pDstF)[0] = extract_ps_x(v.m_value);
        if (mask & 2) ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
        if (mask & 4) ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
        if (mask & 8) ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
    }

    CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v)
    {
        pDst[0] = extract_x(v.m_value);
        pDst[stride] = extract_y(v.m_value);
        pDst[stride*2] = extract_z(v.m_value);
        pDst[stride*3] = extract_w(v.m_value);
    }

    CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v)
    {
        ((int *)pDstF)[0] = extract_ps_x(v.m_value);
        ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
        ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
        ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
    }

    CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

#if CPPSPMD_SSE2
        CPPSPMD_ALIGN(16) int vals[4] = { 0, 0, 0, 0 };
        if (mask & 1) vals[0] = pSrc[0];
        if (mask & 2) vals[1] = pSrc[stride];
        if (mask & 4) vals[2] = pSrc[stride * 2];
        if (mask & 8) vals[3] = pSrc[stride * 3];
        return vint{ _mm_load_si128((__m128i*)vals) };
#else
        const float* pSrcF = (const float*)pSrc;
        __m128 v = _mm_setzero_ps();
        if (mask & 1) v = _mm_load_ss(pSrcF);
        if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
        if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
        if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
        return vint{ _mm_castps_si128(v) };
#endif
    }

    CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride)
    {
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

#if CPPSPMD_SSE2
        CPPSPMD_ALIGN(16) float vals[4] = { 0, 0, 0, 0 };
        if (mask & 1) vals[0] = pSrc[0];
        if (mask & 2) vals[1] = pSrc[stride];
        if (mask & 4) vals[2] = pSrc[stride * 2];
        if (mask & 8) vals[3] = pSrc[stride * 3];
        return vfloat{ _mm_load_ps(vals) };
#else
        __m128 v = _mm_setzero_ps();
        if (mask & 1) v = _mm_load_ss(pSrc);
        if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
        if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
        if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
        return vfloat{ v };
#endif
    }

    CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride)
    {
#if CPPSPMD_SSE2
        CPPSPMD_ALIGN(16) int vals[4];
        vals[0] = pSrc[0];
        vals[1] = pSrc[stride];
        vals[2] = pSrc[stride * 2];
        vals[3] = pSrc[stride * 3];
        return vint{ _mm_load_si128((__m128i*)vals) };
#else
        const float* pSrcF = (const float*)pSrc;
        __m128 v = _mm_load_ss(pSrcF);
        v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
        v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
        v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
        return vint{ _mm_castps_si128(v) };
#endif
    }

    CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride)
    {
#if CPPSPMD_SSE2
        CPPSPMD_ALIGN(16) float vals[4];
        vals[0] = pSrc[0];
        vals[1] = pSrc[stride];
        vals[2] = pSrc[stride * 2];
        vals[3] = pSrc[stride * 3];
        return vfloat{ _mm_load_ps(vals) };
#else
        __m128 v = _mm_load_ss(pSrc);
        v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
        v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
        v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
        return vfloat{ v };
#endif
    }

    CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src)
    {
        // TODO: There's surely a better way
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

        if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(_mm_castps_si128(src.m_value));
        if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(_mm_castps_si128(src.m_value));
        if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(_mm_castps_si128(src.m_value));
        if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(_mm_castps_si128(src.m_value));

        return dst;
    }

    CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src)
    {
        // TODO: There's surely a better way
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

        __m128i k = _mm_setzero_si128();

        if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
        if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
        if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
        if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

        return vfloat{ _mm_castsi128_ps(k) };
    }

    CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src)
    {
        // TODO: There's surely a better way
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

        if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value);
        if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value);
        if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value);
        if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value);

        return dst;
    }

    CPPSPMD_FORCE_INLINE vint load(const vint_vref& src)
    {
        // TODO: There's surely a better way
        int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

        __m128i k = _mm_setzero_si128();

        if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
        if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
        if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
        if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

        return vint{ k };
    }

    CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
    {
        // TODO: There's surely a better way
        __m128i k = _mm_setzero_si128();

        k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
        k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
        k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
        k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

        return vint{ k };
    }

    // Linear integer
    struct lint
    {
        __m128i m_value;

        CPPSPMD_FORCE_INLINE explicit lint(__m128i value)
            : m_value(value)
        { }

        CPPSPMD_FORCE_INLINE explicit operator vfloat() const
        {
            return vfloat{ _mm_cvtepi32_ps(m_value) };
        }

        CPPSPMD_FORCE_INLINE explicit operator vint() const
        {
            return vint{ m_value };
        }

        CPPSPMD_FORCE_INLINE int get_first_value() const
        {
            return _mm_cvtsi128_si32(m_value);
        }

        CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const
        {
            return float_lref{ ptr + get_first_value() };
        }

        CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const
        {
            return int_lref{ ptr + get_first_value() };
        }

        CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const
        {
            return int16_lref{ ptr + get_first_value() };
        }

        CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const
        {
            return cint_lref{ ptr + get_first_value() };
        }

    private:
        //lint& operator=(const lint&);
    };

    CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
    {
        dst.m_value = src.m_value;
        return dst;
    }

    const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) };
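
    // program_index plays the same role as ISPC's programIndex: lane i holds the value i, so expressions like
    // program_index[ptr] (or an int offset plus program_index) address PROGRAM_COUNT consecutive elements, one
    // per lane, through the linear-ref types above.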

    // SPMD condition helpers

    template<typename IfBody>
    CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody);

    CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond);

    // No breaks, continues, etc. allowed
    template<typename IfBody>
    CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody);

    // No breaks, continues, etc. allowed
    template<typename IfBody, typename ElseBody>
    CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody);

    template<typename IfBody, typename ElseBody>
    CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody);

    template<typename WhileCondBody, typename WhileBody>
    CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody);

    template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
    CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody);

    template<typename ForeachBody>
    CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody);

#ifdef _DEBUG
    CPPSPMD_FORCE_INLINE void check_masks();
#else
    CPPSPMD_FORCE_INLINE void check_masks() { }
#endif

    CPPSPMD_FORCE_INLINE void spmd_break();
    CPPSPMD_FORCE_INLINE void spmd_continue();

    CPPSPMD_FORCE_INLINE void spmd_return();

    template<typename UnmaskedBody>
    CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody);

    template<typename SPMDKernel, typename... Args>
    //CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args);
    CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args);

    CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b) { vint temp = a; store(a, b); store(b, temp); }
    CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b) { vfloat temp = a; store(a, b); store(b, temp); }
    CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b) { vbool temp = a; store(a, b); store(b, temp); }

    CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
    {
        __m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
        __m128 temp = _mm_add_ps(_mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
        return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(temp, temp), temp));
    }

    CPPSPMD_FORCE_INLINE int reduce_add(vint v)
    {
        __m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
        __m128i temp = _mm_add_epi32(_mm_shuffle_epi32(k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
        return extract_x(_mm_add_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(temp), _mm_castsi128_ps(temp))), temp));
    }
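
    // reduce_add() first zeroes the inactive lanes using the exec mask, then does a horizontal sum: add the
    // vector to its lane-reversed shuffle, then add the upper half (movehl) onto the lower half, so only the
    // active lanes contribute to the scalar result.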

#include "cppspmd_math_declares.h"

}; // struct spmd_kernel

using exec_mask = spmd_kernel::exec_mask;
using vint = spmd_kernel::vint;
using int_lref = spmd_kernel::int_lref;
using cint_vref = spmd_kernel::cint_vref;
using cint_lref = spmd_kernel::cint_lref;
using int_vref = spmd_kernel::int_vref;
using lint = spmd_kernel::lint;
using vbool = spmd_kernel::vbool;
using vfloat = spmd_kernel::vfloat;
using float_lref = spmd_kernel::float_lref;
using float_vref = spmd_kernel::float_vref;
using vfloat_vref = spmd_kernel::vfloat_vref;
using vint_vref = spmd_kernel::vint_vref;

CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const
{
    return vfloat { _mm_and_ps( _mm_castsi128_ps(m_value), *(const __m128 *)g_onef_128 ) };
}

// Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?)
CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const
{
    return vint { m_value };
}

CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v)
{
    return vbool{ _mm_castps_si128(_mm_xor_ps(_mm_load_ps((const float*)g_allones_128), _mm_castsi128_ps(v.m_value))) };
}

CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; }

CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; }

CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; }
CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; }

// Bad pattern - doesn't factor in the current exec mask. Prefer spmd_any() instead.
CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; }
CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; }

CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) { return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; }
CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; }

CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; }
CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; }

CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) { return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; }
CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; }

CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); }

CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; }
CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; }

CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) { return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; }

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a)
{
    __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) );
    __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

    __m128i ai = _mm_cvttps_epi32(a.m_value);

    __m128 af = _mm_cvtepi32_ps(ai);
    return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a)
{
    __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
    __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

    __m128i ai = _mm_cvtps_epi32(a.m_value);
    __m128 af = _mm_cvtepi32_ps(ai);
    __m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(af, a.m_value)));

    af = _mm_add_ps(af, changed);

    return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a)
{
    __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
    __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

    __m128i ai = _mm_cvtps_epi32(a.m_value);
    __m128 af = _mm_cvtepi32_ps(ai);
    __m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(af, a.m_value)));

    af = _mm_sub_ps(af, changed);

    return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

// We need to disable unsafe math optimizations for the key operations used for rounding to nearest.
// I wish there was a better way.
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optimize("-fno-unsafe-math-optimizations")))
#elif defined(__clang__)
inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optnone))
#elif defined (_MSC_VER)
#pragma float_control(push)
#pragma float_control(precise, on)
inline __m128 add_sub(__m128 a, __m128 b)
#else
inline __m128 add_sub(__m128 a, __m128 b)
#endif
{
    return _mm_sub_ps(_mm_add_ps(a, b), b);
}

#if defined (_MSC_VER)
#pragma float_control(pop)
#endif

CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a)
{
    __m128i no_fract_fp_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f));

    __m128i sign_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U));
    __m128 force_int = _mm_castsi128_ps(_mm_or_si128(no_fract_fp_bits, sign_a));

    // Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers.
    //__m128 temp1 = _mm_add_ps(a.m_value, force_int);
    //__m128 temp2 = _mm_sub_ps(temp1, force_int);
    __m128 temp2 = add_sub(a.m_value, force_int);

    __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
    __m128i has_fractional = _mm_cmplt_epi32(abs_a, no_fract_fp_bits);
    return vfloat{ blendv_mask_ps(a.m_value, temp2, _mm_castsi128_ps(has_fractional)) };
}
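
// For the SSE2 rounding paths above: 8388608.0f is 2^23, the smallest magnitude at which every single-precision
// value is already an integer, so lanes with |a| >= 2^23 (tested by comparing the float bit patterns as
// integers) are passed through unchanged. round_nearest() rounds the remaining lanes by adding and subtracting
// a sign-matched 2^23 through add_sub(), which must not be re-associated away by the compiler - hence the
// per-compiler precise-math attributes/pragmas around it.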
1516
1517
#else
1518
CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; }
1519
CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; }
1520
CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; }
1521
CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; }
1522
#endif
1523
1524
CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a) { return a - floor(a); }
1525
CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b) { vfloat c = frac(abs(a / b)) * abs(b); return spmd_ternaryf(a < 0, -c, c); }
1526
CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, -1.0f, 1.0f); }
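// Note that fmod() above follows C's sign convention: the result takes the sign of a. A worked example (exact in
// binary floating point): fmod(-7.5f, 2.0f) computes frac(abs(-3.75)) * 2.0 == 0.75 * 2.0 == 1.5, negated because a < 0, giving -1.5.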
1527
1528
CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; }
1529
CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) { return vint{ min_epi32(a.m_value, b.m_value) }; }
1530
1531
CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; }
1532
CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; }
1533
1534
CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; }
1535
1536
CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) { return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)) }; }
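// byteswap() reverses the bytes within each 32-bit lane (the usual endian swap); e.g. a lane holding 0x11223344
// becomes 0x44332211, since the shuffle_epi8 control reads bytes 3,2,1,0 of each dword in turn.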
1537
1538
CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; }
1539
CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; }
1540
1541
CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b)
1542
{
1543
return vfloat{ _mm_min_ps(b.m_value, _mm_max_ps(v.m_value, a.m_value) ) };
1544
}
1545
1546
CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b)
1547
{
1548
return vint{ min_epi32(b.m_value, max_epi32(v.m_value, a.m_value) ) };
1549
}
1550
1551
CPPSPMD_FORCE_INLINE vfloat vfma(const vfloat& a, const vfloat& b, const vfloat& c)
1552
{
1553
return vfloat{ _mm_add_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
1554
}
1555
1556
CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c)
1557
{
1558
return vfloat{ _mm_sub_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
1559
}
1560
1561
CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c)
1562
{
1563
return vfloat{ _mm_sub_ps(c.m_value, _mm_mul_ps(a.m_value, b.m_value)) };
1564
}
1565
1566
CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c)
1567
{
1568
return vfloat{ _mm_sub_ps(_mm_sub_ps(_mm_xor_ps(a.m_value, a.m_value), _mm_mul_ps(a.m_value, b.m_value)), c.m_value) };
1569
}
1570
1571
CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); }
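// lerp() expands to x + s * (y - x) via vfma, so lerp(x, y, 0.0f) == x and lerp(x, y, 1.0f) == y up to rounding
// (the SSE path uses a separate multiply and add rather than a true fused FMA).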
1572
1573
CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
1574
CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
1575
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
1576
CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
1577
CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
1578
CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }
1579
1580
CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
1581
CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
1582
CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
1583
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
1584
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
1585
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
1586
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
1587
CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
1588
CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
1589
CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
1590
CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
1591
CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
1592
CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
1593
CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; }
1594
CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; }
1595
CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); }
1596
CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); }
1597
CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; }
1598
CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; }
1599
CPPSPMD_FORCE_INLINE vint operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; }
1600
CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); }
1601
CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; }
1602
1603
CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; }
1604
1605
CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; }
1606
1607
CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; }
1608
1609
// A few of these break the lane-based abstraction model. They map directly to SSE2 instructions, so it makes sense to expose them and let the user figure it out.
1610
CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) { return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; }
1611
CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; }
1612
CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; }
1613
CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; }
1614
CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; }
1615
CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; }
1616
1617
CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint{ _mm_add_epi8(a.m_value, b.m_value) }; }
1618
CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint{ _mm_adds_epi8(a.m_value, b.m_value) }; }
1619
CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint{ _mm_sub_epi8(a.m_value, b.m_value) }; }
1620
CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint{ _mm_subs_epi8(a.m_value, b.m_value) }; }
1621
CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(a.m_value, b.m_value) }; }
1622
CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi8(a.m_value, b.m_value) }; }
1623
CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi8(a.m_value, b.m_value) }; }
1624
CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint{ _mm_unpacklo_epi8(a.m_value, b.m_value) }; }
1625
CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint{ _mm_unpackhi_epi8(a.m_value, b.m_value) }; }
1626
CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { return _mm_movemask_epi8(a.m_value); }
1627
CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { return _mm_movemask_ps(_mm_castsi128_ps(a.m_value)); }
1628
1629
CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(_mm_min_epu8(a.m_value, b.m_value), a.m_value) }; }
1630
CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return vint{ cmple_epu8(b, a) }; }
1631
CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(_mm_cmpeq_epi8(a.m_value, b.m_value), _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value)) }; }
1632
CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return vint{ cmpgt_epu8(b, a) }; }
1633
CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b) { return vint{ _mm_or_si128(_mm_subs_epu8(a.m_value, b.m_value), _mm_subs_epu8(b.m_value, a.m_value)) }; }
1634
1635
CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi8(a.m_value, b.m_value, _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128())) }; }
1636
CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi32(a.m_value, b.m_value, mask.m_value) }; }
1637
1638
CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint{ _mm_add_epi16(a.m_value, b.m_value) }; }
1639
CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint{ _mm_adds_epi16(a.m_value, b.m_value) }; }
1640
CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint{ _mm_adds_epu16(a.m_value, b.m_value) }; }
1641
CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint{ _mm_avg_epu16(a.m_value, b.m_value) }; }
1642
CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint{ _mm_sub_epi16(a.m_value, b.m_value) }; }
1643
CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint{ _mm_subs_epi16(a.m_value, b.m_value) }; }
1644
CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint{ _mm_subs_epu16(a.m_value, b.m_value) }; }
1645
CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint{ _mm_mullo_epi16(a.m_value, b.m_value) }; }
1646
CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epi16(a.m_value, b.m_value) }; }
1647
CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epu16(a.m_value, b.m_value) }; }
1648
CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint{ _mm_min_epi16(a.m_value, b.m_value) }; }
1649
CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint{ _mm_max_epi16(a.m_value, b.m_value) }; }
1650
CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint{ _mm_madd_epi16(a.m_value, b.m_value) }; }
1651
CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi16(a.m_value, b.m_value) }; }
1652
CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi16(a.m_value, b.m_value) }; }
1653
CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi16(a.m_value, b.m_value) }; }
1654
CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint{ _mm_packs_epi16(a.m_value, b.m_value) }; }
1655
CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint{ _mm_packus_epi16(a.m_value, b.m_value) }; }
1656
1657
CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint{ _mm_sll_epi16(a.m_value, b.m_value) }; }
1658
CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_sra_epi16(a.m_value, b.m_value) }; }
1659
CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_srl_epi16(a.m_value, b.m_value) }; }
1660
1661
#define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, b))
1662
#define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, b))
1663
#define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, b))
1664
1665
CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
1666
CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }
1667
1668
CPPSPMD_FORCE_INLINE vint zero_vint() { return vint{ _mm_setzero_si128() }; }
1669
CPPSPMD_FORCE_INLINE vfloat zero_vfloat() { return vfloat{ _mm_setzero_ps() }; }
1670
1671
CPPSPMD_FORCE_INLINE vint vint_lane_set(int v0, int v1, int v2, int v3) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
1672
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set(float v0, float v1, float v2, float v3) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
1673
CPPSPMD_FORCE_INLINE vint vint_lane_set_r(int v3, int v2, int v1, int v0) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
1674
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set_r(float v3, float v2, float v1, float v0) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
1675
// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
1676
#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
1677
#define VFLOAT_LANE_SHUFFLE_PS(a, b, control) vfloat(_mm_shuffle_ps((a).m_value, (b).m_value, control))
1678
1679
// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane.
1680
#define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control))
1681
#define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control))
1682
1683
#define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
1684
#define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
1685
1686
#define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l))
1687
#define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l))
1688
1689
// Unpack and interleave 8-bit integers from the low or high half of a and b
1690
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); }
1691
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); }
1692
1693
// Unpack and interleave 16-bit integers from the low or high half of a and b
1694
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi16(a.m_value, b.m_value)); }
1695
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi16(a.m_value, b.m_value)); }
1696
1697
// Unpack and interleave 32-bit integers from the low or high half of a and b
1698
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi32(a.m_value, b.m_value)); }
1699
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi32(a.m_value, b.m_value)); }
1700
1701
// Unpack and interleave 64-bit integers from the low or high half of a and b
1702
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi64(a.m_value, b.m_value)); }
1703
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi64(a.m_value, b.m_value)); }
1704
1705
CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { return vint(_mm_set1_epi8(a)); }
1706
CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { return vint(_mm_set1_epi16(a)); }
1707
CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { return vint(_mm_set1_epi32(a)); }
1708
CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { return vint(_mm_set1_epi64x(a)); }
1709
1710
CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { return vint(_mm_mul_epu32(a.m_value, b.m_value)); }
1711
1712
CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b)
1713
{
1714
__m128d al = _mm_cvtepi32_pd(a.m_value);
1715
__m128d ah = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value));
1716
1717
__m128d bl = _mm_cvtepi32_pd(b.m_value);
1718
__m128d bh = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value));
1719
1720
__m128d rl = _mm_div_pd(al, bl);
1721
__m128d rh = _mm_div_pd(ah, bh);
1722
1723
__m128i rli = _mm_cvttpd_epi32(rl);
1724
__m128i rhi = _mm_cvttpd_epi32(rh);
1725
1726
return vint(_mm_unpacklo_epi64(rli, rhi));
1727
}
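// div_epi32 divides in double precision: every int32 converts to a double exactly, and the rounded quotient is
// always close enough to the true ratio that _mm_cvttpd_epi32's truncation toward zero matches C's signed
// integer division. Illustrative usage (not part of the library):
//
//   vint q = div_epi32(vint(-7), vint(2));   // every lane becomes -3, same as -7 / 2 in C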
1728
1729
CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b)
1730
{
1731
vint aa = abs(a), ab = abs(b);
1732
vint q = div_epi32(aa, ab);
1733
vint r = aa - q * ab;
1734
return spmd_ternaryi(a < 0, -r, r);
1735
}
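// mod_epi32 matches C's % for negative operands: the remainder takes the sign of a. For example,
// mod_epi32(vint(-7), vint(3)) yields -1 in every lane, since -7 == 3 * -2 + -1.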
1736
1737
CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b)
1738
{
1739
return div_epi32(a, b);
1740
}
1741
1742
CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, int b)
1743
{
1744
return div_epi32(a, vint(b));
1745
}
1746
1747
CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b)
1748
{
1749
return mod_epi32(a, b);
1750
}
1751
1752
CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b)
1753
{
1754
return mod_epi32(a, vint(b));
1755
}
1756
1757
CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b)
1758
{
1759
#if 0
1760
CPPSPMD_ALIGN(32) int result[4];
1761
result[0] = extract_x(a.m_value) << extract_x(b.m_value);
1762
result[1] = extract_y(a.m_value) << extract_y(b.m_value);
1763
result[2] = extract_z(a.m_value) << extract_z(b.m_value);
1764
result[3] = extract_w(a.m_value) << extract_w(b.m_value);
1765
1766
return vint{ _mm_load_si128((__m128i*)result) };
1767
#elif 0
1768
int x = extract_x(a.m_value) << extract_x(b.m_value);
1769
int y = extract_y(a.m_value) << extract_y(b.m_value);
1770
int z = extract_z(a.m_value) << extract_z(b.m_value);
1771
int w = extract_w(a.m_value) << extract_w(b.m_value);
1772
1773
__m128i v = insert_x(_mm_undefined_si128(), x);
1774
v = insert_y(v, y);
1775
v = insert_z(v, z);
1776
return vint{ insert_w(v, w) };
1777
#else
1778
// What this does: shift each b lane left by 23 bits (moving the shift amount into the FP exponent position), add that to the integer rep of 1.0f, cast the result to float, then convert back to int - a fast per-lane 2^b.
1779
return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))));
1780
#endif
1781
}
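// A worked example of the 2^b trick above (illustrative only): for b == 3, (3 << 23) + 0x3F800000 == 0x41000000,
// the bit pattern of 8.0f; converting 8.0f back to int gives 8, and a * 8 == a << 3. For instance:
//
//   vint r = vint(5) << vint_lane_set(0, 1, 2, 3);   // lanes: 5, 10, 20, 40
//
// This assumes shift counts small enough that 2^b converts to int32 without saturating.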
1782
1783
// uniform shift left
1784
CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b)
1785
{
1786
__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
1787
return vint{ _mm_sll_epi32(a.m_value, bv) };
1788
}
1789
1790
// uniform arithmetic shift right
1791
CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b)
1792
{
1793
__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
1794
return vint{ _mm_sra_epi32(a.m_value, bv) };
1795
}
1796
1797
// uniform shift right
1798
CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b)
1799
{
1800
__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
1801
return vint{ _mm_srl_epi32(a.m_value, bv) };
1802
}
1803
1804
CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b)
1805
{
1806
#if 0
1807
CPPSPMD_ALIGN(32) int result[4];
1808
result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value);
1809
result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value);
1810
result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value);
1811
result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value);
1812
1813
return vint{ _mm_load_si128((__m128i*)result) };
1814
#elif 0
1815
uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value));
1816
uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value));
1817
uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value));
1818
uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value));
1819
1820
__m128i v = insert_x(_mm_undefined_si128(), x);
1821
v = insert_y(v, y);
1822
v = insert_z(v, z);
1823
return vint{ insert_w(v, w) };
1824
#else
1825
//vint inv_shift = 32 - b;
1826
//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
1827
1828
// Build the per-lane scale factor 2^(32-shift): start from the bits of 2^32 (0x4f800000, i.e. 0x3f800000 + (32<<23)), subtract (shift<<23), then cast the bits to float.
1829
vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
1830
1831
// Now convert scale factor to integer.
1832
vint r = vint(f);
1833
1834
// mulhi_epu32 (using two _mm_mul_epu32) keeps the high 32 bits of a * 2^(32-shift), which emulates a varying logical shift right.
1835
vint q(mulhi_epu32(a.m_value, r.m_value));
1836
1837
// Handle shift amounts of 0.
1838
return spmd_ternaryi(b > 0, q, a);
1839
#endif
1840
}
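// A worked example of the scale-and-mulhi trick above: for a shift of 4 the scale factor is 2^(32-4) = 2^28, and
// mulhi_epu32(a, 2^28) == (a * 2^28) >> 32 == a >> 4 for unsigned a, e.g. 0x80000000 becomes 0x08000000.
// A shift of 0 would need a scale of 2^32, which doesn't fit in 32 bits, hence the spmd_ternaryi(b > 0, q, a) fix-up.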
1841
1842
CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b)
1843
{
1844
//vint inv_shift = 32 - b;
1845
//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
1846
1847
// Build the per-lane scale factor 2^(32-shift): start from the bits of 2^32 (0x4f800000, i.e. 0x3f800000 + (32<<23)), subtract (shift<<23), then cast the bits to float.
1848
vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
1849
1850
// Now convert scale factor to integer.
1851
vint r = vint(f);
1852
1853
// mulhi_epu32 (using two _mm_mul_epu32) keeps the high 32 bits of a * 2^(32-shift), which emulates a varying logical shift right.
1854
return vint(mulhi_epu32(a.m_value, r.m_value));
1855
}
1856
1857
CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b)
1858
{
1859
#if 0
1860
CPPSPMD_ALIGN(32) int result[4];
1861
result[0] = extract_x(a.m_value) >> extract_x(b.m_value);
1862
result[1] = extract_y(a.m_value) >> extract_y(b.m_value);
1863
result[2] = extract_z(a.m_value) >> extract_z(b.m_value);
1864
result[3] = extract_w(a.m_value) >> extract_w(b.m_value);
1865
1866
return vint{ _mm_load_si128((__m128i*)result) };
1867
#elif 0
1868
int x = extract_x(a.m_value) >> extract_x(b.m_value);
1869
int y = extract_y(a.m_value) >> extract_y(b.m_value);
1870
int z = extract_z(a.m_value) >> extract_z(b.m_value);
1871
int w = extract_w(a.m_value) >> extract_w(b.m_value);
1872
1873
__m128i v = insert_x(_mm_undefined_si128(), x);
1874
v = insert_y(v, y);
1875
v = insert_z(v, z);
1876
return vint{ insert_w(v, w) };
1877
#else
1878
vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128()));
1879
vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask;
1880
return a_shifted;
1881
#endif
1882
}
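// The signed varying shift above relies on the identity a >> b == ~((~a) >> b) for negative a (xor with the
// all-ones sign mask is a bitwise NOT), so the logical vuint_shift_right can implement the arithmetic shift.
// For non-negative a the sign mask is zero and both xors are no-ops.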
1883
1884
#undef VINT_SHIFT_LEFT
1885
#undef VINT_SHIFT_RIGHT
1886
#undef VUINT_SHIFT_RIGHT
1887
1888
// Shift left/right by a uniform immediate constant
1889
#define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) )
1890
#define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) )
1891
#define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) )
1892
#define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k)))
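// VINT_ROT is a per-lane rotate left by an immediate k; e.g. VINT_ROT(x, 8) turns a lane value of 0x12345678
// into 0x34567812. k == 0 also works, because _mm_srli_epi32 with a count of 32 yields zero rather than masking the count.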
1893
1894
CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
1895
CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { return vint(a) == vint(b); }
1896
CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { return vint(a) == vint(b); }
1897
CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
1898
CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
1899
CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
1900
CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
1901
1902
CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) float values[4]; _mm_store_ps(values, v.m_value); return values[instance]; }
1903
CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
1904
CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
1905
CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance] != 0; }
1906
1907
#undef VINT_EXTRACT
1908
#undef VBOOL_EXTRACT
1909
#undef VFLOAT_EXTRACT
1910
1911
#if CPPSPMD_SSE2
1912
// Pass in an immediate constant and the compiler will optimize these expressions.
1913
#define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
1914
#define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
1915
#define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) )
1916
#else
1917
CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return *(const float*)&v; }
1918
1919
#define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
1920
#define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
1921
#define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance))
1922
#endif
1923
1924
CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f)
1925
{
1926
assert(instance < 4);
1927
CPPSPMD_ALIGN(16) float values[4];
1928
_mm_store_ps(values, v.m_value);
1929
values[instance] = f;
1930
v.m_value = _mm_load_ps(values);
1931
return v;
1932
}
1933
1934
CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i)
1935
{
1936
assert(instance < 4);
1937
CPPSPMD_ALIGN(16) int values[4];
1938
_mm_store_si128((__m128i *)values, v.m_value);
1939
values[instance] = i;
1940
v.m_value = _mm_load_si128((__m128i *)values);
1941
return v;
1942
}
1943
1944
CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16])
1945
{
1946
__m128i l = _mm_loadu_si128((const __m128i*)pTab);
1947
return vint{ l };
1948
}
1949
1950
CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table)
1951
{
1952
return vint{ shuffle_epi8(table.m_value, a.m_value) };
1953
}
1954
1955
CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1)
1956
{
1957
__m128i l = _mm_loadu_si128((const __m128i*)pTab);
1958
__m128i h = _mm_loadu_si128((const __m128i*)(pTab + 16));
1959
table_0.m_value = l;
1960
table_1.m_value = h;
1961
}
1962
1963
CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1)
1964
{
1965
__m128i l_0 = shuffle_epi8(table_0.m_value, a.m_value);
1966
__m128i h_0 = shuffle_epi8(table_1.m_value, a.m_value);
1967
1968
__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);
1969
1970
__m128 v_0 = blendv_ps(_mm_castsi128_ps(l_0), _mm_castsi128_ps(h_0), _mm_castsi128_ps(m_0));
1971
1972
return vint{ _mm_castps_si128(v_0) };
1973
}
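// In table_lookup5_8, assuming each lane holds a 5-bit index: the low 4 bits select a byte within each 16-entry
// half via shuffle_epi8, and bit 4 (moved into the sign-bit position by the left shift of 31 - 4 == 27) drives
// blendv_ps to choose between the two halves.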
1974
1975
CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3)
1976
{
1977
__m128i a = _mm_loadu_si128((const __m128i*)pTab);
1978
__m128i b = _mm_loadu_si128((const __m128i*)(pTab + 16));
1979
__m128i c = _mm_loadu_si128((const __m128i*)(pTab + 32));
1980
__m128i d = _mm_loadu_si128((const __m128i*)(pTab + 48));
1981
1982
table_0.m_value = a;
1983
table_1.m_value = b;
1984
table_2.m_value = c;
1985
table_3.m_value = d;
1986
}
1987
1988
CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3)
1989
{
1990
__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);
1991
1992
__m128 av_0;
1993
{
1994
__m128i al_0 = shuffle_epi8(table_0.m_value, a.m_value);
1995
__m128i ah_0 = shuffle_epi8(table_1.m_value, a.m_value);
1996
av_0 = blendv_ps(_mm_castsi128_ps(al_0), _mm_castsi128_ps(ah_0), _mm_castsi128_ps(m_0));
1997
}
1998
1999
__m128 bv_0;
2000
{
2001
__m128i bl_0 = shuffle_epi8(table_2.m_value, a.m_value);
2002
__m128i bh_0 = shuffle_epi8(table_3.m_value, a.m_value);
2003
bv_0 = blendv_ps(_mm_castsi128_ps(bl_0), _mm_castsi128_ps(bh_0), _mm_castsi128_ps(m_0));
2004
}
2005
2006
__m128i m2_0 = _mm_slli_epi32(a.m_value, 31 - 5);
2007
__m128 v2_0 = blendv_ps(av_0, bv_0, _mm_castsi128_ps(m2_0));
2008
2009
return vint{ _mm_castps_si128(v2_0) };
2010
}
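// table_lookup6_8 is the same construction extended to a 64-entry table: the low 4 bits index within each
// 16-byte quarter (shuffle_epi8), bit 4 (shifted to the sign position by 31 - 4) blends within each 32-byte pair,
// and bit 5 (shifted by 31 - 5) blends between the two pairs.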
2011
2012
#if 0
2013
template<typename SPMDKernel, typename... Args>
2014
CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args)
2015
{
2016
SPMDKernel kernel;
2017
kernel.init(exec_mask::all_on());
2018
return kernel._call(std::forward<Args>(args)...);
2019
}
2020
#else
2021
template<typename SPMDKernel, typename... Args>
2022
CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args)
2023
{
2024
SPMDKernel kernel;
2025
kernel.init(exec_mask::all_on());
2026
kernel._call(std::forward<Args>(args)...);
2027
}
2028
#endif
2029
2030
CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec)
2031
{
2032
m_exec = kernel_exec;
2033
m_kernel_exec = kernel_exec;
2034
m_continue_mask = exec_mask::all_off();
2035
2036
#ifdef _DEBUG
2037
m_in_loop = false;
2038
#endif
2039
}
2040
2041
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src)
2042
{
2043
CPPSPMD_ALIGN(16) int vindex[4];
2044
_mm_store_si128((__m128i*)vindex, dst.m_vindex);
2045
2046
CPPSPMD_ALIGN(16) float stored[4];
2047
_mm_store_ps(stored, src.m_value);
2048
2049
int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
2050
for (int i = 0; i < 4; i++)
2051
{
2052
if (mask & (1 << i))
2053
dst.m_pValue[vindex[i]] = stored[i];
2054
}
2055
return dst;
2056
}
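// store() above is a masked scatter done in scalar code: the exec mask is compressed to a 4-bit movemask and only
// the lanes whose bit is set are written through dst.m_pValue[vindex[i]]; store_all() is the unmasked variant for
// when all lanes are known to be active.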
2057
2058
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src)
2059
{
2060
CPPSPMD_ALIGN(16) int vindex[4];
2061
_mm_store_si128((__m128i*)vindex, dst.m_vindex);
2062
2063
CPPSPMD_ALIGN(16) float stored[4];
2064
_mm_store_ps(stored, src.m_value);
2065
2066
for (int i = 0; i < 4; i++)
2067
dst.m_pValue[vindex[i]] = stored[i];
2068
return dst;
2069
}
2070
2071
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src)
2072
{
2073
CPPSPMD_ALIGN(16) int vindex[4];
2074
_mm_store_si128((__m128i*)vindex, dst.m_vindex);
2075
2076
CPPSPMD_ALIGN(16) float stored[4];
2077
_mm_store_ps(stored, src.m_value);
2078
2079
int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
2080
for (int i = 0; i < 4; i++)
2081
{
2082
if (mask & (1 << i))
2083
dst.m_pValue[vindex[i]] = stored[i];
2084
}
2085
return dst;
2086
}
2087
2088
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src)
2089
{
2090
CPPSPMD_ALIGN(16) int vindex[4];
2091
_mm_store_si128((__m128i*)vindex, dst.m_vindex);
2092
2093
CPPSPMD_ALIGN(16) float stored[4];
2094
_mm_store_ps(stored, src.m_value);
2095
2096
for (int i = 0; i < 4; i++)
2097
dst.m_pValue[vindex[i]] = stored[i];
2098
return dst;
2099
}
2100
2101
#include "cppspmd_flow.h"
2102
#include "cppspmd_math.h"
2103
2104
} // namespace CPPSPMD (cppspmd_sse2 or cppspmd_sse41)
2105
2106
2107