// cppspmd_sse.h
// Copyright 2020-2022 Binomial LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Notes for Basis Universal:
// All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. If BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation.
// The techniques used in this code were originally demonstrated for AVX2 by Nicolas Guillemot and Jefferson Amstutz in their "CppSPMD" project.
// This is new code for use in Basis Universal, although it uses the same general SPMD techniques in SSE 2/4.1.

#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <utility>
#include <algorithm>

#if CPPSPMD_SSE2
#include <xmmintrin.h>	// SSE
#include <emmintrin.h>	// SSE2
#else
#include <xmmintrin.h>	// SSE
#include <emmintrin.h>	// SSE2
#include <pmmintrin.h>	// SSE3
#include <tmmintrin.h>	// SSSE3
#include <smmintrin.h>	// SSE4.1
//#include <nmmintrin.h>	// SSE4.2
#endif

#undef CPPSPMD_SSE
#undef CPPSPMD_AVX1
#undef CPPSPMD_AVX2
#undef CPPSPMD_AVX
#undef CPPSPMD_FLOAT4
#undef CPPSPMD_INT16

#define CPPSPMD_SSE 1
#define CPPSPMD_AVX 0
#define CPPSPMD_AVX1 0
#define CPPSPMD_AVX2 0
#define CPPSPMD_FLOAT4 0
#define CPPSPMD_INT16 0

#ifdef _MSC_VER
#ifndef CPPSPMD_DECL
#define CPPSPMD_DECL(type, name) __declspec(align(16)) type name
#endif

#ifndef CPPSPMD_ALIGN
#define CPPSPMD_ALIGN(v) __declspec(align(v))
#endif

#define _mm_undefined_si128 _mm_setzero_si128
#define _mm_undefined_ps _mm_setzero_ps
#else
#ifndef CPPSPMD_DECL
#define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32)))
#endif

#ifndef CPPSPMD_ALIGN
#define CPPSPMD_ALIGN(v) __attribute__((aligned(v)))
#endif
#endif

#ifndef CPPSPMD_FORCE_INLINE
#ifdef _DEBUG
#define CPPSPMD_FORCE_INLINE inline
#else
#ifdef _MSC_VER
#define CPPSPMD_FORCE_INLINE __forceinline
#else
#define CPPSPMD_FORCE_INLINE inline
#endif
#endif
#endif

#undef CPPSPMD
#undef CPPSPMD_ARCH

#if CPPSPMD_SSE2
#define CPPSPMD_SSE41 0
#define CPPSPMD cppspmd_sse2
#define CPPSPMD_ARCH _sse2
#else
#define CPPSPMD_SSE41 1
#define CPPSPMD cppspmd_sse41
#define CPPSPMD_ARCH _sse41
#endif

#ifndef CPPSPMD_GLUER
#define CPPSPMD_GLUER(a, b) a##b
#endif

#ifndef CPPSPMD_GLUER2
#define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b)
#endif

#ifndef CPPSPMD_NAME
#define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH)
#endif

#undef VASSERT
#define VCOND(cond) ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask())
#define VASSERT(cond) assert( VCOND(cond) )

#define CPPSPMD_ALIGNMENT (16)

#define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))

namespace CPPSPMD
{

const int PROGRAM_COUNT_SHIFT = 2;
const int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT;

template <typename N> inline N* aligned_new() { void* p = _mm_malloc(sizeof(N), 64); new (p) N; return static_cast<N*>(p); }
template <typename N> void aligned_delete(N* p) { if (p) { p->~N(); _mm_free(p); } }
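
// Usage sketch (illustrative, not part of the original header). Kernel objects
// contain 16-byte aligned __m128i members, so they should be allocated and freed
// with the helpers above rather than plain new/delete ("my_kernel" is hypothetical):
//
//	my_kernel* pKernel = aligned_new<my_kernel>();
//	// ... run the kernel ...
//	aligned_delete(pKernel);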

CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 };
CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f };
CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 };

CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) =
{
	{ UINT32_MAX, 0, 0, 0 },
	{ 0, UINT32_MAX, 0, 0 },
	{ 0, 0, UINT32_MAX, 0 },
	{ 0, 0, 0, UINT32_MAX },
};

#if CPPSPMD_SSE41
CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(c))); }
#endif

CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask)
{
#if CPPSPMD_SSE2
	return _mm_castps_si128(_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(b)), _mm_andnot_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(a))));
#else
	return _mm_blendv_epi8(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask)
{
#if CPPSPMD_SSE2
	// We know it's a mask, so we can just emulate the blend.
	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
#else
	return _mm_blendv_ps(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if CPPSPMD_SSE2
	// Input is not a mask, but MSB bits - so emulate _mm_blendv_ps() by replicating bit 31.
	mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mask), 31));
	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
#else
	return _mm_blendv_ps(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask)
{
	return _mm_castps_si128(blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
}

CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask)
{
	return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
}
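
// Quick reference for the blend helpers above (illustrative note, not from the
// original header): the *_mask_* variants assume 'mask' is a full lane mask (each
// lane all-0s or all-1s, e.g. a compare result), so a plain and/andnot/or suffices.
// blendv_ps() assumes only the sign bit of each lane is meaningful and replicates
// bit 31 down the lane first, matching _mm_blendv_ps() semantics. For example,
// with mask lanes of 0x80000000, blendv_ps(a, b, mask) selects b in every lane,
// while blendv_mask_ps(a, b, mask) would only merge the sign bits.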

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); }
CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); }
CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); }
CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); }

// Returns float bits as int, to emulate _mm_extract_ps()
CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { float f = _mm_cvtss_f32(vec); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); return *(const int*)&f; }

// Returns floats
CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
#else
CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); }
CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); }
CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); }
CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); }

// Returns float bits as int
CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); }
CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_extract_ps(vec, 1); }
CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); }
CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); }

// Returns floats
CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { int v = extract_ps_x(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { int v = extract_ps_y(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { int v = extract_ps_z(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { int v = extract_ps_w(vec); return *(const float*)&v; }
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 0), (uint32_t)v >> 16U, 1); }
CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 2), (uint32_t)v >> 16U, 3); }
CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 4), (uint32_t)v >> 16U, 5); }
CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 6), (uint32_t)v >> 16U, 7); }
#else
CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); }
CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); }
CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); }
CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); }
#endif

#if CPPSPMD_SSE2
inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
{
	// Just emulate _mm_shuffle_epi8. This is very slow, but what else can we do?
	CPPSPMD_ALIGN(16) uint8_t av[16];
	_mm_store_si128((__m128i*)av, a);

	CPPSPMD_ALIGN(16) uint8_t bvi[16];
	_mm_store_ps((float*)bvi, _mm_and_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(_mm_set1_epi32(0x0F0F0F0F))));

	CPPSPMD_ALIGN(16) uint8_t result[16];

	result[0] = av[bvi[0]];
	result[1] = av[bvi[1]];
	result[2] = av[bvi[2]];
	result[3] = av[bvi[3]];

	result[4] = av[bvi[4]];
	result[5] = av[bvi[5]];
	result[6] = av[bvi[6]];
	result[7] = av[bvi[7]];

	result[8] = av[bvi[8]];
	result[9] = av[bvi[9]];
	result[10] = av[bvi[10]];
	result[11] = av[bvi[11]];

	result[12] = av[bvi[12]];
	result[13] = av[bvi[13]];
	result[14] = av[bvi[14]];
	result[15] = av[bvi[15]];

	return _mm_andnot_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), _mm_load_si128((__m128i*)result));
}
#else
CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
{
	return _mm_shuffle_epi8(a, b);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
{
	return blendv_mask_epi32(b, a, _mm_cmplt_epi32(a, b));
}
CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
{
	return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(a, b));
}
CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
{
	__m128i n = _mm_set1_epi32(0x80000000);
	__m128i ac = _mm_add_epi32(a, n);
	__m128i bc = _mm_add_epi32(b, n);
	return blendv_mask_epi32(b, a, _mm_cmplt_epi32(ac, bc));
}
CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
{
	__m128i n = _mm_set1_epi32(0x80000000);
	__m128i ac = _mm_add_epi32(a, n);
	__m128i bc = _mm_add_epi32(b, n);
	return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(ac, bc));
}
#else
CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
{
	return _mm_min_epi32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
{
	return _mm_max_epi32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
{
	return _mm_min_epu32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
{
	return _mm_max_epu32(a, b);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
{
	__m128i sign_mask = _mm_srai_epi32(a, 31);
	return _mm_sub_epi32(_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(sign_mask))), sign_mask);
}
#else
CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
{
	return _mm_abs_epi32(a);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
{
	__m128i tmp1 = _mm_mul_epu32(a, b);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
}
#else
CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
{
	return _mm_mullo_epi32(a, b);
}
#endif

CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b)
{
	__m128i tmp1 = _mm_mul_epu32(a, b);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 3, 1)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 3, 1)));
}
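
// How the SSE2 mullo_epi32() fallback above works (illustrative note, not from the
// original header): _mm_mul_epu32 computes 64-bit products of lanes 0 and 2;
// shifting both inputs right by 4 bytes does the same for lanes 1 and 3. The
// _MM_SHUFFLE(0, 0, 2, 0) shuffles gather the low 32 bits of each product, and
// unpacklo interleaves them back into lane order 0,1,2,3. mulhi_epu32() is the
// same trick, keeping the high halves via _MM_SHUFFLE(0, 0, 3, 1).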

#if CPPSPMD_SSE2
inline __m128i load_rgba32(const void* p)
{
	__m128i xmm = _mm_cvtsi32_si128(*(const int*)p);
	xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
	xmm = _mm_unpacklo_epi16(xmm, _mm_setzero_si128());
	return xmm;
}
#else
inline __m128i load_rgba32(const void* p)
{
	return _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((const float*)p)));
}
#endif

inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3)
{
	__m128i t0 = _mm_unpacklo_epi32(r0, r1);
	__m128i t1 = _mm_unpacklo_epi32(r2, r3);
	__m128i t2 = _mm_unpackhi_epi32(r0, r1);
	__m128i t3 = _mm_unpackhi_epi32(r2, r3);
	x = _mm_unpacklo_epi64(t0, t1);
	y = _mm_unpackhi_epi64(t0, t1);
	z = _mm_unpacklo_epi64(t2, t3);
	w = _mm_unpackhi_epi64(t2, t3);
}
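
// Illustrative example (not part of the original header): load_rgba32() expands one
// RGBA32 pixel into four uint32 lanes, so loading four pixels and transposing yields
// one vector per channel. Assumes pPixels points at 4 consecutive RGBA32 pixels.
inline void example_load_rgba32_channels(const void* pPixels, __m128i& r, __m128i& g, __m128i& b, __m128i& a)
{
	const uint8_t* p = (const uint8_t*)pPixels;
	transpose4x4(r, g, b, a, load_rgba32(p), load_rgba32(p + 4), load_rgba32(p + 8), load_rgba32(p + 12));
}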

const uint32_t ALL_ON_MOVEMASK = 0xF;

struct spmd_kernel
{
	struct vint;
	struct lint;
	struct vbool;
	struct vfloat;

	typedef int int_t;
	typedef vint vint_t;
	typedef lint lint_t;

	// Exec mask
	struct exec_mask
	{
		__m128i m_mask;

		exec_mask() = default;

		CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b);
		CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { }

		CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane) { m_mask = _mm_load_si128((const __m128i *)&g_lane_masks_128[lane][0]); }

		static CPPSPMD_FORCE_INLINE exec_mask all_on() { return exec_mask{ _mm_load_si128((const __m128i*)g_allones_128) }; }
		static CPPSPMD_FORCE_INLINE exec_mask all_off() { return exec_mask{ _mm_setzero_si128() }; }

		CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(m_mask)); }
	};

	friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e);
	friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e);

	CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); }
	CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); }
	CPPSPMD_FORCE_INLINE bool spmd_none() { return !any(m_exec); }

	// true if cond is true for all active lanes - false if no active lanes
	CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); }
	// true if cond is true for any active lanes
	CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) { return (exec_mask(e) & m_exec).get_movemask() != 0; }
	CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) { return !spmd_any(e); }

	friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b);
	friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b);
	friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b);

	exec_mask m_exec;
	exec_mask m_kernel_exec;
	exec_mask m_continue_mask;
#ifdef _DEBUG
	bool m_in_loop;
#endif

	CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); }

	void init(const exec_mask& kernel_exec);

	// Varying bool
	struct vbool
	{
		__m128i m_value;

		vbool() = default;

		CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(_mm_set1_epi32(value ? UINT32_MAX : 0)) { }

		CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { }

		CPPSPMD_FORCE_INLINE explicit operator vfloat() const;
		CPPSPMD_FORCE_INLINE explicit operator vint() const;

	private:
		//vbool& operator=(const vbool&);
	};

	friend vbool operator!(const vbool& v);

	CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src)
	{
		dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
		return dst;
	}

	CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}
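
	// Semantics sketch (illustrative, not from the original header): store() writes
	// only the lanes enabled in the current exec mask, store_all() writes every lane.
	// This is what makes assignment inside divergent control flow behave per-lane:
	//
	//	vbool result = false;
	//	spmd_if(x > 0, [&] { store(result, vbool(true)); });	// 'x' is a hypothetical vint;
	//	// only the x > 0 lanes of 'result' become true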

	// Varying float
	struct vfloat
	{
		__m128 m_value;

		vfloat() = default;

		CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { }

		CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { }

		CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { }

	private:
		//vfloat& operator=(const vfloat&);
	};

	CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
	{
		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src)
	{
		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	// Linear ref to floats
	struct float_lref
	{
		float* m_pValue;

	private:
		//float_lref& operator=(const float_lref&);
	};

	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_ps(dst.m_pValue, src.m_value);
		else
			_mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
		return dst;
	}

	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_ps(dst.m_pValue, src.m_value);
		else
			_mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
		return dst;
	}

	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src)
	{
		_mm_storeu_ps(dst.m_pValue, src.m_value);
		return dst;
	}

	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src)
	{
		_mm_storeu_ps(dst.m_pValue, src.m_value);
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src)
	{
		return vfloat{ _mm_and_ps(_mm_loadu_ps(src.m_pValue), _mm_castsi128_ps(m_exec.m_mask)) };
	}

	// Varying ref to floats
	struct float_vref
	{
		__m128i m_vindex;
		float* m_pValue;

	private:
		//float_vref& operator=(const float_vref&);
	};

	// Varying ref to varying float
	struct vfloat_vref
	{
		__m128i m_vindex;
		vfloat* m_pValue;

	private:
		//vfloat_vref& operator=(const vfloat_vref&);
	};

	// Varying ref to varying int
	struct vint_vref
	{
		__m128i m_vindex;
		vint* m_pValue;

	private:
		//vint_vref& operator=(const vint_vref&);
	};

	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src);

	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src);
	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src);

	CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i *)vindex, src.m_vindex);

		CPPSPMD_ALIGN(16) float loaded[4];

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				loaded[i] = src.m_pValue[vindex[i]];
		}
		return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) };
	}

	CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i *)vindex, src.m_vindex);

		CPPSPMD_ALIGN(16) float loaded[4];

		for (int i = 0; i < 4; i++)
			loaded[i] = src.m_pValue[vindex[i]];
		return vfloat{ _mm_load_ps((const float*)loaded) };
	}
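
	// Implementation note (illustrative, not from the original header): SSE has no
	// gather/scatter instructions, so the varying-ref loads and stores here move the
	// indices to memory and perform up to 4 scalar accesses, skipping lanes that are
	// off in the exec mask. A masked load() returns 0 in inactive lanes.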

	// Linear ref to ints
	struct int_lref
	{
		int* m_pValue;

	private:
		//int_lref& operator=(const int_lref&);
	};

	CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
		{
			_mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value);
		}
		else
		{
			CPPSPMD_ALIGN(16) int stored[4];
			_mm_store_si128((__m128i *)stored, src.m_value);

			for (int i = 0; i < 4; i++)
			{
				if (mask & (1 << i))
					dst.m_pValue[i] = stored[i];
			}
		}
		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const int_lref& src)
	{
		__m128i v = _mm_loadu_si128((const __m128i*)src.m_pValue);

		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));

		return vint{ v };
	}

	// Linear ref to int16's
	struct int16_lref
	{
		int16_t* m_pValue;

	private:
		//int16_lref& operator=(const int16_lref&);
	};

	CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i *)stored, src.m_value);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
		}
		return dst;
	}

	CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i *)stored, src.m_value);

		for (int i = 0; i < 4; i++)
			dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const int16_lref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		for (int i = 0; i < 4; i++)
			values[i] = static_cast<int16_t>(src.m_pValue[i]);

		__m128i t = _mm_load_si128( (const __m128i *)values );

		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps( t ), _mm_castsi128_ps(m_exec.m_mask))) };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		for (int i = 0; i < 4; i++)
			values[i] = static_cast<int16_t>(src.m_pValue[i]);

		__m128i t = _mm_load_si128( (const __m128i *)values );

		return vint{ t };
	}

	// Linear ref to constant ints
	struct cint_lref
	{
		const int* m_pValue;

	private:
		//cint_lref& operator=(const cint_lref&);
	};

	CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
	{
		__m128i v = _mm_loadu_si128((const __m128i *)src.m_pValue);
		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
		return vint{ v };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src)
	{
		return vint{ _mm_loadu_si128((const __m128i *)src.m_pValue) };
	}

	// Varying ref to ints
	struct int_vref
	{
		__m128i m_vindex;
		int* m_pValue;

	private:
		//int_vref& operator=(const int_vref&);
	};

	// Varying ref to constant ints
	struct cint_vref
	{
		__m128i m_vindex;
		const int* m_pValue;

	private:
		//cint_vref& operator=(const cint_vref&);
	};

	// Varying int
	struct vint
	{
		__m128i m_value;

		vint() = default;

		CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value) { }

		CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { }

		CPPSPMD_FORCE_INLINE vint& operator=(const lint& other) { m_value = other.m_value; return *this; }

		CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { }

		CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32((int)value)) { }

		CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { }

		CPPSPMD_FORCE_INLINE explicit operator vbool() const
		{
			return vbool{ _mm_xor_si128( _mm_load_si128((const __m128i*)g_allones_128), _mm_cmpeq_epi32(m_value, _mm_setzero_si128())) };
		}

		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
		{
			return vfloat{ _mm_cvtepi32_ps(m_value) };
		}

		CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const
		{
			return int_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const
		{
			return cint_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const
		{
			return float_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const
		{
			return vfloat_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const
		{
			return vint_vref{ m_value, ptr };
		}

	private:
		//vint& operator=(const vint&);
	};
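
	// Usage sketch (illustrative, not from the original header): indexing a pointer
	// with a vint builds a varying reference, which load()/store() then treat as a
	// gather/scatter ("pFloats"/"pInts" are hypothetical):
	//
	//	vint idx = ...;					// per-lane element indices
	//	vfloat v = load(idx[pFloats]);	// gathers pFloats[idx[lane]] per active lane
	//	store(idx[pInts], vint(1));		// scatters 1 to pInts[idx[lane]] per active lane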

	// Load/store linear int
	CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_si128((__m128i *)pDst, src.m_value);
		else
		{
			if (mask & 1) pDst[0] = extract_x(src.m_value);
			if (mask & 2) pDst[1] = extract_y(src.m_value);
			if (mask & 4) pDst[2] = extract_z(src.m_value);
			if (mask & 8) pDst[3] = extract_w(src.m_value);
		}
	}

	CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src)
	{
		_mm_storeu_si128((__m128i*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src)
	{
		_mm_store_si128((__m128i*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc)
	{
		__m128i v = _mm_loadu_si128((const __m128i*)pSrc);

		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));

		return vint{ v };
	}

	CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc)
	{
		return vint{ _mm_loadu_si128((__m128i*)pSrc) };
	}

	CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc)
	{
		return vint{ _mm_load_si128((__m128i*)pSrc) };
	}

	// Load/store linear float
	CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_ps((float*)pDst, src.m_value);
		else
		{
			int *pDstI = (int *)pDst;
			if (mask & 1) pDstI[0] = extract_ps_x(src.m_value);
			if (mask & 2) pDstI[1] = extract_ps_y(src.m_value);
			if (mask & 4) pDstI[2] = extract_ps_z(src.m_value);
			if (mask & 8) pDstI[3] = extract_ps_w(src.m_value);
		}
	}

	CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src)
	{
		_mm_storeu_ps((float*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src)
	{
		_mm_store_ps((float*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc)
	{
		__m128 v = _mm_loadu_ps((const float*)pSrc);

		v = _mm_and_ps(v, _mm_castsi128_ps(m_exec.m_mask));

		return vfloat{ v };
	}

	CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc)
	{
		return vfloat{ _mm_loadu_ps((float*)pSrc) };
	}

	CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc)
	{
		return vfloat{ _mm_load_ps((float*)pSrc) };
	}

	CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src)
	{
		dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
		return dst;
	}

	CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i*)vindex, dst.m_vindex);

		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i*)stored, src.m_value);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				dst.m_pValue[vindex[i]] = stored[i];
		}
		return dst;
	}

	CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i*)vindex, dst.m_vindex);

		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i*)stored, src.m_value);

		for (int i = 0; i < 4; i++)
			dst.m_pValue[vindex[i]] = stored[i];

		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const int_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				values[i] = src.m_pValue[indices[i]];
		}

		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		for (int i = 0; i < 4; i++)
			values[i] = src.m_pValue[indices[i]];

		return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
	}

	CPPSPMD_FORCE_INLINE vint load(const cint_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				values[i] = src.m_pValue[indices[i]];
		}

		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		for (int i = 0; i < 4; i++)
			values[i] = src.m_pValue[indices[i]];

		return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
	}

	CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src)
	{
		__m128i v0_l;

		const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
		v0_l = insert_x(_mm_undefined_si128(), ((int*)(pSrc + extract_x(src.m_vindex)))[0]);
		v0_l = insert_y(v0_l, ((int*)(pSrc + extract_y(src.m_vindex)))[0]);
		v0_l = insert_z(v0_l, ((int*)(pSrc + extract_z(src.m_vindex)))[0]);
		v0_l = insert_w(v0_l, ((int*)(pSrc + extract_w(src.m_vindex)))[0]);

		return vint{ v0_l };
	}

	CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src)
	{
		__m128i v0_l;

		const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
		v0_l = insert_x(_mm_undefined_si128(), ((int16_t*)(pSrc + 2 * extract_x(src.m_vindex)))[0]);
		v0_l = insert_y(v0_l, ((int16_t*)(pSrc + 2 * extract_y(src.m_vindex)))[0]);
		v0_l = insert_z(v0_l, ((int16_t*)(pSrc + 2 * extract_z(src.m_vindex)))[0]);
		v0_l = insert_w(v0_l, ((int16_t*)(pSrc + 2 * extract_w(src.m_vindex)))[0]);

		return vint{ v0_l };
	}

	CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) pDst[0] = extract_x(v.m_value);
		if (mask & 2) pDst[stride] = extract_y(v.m_value);
		if (mask & 4) pDst[stride*2] = extract_z(v.m_value);
		if (mask & 8) pDst[stride*3] = extract_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) ((int *)pDstF)[0] = extract_ps_x(v.m_value);
		if (mask & 2) ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
		if (mask & 4) ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
		if (mask & 8) ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v)
	{
		pDst[0] = extract_x(v.m_value);
		pDst[stride] = extract_y(v.m_value);
		pDst[stride*2] = extract_z(v.m_value);
		pDst[stride*3] = extract_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v)
	{
		((int *)pDstF)[0] = extract_ps_x(v.m_value);
		((int *)pDstF)[stride] = extract_ps_y(v.m_value);
		((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
		((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) int vals[4] = { 0, 0, 0, 0 };
		if (mask & 1) vals[0] = pSrc[0];
		if (mask & 2) vals[1] = pSrc[stride];
		if (mask & 4) vals[2] = pSrc[stride * 2];
		if (mask & 8) vals[3] = pSrc[stride * 3];
		return vint{ _mm_load_si128((__m128i*)vals) };
#else
		const float* pSrcF = (const float*)pSrc;
		__m128 v = _mm_setzero_ps();
		if (mask & 1) v = _mm_load_ss(pSrcF);
		if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
		if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
		if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
		return vint{ _mm_castps_si128(v) };
#endif
	}

	CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) float vals[4] = { 0, 0, 0, 0 };
		if (mask & 1) vals[0] = pSrc[0];
		if (mask & 2) vals[1] = pSrc[stride];
		if (mask & 4) vals[2] = pSrc[stride * 2];
		if (mask & 8) vals[3] = pSrc[stride * 3];
		return vfloat{ _mm_load_ps(vals) };
#else
		__m128 v = _mm_setzero_ps();
		if (mask & 1) v = _mm_load_ss(pSrc);
		if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
		if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
		if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
		return vfloat{ v };
#endif
	}

	CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride)
	{
#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) int vals[4];
		vals[0] = pSrc[0];
		vals[1] = pSrc[stride];
		vals[2] = pSrc[stride * 2];
		vals[3] = pSrc[stride * 3];
		return vint{ _mm_load_si128((__m128i*)vals) };
#else
		const float* pSrcF = (const float*)pSrc;
		__m128 v = _mm_load_ss(pSrcF);
		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
		return vint{ _mm_castps_si128(v) };
#endif
	}

	CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride)
	{
#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) float vals[4];
		vals[0] = pSrc[0];
		vals[1] = pSrc[stride];
		vals[2] = pSrc[stride * 2];
		vals[3] = pSrc[stride * 3];
		return vfloat{ _mm_load_ps(vals) };
#else
		__m128 v = _mm_load_ss(pSrc);
		v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
		v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
		v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
		return vfloat{ v };
#endif
	}
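
	// Usage sketch (illustrative, not from the original header): the strided helpers
	// handle AoS data, where consecutive lanes access elements 'stride' ints/floats
	// apart. E.g. gathering member .x from an array of hypothetical structs S:
	//
	//	vfloat xs = load_strided(&pStructs[0].x, sizeof(S) / sizeof(float));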

	CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(_mm_castps_si128(src.m_value));
		if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(_mm_castps_si128(src.m_value));
		if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(_mm_castps_si128(src.m_value));
		if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(_mm_castps_si128(src.m_value));

		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		__m128i k = _mm_setzero_si128();

		if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
		if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
		if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
		if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

		return vfloat{ _mm_castsi128_ps(k) };
	}

	CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value);
		if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value);
		if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value);
		if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value);

		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const vint_vref& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		__m128i k = _mm_setzero_si128();

		if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
		if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
		if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
		if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

		return vint{ k };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
	{
		// TODO: There's surely a better way
		__m128i k = _mm_setzero_si128();

		k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
		k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
		k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
		k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

		return vint{ k };
	}

	// Linear integer
	struct lint
	{
		__m128i m_value;

		CPPSPMD_FORCE_INLINE explicit lint(__m128i value)
			: m_value(value)
		{ }

		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
		{
			return vfloat{ _mm_cvtepi32_ps(m_value) };
		}

		CPPSPMD_FORCE_INLINE explicit operator vint() const
		{
			return vint{ m_value };
		}

		CPPSPMD_FORCE_INLINE int get_first_value() const
		{
			return _mm_cvtsi128_si32(m_value);
		}

		CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const
		{
			return float_lref{ ptr + get_first_value() };
		}

		CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const
		{
			return int_lref{ ptr + get_first_value() };
		}

		CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const
		{
			return int16_lref{ ptr + get_first_value() };
		}

		CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const
		{
			return cint_lref{ ptr + get_first_value() };
		}

	private:
		//lint& operator=(const lint&);
	};

	CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) };
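
	// Usage sketch (illustrative, not from the original header): program_index is the
	// lane index (0,1,2,3), the SPMD equivalent of a thread/lane id. A typical
	// strip-mined loop gives lane i element base + i ("pData", "n" hypothetical):
	//
	//	for (int base = 0; base + PROGRAM_COUNT <= n; base += PROGRAM_COUNT)
	//	{
	//		vfloat v = load((base + program_index)[pData]);	// linear load of pData[base..base+3]
	//		// ... process v ...
	//	}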

	// SPMD condition helpers

	template<typename IfBody>
	CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody);

	CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond);

	// No breaks, continues, etc. allowed
	template<typename IfBody>
	CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody);

	// No breaks, continues, etc. allowed
	template<typename IfBody, typename ElseBody>
	CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody);

	template<typename IfBody, typename ElseBody>
	CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody);

	template<typename WhileCondBody, typename WhileBody>
	CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody);

	template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
	CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody);

	template<typename ForeachBody>
	CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody);

#ifdef _DEBUG
	CPPSPMD_FORCE_INLINE void check_masks();
#else
	CPPSPMD_FORCE_INLINE void check_masks() { }
#endif

	CPPSPMD_FORCE_INLINE void spmd_break();
	CPPSPMD_FORCE_INLINE void spmd_continue();

	CPPSPMD_FORCE_INLINE void spmd_return();

	template<typename UnmaskedBody>
	CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody);

	template<typename SPMDKernel, typename... Args>
	//CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args);
	CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args);

	CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b) { vint temp = a; store(a, b); store(b, temp); }
	CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b) { vfloat temp = a; store(a, b); store(b, temp); }
	CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b) { vbool temp = a; store(a, b); store(b, temp); }

	CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
	{
		__m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
		__m128 temp = _mm_add_ps(_mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
		return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(temp, temp), temp));
	}

	CPPSPMD_FORCE_INLINE int reduce_add(vint v)
	{
		__m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
		__m128i temp = _mm_add_epi32(_mm_shuffle_epi32(k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
		return extract_x(_mm_add_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(temp), _mm_castsi128_ps(temp))), temp));
	}
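
	// Control-flow sketch (illustrative, not from the original header): the spmd_*
	// helpers take lambdas and run them with the exec mask narrowed to the lanes
	// where the condition holds ('v' is a hypothetical vint):
	//
	//	spmd_ifelse(v < 0,
	//		[&] { store(v, -v); },		// runs with only the v < 0 lanes enabled
	//		[&] { store(v, v * 2); });	// runs with the remaining lanes enabled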

#include "cppspmd_math_declares.h"

}; // struct spmd_kernel

using exec_mask = spmd_kernel::exec_mask;
using vint = spmd_kernel::vint;
using int_lref = spmd_kernel::int_lref;
using cint_vref = spmd_kernel::cint_vref;
using cint_lref = spmd_kernel::cint_lref;
using int_vref = spmd_kernel::int_vref;
using lint = spmd_kernel::lint;
using vbool = spmd_kernel::vbool;
using vfloat = spmd_kernel::vfloat;
using float_lref = spmd_kernel::float_lref;
using float_vref = spmd_kernel::float_vref;
using vfloat_vref = spmd_kernel::vfloat_vref;
using vint_vref = spmd_kernel::vint_vref;

CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const
{
	return vfloat { _mm_and_ps( _mm_castsi128_ps(m_value), *(const __m128 *)g_onef_128 ) };
}

// Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?)
CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const
{
	return vint { m_value };
}

CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v)
{
	return vbool{ _mm_castps_si128(_mm_xor_ps(_mm_load_ps((const float*)g_allones_128), _mm_castsi128_ps(v.m_value))) };
}

CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; }

CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; }

CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; }
CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; }

// Bad pattern - doesn't factor in the current exec mask. Prefer spmd_any() instead.
CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; }
CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; }

CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; }
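
// Why all(vbool)/any(vbool) are flagged as a bad pattern (illustrative note): they
// test the raw lane bits, so lanes that are currently disabled by the exec mask
// still participate. Inside divergent control flow, prefer spmd_all()/spmd_any(),
// which AND the condition with m_exec first. The exec_mask overloads are fine,
// since an exec mask already encodes lane activity.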

CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) { return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; }
CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; }

CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; }
CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; }

CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) { return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; }
CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; }

CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); }

CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; }
CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; }

CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) { return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; }

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a)
{
	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) );
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

	__m128i ai = _mm_cvttps_epi32(a.m_value);

	__m128 af = _mm_cvtepi32_ps(ai);
	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a)
{
	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

	__m128i ai = _mm_cvtps_epi32(a.m_value);
	__m128 af = _mm_cvtepi32_ps(ai);
	__m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(af, a.m_value)));

	af = _mm_add_ps(af, changed);

	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a)
{
	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

	__m128i ai = _mm_cvtps_epi32(a.m_value);
	__m128 af = _mm_cvtepi32_ps(ai);
	__m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(af, a.m_value)));

	af = _mm_sub_ps(af, changed);

	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}
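
// Why 8388608.0f (illustrative note, not from the original header): 2^23 is the
// smallest positive float whose spacing is 1.0, so every finite float with
// |x| >= 2^23 is already an integer (and _mm_cvtps_epi32/_mm_cvttps_epi32 would
// overflow for |x| >= 2^31 anyway). The SSE2 floor/ceil/round paths above
// therefore only convert lanes with |x| < 2^23 ("has_fractional") and pass larger
// magnitudes - including Inf/NaN bit patterns - through unchanged.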

// We need to disable unsafe math optimizations for the key operations used for rounding to nearest.
// I wish there was a better way.
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optimize("-fno-unsafe-math-optimizations")))
#elif defined(__clang__)
inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optnone))
#elif defined (_MSC_VER)
#pragma float_control(push)
#pragma float_control(precise, on)
inline __m128 add_sub(__m128 a, __m128 b)
#else
inline __m128 add_sub(__m128 a, __m128 b)
#endif
{
	return _mm_sub_ps(_mm_add_ps(a, b), b);
}

#if defined (_MSC_VER)
#pragma float_control(pop)
#endif

CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a)
{
	__m128i no_fract_fp_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f));

	__m128i sign_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U));
	__m128 force_int = _mm_castsi128_ps(_mm_or_si128(no_fract_fp_bits, sign_a));

	// Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers.
	//__m128 temp1 = _mm_add_ps(a.m_value, force_int);
	//__m128 temp2 = _mm_sub_ps(temp1, force_int);
	__m128 temp2 = add_sub(a.m_value, force_int);

	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, no_fract_fp_bits);
	return vfloat{ blendv_mask_ps(a.m_value, temp2, _mm_castsi128_ps(has_fractional)) };
}

#else
CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; }
CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; }
#endif
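
// Note on round_nearest() (illustrative, not from the original header): the
// add_sub() trick of computing (a + 2^23) - 2^23 relies on the FPU's default
// round-to-nearest-even mode, so halfway cases round to even, e.g.
// round_nearest(0.5f) == 0.0f and round_nearest(1.5f) == 2.0f - matching the
// SSE4.1 _mm_round_ps() path. Under -ffast-math or /fp:fast the compiler may fold
// the add/sub away entirely, which is what the attributes/pragmas above prevent.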

CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a) { return a - floor(a); }
CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b) { vfloat c = frac(abs(a / b)) * abs(b); return spmd_ternaryf(a < 0, -c, c); }
CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, -1.0f, 1.0f); }

CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) { return vint{ min_epi32(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; }

CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) { return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)) }; }

CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; }

CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b)
{
	return vfloat{ _mm_min_ps(b.m_value, _mm_max_ps(v.m_value, a.m_value) ) };
}

CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b)
{
	return vint{ min_epi32(b.m_value, max_epi32(v.m_value, a.m_value) ) };
}

CPPSPMD_FORCE_INLINE vfloat vfma(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_add_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
}

CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_sub_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
}

CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_sub_ps(c.m_value, _mm_mul_ps(a.m_value, b.m_value)) };
}

CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_sub_ps(_mm_sub_ps(_mm_xor_ps(a.m_value, a.m_value), _mm_mul_ps(a.m_value, b.m_value)), c.m_value) };
}

CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); }

CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }

CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }

CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); }
CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); }
CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; }
CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; }
CPPSPMD_FORCE_INLINE vint operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); }
CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; }

// High 32 bits of the unsigned 32x32 per-lane product.
CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; }

// Bitwise NOT via the two's complement identity ~a == -a - 1.
CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; }
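// Scalar model (illustrative only, compiled out; mulhiu_scalar is not part of the API)
// of what mulhiu() computes per lane:
#if 0
inline uint32_t mulhiu_scalar(uint32_t a, uint32_t b)
{
	return (uint32_t)(((uint64_t)a * b) >> 32);
}
#endif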
// A few of these break the lane-based abstraction model. They are supported in SSE2, so it makes sense to support them and let the user figure it out.
CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) { return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint{ _mm_add_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint{ _mm_adds_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint{ _mm_sub_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint{ _mm_subs_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint{ _mm_unpacklo_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint{ _mm_unpackhi_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { return _mm_movemask_epi8(a.m_value); }
CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { return _mm_movemask_ps(_mm_castsi128_ps(a.m_value)); }

// Unsigned byte compares, synthesized from min/max: a <= b exactly when min(a, b) == a.
CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(_mm_min_epu8(a.m_value, b.m_value), a.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return vint{ cmple_epu8(b, a) }; }
CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(_mm_cmpeq_epi8(a.m_value, b.m_value), _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value)) }; }
CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return vint{ cmpgt_epu8(b, a) }; }
CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b) { return vint{ _mm_or_si128(_mm_subs_epu8(a.m_value, b.m_value), _mm_subs_epu8(b.m_value, a.m_value)) }; }

CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi8(a.m_value, b.m_value, _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128())) }; }
CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi32(a.m_value, b.m_value, mask.m_value) }; }
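// Scalar model (illustrative only, compiled out; cmple_epu8_scalar is not part of the API)
// of the unsigned byte compare trick used by cmple_epu8() above; SSE2 only provides signed
// byte compares directly.
#if 0
inline bool cmple_epu8_scalar(uint8_t a, uint8_t b)
{
	// min(a, b) == a exactly when a <= b
	return std::min(a, b) == a;
}
#endif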
CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint{ _mm_add_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint{ _mm_adds_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint{ _mm_adds_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint{ _mm_avg_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint{ _mm_sub_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint{ _mm_subs_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint{ _mm_subs_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint{ _mm_mullo_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint{ _mm_min_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint{ _mm_max_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint{ _mm_madd_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint{ _mm_packs_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint{ _mm_packus_epi16(a.m_value, b.m_value) }; }

// "Uniform" shifts: all 16-bit lanes are shifted by the same count, taken from the low 64 bits of b.
CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint{ _mm_sll_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_sra_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_srl_epi16(a.m_value, b.m_value) }; }

// Shift by an immediate constant count.
#define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, b))
#define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, b))
#define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, b))

CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }

CPPSPMD_FORCE_INLINE vint zero_vint() { return vint{ _mm_setzero_si128() }; }
CPPSPMD_FORCE_INLINE vfloat zero_vfloat() { return vfloat{ _mm_setzero_ps() }; }

CPPSPMD_FORCE_INLINE vint vint_lane_set(int v0, int v1, int v2, int v3) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set(float v0, float v1, float v2, float v3) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
CPPSPMD_FORCE_INLINE vint vint_lane_set_r(int v3, int v2, int v1, int v0) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set_r(float v3, float v2, float v1, float v0) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }

// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
#define VFLOAT_LANE_SHUFFLE_PS(a, b, control) vfloat(_mm_shuffle_ps((a).m_value, (b).m_value, control))

// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane.
#define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control))
#define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control))

#define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
#define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))

#define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l))
#define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l))

// Unpack and interleave 8-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); }

// Unpack and interleave 16-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi16(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi16(a.m_value, b.m_value)); }
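// Usage sketch (illustrative only, compiled out; reverse_lanes_example is not part of the
// API): VINT_LANE_SHUFFLE_MASK takes the source lane index for each destination lane, so
// reversing the four int32 lanes is:
#if 0
inline vint reverse_lanes_example(const vint& a)
{
	return VINT_LANE_SHUFFLE_EPI32(a, VINT_LANE_SHUFFLE_MASK(3, 2, 1, 0));
}
#endif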
// Unpack and interleave 32-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi32(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi32(a.m_value, b.m_value)); }

// Unpack and interleave 64-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi64(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi64(a.m_value, b.m_value)); }

CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { return vint(_mm_set1_epi8(a)); }
CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { return vint(_mm_set1_epi16(a)); }
CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { return vint(_mm_set1_epi32(a)); }
CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { return vint(_mm_set1_epi64x(a)); }

CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { return vint(_mm_mul_epu32(a.m_value, b.m_value)); }

// Truncating signed division, computed exactly via double-precision division (a double can represent any int32).
CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b)
{
	__m128d al = _mm_cvtepi32_pd(a.m_value);
	__m128d ah = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value));

	__m128d bl = _mm_cvtepi32_pd(b.m_value);
	__m128d bh = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value));

	__m128d rl = _mm_div_pd(al, bl);
	__m128d rh = _mm_div_pd(ah, bh);

	__m128i rli = _mm_cvttpd_epi32(rl);
	__m128i rhi = _mm_cvttpd_epi32(rh);

	return vint(_mm_unpacklo_epi64(rli, rhi));
}

// C-style truncated modulo: the result takes the sign of the dividend a.
CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b)
{
	vint aa = abs(a), ab = abs(b);
	vint q = div_epi32(aa, ab);
	vint r = aa - q * ab;
	return spmd_ternaryi(a < 0, -r, r);
}

CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b)
{
	return div_epi32(a, b);
}

CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, int b)
{
	return div_epi32(a, vint(b));
}

CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b)
{
	return mod_epi32(a, b);
}

CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b)
{
	return mod_epi32(a, vint(b));
}

CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b)
{
#if 0
	CPPSPMD_ALIGN(32) int result[4];
	result[0] = extract_x(a.m_value) << extract_x(b.m_value);
	result[1] = extract_y(a.m_value) << extract_y(b.m_value);
	result[2] = extract_z(a.m_value) << extract_z(b.m_value);
	result[3] = extract_w(a.m_value) << extract_w(b.m_value);

	return vint{ _mm_load_si128((__m128i*)result) };
#elif 0
	int x = extract_x(a.m_value) << extract_x(b.m_value);
	int y = extract_y(a.m_value) << extract_y(b.m_value);
	int z = extract_z(a.m_value) << extract_z(b.m_value);
	int w = extract_w(a.m_value) << extract_w(b.m_value);

	__m128i v = insert_x(_mm_undefined_si128(), x);
	v = insert_y(v, y);
	v = insert_z(v, z);
	return vint{ insert_w(v, w) };
#else
	// What this does: shift each b lane left by 23 bits (moving the shift amount into the FP exponent position), add that to the integer rep of 1.0f, cast to float (giving a fast 2^b per lane), convert back to int, then multiply.
	return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))));
#endif
}
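// Scalar model (illustrative only, compiled out; shift_left_scalar is not part of the API)
// of the variable shift-left trick above. Adding (b << 23) to the bit pattern of 1.0f adds
// b to the FP exponent, producing exactly 2^b for 0 <= b <= 30; the int conversion then
// recovers the power-of-two multiplier.
#if 0
inline int32_t shift_left_scalar(int32_t a, int32_t b)
{
	uint32_t pow2_bits = 0x3F800000u + ((uint32_t)b << 23); // bit pattern of 2^b
	float pow2 = *(const float*)&pow2_bits;
	return a * (int32_t)pow2;
}
#endif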
// uniform shift left
CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b)
{
	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
	return vint{ _mm_sll_epi32(a.m_value, bv) };
}

// uniform arithmetic shift right
CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b)
{
	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
	return vint{ _mm_sra_epi32(a.m_value, bv) };
}

// uniform shift right
CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b)
{
	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
	return vint{ _mm_srl_epi32(a.m_value, bv) };
}

CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b)
{
#if 0
	CPPSPMD_ALIGN(32) int result[4];
	result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value);
	result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value);
	result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value);
	result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value);

	return vint{ _mm_load_si128((__m128i*)result) };
#elif 0
	uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value));
	uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value));
	uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value));
	uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value));

	__m128i v = insert_x(_mm_undefined_si128(), x);
	v = insert_y(v, y);
	v = insert_z(v, z);
	return vint{ insert_w(v, w) };
#else
	//vint inv_shift = 32 - b;
	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));

	// Take the float rep of 2^32 (0x4F800000, i.e. 1.0f's 0x3F800000 plus (32<<23)), subtract (shift<<23), then cast to float to get 2^(32 - shift).
	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));

	// Now convert the scale factor to an integer.
	vint r = vint(f);

	// mulhi_epu32 (using two _mm_mul_epu32) emulates a varying shift left by (32 - shift); keeping only the high 32 bits of the product turns it into a right shift by shift.
	vint q(mulhi_epu32(a.m_value, r.m_value));

	// Handle shift amounts of 0 (2^32 doesn't survive the float->int conversion above).
	return spmd_ternaryi(b > 0, q, a);
#endif
}

CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b)
{
	//vint inv_shift = 32 - b;
	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));

	// Take the float rep of 2^32 (0x4F800000), subtract (shift<<23), then cast to float to get 2^(32 - shift).
	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));

	// Now convert the scale factor to an integer.
	vint r = vint(f);

	// mulhi_epu32 (using two _mm_mul_epu32) keeps the high 32 bits of the product, i.e. a >> shift.
	return vint(mulhi_epu32(a.m_value, r.m_value));
}
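// Scalar model (illustrative only, compiled out; uint_shift_right_scalar is not part of the
// API) of the varying logical shift right above. 0x4F800000 is the bit pattern of 2^32, so
// subtracting (b << 23) yields 2^(32 - b); keeping the high 32 bits of the unsigned product
// then equals a >> b (valid for 1 <= b <= 31).
#if 0
inline uint32_t uint_shift_right_scalar(uint32_t a, uint32_t b)
{
	uint32_t scale_bits = 0x4F800000u - (b << 23); // bit pattern of 2^(32 - b)
	float scale = *(const float*)&scale_bits;
	return (uint32_t)(((uint64_t)a * (uint64_t)scale) >> 32);
}
#endif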
CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b)
{
#if 0
	CPPSPMD_ALIGN(32) int result[4];
	result[0] = extract_x(a.m_value) >> extract_x(b.m_value);
	result[1] = extract_y(a.m_value) >> extract_y(b.m_value);
	result[2] = extract_z(a.m_value) >> extract_z(b.m_value);
	result[3] = extract_w(a.m_value) >> extract_w(b.m_value);

	return vint{ _mm_load_si128((__m128i*)result) };
#elif 0
	int x = extract_x(a.m_value) >> extract_x(b.m_value);
	int y = extract_y(a.m_value) >> extract_y(b.m_value);
	int z = extract_z(a.m_value) >> extract_z(b.m_value);
	int w = extract_w(a.m_value) >> extract_w(b.m_value);

	__m128i v = insert_x(_mm_undefined_si128(), x);
	v = insert_y(v, y);
	v = insert_z(v, z);
	return vint{ insert_w(v, w) };
#else
	// Arithmetic shift via logical shift: flip the bits of negative lanes, shift in zeros, then flip back, which effectively shifts in ones.
	vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128()));
	vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask;
	return a_shifted;
#endif
}

#undef VINT_SHIFT_LEFT
#undef VINT_SHIFT_RIGHT
#undef VUINT_SHIFT_RIGHT

// Shift left/right by a uniform immediate constant
#define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) )
#define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) )
#define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) )
#define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k)))

CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { return vint(a) == vint(b); }
CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { return vint(a) == vint(b); }
CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }

CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) float values[4]; _mm_store_ps(values, v.m_value); return values[instance]; }
CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance] != 0; }

#undef VINT_EXTRACT
#undef VBOOL_EXTRACT
#undef VFLOAT_EXTRACT
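// Usage sketch (illustrative only, compiled out; sum_lanes_example is not part of the API):
// per-lane access with extract(). These helpers go through memory, so the VINT_EXTRACT /
// VFLOAT_EXTRACT macros below are preferable when the lane index is a compile-time constant.
#if 0
inline float sum_lanes_example(const vfloat& v)
{
	float sum = 0.0f;
	for (int i = 0; i < 4; i++)
		sum += extract(v, i);
	return sum;
}
#endif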
#if CPPSPMD_SSE2
// Pass in an immediate constant and the compiler will optimize these expressions.
#define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
#define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
#define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) )
#else
CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return *(const float*)&v; }

#define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
#define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
#define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance))
#endif

CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f)
{
	assert(instance < 4);
	CPPSPMD_ALIGN(16) float values[4];
	_mm_store_ps(values, v.m_value);
	values[instance] = f;
	v.m_value = _mm_load_ps(values);
	return v;
}

CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i)
{
	assert(instance < 4);
	CPPSPMD_ALIGN(16) int values[4];
	_mm_store_si128((__m128i *)values, v.m_value);
	values[instance] = i;
	v.m_value = _mm_load_si128((__m128i *)values);
	return v;
}

CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16])
{
	__m128i l = _mm_loadu_si128((const __m128i*)pTab);
	return vint{ l };
}

CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table)
{
	return vint{ shuffle_epi8(table.m_value, a.m_value) };
}

CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1)
{
	__m128i l = _mm_loadu_si128((const __m128i*)pTab);
	__m128i h = _mm_loadu_si128((const __m128i*)(pTab + 16));
	table_0.m_value = l;
	table_1.m_value = h;
}

CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1)
{
	__m128i l_0 = shuffle_epi8(table_0.m_value, a.m_value);
	__m128i h_0 = shuffle_epi8(table_1.m_value, a.m_value);

	// Move index bit 4 up to the sign bit, so blendv selects the high table for indices >= 16.
	__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);

	__m128 v_0 = blendv_ps(_mm_castsi128_ps(l_0), _mm_castsi128_ps(h_0), _mm_castsi128_ps(m_0));

	return vint{ _mm_castps_si128(v_0) };
}

CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3)
{
	__m128i a = _mm_loadu_si128((const __m128i*)pTab);
	__m128i b = _mm_loadu_si128((const __m128i*)(pTab + 16));
	__m128i c = _mm_loadu_si128((const __m128i*)(pTab + 32));
	__m128i d = _mm_loadu_si128((const __m128i*)(pTab + 48));

	table_0.m_value = a;
	table_1.m_value = b;
	table_2.m_value = c;
	table_3.m_value = d;
}

CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3)
{
	// Index bit 4 selects within each pair of tables, bit 5 selects between the pairs.
	__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);

	__m128 av_0;
	{
		__m128i al_0 = shuffle_epi8(table_0.m_value, a.m_value);
		__m128i ah_0 = shuffle_epi8(table_1.m_value, a.m_value);
		av_0 = blendv_ps(_mm_castsi128_ps(al_0), _mm_castsi128_ps(ah_0), _mm_castsi128_ps(m_0));
	}

	__m128 bv_0;
	{
		__m128i bl_0 = shuffle_epi8(table_2.m_value, a.m_value);
		__m128i bh_0 = shuffle_epi8(table_3.m_value, a.m_value);
		bv_0 = blendv_ps(_mm_castsi128_ps(bl_0), _mm_castsi128_ps(bh_0), _mm_castsi128_ps(m_0));
	}

	__m128i m2_0 = _mm_slli_epi32(a.m_value, 31 - 5);
	__m128 v2_0 = blendv_ps(av_0, bv_0, _mm_castsi128_ps(m2_0));

	return vint{ _mm_castps_si128(v2_0) };
}
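// Scalar model (illustrative only, compiled out; table_lookup5_scalar is not part of the
// API) of the 5-bit table lookup above: the low 4 index bits select a byte within a
// 16-entry half via shuffle_epi8, and index bit 4 (moved up to the sign bit) selects
// between the two halves via blendv.
#if 0
inline uint8_t table_lookup5_scalar(const uint8_t pTab[32], uint32_t idx)
{
	return pTab[idx & 31];
}
#endif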
#if 0
template<typename SPMDKernel, typename... Args>
CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args)
{
	SPMDKernel kernel;
	kernel.init(exec_mask::all_on());
	return kernel._call(std::forward<Args>(args)...);
}
#else
template<typename SPMDKernel, typename... Args>
CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args)
{
	SPMDKernel kernel;
	kernel.init(exec_mask::all_on());
	kernel._call(std::forward<Args>(args)...);
}
#endif

CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec)
{
	m_exec = kernel_exec;
	m_kernel_exec = kernel_exec;
	m_continue_mask = exec_mask::all_off();

#ifdef _DEBUG
	m_in_loop = false;
#endif
}

// Scatters src to dst's indexed locations, but only for lanes enabled in the current exec mask.
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
	for (int i = 0; i < 4; i++)
	{
		if (mask & (1 << i))
			dst.m_pValue[vindex[i]] = stored[i];
	}
	return dst;
}

// Scatters src to dst's indexed locations for all lanes, ignoring the exec mask.
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	for (int i = 0; i < 4; i++)
		dst.m_pValue[vindex[i]] = stored[i];
	return dst;
}

CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
	for (int i = 0; i < 4; i++)
	{
		if (mask & (1 << i))
			dst.m_pValue[vindex[i]] = stored[i];
	}
	return dst;
}

CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	for (int i = 0; i < 4; i++)
		dst.m_pValue[vindex[i]] = stored[i];
	return dst;
}

#include "cppspmd_flow.h"
#include "cppspmd_math.h"

} // namespace CPPSPMD (cppspmd_sse2 or cppspmd_sse41)