GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/simd/arm/avx2neon.h
#pragma once

#if !defined(__aarch64__)
#error "avx2neon is only supported for AARCH64"
#endif

#include "sse2neon.h"

#define AVX2NEON_ABI static inline __attribute__((always_inline))

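// Each 256-bit AVX type is emulated as a pair of 128-bit halves:
// 'lo' holds the lower elements and 'hi' the upper elements.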
struct __m256 {
    __m128 lo,hi;
    __m256() {}
};

struct __m256i {
    __m128i lo,hi;
    explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
    operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
    __m256i() {}
};

struct __m256d {
    float64x2_t lo,hi;
    __m256d() {}
    __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
    __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
};
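
// Helper macros: each expands to a 256-bit wrapper that applies the given
// 128-bit operation to the lo and hi halves independently (optionally
// casting through a different element type).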
#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}

#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}

#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}

#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi = basic_dst(a.hi); return res;}

#define _mm_stream_load_si128 _mm_load_si128
#define _mm256_stream_load_si256 _mm256_load_si256

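// 128-bit helpers used below: blend/popcount utilities and masked load/store,
// all emulated lane-by-lane using the sign bit (bit 31) of each 32-bit mask element.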
AVX2NEON_ABI
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
{
    __m128 af = _mm_castsi128_ps(a);
    __m128 bf = _mm_castsi128_ps(b);
    __m128 blendf = _mm_blend_ps(af, bf, imm8);
    return _mm_castps_si128(blendf);
}

AVX2NEON_ABI
int _mm_movemask_popcnt(__m128 a)
{
    return __builtin_popcount(_mm_movemask_ps(a));
}

AVX2NEON_ABI
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
{
    float32x4_t res;
    uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);
    for (int i=0;i<4;i++) {
        if (mask_u32[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;
    }
    return vreinterpretq_m128_f32(res);
}

AVX2NEON_ABI
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
{
    float32x4_t a_f32 = vreinterpretq_f32_m128(a);
    uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);
    for (int i=0;i<4;i++) {
        if (mask_u32[i] & 0x80000000) mem_addr[i] = a_f32[i];
    }
}

AVX2NEON_ABI
void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
{
    uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);
    int32x4_t a_s32 = vreinterpretq_s32_m128i(a);
    for (int i=0;i<4;i++) {
        if (mask_u32[i] & 0x80000000) mem_addr[i] = a_s32[i];
    }
}

#define _mm_fmadd_ss _mm_fmadd_ps
#define _mm_fmsub_ss _mm_fmsub_ps
#define _mm_fnmsub_ss _mm_fnmsub_ps
#define _mm_fnmadd_ss _mm_fnmadd_ps

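// Emulation of _mm_dp_ps: the high nibble of 'code' selects which element
// products enter the sum, the low nibble selects which result lanes receive it.
// The common 0x7f and 0xff cases are specialized to a horizontal add (vaddvq_f32).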
template<int code>
AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
{
    float v;
    v = 0;
    v += (code & 0x10) ? a[0]*b[0] : 0;
    v += (code & 0x20) ? a[1]*b[1] : 0;
    v += (code & 0x40) ? a[2]*b[2] : 0;
    v += (code & 0x80) ? a[3]*b[3] : 0;
    float32x4_t res;
    res[0] = (code & 0x1) ? v : 0;
    res[1] = (code & 0x2) ? v : 0;
    res[2] = (code & 0x4) ? v : 0;
    res[3] = (code & 0x8) ? v : 0;
    return res;
}

template<>
inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
{
    float v;
    float32x4_t m = _mm_mul_ps(a,b);
    m[3] = 0;
    v = vaddvq_f32(m);
    return _mm_set1_ps(v);
}

template<>
inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
{
    float v;
    float32x4_t m = _mm_mul_ps(a,b);
    v = vaddvq_f32(m);
    return _mm_set1_ps(v);
}

#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))

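// Note: the indices in 'b' are used as-is rather than being masked to their low
// two bits, so callers are expected to pass values in the range 0..3.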
AVX2NEON_ABI
__m128 _mm_permutevar_ps (__m128 a, __m128i b)
{
    uint32x4_t b_u32 = vreinterpretq_u32_m128i(b);
    float32x4_t x;
    for (int i=0;i<4;i++)
    {
        x[i] = a[b_u32[i]];
    }
    return vreinterpretq_m128_f32(x);
}

AVX2NEON_ABI
__m256i _mm256_setzero_si256()
{
    __m256i res;
    res.lo = res.hi = vdupq_n_s32(0);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_setzero_ps()
{
    __m256 res;
    res.lo = res.hi = vdupq_n_f32(0.0f);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_undefined_si256()
{
    return _mm256_setzero_si256();
}

AVX2NEON_ABI
__m256 _mm256_undefined_ps()
{
    return _mm256_setzero_ps();
}

CAST_SIMD_TYPE(__m256d, _mm256_castps_pd,    __m256,  float64x2_t)
CAST_SIMD_TYPE(__m256i, _mm256_castps_si256, __m256,  __m128i)
CAST_SIMD_TYPE(__m256,  _mm256_castsi256_ps, __m256i, __m128)
CAST_SIMD_TYPE(__m256,  _mm256_castpd_ps,    __m256d, __m128)
CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i, float64x2_t)
CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d, __m128i)

AVX2NEON_ABI
__m128 _mm256_castps256_ps128 (__m256 a)
{
    return a.lo;
}

AVX2NEON_ABI
__m256i _mm256_castsi128_si256 (__m128i a)
{
    __m256i res;
    res.lo = a;
    res.hi = vdupq_n_s32(0);
    return res;
}

AVX2NEON_ABI
__m128i _mm256_castsi256_si128 (__m256i a)
{
    return a.lo;
}

AVX2NEON_ABI
__m256 _mm256_castps128_ps256 (__m128 a)
{
    __m256 res;
    res.lo = a;
    res.hi = vdupq_n_f32(0);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_broadcast_ss (float const * mem_addr)
{
    __m256 res;
    res.lo = res.hi = vdupq_n_f32(*mem_addr);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    __m256i res;
    res.lo = _mm_set_epi32(e3,e2,e1,e0);
    res.hi = _mm_set_epi32(e7,e6,e5,e4);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_set1_epi32 (int a)
{
    __m256i res;
    res.lo = res.hi = vdupq_n_s32(a);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_set1_epi8 (int a)
{
    __m256i res;
    res.lo = res.hi = vdupq_n_s8(a);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_set1_epi16 (int a)
{
    __m256i res;
    res.lo = res.hi = vdupq_n_s16(a);
    return res;
}

AVX2NEON_ABI
int _mm256_movemask_ps(const __m256& v)
{
    return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);
}

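// Shuffles and byte shifts take their immediate as a template parameter, keeping
// it a compile-time constant for the underlying 128-bit operations; the matching
// _mm256_* macros below forward the immediate into the template argument.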
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_permute_ps (const __m256& a)
{
    __m256 res;
    res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
    res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
    return res;
}

#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)

template<int imm8>
AVX2NEON_ABI
__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
{
    __m256 res;
    res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
    res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
    return res;
}

template<int imm8>
AVX2NEON_ABI
__m256i __mm256_shuffle_epi32 (const __m256i a)
{
    __m256i res;
    res.lo = _mm_shuffle_epi32(a.lo,imm8);
    res.hi = _mm_shuffle_epi32(a.hi,imm8);
    return res;
}

template<int imm8>
AVX2NEON_ABI
__m256i __mm256_srli_si256 (__m256i a)
{
    __m256i res;
    res.lo = _mm_srli_si128(a.lo,imm8);
    res.hi = _mm_srli_si128(a.hi,imm8);
    return res;
}

template<int imm8>
AVX2NEON_ABI
__m256i __mm256_slli_si256 (__m256i a)
{
    __m256i res;
    res.lo = _mm_slli_si128(a.lo,imm8);
    res.hi = _mm_slli_si128(a.hi,imm8);
    return res;
}

#define _mm256_srli_si256(a,b) __mm256_srli_si256<b>(a)
#define _mm256_slli_si256(a,b) __mm256_slli_si256<b>(a)

#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
#define _mm256_shuffle_epi32(a,c) __mm256_shuffle_epi32<c>(a)

AVX2NEON_ABI
__m256i _mm256_set1_epi64x (long long a)
{
    __m256i res;
    int64x2_t t = vdupq_n_s64(a);
    res.lo = res.hi = __m128i(t);
    return res;
}

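// _mm256_permute2f128_ps: each nibble of imm8 selects one 128-bit half
// (0: a.lo, 1: a.hi, 2: b.lo, 3: b.hi) for the corresponding output half;
// setting bit 3 of a nibble zeroes that output half instead.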
AVX2NEON_ABI
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
{
    __m256 res;
    __m128 tmp;
    switch (imm8 & 0x7)
    {
        case 0: tmp = a.lo; break;
        case 1: tmp = a.hi; break;
        case 2: tmp = b.lo; break;
        case 3: tmp = b.hi; break;
    }
    if (imm8 & 0x8)
        tmp = _mm_setzero_ps();

    res.lo = tmp;
    imm8 >>= 4;

    switch (imm8 & 0x7)
    {
        case 0: tmp = a.lo; break;
        case 1: tmp = a.hi; break;
        case 2: tmp = b.lo; break;
        case 3: tmp = b.hi; break;
    }
    if (imm8 & 0x8)
        tmp = _mm_setzero_ps();

    res.hi = tmp;

    return res;
}

AVX2NEON_ABI
__m256 _mm256_moveldup_ps (__m256 a)
{
    __m256 res;
    res.lo = _mm_moveldup_ps(a.lo);
    res.hi = _mm_moveldup_ps(a.hi);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_movehdup_ps (__m256 a)
{
    __m256 res;
    res.lo = _mm_movehdup_ps(a.lo);
    res.hi = _mm_movehdup_ps(a.hi);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
{
    __m256 res = a;
    if (imm8 & 1) res.hi = b;
    else res.lo = b;
    return res;
}

AVX2NEON_ABI
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
{
    if (imm8 & 1) return a.hi;
    return a.lo;
}

AVX2NEON_ABI
__m256d _mm256_movedup_pd (__m256d a)
{
    __m256d res;
    res.lo = _mm_movedup_pd(a.lo);
    res.hi = _mm_movedup_pd(a.hi);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_abs_epi32(__m256i a)
{
    __m256i res;
    res.lo = vabsq_s32(a.lo);
    res.hi = vabsq_s32(a.hi);
    return res;
}

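// Bulk instantiation of 256-bit wrappers: each line below defines a _mm256_*
// intrinsic that simply applies the corresponding 128-bit operation to both halves.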
UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
UNARY_AVX_OP(__m256i,_mm256_abs_epi16,_mm_abs_epi16)

BINARY_AVX_OP(__m256i,_mm256_add_epi8,_mm_add_epi8)
BINARY_AVX_OP(__m256i,_mm256_adds_epi8,_mm_adds_epi8)

BINARY_AVX_OP(__m256i,_mm256_hadd_epi32,_mm_hadd_epi32)
BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)

BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
BINARY_AVX_OP(__m256i,_mm256_min_epi16,_mm_min_epi16)
BINARY_AVX_OP(__m256i,_mm256_max_epi16,_mm_max_epi16)
BINARY_AVX_OP(__m256i,_mm256_min_epi8,_mm_min_epi8)
BINARY_AVX_OP(__m256i,_mm256_max_epi8,_mm_max_epi8)
BINARY_AVX_OP(__m256i,_mm256_min_epu16,_mm_min_epu16)
BINARY_AVX_OP(__m256i,_mm256_max_epu16,_mm_max_epu16)
BINARY_AVX_OP(__m256i,_mm256_min_epu8,_mm_min_epu8)
BINARY_AVX_OP(__m256i,_mm256_max_epu8,_mm_max_epu8)
BINARY_AVX_OP(__m256i,_mm256_sign_epi16,_mm_sign_epi16)

BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)

BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)

BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)

BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)

BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)

BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
BINARY_AVX_OP(__m256i,_mm256_andnot_si256,_mm_andnot_si128)
BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)

BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
TERNARY_AVX_OP(__m256i,_mm256_blendv_epi8,_mm_blendv_epi8)

TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)

BINARY_AVX_OP(__m256i,_mm256_packs_epi32,_mm_packs_epi32)
BINARY_AVX_OP(__m256i,_mm256_packs_epi16,_mm_packs_epi16)
BINARY_AVX_OP(__m256i,_mm256_packus_epi32,_mm_packus_epi32)
BINARY_AVX_OP(__m256i,_mm256_packus_epi16,_mm_packus_epi16)

BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi64,_mm_unpackhi_epi64)
BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi16,_mm_unpackhi_epi16)
BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi8,_mm_unpackhi_epi8)

BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi64,_mm_unpacklo_epi64)
BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi16,_mm_unpacklo_epi16)
BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi8,_mm_unpacklo_epi8)

BINARY_AVX_OP(__m256i,_mm256_mulhrs_epi16,_mm_mulhrs_epi16)
BINARY_AVX_OP(__m256i,_mm256_mulhi_epu16,_mm_mulhi_epu16)
BINARY_AVX_OP(__m256i,_mm256_mulhi_epi16,_mm_mulhi_epi16)
//BINARY_AVX_OP(__m256i,_mm256_mullo_epu16,_mm_mullo_epu16)
BINARY_AVX_OP(__m256i,_mm256_mullo_epi16,_mm_mullo_epi16)

BINARY_AVX_OP(__m256i,_mm256_subs_epu16,_mm_subs_epu16)
BINARY_AVX_OP(__m256i,_mm256_adds_epu16,_mm_adds_epu16)
BINARY_AVX_OP(__m256i,_mm256_subs_epi16,_mm_subs_epi16)
BINARY_AVX_OP(__m256i,_mm256_adds_epi16,_mm_adds_epi16)
BINARY_AVX_OP(__m256i,_mm256_sub_epi16,_mm_sub_epi16)
BINARY_AVX_OP(__m256i,_mm256_add_epi16,_mm_add_epi16)
BINARY_AVX_OP(__m256i,_mm256_sub_epi8,_mm_sub_epi8)

BINARY_AVX_OP(__m256i,_mm256_hadd_epi16,_mm_hadd_epi16)
BINARY_AVX_OP(__m256i,_mm256_hadds_epi16,_mm_hadds_epi16)

BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)

BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi8,_mm_cmpeq_epi8)
BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi8,_mm_cmpgt_epi8)

BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi16,_mm_cmpeq_epi16)
BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi16,_mm_cmpgt_epi16)

BINARY_AVX_OP(__m256i,_mm256_shuffle_epi8,_mm_shuffle_epi8)

BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)

AVX2NEON_ABI
__m256i _mm256_cvtps_epi32 (__m256 a)
{
    __m256i res;
    res.lo = _mm_cvtps_epi32(a.lo);
    res.hi = _mm_cvtps_epi32(a.hi);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_cvttps_epi32 (__m256 a)
{
    __m256i res;
    res.lo = _mm_cvttps_epi32(a.lo);
    res.hi = _mm_cvttps_epi32(a.hi);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_loadu_ps (float const * mem_addr)
{
    __m256 res;
    res.lo = *(__m128 *)(mem_addr + 0);
    res.hi = *(__m128 *)(mem_addr + 4);
    return res;
}
#define _mm256_load_ps _mm256_loadu_ps

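// testz returns 1 only when (a & b) has no sign bits set; the float variant
// accumulates the arithmetically shifted sign bits of both halves.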
AVX2NEON_ABI
int _mm256_testz_ps (const __m256& a, const __m256& b)
{
    __m256 t = a;
    if (&a != &b)
        t = _mm256_and_ps(a,b);

    int32x4_t l = vshrq_n_s32(vreinterpretq_s32_m128(t.lo),31);
    int32x4_t h = vshrq_n_s32(vreinterpretq_s32_m128(t.hi),31);
    return vaddvq_s32(vaddq_s32(l,h)) == 0;
}

AVX2NEON_ABI
__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
{
    __m256i res;
    int64x2_t t0 = {e0,e1};
    int64x2_t t1 = {e2,e3};
    res.lo = __m128i(t0);
    res.hi = __m128i(t1);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_setr_epi64x (int64_t e0, int64_t e1, int64_t e2, int64_t e3)
{
    __m256i res;
    int64x2_t t0 = {e0,e1};
    int64x2_t t1 = {e2,e3};
    res.lo = __m128i(t0);
    res.hi = __m128i(t1);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
{
    int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
    int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31};
    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_setr_epi8 (char e0, char e1, char e2, char e3, char e4, char e5, char e6, char e7, char e8, char e9, char e10, char e11, char e12, char e13, char e14, char e15, char e16, char e17, char e18, char e19, char e20, char e21, char e22, char e23, char e24, char e25, char e26, char e27, char e28, char e29, char e30, char e31)
{
    int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
    int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31};
    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
{
    int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7};
    int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15};
    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_setr_epi16 (short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15)
{
    int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7};
    int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15};
    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
int _mm256_movemask_epi8(const __m256i& a)
{
    return (_mm_movemask_epi8(a.hi) << 16) | _mm_movemask_epi8(a.lo);
}

AVX2NEON_ABI
int _mm256_testz_si256(const __m256i& a,const __m256i& b)
{
    uint32x4_t lo = vandq_u32(a.lo,b.lo);
    uint32x4_t hi = vandq_u32(a.hi,b.hi);

    return (vaddvq_u32(lo) + vaddvq_u32(hi)) == 0;
}

AVX2NEON_ABI
__m256d _mm256_setzero_pd ()
{
    __m256d res;
    res.lo = res.hi = vdupq_n_f64(0);
    return res;
}

AVX2NEON_ABI
int _mm256_movemask_pd (__m256d a)
{
    return (_mm_movemask_pd(a.hi) << 2) | _mm_movemask_pd(a.lo);
}

AVX2NEON_ABI
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
{
    __m256i res;
    res.lo = _mm_cmpeq_epi64(a.lo, b.lo);
    res.hi = _mm_cmpeq_epi64(a.hi, b.hi);
    return res;
}

AVX2NEON_ABI
__m256d _mm256_cmpeq_pd (__m256d a, __m256d b)
{
    __m256d res;
    res.lo = _mm_cmpeq_pd(a.lo, b.lo);
    res.hi = _mm_cmpeq_pd(a.hi, b.hi);
    return res;
}

AVX2NEON_ABI
int _mm256_testz_pd (const __m256d& a, const __m256d& b)
{
    __m256d t = a;

    if (&a != &b)
        t = _mm256_and_pd(a,b);

    return _mm256_movemask_pd(t) == 0;
}

AVX2NEON_ABI
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
{
    __m256d res;
    res.lo = _mm_blendv_pd(a.lo, b.lo, mask.lo);
    res.hi = _mm_blendv_pd(a.hi, b.hi, mask.hi);
    return res;
}

template<int imm8>
AVX2NEON_ABI
__m256 __mm256_dp_ps (__m256 a, __m256 b)
{
    __m256 res;
    res.lo = _mm_dp_ps(a.lo, b.lo, imm8);
    res.hi = _mm_dp_ps(a.hi, b.hi, imm8);
    return res;
}

#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)

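// _mm256_permute4x64_pd is built from a scalar helper that extracts the 64-bit
// element selected by two bits of imm8.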
AVX2NEON_ABI
double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
{
    switch (imm8 & 3) {
        case 0:
            return ((float64x2_t)a.lo)[0];
        case 1:
            return ((float64x2_t)a.lo)[1];
        case 2:
            return ((float64x2_t)a.hi)[0];
        case 3:
            return ((float64x2_t)a.hi)[1];
    }
    __builtin_unreachable();
    return 0;
}

AVX2NEON_ABI
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
{
    float64x2_t lo,hi;
    lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);
    lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);
    hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);
    hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);

    __m256d res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
{
    return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
}

AVX2NEON_ABI
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
{
    __m256i res;
    res.lo = *(__m128i *)((int32_t *)mem_addr + 0);
    res.hi = *(__m128i *)((int32_t *)mem_addr + 4);
    return res;
}

#define _mm256_load_si256 _mm256_loadu_si256

AVX2NEON_ABI
void _mm256_storeu_ps (float * mem_addr, __m256 a)
{
    *(__m128 *)(mem_addr + 0) = a.lo;
    *(__m128 *)(mem_addr + 4) = a.hi;
}

#define _mm256_store_ps _mm256_storeu_ps
#define _mm256_stream_ps _mm256_storeu_ps

AVX2NEON_ABI
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
{
    *(__m128i *)((int32_t *)mem_addr + 0) = a.lo;
    *(__m128i *)((int32_t *)mem_addr + 4) = a.hi;
}

#define _mm256_store_si256 _mm256_storeu_si256

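// _mm256_permute4x64_epi64: build per-byte indices for the four 64-bit lanes
// selected by imm8 and gather them from both halves at once with a
// two-register table lookup (vqtbl2q_u8).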
AVX2NEON_ABI
__m256i _mm256_permute4x64_epi64 (const __m256i a, const int imm8)
{
    uint8x16x2_t tbl = {a.lo, a.hi};

    uint8_t sz = sizeof(uint64_t);
    uint8_t u64[4] = {
        (uint8_t)(((imm8 >> 0) & 0x3) * sz),
        (uint8_t)(((imm8 >> 2) & 0x3) * sz),
        (uint8_t)(((imm8 >> 4) & 0x3) * sz),
        (uint8_t)(((imm8 >> 6) & 0x3) * sz),
    };

    uint8x16_t idx_lo = {
        // lo[0] bytes
        (uint8_t)(u64[0]+0), (uint8_t)(u64[0]+1), (uint8_t)(u64[0]+2), (uint8_t)(u64[0]+3),
        (uint8_t)(u64[0]+4), (uint8_t)(u64[0]+5), (uint8_t)(u64[0]+6), (uint8_t)(u64[0]+7),

        // lo[1] bytes
        (uint8_t)(u64[1]+0), (uint8_t)(u64[1]+1), (uint8_t)(u64[1]+2), (uint8_t)(u64[1]+3),
        (uint8_t)(u64[1]+4), (uint8_t)(u64[1]+5), (uint8_t)(u64[1]+6), (uint8_t)(u64[1]+7),
    };
    uint8x16_t idx_hi = {
        // hi[0] bytes
        (uint8_t)(u64[2]+0), (uint8_t)(u64[2]+1), (uint8_t)(u64[2]+2), (uint8_t)(u64[2]+3),
        (uint8_t)(u64[2]+4), (uint8_t)(u64[2]+5), (uint8_t)(u64[2]+6), (uint8_t)(u64[2]+7),

        // hi[1] bytes
        (uint8_t)(u64[3]+0), (uint8_t)(u64[3]+1), (uint8_t)(u64[3]+2), (uint8_t)(u64[3]+3),
        (uint8_t)(u64[3]+4), (uint8_t)(u64[3]+5), (uint8_t)(u64[3]+6), (uint8_t)(u64[3]+7),
    };

    uint8x16_t lo = vqtbl2q_u8(tbl, idx_lo);
    uint8x16_t hi = vqtbl2q_u8(tbl, idx_hi);

    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_permute2x128_si256(const __m256i a,const __m256i b, const int imm8)
{
    return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
}

AVX2NEON_ABI
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
{
    __m256 res;
    res.lo = _mm_maskload_ps(mem_addr,mask.lo);
    res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
    return res;
}

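// Widening conversions: vmovl/vmovl_high zero- or sign-extend into the two
// result halves; the 8-bit to 32-bit variants widen in two steps and, like the
// x86 intrinsics, only consume the low bytes of the 128-bit source.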
AVX2NEON_ABI
__m256i _mm256_cvtepu8_epi32 (__m128i a)
{
    uint8x16_t a_u8 = vreinterpretq_u8_m128i(a);    // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(a_u8)); // 00HH 00GG 00FF 00EE 00DD 00CC 00BB 00AA
    uint32x4_t lo = vmovl_u16(vget_low_u16(u16x8)); // 0000 00DD 0000 00CC 0000 00BB 0000 00AA
    uint32x4_t hi = vmovl_high_u16(u16x8);          // 0000 00HH 0000 00GG 0000 00FF 0000 00EE

    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_cvtepi8_epi32 (__m128i a)
{
    int8x16_t a_s8 = vreinterpretq_s8_m128i(a);     // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(a_s8));  // ssHH ssGG ssFF ssEE ssDD ssCC ssBB ssAA
    int32x4_t lo = vmovl_s16(vget_low_s16(s16x8));  // ssss ssDD ssss ssCC ssss ssBB ssss ssAA
    int32x4_t hi = vmovl_high_s16(s16x8);           // ssss ssHH ssss ssGG ssss ssFF ssss ssEE

    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_cvtepi16_epi32 (__m128i a)
{
    int16x8_t a_s16 = vreinterpretq_s16_m128i(a);   // HHHH GGGG FFFF EEEE DDDD CCCC BBBB AAAA
    int32x4_t lo = vmovl_s16(vget_low_s16(a_s16));  // ssss DDDD ssss CCCC ssss BBBB ssss AAAA
    int32x4_t hi = vmovl_high_s16(a_s16);           // ssss HHHH ssss GGGG ssss FFFF ssss EEEE

    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
{
    _mm_maskstore_epi32(mem_addr,mask.lo,a.lo);
    _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);
}

AVX2NEON_ABI
__m256i _mm256_slli_epi64 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_slli_epi64(a.lo,imm8);
    res.hi = _mm_slli_epi64(a.hi,imm8);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_slli_epi32 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_slli_epi32(a.lo,imm8);
    res.hi = _mm_slli_epi32(a.hi,imm8);
    return res;
}

AVX2NEON_ABI
__m256i __mm256_slli_epi16 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_slli_epi16(a.lo,imm8);
    res.hi = _mm_slli_epi16(a.hi,imm8);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_srli_epi32 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_srli_epi32(a.lo,imm8);
    res.hi = _mm_srli_epi32(a.hi,imm8);
    return res;
}

AVX2NEON_ABI
__m256i __mm256_srli_epi16 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_srli_epi16(a.lo,imm8);
    res.hi = _mm_srli_epi16(a.hi,imm8);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_cvtepu16_epi32(__m128i a)
{
    __m256i res;
    res.lo = vmovl_u16(vget_low_u16(a));
    res.hi = vmovl_high_u16(a);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_cvtepu8_epi16(__m128i a)
{
    __m256i res;
    res.lo = vmovl_u8(vget_low_u8(a));
    res.hi = vmovl_high_u8(a);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_srai_epi32 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_srai_epi32(a.lo,imm8);
    res.hi = _mm_srai_epi32(a.hi,imm8);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_srai_epi16 (__m256i a, int imm8)
{
    __m256i res;
    res.lo = _mm_srai_epi16(a.lo,imm8);
    res.hi = _mm_srai_epi16(a.hi,imm8);
    return res;
}

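// Variable shifts: NEON only has a variable left shift (vshlq), so the right
// shifts are implemented by shifting left with a negated count.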
AVX2NEON_ABI
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
{
    __m256i res;
    res.lo = vshlq_s32(a.lo,count.lo);
    res.hi = vshlq_s32(a.hi,count.hi);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
{
    __m256i res;
    res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
    res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
    return res;
}

AVX2NEON_ABI
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
{
    __m256i res;
    res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
    res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
    return res;
}

AVX2NEON_ABI
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
{
    return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
}

AVX2NEON_ABI
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
{
    if (imm8 & 1) return a.hi;
    return a.lo;
}

AVX2NEON_ABI
__m256 _mm256_set1_ps(float x)
{
    __m256 res;
    res.lo = res.hi = vdupq_n_f32(x);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    __m256 res;
    res.lo = _mm_set_ps(e3,e2,e1,e0);
    res.hi = _mm_set_ps(e7,e6,e5,e4);
    return res;
}

AVX2NEON_ABI
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
{
    __m256 res;
    res.lo = res.hi = *mem_addr;
    return res;
}

AVX2NEON_ABI
__m256 _mm256_cvtepi32_ps (__m256i a)
{
    __m256 res;
    res.lo = _mm_cvtepi32_ps(a.lo);
    res.hi = _mm_cvtepi32_ps(a.hi);
    return res;
}

AVX2NEON_ABI
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
{
    uint32x4_t mask_lo = mask.lo;
    uint32x4_t mask_hi = mask.hi;
    float32x4_t a_lo = a.lo;
    float32x4_t a_hi = a.hi;

    for (int i=0;i<4;i++) {
        if (mask_lo[i] & 0x80000000) mem_addr[i] = a_lo[i];
        if (mask_hi[i] & 0x80000000) mem_addr[i+4] = a_hi[i];
    }
}

AVX2NEON_ABI
__m256d _mm256_andnot_pd (__m256d a, __m256d b)
{
    __m256d res;
    res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
    res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
    return res;
}

AVX2NEON_ABI
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
{
    __m256 res;
    res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
    res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
    return res;
}

AVX2NEON_ABI
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
{
    return __m256i(_mm256_blend_ps(__m256(a),__m256(b),imm8));
}

AVX2NEON_ABI
__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)
{
    __m256i res;
    res.lo = _mm_blend_epi16(a.lo,b.lo,imm8);
    res.hi = _mm_blend_epi16(a.hi,b.hi,imm8);
    return res;
}

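// Gathers have no NEON equivalent and are emulated with scalar loads; 'scale' is
// applied to each 32-bit index in bytes. In the masked variant a lane is loaded
// only when the sign bit of its mask element is set; other lanes are zeroed and
// the 'src' argument is ignored.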
AVX2NEON_ABI
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
{
    int32x4_t vindex_lo = vindex.lo;
    int32x4_t vindex_hi = vindex.hi;
    int32x4_t lo,hi;
    for (int i=0;i<4;i++)
    {
        lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale));
        hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale));
    }

    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}

AVX2NEON_ABI
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
{
    uint32x4_t mask_lo = mask.lo;
    uint32x4_t mask_hi = mask.hi;
    int32x4_t vindex_lo = vindex.lo;
    int32x4_t vindex_hi = vindex.hi;
    int32x4_t lo,hi;
    lo = hi = _mm_setzero_si128();
    for (int i=0;i<4;i++)
    {
        if (mask_lo[i] >> 31) lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale));
        if (mask_hi[i] >> 31) hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale));
    }

    __m256i res;
    res.lo = lo; res.hi = hi;
    return res;
}