1
/*
2
* Copyright 2020 The Emscripten Authors. All rights reserved.
3
* Emscripten is available under two separate licenses, the MIT license and the
4
* University of Illinois/NCSA Open Source License. Both these licenses can be
5
* found in the LICENSE file.
6
*/
7
8
#ifndef __emscripten_immintrin_h__
9
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
10
#endif
11
12
#ifndef __emscripten_avxintrin_h__
13
#define __emscripten_avxintrin_h__
14
15
#ifndef __AVX__
16
#error "AVX instruction set not enabled"
17
#endif
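/*
 * AVX's 256-bit vector types have no direct counterpart in 128-bit Wasm SIMD,
 * so each type below is emulated as a pair of 128-bit halves: v0 holds the
 * low 128 bits and v1 the high 128 bits. Most 256-bit intrinsics in this
 * header are implemented by applying the corresponding 128-bit intrinsic to
 * each half.
 */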
18
19
typedef struct {
20
__m128d v0;
21
__m128d v1;
22
} __m256d;
23
24
typedef struct {
25
__m128 v0;
26
__m128 v1;
27
} __m256;
28
29
typedef struct {
30
__m128i v0;
31
__m128i v1;
32
} __m256i;
33
34
typedef int64_t __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));
35
36
typedef struct {
37
__m128i_u v0;
38
__m128i_u v1;
39
} __m256i_u;
40
41
union __m256_data {
42
__m256i int_view;
43
__m256d double_view;
44
__m256 float_view;
45
__m128i_u int_u_view;
46
};
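/* Helper union used by the _mm256_cast* functions near the end of this header
 * to reinterpret the bits of one 256-bit view as another without changing
 * them. */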
47
48
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
49
_mm256_add_pd(__m256d __a, __m256d __b) {
50
__m256d ret;
51
ret.v0 = _mm_add_pd(__a.v0, __b.v0);
52
ret.v1 = _mm_add_pd(__a.v1, __b.v1);
53
return ret;
54
}
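/* Illustrative usage sketch (not part of this header; the array names are
 * hypothetical): adding two double arrays four elements at a time. Each
 * 256-bit call expands to two Wasm 128-bit operations, one per half.
 *
 *   __m256d x = _mm256_loadu_pd(&a[i]);
 *   __m256d y = _mm256_loadu_pd(&b[i]);
 *   _mm256_storeu_pd(&out[i], _mm256_add_pd(x, y));
 */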
55
56
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
57
_mm256_add_ps(__m256 __a, __m256 __b) {
58
__m256 ret;
59
ret.v0 = _mm_add_ps(__a.v0, __b.v0);
60
ret.v1 = _mm_add_ps(__a.v1, __b.v1);
61
return ret;
62
}
63
64
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
65
_mm256_sub_pd(__m256d __a, __m256d __b) {
66
__m256d ret;
67
ret.v0 = _mm_sub_pd(__a.v0, __b.v0);
68
ret.v1 = _mm_sub_pd(__a.v1, __b.v1);
69
return ret;
70
}
71
72
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
73
_mm256_sub_ps(__m256 __a, __m256 __b) {
74
__m256 ret;
75
ret.v0 = _mm_sub_ps(__a.v0, __b.v0);
76
ret.v1 = _mm_sub_ps(__a.v1, __b.v1);
77
return ret;
78
}
79
80
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
81
_mm256_addsub_pd(__m256d __a, __m256d __b) {
82
__m256d ret;
83
ret.v0 = _mm_addsub_pd(__a.v0, __b.v0);
84
ret.v1 = _mm_addsub_pd(__a.v1, __b.v1);
85
return ret;
86
}
87
88
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
89
_mm256_addsub_ps(__m256 __a, __m256 __b) {
90
__m256 ret;
91
ret.v0 = _mm_addsub_ps(__a.v0, __b.v0);
92
ret.v1 = _mm_addsub_ps(__a.v1, __b.v1);
93
return ret;
94
}
95
96
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
97
_mm256_div_pd(__m256d __a, __m256d __b) {
98
__m256d ret;
99
ret.v0 = _mm_div_pd(__a.v0, __b.v0);
100
ret.v1 = _mm_div_pd(__a.v1, __b.v1);
101
return ret;
102
}
103
104
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
105
_mm256_div_ps(__m256 __a, __m256 __b) {
106
__m256 ret;
107
ret.v0 = _mm_div_ps(__a.v0, __b.v0);
108
ret.v1 = _mm_div_ps(__a.v1, __b.v1);
109
return ret;
110
}
111
112
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
113
_mm256_max_pd(__m256d __a, __m256d __b) {
114
__m256d ret;
115
ret.v0 = _mm_max_pd(__a.v0, __b.v0);
116
ret.v1 = _mm_max_pd(__a.v1, __b.v1);
117
return ret;
118
}
119
120
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
121
_mm256_max_ps(__m256 __a, __m256 __b) {
122
__m256 ret;
123
ret.v0 = _mm_max_ps(__a.v0, __b.v0);
124
ret.v1 = _mm_max_ps(__a.v1, __b.v1);
125
return ret;
126
}
127
128
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
129
_mm256_min_pd(__m256d __a, __m256d __b) {
130
__m256d ret;
131
ret.v0 = _mm_min_pd(__a.v0, __b.v0);
132
ret.v1 = _mm_min_pd(__a.v1, __b.v1);
133
return ret;
134
}
135
136
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
137
_mm256_min_ps(__m256 __a, __m256 __b) {
138
__m256 ret;
139
ret.v0 = _mm_min_ps(__a.v0, __b.v0);
140
ret.v1 = _mm_min_ps(__a.v1, __b.v1);
141
return ret;
142
}
143
144
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
145
_mm256_mul_pd(__m256d __a, __m256d __b) {
146
__m256d ret;
147
ret.v0 = _mm_mul_pd(__a.v0, __b.v0);
148
ret.v1 = _mm_mul_pd(__a.v1, __b.v1);
149
return ret;
150
}
151
152
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
153
_mm256_mul_ps(__m256 __a, __m256 __b) {
154
__m256 ret;
155
ret.v0 = _mm_mul_ps(__a.v0, __b.v0);
156
ret.v1 = _mm_mul_ps(__a.v1, __b.v1);
157
return ret;
158
}
159
160
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
161
_mm256_sqrt_pd(__m256d __a) {
162
__m256d ret;
163
ret.v0 = _mm_sqrt_pd(__a.v0);
164
ret.v1 = _mm_sqrt_pd(__a.v1);
165
return ret;
166
}
167
168
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
169
_mm256_sqrt_ps(__m256 __a) {
170
__m256 ret;
171
ret.v0 = _mm_sqrt_ps(__a.v0);
172
ret.v1 = _mm_sqrt_ps(__a.v1);
173
return ret;
174
}
175
176
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
177
_mm256_rsqrt_ps(__m256 __a) {
178
__m256 ret;
179
ret.v0 = _mm_rsqrt_ps(__a.v0);
180
ret.v1 = _mm_rsqrt_ps(__a.v1);
181
return ret;
182
}
183
184
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
185
_mm256_rcp_ps(__m256 __a) {
186
__m256 ret;
187
ret.v0 = _mm_rcp_ps(__a.v0);
188
ret.v1 = _mm_rcp_ps(__a.v1);
189
return ret;
190
}
191
192
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
193
_mm256_round_pd(__m256d __a, int __rounding) {
194
__m256d ret;
195
ret.v0 = _mm_round_pd(__a.v0, __rounding);
196
ret.v1 = _mm_round_pd(__a.v1, __rounding);
197
return ret;
198
}
199
200
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
201
_mm256_round_ps(__m256 __a, int __rounding) {
202
__m256 ret;
203
ret.v0 = _mm_round_ps(__a.v0, __rounding);
204
ret.v1 = _mm_round_ps(__a.v1, __rounding);
205
return ret;
206
}
207
208
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
209
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
210
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
211
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
212
213
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
214
_mm256_and_pd(__m256d __a, __m256d __b) {
215
__m256d ret;
216
ret.v0 = _mm_and_pd(__a.v0, __b.v0);
217
ret.v1 = _mm_and_pd(__a.v1, __b.v1);
218
return ret;
219
}
220
221
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
222
_mm256_and_ps(__m256 __a, __m256 __b) {
223
__m256 ret;
224
ret.v0 = _mm_and_ps(__a.v0, __b.v0);
225
ret.v1 = _mm_and_ps(__a.v1, __b.v1);
226
return ret;
227
}
228
229
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
230
_mm256_andnot_pd(__m256d __a, __m256d __b) {
231
__m256d ret;
232
ret.v0 = _mm_andnot_pd(__a.v0, __b.v0);
233
ret.v1 = _mm_andnot_pd(__a.v1, __b.v1);
234
return ret;
235
}
236
237
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
238
_mm256_andnot_ps(__m256 __a, __m256 __b) {
239
__m256 ret;
240
ret.v0 = _mm_andnot_ps(__a.v0, __b.v0);
241
ret.v1 = _mm_andnot_ps(__a.v1, __b.v1);
242
return ret;
243
}
244
245
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
246
_mm256_or_pd(__m256d __a, __m256d __b) {
247
__m256d ret;
248
ret.v0 = _mm_or_pd(__a.v0, __b.v0);
249
ret.v1 = _mm_or_pd(__a.v1, __b.v1);
250
return ret;
251
}
252
253
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
254
_mm256_or_ps(__m256 __a, __m256 __b) {
255
__m256 ret;
256
ret.v0 = _mm_or_ps(__a.v0, __b.v0);
257
ret.v1 = _mm_or_ps(__a.v1, __b.v1);
258
return ret;
259
}
260
261
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
262
_mm256_xor_pd(__m256d __a, __m256d __b) {
263
__m256d ret;
264
ret.v0 = _mm_xor_pd(__a.v0, __b.v0);
265
ret.v1 = _mm_xor_pd(__a.v1, __b.v1);
266
return ret;
267
}
268
269
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
270
_mm256_xor_ps(__m256 __a, __m256 __b) {
271
__m256 ret;
272
ret.v0 = _mm_xor_ps(__a.v0, __b.v0);
273
ret.v1 = _mm_xor_ps(__a.v1, __b.v1);
274
return ret;
275
}
276
277
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
278
_mm256_hadd_pd(__m256d __a, __m256d __b) {
279
__m256d ret;
280
ret.v0 = _mm_hadd_pd(__a.v0, __b.v0);
281
ret.v1 = _mm_hadd_pd(__a.v1, __b.v1);
282
return ret;
283
}
284
285
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
286
_mm256_hadd_ps(__m256 __a, __m256 __b) {
287
__m256 ret;
288
ret.v0 = _mm_hadd_ps(__a.v0, __b.v0);
289
ret.v1 = _mm_hadd_ps(__a.v1, __b.v1);
290
return ret;
291
}
292
293
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
294
_mm256_hsub_pd(__m256d __a, __m256d __b) {
295
__m256d ret;
296
ret.v0 = _mm_hsub_pd(__a.v0, __b.v0);
297
ret.v1 = _mm_hsub_pd(__a.v1, __b.v1);
298
return ret;
299
}
300
301
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
302
_mm256_hsub_ps(__m256 __a, __m256 __b) {
303
__m256 ret;
304
ret.v0 = _mm_hsub_ps(__a.v0, __b.v0);
305
ret.v1 = _mm_hsub_ps(__a.v1, __b.v1);
306
return ret;
307
}
308
309
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
310
_mm_permutevar_pd(__m128d __a, __m128i __c) {
311
return (__m128d)wasm_f64x2_make(
312
((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 0) >> 1) & 1],
313
((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 1) >> 1) & 1]);
314
}
315
316
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
317
_mm256_permutevar_pd(__m256d __a, __m256i __c) {
318
__m256d ret;
319
ret.v0 = _mm_permutevar_pd(__a.v0, __c.v0);
320
ret.v1 = _mm_permutevar_pd(__a.v1, __c.v1);
321
return ret;
322
}
323
324
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
325
_mm_permutevar_ps(__m128 __a, __m128i __c) {
326
return (__m128)wasm_f32x4_make(
327
((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 0) & 3],
328
((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 1) & 3],
329
((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 2) & 3],
330
((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 3) & 3]);
331
}
332
333
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
334
_mm256_permutevar_ps(__m256 __a, __m256i __c) {
335
__m256 ret;
336
ret.v0 = _mm_permutevar_ps(__a.v0, __c.v0);
337
ret.v1 = _mm_permutevar_ps(__a.v1, __c.v1);
338
return ret;
339
}
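/* In the _mm_permute_ps/_mm_permute_pd macros below, each bit field of the
 * immediate selects the source lane for one output lane; for example,
 * _mm_permute_ps(v, _MM_SHUFFLE(3, 2, 1, 0)) leaves v unchanged.
 * _mm256_permute_ps applies the same 8-bit selector to both halves, while
 * _mm256_permute_pd uses immediate bits 0-1 for the low half and bits 2-3 for
 * the high half. */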
340
341
#define _mm_permute_pd(__a, __imm) \
342
((__m128d)wasm_i64x2_shuffle( \
343
(__m128d)(__a), (__m128d)(__a), ((__imm) & 1), (((__imm) >> 1) & 1)))
344
345
#define _mm256_permute_pd(__A, __imm) \
346
__extension__({ \
347
__m256d __a = (__A); \
348
_mm256_set_m128d(_mm_permute_pd(__a.v1, (__imm) >> 2), \
349
_mm_permute_pd(__a.v0, (__imm))); \
350
})
351
352
#define _mm_permute_ps(__a, __imm) \
353
((__m128)wasm_i32x4_shuffle((__m128)(__a), \
354
(__m128)(__a), \
355
((__imm) & 3), \
356
(((__imm) >> 2) & 3), \
357
(((__imm) >> 4) & 3), \
358
(((__imm) >> 6) & 3)))
359
360
#define _mm256_permute_ps(__A, __imm) \
361
__extension__({ \
362
__m256 __a = (__A); \
363
_mm256_set_m128(_mm_permute_ps(__a.v1, (__imm)), \
364
_mm_permute_ps(__a.v0, (__imm))); \
365
})
366
367
static __inline__ __m128d
368
__avx_select4d(__m256d __a, __m256d __b, const int imm8) {
369
switch (imm8 & 0xF) {
370
case 0:
371
case 4:
372
return __a.v0;
373
case 1:
374
case 5:
375
return __a.v1;
376
case 2:
377
case 6:
378
return __b.v0;
379
case 3:
380
case 7:
381
return __b.v1;
382
default:
383
return (__m128d)wasm_i64x2_const_splat(0);
384
}
385
}
386
387
static __inline__ __m128 __avx_select4(__m256 __a, __m256 __b, const int imm8) {
388
switch (imm8 & 0xF) {
389
case 0:
390
case 4:
391
return __a.v0;
392
case 1:
393
case 5:
394
return __a.v1;
395
case 2:
396
case 6:
397
return __b.v0;
398
case 3:
399
case 7:
400
return __b.v1;
401
default:
402
return (__m128)wasm_i64x2_const_splat(0);
403
}
404
}
405
406
static __inline__ __m128i
407
__avx_select4i(__m256i __a, __m256i __b, const int imm8) {
408
switch (imm8 & 0xF) {
409
case 0:
410
case 4:
411
return __a.v0;
412
case 1:
413
case 5:
414
return __a.v1;
415
case 2:
416
case 6:
417
return __b.v0;
418
case 3:
419
case 7:
420
return __b.v1;
421
default:
422
return wasm_i64x2_const_splat(0);
423
}
424
}
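/* In the _mm256_permute2f128_* functions below, each 4-bit nibble of imm8
 * selects one 128-bit half for the corresponding output half: 0 = low(a),
 * 1 = high(a), 2 = low(b), 3 = high(b). Setting bit 3 of a nibble zeroes that
 * half instead (the default case in the __avx_select4* helpers above). */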
425
426
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
427
_mm256_permute2f128_pd(__m256d __a, __m256d __b, const int imm8) {
428
__m256d ret;
429
ret.v0 = __avx_select4d(__a, __b, imm8);
430
ret.v1 = __avx_select4d(__a, __b, imm8 >> 4);
431
return ret;
432
}
433
434
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
435
_mm256_permute2f128_ps(__m256 __a, __m256 __b, const int imm8) {
436
__m256 ret;
437
ret.v0 = __avx_select4(__a, __b, imm8);
438
ret.v1 = __avx_select4(__a, __b, imm8 >> 4);
439
return ret;
440
}
441
442
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
443
_mm256_permute2f128_si256(__m256i __a, __m256i __b, const int imm8) {
444
__m256i ret;
445
ret.v0 = __avx_select4i(__a, __b, imm8);
446
ret.v1 = __avx_select4i(__a, __b, imm8 >> 4);
447
return ret;
448
}
449
450
#define _mm256_blend_pd(__A, __B, imm8) \
451
__extension__({ \
452
__m256d __a = (__A); \
453
__m256d __b = (__B); \
454
_mm256_set_m128d(_mm_blend_pd(__a.v1, __b.v1, (imm8) >> 2), \
455
_mm_blend_pd(__a.v0, __b.v0, (imm8))); \
456
})
457
458
#define _mm256_blend_ps(__A, __B, imm) \
459
__extension__({ \
460
__m256 __a = (__A); \
461
__m256 __b = (__B); \
462
_mm256_set_m128(_mm_blend_ps(__a.v1, __b.v1, (imm) >> 4), \
463
_mm_blend_ps(__a.v0, __b.v0, (imm))); \
464
})
465
466
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
467
_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
468
__m256d ret;
469
ret.v0 = _mm_blendv_pd(__a.v0, __b.v0, __c.v0);
470
ret.v1 = _mm_blendv_pd(__a.v1, __b.v1, __c.v1);
471
return ret;
472
}
473
474
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
475
_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
476
__m256 ret;
477
ret.v0 = _mm_blendv_ps(__a.v0, __b.v0, __c.v0);
478
ret.v1 = _mm_blendv_ps(__a.v1, __b.v1, __c.v1);
479
return ret;
480
}
481
482
#define _mm256_dp_ps(__A, __B, imm) \
483
__extension__({ \
484
__m256 __a = (__A); \
485
__m256 __b = (__B); \
486
_mm256_set_m128(_mm_dp_ps(__a.v1, __b.v1, (imm)), \
487
_mm_dp_ps(__a.v0, __b.v0, (imm))); \
488
})
489
490
#define _mm256_shuffle_ps(__A, __B, mask) \
491
__extension__({ \
492
__m256 __a = (__A); \
493
__m256 __b = (__B); \
494
_mm256_set_m128(_mm_shuffle_ps(__a.v1, __b.v1, (mask)), \
495
_mm_shuffle_ps(__a.v0, __b.v0, (mask))); \
496
})
497
498
#define _mm256_shuffle_pd(__A, __B, mask) \
499
__extension__({ \
500
__m256d __a = (__A); \
501
__m256d __b = (__B); \
502
_mm256_set_m128d(_mm_shuffle_pd(__a.v1, __b.v1, (mask) >> 2), \
503
_mm_shuffle_pd(__a.v0, __b.v0, (mask))); \
504
})
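/* Comparison predicates for the _mm_cmp_* / _mm256_cmp_* operations below.
 * The quiet (Q) and signaling (S) forms of each predicate are handled
 * identically in this emulation, since Wasm has no floating-point exception
 * state to signal; only the ordered/unordered behavior matters here. */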
505
506
#define _CMP_EQ_OQ 0
507
#define _CMP_LT_OS 1
508
#define _CMP_LE_OS 2
509
#define _CMP_UNORD_Q 3
510
#define _CMP_NEQ_UQ 4
511
#define _CMP_NLT_US 5
512
#define _CMP_NLE_US 6
513
#define _CMP_ORD_Q 7
514
#define _CMP_EQ_UQ 8
515
#define _CMP_NGE_US 9
516
#define _CMP_NGT_US 10
517
#define _CMP_FALSE_OQ 11
518
#define _CMP_NEQ_OQ 12
519
#define _CMP_GE_OS 13
520
#define _CMP_GT_OS 14
521
#define _CMP_TRUE_UQ 15
522
#define _CMP_EQ_OS 16
523
#define _CMP_LT_OQ 17
524
#define _CMP_LE_OQ 18
525
#define _CMP_UNORD_S 19
526
#define _CMP_NEQ_US 20
527
#define _CMP_NLT_UQ 21
528
#define _CMP_NLE_UQ 22
529
#define _CMP_ORD_S 23
530
#define _CMP_EQ_US 24
531
#define _CMP_NGE_UQ 25
532
#define _CMP_NGT_UQ 26
533
#define _CMP_FALSE_OS 27
534
#define _CMP_NEQ_OS 28
535
#define _CMP_GE_OQ 29
536
#define _CMP_GT_OQ 30
537
#define _CMP_TRUE_US 31
538
539
#define _mm_cmp_pd(__a, __b, __imm) \
540
__extension__({ \
541
__m128d __ret; \
542
switch ((__imm)) { \
543
case _CMP_EQ_OQ: \
544
case _CMP_EQ_OS: \
545
__ret = _mm_cmpeq_pd((__a), (__b)); \
546
break; \
547
case _CMP_EQ_UQ: \
548
case _CMP_EQ_US: \
549
__ret = _mm_or_pd(_mm_cmpeq_pd((__a), (__b)), \
550
_mm_cmpunord_pd((__a), (__b))); \
551
break; \
552
case _CMP_LT_OS: \
553
case _CMP_LT_OQ: \
554
__ret = _mm_cmplt_pd((__a), (__b)); \
555
break; \
556
case _CMP_LE_OS: \
557
case _CMP_LE_OQ: \
558
__ret = _mm_cmple_pd((__a), (__b)); \
559
break; \
560
case _CMP_UNORD_Q: \
561
case _CMP_UNORD_S: \
562
__ret = _mm_cmpunord_pd((__a), (__b)); \
563
break; \
564
case _CMP_NEQ_UQ: \
565
case _CMP_NEQ_US: \
566
__ret = _mm_cmpneq_pd((__a), (__b)); \
567
break; \
568
case _CMP_NEQ_OQ: \
569
case _CMP_NEQ_OS: \
570
__ret = _mm_andnot_pd(_mm_cmpunord_pd((__a), (__b)), \
571
_mm_cmpneq_pd((__a), (__b))); \
572
break; \
573
case _CMP_NLT_US: \
574
case _CMP_NLT_UQ: \
575
__ret = _mm_cmpnlt_pd((__a), (__b)); \
576
break; \
577
case _CMP_ORD_Q: \
578
case _CMP_ORD_S: \
579
__ret = _mm_cmpord_pd((__a), (__b)); \
580
break; \
581
case _CMP_NGE_US: \
582
case _CMP_NGE_UQ: \
583
__ret = _mm_cmpnge_pd((__a), (__b)); \
584
break; \
585
case _CMP_NGT_US: \
586
case _CMP_NGT_UQ: \
587
__ret = _mm_cmpngt_pd((__a), (__b)); \
588
break; \
589
case _CMP_FALSE_OQ: \
590
case _CMP_FALSE_OS: \
591
__ret = _mm_setzero_pd(); \
592
break; \
593
case _CMP_GE_OS: \
594
case _CMP_GE_OQ: \
595
__ret = _mm_cmpge_pd((__a), (__b)); \
596
break; \
597
case _CMP_GT_OS: \
598
case _CMP_GT_OQ: \
599
__ret = _mm_cmpgt_pd((__a), (__b)); \
600
break; \
601
case _CMP_TRUE_UQ: \
602
case _CMP_TRUE_US: \
603
__ret = (__m128d)wasm_i8x16_splat(0xFF); \
604
break; \
605
case _CMP_NLE_US: \
606
case _CMP_NLE_UQ: \
607
__ret = _mm_cmpnle_pd((__a), (__b)); \
608
break; \
609
} \
610
__ret; \
611
})
612
613
#define _mm_cmp_ps(__a, __b, __imm) \
614
__extension__({ \
615
__m128 __ret; \
616
switch ((__imm)) { \
617
case _CMP_EQ_OQ: \
618
case _CMP_EQ_OS: \
619
__ret = _mm_cmpeq_ps((__a), (__b)); \
620
break; \
621
case _CMP_EQ_UQ: \
622
case _CMP_EQ_US: \
623
__ret = _mm_or_ps(_mm_cmpeq_ps((__a), (__b)), \
624
_mm_cmpunord_ps((__a), (__b))); \
625
break; \
626
case _CMP_LT_OS: \
627
case _CMP_LT_OQ: \
628
__ret = _mm_cmplt_ps((__a), (__b)); \
629
break; \
630
case _CMP_LE_OS: \
631
case _CMP_LE_OQ: \
632
__ret = _mm_cmple_ps((__a), (__b)); \
633
break; \
634
case _CMP_UNORD_Q: \
635
case _CMP_UNORD_S: \
636
__ret = _mm_cmpunord_ps((__a), (__b)); \
637
break; \
638
case _CMP_NEQ_UQ: \
639
case _CMP_NEQ_US: \
640
__ret = _mm_cmpneq_ps((__a), (__b)); \
641
break; \
642
case _CMP_NEQ_OQ: \
643
case _CMP_NEQ_OS: \
644
__ret = _mm_andnot_ps(_mm_cmpunord_ps((__a), (__b)), \
645
_mm_cmpneq_ps((__a), (__b))); \
646
break; \
647
case _CMP_NLT_US: \
648
case _CMP_NLT_UQ: \
649
__ret = _mm_cmpnlt_ps((__a), (__b)); \
650
break; \
651
case _CMP_ORD_Q: \
652
case _CMP_ORD_S: \
653
__ret = _mm_cmpord_ps((__a), (__b)); \
654
break; \
655
case _CMP_NGE_US: \
656
case _CMP_NGE_UQ: \
657
__ret = _mm_cmpnge_ps((__a), (__b)); \
658
break; \
659
case _CMP_NGT_US: \
660
case _CMP_NGT_UQ: \
661
__ret = _mm_cmpngt_ps((__a), (__b)); \
662
break; \
663
case _CMP_FALSE_OQ: \
664
case _CMP_FALSE_OS: \
665
__ret = _mm_setzero_ps(); \
666
break; \
667
case _CMP_GE_OS: \
668
case _CMP_GE_OQ: \
669
__ret = _mm_cmpge_ps((__a), (__b)); \
670
break; \
671
case _CMP_GT_OS: \
672
case _CMP_GT_OQ: \
673
__ret = _mm_cmpgt_ps((__a), (__b)); \
674
break; \
675
case _CMP_TRUE_UQ: \
676
case _CMP_TRUE_US: \
677
__ret = (__m128)wasm_i8x16_splat(0xFF); \
678
break; \
679
case _CMP_NLE_US: \
680
case _CMP_NLE_UQ: \
681
__ret = _mm_cmpnle_ps((__a), (__b)); \
682
break; \
683
} \
684
__ret; \
685
})
686
687
#define _mm_cmp_sd(__a, __b, __imm) \
688
__extension__({ \
689
__m128d __ret; \
690
switch ((__imm)) { \
691
case _CMP_EQ_OQ: \
692
case _CMP_EQ_OS: \
693
__ret = _mm_cmpeq_sd((__a), (__b)); \
694
break; \
695
case _CMP_EQ_UQ: \
696
case _CMP_EQ_US: \
697
__ret = _mm_move_sd((__a), \
698
_mm_or_pd(_mm_cmpeq_sd((__a), (__b)), \
699
_mm_cmpunord_sd((__a), (__b)))); \
700
break; \
701
case _CMP_LT_OS: \
702
case _CMP_LT_OQ: \
703
__ret = _mm_cmplt_sd((__a), (__b)); \
704
break; \
705
case _CMP_LE_OS: \
706
case _CMP_LE_OQ: \
707
__ret = _mm_cmple_sd((__a), (__b)); \
708
break; \
709
case _CMP_UNORD_Q: \
710
case _CMP_UNORD_S: \
711
__ret = _mm_cmpunord_sd((__a), (__b)); \
712
break; \
713
case _CMP_NEQ_UQ: \
714
case _CMP_NEQ_US: \
715
__ret = _mm_cmpneq_sd((__a), (__b)); \
716
break; \
717
case _CMP_NEQ_OQ: \
718
case _CMP_NEQ_OS: \
719
__ret = _mm_move_sd((__a), \
720
_mm_andnot_pd(_mm_cmpunord_sd((__a), (__b)), \
721
_mm_cmpneq_sd((__a), (__b)))); \
722
break; \
723
case _CMP_NLT_US: \
724
case _CMP_NLT_UQ: \
725
__ret = _mm_cmpnlt_sd((__a), (__b)); \
726
break; \
727
case _CMP_ORD_Q: \
728
case _CMP_ORD_S: \
729
__ret = _mm_cmpord_sd((__a), (__b)); \
730
break; \
731
case _CMP_NGE_US: \
732
case _CMP_NGE_UQ: \
733
__ret = _mm_cmpnge_sd((__a), (__b)); \
734
break; \
735
case _CMP_NGT_US: \
736
case _CMP_NGT_UQ: \
737
__ret = _mm_cmpngt_sd((__a), (__b)); \
738
break; \
739
case _CMP_FALSE_OQ: \
740
case _CMP_FALSE_OS: \
741
__ret = _mm_move_sd((__a), _mm_setzero_pd()); \
742
break; \
743
case _CMP_GE_OS: \
744
case _CMP_GE_OQ: \
745
__ret = _mm_cmpge_sd((__a), (__b)); \
746
break; \
747
case _CMP_GT_OS: \
748
case _CMP_GT_OQ: \
749
__ret = _mm_cmpgt_sd((__a), (__b)); \
750
break; \
751
case _CMP_TRUE_UQ: \
752
case _CMP_TRUE_US: \
753
__ret = _mm_move_sd((__a), (__m128d)wasm_i8x16_splat(0xFF)); \
754
break; \
755
case _CMP_NLE_US: \
756
case _CMP_NLE_UQ: \
757
__ret = _mm_cmpnle_sd((__a), (__b)); \
758
break; \
759
} \
760
__ret; \
761
})
762
763
#define _mm_cmp_ss(__a, __b, __imm) \
764
__extension__({ \
765
__m128 __ret; \
766
switch ((__imm)) { \
767
case _CMP_EQ_OQ: \
768
case _CMP_EQ_OS: \
769
__ret = _mm_cmpeq_ss((__a), (__b)); \
770
break; \
771
case _CMP_EQ_UQ: \
772
case _CMP_EQ_US: \
773
__ret = _mm_move_ss((__a), \
774
_mm_or_ps(_mm_cmpeq_ss((__a), (__b)), \
775
_mm_cmpunord_ss((__a), (__b)))); \
776
break; \
777
case _CMP_LT_OS: \
778
case _CMP_LT_OQ: \
779
__ret = _mm_cmplt_ss((__a), (__b)); \
780
break; \
781
case _CMP_LE_OS: \
782
case _CMP_LE_OQ: \
783
__ret = _mm_cmple_ss((__a), (__b)); \
784
break; \
785
case _CMP_UNORD_Q: \
786
case _CMP_UNORD_S: \
787
__ret = _mm_cmpunord_ss((__a), (__b)); \
788
break; \
789
case _CMP_NEQ_UQ: \
790
case _CMP_NEQ_US: \
791
__ret = _mm_cmpneq_ss((__a), (__b)); \
792
break; \
793
case _CMP_NEQ_OQ: \
794
case _CMP_NEQ_OS: \
795
__ret = _mm_move_ss((__a), \
796
_mm_andnot_ps(_mm_cmpunord_ss((__a), (__b)), \
797
_mm_cmpneq_ss((__a), (__b)))); \
798
break; \
799
case _CMP_NLT_US: \
800
case _CMP_NLT_UQ: \
801
__ret = _mm_cmpnlt_ss((__a), (__b)); \
802
break; \
803
case _CMP_ORD_Q: \
804
case _CMP_ORD_S: \
805
__ret = _mm_cmpord_ss((__a), (__b)); \
806
break; \
807
case _CMP_NGE_US: \
808
case _CMP_NGE_UQ: \
809
__ret = _mm_cmpnge_ss((__a), (__b)); \
810
break; \
811
case _CMP_NGT_US: \
812
case _CMP_NGT_UQ: \
813
__ret = _mm_cmpngt_ss((__a), (__b)); \
814
break; \
815
case _CMP_FALSE_OQ: \
816
case _CMP_FALSE_OS: \
817
__ret = _mm_move_ss((__a), _mm_setzero_ps()); \
818
break; \
819
case _CMP_GE_OS: \
820
case _CMP_GE_OQ: \
821
__ret = _mm_cmpge_ss((__a), (__b)); \
822
break; \
823
case _CMP_GT_OS: \
824
case _CMP_GT_OQ: \
825
__ret = _mm_cmpgt_ss((__a), (__b)); \
826
break; \
827
case _CMP_TRUE_UQ: \
828
case _CMP_TRUE_US: \
829
__ret = _mm_move_ss((__a), (__m128)wasm_i8x16_splat(0xFF)); \
830
break; \
831
case _CMP_NLE_US: \
832
case _CMP_NLE_UQ: \
833
__ret = _mm_cmpnle_ss((__a), (__b)); \
834
break; \
835
} \
836
__ret; \
837
})
838
839
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
840
_mm256_cmp_pd(__m256d a, __m256d b, const int imm8) {
841
__m256d ret;
842
ret.v0 = _mm_cmp_pd(a.v0, b.v0, imm8);
843
ret.v1 = _mm_cmp_pd(a.v1, b.v1, imm8);
844
return ret;
845
}
846
847
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
848
_mm256_cmp_ps(__m256 __a, __m256 __b, const int imm8) {
849
__m256 ret;
850
ret.v0 = _mm_cmp_ps(__a.v0, __b.v0, imm8);
851
ret.v1 = _mm_cmp_ps(__a.v1, __b.v1, imm8);
852
return ret;
853
}
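/* Illustrative usage sketch (not part of this header; x and y are
 * hypothetical __m256 values): build a bitmask of lanes where x < y.
 *
 *   __m256 lt = _mm256_cmp_ps(x, y, _CMP_LT_OQ);
 *   int mask = _mm256_movemask_ps(lt); // bit i set when lane i compared true
 */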
854
855
#define _mm256_extract_epi32(__A, N) \
856
__extension__({ \
857
__m256i __a = (__A); \
858
((N) & 0x7) < 4 ? _mm_extract_epi32(__a.v0, (N) & 0x3) \
859
: _mm_extract_epi32(__a.v1, (N) & 0x3); \
860
})
861
862
#define _mm256_extract_epi16(__A, N) \
863
__extension__({ \
864
__m256i __a = (__A); \
865
((N) & 0xF) < 8 ? _mm_extract_epi16(__a.v0, (N) & 0x7) \
866
: _mm_extract_epi16(__a.v1, (N) & 0x7); \
867
})
868
869
#define _mm256_extract_epi8(__A, N) \
870
__extension__({ \
871
__m256i __a = (__A); \
872
((N) & 0x1F) < 16 ? _mm_extract_epi8(__a.v0, (N) & 0xF) \
873
: _mm_extract_epi8(__a.v1, (N) & 0xF); \
874
})
875
876
#define _mm256_extract_epi64(__A, N) \
877
__extension__({ \
878
__m256i __a = (__A); \
879
((N) & 0x3) < 2 ? _mm_extract_epi64(__a.v0, (N) & 0x1) \
880
: _mm_extract_epi64(__a.v1, (N) & 0x1); \
881
})
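/* Example: _mm256_extract_epi32(v, 5) reads lane 1 of the high 128-bit half;
 * bit 2 of the index selects the half and the low bits select the lane within
 * it. The _mm256_insert_* macros below rebuild the vector with only the
 * addressed half changed. */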
882
883
#define _mm256_insert_epi32(__A, __I, N) \
884
__extension__({ \
885
__m256i __a = (__A); \
886
int32_t __i = (__I); \
887
((N) & 0x7) < 4 \
888
? _mm256_set_m128i(__a.v1, _mm_insert_epi32(__a.v0, __i, (N) & 0x3)) \
889
: _mm256_set_m128i(_mm_insert_epi32(__a.v1, __i, (N) & 0x3), __a.v0); \
890
})
891
892
#define _mm256_insert_epi16(__A, __I, N) \
893
__extension__({ \
894
__m256i __a = (__A); \
895
int16_t __i = (__I); \
896
((N) & 0xF) < 8 \
897
? _mm256_set_m128i(__a.v1, _mm_insert_epi16(__a.v0, __i, (N) & 0x7)) \
898
: _mm256_set_m128i(_mm_insert_epi16(__a.v1, __i, (N) & 0x7), __a.v0); \
899
})
900
901
#define _mm256_insert_epi8(__A, __I, N) \
902
__extension__({ \
903
__m256i __a = (__A); \
904
int8_t __i = (__I); \
905
((N) & 0x1F) < 16 \
906
? _mm256_set_m128i(__a.v1, _mm_insert_epi8(__a.v0, __i, (N) & 0xF)) \
907
: _mm256_set_m128i(_mm_insert_epi8(__a.v1, __i, (N) & 0xF), __a.v0); \
908
})
909
910
#define _mm256_insert_epi64(__A, __I, N) \
911
__extension__({ \
912
__m256i __a = (__A); \
913
int64_t __i = (__I); \
914
((N) & 0x3) < 2 \
915
? _mm256_set_m128i(__a.v1, _mm_insert_epi64(__a.v0, __i, (N) & 0x1)) \
916
: _mm256_set_m128i(_mm_insert_epi64(__a.v1, __i, (N) & 0x1), __a.v0); \
917
})
918
919
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
920
_mm256_cvtepi32_pd(__m128i __a) {
921
__m256d ret;
922
ret.v0 = _mm_cvtepi32_pd(__a);
923
__m128i __a1 = wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0);
924
ret.v1 = _mm_cvtepi32_pd(__a1);
925
return ret;
926
}
927
928
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
929
_mm256_cvtepi32_ps(__m256i __a) {
930
__m256 ret;
931
ret.v0 = _mm_cvtepi32_ps(__a.v0);
932
ret.v1 = _mm_cvtepi32_ps(__a.v1);
933
return ret;
934
}
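/* The narrowing conversions below (_mm256_cvtpd_ps, _mm256_cvt[t]pd_epi32)
 * convert each 128-bit half separately and then pack the two two-lane partial
 * results into a single 128-bit vector with a shuffle that keeps lanes 0-1 of
 * each. */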
935
936
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
937
_mm256_cvtpd_ps(__m256d __a) {
938
__m128 low = _mm_cvtpd_ps(__a.v0);
939
__m128 high = _mm_cvtpd_ps(__a.v1);
940
__m128 ret = (__m128)wasm_i32x4_shuffle(low, high, 0, 1, 4, 5);
941
return ret;
942
}
943
944
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
945
_mm256_cvtps_epi32(__m256 __a) {
946
__m256i ret;
947
ret.v0 = _mm_cvtps_epi32(__a.v0);
948
ret.v1 = _mm_cvtps_epi32(__a.v1);
949
return ret;
950
}
951
952
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
953
_mm256_cvtps_pd(__m128 __a) {
954
__m256d ret;
955
ret.v0 = _mm_cvtps_pd(__a);
956
__m128 __a1 = (__m128)wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0);
957
ret.v1 = _mm_cvtps_pd(__a1);
958
return ret;
959
}
960
961
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
962
_mm256_cvttpd_epi32(__m256d __a) {
963
__m128i low = _mm_cvttpd_epi32(__a.v0);
964
__m128i high = _mm_cvttpd_epi32(__a.v1);
965
__m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5);
966
return ret;
967
}
968
969
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
970
_mm256_cvtpd_epi32(__m256d __a) {
971
__m128i low = _mm_cvtpd_epi32(__a.v0);
972
__m128i high = _mm_cvtpd_epi32(__a.v1);
973
__m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5);
974
return ret;
975
}
976
977
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
978
_mm256_cvttps_epi32(__m256 __a) {
979
__m256i ret;
980
ret.v0 = _mm_cvttps_epi32(__a.v0);
981
ret.v1 = _mm_cvttps_epi32(__a.v1);
982
return ret;
983
}
984
985
static __inline__ double __attribute__((__always_inline__, __nodebug__))
986
_mm256_cvtsd_f64(__m256d __a) {
987
return _mm_cvtsd_f64(__a.v0);
988
}
989
990
static __inline__ int __attribute__((__always_inline__, __nodebug__))
991
_mm256_cvtsi256_si32(__m256i __a) {
992
return _mm_cvtsi128_si32(__a.v0);
993
}
994
995
static __inline__ float __attribute__((__always_inline__, __nodebug__))
996
_mm256_cvtss_f32(__m256 __a) {
997
return _mm_cvtss_f32(__a.v0);
998
}
999
1000
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1001
_mm256_movehdup_ps(__m256 __a) {
1002
__m256 ret;
1003
ret.v0 = _mm_movehdup_ps(__a.v0);
1004
ret.v1 = _mm_movehdup_ps(__a.v1);
1005
return ret;
1006
}
1007
1008
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1009
_mm256_moveldup_ps(__m256 __a) {
1010
__m256 ret;
1011
ret.v0 = _mm_moveldup_ps(__a.v0);
1012
ret.v1 = _mm_moveldup_ps(__a.v1);
1013
return ret;
1014
}
1015
1016
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1017
_mm256_movedup_pd(__m256d __a) {
1018
__m256d ret;
1019
ret.v0 = _mm_movedup_pd(__a.v0);
1020
ret.v1 = _mm_movedup_pd(__a.v1);
1021
return ret;
1022
}
1023
1024
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1025
_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
1026
__m256d ret;
1027
ret.v0 = _mm_unpackhi_pd(__a.v0, __b.v0);
1028
ret.v1 = _mm_unpackhi_pd(__a.v1, __b.v1);
1029
return ret;
1030
}
1031
1032
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1033
_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
1034
__m256d ret;
1035
ret.v0 = _mm_unpacklo_pd(__a.v0, __b.v0);
1036
ret.v1 = _mm_unpacklo_pd(__a.v1, __b.v1);
1037
return ret;
1038
}
1039
1040
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1041
_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
1042
__m256 ret;
1043
ret.v0 = _mm_unpackhi_ps(__a.v0, __b.v0);
1044
ret.v1 = _mm_unpackhi_ps(__a.v1, __b.v1);
1045
return ret;
1046
}
1047
1048
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1049
_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
1050
__m256 ret;
1051
ret.v0 = _mm_unpacklo_ps(__a.v0, __b.v0);
1052
ret.v1 = _mm_unpacklo_ps(__a.v1, __b.v1);
1053
return ret;
1054
}
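/* The _mm*_testz/testc/testnzc helpers below mirror x86 [V]PTEST and
 * VTESTPS/VTESTPD: the *_si* forms examine every bit of (a AND b) or
 * (NOT a AND b), while the _ps/_pd forms examine only the sign bit of each
 * lane. */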
1055
1056
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1057
_mm_testz_pd(__m128d __a, __m128d __b) {
1058
v128_t __m =
1059
wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63);
1060
return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1);
1061
}
1062
1063
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1064
_mm_testc_pd(__m128d __a, __m128d __b) {
1065
v128_t __m =
1066
wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63);
1067
return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1);
1068
}
1069
1070
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1071
_mm_testnzc_pd(__m128d __a, __m128d __b) {
1072
v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63);
1073
v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63);
1074
return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) &
1075
(wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1));
1076
}
1077
1078
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1079
_mm_testz_ps(__m128 __a, __m128 __b) {
1080
v128_t __m =
1081
wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31);
1082
__m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m));
1083
__m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1)));
1084
return wasm_i32x4_extract_lane(__m, 0);
1085
}
1086
1087
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1088
_mm_testc_ps(__m128 __a, __m128 __b) {
1089
v128_t __m =
1090
wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31);
1091
__m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m));
1092
__m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1)));
1093
return wasm_i32x4_extract_lane(__m, 0);
1094
}
1095
1096
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1097
_mm_testnzc_ps(__m128 __a, __m128 __b) {
1098
v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31);
1099
v128_t __m2 = wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 31);
1100
1101
__m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m));
1102
__m2 = wasm_v128_or(__m2, (v128_t)_mm_movehl_ps((__m128)__m2, (__m128)__m2));
1103
__m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1)));
1104
__m2 = wasm_v128_or(__m2, _mm_shuffle_epi32(__m2, _MM_SHUFFLE(3, 2, 0, 1)));
1105
1106
return wasm_i32x4_extract_lane(__m, 0) & wasm_i32x4_extract_lane(__m2, 0);
1107
}
1108
1109
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1110
_mm256_testz_pd(__m256d __a, __m256d __b) {
1111
return _mm_testz_pd(__a.v0, __b.v0) & _mm_testz_pd(__a.v1, __b.v1);
1112
}
1113
1114
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1115
_mm256_testc_pd(__m256d __a, __m256d __b) {
1116
return _mm_testc_pd(__a.v0, __b.v0) & _mm_testc_pd(__a.v1, __b.v1);
1117
}
1118
1119
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1120
_mm256_testnzc_pd(__m256d __a, __m256d __b) {
1121
v128_t __m =
1122
wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 63);
1123
v128_t __m1 =
1124
wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 63);
1125
v128_t __m2 =
1126
wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 63);
1127
v128_t __m3 =
1128
wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 63);
1129
return wasm_v128_any_true(wasm_v128_or(__m, __m1)) &
1130
wasm_v128_any_true(wasm_v128_or(__m2, __m3));
1131
}
1132
1133
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1134
_mm256_testz_ps(__m256 __a, __m256 __b) {
1135
return _mm_testz_ps(__a.v0, __b.v0) & _mm_testz_ps(__a.v1, __b.v1);
1136
}
1137
1138
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1139
_mm256_testc_ps(__m256 __a, __m256 __b) {
1140
return _mm_testc_ps(__a.v0, __b.v0) & _mm_testc_ps(__a.v1, __b.v1);
1141
}
1142
1143
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1144
_mm256_testnzc_ps(__m256 __a, __m256 __b) {
1145
v128_t __m =
1146
wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 31);
1147
v128_t __m1 =
1148
wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 31);
1149
v128_t __m2 =
1150
wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 31);
1151
v128_t __m3 =
1152
wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 31);
1153
1154
return wasm_v128_any_true(wasm_v128_or(__m, __m1)) &
1155
wasm_v128_any_true(wasm_v128_or(__m2, __m3));
1156
}
1157
1158
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1159
_mm256_testz_si256(__m256i __a, __m256i __b) {
1160
return _mm_testz_si128(__a.v0, __b.v0) & _mm_testz_si128(__a.v1, __b.v1);
1161
}
1162
1163
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1164
_mm256_testc_si256(__m256i __a, __m256i __b) {
1165
return _mm_testc_si128(__a.v0, __b.v0) & _mm_testc_si128(__a.v1, __b.v1);
1166
}
1167
1168
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1169
_mm256_testnzc_si256(__m256i __a, __m256i __b) {
1170
v128_t __m = wasm_v128_and(__a.v0, __b.v0);
1171
v128_t __m1 = wasm_v128_and(__a.v1, __b.v1);
1172
v128_t __m2 = wasm_v128_andnot(__b.v0, __a.v0);
1173
v128_t __m3 = wasm_v128_andnot(__b.v1, __a.v1);
1174
return wasm_v128_any_true(wasm_v128_or(__m, __m1)) &
1175
wasm_v128_any_true(wasm_v128_or(__m2, __m3));
1176
}
1177
1178
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1179
_mm256_movemask_pd(__m256d __a) {
1180
return _mm_movemask_pd(__a.v0) | (_mm_movemask_pd(__a.v1) << 2);
1181
}
1182
1183
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1184
_mm256_movemask_ps(__m256 __a) {
1185
return _mm_movemask_ps(__a.v0) | (_mm_movemask_ps(__a.v1) << 4);
1186
}
1187
1188
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1189
_mm256_zeroall(void) {
1190
// Do nothing: these intrinsics only clear x86 AVX register state, which has
// no equivalent in Wasm. Any assembly code that would have had calls to these
// functions would not compile for this target in the first place.
1193
}
1194
1195
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1196
_mm256_zeroupper(void) {
1197
// Do nothing: these intrinsics only clear x86 AVX register state, which has
// no equivalent in Wasm. Any assembly code that would have had calls to these
// functions would not compile for this target in the first place.
1200
}
1201
1202
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1203
_mm_broadcast_ss(float const* __a) {
1204
return (__m128)wasm_v128_load32_splat(__a);
1205
}
1206
1207
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1208
_mm256_broadcast_sd(double const* __a) {
1209
__m256d ret;
1210
ret.v1 = ret.v0 = (__m128d)wasm_v128_load64_splat(__a);
1211
return ret;
1212
}
1213
1214
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1215
_mm256_broadcast_ss(float const* __a) {
1216
__m256 ret;
1217
ret.v1 = ret.v0 = _mm_broadcast_ss(__a);
1218
return ret;
1219
}
1220
1221
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1222
_mm256_broadcast_pd(__m128d const* __a) {
1223
__m256d ret;
1224
ret.v1 = ret.v0 = (__m128d)wasm_v128_load(__a);
1225
return ret;
1226
}
1227
1228
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1229
_mm256_broadcast_ps(__m128 const* __a) {
1230
__m256 ret;
1231
ret.v1 = ret.v0 = (__m128)wasm_v128_load(__a);
1232
return ret;
1233
}
1234
1235
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1236
_mm256_load_pd(double const* __p) {
1237
__m256d ret;
1238
ret.v0 = _mm_load_pd(__p);
1239
ret.v1 = _mm_load_pd(__p + 2);
1240
return ret;
1241
}
1242
1243
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1244
_mm256_load_ps(float const* __p) {
1245
__m256 ret;
1246
ret.v0 = _mm_load_ps(__p);
1247
ret.v1 = _mm_load_ps(__p + 4);
1248
return ret;
1249
}
1250
1251
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1252
_mm256_loadu_pd(double const* __p) {
1253
__m256d ret;
1254
ret.v0 = _mm_loadu_pd(__p);
1255
ret.v1 = _mm_loadu_pd(__p + 2);
1256
return ret;
1257
}
1258
1259
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1260
_mm256_loadu_ps(float const* __p) {
1261
__m256 ret;
1262
ret.v0 = _mm_loadu_ps(__p);
1263
ret.v1 = _mm_loadu_ps(__p + 4);
1264
return ret;
1265
}
1266
1267
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1268
_mm256_load_si256(__m256i const* __p) {
1269
__m256i ret;
1270
ret.v0 = _mm_load_si128((__m128i const*)__p);
1271
ret.v1 = _mm_load_si128(((__m128i const*)__p) + 1);
1272
return ret;
1273
}
1274
1275
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1276
_mm256_loadu_si256(__m256i_u const* __p) {
1277
__m256i ret;
1278
ret.v0 = _mm_loadu_si128((__m128i const*)__p);
1279
ret.v1 = _mm_loadu_si128(((__m128i const*)__p) + 1);
1280
return ret;
1281
}
1282
1283
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1284
_mm256_lddqu_si256(__m256i_u const* __p) {
1285
__m256i ret;
1286
ret.v0 = _mm_lddqu_si128((__m128i const*)__p);
1287
ret.v1 = _mm_lddqu_si128(((__m128i const*)__p) + 1);
1288
return ret;
1289
}
1290
1291
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1292
_mm256_store_pd(double* __p, __m256d __a) {
1293
_mm_store_pd(__p, __a.v0);
1294
_mm_store_pd(__p + 2, __a.v1);
1295
}
1296
1297
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1298
_mm256_store_ps(float* __p, __m256 __a) {
1299
_mm_store_ps(__p, __a.v0);
1300
_mm_store_ps(__p + 4, __a.v1);
1301
}
1302
1303
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1304
_mm256_storeu_pd(double* __p, __m256d __a) {
1305
_mm_storeu_pd(__p, __a.v0);
1306
_mm_storeu_pd(__p + 2, __a.v1);
1307
}
1308
1309
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1310
_mm256_storeu_ps(float* __p, __m256 __a) {
1311
_mm_storeu_ps(__p, __a.v0);
1312
_mm_storeu_ps(__p + 4, __a.v1);
1313
}
1314
1315
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1316
_mm256_store_si256(__m256i* __p, __m256i __a) {
1317
_mm_store_si128((__m128i*)__p, __a.v0);
1318
_mm_store_si128(((__m128i*)__p) + 1, __a.v1);
1319
}
1320
1321
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1322
_mm256_storeu_si256(__m256i_u* __p, __m256i __a) {
1323
_mm_storeu_si128((__m128i*)__p, __a.v0);
1324
_mm_storeu_si128(((__m128i*)__p) + 1, __a.v1);
1325
}
1326
1327
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1328
_mm_maskload_pd(double const* __p, __m128i __m) {
1329
// This may cause an out-of-bounds memory load since we first load and
1330
// then mask, but since there are no segmentation faults in Wasm memory
1331
// accesses, that is ok (as long as we are within the heap bounds -
1332
// a negligible limitation in practice)
1333
return _mm_and_pd(_mm_load_pd(__p), (__m128d)wasm_i64x2_shr(__m, 63));
1334
}
1335
1336
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1337
_mm256_maskload_pd(double const* __p, __m256i __m) {
1338
__m256d ret;
1339
ret.v0 = _mm_maskload_pd(__p, __m.v0);
1340
ret.v1 = _mm_maskload_pd(__p + 2, __m.v1);
1341
return ret;
1342
}
1343
1344
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1345
_mm_maskload_ps(float const* __p, __m128i __m) {
1346
// This may cause an out-of-bounds memory load since we first load and
1347
// then mask, but since there are no segmentation faults in Wasm memory
1348
// accesses, that is ok (as long as we are within the heap bounds -
1349
// a negligible limitation in practice)
1350
return _mm_and_ps(_mm_load_ps(__p), (__m128)_mm_srai_epi32(__m, 31));
1351
}
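/* Illustrative usage sketch (not part of this header; ptr is a hypothetical
 * float pointer): load only the first three floats of a row, forcing the
 * fourth lane to zero via the mask.
 *
 *   __m128i mask = _mm_set_epi32(0, -1, -1, -1);
 *   __m128 row = _mm_maskload_ps(ptr, mask);
 */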
1352
1353
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1354
_mm256_maskload_ps(float const* __p, __m256i __m) {
1355
__m256 ret;
1356
ret.v0 = _mm_maskload_ps(__p, __m.v0);
1357
ret.v1 = _mm_maskload_ps(__p + 4, __m.v1);
1358
return ret;
1359
}
1360
1361
static __inline__ void
1362
__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
1363
_mm_maskstore_ps(float* __p, __m128i __m, __m128 __a) {
1364
if ((wasm_i32x4_extract_lane(__m, 0) & 0x80000000ull) != 0)
1365
__p[0] = wasm_f32x4_extract_lane((v128_t)__a, 0);
1366
if ((wasm_i32x4_extract_lane(__m, 1) & 0x80000000ull) != 0)
1367
__p[1] = wasm_f32x4_extract_lane((v128_t)__a, 1);
1368
if ((wasm_i32x4_extract_lane(__m, 2) & 0x80000000ull) != 0)
1369
__p[2] = wasm_f32x4_extract_lane((v128_t)__a, 2);
1370
if ((wasm_i32x4_extract_lane(__m, 3) & 0x80000000ull) != 0)
1371
__p[3] = wasm_f32x4_extract_lane((v128_t)__a, 3);
1372
}
1373
1374
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1375
_mm256_maskstore_ps(float* __p, __m256i __m, __m256 __a) {
1376
_mm_maskstore_ps(__p, __m.v0, __a.v0);
1377
_mm_maskstore_ps(__p + 4, __m.v1, __a.v1);
1378
}
1379
1380
static __inline__ void
1381
__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
1382
_mm_maskstore_pd(double* __p, __m128i __m, __m128d __a) {
1383
if ((wasm_i64x2_extract_lane(__m, 0) & 0x8000000000000000ull) != 0)
1384
__p[0] = wasm_f64x2_extract_lane((v128_t)__a, 0);
1385
if ((wasm_i64x2_extract_lane(__m, 1) & 0x8000000000000000ull) != 0)
1386
__p[1] = wasm_f64x2_extract_lane((v128_t)__a, 1);
1387
}
1388
1389
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1390
_mm256_maskstore_pd(double* __p, __m256i __m, __m256d __a) {
1391
_mm_maskstore_pd(__p, __m.v0, __a.v0);
1392
_mm_maskstore_pd(__p + 2, __m.v1, __a.v1);
1393
}
1394
1395
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1396
_mm256_stream_si256(void* __a, __m256i __b) {
1397
_mm_stream_si128((__m128i*)__a, __b.v0);
1398
_mm_stream_si128(((__m128i*)__a) + 1, __b.v1);
1399
}
1400
1401
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1402
_mm256_stream_pd(void* __a, __m256d __b) {
1403
_mm_stream_pd((double*)__a, __b.v0);
1404
_mm_stream_pd(((double*)__a) + 2, __b.v1);
1405
}
1406
1407
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1408
_mm256_stream_ps(void* __p, __m256 __a) {
1409
_mm_stream_ps((float*)__p, __a.v0);
1410
_mm_stream_ps(((float*)__p) + 4, __a.v1);
1411
}
1412
1413
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1414
_mm256_undefined_pd(void) {
1415
__m256d val;
1416
return val;
1417
}
1418
1419
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1420
_mm256_undefined_ps(void) {
1421
__m256 val;
1422
return val;
1423
}
1424
1425
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1426
_mm256_undefined_si256(void) {
1427
__m256i val;
1428
return val;
1429
}
1430
1431
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1432
_mm256_set_pd(double __a, double __b, double __c, double __d) {
1433
__m256d ret;
1434
ret.v0 = _mm_set_pd(__c, __d);
1435
ret.v1 = _mm_set_pd(__a, __b);
1436
return ret;
1437
}
1438
1439
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1440
_mm256_set_ps(float __a,
1441
float __b,
1442
float __c,
1443
float __d,
1444
float __e,
1445
float __f,
1446
float __g,
1447
float __h) {
1448
__m256 ret;
1449
ret.v0 = _mm_set_ps(__e, __f, __g, __h);
1450
ret.v1 = _mm_set_ps(__a, __b, __c, __d);
1451
return ret;
1452
}
1453
1454
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1455
_mm256_set_epi32(int __i0,
1456
int __i1,
1457
int __i2,
1458
int __i3,
1459
int __i4,
1460
int __i5,
1461
int __i6,
1462
int __i7) {
1463
__m256i ret;
1464
ret.v0 = _mm_set_epi32(__i4, __i5, __i6, __i7);
1465
ret.v1 = _mm_set_epi32(__i0, __i1, __i2, __i3);
1466
return ret;
1467
}
1468
1469
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1470
_mm256_set_epi16(short __w15,
1471
short __w14,
1472
short __w13,
1473
short __w12,
1474
short __w11,
1475
short __w10,
1476
short __w09,
1477
short __w08,
1478
short __w07,
1479
short __w06,
1480
short __w05,
1481
short __w04,
1482
short __w03,
1483
short __w02,
1484
short __w01,
1485
short __w00) {
1486
__m256i ret;
1487
ret.v0 =
1488
_mm_set_epi16(__w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00);
1489
ret.v1 =
1490
_mm_set_epi16(__w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08);
1491
return ret;
1492
}
1493
1494
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1495
_mm256_set_epi8(char __b31,
1496
char __b30,
1497
char __b29,
1498
char __b28,
1499
char __b27,
1500
char __b26,
1501
char __b25,
1502
char __b24,
1503
char __b23,
1504
char __b22,
1505
char __b21,
1506
char __b20,
1507
char __b19,
1508
char __b18,
1509
char __b17,
1510
char __b16,
1511
char __b15,
1512
char __b14,
1513
char __b13,
1514
char __b12,
1515
char __b11,
1516
char __b10,
1517
char __b09,
1518
char __b08,
1519
char __b07,
1520
char __b06,
1521
char __b05,
1522
char __b04,
1523
char __b03,
1524
char __b02,
1525
char __b01,
1526
char __b00) {
1527
__m256i ret;
1528
ret.v0 = _mm_set_epi8(__b15,
1529
__b14,
1530
__b13,
1531
__b12,
1532
__b11,
1533
__b10,
1534
__b09,
1535
__b08,
1536
__b07,
1537
__b06,
1538
__b05,
1539
__b04,
1540
__b03,
1541
__b02,
1542
__b01,
1543
__b00);
1544
ret.v1 = _mm_set_epi8(__b31,
1545
__b30,
1546
__b29,
1547
__b28,
1548
__b27,
1549
__b26,
1550
__b25,
1551
__b24,
1552
__b23,
1553
__b22,
1554
__b21,
1555
__b20,
1556
__b19,
1557
__b18,
1558
__b17,
1559
__b16);
1560
return ret;
1561
}
1562
1563
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1564
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) {
1565
__m256i ret;
1566
ret.v0 = _mm_set_epi64x(__c, __d);
1567
ret.v1 = _mm_set_epi64x(__a, __b);
1568
return ret;
1569
}
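/* As in the x86 headers, the _mm256_set_* functions take their arguments from
 * the highest element down to element 0, while the _mm256_setr_* variants
 * below take them in memory order (element 0 first). */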
1570
1571
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1572
_mm256_setr_pd(double __a, double __b, double __c, double __d) {
1573
return _mm256_set_pd(__d, __c, __b, __a);
1574
}
1575
1576
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1577
_mm256_setr_ps(float __a,
1578
float __b,
1579
float __c,
1580
float __d,
1581
float __e,
1582
float __f,
1583
float __g,
1584
float __h) {
1585
return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
1586
}
1587
1588
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1589
_mm256_setr_epi32(int __i0,
1590
int __i1,
1591
int __i2,
1592
int __i3,
1593
int __i4,
1594
int __i5,
1595
int __i6,
1596
int __i7) {
1597
return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
1598
}
1599
1600
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1601
_mm256_setr_epi16(short __w15,
1602
short __w14,
1603
short __w13,
1604
short __w12,
1605
short __w11,
1606
short __w10,
1607
short __w09,
1608
short __w08,
1609
short __w07,
1610
short __w06,
1611
short __w05,
1612
short __w04,
1613
short __w03,
1614
short __w02,
1615
short __w01,
1616
short __w00) {
1617
return _mm256_set_epi16(__w00,
1618
__w01,
1619
__w02,
1620
__w03,
1621
__w04,
1622
__w05,
1623
__w06,
1624
__w07,
1625
__w08,
1626
__w09,
1627
__w10,
1628
__w11,
1629
__w12,
1630
__w13,
1631
__w14,
1632
__w15);
1633
}
1634
1635
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1636
_mm256_setr_epi8(char __b31,
1637
char __b30,
1638
char __b29,
1639
char __b28,
1640
char __b27,
1641
char __b26,
1642
char __b25,
1643
char __b24,
1644
char __b23,
1645
char __b22,
1646
char __b21,
1647
char __b20,
1648
char __b19,
1649
char __b18,
1650
char __b17,
1651
char __b16,
1652
char __b15,
1653
char __b14,
1654
char __b13,
1655
char __b12,
1656
char __b11,
1657
char __b10,
1658
char __b09,
1659
char __b08,
1660
char __b07,
1661
char __b06,
1662
char __b05,
1663
char __b04,
1664
char __b03,
1665
char __b02,
1666
char __b01,
1667
char __b00) {
1668
return _mm256_set_epi8(__b00,
1669
__b01,
1670
__b02,
1671
__b03,
1672
__b04,
1673
__b05,
1674
__b06,
1675
__b07,
1676
__b08,
1677
__b09,
1678
__b10,
1679
__b11,
1680
__b12,
1681
__b13,
1682
__b14,
1683
__b15,
1684
__b16,
1685
__b17,
1686
__b18,
1687
__b19,
1688
__b20,
1689
__b21,
1690
__b22,
1691
__b23,
1692
__b24,
1693
__b25,
1694
__b26,
1695
__b27,
1696
__b28,
1697
__b29,
1698
__b30,
1699
__b31);
1700
}
1701
1702
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1703
_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) {
1704
return _mm256_set_epi64x(__d, __c, __b, __a);
1705
}
1706
1707
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1708
_mm256_set1_pd(double __w) {
1709
__m256d ret;
1710
ret.v1 = ret.v0 = (__m128d)wasm_f64x2_splat(__w);
1711
return ret;
1712
}
1713
1714
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1715
_mm256_set1_ps(float __w) {
1716
__m256 ret;
1717
ret.v1 = ret.v0 = (__m128)wasm_f32x4_splat(__w);
1718
return ret;
1719
}
1720
1721
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1722
_mm256_set1_epi32(int __i) {
1723
__m256i ret;
1724
ret.v1 = ret.v0 = wasm_i32x4_splat(__i);
1725
return ret;
1726
}
1727
1728
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1729
_mm256_set1_epi16(short __w) {
1730
__m256i ret;
1731
ret.v1 = ret.v0 = wasm_i16x8_splat(__w);
1732
return ret;
1733
}
1734
1735
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1736
_mm256_set1_epi8(char __b) {
1737
__m256i ret;
1738
ret.v1 = ret.v0 = wasm_i8x16_splat(__b);
1739
return ret;
1740
}
1741
1742
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1743
_mm256_set1_epi64x(long long __q) {
1744
__m256i ret;
1745
ret.v1 = ret.v0 = wasm_i64x2_splat(__q);
1746
return ret;
1747
}
1748
1749
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1750
_mm256_setzero_pd(void) {
1751
__m256d ret;
1752
ret.v1 = ret.v0 = _mm_setzero_pd();
1753
return ret;
1754
}
1755
1756
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1757
_mm256_setzero_ps(void) {
1758
__m256 ret;
1759
ret.v1 = ret.v0 = _mm_setzero_ps();
1760
return ret;
1761
}
1762
1763
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1764
_mm256_setzero_si256(void) {
1765
__m256i ret;
1766
ret.v1 = ret.v0 = _mm_setzero_si128();
1767
return ret;
1768
}
1769
1770
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1771
_mm256_castpd_ps(__m256d __a) {
1772
union __m256_data ret;
1773
ret.double_view = __a;
1774
return ret.float_view;
1775
}
1776
1777
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1778
_mm256_castpd_si256(__m256d __a) {
1779
union __m256_data ret;
1780
ret.double_view = __a;
1781
return ret.int_view;
1782
}
1783
1784
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1785
_mm256_castps_pd(__m256 __a) {
1786
union __m256_data ret;
1787
ret.float_view = __a;
1788
return ret.double_view;
1789
}
1790
1791
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1792
_mm256_castps_si256(__m256 __a) {
1793
union __m256_data ret;
1794
ret.float_view = __a;
1795
return ret.int_view;
1796
}
1797
1798
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1799
_mm256_castsi256_ps(__m256i __a) {
1800
union __m256_data ret;
1801
ret.int_view = __a;
1802
return ret.float_view;
1803
}
1804
1805
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1806
_mm256_castsi256_pd(__m256i __a) {
1807
union __m256_data ret;
1808
ret.int_view = __a;
1809
return ret.double_view;
1810
}
1811
1812
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1813
_mm256_castpd256_pd128(__m256d __a) {
1814
return __a.v0;
1815
}
1816
1817
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1818
_mm256_castps256_ps128(__m256 __a) {
1819
return __a.v0;
1820
}
1821
1822
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1823
_mm256_castsi256_si128(__m256i __a) {
1824
return __a.v0;
1825
}
1826
1827
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1828
_mm256_castpd128_pd256(__m128d __a) {
1829
__m256d ret;
1830
ret.v0 = __a;
1831
ret.v1 = _mm_setzero_pd();
1832
return ret;
1833
}
1834
1835
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1836
_mm256_castps128_ps256(__m128 __a) {
1837
__m256 ret;
1838
ret.v0 = __a;
1839
ret.v1 = _mm_setzero_ps();
1840
return ret;
1841
}
1842
1843
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1844
_mm256_castsi128_si256(__m128i __a) {
1845
__m256i ret;
1846
ret.v0 = __a;
1847
ret.v1 = _mm_setzero_si128();
1848
return ret;
1849
}
1850
1851
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1852
_mm256_zextpd128_pd256(__m128d __a) {
1853
__m256d ret;
1854
ret.v0 = __a;
1855
ret.v1 = _mm_setzero_pd();
1856
return ret;
1857
}
1858
1859
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1860
_mm256_zextps128_ps256(__m128 __a) {
1861
__m256 ret;
1862
ret.v0 = __a;
1863
ret.v1 = _mm_setzero_ps();
1864
return ret;
1865
}
1866
1867
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1868
_mm256_zextsi128_si256(__m128i __a) {
1869
__m256i ret;
1870
ret.v0 = __a;
1871
ret.v1 = _mm_setzero_si128();
1872
return ret;
1873
}
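/* Note: in this emulation the 128->256 cast functions above zero the upper
 * half, so they behave like the _mm256_zext* variants; on x86 the upper bits
 * of the cast forms are undefined rather than guaranteed to be zero. */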
1874
1875
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1876
_mm256_insertf128_ps(__m256 __a, __m128 __b, const int imm8) {
1877
__m256 ret = __a;
1878
if (imm8 & 0x1) {
1879
ret.v1 = __b;
1880
} else {
1881
ret.v0 = __b;
1882
}
1883
return ret;
1884
}
1885
1886
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1887
_mm256_insertf128_pd(__m256d __a, __m128d __b, const int imm8) {
1888
__m256d ret = __a;
1889
if (imm8 & 0x1) {
1890
ret.v1 = __b;
1891
} else {
1892
ret.v0 = __b;
1893
}
1894
return ret;
1895
}
1896
1897
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1898
_mm256_insertf128_si256(__m256i __a, __m128i __b, const int imm8) {
1899
__m256i ret = __a;
1900
if (imm8 & 0x1) {
1901
ret.v1 = __b;
1902
} else {
1903
ret.v0 = __b;
1904
}
1905
return ret;
1906
}
1907
1908
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1909
_mm256_extractf128_ps(__m256 __a, const int imm8) {
1910
if (imm8 & 0x1) {
1911
return __a.v1;
1912
} else {
1913
return __a.v0;
1914
}
1915
}
1916
1917
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1918
_mm256_extractf128_pd(__m256d __a, const int imm8) {
1919
if (imm8 & 0x1) {
1920
return __a.v1;
1921
} else {
1922
return __a.v0;
1923
}
1924
}
1925
1926
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1927
_mm256_extractf128_si256(__m256i __a, const int imm8) {
1928
if (imm8 & 0x1) {
1929
return __a.v1;
1930
} else {
1931
return __a.v0;
1932
}
1933
}
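/* Only bit 0 of imm8 is honored by the insertf128/extractf128 forms above:
 * 0 addresses the low 128-bit half (v0) and 1 the high half (v1). */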
1934
1935
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1936
_mm256_set_m128(__m128 __hi, __m128 __lo) {
1937
__m256 ret;
1938
ret.v0 = __lo;
1939
ret.v1 = __hi;
1940
return ret;
1941
}
1942
1943
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1944
_mm256_set_m128d(__m128d __hi, __m128d __lo) {
1945
__m256d ret;
1946
ret.v0 = __lo;
1947
ret.v1 = __hi;
1948
return ret;
1949
}
1950
1951
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1952
_mm256_set_m128i(__m128i __hi, __m128i __lo) {
1953
__m256i ret;
1954
ret.v0 = __lo;
1955
ret.v1 = __hi;
1956
return ret;
1957
}
1958
1959
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1960
_mm256_setr_m128(__m128 __lo, __m128 __hi) {
1961
return _mm256_set_m128(__hi, __lo);
1962
}
1963
1964
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1965
_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
1966
return _mm256_set_m128d(__hi, __lo);
1967
}
1968
1969
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1970
_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
1971
return _mm256_set_m128i(__hi, __lo);
1972
}
1973
1974
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
1975
_mm256_loadu2_m128(float const* __addr_hi, float const* __addr_lo) {
1976
return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
1977
}
1978
1979
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
1980
_mm256_loadu2_m128d(double const* __addr_hi, double const* __addr_lo) {
1981
return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
1982
}
1983
1984
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
1985
_mm256_loadu2_m128i(__m128i_u const* __addr_hi, __m128i_u const* __addr_lo) {
1986
return _mm256_set_m128i(_mm_loadu_si128((__m128i const*)__addr_hi),
1987
_mm_loadu_si128((__m128i const*)__addr_lo));
1988
}
1989
1990
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1991
_mm256_storeu2_m128(float* __addr_hi, float* __addr_lo, __m256 __a) {
1992
_mm_storeu_ps(__addr_lo, __a.v0);
1993
_mm_storeu_ps(__addr_hi, __a.v1);
1994
}
1995
1996
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1997
_mm256_storeu2_m128d(double* __addr_hi, double* __addr_lo, __m256d __a) {
1998
_mm_storeu_pd(__addr_lo, __a.v0);
1999
_mm_storeu_pd(__addr_hi, __a.v1);
2000
}
2001
2002
static __inline__ void __attribute__((__always_inline__, __nodebug__))
2003
_mm256_storeu2_m128i(__m128i_u* __addr_hi, __m128i_u* __addr_lo, __m256i __a) {
2004
_mm_storeu_si128((__m128i*)__addr_lo, __a.v0);
2005
_mm_storeu_si128((__m128i*)__addr_hi, __a.v1);
2006
}
2007
2008
#endif /* __emscripten_avxintrin_h__ */