GitHub Repository: emscripten-core/emscripten
Path: blob/main/system/include/compat/emmintrin.h
/*
 * Copyright 2020 The Emscripten Authors. All rights reserved.
 * Emscripten is available under two separate licenses, the MIT license and the
 * University of Illinois/NCSA Open Source License. Both these licenses can be
 * found in the LICENSE file.
 */
#ifndef __emscripten_emmintrin_h__
#define __emscripten_emmintrin_h__

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#endif

#include <xmmintrin.h>

// Alias different (functionally) equivalent intrinsics.
#define _mm_set_epi64x _mm_set_epi64
#define _mm_cvtsd_si64x _mm_cvtsd_si64
#define _mm_cvtsi128_si64x _mm_cvtsi128_si64
#define _mm_cvtsi64x_sd _mm_cvtsi64_sd
#define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
#define _mm_cvttsd_si64x _mm_cvttsd_si64
#define _mm_store_pd1 _mm_store1_pd

typedef __f64x2 __m128d;

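// The low lane of the result comes from __b and the high lane from __a,
// matching the register form of the x86 MOVSD instruction.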
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a, __m128d __b)
{
  return (__m128d){ __b[0], __a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_add_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_sub_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_mul((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_mul_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_div((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_div_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)
{
  return (__m128d)wasm_f64x2_sqrt((v128_t)__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_sqrt_pd(__b));
}

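// MINPD/MAXPD semantics: the first operand is selected only where it compares
// strictly less/greater than the second, so NaN inputs and equal inputs yield
// the second operand. The bitselect below reproduces that lane by lane.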
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a, __m128d __b)
{
  // return (__m128d)wasm_f64x2_pmin((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_lt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_min_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a, __m128d __b)
{
  // return (__m128d)wasm_f64x2_pmax((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_gt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_max_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_and((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_or((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_xor((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_le((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_ge((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_and(wasm_f64x2_eq((v128_t)__a, (v128_t)__a),
                                wasm_f64x2_eq((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_or(wasm_f64x2_ne((v128_t)__a, (v128_t)__a),
                               wasm_f64x2_ne((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_f64x2_ne((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmplt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmple_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmpgt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_v128_not((v128_t)_mm_cmpge_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpeq_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmplt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmple_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpgt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpge_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpord_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpunord_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpneq_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpnlt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpnle_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpngt_pd(__a, __b));
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
  return _mm_move_sd(__a, _mm_cmpnge_pd(__a, __b));
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)
{
  return (__m128)wasm_f32x4_demote_f64x2_zero((v128_t)__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)
{
  return (__m128d)wasm_f64x2_promote_low_f32x4((v128_t)__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)
{
  return (__m128d)wasm_f64x2_convert_low_i32x4((v128_t)__a);
}

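// Scalar emulation of CVTPD2DQ: out-of-range and NaN inputs produce the x86
// "integer indefinite" value 0x80000000, and the upper two result lanes are zero.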
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)
{
  // TODO: OPTIMIZE!
  int m[2];
  for(int i = 0; i < 2; ++i)
  {
    double e = __a[i];
    int x = lrint(e);
    if (e <= INT_MAX && e >= INT_MIN && (x != 0 || fabs(e) < 2.0))
      m[i] = x;
    else
      m[i] = (int)0x80000000;
  }
  return wasm_i32x4_make(m[0], m[1], 0, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)
{
  // TODO: OPTIMIZE!
  double e = __a[0];
  int x = lrint(e);
  if (e <= INT_MAX && e >= INT_MIN && (x != 0 || fabs(e) < 2.0))
    return x;
  else
    return (int)0x80000000;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)
{
  // TODO: OPTIMIZE!
  int m[2];
  for(int i = 0; i < 2; ++i)
  {
    double elem = __a[i];
    if (elem < 2147483648.0 && elem >= -2147483648.0 && (lrint(elem) != 0 || fabs(elem) < 2.0))
      // Use the trapping instruction here since we have explicit bounds checks
      // above.
      m[i] = __builtin_wasm_trunc_s_i32_f64(elem);
    else
      m[i] = (int)0x80000000;
  }
  return wasm_i32x4_make(m[0], m[1], 0, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)
{
  // TODO: OPTIMIZE!
  double elem = __a[0];
  if (elem < 2147483648.0 && elem >= -2147483648.0 && (lrint(elem) != 0 || fabs(elem) < 2.0))
    // Use the trapping instruction here since we have explicit bounds checks
    // above.
    return __builtin_wasm_trunc_s_i32_f64(elem);
  else
    return (int)0x80000000;
}

static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *__dp)
{
  return *(__m128d*)__dp;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *__dp)
{
  return (__m128d)wasm_v64x2_load_splat(__dp);
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *__p)
{
  __m128d __u = *(__m128d*)__p; // aligned load
  return (__m128d)wasm_i64x2_shuffle((v128_t)__u, (v128_t)__u, 1, 0);
}

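// UB-free unaligned load through a packed, may_alias wrapper struct (the same
// technique wasm_simd128.h uses for its unaligned accesses).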
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *__dp)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__dp)->__v;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *__p)
{
  return (__m128d)wasm_v128_load64_zero((const void*)__p);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  return (__m128d)wasm_v128_load64_lane((const void*)__dp, (v128_t)__a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  return (__m128d)wasm_v128_load64_lane((const void*)__dp, (v128_t)__a, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)
{
  return (__m128d)wasm_f64x2_make(__w, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)
{
  return (__m128d)wasm_f64x2_splat(__w);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __c1, double __c0)
{
  return (__m128d)wasm_f64x2_make(__c0, __c1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __c0, double __c1)
{
  return (__m128d)wasm_f64x2_make(__c0, __c1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d)wasm_f64x2_const(0.0, 0.0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *__dp, __m128d __a)
{
  wasm_v128_store64_lane((void*)__dp, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *__dp, __m128d __a)
{
  struct __mm_store1_pd_struct {
    double __u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *__dp, __m128d __a)
{
  *(__m128d *)__dp = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *__dp, __m128d __a)
{
  struct __unaligned {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));

  ((struct __unaligned *)__dp)->__v = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *__p, __m128d __a)
{
  *(__m128d *)__p = (__m128d)wasm_i64x2_shuffle((v128_t)__a, (v128_t)__a, 1, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *__dp, __m128d __a)
{
  wasm_v128_store64_lane((void*)__dp, (v128_t)__a, 1);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *__dp, __m128d __a)
{
  wasm_v128_store64_lane((void*)__dp, (v128_t)__a, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i64x2_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u16x8_add_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_avgr((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u16x8_avgr((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_dot_i16x8((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_max((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_max((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_min((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_min((v128_t)__a, (v128_t)__b);
}

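// PMULHW/PMULHUW emulation: widen-multiply into 32-bit products, then gather
// the odd 16-bit halves (the high half of each product) back into one vector.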
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
  const v128_t lo = wasm_i32x4_extmul_low_i16x8((v128_t)__a, (v128_t)__b);
  const v128_t hi = wasm_i32x4_extmul_high_i16x8((v128_t)__a, (v128_t)__b);
  return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
  const v128_t lo = wasm_u32x4_extmul_low_u16x8((v128_t)__a, (v128_t)__b);
  const v128_t hi = wasm_u32x4_extmul_high_u16x8((v128_t)__a, (v128_t)__b);
  return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_mul((v128_t)__a, (v128_t)__b);
}

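// PMULUDQ emulation: pack the even 32-bit lanes of each operand into the low
// half, then do a widening unsigned multiply to get two 64-bit products.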
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u64x2_extmul_low_u32x4(
    wasm_v32x4_shuffle((v128_t)__a, (v128_t)__a, 0, 2, 0, 2),
    wasm_v32x4_shuffle((v128_t)__b, (v128_t)__b, 0, 2, 0, 2));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i64x2_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u16x8_sub_saturate((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_and((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_or((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_v128_xor((v128_t)__b, (v128_t)__a);
}

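// Whole-register byte shift left by __imm bytes (PSLLDQ), emulated with a
// constant i8x16 shuffle against a zero vector; any count of 16 or more
// selects only zero bytes.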
#define _mm_slli_si128(__a, __imm) __extension__ ({ \
  (__m128i)wasm_i8x16_shuffle(_mm_setzero_si128(), \
                              (__a), \
                              ((__imm)&0xF0) ? 0 : 16 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 17 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 18 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 19 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 20 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 21 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 22 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 23 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 24 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 25 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 26 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 27 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 28 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 29 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 30 - ((__imm)&0xF), \
                              ((__imm)&0xF0) ? 0 : 31 - ((__imm)&0xF)); })
#define _mm_bslli_si128(__a, __imm) \
  _mm_slli_si128((__a), (__imm))

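// Per-element shifts: Wasm shift counts are taken modulo the lane width, while
// x86 logical shifts produce zero once the count reaches the lane width, so
// the helpers below range-check the count explicitly.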
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a, int __count)
{
  return (__m128i)((__count < 16) ? wasm_i16x8_shl((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 16) ? wasm_i16x8_shl((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a, int __count)
{
  return (__m128i)((__count < 32) ? wasm_i32x4_shl((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 32) ? wasm_i32x4_shl((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a, int __count)
{
  return (__m128i)((__count < 64) ? wasm_i64x2_shl((v128_t)__a, __count) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 64) ? wasm_i64x2_shl((v128_t)__a, __c) : wasm_i64x2_const(0,0));
}

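// Arithmetic right shifts saturate the count at 15/31 on x86 (lanes fill with
// the sign bit), so clamp the count before handing it to the Wasm shift.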
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a, int __count)
{
  __count = __count < 15 ? __count : 15;
  return (__m128i)wasm_i16x8_shr((v128_t)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  __c = __c < 15 ? __c : 15;
  return (__m128i)wasm_i16x8_shr((v128_t)__a, __c);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a, int __count)
{
  __count = __count < 31 ? __count : 31;
  return (__m128i)wasm_i32x4_shr((v128_t)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  __c = __c < 31 ? __c : 31;
  return (__m128i)wasm_i32x4_shr((v128_t)__a, __c);
}

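// Whole-register byte shift right by __imm bytes (PSRLDQ), again via a
// constant shuffle that pulls in zero bytes for counts of 16 or more.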
#define _mm_srli_si128(__a, __imm) __extension__ ({ \
  (__m128i)wasm_i8x16_shuffle((__a), \
                              _mm_setzero_si128(), \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 0, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 1, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 2, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 3, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 4, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 5, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 6, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 7, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 8, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 9, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 10, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 11, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 12, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 13, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 14, \
                              ((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 15); })

#define _mm_bsrli_si128(__a, __imm) \
  _mm_srli_si128((__a), (__imm))

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 16) ? wasm_u16x8_shr((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 16) ? wasm_u16x8_shr((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 32) ? wasm_u32x4_shr((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 32) ? wasm_u32x4_shr((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a, int __count)
{
  return (__m128i)(((unsigned int)__count < 64) ? wasm_u64x2_shr((v128_t)__a, __count) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a, __m128i __count)
{
  unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];
  return (__m128i)((__c < 64) ? wasm_u64x2_shr((v128_t)__a, __c) : wasm_i64x2_const(0,0));
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a, long long __b)
{
  // TODO: optimize
  union {
    double x[2];
    __m128d m;
  } m;
  m.m = __a;
  m.x[0] = (double)__b;
  return m.m;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d __a)
{
  // TODO: optimize
  double e = __a[0];
  if (isnan(e) || isinf(e)) return 0x8000000000000000LL;
  long long x = llrint(e);
  if (e <= LLONG_MAX && e >= LLONG_MIN && (x != 0 || fabs(e) < 2.f))
    return x;
  else
    return 0x8000000000000000LL;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d __a)
{
  // TODO: optimize
  double e = __a[0];
  if (isnan(e) || isinf(e) || e > LLONG_MAX || e < LLONG_MIN) return 0x8000000000000000LL;
  long long x = llrint(e);
  if (x != 0 || fabs(e) < 2.f)
    // Use the trapping instruction here since we have explicit bounds checks
    // above
    return __builtin_wasm_trunc_s_i64_f64(e);
  else
    return 0x8000000000000000LL;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i __a)
{
  return (__m128)wasm_f32x4_convert_i32x4(__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 __a)
{
  // TODO: optimize
  union {
    int x[4];
    __m128i m;
  } u;
  for(int i = 0; i < 4; ++i)
  {
    double e = __a[i];
    int x = lrint(e);
    if (e <= INT_MAX && e >= INT_MIN && (x != 0 || fabs(e) < 2.0))
      u.x[i] = x;
    else
      u.x[i] = (int)0x80000000;
  }
  return u.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 __a)
{
  // TODO: optimize
  union {
    int x[4];
    __m128i m;
  } u;
  for(int i = 0; i < 4; ++i)
  {
    float e = __a[i];
    if (e < 2147483648.0f && e >= -2147483648.0f && (lrint(e) != 0 || fabs(e) < 2.0))
      // Use the trapping instruction here since we have explicit bounds checks
      // above.
      u.x[i] = __builtin_wasm_trunc_s_i32_f32(e);
    else
      u.x[i] = (int)0x80000000;
  }
  return u.m;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int __a)
{
  return (__m128i)wasm_i32x4_make(__a, 0, 0, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long __a)
{
  return (__m128i)wasm_i64x2_make(__a, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i __a)
{
  return wasm_i32x4_extract_lane(__a, 0);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i __a)
{
  return wasm_i64x2_extract_lane(__a, 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *__p)
{
  return *__p;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *__p)
{
  // UB-free unaligned access copied from wasm_simd128.h
  struct __mm_loadu_si128_struct {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __mm_loadu_si128_struct*)__p)->__v;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si16(void const *__p)
{
  return (__m128i)wasm_v128_load16_lane(__p, wasm_i64x2_const(0, 0), 0);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si32(void const *__p)
{
  return (__m128i)wasm_v128_load32_zero(__p);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si64(void const *__p)
{
  return (__m128i)wasm_v128_load64_zero(__p);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *__p)
{
  return _mm_loadu_si64(__p);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(long long q1, long long q0)
{
  return (__m128i)wasm_i64x2_make(q0, q1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long __q)
{
  return (__m128i)wasm_i64x2_splat(__q);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int __i)
{
  return (__m128i)wasm_i32x4_splat(__i);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short __w)
{
  return (__m128i)wasm_i16x8_splat(__w);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char __b)
{
  return (__m128i)wasm_i8x16_splat(__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
  return wasm_i64x2_const(0, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *__p, __m128i __b)
{
  *__p = __b;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si16(void *__p, __m128i __a)
{
  wasm_v128_store16_lane(__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si32(void *__p, __m128i __a)
{
  wasm_v128_store32_lane(__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si64(void *__p, __m128i __a)
{
  wasm_v128_store64_lane(__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *__p, __m128i __a)
{
  // UB-free unaligned access copied from wasm_simd128.h
  struct __mm_storeu_si128_struct {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeu_si128_struct *)__p)->__v = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
  // TODO: optimize
  union {
    unsigned char x[16];
    __m128i m;
  } mask, data;
  mask.m = __n;
  data.m = __d;
  for(int i = 0; i < 16; ++i)
    if (mask.x[i] & 0x80)
      __p[i] = data.x[i];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  _mm_storeu_si64(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *__p, __m128d __a)
{
  // Emscripten/SIMD.js does not have cache hinting.
  _mm_store_pd(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  // Emscripten/SIMD.js does not have cache hinting.
  _mm_store_si128(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  // No cache hinting available.
  *__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  // No cache hinting available.
  *__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  // Wasm SIMD does not have cache hinting
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  return wasm_i8x16_narrow_i16x8(__a, __b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  return wasm_i16x8_narrow_i32x4(__a, __b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  return wasm_u8x16_narrow_i16x8(__a, __b);
}

#define _mm_extract_epi16(__a, __imm) wasm_u16x8_extract_lane((v128_t)(__a), (__imm) & 7)
#define _mm_insert_epi16(__a, __b, __imm) wasm_i16x8_replace_lane((__a), (__imm) & 7, (__b))

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i __a)
{
  return (int)wasm_i8x16_bitmask((v128_t)__a);
}

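// The 8-bit immediate encodes four 2-bit source-lane selectors (the low two
// bits pick result lane 0). The second shuffle operand is a dummy; all indices
// stay within the 0..3 range of __a.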
#define _mm_shuffle_epi32(__a, __imm) __extension__ ({ \
  (__m128i)wasm_i32x4_shuffle((__a), \
                              _mm_set1_epi32(0), \
                              ((__imm) & 0x3), (((__imm) & 0xc) >> 2), \
                              (((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6)); })

#define _mm_shufflelo_epi16(__a, __imm) __extension__ ({ \
  (__m128i)wasm_i16x8_shuffle((__a), \
                              _mm_set1_epi16(0), \
                              ((__imm) & 0x3), (((__imm) & 0xc) >> 2), \
                              (((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6), \
                              4, 5, 6, 7); })

#define _mm_shufflehi_epi16(__a, __imm) __extension__ ({ \
  (__m128i)wasm_i16x8_shuffle((__a), \
                              _mm_set1_epi16(0), \
                              0, 1, 2, 3, \
                              (4 + (((__imm) & 0x03) >> 0)), \
                              (4 + (((__imm) & 0x0c) >> 2)), \
                              (4 + (((__imm) & 0x30) >> 4)), \
                              (4 + (((__imm) & 0xc0) >> 6))); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_shuffle(__a, __b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_shuffle(__a, __b, 4, 12, 5, 13, 6, 14, 7, 15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_shuffle(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i64x2_shuffle(__a, __b, 1, 3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_shuffle(__a, __b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_shuffle(__a, __b, 0, 8, 1, 9, 2, 10, 3, 11);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i32x4_shuffle(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i64x2_shuffle(__a, __b, 0, 2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i __a)
{
  return wasm_i64x2_shuffle(__a, wasm_i64x2_const(0, 0), 0, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_i64x2_shuffle((v128_t)__a, (v128_t)__b, 1, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return (__m128d)wasm_i64x2_shuffle((v128_t)__a, (v128_t)__b, 0, 2);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d __a)
{
  return (int)wasm_i64x2_bitmask((v128_t)__a);
}

#define _mm_shuffle_pd(__a, __b, __i) __extension__ ({ \
  (__m128d) __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), \
                                    (__i) & 1, \
                                    (((__i) & 2) >> 1) + 2); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __a)
{
  return (__m128)__a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d __a)
{
  return (__m128i)__a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 __a)
{
  return (__m128d)__a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 __a)
{
  return (__m128i)__a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i __a)
{
  return (__m128)__a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i __a)
{
  return (__m128d)__a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  // No pause/wait instruction in Wasm/SIMD.
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_undefined_pd()
{
  __m128d val;
  return val;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_undefined_si128()
{
  __m128i val;
  return val;
}

// Must be in the very end as it uses other SSE2 intrinsics
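// PSADBW emulation: per-byte absolute differences (two saturating subtractions
// OR'ed together), followed by a horizontal reduction so that the low 16 bits
// of each 64-bit lane hold the sum of that lane's eight differences.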
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a, __m128i __b)
{
  __m128i __diff = _mm_or_si128(_mm_subs_epu8(__a, __b),
                                _mm_subs_epu8(__b, __a));
  __diff = _mm_add_epi16(_mm_srli_epi16(__diff, 8),
                         _mm_and_si128(__diff, _mm_set1_epi16(0x00FF)));
  __diff = _mm_add_epi16(__diff, _mm_slli_epi32(__diff, 16));
  __diff = _mm_add_epi16(__diff, _mm_slli_epi64(__diff, 32));
  return _mm_srli_epi64(__diff, 48);
}

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __emscripten_emmintrin_h__ */