GitHub Repository: emscripten-core/emscripten
Path: blob/main/system/include/compat/xmmintrin.h
/*
 * Copyright 2020 The Emscripten Authors. All rights reserved.
 * Emscripten is available under two separate licenses, the MIT license and the
 * University of Illinois/NCSA Open Source License. Both these licenses can be
 * found in the LICENSE file.
 */
#ifndef __emscripten_xmmintrin_h__
#define __emscripten_xmmintrin_h__

#include <wasm_simd128.h>

#include <limits.h>
#include <math.h>
#include <string.h>

#ifndef __SSE__
#error "SSE instruction set not enabled"
#endif

#ifdef WASM_SIMD_COMPAT_SLOW
#define DIAGNOSE_SLOW diagnose_if(1, "Instruction emulated via slow path.", "warning")
#else
#define DIAGNOSE_SLOW
#endif
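
// Note (not from the original header): the DIAGNOSE_SLOW attribute above is
// opt-in. A build can surface these warnings with something along the lines of
//
//   emcc -msimd128 -DWASM_SIMD_COMPAT_SLOW my_sse_port.c
//
// (my_sse_port.c is a hypothetical file name), so that each call to an
// intrinsic emulated via a slow path is flagged at compile time.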

// Emscripten SIMD support doesn't support MMX/float32x2/__m64.
// However, we support loading and storing 2-vectors, so
// recognize the type at least.
typedef float __m64 __attribute__((__vector_size__(8), __aligned__(8)));
typedef __f32x4 __m128;
typedef v128_t __m128i;
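
// Illustrative sketch (an assumption, not part of the original header): since
// __m64 is only recognized for 2-wide loads and stores, typical usage pairs it
// with _mm_loadl_pi/_mm_storel_pi (declared further below) rather than with
// any MMX arithmetic:
//
//   float pair[2] = {1.0f, 2.0f};
//   __m128 v = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)pair); // {1, 2, 0, 0}
//   _mm_storel_pi((__m64 *)pair, v);                                // store lanes 0-1 back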

#define __f32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                 \
  ((v128_t)(__builtin_shufflevector((__f32x4)(__a), (__f32x4)(__b), __c0, \
                                    __c1, __c2, __c3)))

// This is defined as a macro because __builtin_shufflevector requires its
// mask argument to be a compile-time constant.
#define _mm_shuffle_ps(__a, __b, __mask) __extension__ ({ \
  ((__m128)__f32x4_shuffle(__a, __b,                      \
                           (((__mask) >> 0) & 0x3) + 0,   \
                           (((__mask) >> 2) & 0x3) + 0,   \
                           (((__mask) >> 4) & 0x3) + 4,   \
                           (((__mask) >> 6) & 0x3) + 4)); })
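
// Usage sketch (illustrative, not part of the original header): the mask packs
// four 2-bit lane indices; lanes 0-1 of the result come from __a and lanes 2-3
// from __b. With the _MM_SHUFFLE(w, z, y, x) helper defined further below:
//
//   __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
//   __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)); // r = {2, 3, 4, 5}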

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float __z, float __y, float __x, float __w)
{
  return (__m128)wasm_f32x4_make(__w, __x, __y, __z);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
  return (__m128)wasm_f32x4_make(__z, __y, __x, __w);
}
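
// Illustrative note (not in the original header): _mm_set_ps takes arguments
// from the highest lane down, while _mm_setr_ps takes them in lane (memory)
// order, matching the x86 intrinsics:
//
//   __m128 v1 = _mm_set_ps(3.f, 2.f, 1.f, 0.f);  // lanes {0, 1, 2, 3}
//   __m128 v2 = _mm_setr_ps(0.f, 1.f, 2.f, 3.f); // lanes {0, 1, 2, 3}, same value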

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float __w)
{
  return (__m128)wasm_f32x4_make(__w, 0, 0, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float __w)
{
  return (__m128)wasm_f32x4_splat(__w);
}

#define _mm_set1_ps _mm_set_ps1

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128)wasm_f32x4_const(0.f, 0.f, 0.f, 0.f);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *__p)
{
  return *(__m128*)__p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_loadl_pi(__m128 __a, const void /*__m64*/ *__p)
{
  return (__m128)wasm_v128_load64_lane(__p, (v128_t)__a, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_loadh_pi(__m128 __a, const void /*__m64*/ *__p)
{
  return (__m128)wasm_v128_load64_lane(__p, (v128_t)__a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *__p)
{
  __m128 __v = _mm_load_ps(__p);
  return (__m128)__f32x4_shuffle(__v, __v, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *__p)
{
  return (__m128)wasm_v128_load(__p);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps1(const float *__p)
{
  return (__m128)wasm_v32x4_load_splat(__p);
}
#define _mm_load1_ps _mm_load_ps1

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_load_ss(const float *__p)
{
  return (__m128)wasm_v128_load32_zero(__p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_storel_pi(__m64 *__p, __m128 __a)
{
  wasm_v128_store64_lane((void*)__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  wasm_v128_store64_lane((void*)__p, (v128_t)__a, 1);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *__p, __m128 __a)
{
  *(__m128 *)__p = __a;
}
// No NTA cache hint available.
#define _mm_stream_ps _mm_store_ps

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0
// No prefetch available, dummy it out.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_prefetch(const void *__p, int __i)
{
  ((void)__p);
  ((void)__i);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  // Wasm/SharedArrayBuffer memory model is sequentially consistent.
  // Perhaps a future version of the spec can provide a related fence.
  __sync_synchronize();
}

#define _MM_SHUFFLE(w, z, y, x) (((w) << 6) | ((z) << 4) | ((y) << 2) | (x))

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *__p, __m128 __a)
{
  _mm_store_ps(__p, _mm_shuffle_ps(__a, __a, _MM_SHUFFLE(0, 1, 2, 3)));
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *__p, __m128 __a)
{
  _mm_store_ps(__p, _mm_shuffle_ps(__a, __a, _MM_SHUFFLE(0, 0, 0, 0)));
}
#define _mm_store1_ps _mm_store_ps1

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *__p, __m128 __a)
{
  wasm_v128_store32_lane((void*)__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *__p, __m128 __a)
{
  struct __unaligned {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __unaligned *)__p)->__v = __a;
}
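
// Illustrative note (not in the original header): as on x86, _mm_load_ps and
// _mm_store_ps expect a 16-byte aligned address, while _mm_loadu_ps and
// _mm_storeu_ps above tolerate arbitrary alignment:
//
//   float buf[5] = {0.f, 1.f, 2.f, 3.f, 4.f};
//   __m128 v = _mm_loadu_ps(buf + 1); // unaligned load: {1, 2, 3, 4}
//   _mm_storeu_ps(buf + 1, v);        // unaligned store back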

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 __a)
{
  return (int)wasm_i32x4_bitmask((v128_t)__a);
}
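
// Usage sketch (illustrative, not part of the original header; x and y stand
// for arbitrary __m128 values): the returned mask has bit i set when lane i of
// __a has its sign bit set, which is how SSE code typically tests comparison
// results:
//
//   __m128 m = _mm_cmplt_ps(x, y); // defined further below
//   if (_mm_movemask_ps(m) == 0xF) {
//     /* every lane of x is less than the corresponding lane of y */
//   }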

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_add_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_sub_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_mul((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_mul_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_div((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_div_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 __a, __m128 __b)
{
  // return (__m128)wasm_f32x4_pmin((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f32x4_lt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_min_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 __a, __m128 __b)
{
  // return (__m128)wasm_f32x4_pmax((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f32x4_gt((v128_t)__a, (v128_t)__b));
}
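
// Note (an observation, not from the original header): the bitselect fallback
// used by _mm_min_ps/_mm_max_ps mirrors x86 MINPS/MAXPS NaN behaviour: the
// comparison is false when either lane is NaN, so the lane from __b is
// returned. For example (NAN comes from <math.h>, included above):
//
//   _mm_min_ps(_mm_set1_ps(NAN), _mm_set1_ps(1.f)); // every lane is 1.0f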

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_max_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_rcp_ps(__m128 __a)
{
  return (__m128)wasm_f32x4_div((v128_t)_mm_set1_ps(1.0f), (v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_rcp_ss(__m128 __a)
{
  return _mm_move_ss(__a, _mm_rcp_ps(__a));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 __a)
{
  return (__m128)wasm_f32x4_sqrt((v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 __a)
{
  return _mm_move_ss(__a, _mm_sqrt_ps(__a));
}

#define _mm_rsqrt_ps(__a) _mm_rcp_ps(_mm_sqrt_ps((__a)))

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 __a)
{
  return _mm_move_ss(__a, _mm_rsqrt_ps(__a));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 0, 1, 4, 5);
}

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)      \
  do {                                                 \
    __m128 __row0 = (row0);                            \
    __m128 __row1 = (row1);                            \
    __m128 __row2 = (row2);                            \
    __m128 __row3 = (row3);                            \
    __m128 __tmp0 = _mm_unpacklo_ps(__row0, __row1);   \
    __m128 __tmp1 = _mm_unpackhi_ps(__row0, __row1);   \
    __m128 __tmp2 = _mm_unpacklo_ps(__row2, __row3);   \
    __m128 __tmp3 = _mm_unpackhi_ps(__row2, __row3);   \
    (row0) = _mm_movelh_ps(__tmp0, __tmp2);            \
    (row1) = _mm_movehl_ps(__tmp2, __tmp0);            \
    (row2) = _mm_movelh_ps(__tmp1, __tmp3);            \
    (row3) = _mm_movehl_ps(__tmp3, __tmp1);            \
  } while (0)
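
// Usage sketch (illustrative, not part of the original header): transposing a
// 4x4 matrix stored as four row vectors:
//
//   __m128 r0 = _mm_setr_ps( 0.f,  1.f,  2.f,  3.f);
//   __m128 r1 = _mm_setr_ps( 4.f,  5.f,  6.f,  7.f);
//   __m128 r2 = _mm_setr_ps( 8.f,  9.f, 10.f, 11.f);
//   __m128 r3 = _mm_setr_ps(12.f, 13.f, 14.f, 15.f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   // r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13},
//   // r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}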

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmplt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_le((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmple_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpeq_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_ge((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpge_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpgt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) _mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_and(wasm_f32x4_eq((v128_t)__a, (v128_t)__a),
                               wasm_f32x4_eq((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) _mm_cmpord_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpord_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) _mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_or(wasm_f32x4_ne((v128_t)__a, (v128_t)__a),
                              wasm_f32x4_ne((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) _mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpunord_ps(__a, __b));
}
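
// Note (an observation, not from the original header): "ordered" means neither
// operand is NaN and "unordered" means at least one is, so for example:
//
//   _mm_cmpord_ps(_mm_set1_ps(1.f), _mm_set1_ps(NAN));   // all lanes zero
//   _mm_cmpunord_ps(_mm_set1_ps(1.f), _mm_set1_ps(NAN)); // all lanes all-ones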

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_and((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_or((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_xor((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_ne((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpneq_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmpge_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpnge_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmpgt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpngt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmple_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpnle_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmplt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpnlt_ps(__a, __b));
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) == wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) >= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) > wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) <= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) < wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) != wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) == wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) >= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) > wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) <= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) < wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) != wasm_f32x4_extract_lane((v128_t)__b, 0);
}
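
// Note (an observation, not from the original header): on x86 the _mm_comi*_ss
// family differs from _mm_ucomi*_ss only in how quiet NaN operands are reported
// through the floating-point exception flags; Wasm exposes no such flags, so
// both families above reduce to the same plain scalar comparison of lane 0.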

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtsi32_ss(__m128 __a, int __b)
{
  __f32x4 __v = (__f32x4)__a;
  __v[0] = (float)__b;
  return (__m128)__v;
}
#define _mm_cvt_si2ss _mm_cvtsi32_ss

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) _mm_cvtss_si32(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  if (e < 2147483648.0f && e >= -2147483648.0f && (lrint(e) != 0 || fabsf(e) < 2.f))
    return lrint(e);
  else
    return (int)0x80000000;
}
#define _mm_cvt_ss2si _mm_cvtss_si32

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) _mm_cvttss_si32(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  if (e < 2147483648.0f && e >= -2147483648.0f && (lrint(e) != 0 || fabsf(e) < 2.f))
    return (int)e;
  else
    return (int)0x80000000;
}
#define _mm_cvtt_ss2si _mm_cvttss_si32
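
// Usage sketch (illustrative, not part of the original header): _mm_cvtss_si32
// rounds to nearest with ties to even (the rounding mode Wasm always uses),
// while _mm_cvttss_si32 truncates; out-of-range values return 0x80000000,
// matching the x86 "integer indefinite" result:
//
//   _mm_cvtss_si32(_mm_set_ss(2.5f));  // 2 (round to nearest even)
//   _mm_cvttss_si32(_mm_set_ss(2.9f)); // 2 (truncated)
//   _mm_cvtss_si32(_mm_set_ss(3e9f));  // 0x80000000 (out of range)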

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
  __f32x4 __v = (__f32x4)__a;
  __v[0] = (float)__b;
  return (__m128)__v;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtss_si64(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  long long x = llrintf(e);
  if (e <= LLONG_MAX && e >= LLONG_MIN && (x != 0 || fabsf(e) < 2.f))
    return x;
  else
    return 0x8000000000000000LL;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvttss_si64(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  long long x = llrintf(e);
  if (e <= LLONG_MAX && e >= LLONG_MIN && (x != 0 || fabsf(e) < 2.f))
    return (long long)e;
  else
    return 0x8000000000000000LL;
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 __a)
{
  return (float)((__f32x4)__a)[0];
}

#define _mm_malloc(__size, __align) memalign((__align), (__size))
#define _mm_free free
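
// Usage sketch (illustrative, not part of the original header): memalign is
// declared in <malloc.h> on Emscripten/musl, so callers of _mm_malloc typically
// need that declaration in scope:
//
//   #include <malloc.h>
//   float *buf = (float *)_mm_malloc(64 * sizeof(float), 16); // 16-byte aligned
//   _mm_free(buf);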

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_undefined()
{
  __m128 val;
  return val;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_undefined_ps()
{
  __m128 val;
  return val;
}

#define _MM_EXCEPT_MASK 0x003f
#define _MM_EXCEPT_INVALID 0x0001
#define _MM_EXCEPT_DENORM 0x0002
#define _MM_EXCEPT_DIV_ZERO 0x0004
#define _MM_EXCEPT_OVERFLOW 0x0008
#define _MM_EXCEPT_UNDERFLOW 0x0010
#define _MM_EXCEPT_INEXACT 0x0020

#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_INVALID 0x0080
#define _MM_MASK_DENORM 0x0100
#define _MM_MASK_DIV_ZERO 0x0200
#define _MM_MASK_OVERFLOW 0x0400
#define _MM_MASK_UNDERFLOW 0x0800
#define _MM_MASK_INEXACT 0x1000

#define _MM_ROUND_MASK 0x6000
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr()
{
  return _MM_MASK_INEXACT | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INVALID
    | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_OFF;
}

#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
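
// Note (an observation, not from the original header): because Wasm has no
// MXCSR equivalent, _mm_getcsr() returns a fixed value describing the actual
// environment (all exceptions masked, round-to-nearest, flush-to-zero off), so
// for example:
//
//   _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST;     // always true
//   _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF;  // always true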

// Unavailable functions:
// void _MM_SET_EXCEPTION_STATE(unsigned int __a);
// void _MM_SET_EXCEPTION_MASK(unsigned int __a);
// void _MM_SET_ROUNDING_MODE(unsigned int __a);
// void _MM_SET_FLUSH_ZERO_MODE(unsigned int __a);

#endif /* __emscripten_xmmintrin_h__ */