Path: blob/main/system/include/compat/xmmintrin.h
/*
 * Copyright 2020 The Emscripten Authors. All rights reserved.
 * Emscripten is available under two separate licenses, the MIT license and the
 * University of Illinois/NCSA Open Source License. Both these licenses can be
 * found in the LICENSE file.
 */
#ifndef __emscripten_xmmintrin_h__
#define __emscripten_xmmintrin_h__

#include <wasm_simd128.h>

#include <limits.h>
#include <math.h>
#include <string.h>

#ifndef __SSE__
#error "SSE instruction set not enabled"
#endif

#ifdef WASM_SIMD_COMPAT_SLOW
#define DIAGNOSE_SLOW diagnose_if(1, "Instruction emulated via slow path.", "warning")
#else
#define DIAGNOSE_SLOW
#endif

// Emscripten SIMD doesn't support MMX/float32x2/__m64. However, we do
// support loading and storing 2-vectors, so recognize the type at least.
typedef float __m64 __attribute__((__vector_size__(8), __aligned__(8)));
typedef __f32x4 __m128;
typedef v128_t __m128i;

#define __f32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                 \
  ((v128_t)(__builtin_shufflevector((__f32x4)(__a), (__f32x4)(__b), __c0, \
                                    __c1, __c2, __c3)))

// This is defined as a macro because __builtin_shufflevector requires its
// mask argument to be a compile-time constant.
#define _mm_shuffle_ps(__a, __b, __mask) __extension__ ({ \
  ((__m128)__f32x4_shuffle(__a, __b,                      \
                           (((__mask) >> 0) & 0x3) + 0,   \
                           (((__mask) >> 2) & 0x3) + 0,   \
                           (((__mask) >> 4) & 0x3) + 4,   \
                           (((__mask) >> 6) & 0x3) + 4)); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float __z, float __y, float __x, float __w)
{
  return (__m128)wasm_f32x4_make(__w, __x, __y, __z);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
  return (__m128)wasm_f32x4_make(__z, __y, __x, __w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float __w)
{
  return (__m128)wasm_f32x4_make(__w, 0, 0, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float __w)
{
  return (__m128)wasm_f32x4_splat(__w);
}

#define _mm_set1_ps _mm_set_ps1

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128)wasm_f32x4_const(0.f, 0.f, 0.f, 0.f);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *__p)
{
  return *(__m128*)__p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_loadl_pi(__m128 __a, const void /*__m64*/ *__p)
{
  return (__m128)wasm_v128_load64_lane(__p, (v128_t)__a, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_loadh_pi(__m128 __a, const void /*__m64*/ *__p)
{
  return (__m128)wasm_v128_load64_lane(__p, (v128_t)__a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *__p)
{
  __m128 __v = _mm_load_ps(__p);
  return (__m128)__f32x4_shuffle(__v, __v, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *__p)
{
  return (__m128)wasm_v128_load(__p);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps1(const float *__p)
{
  return (__m128)wasm_v32x4_load_splat(__p);
}
#define _mm_load1_ps _mm_load_ps1
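// Note (added): _mm_load_ps dereferences __p through the 16-byte-aligned
// __m128 type, so the usual x86 contract (16-byte-aligned pointer) still
// applies at the C level, while _mm_loadu_ps lowers to wasm_v128_load and
// accepts any alignment. A minimal sketch:
//
//   float __attribute__((aligned(16))) buf[4] = {1.f, 2.f, 3.f, 4.f};
//   __m128 v  = _mm_load_ps(buf);  // requires the alignment above
//   __m128 vu = _mm_loadu_ps(buf); // safe for arbitrarily aligned data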

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_load_ss(const float *__p)
{
  return (__m128)wasm_v128_load32_zero(__p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_storel_pi(__m64 *__p, __m128 __a)
{
  wasm_v128_store64_lane((void*)__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  wasm_v128_store64_lane((void*)__p, (v128_t)__a, 1);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *__p, __m128 __a)
{
  *(__m128 *)__p = __a;
}
// No NTA cache hint available.
#define _mm_stream_ps _mm_store_ps

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0
// No prefetch available, dummy it out.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_prefetch(const void *__p, int __i)
{
  ((void)__p);
  ((void)__i);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  // The Wasm/SharedArrayBuffer memory model is sequentially consistent.
  // Perhaps a future version of the spec will provide a more precise fence.
  __sync_synchronize();
}

#define _MM_SHUFFLE(w, z, y, x) (((w) << 6) | ((z) << 4) | ((y) << 2) | (x))

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *__p, __m128 __a)
{
  _mm_store_ps(__p, _mm_shuffle_ps(__a, __a, _MM_SHUFFLE(0, 1, 2, 3)));
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *__p, __m128 __a)
{
  _mm_store_ps(__p, _mm_shuffle_ps(__a, __a, _MM_SHUFFLE(0, 0, 0, 0)));
}
#define _mm_store1_ps _mm_store_ps1

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *__p, __m128 __a)
{
  wasm_v128_store32_lane((void*)__p, (v128_t)__a, 0);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *__p, __m128 __a)
{
  struct __unaligned {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __unaligned *)__p)->__v = __a;
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 __a)
{
  return (int)wasm_i32x4_bitmask((v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_add((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_add_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_sub((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_sub_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_mul((v128_t)__a, (v128_t)__b);
}
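// Usage sketch (added, illustrative): _mm_shuffle_ps with a _MM_SHUFFLE
// mask. The two low 2-bit fields select lanes from the first operand, the
// two high fields from the second, exactly as on x86:
//
//   __m128 a = _mm_set_ps(3.f, 2.f, 1.f, 0.f); // lanes {0, 1, 2, 3}
//   __m128 b = _mm_set_ps(7.f, 6.f, 5.f, 4.f); // lanes {4, 5, 6, 7}
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r = { a[0], a[1], b[2], b[3] } = { 0, 1, 6, 7 }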

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_mul_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_div((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_div_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 __a, __m128 __b)
{
  // return (__m128)wasm_f32x4_pmin((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f32x4_lt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_min_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 __a, __m128 __b)
{
  // return (__m128)wasm_f32x4_pmax((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs
  return (__m128)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f32x4_gt((v128_t)__a, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_max_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_rcp_ps(__m128 __a)
{
  return (__m128)wasm_f32x4_div((v128_t)_mm_set1_ps(1.0f), (v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_rcp_ss(__m128 __a)
{
  return _mm_move_ss(__a, _mm_rcp_ps(__a));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 __a)
{
  return (__m128)wasm_f32x4_sqrt((v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 __a)
{
  return _mm_move_ss(__a, _mm_sqrt_ps(__a));
}

#define _mm_rsqrt_ps(__a) _mm_rcp_ps(_mm_sqrt_ps((__a)))

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 __a)
{
  return _mm_move_ss(__a, _mm_rsqrt_ps(__a));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  return (__m128)__f32x4_shuffle(__a, __b, 0, 1, 4, 5);
}

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)    \
  do {                                               \
    __m128 __row0 = (row0);                          \
    __m128 __row1 = (row1);                          \
    __m128 __row2 = (row2);                          \
    __m128 __row3 = (row3);                          \
    __m128 __tmp0 = _mm_unpacklo_ps(__row0, __row1); \
    __m128 __tmp1 = _mm_unpackhi_ps(__row0, __row1); \
    __m128 __tmp2 = _mm_unpacklo_ps(__row2, __row3); \
    __m128 __tmp3 = _mm_unpackhi_ps(__row2, __row3); \
    (row0) = _mm_movelh_ps(__tmp0, __tmp2);          \
    (row1) = _mm_movehl_ps(__tmp2, __tmp0);          \
    (row2) = _mm_movelh_ps(__tmp1, __tmp3);          \
    (row3) = _mm_movehl_ps(__tmp3, __tmp1);          \
  } while (0)
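// Usage sketch (added, illustrative): transposing a row-major 4x4 matrix
// held in four registers; after the macro, each row holds one column of the
// original matrix.
//
//   __m128 r0 = _mm_loadu_ps(&m[0]);   // m is a float[16]
//   __m128 r1 = _mm_loadu_ps(&m[4]);
//   __m128 r2 = _mm_loadu_ps(&m[8]);
//   __m128 r3 = _mm_loadu_ps(&m[12]);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);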

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_lt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmplt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_le((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmple_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_eq((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpeq_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_ge((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpge_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_gt((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpgt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_and(wasm_f32x4_eq((v128_t)__a, (v128_t)__a),
                               wasm_f32x4_eq((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cmpord_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpord_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_or(wasm_f32x4_ne((v128_t)__a, (v128_t)__a),
                              wasm_f32x4_ne((v128_t)__b, (v128_t)__b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpunord_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_and((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_andnot((v128_t)__b, (v128_t)__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_or((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_xor((v128_t)__a, (v128_t)__b);
}
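// Illustrative sketch (added): the classic SSE1 branchless select, built
// from the logic ops above. Lanes where __mask is all-ones come from __a,
// the rest from __b (__select_ps is a hypothetical helper, not part of the
// SSE API):
//
//   static __inline__ __m128
//   __select_ps(__m128 __mask, __m128 __a, __m128 __b)
//   {
//     return _mm_or_ps(_mm_and_ps(__mask, __a), _mm_andnot_ps(__mask, __b));
//   }
//
//   // e.g. clamp negatives: __select_ps(_mm_cmplt_ps(x, zero), zero, x)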

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_ne((v128_t)__a, (v128_t)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpneq_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmpge_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpnge_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmpgt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpngt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmple_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpnle_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_v128_not((v128_t)_mm_cmplt_ps(__a, __b));
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  return _mm_move_ss(__a, _mm_cmpnlt_ps(__a, __b));
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) == wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) >= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) > wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) <= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) < wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) != wasm_f32x4_extract_lane((v128_t)__b, 0);
}
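// Note (added): on x86 the comi* and ucomi* families differ only in
// exception signaling (COMISS raises the invalid-operation exception for any
// NaN operand, UCOMISS only for signaling NaNs). Wasm has no floating-point
// exceptions, so the ucomi* variants below are implemented identically to
// the comi* variants above: plain C scalar comparisons, which are false for
// NaN operands (true for !=), matching the unordered compare results.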

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) == wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) >= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) > wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) <= wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) < wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) != wasm_f32x4_extract_lane((v128_t)__b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtsi32_ss(__m128 __a, int __b)
{
  __f32x4 __v = (__f32x4)__a;
  __v[0] = (float)__b;
  return (__m128)__v;
}
#define _mm_cvt_si2ss _mm_cvtsi32_ss

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtss_si32(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  if (e < 2147483648.0f && e >= -2147483648.0f && (lrint(e) != 0 || fabsf(e) < 2.f))
    return lrint(e);
  else
    return (int)0x80000000;
}
#define _mm_cvt_ss2si _mm_cvtss_si32

static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvttss_si32(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  if (e < 2147483648.0f && e >= -2147483648.0f && (lrint(e) != 0 || fabsf(e) < 2.f))
    return (int)e;
  else
    return (int)0x80000000;
}
#define _mm_cvtt_ss2si _mm_cvttss_si32

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
  __f32x4 __v = (__f32x4)__a;
  __v[0] = (float)__b;
  return (__m128)__v;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvtss_si64(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  long long x = llrintf(e);
  if (e <= LLONG_MAX && e >= LLONG_MIN && (x != 0 || fabsf(e) < 2.f))
    return x;
  else
    return 0x8000000000000000LL;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_cvttss_si64(__m128 __a)
{
  float e = ((__f32x4)__a)[0];
  long long x = llrintf(e);
  if (e <= LLONG_MAX && e >= LLONG_MIN && (x != 0 || fabsf(e) < 2.f))
    return (long long)e;
  else
    return 0x8000000000000000LL;
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 __a)
{
  return (float)((__f32x4)__a)[0];
}
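// Note (added): x86 CVTSS2SI returns the "integer indefinite" value
// 0x80000000 (0x8000000000000000 for the 64-bit forms) for NaN and
// out-of-range inputs; the conversions above detect those cases explicitly
// and return the same sentinel. A minimal sketch:
//
//   int r = _mm_cvtss_si32(_mm_set_ss(NAN));   // r == (int)0x80000000
//   int s = _mm_cvtss_si32(_mm_set_ss(1e10f)); // out of range, same sentinel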

#define _mm_malloc(__size, __align) memalign((__align), (__size))
#define _mm_free free

// Deliberately returns an uninitialized value.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_undefined()
{
  __m128 val;
  return val;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_undefined_ps()
{
  __m128 val;
  return val;
}

#define _MM_EXCEPT_MASK      0x003f
#define _MM_EXCEPT_INVALID   0x0001
#define _MM_EXCEPT_DENORM    0x0002
#define _MM_EXCEPT_DIV_ZERO  0x0004
#define _MM_EXCEPT_OVERFLOW  0x0008
#define _MM_EXCEPT_UNDERFLOW 0x0010
#define _MM_EXCEPT_INEXACT   0x0020

#define _MM_MASK_MASK      0x1f80
#define _MM_MASK_INVALID   0x0080
#define _MM_MASK_DENORM    0x0100
#define _MM_MASK_DIV_ZERO  0x0200
#define _MM_MASK_OVERFLOW  0x0400
#define _MM_MASK_UNDERFLOW 0x0800
#define _MM_MASK_INEXACT   0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON   0x8000
#define _MM_FLUSH_ZERO_OFF  0x0000

// Wasm exposes no control/status register, so report a fixed MXCSR value:
// all exceptions masked, round to nearest, flush-to-zero off.
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr()
{
  return _MM_MASK_INEXACT | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO
       | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INVALID
       | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_OFF;
}

#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_EXCEPTION_MASK()  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_ROUNDING_MODE()   (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)

// Unavailable functions:
// void _MM_SET_EXCEPTION_STATE(unsigned int __a);
// void _MM_SET_EXCEPTION_MASK(unsigned int __a);
// void _MM_SET_ROUNDING_MODE(unsigned int __a);
// void _MM_SET_FLUSH_ZERO_MODE(unsigned int __a);

#endif /* __emscripten_xmmintrin_h__ */