Path: blob/main/system/include/compat/emmintrin.h
6175 views
/*1* Copyright 2020 The Emscripten Authors. All rights reserved.2* Emscripten is available under two separate licenses, the MIT license and the3* University of Illinois/NCSA Open Source License. Both these licenses can be4* found in the LICENSE file.5*/6#ifndef __emscripten_emmintrin_h__7#define __emscripten_emmintrin_h__89#ifndef __SSE2__10#error "SSE2 instruction set not enabled"11#endif1213#include <xmmintrin.h>1415// Alias different (functionally) equivalent intrinsics.16#define _mm_set_epi64x _mm_set_epi6417#define _mm_cvtsd_si64x _mm_cvtsd_si6418#define _mm_cvtsi128_si64x _mm_cvtsi128_si6419#define _mm_cvtsi64x_sd _mm_cvtsi64_sd20#define _mm_cvtsi64x_si128 _mm_cvtsi64_si12821#define _mm_cvttsd_si64x _mm_cvttsd_si6422#define _mm_store_pd1 _mm_store1_pd2324typedef __f64x2 __m128d;2526static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))27_mm_move_sd(__m128d __a, __m128d __b)28{29return (__m128d){ __b[0], __a[1] };30}3132static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))33_mm_add_pd(__m128d __a, __m128d __b)34{35return (__m128d)wasm_f64x2_add((v128_t)__a, (v128_t)__b);36}3738static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))39_mm_add_sd(__m128d __a, __m128d __b)40{41return _mm_move_sd(__a, _mm_add_pd(__a, __b));42}4344static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))45_mm_sub_pd(__m128d __a, __m128d __b)46{47return (__m128d)wasm_f64x2_sub((v128_t)__a, (v128_t)__b);48}4950static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))51_mm_sub_sd(__m128d __a, __m128d __b)52{53return _mm_move_sd(__a, _mm_sub_pd(__a, __b));54}5556static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))57_mm_mul_pd(__m128d __a, __m128d __b)58{59return (__m128d)wasm_f64x2_mul((v128_t)__a, (v128_t)__b);60}6162static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))63_mm_mul_sd(__m128d __a, __m128d __b)64{65return _mm_move_sd(__a, _mm_mul_pd(__a, __b));66}6768static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))69_mm_div_pd(__m128d __a, __m128d __b)70{71return (__m128d)wasm_f64x2_div((v128_t)__a, (v128_t)__b);72}7374static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))75_mm_div_sd(__m128d __a, __m128d __b)76{77return _mm_move_sd(__a, _mm_div_pd(__a, __b));78}7980static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))81_mm_sqrt_pd(__m128d __a)82{83return (__m128d)wasm_f64x2_sqrt((v128_t)__a);84}8586static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))87_mm_sqrt_sd(__m128d __a, __m128d __b)88{89return _mm_move_sd(__a, _mm_sqrt_pd(__b));90}9192static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))93_mm_min_pd(__m128d __a, __m128d __b)94{95// return (__m128d)wasm_f32x4_pmin((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs96return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_lt((v128_t)__a, (v128_t)__b));97}9899static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))100_mm_min_sd(__m128d __a, __m128d __b)101{102return _mm_move_sd(__a, _mm_min_pd(__a, __b));103}104105static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))106_mm_max_pd(__m128d __a, __m128d __b)107{108// return (__m128)wasm_f32x4_pmax((v128_t)__a, (v128_t)__b); // TODO: Migrate to this, once it works in VMs109return (__m128d)wasm_v128_bitselect((v128_t)__a, (v128_t)__b, (v128_t)wasm_f64x2_gt((v128_t)__a, (v128_t)__b));110}111112static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))113_mm_max_sd(__m128d __a, __m128d __b)114{115return _mm_move_sd(__a, _mm_max_pd(__a, __b));116}117118static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))119_mm_and_pd(__m128d __a, __m128d __b)120{121return (__m128d)wasm_v128_and((v128_t)__a, (v128_t)__b);122}123124static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))125_mm_andnot_pd(__m128d __a, __m128d __b)126{127return (__m128d)wasm_v128_andnot((v128_t)__b, (v128_t)__a);128}129130static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))131_mm_or_pd(__m128d __a, __m128d __b)132{133return (__m128d)wasm_v128_or((v128_t)__a, (v128_t)__b);134}135136static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))137_mm_xor_pd(__m128d __a, __m128d __b)138{139return (__m128d)wasm_v128_xor((v128_t)__a, (v128_t)__b);140}141142static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))143_mm_cmpeq_pd(__m128d __a, __m128d __b)144{145return (__m128d)wasm_f64x2_eq((v128_t)__a, (v128_t)__b);146}147148static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))149_mm_cmplt_pd(__m128d __a, __m128d __b)150{151return (__m128d)wasm_f64x2_lt((v128_t)__a, (v128_t)__b);152}153154static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))155_mm_cmple_pd(__m128d __a, __m128d __b)156{157return (__m128d)wasm_f64x2_le((v128_t)__a, (v128_t)__b);158}159160static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))161_mm_cmpgt_pd(__m128d __a, __m128d __b)162{163return (__m128d)wasm_f64x2_gt((v128_t)__a, (v128_t)__b);164}165166static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))167_mm_cmpge_pd(__m128d __a, __m128d __b)168{169return (__m128d)wasm_f64x2_ge((v128_t)__a, (v128_t)__b);170}171172static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))173_mm_cmpord_pd(__m128d __a, __m128d __b)174{175return (__m128d)wasm_v128_and(wasm_f64x2_eq((v128_t)__a, (v128_t)__a),176wasm_f64x2_eq((v128_t)__b, (v128_t)__b));177}178179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))180_mm_cmpunord_pd(__m128d __a, __m128d __b)181{182return (__m128d)wasm_v128_or(wasm_f64x2_ne((v128_t)__a, (v128_t)__a),183wasm_f64x2_ne((v128_t)__b, (v128_t)__b));184}185186static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))187_mm_cmpneq_pd(__m128d __a, __m128d __b)188{189return (__m128d)wasm_f64x2_ne((v128_t)__a, (v128_t)__b);190}191192static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))193_mm_cmpnlt_pd(__m128d __a, __m128d __b)194{195return (__m128d)wasm_v128_not((v128_t)_mm_cmplt_pd(__a, __b));196}197198static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))199_mm_cmpnle_pd(__m128d __a, __m128d __b)200{201return (__m128d)wasm_v128_not((v128_t)_mm_cmple_pd(__a, __b));202}203204static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))205_mm_cmpngt_pd(__m128d __a, __m128d __b)206{207return (__m128d)wasm_v128_not((v128_t)_mm_cmpgt_pd(__a, __b));208}209210static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))211_mm_cmpnge_pd(__m128d __a, __m128d __b)212{213return (__m128d)wasm_v128_not((v128_t)_mm_cmpge_pd(__a, __b));214}215216static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))217_mm_cmpeq_sd(__m128d __a, __m128d __b)218{219return _mm_move_sd(__a, _mm_cmpeq_pd(__a, __b));220}221222static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))223_mm_cmplt_sd(__m128d __a, __m128d __b)224{225return _mm_move_sd(__a, _mm_cmplt_pd(__a, __b));226}227228static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))229_mm_cmple_sd(__m128d __a, __m128d __b)230{231return _mm_move_sd(__a, _mm_cmple_pd(__a, __b));232}233234static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))235_mm_cmpgt_sd(__m128d __a, __m128d __b)236{237return _mm_move_sd(__a, _mm_cmpgt_pd(__a, __b));238}239240static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))241_mm_cmpge_sd(__m128d __a, __m128d __b)242{243return _mm_move_sd(__a, _mm_cmpge_pd(__a, __b));244}245246static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))247_mm_cmpord_sd(__m128d __a, __m128d __b)248{249return _mm_move_sd(__a, _mm_cmpord_pd(__a, __b));250}251252static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))253_mm_cmpunord_sd(__m128d __a, __m128d __b)254{255return _mm_move_sd(__a, _mm_cmpunord_pd(__a, __b));256}257258static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))259_mm_cmpneq_sd(__m128d __a, __m128d __b)260{261return _mm_move_sd(__a, _mm_cmpneq_pd(__a, __b));262}263264static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))265_mm_cmpnlt_sd(__m128d __a, __m128d __b)266{267return _mm_move_sd(__a, _mm_cmpnlt_pd(__a, __b));268}269270static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))271_mm_cmpnle_sd(__m128d __a, __m128d __b)272{273return _mm_move_sd(__a, _mm_cmpnle_pd(__a, __b));274}275276static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))277_mm_cmpngt_sd(__m128d __a, __m128d __b)278{279return _mm_move_sd(__a, _mm_cmpngt_pd(__a, __b));280}281282static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))283_mm_cmpnge_sd(__m128d __a, __m128d __b)284{285return _mm_move_sd(__a, _mm_cmpnge_pd(__a, __b));286}287288static __inline__ int __attribute__((__always_inline__, __nodebug__))289_mm_comieq_sd(__m128d __a, __m128d __b)290{291return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);292}293294static __inline__ int __attribute__((__always_inline__, __nodebug__))295_mm_comilt_sd(__m128d __a, __m128d __b)296{297return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);298}299300static __inline__ int __attribute__((__always_inline__, __nodebug__))301_mm_comile_sd(__m128d __a, __m128d __b)302{303return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);304}305306static __inline__ int __attribute__((__always_inline__, __nodebug__))307_mm_comigt_sd(__m128d __a, __m128d __b)308{309return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);310}311312static __inline__ int __attribute__((__always_inline__, __nodebug__))313_mm_comige_sd(__m128d __a, __m128d __b)314{315return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);316}317318static __inline__ int __attribute__((__always_inline__, __nodebug__))319_mm_comineq_sd(__m128d __a, __m128d __b)320{321return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);322}323324static __inline__ int __attribute__((__always_inline__, __nodebug__))325_mm_ucomieq_sd(__m128d __a, __m128d __b)326{327return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);328}329330static __inline__ int __attribute__((__always_inline__, __nodebug__))331_mm_ucomilt_sd(__m128d __a, __m128d __b)332{333return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);334}335336static __inline__ int __attribute__((__always_inline__, __nodebug__))337_mm_ucomile_sd(__m128d __a, __m128d __b)338{339return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);340}341342static __inline__ int __attribute__((__always_inline__, __nodebug__))343_mm_ucomigt_sd(__m128d __a, __m128d __b)344{345return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);346}347348static __inline__ int __attribute__((__always_inline__, __nodebug__))349_mm_ucomige_sd(__m128d __a, __m128d __b)350{351return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);352}353354static __inline__ int __attribute__((__always_inline__, __nodebug__))355_mm_ucomineq_sd(__m128d __a, __m128d __b)356{357return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);358}359360static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))361_mm_cvtpd_ps(__m128d __a)362{363return (__m128)wasm_f32x4_demote_f64x2_zero((v128_t)__a);364}365366static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))367_mm_cvtps_pd(__m128 __a)368{369return (__m128d)wasm_f64x2_promote_low_f32x4((v128_t)__a);370}371372static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))373_mm_cvtepi32_pd(__m128i __a)374{375return (__m128d)wasm_f64x2_convert_low_i32x4((v128_t)__a);376}377378static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))379_mm_cvtpd_epi32(__m128d __a)380{381// TODO: OPTIMIZE!382int m[2];383for(int i = 0; i < 2; ++i)384{385double e = __a[i];386int x = lrint(e);387if (e <= INT_MAX && e >= INT_MIN && (x != 0 || fabs(e) < 2.0))388m[i] = x;389else390m[i] = (int)0x80000000;391}392return wasm_i32x4_make(m[0], m[1], 0, 0);393}394395static __inline__ int __attribute__((__always_inline__, __nodebug__))396_mm_cvtsd_si32(__m128d __a)397{398// TODO: OPTIMIZE!399double e = __a[0];400int x = lrint(e);401if (e <= INT_MAX && e >= INT_MIN && (x != 0 || fabs(e) < 2.0))402return x;403else404return (int)0x80000000;405}406407static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))408_mm_cvtsd_ss(__m128 __a, __m128d __b)409{410__a[0] = __b[0];411return __a;412}413414static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))415_mm_cvtsi32_sd(__m128d __a, int __b)416{417__a[0] = __b;418return __a;419}420421static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))422_mm_cvtss_sd(__m128d __a, __m128 __b)423{424__a[0] = __b[0];425return __a;426}427428static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))429_mm_cvttpd_epi32(__m128d __a)430{431// TODO: OPTIMIZE!432int m[2];433for(int i = 0; i < 2; ++i)434{435double elem = __a[i];436if (elem < 2147483648.0 && elem >= -2147483648.0 && (lrint(elem) != 0 || fabs(elem) < 2.0))437// Use the trapping instruction here since we have explicit bounds checks438// above.439m[i] = __builtin_wasm_trunc_s_i32_f64(elem);440else441m[i] = (int)0x80000000;442}443return wasm_i32x4_make(m[0], m[1], 0, 0);444}445446static __inline__ int __attribute__((__always_inline__, __nodebug__))447_mm_cvttsd_si32(__m128d __a)448{449// TODO: OPTIMIZE!450double elem = __a[0];451if (elem < 2147483648.0 && elem >= -2147483648.0 && (lrint(elem) != 0 || fabs(elem) < 2.0))452// Use the trapping instruction here since we have explicit bounds checks453// above.454return __builtin_wasm_trunc_s_i32_f64(elem);455else456return (int)0x80000000;457}458459static __inline__ double __attribute__((__always_inline__, __nodebug__))460_mm_cvtsd_f64(__m128d __a)461{462return wasm_f64x2_extract_lane((v128_t)__a, 0);463}464465static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))466_mm_load_pd(double const *__dp)467{468return *(__m128d*)__dp;469}470471static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))472_mm_load1_pd(double const *__dp)473{474return (__m128d)wasm_v64x2_load_splat(__dp);475}476477#define _mm_load_pd1(dp) _mm_load1_pd(dp)478479static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))480_mm_loadr_pd(double const *__p)481{482__m128d __u = *(__m128d*)__p; // aligned load483return (__m128d)wasm_i64x2_shuffle((v128_t)__u, (v128_t)__u, 1, 0);484}485486static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))487_mm_loadu_pd(double const *__dp)488{489struct __loadu_pd {490__m128d __v;491} __attribute__((__packed__, __may_alias__));492return ((struct __loadu_pd*)__dp)->__v;493}494495static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))496_mm_load_sd(double const *__p)497{498return (__m128d)wasm_v128_load64_zero((const void*)__p);499}500501static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))502_mm_loadh_pd(__m128d __a, double const *__dp)503{504return (__m128d)wasm_v128_load64_lane((const void*)__dp, (v128_t)__a, 1);505}506507static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))508_mm_loadl_pd(__m128d __a, double const *__dp)509{510return (__m128d)wasm_v128_load64_lane((const void*)__dp, (v128_t)__a, 0);511}512513static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))514_mm_set_sd(double __w)515{516return (__m128d)wasm_f64x2_make(__w, 0);517}518519static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))520_mm_set1_pd(double __w)521{522return (__m128d)wasm_f64x2_splat(__w);523}524525static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))526_mm_set_pd(double __c1, double __c0)527{528return (__m128d)wasm_f64x2_make(__c0, __c1);529}530531static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))532_mm_setr_pd(double __c0, double __c1)533{534return (__m128d)wasm_f64x2_make(__c0, __c1);535}536537static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))538_mm_setzero_pd(void)539{540return (__m128d)wasm_f64x2_const(0.0, 0.0);541}542543static __inline__ void __attribute__((__always_inline__, __nodebug__))544_mm_store_sd(double *__dp, __m128d __a)545{546wasm_v128_store64_lane((void*)__dp, (v128_t)__a, 0);547}548549static __inline__ void __attribute__((__always_inline__, __nodebug__))550_mm_store1_pd(double *__dp, __m128d __a)551{552struct __mm_store1_pd_struct {553double __u[2];554} __attribute__((__packed__, __may_alias__));555((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];556((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];557}558559static __inline__ void __attribute__((__always_inline__, __nodebug__))560_mm_store_pd(double *__dp, __m128d __a)561{562*(__m128d *)__dp = __a;563}564565static __inline__ void __attribute__((__always_inline__, __nodebug__))566_mm_storeu_pd(double *__dp, __m128d __a)567{568struct __unaligned {569__m128d __v;570} __attribute__((__packed__, __may_alias__));571572((struct __unaligned *)__dp)->__v = __a;573}574575static __inline__ void __attribute__((__always_inline__, __nodebug__))576_mm_storer_pd(double *__p, __m128d __a)577{578*(__m128d *)__p = (__m128d)wasm_i64x2_shuffle((v128_t)__a, (v128_t)__a, 1, 0);579}580581static __inline__ void __attribute__((__always_inline__, __nodebug__))582_mm_storeh_pd(double *__dp, __m128d __a)583{584wasm_v128_store64_lane((void*)__dp, (v128_t)__a, 1);585}586587static __inline__ void __attribute__((__always_inline__, __nodebug__))588_mm_storel_pd(double *__dp, __m128d __a)589{590wasm_v128_store64_lane((void*)__dp, (v128_t)__a, 0);591}592593static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))594_mm_add_epi8(__m128i __a, __m128i __b)595{596return (__m128i)wasm_i8x16_add((v128_t)__a, (v128_t)__b);597}598599static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))600_mm_add_epi16(__m128i __a, __m128i __b)601{602return (__m128i)wasm_i16x8_add((v128_t)__a, (v128_t)__b);603}604605static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))606_mm_add_epi32(__m128i __a, __m128i __b)607{608return (__m128i)wasm_i32x4_add((v128_t)__a, (v128_t)__b);609}610611static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))612_mm_add_epi64(__m128i __a, __m128i __b)613{614return (__m128i)wasm_i64x2_add((v128_t)__a, (v128_t)__b);615}616617static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))618_mm_adds_epi8(__m128i __a, __m128i __b)619{620return (__m128i)wasm_i8x16_add_saturate((v128_t)__a, (v128_t)__b);621}622623static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))624_mm_adds_epi16(__m128i __a, __m128i __b)625{626return (__m128i)wasm_i16x8_add_saturate((v128_t)__a, (v128_t)__b);627}628629static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))630_mm_adds_epu8(__m128i __a, __m128i __b)631{632return (__m128i)wasm_u8x16_add_saturate((v128_t)__a, (v128_t)__b);633}634635static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))636_mm_adds_epu16(__m128i __a, __m128i __b)637{638return (__m128i)wasm_u16x8_add_saturate((v128_t)__a, (v128_t)__b);639}640641static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))642_mm_avg_epu8(__m128i __a, __m128i __b)643{644return (__m128i)wasm_u8x16_avgr((v128_t)__a, (v128_t)__b);645}646647static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))648_mm_avg_epu16(__m128i __a, __m128i __b)649{650return (__m128i)wasm_u16x8_avgr((v128_t)__a, (v128_t)__b);651}652653static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))654_mm_madd_epi16(__m128i __a, __m128i __b)655{656return (__m128i)wasm_i32x4_dot_i16x8((v128_t)__a, (v128_t)__b);657}658659static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))660_mm_max_epi16(__m128i __a, __m128i __b)661{662return (__m128i)wasm_i16x8_max((v128_t)__a, (v128_t)__b);663}664665static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))666_mm_max_epu8(__m128i __a, __m128i __b)667{668return (__m128i)wasm_u8x16_max((v128_t)__a, (v128_t)__b);669}670671static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))672_mm_min_epi16(__m128i __a, __m128i __b)673{674return (__m128i)wasm_i16x8_min((v128_t)__a, (v128_t)__b);675}676677static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))678_mm_min_epu8(__m128i __a, __m128i __b)679{680return (__m128i)wasm_u8x16_min((v128_t)__a, (v128_t)__b);681}682683static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))684_mm_mulhi_epi16(__m128i __a, __m128i __b)685{686const v128_t lo = wasm_i32x4_extmul_low_i16x8((v128_t)__a, (v128_t)__b);687const v128_t hi = wasm_i32x4_extmul_high_i16x8((v128_t)__a, (v128_t)__b);688return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15);689}690691static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))692_mm_mulhi_epu16(__m128i __a, __m128i __b)693{694const v128_t lo = wasm_u32x4_extmul_low_u16x8((v128_t)__a, (v128_t)__b);695const v128_t hi = wasm_u32x4_extmul_high_u16x8((v128_t)__a, (v128_t)__b);696return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15);697}698699static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))700_mm_mullo_epi16(__m128i __a, __m128i __b)701{702return (__m128i)wasm_i16x8_mul((v128_t)__a, (v128_t)__b);703}704705static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))706_mm_mul_epu32(__m128i __a, __m128i __b)707{708return (__m128i)wasm_u64x2_extmul_low_u32x4(709wasm_v32x4_shuffle((v128_t)__a, (v128_t)__a, 0, 2, 0, 2),710wasm_v32x4_shuffle((v128_t)__b, (v128_t)__b, 0, 2, 0, 2));711}712713static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))714_mm_sub_epi8(__m128i __a, __m128i __b)715{716return (__m128i)wasm_i8x16_sub((v128_t)__a, (v128_t)__b);717}718719static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))720_mm_sub_epi16(__m128i __a, __m128i __b)721{722return (__m128i)wasm_i16x8_sub((v128_t)__a, (v128_t)__b);723}724725static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))726_mm_sub_epi32(__m128i __a, __m128i __b)727{728return (__m128i)wasm_i32x4_sub((v128_t)__a, (v128_t)__b);729}730731static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))732_mm_sub_epi64(__m128i __a, __m128i __b)733{734return (__m128i)wasm_i64x2_sub((v128_t)__a, (v128_t)__b);735}736737static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))738_mm_subs_epi8(__m128i __a, __m128i __b)739{740return (__m128i)wasm_i8x16_sub_saturate((v128_t)__a, (v128_t)__b);741}742743static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))744_mm_subs_epi16(__m128i __a, __m128i __b)745{746return (__m128i)wasm_i16x8_sub_saturate((v128_t)__a, (v128_t)__b);747}748749static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))750_mm_subs_epu8(__m128i __a, __m128i __b)751{752return (__m128i)wasm_u8x16_sub_saturate((v128_t)__a, (v128_t)__b);753}754755static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))756_mm_subs_epu16(__m128i __a, __m128i __b)757{758return (__m128i)wasm_u16x8_sub_saturate((v128_t)__a, (v128_t)__b);759}760761static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))762_mm_and_si128(__m128i __a, __m128i __b)763{764return (__m128i)wasm_v128_and((v128_t)__a, (v128_t)__b);765}766767static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))768_mm_andnot_si128(__m128i __a, __m128i __b)769{770return (__m128i)wasm_v128_andnot((v128_t)__b, (v128_t)__a);771}772773static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))774_mm_or_si128(__m128i __a, __m128i __b)775{776return (__m128i)wasm_v128_or((v128_t)__b, (v128_t)__a);777}778779static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))780_mm_xor_si128(__m128i __a, __m128i __b)781{782return (__m128i)wasm_v128_xor((v128_t)__b, (v128_t)__a);783}784785#define _mm_slli_si128(__a, __imm) __extension__ ({ \786(__m128i)wasm_i8x16_shuffle(_mm_setzero_si128(), \787(__a), \788((__imm)&0xF0) ? 0 : 16 - ((__imm)&0xF), \789((__imm)&0xF0) ? 0 : 17 - ((__imm)&0xF), \790((__imm)&0xF0) ? 0 : 18 - ((__imm)&0xF), \791((__imm)&0xF0) ? 0 : 19 - ((__imm)&0xF), \792((__imm)&0xF0) ? 0 : 20 - ((__imm)&0xF), \793((__imm)&0xF0) ? 0 : 21 - ((__imm)&0xF), \794((__imm)&0xF0) ? 0 : 22 - ((__imm)&0xF), \795((__imm)&0xF0) ? 0 : 23 - ((__imm)&0xF), \796((__imm)&0xF0) ? 0 : 24 - ((__imm)&0xF), \797((__imm)&0xF0) ? 0 : 25 - ((__imm)&0xF), \798((__imm)&0xF0) ? 0 : 26 - ((__imm)&0xF), \799((__imm)&0xF0) ? 0 : 27 - ((__imm)&0xF), \800((__imm)&0xF0) ? 0 : 28 - ((__imm)&0xF), \801((__imm)&0xF0) ? 0 : 29 - ((__imm)&0xF), \802((__imm)&0xF0) ? 0 : 30 - ((__imm)&0xF), \803((__imm)&0xF0) ? 0 : 31 - ((__imm)&0xF)); })804#define _mm_bslli_si128(__a, __imm) \805_mm_slli_si128((__a), (__imm))806807static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))808_mm_slli_epi16(__m128i __a, int __count)809{810return (__m128i)((__count < 16) ? wasm_i16x8_shl((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));811}812813static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))814_mm_sll_epi16(__m128i __a, __m128i __count)815{816unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];817return (__m128i)((__c < 16) ? wasm_i16x8_shl((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));818}819820static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))821_mm_slli_epi32(__m128i __a, int __count)822{823return (__m128i)((__count < 32) ? wasm_i32x4_shl((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));824}825826static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))827_mm_sll_epi32(__m128i __a, __m128i __count)828{829unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];830return (__m128i)((__c < 32) ? wasm_i32x4_shl((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));831}832833static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))834_mm_slli_epi64(__m128i __a, int __count)835{836return (__m128i)((__count < 64) ? wasm_i64x2_shl((v128_t)__a, __count) : wasm_i64x2_const(0,0));837}838839static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))840_mm_sll_epi64(__m128i __a, __m128i __count)841{842unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];843return (__m128i)((__c < 64) ? wasm_i64x2_shl((v128_t)__a, __c) : wasm_i64x2_const(0,0));844}845846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))847_mm_srai_epi16(__m128i __a, int __count)848{849__count = __count < 15 ? __count : 15;850return (__m128i)wasm_i16x8_shr((v128_t)__a, __count);851}852853static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))854_mm_sra_epi16(__m128i __a, __m128i __count)855{856unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];857__c = __c < 15 ? __c : 15;858return (__m128i)wasm_i16x8_shr((v128_t)__a, __c);859}860861static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))862_mm_srai_epi32(__m128i __a, int __count)863{864__count = __count < 31 ? __count : 31;865return (__m128i)wasm_i32x4_shr((v128_t)__a, __count);866}867868static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))869_mm_sra_epi32(__m128i __a, __m128i __count)870{871unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];872__c = __c < 31 ? __c : 31;873return (__m128i)wasm_i32x4_shr((v128_t)__a, __c);874}875876#define _mm_srli_si128(__a, __imm) __extension__ ({ \877(__m128i)wasm_i8x16_shuffle((__a), \878_mm_setzero_si128(), \879((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 0, \880((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 1, \881((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 2, \882((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 3, \883((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 4, \884((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 5, \885((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 6, \886((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 7, \887((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 8, \888((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 9, \889((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 10, \890((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 11, \891((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 12, \892((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 13, \893((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 14, \894((__imm)&0xF0) ? 16 : ((__imm)&0xF) + 15); })895896#define _mm_bsrli_si128(__a, __imm) \897_mm_srli_si128((__a), (__imm))898899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))900_mm_srli_epi16(__m128i __a, int __count)901{902return (__m128i)(((unsigned int)__count < 16) ? wasm_u16x8_shr((v128_t)__a, __count) : wasm_i16x8_const(0,0,0,0,0,0,0,0));903}904905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))906_mm_srl_epi16(__m128i __a, __m128i __count)907{908unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];909return (__m128i)((__c < 16) ? wasm_u16x8_shr((v128_t)__a, __c) : wasm_i16x8_const(0,0,0,0,0,0,0,0));910}911912static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))913_mm_srli_epi32(__m128i __a, int __count)914{915return (__m128i)(((unsigned int)__count < 32) ? wasm_u32x4_shr((v128_t)__a, __count) : wasm_i32x4_const(0,0,0,0));916}917918static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))919_mm_srl_epi32(__m128i __a, __m128i __count)920{921unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];922return (__m128i)((__c < 32) ? wasm_u32x4_shr((v128_t)__a, __c) : wasm_i32x4_const(0,0,0,0));923}924925static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))926_mm_srli_epi64(__m128i __a, int __count)927{928return (__m128i)(((unsigned int)__count < 64) ? wasm_u64x2_shr((v128_t)__a, __count) : wasm_i64x2_const(0,0));929}930931static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))932_mm_srl_epi64(__m128i __a, __m128i __count)933{934unsigned long long __c = (unsigned long long)((__u64x2)__count)[0];935return (__m128i)((__c < 64) ? wasm_u64x2_shr((v128_t)__a, __c) : wasm_i64x2_const(0,0));936}937938static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))939_mm_cmpeq_epi8(__m128i __a, __m128i __b)940{941return (__m128i)wasm_i8x16_eq((v128_t)__a, (v128_t)__b);942}943944static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))945_mm_cmpeq_epi16(__m128i __a, __m128i __b)946{947return (__m128i)wasm_i16x8_eq((v128_t)__a, (v128_t)__b);948}949950static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))951_mm_cmpeq_epi32(__m128i __a, __m128i __b)952{953return (__m128i)wasm_i32x4_eq((v128_t)__a, (v128_t)__b);954}955956static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))957_mm_cmpgt_epi8(__m128i __a, __m128i __b)958{959return (__m128i)wasm_i8x16_gt((v128_t)__a, (v128_t)__b);960}961962static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))963_mm_cmpgt_epi16(__m128i __a, __m128i __b)964{965return (__m128i)wasm_i16x8_gt((v128_t)__a, (v128_t)__b);966}967968static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))969_mm_cmpgt_epi32(__m128i __a, __m128i __b)970{971return (__m128i)wasm_i32x4_gt((v128_t)__a, (v128_t)__b);972}973974static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))975_mm_cmplt_epi8(__m128i __a, __m128i __b)976{977return (__m128i)wasm_i8x16_lt((v128_t)__a, (v128_t)__b);978}979980static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))981_mm_cmplt_epi16(__m128i __a, __m128i __b)982{983return (__m128i)wasm_i16x8_lt((v128_t)__a, (v128_t)__b);984}985986static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))987_mm_cmplt_epi32(__m128i __a, __m128i __b)988{989return (__m128i)wasm_i32x4_lt((v128_t)__a, (v128_t)__b);990}991992static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))993_mm_cvtsi64_sd(__m128d __a, long long __b)994{995// TODO: optimize996union {997double x[2];998__m128d m;999} m;1000m.m = __a;1001m.x[0] = (double)__b;1002return m.m;1003}10041005static __inline__ long long __attribute__((__always_inline__, __nodebug__))1006_mm_cvtsd_si64(__m128d __a)1007{1008// TODO: optimize1009double e = __a[0];1010if (isnan(e) || isinf(e)) return 0x8000000000000000LL;1011long long x = llrint(e);1012if (e <= LLONG_MAX && e >= LLONG_MIN && (x != 0 || fabs(e) < 2.f))1013return x;1014else1015return 0x8000000000000000LL;1016}10171018static __inline__ long long __attribute__((__always_inline__, __nodebug__))1019_mm_cvttsd_si64(__m128d __a)1020{1021// TODO: optimize1022double e = __a[0];1023if (isnan(e) || isinf(e) || e > LLONG_MAX || e < LLONG_MIN) return 0x8000000000000000LL;1024long long x = llrint(e);1025if (x != 0 || fabs(e) < 2.f)1026// Use the trapping instruction here since we have explicit bounds checks1027// above1028return __builtin_wasm_trunc_s_i64_f64(e);1029else1030return 0x8000000000000000LL;1031}10321033static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1034_mm_cvtepi32_ps(__m128i __a)1035{1036return (__m128)wasm_f32x4_convert_i32x4(__a);1037}10381039static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1040_mm_cvtps_epi32(__m128 __a)1041{1042// TODO: optimize1043union {1044int x[4];1045__m128i m;1046} u;1047for(int i = 0; i < 4; ++i)1048{1049double e = __a[i];1050int x = lrint(e);1051if (e <= INT_MAX && e >= INT_MIN && (x != 0 || fabs(e) < 2.0))1052u.x[i] = x;1053else1054u.x[i] = (int)0x80000000;1055}1056return u.m;1057}10581059static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1060_mm_cvttps_epi32(__m128 __a)1061{1062// TODO: optimize1063union {1064int x[4];1065__m128i m;1066} u;1067for(int i = 0; i < 4; ++i)1068{1069float e = __a[i];1070if (e < 2147483648.0f && e >= -2147483648.0f && (lrint(e) != 0 || fabs(e) < 2.0))1071// Use the trapping instruction here since we have explicit bounds checks1072// above.1073u.x[i] = __builtin_wasm_trunc_s_i32_f32(e);1074else1075u.x[i] = (int)0x80000000;1076}1077return u.m;1078}10791080static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1081_mm_cvtsi32_si128(int __a)1082{1083return (__m128i)wasm_i32x4_make(__a, 0, 0, 0);1084}10851086static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1087_mm_cvtsi64_si128(long long __a)1088{1089return (__m128i)wasm_i64x2_make(__a, 0);1090}10911092static __inline__ int __attribute__((__always_inline__, __nodebug__))1093_mm_cvtsi128_si32(__m128i __a)1094{1095return wasm_i32x4_extract_lane(__a, 0);1096}10971098static __inline__ long long __attribute__((__always_inline__, __nodebug__))1099_mm_cvtsi128_si64(__m128i __a)1100{1101return wasm_i64x2_extract_lane(__a, 0);1102}11031104static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1105_mm_load_si128(__m128i const *__p)1106{1107return *__p;1108}11091110static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1111_mm_loadu_si128(__m128i const *__p)1112{1113// UB-free unaligned access copied from wasm_simd128.h1114struct __mm_loadu_si128_struct {1115__m128i __v;1116} __attribute__((__packed__, __may_alias__));1117return ((struct __mm_loadu_si128_struct*)__p)->__v;1118}11191120static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1121_mm_loadu_si16(void const *__p)1122{1123return (__m128i)wasm_v128_load16_lane(__p, wasm_i64x2_const(0, 0), 0);1124}11251126static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1127_mm_loadu_si32(void const *__p)1128{1129return (__m128i)wasm_v128_load32_zero(__p);1130}11311132static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1133_mm_loadu_si64(void const *__p)1134{1135return (__m128i)wasm_v128_load64_zero(__p);1136}11371138static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1139_mm_loadl_epi64(__m128i const *__p)1140{1141return _mm_loadu_si64(__p);1142}11431144static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1145_mm_set_epi64(long long q1, long long q0)1146{1147return (__m128i)wasm_i64x2_make(q0, q1);1148}11491150static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1151_mm_set_epi32(int i3, int i2, int i1, int i0)1152{1153return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);1154}11551156static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1157_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)1158{1159return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);1160}11611162static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1163_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)1164{1165return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);1166}11671168static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1169_mm_set1_epi64x(long long __q)1170{1171return (__m128i)wasm_i64x2_splat(__q);1172}11731174static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1175_mm_set1_epi32(int __i)1176{1177return (__m128i)wasm_i32x4_splat(__i);1178}11791180static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1181_mm_set1_epi16(short __w)1182{1183return (__m128i)wasm_i16x8_splat(__w);1184}11851186static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1187_mm_set1_epi8(char __b)1188{1189return (__m128i)wasm_i8x16_splat(__b);1190}11911192static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1193_mm_setr_epi32(int i0, int i1, int i2, int i3)1194{1195return (__m128i)wasm_i32x4_make(i0, i1, i2, i3);1196}11971198static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1199_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)1200{1201return (__m128i)wasm_i16x8_make(w0, w1, w2, w3, w4, w5, w6, w7);1202}12031204static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1205_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)1206{1207return (__m128i)wasm_i8x16_make(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);1208}12091210static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1211_mm_setzero_si128(void)1212{1213return wasm_i64x2_const(0, 0);1214}12151216static __inline__ void __attribute__((__always_inline__, __nodebug__))1217_mm_store_si128(__m128i *__p, __m128i __b)1218{1219*__p = __b;1220}12211222static __inline__ void __attribute__((__always_inline__, __nodebug__))1223_mm_storeu_si16(void *__p, __m128i __a)1224{1225wasm_v128_store16_lane(__p, (v128_t)__a, 0);1226}12271228static __inline__ void __attribute__((__always_inline__, __nodebug__))1229_mm_storeu_si32(void *__p, __m128i __a)1230{1231wasm_v128_store32_lane(__p, (v128_t)__a, 0);1232}12331234static __inline__ void __attribute__((__always_inline__, __nodebug__))1235_mm_storeu_si64(void *__p, __m128i __a)1236{1237wasm_v128_store64_lane(__p, (v128_t)__a, 0);1238}12391240static __inline__ void __attribute__((__always_inline__, __nodebug__))1241_mm_storeu_si128(__m128i *__p, __m128i __a)1242{1243// UB-free unaligned access copied from wasm_simd128.h1244struct __mm_storeu_si128_struct {1245__m128i __v;1246} __attribute__((__packed__, __may_alias__));1247((struct __mm_storeu_si128_struct *)__p)->__v = __a;1248}12491250static __inline__ void __attribute__((__always_inline__, __nodebug__))1251_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)1252{1253// TODO: optimize1254union {1255unsigned char x[16];1256__m128i m;1257} mask, data;1258mask.m = __n;1259data.m = __d;1260for(int i = 0; i < 16; ++i)1261if (mask.x[i] & 0x80)1262__p[i] = data.x[i];1263}12641265static __inline__ void __attribute__((__always_inline__, __nodebug__))1266_mm_storel_epi64(__m128i *__p, __m128i __a)1267{1268_mm_storeu_si64(__p, __a);1269}12701271static __inline__ void __attribute__((__always_inline__, __nodebug__))1272_mm_stream_pd(double *__p, __m128d __a)1273{1274// Emscripten/SIMD.js does not have cache hinting.1275_mm_store_pd(__p, __a);1276}12771278static __inline__ void __attribute__((__always_inline__, __nodebug__))1279_mm_stream_si128(__m128i *__p, __m128i __a)1280{1281// Emscripten/SIMD.js does not have cache hinting.1282_mm_store_si128(__p, __a);1283}12841285static __inline__ void __attribute__((__always_inline__, __nodebug__))1286_mm_stream_si32(int *__p, int __a)1287{1288// No cache hinting available.1289*__p = __a;1290}12911292static __inline__ void __attribute__((__always_inline__, __nodebug__))1293_mm_stream_si64(long long *__p, long long __a)1294{1295// No cache hinting available.1296*__p = __a;1297}12981299static __inline__ void __attribute__((__always_inline__, __nodebug__))1300_mm_clflush(void const *__p)1301{1302// Wasm SIMD does not have cache hinting1303}13041305static __inline__ void __attribute__((__always_inline__, __nodebug__))1306_mm_lfence(void)1307{1308__sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.1309}13101311static __inline__ void __attribute__((__always_inline__, __nodebug__))1312_mm_mfence(void)1313{1314__sync_synchronize(); // Wasm/SharedArrayBuffer has only a full barrier instruction, which gives a stronger guarantee.1315}13161317static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1318_mm_packs_epi16(__m128i __a, __m128i __b)1319{1320return wasm_i8x16_narrow_i16x8(__a, __b);1321}13221323static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1324_mm_packs_epi32(__m128i __a, __m128i __b)1325{1326return wasm_i16x8_narrow_i32x4(__a, __b);1327}13281329static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1330_mm_packus_epi16(__m128i __a, __m128i __b)1331{1332return wasm_u8x16_narrow_i16x8(__a, __b);1333}13341335#define _mm_extract_epi16(__a, __imm) wasm_u16x8_extract_lane((v128_t)(__a), (__imm) & 7)1336#define _mm_insert_epi16(__a, __b, __imm) wasm_i16x8_replace_lane((__a), (__imm) & 7, (__b))13371338static __inline__ int __attribute__((__always_inline__, __nodebug__))1339_mm_movemask_epi8(__m128i __a)1340{1341return (int)wasm_i8x16_bitmask((v128_t)__a);1342}13431344#define _mm_shuffle_epi32(__a, __imm) __extension__ ({ \1345(__m128i)wasm_i32x4_shuffle((__a), \1346_mm_set1_epi32(0), \1347((__imm) & 0x3), (((__imm) & 0xc) >> 2), \1348(((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6)); })13491350#define _mm_shufflelo_epi16(__a, __imm) __extension__ ({ \1351(__m128i)wasm_i16x8_shuffle((__a), \1352_mm_set1_epi16(0), \1353((__imm) & 0x3), (((__imm) & 0xc) >> 2), \1354(((__imm) & 0x30) >> 4), (((__imm) & 0xc0) >> 6), \13554, 5, 6, 7); })13561357#define _mm_shufflehi_epi16(__a, __imm) __extension__ ({ \1358(__m128i)wasm_i16x8_shuffle((__a), \1359_mm_set1_epi16(0), \13600, 1, 2, 3, \1361(4 + (((__imm) & 0x03) >> 0)), \1362(4 + (((__imm) & 0x0c) >> 2)), \1363(4 + (((__imm) & 0x30) >> 4)), \1364(4 + (((__imm) & 0xc0) >> 6))); })13651366static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1367_mm_unpackhi_epi8(__m128i __a, __m128i __b)1368{1369return (__m128i)wasm_i8x16_shuffle(__a, __b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);1370}13711372static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1373_mm_unpackhi_epi16(__m128i __a, __m128i __b)1374{1375return (__m128i)wasm_i16x8_shuffle(__a, __b, 4, 12, 5, 13, 6, 14, 7, 15);1376}13771378static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1379_mm_unpackhi_epi32(__m128i __a, __m128i __b)1380{1381return (__m128i)wasm_i32x4_shuffle(__a, __b, 2, 6, 3, 7);1382}13831384static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1385_mm_unpackhi_epi64(__m128i __a, __m128i __b)1386{1387return (__m128i)wasm_i64x2_shuffle(__a, __b, 1, 3);1388}13891390static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1391_mm_unpacklo_epi8(__m128i __a, __m128i __b)1392{1393return (__m128i)wasm_i8x16_shuffle(__a, __b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);1394}13951396static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1397_mm_unpacklo_epi16(__m128i __a, __m128i __b)1398{1399return (__m128i)wasm_i16x8_shuffle(__a, __b, 0, 8, 1, 9, 2, 10, 3, 11);1400}14011402static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1403_mm_unpacklo_epi32(__m128i __a, __m128i __b)1404{1405return (__m128i)wasm_i32x4_shuffle(__a, __b, 0, 4, 1, 5);1406}14071408static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1409_mm_unpacklo_epi64(__m128i __a, __m128i __b)1410{1411return (__m128i)wasm_i64x2_shuffle(__a, __b, 0, 2);1412}14131414static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1415_mm_move_epi64(__m128i __a)1416{1417return wasm_i64x2_shuffle(__a, wasm_i64x2_const(0, 0), 0, 2);1418}14191420static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1421_mm_unpackhi_pd(__m128d __a, __m128d __b)1422{1423return (__m128d)wasm_i64x2_shuffle((v128_t)__a, (v128_t)__b, 1, 3);1424}14251426static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1427_mm_unpacklo_pd(__m128d __a, __m128d __b)1428{1429return (__m128d)wasm_i64x2_shuffle((v128_t)__a, (v128_t)__b, 0, 2);1430}14311432static __inline__ int __attribute__((__always_inline__, __nodebug__))1433_mm_movemask_pd(__m128d __a)1434{1435return (int)wasm_i64x2_bitmask((v128_t)__a);1436}14371438#define _mm_shuffle_pd(__a, __b, __i) __extension__ ({ \1439(__m128d) __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), \1440(__i) & 1, \1441(((__i) & 2) >> 1) + 2); })14421443static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1444_mm_castpd_ps(__m128d __a)1445{1446return (__m128)__a;1447}14481449static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1450_mm_castpd_si128(__m128d __a)1451{1452return (__m128i)__a;1453}14541455static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1456_mm_castps_pd(__m128 __a)1457{1458return (__m128d)__a;1459}14601461static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1462_mm_castps_si128(__m128 __a)1463{1464return (__m128i)__a;1465}14661467static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1468_mm_castsi128_ps(__m128i __a)1469{1470return (__m128)__a;1471}14721473static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1474_mm_castsi128_pd(__m128i __a)1475{1476return (__m128d)__a;1477}14781479static __inline__ void __attribute__((__always_inline__, __nodebug__))1480_mm_pause(void)1481{1482// No pause/wait instruction in Wasm/SIMD.1483}14841485static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1486_mm_undefined_pd()1487{1488__m128d val;1489return val;1490}14911492static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1493_mm_undefined_si128()1494{1495__m128i val;1496return val;1497}14981499// Must be in the very end as it uses other SSE2 intrinsics1500static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1501_mm_sad_epu8(__m128i __a, __m128i __b)1502{1503__m128i __diff = _mm_or_si128(_mm_subs_epu8(__a, __b),1504_mm_subs_epu8(__b, __a));1505__diff = _mm_add_epi16(_mm_srli_epi16(__diff, 8),1506_mm_and_si128(__diff, _mm_set1_epi16(0x00FF)));1507__diff = _mm_add_epi16(__diff, _mm_slli_epi32(__diff, 16));1508__diff = _mm_add_epi16(__diff, _mm_slli_epi64(__diff, 32));1509return _mm_srli_epi64(__diff, 48);1510}15111512#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))15131514#endif /* __emscripten_emmintrin_h__ */151515161517