Path: blob/main/system/include/compat/avxintrin.h
6171 views
/*1* Copyright 2020 The Emscripten Authors. All rights reserved.2* Emscripten is available under two separate licenses, the MIT license and the3* University of Illinois/NCSA Open Source License. Both these licenses can be4* found in the LICENSE file.5*/67#ifndef __emscripten_immintrin_h__8#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."9#endif1011#ifndef __emscripten_avxintrin_h__12#define __emscripten_avxintrin_h__1314#ifndef __AVX__15#error "AVX instruction set not enabled"16#endif1718typedef struct {19__m128d v0;20__m128d v1;21} __m256d;2223typedef struct {24__m128 v0;25__m128 v1;26} __m256;2728typedef struct {29__m128i v0;30__m128i v1;31} __m256i;3233typedef int64_t __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));3435typedef struct {36__m128i_u v0;37__m128i_u v1;38} __m256i_u;3940union __m256_data {41__m256i int_view;42__m256d double_view;43__m256 float_view;44__m128i_u int_u_view;45};4647static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))48_mm256_add_pd(__m256d __a, __m256d __b) {49__m256d ret;50ret.v0 = _mm_add_pd(__a.v0, __b.v0);51ret.v1 = _mm_add_pd(__a.v1, __b.v1);52return ret;53}5455static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))56_mm256_add_ps(__m256 __a, __m256 __b) {57__m256 ret;58ret.v0 = _mm_add_ps(__a.v0, __b.v0);59ret.v1 = _mm_add_ps(__a.v1, __b.v1);60return ret;61}6263static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))64_mm256_sub_pd(__m256d __a, __m256d __b) {65__m256d ret;66ret.v0 = _mm_sub_pd(__a.v0, __b.v0);67ret.v1 = _mm_sub_pd(__a.v1, __b.v1);68return ret;69}7071static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))72_mm256_sub_ps(__m256 __a, __m256 __b) {73__m256 ret;74ret.v0 = _mm_sub_ps(__a.v0, __b.v0);75ret.v1 = _mm_sub_ps(__a.v1, __b.v1);76return ret;77}7879static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))80_mm256_addsub_pd(__m256d __a, __m256d __b) {81__m256d ret;82ret.v0 = _mm_addsub_pd(__a.v0, __b.v0);83ret.v1 = _mm_addsub_pd(__a.v1, __b.v1);84return ret;85}8687static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))88_mm256_addsub_ps(__m256 __a, __m256 __b) {89__m256 ret;90ret.v0 = _mm_addsub_ps(__a.v0, __b.v0);91ret.v1 = _mm_addsub_ps(__a.v1, __b.v1);92return ret;93}9495static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))96_mm256_div_pd(__m256d __a, __m256d __b) {97__m256d ret;98ret.v0 = _mm_div_pd(__a.v0, __b.v0);99ret.v1 = _mm_div_pd(__a.v1, __b.v1);100return ret;101}102103static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))104_mm256_div_ps(__m256 __a, __m256 __b) {105__m256 ret;106ret.v0 = _mm_div_ps(__a.v0, __b.v0);107ret.v1 = _mm_div_ps(__a.v1, __b.v1);108return ret;109}110111static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))112_mm256_max_pd(__m256d __a, __m256d __b) {113__m256d ret;114ret.v0 = _mm_max_pd(__a.v0, __b.v0);115ret.v1 = _mm_max_pd(__a.v1, __b.v1);116return ret;117}118119static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))120_mm256_max_ps(__m256 __a, __m256 __b) {121__m256 ret;122ret.v0 = _mm_max_ps(__a.v0, __b.v0);123ret.v1 = _mm_max_ps(__a.v1, __b.v1);124return ret;125}126127static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))128_mm256_min_pd(__m256d __a, __m256d __b) {129__m256d ret;130ret.v0 = _mm_min_pd(__a.v0, __b.v0);131ret.v1 = _mm_min_pd(__a.v1, __b.v1);132return ret;133}134135static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))136_mm256_min_ps(__m256 __a, __m256 __b) {137__m256 ret;138ret.v0 = _mm_min_ps(__a.v0, __b.v0);139ret.v1 = _mm_min_ps(__a.v1, __b.v1);140return ret;141}142143static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))144_mm256_mul_pd(__m256d __a, __m256d __b) {145__m256d ret;146ret.v0 = _mm_mul_pd(__a.v0, __b.v0);147ret.v1 = _mm_mul_pd(__a.v1, __b.v1);148return ret;149}150151static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))152_mm256_mul_ps(__m256 __a, __m256 __b) {153__m256 ret;154ret.v0 = _mm_mul_ps(__a.v0, __b.v0);155ret.v1 = _mm_mul_ps(__a.v1, __b.v1);156return ret;157}158159static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))160_mm256_sqrt_pd(__m256d __a) {161__m256d ret;162ret.v0 = _mm_sqrt_pd(__a.v0);163ret.v1 = _mm_sqrt_pd(__a.v1);164return ret;165}166167static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))168_mm256_sqrt_ps(__m256 __a) {169__m256 ret;170ret.v0 = _mm_sqrt_ps(__a.v0);171ret.v1 = _mm_sqrt_ps(__a.v1);172return ret;173}174175static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))176_mm256_rsqrt_ps(__m256 __a) {177__m256 ret;178ret.v0 = _mm_rsqrt_ps(__a.v0);179ret.v1 = _mm_rsqrt_ps(__a.v1);180return ret;181}182183static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))184_mm256_rcp_ps(__m256 __a) {185__m256 ret;186ret.v0 = _mm_rcp_ps(__a.v0);187ret.v1 = _mm_rcp_ps(__a.v1);188return ret;189}190191static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))192_mm256_round_pd(__m256d __a, int __rounding) {193__m256d ret;194ret.v0 = _mm_round_pd(__a.v0, __rounding);195ret.v1 = _mm_round_pd(__a.v1, __rounding);196return ret;197}198199static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))200_mm256_round_ps(__m256 __a, int __rounding) {201__m256 ret;202ret.v0 = _mm_round_ps(__a.v0, __rounding);203ret.v1 = _mm_round_ps(__a.v1, __rounding);204return ret;205}206207#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)208#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)209#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)210#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)211212static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))213_mm256_and_pd(__m256d __a, __m256d __b) {214__m256d ret;215ret.v0 = _mm_and_pd(__a.v0, __b.v0);216ret.v1 = _mm_and_pd(__a.v1, __b.v1);217return ret;218}219220static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))221_mm256_and_ps(__m256 __a, __m256 __b) {222__m256 ret;223ret.v0 = _mm_and_ps(__a.v0, __b.v0);224ret.v1 = _mm_and_ps(__a.v1, __b.v1);225return ret;226}227228static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))229_mm256_andnot_pd(__m256d __a, __m256d __b) {230__m256d ret;231ret.v0 = _mm_andnot_pd(__a.v0, __b.v0);232ret.v1 = _mm_andnot_pd(__a.v1, __b.v1);233return ret;234}235236static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))237_mm256_andnot_ps(__m256 __a, __m256 __b) {238__m256 ret;239ret.v0 = _mm_andnot_ps(__a.v0, __b.v0);240ret.v1 = _mm_andnot_ps(__a.v1, __b.v1);241return ret;242}243244static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))245_mm256_or_pd(__m256d __a, __m256d __b) {246__m256d ret;247ret.v0 = _mm_or_pd(__a.v0, __b.v0);248ret.v1 = _mm_or_pd(__a.v1, __b.v1);249return ret;250}251252static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))253_mm256_or_ps(__m256 __a, __m256 __b) {254__m256 ret;255ret.v0 = _mm_or_ps(__a.v0, __b.v0);256ret.v1 = _mm_or_ps(__a.v1, __b.v1);257return ret;258}259260static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))261_mm256_xor_pd(__m256d __a, __m256d __b) {262__m256d ret;263ret.v0 = _mm_xor_pd(__a.v0, __b.v0);264ret.v1 = _mm_xor_pd(__a.v1, __b.v1);265return ret;266}267268static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))269_mm256_xor_ps(__m256 __a, __m256 __b) {270__m256 ret;271ret.v0 = _mm_xor_ps(__a.v0, __b.v0);272ret.v1 = _mm_xor_ps(__a.v1, __b.v1);273return ret;274}275276static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))277_mm256_hadd_pd(__m256d __a, __m256d __b) {278__m256d ret;279ret.v0 = _mm_hadd_pd(__a.v0, __b.v0);280ret.v1 = _mm_hadd_pd(__a.v1, __b.v1);281return ret;282}283284static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))285_mm256_hadd_ps(__m256 __a, __m256 __b) {286__m256 ret;287ret.v0 = _mm_hadd_ps(__a.v0, __b.v0);288ret.v1 = _mm_hadd_ps(__a.v1, __b.v1);289return ret;290}291292static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))293_mm256_hsub_pd(__m256d __a, __m256d __b) {294__m256d ret;295ret.v0 = _mm_hsub_pd(__a.v0, __b.v0);296ret.v1 = _mm_hsub_pd(__a.v1, __b.v1);297return ret;298}299300static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))301_mm256_hsub_ps(__m256 __a, __m256 __b) {302__m256 ret;303ret.v0 = _mm_hsub_ps(__a.v0, __b.v0);304ret.v1 = _mm_hsub_ps(__a.v1, __b.v1);305return ret;306}307308static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))309_mm_permutevar_pd(__m128d __a, __m128i __c) {310return (__m128d)wasm_f64x2_make(311((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 0) >> 1) & 1],312((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 1) >> 1) & 1]);313}314315static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))316_mm256_permutevar_pd(__m256d __a, __m256i __c) {317__m256d ret;318ret.v0 = _mm_permutevar_pd(__a.v0, __c.v0);319ret.v1 = _mm_permutevar_pd(__a.v1, __c.v1);320return ret;321}322323static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))324_mm_permutevar_ps(__m128 __a, __m128i __c) {325return (__m128)wasm_f32x4_make(326((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 0) & 3],327((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 1) & 3],328((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 2) & 3],329((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 3) & 3]);330}331332static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))333_mm256_permutevar_ps(__m256 __a, __m256i __c) {334__m256 ret;335ret.v0 = _mm_permutevar_ps(__a.v0, __c.v0);336ret.v1 = _mm_permutevar_ps(__a.v1, __c.v1);337return ret;338}339340#define _mm_permute_pd(__a, __imm) \341((__m128d)wasm_i64x2_shuffle( \342(__m128d)(__a), (__m128d)(__a), ((__imm) & 1), (((__imm) >> 1) & 1)))343344#define _mm256_permute_pd(__A, __imm) \345__extension__({ \346__m256d __a = (__A); \347_mm256_set_m128d(_mm_permute_pd(__a.v1, (__imm) >> 2), \348_mm_permute_pd(__a.v0, (__imm))); \349})350351#define _mm_permute_ps(__a, __imm) \352((__m128)wasm_i32x4_shuffle((__m128)(__a), \353(__m128)(__a), \354((__imm) & 3), \355(((__imm) >> 2) & 3), \356(((__imm) >> 4) & 3), \357(((__imm) >> 6) & 3)))358359#define _mm256_permute_ps(__A, __imm) \360__extension__({ \361__m256 __a = (__A); \362_mm256_set_m128(_mm_permute_ps(__a.v1, (__imm)), \363_mm_permute_ps(__a.v0, (__imm))); \364})365366static __inline__ __m128d367__avx_select4d(__m256d __a, __m256d __b, const int imm8) {368switch (imm8 & 0xF) {369case 0:370case 4:371return __a.v0;372case 1:373case 5:374return __a.v1;375case 2:376case 6:377return __b.v0;378case 3:379case 7:380return __b.v1;381default:382return (__m128d)wasm_i64x2_const_splat(0);383}384}385386static __inline__ __m128 __avx_select4(__m256 __a, __m256 __b, const int imm8) {387switch (imm8 & 0xF) {388case 0:389case 4:390return __a.v0;391case 1:392case 5:393return __a.v1;394case 2:395case 6:396return __b.v0;397case 3:398case 7:399return __b.v1;400default:401return (__m128)wasm_i64x2_const_splat(0);402}403}404405static __inline__ __m128i406__avx_select4i(__m256i __a, __m256i __b, const int imm8) {407switch (imm8 & 0xF) {408case 0:409case 4:410return __a.v0;411case 1:412case 5:413return __a.v1;414case 2:415case 6:416return __b.v0;417case 3:418case 7:419return __b.v1;420default:421return wasm_i64x2_const_splat(0);422}423}424425static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))426_mm256_permute2f128_pd(__m256d __a, __m256d __b, const int imm8) {427__m256d ret;428ret.v0 = __avx_select4d(__a, __b, imm8);429ret.v1 = __avx_select4d(__a, __b, imm8 >> 4);430return ret;431}432433static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))434_mm256_permute2f128_ps(__m256 __a, __m256 __b, const int imm8) {435__m256 ret;436ret.v0 = __avx_select4(__a, __b, imm8);437ret.v1 = __avx_select4(__a, __b, imm8 >> 4);438return ret;439}440441static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))442_mm256_permute2f128_si256(__m256i __a, __m256i __b, const int imm8) {443__m256i ret;444ret.v0 = __avx_select4i(__a, __b, imm8);445ret.v1 = __avx_select4i(__a, __b, imm8 >> 4);446return ret;447}448449#define _mm256_blend_pd(__A, __B, imm8) \450__extension__({ \451__m256d __a = (__A); \452__m256d __b = (__B); \453_mm256_set_m128d(_mm_blend_pd(__a.v1, __b.v1, (imm8) >> 2), \454_mm_blend_pd(__a.v0, __b.v0, (imm8))); \455})456457#define _mm256_blend_ps(__A, __B, imm) \458__extension__({ \459__m256 __a = (__A); \460__m256 __b = (__B); \461_mm256_set_m128(_mm_blend_ps(__a.v1, __b.v1, (imm) >> 4), \462_mm_blend_ps(__a.v0, __b.v0, (imm))); \463})464465static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))466_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {467__m256d ret;468ret.v0 = _mm_blendv_pd(__a.v0, __b.v0, __c.v0);469ret.v1 = _mm_blendv_pd(__a.v1, __b.v1, __c.v1);470return ret;471}472473static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))474_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {475__m256 ret;476ret.v0 = _mm_blendv_ps(__a.v0, __b.v0, __c.v0);477ret.v1 = _mm_blendv_ps(__a.v1, __b.v1, __c.v1);478return ret;479}480481#define _mm256_dp_ps(__A, __B, imm) \482__extension__({ \483__m256 __a = (__A); \484__m256 __b = (__B); \485_mm256_set_m128(_mm_dp_ps(__a.v1, __b.v1, (imm)), \486_mm_dp_ps(__a.v0, __b.v0, (imm))); \487})488489#define _mm256_shuffle_ps(__A, __B, mask) \490__extension__({ \491__m256 __a = (__A); \492__m256 __b = (__B); \493_mm256_set_m128(_mm_shuffle_ps(__a.v1, __b.v1, (mask)), \494_mm_shuffle_ps(__a.v0, __b.v0, (mask))); \495})496497#define _mm256_shuffle_pd(__A, __B, mask) \498__extension__({ \499__m256d __a = (__A); \500__m256d __b = (__B); \501_mm256_set_m128d(_mm_shuffle_pd(__a.v1, __b.v1, (mask) >> 2), \502_mm_shuffle_pd(__a.v0, __b.v0, (mask))); \503})504505#define _CMP_EQ_OQ 0506#define _CMP_LT_OS 1507#define _CMP_LE_OS 2508#define _CMP_UNORD_Q 3509#define _CMP_NEQ_UQ 4510#define _CMP_NLT_US 5511#define _CMP_NLE_US 6512#define _CMP_ORD_Q 7513#define _CMP_EQ_UQ 8514#define _CMP_NGE_US 9515#define _CMP_NGT_US 10516#define _CMP_FALSE_OQ 11517#define _CMP_NEQ_OQ 12518#define _CMP_GE_OS 13519#define _CMP_GT_OS 14520#define _CMP_TRUE_UQ 15521#define _CMP_EQ_OS 16522#define _CMP_LT_OQ 17523#define _CMP_LE_OQ 18524#define _CMP_UNORD_S 19525#define _CMP_NEQ_US 20526#define _CMP_NLT_UQ 21527#define _CMP_NLE_UQ 22528#define _CMP_ORD_S 23529#define _CMP_EQ_US 24530#define _CMP_NGE_UQ 25531#define _CMP_NGT_UQ 26532#define _CMP_FALSE_OS 27533#define _CMP_NEQ_OS 28534#define _CMP_GE_OQ 29535#define _CMP_GT_OQ 30536#define _CMP_TRUE_US 31537538#define _mm_cmp_pd(__a, __b, __imm) \539__extension__({ \540__m128d __ret; \541switch ((__imm)) { \542case _CMP_EQ_OQ: \543case _CMP_EQ_OS: \544__ret = _mm_cmpeq_pd((__a), (__b)); \545break; \546case _CMP_EQ_UQ: \547case _CMP_EQ_US: \548__ret = _mm_or_pd(_mm_cmpeq_pd((__a), (__b)), \549_mm_cmpunord_pd((__a), (__b))); \550break; \551case _CMP_LT_OS: \552case _CMP_LT_OQ: \553__ret = _mm_cmplt_pd((__a), (__b)); \554break; \555case _CMP_LE_OS: \556case _CMP_LE_OQ: \557__ret = _mm_cmple_pd((__a), (__b)); \558break; \559case _CMP_UNORD_Q: \560case _CMP_UNORD_S: \561__ret = _mm_cmpunord_pd((__a), (__b)); \562break; \563case _CMP_NEQ_UQ: \564case _CMP_NEQ_US: \565__ret = _mm_cmpneq_pd((__a), (__b)); \566break; \567case _CMP_NEQ_OQ: \568case _CMP_NEQ_OS: \569__ret = _mm_andnot_pd(_mm_cmpunord_pd((__a), (__b)), \570_mm_cmpneq_pd((__a), (__b))); \571break; \572case _CMP_NLT_US: \573case _CMP_NLT_UQ: \574__ret = _mm_cmpnlt_pd((__a), (__b)); \575break; \576case _CMP_ORD_Q: \577case _CMP_ORD_S: \578__ret = _mm_cmpord_pd((__a), (__b)); \579break; \580case _CMP_NGE_US: \581case _CMP_NGE_UQ: \582__ret = _mm_cmpnge_pd((__a), (__b)); \583break; \584case _CMP_NGT_US: \585case _CMP_NGT_UQ: \586__ret = _mm_cmpngt_pd((__a), (__b)); \587break; \588case _CMP_FALSE_OQ: \589case _CMP_FALSE_OS: \590__ret = _mm_setzero_pd(); \591break; \592case _CMP_GE_OS: \593case _CMP_GE_OQ: \594__ret = _mm_cmpge_pd((__a), (__b)); \595break; \596case _CMP_GT_OS: \597case _CMP_GT_OQ: \598__ret = _mm_cmpgt_pd((__a), (__b)); \599break; \600case _CMP_TRUE_UQ: \601case _CMP_TRUE_US: \602__ret = (__m128d)wasm_i8x16_splat(0xFF); \603break; \604case _CMP_NLE_US: \605case _CMP_NLE_UQ: \606__ret = _mm_cmpnle_pd((__a), (__b)); \607break; \608} \609__ret; \610})611612#define _mm_cmp_ps(__a, __b, __imm) \613__extension__({ \614__m128 __ret; \615switch ((__imm)) { \616case _CMP_EQ_OQ: \617case _CMP_EQ_OS: \618__ret = _mm_cmpeq_ps((__a), (__b)); \619break; \620case _CMP_EQ_UQ: \621case _CMP_EQ_US: \622__ret = _mm_or_ps(_mm_cmpeq_ps((__a), (__b)), \623_mm_cmpunord_ps((__a), (__b))); \624break; \625case _CMP_LT_OS: \626case _CMP_LT_OQ: \627__ret = _mm_cmplt_ps((__a), (__b)); \628break; \629case _CMP_LE_OS: \630case _CMP_LE_OQ: \631__ret = _mm_cmple_ps((__a), (__b)); \632break; \633case _CMP_UNORD_Q: \634case _CMP_UNORD_S: \635__ret = _mm_cmpunord_ps((__a), (__b)); \636break; \637case _CMP_NEQ_UQ: \638case _CMP_NEQ_US: \639__ret = _mm_cmpneq_ps((__a), (__b)); \640break; \641case _CMP_NEQ_OQ: \642case _CMP_NEQ_OS: \643__ret = _mm_andnot_ps(_mm_cmpunord_ps((__a), (__b)), \644_mm_cmpneq_ps((__a), (__b))); \645break; \646case _CMP_NLT_US: \647case _CMP_NLT_UQ: \648__ret = _mm_cmpnlt_ps((__a), (__b)); \649break; \650case _CMP_ORD_Q: \651case _CMP_ORD_S: \652__ret = _mm_cmpord_ps((__a), (__b)); \653break; \654case _CMP_NGE_US: \655case _CMP_NGE_UQ: \656__ret = _mm_cmpnge_ps((__a), (__b)); \657break; \658case _CMP_NGT_US: \659case _CMP_NGT_UQ: \660__ret = _mm_cmpngt_ps((__a), (__b)); \661break; \662case _CMP_FALSE_OQ: \663case _CMP_FALSE_OS: \664__ret = _mm_setzero_ps(); \665break; \666case _CMP_GE_OS: \667case _CMP_GE_OQ: \668__ret = _mm_cmpge_ps((__a), (__b)); \669break; \670case _CMP_GT_OS: \671case _CMP_GT_OQ: \672__ret = _mm_cmpgt_ps((__a), (__b)); \673break; \674case _CMP_TRUE_UQ: \675case _CMP_TRUE_US: \676__ret = (__m128)wasm_i8x16_splat(0xFF); \677break; \678case _CMP_NLE_US: \679case _CMP_NLE_UQ: \680__ret = _mm_cmpnle_ps((__a), (__b)); \681break; \682} \683__ret; \684})685686#define _mm_cmp_sd(__a, __b, __imm) \687__extension__({ \688__m128d __ret; \689switch ((__imm)) { \690case _CMP_EQ_OQ: \691case _CMP_EQ_OS: \692__ret = _mm_cmpeq_sd((__a), (__b)); \693break; \694case _CMP_EQ_UQ: \695case _CMP_EQ_US: \696__ret = _mm_move_sd((__a), \697_mm_or_pd(_mm_cmpeq_sd((__a), (__b)), \698_mm_cmpunord_sd((__a), (__b)))); \699break; \700case _CMP_LT_OS: \701case _CMP_LT_OQ: \702__ret = _mm_cmplt_sd((__a), (__b)); \703break; \704case _CMP_LE_OS: \705case _CMP_LE_OQ: \706__ret = _mm_cmple_sd((__a), (__b)); \707break; \708case _CMP_UNORD_Q: \709case _CMP_UNORD_S: \710__ret = _mm_cmpunord_sd((__a), (__b)); \711break; \712case _CMP_NEQ_UQ: \713case _CMP_NEQ_US: \714__ret = _mm_cmpneq_sd((__a), (__b)); \715break; \716case _CMP_NEQ_OQ: \717case _CMP_NEQ_OS: \718__ret = _mm_move_sd((__a), \719_mm_andnot_pd(_mm_cmpunord_sd((__a), (__b)), \720_mm_cmpneq_sd((__a), (__b)))); \721break; \722case _CMP_NLT_US: \723case _CMP_NLT_UQ: \724__ret = _mm_cmpnlt_sd((__a), (__b)); \725break; \726case _CMP_ORD_Q: \727case _CMP_ORD_S: \728__ret = _mm_cmpord_sd((__a), (__b)); \729break; \730case _CMP_NGE_US: \731case _CMP_NGE_UQ: \732__ret = _mm_cmpnge_sd((__a), (__b)); \733break; \734case _CMP_NGT_US: \735case _CMP_NGT_UQ: \736__ret = _mm_cmpngt_sd((__a), (__b)); \737break; \738case _CMP_FALSE_OQ: \739case _CMP_FALSE_OS: \740__ret = _mm_move_sd((__a), _mm_setzero_pd()); \741break; \742case _CMP_GE_OS: \743case _CMP_GE_OQ: \744__ret = _mm_cmpge_sd((__a), (__b)); \745break; \746case _CMP_GT_OS: \747case _CMP_GT_OQ: \748__ret = _mm_cmpgt_sd((__a), (__b)); \749break; \750case _CMP_TRUE_UQ: \751case _CMP_TRUE_US: \752__ret = _mm_move_sd((__a), (__m128d)wasm_i8x16_splat(0xFF)); \753break; \754case _CMP_NLE_US: \755case _CMP_NLE_UQ: \756__ret = _mm_cmpnle_sd((__a), (__b)); \757break; \758} \759__ret; \760})761762#define _mm_cmp_ss(__a, __b, __imm) \763__extension__({ \764__m128 __ret; \765switch ((__imm)) { \766case _CMP_EQ_OQ: \767case _CMP_EQ_OS: \768__ret = _mm_cmpeq_ss((__a), (__b)); \769break; \770case _CMP_EQ_UQ: \771case _CMP_EQ_US: \772__ret = _mm_move_ss((__a), \773_mm_or_ps(_mm_cmpeq_ss((__a), (__b)), \774_mm_cmpunord_ss((__a), (__b)))); \775break; \776case _CMP_LT_OS: \777case _CMP_LT_OQ: \778__ret = _mm_cmplt_ss((__a), (__b)); \779break; \780case _CMP_LE_OS: \781case _CMP_LE_OQ: \782__ret = _mm_cmple_ss((__a), (__b)); \783break; \784case _CMP_UNORD_Q: \785case _CMP_UNORD_S: \786__ret = _mm_cmpunord_ss((__a), (__b)); \787break; \788case _CMP_NEQ_UQ: \789case _CMP_NEQ_US: \790__ret = _mm_cmpneq_ss((__a), (__b)); \791break; \792case _CMP_NEQ_OQ: \793case _CMP_NEQ_OS: \794__ret = _mm_move_ss((__a), \795_mm_andnot_ps(_mm_cmpunord_ss((__a), (__b)), \796_mm_cmpneq_ss((__a), (__b)))); \797break; \798case _CMP_NLT_US: \799case _CMP_NLT_UQ: \800__ret = _mm_cmpnlt_ss((__a), (__b)); \801break; \802case _CMP_ORD_Q: \803case _CMP_ORD_S: \804__ret = _mm_cmpord_ss((__a), (__b)); \805break; \806case _CMP_NGE_US: \807case _CMP_NGE_UQ: \808__ret = _mm_cmpnge_ss((__a), (__b)); \809break; \810case _CMP_NGT_US: \811case _CMP_NGT_UQ: \812__ret = _mm_cmpngt_ss((__a), (__b)); \813break; \814case _CMP_FALSE_OQ: \815case _CMP_FALSE_OS: \816__ret = _mm_move_ss((__a), _mm_setzero_ps()); \817break; \818case _CMP_GE_OS: \819case _CMP_GE_OQ: \820__ret = _mm_cmpge_ss((__a), (__b)); \821break; \822case _CMP_GT_OS: \823case _CMP_GT_OQ: \824__ret = _mm_cmpgt_ss((__a), (__b)); \825break; \826case _CMP_TRUE_UQ: \827case _CMP_TRUE_US: \828__ret = _mm_move_ss((__a), (__m128)wasm_i8x16_splat(0xFF)); \829break; \830case _CMP_NLE_US: \831case _CMP_NLE_UQ: \832__ret = _mm_cmpnle_ss((__a), (__b)); \833break; \834} \835__ret; \836})837838static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))839_mm256_cmp_pd(__m256d a, __m256d b, const int imm8) {840__m256d ret;841ret.v0 = _mm_cmp_pd(a.v0, b.v0, imm8);842ret.v1 = _mm_cmp_pd(a.v1, b.v1, imm8);843return ret;844}845846static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))847_mm256_cmp_ps(__m256 __a, __m256 __b, const int imm8) {848__m256 ret;849ret.v0 = _mm_cmp_ps(__a.v0, __b.v0, imm8);850ret.v1 = _mm_cmp_ps(__a.v1, __b.v1, imm8);851return ret;852}853854#define _mm256_extract_epi32(__A, N) \855__extension__({ \856__m256i __a = (__A); \857((N) & 0x7) < 4 ? _mm_extract_epi32(__a.v0, (N) & 0x3) \858: _mm_extract_epi32(__a.v1, (N) & 0x3); \859})860861#define _mm256_extract_epi16(__A, N) \862__extension__({ \863__m256i __a = (__A); \864((N) & 0xF) < 8 ? _mm_extract_epi16(__a.v0, (N) & 0x7) \865: _mm_extract_epi16(__a.v1, (N) & 0x7); \866})867868#define _mm256_extract_epi8(__A, N) \869__extension__({ \870__m256i __a = (__A); \871((N) & 0x1F) < 16 ? _mm_extract_epi8(__a.v0, (N) & 0xF) \872: _mm_extract_epi8(__a.v1, (N) & 0xF); \873})874875#define _mm256_extract_epi64(__A, N) \876__extension__({ \877__m256i __a = (__A); \878((N) & 0x3) < 2 ? _mm_extract_epi64(__a.v0, (N) & 0x1) \879: _mm_extract_epi64(__a.v1, (N) & 0x1); \880})881882#define _mm256_insert_epi32(__A, __I, N) \883__extension__({ \884__m256i __a = (__A); \885int32_t __i = (__I); \886((N) & 0x7) < 4 \887? _mm256_set_m128i(__a.v1, _mm_insert_epi32(__a.v0, __i, (N) & 0x3)) \888: _mm256_set_m128i(_mm_insert_epi32(__a.v1, __i, (N) & 0x3), __a.v0); \889})890891#define _mm256_insert_epi16(__A, __I, N) \892__extension__({ \893__m256i __a = (__A); \894int16_t __i = (__I); \895((N) & 0xF) < 8 \896? _mm256_set_m128i(__a.v1, _mm_insert_epi16(__a.v0, __i, (N) & 0x7)) \897: _mm256_set_m128i(_mm_insert_epi16(__a.v1, __i, (N) & 0x7), __a.v0); \898})899900#define _mm256_insert_epi8(__A, __I, N) \901__extension__({ \902__m256i __a = (__A); \903int8_t __i = (__I); \904((N) & 0x1F) < 16 \905? _mm256_set_m128i(__a.v1, _mm_insert_epi8(__a.v0, __i, (N) & 0xF)) \906: _mm256_set_m128i(_mm_insert_epi8(__a.v1, __i, (N) & 0xF), __a.v0); \907})908909#define _mm256_insert_epi64(__A, __I, N) \910__extension__({ \911__m256i __a = (__A); \912int64_t __i = (__I); \913((N) & 0x3) < 2 \914? _mm256_set_m128i(__a.v1, _mm_insert_epi64(__a.v0, __i, (N) & 0x1)) \915: _mm256_set_m128i(_mm_insert_epi64(__a.v1, __i, (N) & 0x1), __a.v0); \916})917918static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))919_mm256_cvtepi32_pd(__m128i __a) {920__m256d ret;921ret.v0 = _mm_cvtepi32_pd(__a);922__m128i __a1 = wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0);923ret.v1 = _mm_cvtepi32_pd(__a1);924return ret;925}926927static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))928_mm256_cvtepi32_ps(__m256i __a) {929__m256 ret;930ret.v0 = _mm_cvtepi32_ps(__a.v0);931ret.v1 = _mm_cvtepi32_ps(__a.v1);932return ret;933}934935static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))936_mm256_cvtpd_ps(__m256d __a) {937__m128 low = _mm_cvtpd_ps(__a.v0);938__m128 high = _mm_cvtpd_ps(__a.v1);939__m128 ret = (__m128)wasm_i32x4_shuffle(low, high, 0, 1, 4, 5);940return ret;941}942943static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))944_mm256_cvtps_epi32(__m256 __a) {945__m256i ret;946ret.v0 = _mm_cvtps_epi32(__a.v0);947ret.v1 = _mm_cvtps_epi32(__a.v1);948return ret;949}950951static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))952_mm256_cvtps_pd(__m128 __a) {953__m256d ret;954ret.v0 = _mm_cvtps_pd(__a);955__m128 __a1 = (__m128)wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0);956ret.v1 = _mm_cvtps_pd(__a1);957return ret;958}959960static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))961_mm256_cvttpd_epi32(__m256d __a) {962__m128i low = _mm_cvttpd_epi32(__a.v0);963__m128i high = _mm_cvttpd_epi32(__a.v1);964__m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5);965return ret;966}967968static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))969_mm256_cvtpd_epi32(__m256d __a) {970__m128i low = _mm_cvtpd_epi32(__a.v0);971__m128i high = _mm_cvtpd_epi32(__a.v1);972__m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5);973return ret;974}975976static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))977_mm256_cvttps_epi32(__m256 __a) {978__m256i ret;979ret.v0 = _mm_cvttps_epi32(__a.v0);980ret.v1 = _mm_cvttps_epi32(__a.v1);981return ret;982}983984static __inline__ double __attribute__((__always_inline__, __nodebug__))985_mm256_cvtsd_f64(__m256d __a) {986return _mm_cvtsd_f64(__a.v0);987}988989static __inline__ int __attribute__((__always_inline__, __nodebug__))990_mm256_cvtsi256_si32(__m256i __a) {991return _mm_cvtsi128_si32(__a.v0);992}993994static __inline__ float __attribute__((__always_inline__, __nodebug__))995_mm256_cvtss_f32(__m256 __a) {996return _mm_cvtss_f32(__a.v0);997}998999static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1000_mm256_movehdup_ps(__m256 __a) {1001__m256 ret;1002ret.v0 = _mm_movehdup_ps(__a.v0);1003ret.v1 = _mm_movehdup_ps(__a.v1);1004return ret;1005}10061007static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1008_mm256_moveldup_ps(__m256 __a) {1009__m256 ret;1010ret.v0 = _mm_moveldup_ps(__a.v0);1011ret.v1 = _mm_moveldup_ps(__a.v1);1012return ret;1013}10141015static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1016_mm256_movedup_pd(__m256d __a) {1017__m256d ret;1018ret.v0 = _mm_movedup_pd(__a.v0);1019ret.v1 = _mm_movedup_pd(__a.v1);1020return ret;1021}10221023static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1024_mm256_unpackhi_pd(__m256d __a, __m256d __b) {1025__m256d ret;1026ret.v0 = _mm_unpackhi_pd(__a.v0, __b.v0);1027ret.v1 = _mm_unpackhi_pd(__a.v1, __b.v1);1028return ret;1029}10301031static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1032_mm256_unpacklo_pd(__m256d __a, __m256d __b) {1033__m256d ret;1034ret.v0 = _mm_unpacklo_pd(__a.v0, __b.v0);1035ret.v1 = _mm_unpacklo_pd(__a.v1, __b.v1);1036return ret;1037}10381039static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1040_mm256_unpackhi_ps(__m256 __a, __m256 __b) {1041__m256 ret;1042ret.v0 = _mm_unpackhi_ps(__a.v0, __b.v0);1043ret.v1 = _mm_unpackhi_ps(__a.v1, __b.v1);1044return ret;1045}10461047static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1048_mm256_unpacklo_ps(__m256 __a, __m256 __b) {1049__m256 ret;1050ret.v0 = _mm_unpacklo_ps(__a.v0, __b.v0);1051ret.v1 = _mm_unpacklo_ps(__a.v1, __b.v1);1052return ret;1053}10541055static __inline__ int __attribute__((__always_inline__, __nodebug__))1056_mm_testz_pd(__m128d __a, __m128d __b) {1057v128_t __m =1058wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63);1059return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1);1060}10611062static __inline__ int __attribute__((__always_inline__, __nodebug__))1063_mm_testc_pd(__m128d __a, __m128d __b) {1064v128_t __m =1065wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63);1066return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1);1067}10681069static __inline__ int __attribute__((__always_inline__, __nodebug__))1070_mm_testnzc_pd(__m128d __a, __m128d __b) {1071v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63);1072v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63);1073return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) &1074(wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1));1075}10761077static __inline__ int __attribute__((__always_inline__, __nodebug__))1078_mm_testz_ps(__m128 __a, __m128 __b) {1079v128_t __m =1080wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31);1081__m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m));1082__m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1)));1083return wasm_i32x4_extract_lane(__m, 0);1084}10851086static __inline__ int __attribute__((__always_inline__, __nodebug__))1087_mm_testc_ps(__m128 __a, __m128 __b) {1088v128_t __m =1089wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31);1090__m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m));1091__m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1)));1092return wasm_i32x4_extract_lane(__m, 0);1093}10941095static __inline__ int __attribute__((__always_inline__, __nodebug__))1096_mm_testnzc_ps(__m128 __a, __m128 __b) {1097v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31);1098v128_t __m2 = wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 31);10991100__m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m));1101__m2 = wasm_v128_or(__m2, (v128_t)_mm_movehl_ps((__m128)__m2, (__m128)__m2));1102__m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1)));1103__m2 = wasm_v128_or(__m2, _mm_shuffle_epi32(__m2, _MM_SHUFFLE(3, 2, 0, 1)));11041105return wasm_i32x4_extract_lane(__m, 0) & wasm_i32x4_extract_lane(__m2, 0);1106}11071108static __inline__ int __attribute__((__always_inline__, __nodebug__))1109_mm256_testz_pd(__m256d __a, __m256d __b) {1110return _mm_testz_pd(__a.v0, __b.v0) & _mm_testz_pd(__a.v1, __b.v1);1111}11121113static __inline__ int __attribute__((__always_inline__, __nodebug__))1114_mm256_testc_pd(__m256d __a, __m256d __b) {1115return _mm_testc_pd(__a.v0, __b.v0) & _mm_testc_pd(__a.v1, __b.v1);1116}11171118static __inline__ int __attribute__((__always_inline__, __nodebug__))1119_mm256_testnzc_pd(__m256d __a, __m256d __b) {1120v128_t __m =1121wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 63);1122v128_t __m1 =1123wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 63);1124v128_t __m2 =1125wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 63);1126v128_t __m3 =1127wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 63);1128return wasm_v128_any_true(wasm_v128_or(__m, __m1)) &1129wasm_v128_any_true(wasm_v128_or(__m2, __m3));1130}11311132static __inline__ int __attribute__((__always_inline__, __nodebug__))1133_mm256_testz_ps(__m256 __a, __m256 __b) {1134return _mm_testz_ps(__a.v0, __b.v0) & _mm_testz_ps(__a.v1, __b.v1);1135}11361137static __inline__ int __attribute__((__always_inline__, __nodebug__))1138_mm256_testc_ps(__m256 __a, __m256 __b) {1139return _mm_testc_ps(__a.v0, __b.v0) & _mm_testc_ps(__a.v1, __b.v1);1140}11411142static __inline__ int __attribute__((__always_inline__, __nodebug__))1143_mm256_testnzc_ps(__m256 __a, __m256 __b) {1144v128_t __m =1145wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 31);1146v128_t __m1 =1147wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 31);1148v128_t __m2 =1149wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 31);1150v128_t __m3 =1151wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 31);11521153return wasm_v128_any_true(wasm_v128_or(__m, __m1)) &1154wasm_v128_any_true(wasm_v128_or(__m2, __m3));1155}11561157static __inline__ int __attribute__((__always_inline__, __nodebug__))1158_mm256_testz_si256(__m256i __a, __m256i __b) {1159return _mm_testz_si128(__a.v0, __b.v0) & _mm_testz_si128(__a.v1, __b.v1);1160}11611162static __inline__ int __attribute__((__always_inline__, __nodebug__))1163_mm256_testc_si256(__m256i __a, __m256i __b) {1164return _mm_testc_si128(__a.v0, __b.v0) & _mm_testc_si128(__a.v1, __b.v1);1165}11661167static __inline__ int __attribute__((__always_inline__, __nodebug__))1168_mm256_testnzc_si256(__m256i __a, __m256i __b) {1169v128_t __m = wasm_v128_and(__a.v0, __b.v0);1170v128_t __m1 = wasm_v128_and(__a.v1, __b.v1);1171v128_t __m2 = wasm_v128_andnot(__b.v0, __a.v0);1172v128_t __m3 = wasm_v128_andnot(__b.v1, __a.v1);1173return wasm_v128_any_true(wasm_v128_or(__m, __m1)) &1174wasm_v128_any_true(wasm_v128_or(__m2, __m3));1175}11761177static __inline__ int __attribute__((__always_inline__, __nodebug__))1178_mm256_movemask_pd(__m256d __a) {1179return _mm_movemask_pd(__a.v0) | (_mm_movemask_pd(__a.v1) << 2);1180}11811182static __inline__ int __attribute__((__always_inline__, __nodebug__))1183_mm256_movemask_ps(__m256 __a) {1184return _mm_movemask_ps(__a.v0) | (_mm_movemask_ps(__a.v1) << 4);1185}11861187static __inline__ void __attribute__((__always_inline__, __nodebug__))1188_mm256_zeroall(void) {1189// Do nothing1190// when porting any assembly code that would have calls to these functions1191// around, that assembly code in the first place will not compile.1192}11931194static __inline__ void __attribute__((__always_inline__, __nodebug__))1195_mm256_zeroupper(void) {1196// Do nothing1197// when porting any assembly code that would have calls to these functions1198// around, that assembly code in the first place will not compile.1199}12001201static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1202_mm_broadcast_ss(float const* __a) {1203return (__m128)wasm_v128_load32_splat(__a);1204}12051206static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1207_mm256_broadcast_sd(double const* __a) {1208__m256d ret;1209ret.v1 = ret.v0 = (__m128d)wasm_v128_load64_splat(__a);1210return ret;1211}12121213static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1214_mm256_broadcast_ss(float const* __a) {1215__m256 ret;1216ret.v1 = ret.v0 = _mm_broadcast_ss(__a);1217return ret;1218}12191220static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1221_mm256_broadcast_pd(__m128d const* __a) {1222__m256d ret;1223ret.v1 = ret.v0 = (__m128d)wasm_v128_load(__a);1224return ret;1225}12261227static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1228_mm256_broadcast_ps(__m128 const* __a) {1229__m256 ret;1230ret.v1 = ret.v0 = (__m128)wasm_v128_load(__a);1231return ret;1232}12331234static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1235_mm256_load_pd(double const* __p) {1236__m256d ret;1237ret.v0 = _mm_load_pd(__p);1238ret.v1 = _mm_load_pd(__p + 2);1239return ret;1240}12411242static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1243_mm256_load_ps(float const* __p) {1244__m256 ret;1245ret.v0 = _mm_load_ps(__p);1246ret.v1 = _mm_load_ps(__p + 4);1247return ret;1248}12491250static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1251_mm256_loadu_pd(double const* __p) {1252__m256d ret;1253ret.v0 = _mm_loadu_pd(__p);1254ret.v1 = _mm_loadu_pd(__p + 2);1255return ret;1256}12571258static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1259_mm256_loadu_ps(float const* __p) {1260__m256 ret;1261ret.v0 = _mm_loadu_ps(__p);1262ret.v1 = _mm_loadu_ps(__p + 4);1263return ret;1264}12651266static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1267_mm256_load_si256(__m256i const* __p) {1268__m256i ret;1269ret.v0 = _mm_load_si128((__m128i const*)__p);1270ret.v1 = _mm_load_si128(((__m128i const*)__p) + 1);1271return ret;1272}12731274static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1275_mm256_loadu_si256(__m256i_u const* __p) {1276__m256i ret;1277ret.v0 = _mm_loadu_si128((__m128i const*)__p);1278ret.v1 = _mm_loadu_si128(((__m128i const*)__p) + 1);1279return ret;1280}12811282static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1283_mm256_lddqu_si256(__m256i_u const* __p) {1284__m256i ret;1285ret.v0 = _mm_lddqu_si128((__m128i const*)__p);1286ret.v1 = _mm_lddqu_si128(((__m128i const*)__p) + 1);1287return ret;1288}12891290static __inline__ void __attribute__((__always_inline__, __nodebug__))1291_mm256_store_pd(double* __p, __m256d __a) {1292_mm_store_pd(__p, __a.v0);1293_mm_store_pd(__p + 2, __a.v1);1294}12951296static __inline__ void __attribute__((__always_inline__, __nodebug__))1297_mm256_store_ps(float* __p, __m256 __a) {1298_mm_store_ps(__p, __a.v0);1299_mm_store_ps(__p + 4, __a.v1);1300}13011302static __inline__ void __attribute__((__always_inline__, __nodebug__))1303_mm256_storeu_pd(double* __p, __m256d __a) {1304_mm_storeu_pd(__p, __a.v0);1305_mm_storeu_pd(__p + 2, __a.v1);1306}13071308static __inline__ void __attribute__((__always_inline__, __nodebug__))1309_mm256_storeu_ps(float* __p, __m256 __a) {1310_mm_storeu_ps(__p, __a.v0);1311_mm_storeu_ps(__p + 4, __a.v1);1312}13131314static __inline__ void __attribute__((__always_inline__, __nodebug__))1315_mm256_store_si256(__m256i* __p, __m256i __a) {1316_mm_store_si128((__m128i*)__p, __a.v0);1317_mm_store_si128(((__m128i*)__p) + 1, __a.v1);1318}13191320static __inline__ void __attribute__((__always_inline__, __nodebug__))1321_mm256_storeu_si256(__m256i_u* __p, __m256i __a) {1322_mm_storeu_si128((__m128i*)__p, __a.v0);1323_mm_storeu_si128(((__m128i*)__p) + 1, __a.v1);1324}13251326static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1327_mm_maskload_pd(double const* __p, __m128i __m) {1328// This may cause an out-of-bounds memory load since we first load and1329// then mask, but since there are no segmentation faults in Wasm memory1330// accesses, that is ok (as long as we are within the heap bounds -1331// a negligible limitation in practice)1332return _mm_and_pd(_mm_load_pd(__p), (__m128d)wasm_i64x2_shr(__m, 63));1333}13341335static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1336_mm256_maskload_pd(double const* __p, __m256i __m) {1337__m256d ret;1338ret.v0 = _mm_maskload_pd(__p, __m.v0);1339ret.v1 = _mm_maskload_pd(__p + 2, __m.v1);1340return ret;1341}13421343static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1344_mm_maskload_ps(float const* __p, __m128i __m) {1345// This may cause an out-of-bounds memory load since we first load and1346// then mask, but since there are no segmentation faults in Wasm memory1347// accesses, that is ok (as long as we are within the heap bounds -1348// a negligible limitation in practice)1349return _mm_and_ps(_mm_load_ps(__p), (__m128)_mm_srai_epi32(__m, 31));1350}13511352static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1353_mm256_maskload_ps(float const* __p, __m256i __m) {1354__m256 ret;1355ret.v0 = _mm_maskload_ps(__p, __m.v0);1356ret.v1 = _mm_maskload_ps(__p + 4, __m.v1);1357return ret;1358}13591360static __inline__ void1361__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1362_mm_maskstore_ps(float* __p, __m128i __m, __m128 __a) {1363if ((wasm_i32x4_extract_lane(__m, 0) & 0x80000000ull) != 0)1364__p[0] = wasm_f32x4_extract_lane((v128_t)__a, 0);1365if ((wasm_i32x4_extract_lane(__m, 1) & 0x80000000ull) != 0)1366__p[1] = wasm_f32x4_extract_lane((v128_t)__a, 1);1367if ((wasm_i32x4_extract_lane(__m, 2) & 0x80000000ull) != 0)1368__p[2] = wasm_f32x4_extract_lane((v128_t)__a, 2);1369if ((wasm_i32x4_extract_lane(__m, 3) & 0x80000000ull) != 0)1370__p[3] = wasm_f32x4_extract_lane((v128_t)__a, 3);1371}13721373static __inline__ void __attribute__((__always_inline__, __nodebug__))1374_mm256_maskstore_ps(float* __p, __m256i __m, __m256 __a) {1375_mm_maskstore_ps(__p, __m.v0, __a.v0);1376_mm_maskstore_ps(__p + 4, __m.v1, __a.v1);1377}13781379static __inline__ void1380__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1381_mm_maskstore_pd(double* __p, __m128i __m, __m128d __a) {1382if ((wasm_i64x2_extract_lane(__m, 0) & 0x8000000000000000ull) != 0)1383__p[0] = wasm_f64x2_extract_lane((v128_t)__a, 0);1384if ((wasm_i64x2_extract_lane(__m, 1) & 0x8000000000000000ull) != 0)1385__p[1] = wasm_f64x2_extract_lane((v128_t)__a, 1);1386}13871388static __inline__ void __attribute__((__always_inline__, __nodebug__))1389_mm256_maskstore_pd(double* __p, __m256i __m, __m256d __a) {1390_mm_maskstore_pd(__p, __m.v0, __a.v0);1391_mm_maskstore_pd(__p + 2, __m.v1, __a.v1);1392}13931394static __inline__ void __attribute__((__always_inline__, __nodebug__))1395_mm256_stream_si256(void* __a, __m256i __b) {1396_mm_stream_si128((__m128i*)__a, __b.v0);1397_mm_stream_si128(((__m128i*)__a) + 1, __b.v1);1398}13991400static __inline__ void __attribute__((__always_inline__, __nodebug__))1401_mm256_stream_pd(void* __a, __m256d __b) {1402_mm_stream_pd((double*)__a, __b.v0);1403_mm_stream_pd(((double*)__a) + 2, __b.v1);1404}14051406static __inline__ void __attribute__((__always_inline__, __nodebug__))1407_mm256_stream_ps(void* __p, __m256 __a) {1408_mm_stream_ps((float*)__p, __a.v0);1409_mm_stream_ps(((float*)__p) + 4, __a.v1);1410}14111412static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1413_mm256_undefined_pd(void) {1414__m256d val;1415return val;1416}14171418static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1419_mm256_undefined_ps(void) {1420__m256 val;1421return val;1422}14231424static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1425_mm256_undefined_si256(void) {1426__m256i val;1427return val;1428}14291430static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1431_mm256_set_pd(double __a, double __b, double __c, double __d) {1432__m256d ret;1433ret.v0 = _mm_set_pd(__c, __d);1434ret.v1 = _mm_set_pd(__a, __b);1435return ret;1436}14371438static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1439_mm256_set_ps(float __a,1440float __b,1441float __c,1442float __d,1443float __e,1444float __f,1445float __g,1446float __h) {1447__m256 ret;1448ret.v0 = _mm_set_ps(__e, __f, __g, __h);1449ret.v1 = _mm_set_ps(__a, __b, __c, __d);1450return ret;1451}14521453static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1454_mm256_set_epi32(int __i0,1455int __i1,1456int __i2,1457int __i3,1458int __i4,1459int __i5,1460int __i6,1461int __i7) {1462__m256i ret;1463ret.v0 = _mm_set_epi32(__i4, __i5, __i6, __i7);1464ret.v1 = _mm_set_epi32(__i0, __i1, __i2, __i3);1465return ret;1466}14671468static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1469_mm256_set_epi16(short __w15,1470short __w14,1471short __w13,1472short __w12,1473short __w11,1474short __w10,1475short __w09,1476short __w08,1477short __w07,1478short __w06,1479short __w05,1480short __w04,1481short __w03,1482short __w02,1483short __w01,1484short __w00) {1485__m256i ret;1486ret.v0 =1487_mm_set_epi16(__w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00);1488ret.v1 =1489_mm_set_epi16(__w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08);1490return ret;1491}14921493static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1494_mm256_set_epi8(char __b31,1495char __b30,1496char __b29,1497char __b28,1498char __b27,1499char __b26,1500char __b25,1501char __b24,1502char __b23,1503char __b22,1504char __b21,1505char __b20,1506char __b19,1507char __b18,1508char __b17,1509char __b16,1510char __b15,1511char __b14,1512char __b13,1513char __b12,1514char __b11,1515char __b10,1516char __b09,1517char __b08,1518char __b07,1519char __b06,1520char __b05,1521char __b04,1522char __b03,1523char __b02,1524char __b01,1525char __b00) {1526__m256i ret;1527ret.v0 = _mm_set_epi8(__b15,1528__b14,1529__b13,1530__b12,1531__b11,1532__b10,1533__b09,1534__b08,1535__b07,1536__b06,1537__b05,1538__b04,1539__b03,1540__b02,1541__b01,1542__b00);1543ret.v1 = _mm_set_epi8(__b31,1544__b30,1545__b29,1546__b28,1547__b27,1548__b26,1549__b25,1550__b24,1551__b23,1552__b22,1553__b21,1554__b20,1555__b19,1556__b18,1557__b17,1558__b16);1559return ret;1560}15611562static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1563_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) {1564__m256i ret;1565ret.v0 = _mm_set_epi64x(__c, __d);1566ret.v1 = _mm_set_epi64x(__a, __b);1567return ret;1568}15691570static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1571_mm256_setr_pd(double __a, double __b, double __c, double __d) {1572return _mm256_set_pd(__d, __c, __b, __a);1573}15741575static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1576_mm256_setr_ps(float __a,1577float __b,1578float __c,1579float __d,1580float __e,1581float __f,1582float __g,1583float __h) {1584return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);1585}15861587static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1588_mm256_setr_epi32(int __i0,1589int __i1,1590int __i2,1591int __i3,1592int __i4,1593int __i5,1594int __i6,1595int __i7) {1596return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);1597}15981599static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1600_mm256_setr_epi16(short __w15,1601short __w14,1602short __w13,1603short __w12,1604short __w11,1605short __w10,1606short __w09,1607short __w08,1608short __w07,1609short __w06,1610short __w05,1611short __w04,1612short __w03,1613short __w02,1614short __w01,1615short __w00) {1616return _mm256_set_epi16(__w00,1617__w01,1618__w02,1619__w03,1620__w04,1621__w05,1622__w06,1623__w07,1624__w08,1625__w09,1626__w10,1627__w11,1628__w12,1629__w13,1630__w14,1631__w15);1632}16331634static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1635_mm256_setr_epi8(char __b31,1636char __b30,1637char __b29,1638char __b28,1639char __b27,1640char __b26,1641char __b25,1642char __b24,1643char __b23,1644char __b22,1645char __b21,1646char __b20,1647char __b19,1648char __b18,1649char __b17,1650char __b16,1651char __b15,1652char __b14,1653char __b13,1654char __b12,1655char __b11,1656char __b10,1657char __b09,1658char __b08,1659char __b07,1660char __b06,1661char __b05,1662char __b04,1663char __b03,1664char __b02,1665char __b01,1666char __b00) {1667return _mm256_set_epi8(__b00,1668__b01,1669__b02,1670__b03,1671__b04,1672__b05,1673__b06,1674__b07,1675__b08,1676__b09,1677__b10,1678__b11,1679__b12,1680__b13,1681__b14,1682__b15,1683__b16,1684__b17,1685__b18,1686__b19,1687__b20,1688__b21,1689__b22,1690__b23,1691__b24,1692__b25,1693__b26,1694__b27,1695__b28,1696__b29,1697__b30,1698__b31);1699}17001701static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1702_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) {1703return _mm256_set_epi64x(__d, __c, __b, __a);1704}17051706static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1707_mm256_set1_pd(double __w) {1708__m256d ret;1709ret.v1 = ret.v0 = (__m128d)wasm_f64x2_splat(__w);1710return ret;1711}17121713static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1714_mm256_set1_ps(float __w) {1715__m256 ret;1716ret.v1 = ret.v0 = (__m128)wasm_f32x4_splat(__w);1717return ret;1718}17191720static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1721_mm256_set1_epi32(int __i) {1722__m256i ret;1723ret.v1 = ret.v0 = wasm_i32x4_splat(__i);1724return ret;1725}17261727static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1728_mm256_set1_epi16(short __w) {1729__m256i ret;1730ret.v1 = ret.v0 = wasm_i16x8_splat(__w);1731return ret;1732}17331734static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1735_mm256_set1_epi8(char __b) {1736__m256i ret;1737ret.v1 = ret.v0 = wasm_i8x16_splat(__b);1738return ret;1739}17401741static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1742_mm256_set1_epi64x(long long __q) {1743__m256i ret;1744ret.v1 = ret.v0 = wasm_i64x2_splat(__q);1745return ret;1746}17471748static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1749_mm256_setzero_pd(void) {1750__m256d ret;1751ret.v1 = ret.v0 = _mm_setzero_pd();1752return ret;1753}17541755static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1756_mm256_setzero_ps(void) {1757__m256 ret;1758ret.v1 = ret.v0 = _mm_setzero_ps();1759return ret;1760}17611762static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1763_mm256_setzero_si256(void) {1764__m256i ret;1765ret.v1 = ret.v0 = _mm_setzero_si128();1766return ret;1767}17681769static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1770_mm256_castpd_ps(__m256d __a) {1771union __m256_data ret;1772ret.double_view = __a;1773return ret.float_view;1774}17751776static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1777_mm256_castpd_si256(__m256d __a) {1778union __m256_data ret;1779ret.double_view = __a;1780return ret.int_view;1781}17821783static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1784_mm256_castps_pd(__m256 __a) {1785union __m256_data ret;1786ret.float_view = __a;1787return ret.double_view;1788}17891790static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1791_mm256_castps_si256(__m256 __a) {1792union __m256_data ret;1793ret.float_view = __a;1794return ret.int_view;1795}17961797static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1798_mm256_castsi256_ps(__m256i __a) {1799union __m256_data ret;1800ret.int_view = __a;1801return ret.float_view;1802}18031804static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1805_mm256_castsi256_pd(__m256i __a) {1806union __m256_data ret;1807ret.int_view = __a;1808return ret.double_view;1809}18101811static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1812_mm256_castpd256_pd128(__m256d __a) {1813return __a.v0;1814}18151816static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1817_mm256_castps256_ps128(__m256 __a) {1818return __a.v0;1819}18201821static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1822_mm256_castsi256_si128(__m256i __a) {1823return __a.v0;1824}18251826static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1827_mm256_castpd128_pd256(__m128d __a) {1828__m256d ret;1829ret.v0 = __a;1830ret.v1 = _mm_setzero_pd();1831return ret;1832}18331834static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1835_mm256_castps128_ps256(__m128 __a) {1836__m256 ret;1837ret.v0 = __a;1838ret.v1 = _mm_setzero_ps();1839return ret;1840}18411842static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1843_mm256_castsi128_si256(__m128i __a) {1844__m256i ret;1845ret.v0 = __a;1846ret.v1 = _mm_setzero_si128();1847return ret;1848}18491850static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1851_mm256_zextpd128_pd256(__m128d __a) {1852__m256d ret;1853ret.v0 = __a;1854ret.v1 = _mm_setzero_pd();1855return ret;1856}18571858static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1859_mm256_zextps128_ps256(__m128 __a) {1860__m256 ret;1861ret.v0 = __a;1862ret.v1 = _mm_setzero_ps();1863return ret;1864}18651866static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1867_mm256_zextsi128_si256(__m128i __a) {1868__m256i ret;1869ret.v0 = __a;1870ret.v1 = _mm_setzero_si128();1871return ret;1872}18731874static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1875_mm256_insertf128_ps(__m256 __a, __m128 __b, const int imm8) {1876__m256 ret = __a;1877if (imm8 & 0x1) {1878ret.v1 = __b;1879} else {1880ret.v0 = __b;1881}1882return ret;1883}18841885static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1886_mm256_insertf128_pd(__m256d __a, __m128d __b, const int imm8) {1887__m256d ret = __a;1888if (imm8 & 0x1) {1889ret.v1 = __b;1890} else {1891ret.v0 = __b;1892}1893return ret;1894}18951896static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1897_mm256_insertf128_si256(__m256i __a, __m128i __b, const int imm8) {1898__m256i ret = __a;1899if (imm8 & 0x1) {1900ret.v1 = __b;1901} else {1902ret.v0 = __b;1903}1904return ret;1905}19061907static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))1908_mm256_extractf128_ps(__m256 __a, const int imm8) {1909if (imm8 & 0x1) {1910return __a.v1;1911} else {1912return __a.v0;1913}1914}19151916static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))1917_mm256_extractf128_pd(__m256d __a, const int imm8) {1918if (imm8 & 0x1) {1919return __a.v1;1920} else {1921return __a.v0;1922}1923}19241925static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1926_mm256_extractf128_si256(__m256i __a, const int imm8) {1927if (imm8 & 0x1) {1928return __a.v1;1929} else {1930return __a.v0;1931}1932}19331934static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1935_mm256_set_m128(__m128 __hi, __m128 __lo) {1936__m256 ret;1937ret.v0 = __lo;1938ret.v1 = __hi;1939return ret;1940}19411942static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1943_mm256_set_m128d(__m128d __hi, __m128d __lo) {1944__m256d ret;1945ret.v0 = __lo;1946ret.v1 = __hi;1947return ret;1948}19491950static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1951_mm256_set_m128i(__m128i __hi, __m128i __lo) {1952__m256i ret;1953ret.v0 = __lo;1954ret.v1 = __hi;1955return ret;1956}19571958static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1959_mm256_setr_m128(__m128 __lo, __m128 __hi) {1960return _mm256_set_m128(__hi, __lo);1961}19621963static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1964_mm256_setr_m128d(__m128d __lo, __m128d __hi) {1965return (__m256d)_mm256_set_m128d(__hi, __lo);1966}19671968static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1969_mm256_setr_m128i(__m128i __lo, __m128i __hi) {1970return (__m256i)_mm256_set_m128i(__hi, __lo);1971}19721973static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))1974_mm256_loadu2_m128(float const* __addr_hi, float const* __addr_lo) {1975return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));1976}19771978static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))1979_mm256_loadu2_m128d(double const* __addr_hi, double const* __addr_lo) {1980return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));1981}19821983static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1984_mm256_loadu2_m128i(__m128i_u const* __addr_hi, __m128i_u const* __addr_lo) {1985return _mm256_set_m128i(_mm_loadu_si128((__m128i const*)__addr_hi),1986_mm_loadu_si128((__m128i const*)__addr_lo));1987}19881989static __inline__ void __attribute__((__always_inline__, __nodebug__))1990_mm256_storeu2_m128(float* __addr_hi, float* __addr_lo, __m256 __a) {1991_mm_storeu_ps(__addr_lo, __a.v0);1992_mm_storeu_ps(__addr_hi, __a.v1);1993}19941995static __inline__ void __attribute__((__always_inline__, __nodebug__))1996_mm256_storeu2_m128d(double* __addr_hi, double* __addr_lo, __m256d __a) {1997_mm_storeu_pd(__addr_lo, __a.v0);1998_mm_storeu_pd(__addr_hi, __a.v1);1999}20002001static __inline__ void __attribute__((__always_inline__, __nodebug__))2002_mm256_storeu2_m128i(__m128i_u* __addr_hi, __m128i_u* __addr_lo, __m256i __a) {2003_mm_storeu_si128((__m128i*)__addr_lo, __a.v0);2004_mm_storeu_si128((__m128i*)__addr_hi, __a.v1);2005}20062007#endif /* __emscripten_avxintrin_h__ */200820092010