Path: blob/main/system/include/compat/smmintrin.h
6171 views
/*1* Copyright 2020 The Emscripten Authors. All rights reserved.2* Emscripten is available under two separate licenses, the MIT license and the3* University of Illinois/NCSA Open Source License. Both these licenses can be4* found in the LICENSE file.5*/6#ifndef __emscripten_smmintrin_h__7#define __emscripten_smmintrin_h__89#ifndef __SSE4_1__10#error "SSE4.1 instruction set not enabled"11#endif1213#include <tmmintrin.h>14#include <math.h> // For rint and rintf1516#define _mm_blend_epi16(__a, __b, __imm8) __extension__ ({ \17(__m128i)__builtin_shufflevector((__i16x8)(__m128i)(__a), \18(__i16x8)(__m128i)(__b), \19(((__imm8) & 1) ? 8 : 0), \20(((__imm8) & 2) ? 9 : 1), \21(((__imm8) & 4) ? 10 : 2), \22(((__imm8) & 8) ? 11 : 3), \23(((__imm8) & 16) ? 12 : 4), \24(((__imm8) & 32) ? 13 : 5), \25(((__imm8) & 64) ? 14 : 6), \26(((__imm8) & 128) ? 15 : 7)); })2728#define _mm_blend_pd(__a, __b, __imm8) __extension__ ({ \29(__m128d)__builtin_shufflevector((__f64x2)(__m128d)(__a), \30(__f64x2)(__m128d)(__b), \31(((__imm8) & 0x01) ? 2 : 0), \32(((__imm8) & 0x02) ? 3 : 1)); })3334#define _mm_blend_ps(__a, __b, __imm8) __extension__ ({ \35(__m128)__builtin_shufflevector((__f32x4)(__m128)(__a), (__f32x4)(__m128)(__b), \36(((__imm8) & 0x01) ? 4 : 0), \37(((__imm8) & 0x02) ? 5 : 1), \38(((__imm8) & 0x04) ? 6 : 2), \39(((__imm8) & 0x08) ? 7 : 3)); })4041static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))42_mm_blendv_epi8(__m128i __a, __m128i __b, __m128i __mask)43{44v128_t __M = wasm_i8x16_shr((v128_t)__mask, 7);45return (__m128i)wasm_v128_bitselect((v128_t)__b, (v128_t)__a, __M);46}4748static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))49_mm_blendv_pd(__m128d __a, __m128d __b, __m128d __mask)50{51v128_t __M = wasm_i64x2_shr((v128_t)__mask, 63);52return (__m128d)wasm_v128_bitselect((v128_t)__b, (v128_t)__a, __M);53}5455static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))56_mm_blendv_ps(__m128 __a, __m128 __b, __m128 __mask)57{58v128_t __M = wasm_i32x4_shr((v128_t)__mask, 31);59return (__m128)wasm_v128_bitselect((v128_t)__b, (v128_t)__a, __M);60}6162#define _MM_FROUND_TO_NEAREST_INT 0x0063#define _MM_FROUND_TO_NEG_INF 0x0164#define _MM_FROUND_TO_POS_INF 0x0265#define _MM_FROUND_TO_ZERO 0x0366#define _MM_FROUND_CUR_DIRECTION 0x046768#define _MM_FROUND_RAISE_EXC 0x0069#define _MM_FROUND_NO_EXC 0x087071#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)72#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)73#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)74#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)75#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)76#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)7778static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))79_mm_ceil_pd(__m128d __a)80{81return (__m128d)wasm_f64x2_ceil((v128_t)__a);82}8384static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))85_mm_ceil_ps(__m128 __a)86{87return (__m128)wasm_f32x4_ceil((v128_t)__a);88}8990static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))91_mm_ceil_ss(__m128 __a, __m128 __b)92{93return _mm_move_ss(__a, _mm_ceil_ps(__b));94}9596static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))97_mm_ceil_sd(__m128d __a, __m128d __b)98{99return _mm_move_sd(__a, _mm_ceil_pd(__b));100}101102static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))103_mm_floor_pd(__m128d __a)104{105return (__m128d)wasm_f64x2_floor((v128_t)__a);106}107108static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))109_mm_floor_ps(__m128 __a)110{111return (__m128)wasm_f32x4_floor((v128_t)__a);112}113114static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))115_mm_floor_ss(__m128 __a, __m128 __b)116{117return _mm_move_ss(__a, _mm_floor_ps(__b));118}119120static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))121_mm_floor_sd(__m128d __a, __m128d __b)122{123return _mm_move_sd(__a, _mm_floor_pd(__b));124}125126static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))127_mm_round_pd(__m128d __a, int __rounding)128{129switch(__rounding & 7)130{131case _MM_FROUND_TO_NEG_INF: return _mm_floor_pd(__a);132case _MM_FROUND_TO_POS_INF: return _mm_ceil_pd(__a);133case _MM_FROUND_TO_ZERO:134return (__m128d)wasm_f64x2_trunc((v128_t)__a);135default:136// _MM_FROUND_TO_NEAREST_INT and _MM_FROUND_CUR_DIRECTION (which is always nearest in Wasm SIMD)137// SSE implements "Banker's rounding", where even half-ways, e.g. 2.5 are rounded down,138// and odd numbers e.g. 3.5 are rounded up.139return (__m128d)wasm_f64x2_nearest((v128_t)__a);140}141}142143static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))144_mm_round_ps(__m128 __a, int __rounding)145{146switch(__rounding & 7)147{148case _MM_FROUND_TO_NEG_INF: return _mm_floor_ps(__a);149case _MM_FROUND_TO_POS_INF: return _mm_ceil_ps(__a);150case _MM_FROUND_TO_ZERO:151return (__m128)wasm_f32x4_trunc((v128_t)__a);152default:153// _MM_FROUND_TO_NEAREST_INT and _MM_FROUND_CUR_DIRECTION (which is always nearest in Wasm SIMD)154// SSE implements "Banker's rounding", where even half-ways, e.g. 2.5 are rounded down,155// and odd numbers e.g. 3.5 are rounded up.156return (__m128)wasm_f32x4_nearest((v128_t)__a);157}158}159160static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))161_mm_round_ss(__m128 __a, __m128 __b, int __rounding)162{163return _mm_move_ss(__a, _mm_round_ps(__b, __rounding));164}165166static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))167_mm_round_sd(__m128d __a, __m128d __b, int __rounding)168{169return _mm_move_sd(__a, _mm_round_pd(__b, __rounding));170}171172static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))173_mm_mullo_epi32(__m128i __a, __m128i __b)174{175return (__m128i)wasm_i32x4_mul(__a, __b);176}177178static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))179_mm_mul_epi32(__m128i __a, __m128i __b)180{181return (__m128i)wasm_i64x2_extmul_low_i32x4(182(v128_t)_mm_shuffle_epi32(__a, _MM_SHUFFLE(2, 0, 2, 0)),183(v128_t)_mm_shuffle_epi32(__b, _MM_SHUFFLE(2, 0, 2, 0)));184}185186#define _mm_dp_ps(__a, __b, __imm8) __extension__ ({ \187__m128 __tmp = _mm_mul_ps(__a, __b); \188__m128 __zero = _mm_setzero_ps(); \189__tmp = _mm_blend_ps(__zero, __tmp, __imm8 >> 4); \190__m128 __sum = _mm_add_ps(__tmp, _mm_shuffle_ps(__tmp, __tmp, _MM_SHUFFLE(2, 3, 0, 1))); \191__sum = _mm_add_ps(__sum, _mm_shuffle_ps(__sum, __sum, _MM_SHUFFLE(1, 0, 3, 2))); \192_mm_blend_ps(__zero, __sum, __imm8); })193194#define _mm_dp_pd(__a, __b, __imm8) __extension__ ({ \195__m128d __tmp = _mm_mul_pd(__a, __b); \196__m128d __zero = _mm_setzero_pd(); \197__tmp = _mm_blend_pd(__zero, __tmp, __imm8 >> 4); \198__m128d __sum = _mm_add_pd(__tmp, _mm_shuffle_pd(__tmp, __tmp, _MM_SHUFFLE2(0, 1))); \199_mm_blend_pd(__zero, __sum, __imm8); })200201#define _mm_stream_load_si128 _mm_load_si128202203static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))204_mm_min_epi8(__m128i __a, __m128i __b)205{206return (__m128i)wasm_i8x16_min(__a, __b);207}208209static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))210_mm_max_epi8(__m128i __a, __m128i __b)211{212return (__m128i)wasm_i8x16_max(__a, __b);213}214215static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))216_mm_min_epu16(__m128i __a, __m128i __b)217{218return (__m128i)wasm_u16x8_min(__a, __b);219}220221static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))222_mm_max_epu16(__m128i __a, __m128i __b)223{224return (__m128i)wasm_u16x8_max(__a, __b);225}226227static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))228_mm_min_epi32(__m128i __a, __m128i __b)229{230return (__m128i)wasm_i32x4_min(__a, __b);231}232233static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))234_mm_max_epi32(__m128i __a, __m128i __b)235{236return (__m128i)wasm_i32x4_max(__a, __b);237}238239static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))240_mm_min_epu32(__m128i __a, __m128i __b)241{242return (__m128i)wasm_u32x4_min(__a, __b);243}244245static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))246_mm_max_epu32(__m128i __a, __m128i __b)247{248return (__m128i)wasm_u32x4_max(__a, __b);249}250251#define _mm_insert_ps(__a, __b, __imm8) __extension__ ({ \252_Static_assert(__builtin_constant_p(__imm8), "Expected constant"); \253__m128 __tmp = __builtin_shufflevector((__f32x4)__a, (__f32x4)__b, \254((((__imm8) >> 4) & 3) == 0) ? ((((__imm8) >> 6) & 3) + 4) : 0, \255((((__imm8) >> 4) & 3) == 1) ? ((((__imm8) >> 6) & 3) + 4) : 1, \256((((__imm8) >> 4) & 3) == 2) ? ((((__imm8) >> 6) & 3) + 4) : 2, \257((((__imm8) >> 4) & 3) == 3) ? ((((__imm8) >> 6) & 3) + 4) : 3); \258(__m128)__builtin_shufflevector(__tmp, _mm_setzero_ps(), \259(((__imm8) & 1) ? 4 : 0), \260(((__imm8) & 2) ? 5 : 1), \261(((__imm8) & 4) ? 6 : 2), \262(((__imm8) & 8) ? 7 : 3)); })263264#define _mm_extract_ps(__a, __imm8) \265__extension__({ wasm_i32x4_extract_lane((v128_t)(__a), (__imm8)&3); })266267#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __f32x4 __a = (__f32x4)(X); \268(D) = __a[N]; }))269270#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))271272#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps(_mm_setzero_ps(), (X), \273_MM_MK_INSERTPS_NDX((N), 0, 0x0e))274275#define _mm_insert_epi8(__a, __i, __imm8) __extension__ ({ \276(__m128i)wasm_i8x16_replace_lane((__a), (__imm8) & 15, (__i)); })277278#define _mm_insert_epi32(__a, __i, __imm8) __extension__ ({ \279(__m128i)wasm_i32x4_replace_lane((__a), (__imm8) & 3, (__i)); })280281#define _mm_insert_epi64(__a, __i, __imm8) __extension__ ({ \282(__m128i)wasm_i64x2_replace_lane((__a), (__imm8) & 1, (__i)); })283284#define _mm_extract_epi8(__a, __imm8) __extension__ ({ \285wasm_u8x16_extract_lane((__a), (__imm8) & 15); })286287#define _mm_extract_epi32(__a, __imm8) __extension__ ({ \288wasm_i32x4_extract_lane((__a), (__imm8) & 3); })289290#define _mm_extract_epi64(__a, __imm8) __extension__ ({ \291wasm_i64x2_extract_lane((__a), (__imm8) & 1); })292293static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))294_mm_testz_si128(__m128i __a, __m128i __b)295{296v128_t __m = wasm_v128_and(__a, __b);297return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) == 0;298}299300static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))301_mm_testc_si128(__m128i __a, __m128i __b)302{303v128_t __m = wasm_v128_andnot(__b, __a);304return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) == 0;305}306307static __inline__ int __attribute__((__always_inline__, __nodebug__))308_mm_testnzc_si128(__m128i __a, __m128i __b)309{310v128_t __m1 = wasm_v128_and(__a, __b);311v128_t __m2 = wasm_v128_andnot(__b, __a);312return (wasm_i64x2_extract_lane(__m1, 0) | wasm_i64x2_extract_lane(__m1, 1))313&& (wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1));314}315316static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))317_mm_test_all_ones(__m128i __a)318{319return (wasm_i64x2_extract_lane(__a, 0) & wasm_i64x2_extract_lane(__a, 1)) == 0xFFFFFFFFFFFFFFFFull;320}321322static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))323_mm_test_all_zeros(__m128i __a, __m128i __mask)324{325v128_t __m = wasm_v128_and(__a, __mask);326return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) == 0;327}328329static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))330_mm_test_mix_ones_zeros(__m128i __a, __m128i __mask)331{332v128_t __m = wasm_v128_and(__a, __mask);333long long __c0 = wasm_i64x2_extract_lane(__m, 0);334long long __c1 = wasm_i64x2_extract_lane(__m, 1);335long long __ones = __c0 | __c1;336long long __zeros = ~(__c0 & __c1);337return __ones && __zeros;338}339340static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))341_mm_cmpeq_epi64(__m128i __a, __m128i __b)342{343const __m128i __mask = _mm_cmpeq_epi32(__a, __b);344return _mm_and_si128(__mask, _mm_shuffle_epi32(__mask, _MM_SHUFFLE(2, 3, 0, 1)));345}346347static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))348_mm_cvtepi8_epi16(__m128i __a)349{350return (__m128i)wasm_i16x8_widen_low_i8x16((v128_t)__a);351}352353static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))354_mm_cvtepi8_epi32(__m128i __a)355{356return (__m128i)wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16((v128_t)__a));357}358359static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))360_mm_cvtepi8_epi64(__m128i __a)361{362const __m128i __exta = _mm_cvtepi8_epi32(__a);363const __m128i __sign = _mm_cmpgt_epi32(_mm_setzero_si128(), __exta);364return _mm_unpacklo_epi32(__exta, __sign);365}366367static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))368_mm_cvtepi16_epi32(__m128i __a)369{370return (__m128i)wasm_i32x4_widen_low_i16x8((v128_t)__a);371}372373static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))374_mm_cvtepi16_epi64(__m128i __a)375{376const __m128i __exta = _mm_cvtepi16_epi32(__a);377const __m128i __sign = _mm_cmpgt_epi32(_mm_setzero_si128(), __exta);378return _mm_unpacklo_epi32(__exta, __sign);379}380381static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))382_mm_cvtepi32_epi64(__m128i __a)383{384const __m128i __sign = _mm_cmpgt_epi32(_mm_setzero_si128(), __a);385return _mm_unpacklo_epi32(__a, __sign);386}387388static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))389_mm_cvtepu8_epi16(__m128i __a)390{391return (__m128i)wasm_u16x8_extend_low_u8x16((v128_t)__a);392}393394static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))395_mm_cvtepu8_epi32(__m128i __a)396{397return (__m128i)wasm_u32x4_extend_low_u16x8(wasm_i16x8_widen_low_u8x16((v128_t)__a));398}399400static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))401_mm_cvtepu8_epi64(__m128i __a)402{403const __m128i __zero = _mm_setzero_si128();404return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(__a, __zero), __zero), __zero);405}406407static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))408_mm_cvtepu16_epi32(__m128i __a)409{410return (__m128i)wasm_u32x4_extend_low_u16x8((v128_t)__a);411}412413static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))414_mm_cvtepu16_epi64(__m128i __a)415{416const __m128i __zero = _mm_setzero_si128();417return _mm_unpacklo_epi32(_mm_unpacklo_epi16(__a, __zero), __zero);418}419420static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))421_mm_cvtepu32_epi64(__m128i __a)422{423const __m128i __zero = _mm_setzero_si128();424return _mm_unpacklo_epi32(__a, __zero);425}426427static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))428_mm_packus_epi32(__m128i __a, __m128i __b)429{430return (__m128i)wasm_u16x8_narrow_i32x4(__a, __b);431}432433static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))434__uabs(int __i)435{436return (unsigned short)((__i >= 0) ? __i : -__i);437}438439static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))440_mm_mpsadbw_epu8(__m128i __a, __m128i __b, int __imm8)441{442int __aOffset = __imm8 & 4;443int __bOffset = (__imm8 & 3) << 2;444unsigned short __ret[8];445for(int __i = 0; __i < 8; ++__i)446{447__ret[__i] = __uabs(((__u8x16)__a)[__i + __aOffset ] - ((__u8x16)__b)[__bOffset ])448+ __uabs(((__u8x16)__a)[__i + __aOffset + 1] - ((__u8x16)__b)[__bOffset + 1])449+ __uabs(((__u8x16)__a)[__i + __aOffset + 2] - ((__u8x16)__b)[__bOffset + 2])450+ __uabs(((__u8x16)__a)[__i + __aOffset + 3] - ((__u8x16)__b)[__bOffset + 3]);451}452return (__m128i)wasm_v128_load(__ret);453}454455static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))456_mm_minpos_epu16(__m128i __a)457{458unsigned short __min[2] = { 0xFFFF, 0 };459for(int __i = 0; __i < 8; ++__i)460{461unsigned short __v = ((__u16x8)__a)[__i];462if (__v < __min[0])463{464__min[0] = __v;465__min[1] = __i;466}467}468return (__m128i)wasm_i32x4_make(*(int*)__min, 0, 0, 0);469}470471// Clang and GCC compatibility: Both Clang and GCC include SSE4.2 headers from SSE4.1 headers472#ifdef __SSE4_2__473#include <nmmintrin.h>474#endif475476#endif /* __emscripten_smmintrin_h__ */477478479