// Path: blob/master/thirdparty/embree/common/simd/arm/avx2neon.h
// (page metadata from the hosting site: 9917 views)
#pragma once12#if !defined(__aarch64__)3#error "avx2neon is only supported for AARCH64"4#endif56#include "sse2neon.h"78#define AVX2NEON_ABI static inline __attribute__((always_inline))91011struct __m256 {12__m128 lo,hi;13__m256() {}14};1516171819struct __m256i {20__m128i lo,hi;21explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}22operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}23__m256i() {}24};2526272829struct __m256d {30float64x2_t lo,hi;31__m256d() {}32__m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}33__m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}34};3536#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}373839#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}40#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}4142#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}434445#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}46474849#define _mm_stream_load_si128 _mm_load_si12850#define _mm256_stream_load_si256 _mm256_load_si256515253AVX2NEON_ABI54__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)55{56__m128 af = _mm_castsi128_ps(a);57__m128 bf = _mm_castsi128_ps(b);58__m128 blendf = _mm_blend_ps(af, bf, imm8);59return _mm_castps_si128(blendf);60}6162AVX2NEON_ABI63int _mm_movemask_popcnt(__m128 
a)64{65return __builtin_popcount(_mm_movemask_ps(a));66}6768AVX2NEON_ABI69__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)70{71float32x4_t res;72uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);73for (int i=0;i<4;i++) {74if (mask_u32[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;75}76return vreinterpretq_m128_f32(res);77}7879AVX2NEON_ABI80void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)81{82float32x4_t a_f32 = vreinterpretq_f32_m128(a);83uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);84for (int i=0;i<4;i++) {85if (mask_u32[i] & 0x80000000) mem_addr[i] = a_f32[i];86}87}8889AVX2NEON_ABI90void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)91{92uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);93int32x4_t a_s32 = vreinterpretq_s32_m128i(a);94for (int i=0;i<4;i++) {95if (mask_u32[i] & 0x80000000) mem_addr[i] = a_s32[i];96}97}9899100#define _mm_fmadd_ss _mm_fmadd_ps101#define _mm_fmsub_ss _mm_fmsub_ps102#define _mm_fnmsub_ss _mm_fnmsub_ps103#define _mm_fnmadd_ss _mm_fnmadd_ps104105template<int code>106AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)107{108float v;109v = 0;110v += (code & 0x10) ? a[0]*b[0] : 0;111v += (code & 0x20) ? a[1]*b[1] : 0;112v += (code & 0x40) ? a[2]*b[2] : 0;113v += (code & 0x80) ? a[3]*b[3] : 0;114float32x4_t res;115res[0] = (code & 0x1) ? v : 0;116res[1] = (code & 0x2) ? v : 0;117res[2] = (code & 0x4) ? v : 0;118res[3] = (code & 0x8) ? 
v : 0;119return res;120}121122template<>123inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)124{125float v;126float32x4_t m = _mm_mul_ps(a,b);127m[3] = 0;128v = vaddvq_f32(m);129return _mm_set1_ps(v);130}131132template<>133inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)134{135float v;136float32x4_t m = _mm_mul_ps(a,b);137v = vaddvq_f32(m);138return _mm_set1_ps(v);139}140141#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))142143144AVX2NEON_ABI145__m128 _mm_permutevar_ps (__m128 a, __m128i b)146{147uint32x4_t b_u32 = vreinterpretq_u32_m128i(b);148float32x4_t x;149for (int i=0;i<4;i++)150{151x[i] = a[b_u32[i]];152}153return vreinterpretq_m128_f32(x);154}155156AVX2NEON_ABI157__m256i _mm256_setzero_si256()158{159__m256i res;160res.lo = res.hi = vdupq_n_s32(0);161return res;162}163164AVX2NEON_ABI165__m256 _mm256_setzero_ps()166{167__m256 res;168res.lo = res.hi = vdupq_n_f32(0.0f);169return res;170}171172AVX2NEON_ABI173__m256i _mm256_undefined_si256()174{175return _mm256_setzero_si256();176}177178AVX2NEON_ABI179__m256 _mm256_undefined_ps()180{181return _mm256_setzero_ps();182}183184CAST_SIMD_TYPE(__m256d, _mm256_castps_pd, __m256, float64x2_t)185CAST_SIMD_TYPE(__m256i, _mm256_castps_si256, __m256, __m128i)186CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i, __m128)187CAST_SIMD_TYPE(__m256, _mm256_castpd_ps , __m256d, __m128)188CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i, float64x2_t)189CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d, __m128i)190191192193194AVX2NEON_ABI195__m128 _mm256_castps256_ps128 (__m256 a)196{197return a.lo;198}199200AVX2NEON_ABI201__m256i _mm256_castsi128_si256 (__m128i a)202{203__m256i res;204res.lo = a ;205res.hi = vdupq_n_s32(0);206return res;207}208209AVX2NEON_ABI210__m128i _mm256_castsi256_si128 (__m256i a)211{212return a.lo;213}214215AVX2NEON_ABI216__m256 _mm256_castps128_ps256 (__m128 a)217{218__m256 res;219res.lo = a;220res.hi = vdupq_n_f32(0);221return 
res;222}223224225AVX2NEON_ABI226__m256 _mm256_broadcast_ss (float const * mem_addr)227{228__m256 res;229res.lo = res.hi = vdupq_n_f32(*mem_addr);230return res;231}232233234AVX2NEON_ABI235__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)236{237__m256i res;238res.lo = _mm_set_epi32(e3,e2,e1,e0);239res.hi = _mm_set_epi32(e7,e6,e5,e4);240return res;241242}243244AVX2NEON_ABI245__m256i _mm256_set1_epi32 (int a)246{247__m256i res;248res.lo = res.hi = vdupq_n_s32(a);249return res;250}251AVX2NEON_ABI252__m256i _mm256_set1_epi8 (int a)253{254__m256i res;255res.lo = res.hi = vdupq_n_s8(a);256return res;257}258AVX2NEON_ABI259__m256i _mm256_set1_epi16 (int a)260{261__m256i res;262res.lo = res.hi = vdupq_n_s16(a);263return res;264}265266267268269AVX2NEON_ABI270int _mm256_movemask_ps(const __m256& v)271{272return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);273}274275template<int imm8>276AVX2NEON_ABI277__m256 __mm256_permute_ps (const __m256& a)278{279__m256 res;280res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);281res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);282return res;283284}285286#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)287288289template<int imm8>290AVX2NEON_ABI291__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)292{293__m256 res;294res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);295res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);296return res;297298}299300template<int imm8>301AVX2NEON_ABI302__m256i __mm256_shuffle_epi32 (const __m256i a)303{304__m256i res;305res.lo = _mm_shuffle_epi32(a.lo,imm8);306res.hi = _mm_shuffle_epi32(a.hi,imm8);307return res;308309}310311template<int imm8>312AVX2NEON_ABI313__m256i __mm256_srli_si256 (__m256i a)314{315__m256i res;316res.lo = _mm_srli_si128(a.lo,imm8);317res.hi = _mm_srli_si128(a.hi,imm8);318return res;319}320321template<int imm8>322AVX2NEON_ABI323__m256i __mm256_slli_si256 (__m256i a)324{325__m256i res;326res.lo = _mm_slli_si128(a.lo,imm8);327res.hi = _mm_slli_si128(a.hi,imm8);328return 
res;329}330331332#define _mm256_srli_si256(a,b) __mm256_srli_si256<b>(a)333#define _mm256_slli_si256(a,b) __mm256_slli_si256<b>(a)334335336337#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)338#define _mm256_shuffle_epi32(a,c) __mm256_shuffle_epi32<c>(a)339340341AVX2NEON_ABI342__m256i _mm256_set1_epi64x (long long a)343{344__m256i res;345int64x2_t t = vdupq_n_s64(a);346res.lo = res.hi = __m128i(t);347return res;348}349350351AVX2NEON_ABI352__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)353{354__m256 res;355__m128 tmp;356switch (imm8 & 0x7)357{358case 0: tmp = a.lo; break;359case 1: tmp = a.hi; break;360case 2: tmp = b.lo; break;361case 3: tmp = b.hi; break;362}363if (imm8 & 0x8)364tmp = _mm_setzero_ps();365366367368res.lo = tmp;369imm8 >>= 4;370371switch (imm8 & 0x7)372{373case 0: tmp = a.lo; break;374case 1: tmp = a.hi; break;375case 2: tmp = b.lo; break;376case 3: tmp = b.hi; break;377}378if (imm8 & 0x8)379tmp = _mm_setzero_ps();380381res.hi = tmp;382383return res;384}385386AVX2NEON_ABI387__m256 _mm256_moveldup_ps (__m256 a)388{389__m256 res;390res.lo = _mm_moveldup_ps(a.lo);391res.hi = _mm_moveldup_ps(a.hi);392return res;393}394395AVX2NEON_ABI396__m256 _mm256_movehdup_ps (__m256 a)397{398__m256 res;399res.lo = _mm_movehdup_ps(a.lo);400res.hi = _mm_movehdup_ps(a.hi);401return res;402}403404AVX2NEON_ABI405__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)406{407__m256 res = a;408if (imm8 & 1) res.hi = b;409else res.lo = b;410return res;411}412413414AVX2NEON_ABI415__m128 _mm256_extractf128_ps (__m256 a, const int imm8)416{417if (imm8 & 1) return a.hi;418return a.lo;419}420421422AVX2NEON_ABI423__m256d _mm256_movedup_pd (__m256d a)424{425__m256d res;426res.lo = _mm_movedup_pd(a.lo);427res.hi = _mm_movedup_pd(a.hi);428return res;429}430431AVX2NEON_ABI432__m256i _mm256_abs_epi32(__m256i a)433{434__m256i res;435res.lo = vabsq_s32(a.lo);436res.hi = vabsq_s32(a.hi);437return 
res;438}439440UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)441UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)442UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)443UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)444UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)445UNARY_AVX_OP(__m256i,_mm256_abs_epi16,_mm_abs_epi16)446447448BINARY_AVX_OP(__m256i,_mm256_add_epi8,_mm_add_epi8)449BINARY_AVX_OP(__m256i,_mm256_adds_epi8,_mm_adds_epi8)450451BINARY_AVX_OP(__m256i,_mm256_hadd_epi32,_mm_hadd_epi32)452BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)453BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)454BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)455456BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)457BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)458BINARY_AVX_OP(__m256i,_mm256_min_epi16,_mm_min_epi16)459BINARY_AVX_OP(__m256i,_mm256_max_epi16,_mm_max_epi16)460BINARY_AVX_OP(__m256i,_mm256_min_epi8,_mm_min_epi8)461BINARY_AVX_OP(__m256i,_mm256_max_epi8,_mm_max_epi8)462BINARY_AVX_OP(__m256i,_mm256_min_epu16,_mm_min_epu16)463BINARY_AVX_OP(__m256i,_mm256_max_epu16,_mm_max_epu16)464BINARY_AVX_OP(__m256i,_mm256_min_epu8,_mm_min_epu8)465BINARY_AVX_OP(__m256i,_mm256_max_epu8,_mm_max_epu8)466BINARY_AVX_OP(__m256i,_mm256_sign_epi16,_mm_sign_epi16)467468469BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)470BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)471472BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)473BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)474475BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)476BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)477BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)478BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)479480BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)481BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)482BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)483BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)484485BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float
64x2_t,int64x2_t)486BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)487BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)488489490491BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)492BINARY_AVX_OP(__m256i,_mm256_andnot_si256,_mm_andnot_si128)493BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)494BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)495496497BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)498BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)499TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)500TERNARY_AVX_OP(__m256i,_mm256_blendv_epi8,_mm_blendv_epi8)501502503TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)504TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)505TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)506TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)507508509510BINARY_AVX_OP(__m256i,_mm256_packs_epi32,_mm_packs_epi32)511BINARY_AVX_OP(__m256i,_mm256_packs_epi16,_mm_packs_epi16)512BINARY_AVX_OP(__m256i,_mm256_packus_epi32,_mm_packus_epi32)513BINARY_AVX_OP(__m256i,_mm256_packus_epi16,_mm_packus_epi16)514515516BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi64,_mm_unpackhi_epi64)517BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)518BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi16,_mm_unpackhi_epi16)519BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi8,_mm_unpackhi_epi8)520521BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi64,_mm_unpacklo_epi64)522BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)523BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi16,_mm_unpacklo_epi16)524BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi8,_mm_unpacklo_epi8)525526BINARY_AVX_OP(__m256i,_mm256_mulhrs_epi16,_mm_mulhrs_epi16)527BINARY_AVX_OP(__m256i,_mm256_mulhi_epu16,_mm_mulhi_epu16)528BINARY_AVX_OP(__m256i,_mm256_mulhi_epi16,_mm_mulhi_epi16)529//BINARY_AVX_OP(__m256i,_mm256_mullo_epu16,_mm_mullo_epu16)530BINARY_AVX_OP(__m256i,_mm256_mullo_epi16,_mm_mullo_epi16)531532BINARY_AVX_OP(__m256i,_mm256_
subs_epu16,_mm_subs_epu16)533BINARY_AVX_OP(__m256i,_mm256_adds_epu16,_mm_adds_epu16)534BINARY_AVX_OP(__m256i,_mm256_subs_epi16,_mm_subs_epi16)535BINARY_AVX_OP(__m256i,_mm256_adds_epi16,_mm_adds_epi16)536BINARY_AVX_OP(__m256i,_mm256_sub_epi16,_mm_sub_epi16)537BINARY_AVX_OP(__m256i,_mm256_add_epi16,_mm_add_epi16)538BINARY_AVX_OP(__m256i,_mm256_sub_epi8,_mm_sub_epi8)539540541BINARY_AVX_OP(__m256i,_mm256_hadd_epi16,_mm_hadd_epi16)542BINARY_AVX_OP(__m256i,_mm256_hadds_epi16,_mm_hadds_epi16)543544545546547BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)548BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)549550BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi8,_mm_cmpeq_epi8)551BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi8,_mm_cmpgt_epi8)552553BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi16,_mm_cmpeq_epi16)554BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi16,_mm_cmpgt_epi16)555556557BINARY_AVX_OP(__m256i,_mm256_shuffle_epi8,_mm_shuffle_epi8)558559560BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)561BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)562BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)563BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)564BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)565BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)566BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)567BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)568BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)569BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)570571572AVX2NEON_ABI573__m256i _mm256_cvtps_epi32 (__m256 a)574{575__m256i res;576res.lo = _mm_cvtps_epi32(a.lo);577res.hi = _mm_cvtps_epi32(a.hi);578return res;579580}581582AVX2NEON_ABI583__m256i _mm256_cvttps_epi32 (__m256 a)584{585__m256i res;586res.lo = _mm_cvttps_epi32(a.lo);587res.hi = _mm_cvttps_epi32(a.hi);588return res;589590}591592AVX2NEON_ABI593__m256 _mm256_loadu_ps (float const * mem_addr)594{595__m256 res;596res.lo = *(__m128 *)(mem_addr + 0);597res.hi = *(__m128 *)(mem_addr + 4);598return 
res;599}600#define _mm256_load_ps _mm256_loadu_ps601602603AVX2NEON_ABI604int _mm256_testz_ps (const __m256& a, const __m256& b)605{606__m256 t = a;607if (&a != &b)608t = _mm256_and_ps(a,b);609610int32x4_t l = vshrq_n_s32(vreinterpretq_s32_m128(t.lo),31);611int32x4_t h = vshrq_n_s32(vreinterpretq_s32_m128(t.hi),31);612return vaddvq_s32(vaddq_s32(l,h)) == 0;613}614615616AVX2NEON_ABI617__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)618{619__m256i res;620int64x2_t t0 = {e0,e1};621int64x2_t t1 = {e2,e3};622res.lo = __m128i(t0);623res.hi = __m128i(t1);624return res;625}626AVX2NEON_ABI627__m256i _mm256_setr_epi64x (int64_t e0, int64_t e1, int64_t e2, int64_t e3)628{629__m256i res;630int64x2_t t0 = {e0,e1};631int64x2_t t1 = {e2,e3};632res.lo = __m128i(t0);633res.hi = __m128i(t1);634return res;635}636637638639AVX2NEON_ABI640__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)641{642int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};643int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31};644__m256i res;645res.lo = lo; res.hi = hi;646return res;647}648649AVX2NEON_ABI650__m256i _mm256_setr_epi8 (char e0, char e1, char e2, char e3, char e4, char e5, char e6, char e7, char e8, char e9, char e10, char e11, char e12, char e13, char e14, char e15, char e16, char e17, char e18, char e19, char e20, char e21, char e22, char e23, char e24, char e25, char e26, char e27, char e28, char e29, char e30, char e31)651{652int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};653int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31};654__m256i res;655res.lo = lo; res.hi = hi;656return 
res;657}658659660AVX2NEON_ABI661__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)662{663int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7};664int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15};665__m256i res;666res.lo = lo; res.hi = hi;667return res;668}669670AVX2NEON_ABI671__m256i _mm256_setr_epi16 (short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15)672{673int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7};674int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15};675__m256i res;676res.lo = lo; res.hi = hi;677return res;678}679680681682683AVX2NEON_ABI684int _mm256_movemask_epi8(const __m256i& a)685{686return (_mm_movemask_epi8(a.hi) << 16) | _mm_movemask_epi8(a.lo);687}688689690AVX2NEON_ABI691int _mm256_testz_si256(const __m256i& a,const __m256i& b)692{693uint32x4_t lo = vandq_u32(a.lo,b.lo);694uint32x4_t hi = vandq_u32(a.hi,b.hi);695696return (vaddvq_u32(lo) + vaddvq_u32(hi)) == 0;697}698699AVX2NEON_ABI700__m256d _mm256_setzero_pd ()701{702__m256d res;703res.lo = res.hi = vdupq_n_f64(0);704return res;705}706707AVX2NEON_ABI708int _mm256_movemask_pd (__m256d a)709{710return (_mm_movemask_pd(a.hi) << 2) | _mm_movemask_pd(a.lo);711}712713AVX2NEON_ABI714__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)715{716__m256i res;717res.lo = _mm_cmpeq_epi64(a.lo, b.lo);718res.hi = _mm_cmpeq_epi64(a.hi, b.hi);719return res;720}721722AVX2NEON_ABI723__m256d _mm256_cmpeq_pd (__m256d a, __m256d b)724{725__m256d res;726res.lo = _mm_cmpeq_pd(a.lo, b.lo);727res.hi = _mm_cmpeq_pd(a.hi, b.hi);728return res;729}730731732AVX2NEON_ABI733int _mm256_testz_pd (const __m256d& a, const __m256d& b)734{735__m256d t = a;736737if (&a != &b)738t = _mm256_and_pd(a,b);739740return _mm256_movemask_pd(t) == 0;741}742743AVX2NEON_ABI744__m256d _mm256_blendv_pd (__m256d a, __m256d b, 
__m256d mask)745{746__m256d res;747res.lo = _mm_blendv_pd(a.lo, b.lo, mask.lo);748res.hi = _mm_blendv_pd(a.hi, b.hi, mask.hi);749return res;750}751752template<int imm8>753AVX2NEON_ABI754__m256 __mm256_dp_ps (__m256 a, __m256 b)755{756__m256 res;757res.lo = _mm_dp_ps(a.lo, b.lo, imm8);758res.hi = _mm_dp_ps(a.hi, b.hi, imm8);759return res;760}761762#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)763764AVX2NEON_ABI765double _mm256_permute4x64_pd_select(__m256d a, const int imm8)766{767switch (imm8 & 3) {768case 0:769return ((float64x2_t)a.lo)[0];770case 1:771return ((float64x2_t)a.lo)[1];772case 2:773return ((float64x2_t)a.hi)[0];774case 3:775return ((float64x2_t)a.hi)[1];776}777__builtin_unreachable();778return 0;779}780781AVX2NEON_ABI782__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)783{784float64x2_t lo,hi;785lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);786lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);787hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);788hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);789790__m256d res;791res.lo = lo; res.hi = hi;792return res;793}794795AVX2NEON_ABI796__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)797{798return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));799}800801802AVX2NEON_ABI803__m256i _mm256_loadu_si256 (__m256i const * mem_addr)804{805__m256i res;806res.lo = *(__m128i *)((int32_t *)mem_addr + 0);807res.hi = *(__m128i *)((int32_t *)mem_addr + 4);808return res;809}810811#define _mm256_load_si256 _mm256_loadu_si256812813AVX2NEON_ABI814void _mm256_storeu_ps (float * mem_addr, __m256 a)815{816*(__m128 *)(mem_addr + 0) = a.lo;817*(__m128 *)(mem_addr + 4) = a.hi;818}819820#define _mm256_store_ps _mm256_storeu_ps821#define _mm256_stream_ps _mm256_storeu_ps822823824AVX2NEON_ABI825void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)826{827*(__m128i *)((int32_t *)mem_addr + 0) = a.lo;828*(__m128i *)((int32_t *)mem_addr + 4) = a.hi;829}830831#define _mm256_store_si256 
_mm256_storeu_si256832833834835AVX2NEON_ABI836__m256i _mm256_permute4x64_epi64 (const __m256i a, const int imm8)837{838uint8x16x2_t tbl = {a.lo, a.hi};839840uint8_t sz = sizeof(uint64_t);841uint8_t u64[4] = {842(uint8_t)(((imm8 >> 0) & 0x3) * sz),843(uint8_t)(((imm8 >> 2) & 0x3) * sz),844(uint8_t)(((imm8 >> 4) & 0x3) * sz),845(uint8_t)(((imm8 >> 6) & 0x3) * sz),846};847848uint8x16_t idx_lo = {849// lo[0] bytes850(uint8_t)(u64[0]+0), (uint8_t)(u64[0]+1), (uint8_t)(u64[0]+2), (uint8_t)(u64[0]+3),851(uint8_t)(u64[0]+4), (uint8_t)(u64[0]+5), (uint8_t)(u64[0]+6), (uint8_t)(u64[0]+7),852853// lo[1] bytes854(uint8_t)(u64[1]+0), (uint8_t)(u64[1]+1), (uint8_t)(u64[1]+2), (uint8_t)(u64[1]+3),855(uint8_t)(u64[1]+4), (uint8_t)(u64[1]+5), (uint8_t)(u64[1]+6), (uint8_t)(u64[1]+7),856};857uint8x16_t idx_hi = {858// hi[0] bytes859(uint8_t)(u64[2]+0), (uint8_t)(u64[2]+1), (uint8_t)(u64[2]+2), (uint8_t)(u64[2]+3),860(uint8_t)(u64[2]+4), (uint8_t)(u64[2]+5), (uint8_t)(u64[2]+6), (uint8_t)(u64[2]+7),861862// hi[1] bytes863(uint8_t)(u64[3]+0), (uint8_t)(u64[3]+1), (uint8_t)(u64[3]+2), (uint8_t)(u64[3]+3),864(uint8_t)(u64[3]+4), (uint8_t)(u64[3]+5), (uint8_t)(u64[3]+6), (uint8_t)(u64[3]+7),865};866867uint8x16_t lo = vqtbl2q_u8(tbl, idx_lo);868uint8x16_t hi = vqtbl2q_u8(tbl, idx_hi);869870__m256i res;871res.lo = lo; res.hi = hi;872return res;873}874875876AVX2NEON_ABI877__m256i _mm256_permute2x128_si256(const __m256i a,const __m256i b, const int imm8)878{879return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));880}881882883884AVX2NEON_ABI885__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)886{887__m256 res;888res.lo = _mm_maskload_ps(mem_addr,mask.lo);889res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);890return res;891}892893894AVX2NEON_ABI895__m256i _mm256_cvtepu8_epi32 (__m128i a)896{897uint8x16_t a_u8 = vreinterpretq_u8_m128i(a); // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA898uint16x8_t u16x8 = vmovl_u8(vget_low_u8(a_u8)); // 00HH 00GG 00FF 00EE 00DD 00CC 00BB 
00AA899uint32x4_t lo = vmovl_u16(vget_low_u16(u16x8)); // 0000 00DD 0000 00CC 0000 00BB 0000 00AA900uint32x4_t hi = vmovl_high_u16(u16x8); // 0000 00HH 0000 00GG 0000 00FF 0000 00EE901902__m256i res;903res.lo = lo; res.hi = hi;904return res;905}906907908AVX2NEON_ABI909__m256i _mm256_cvtepi8_epi32 (__m128i a)910{911int8x16_t a_s8 = vreinterpretq_s8_m128i(a); // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA912int16x8_t s16x8 = vmovl_s8(vget_low_s8(a_s8)); // ssHH ssGG ssFF ssEE ssDD ssCC ssBB ssAA913int32x4_t lo = vmovl_s16(vget_low_s16(s16x8)); // ssss ssDD ssss ssCC ssss ssBB ssss ssAA914int32x4_t hi = vmovl_high_s16(s16x8); // ssss ssHH ssss ssGG ssss ssFF ssss ssEE915916__m256i res;917res.lo = lo; res.hi = hi;918return res;919}920921922AVX2NEON_ABI923__m256i _mm256_cvtepi16_epi32 (__m128i a)924{925int16x8_t a_s16 = vreinterpretq_s16_m128i(a); // HHHH GGGG FFFF EEEE DDDD CCCC BBBB AAAA926int32x4_t lo = vmovl_s16(vget_low_s16(a_s16)); // ssss DDDD ssss CCCC ssss BBBB ssss AAAA927int32x4_t hi = vmovl_high_s16(a_s16); // ssss HHHH ssss GGGG ssss FFFF ssss EEEE928929__m256i res;930res.lo = lo; res.hi = hi;931return res;932}933934935936AVX2NEON_ABI937void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)938{939_mm_maskstore_epi32(mem_addr,mask.lo,a.lo);940_mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);941}942943AVX2NEON_ABI944__m256i _mm256_slli_epi64 (__m256i a, int imm8)945{946__m256i res;947res.lo = _mm_slli_epi64(a.lo,imm8);948res.hi = _mm_slli_epi64(a.hi,imm8);949return res;950}951952AVX2NEON_ABI953__m256i _mm256_slli_epi32 (__m256i a, int imm8)954{955__m256i res;956res.lo = _mm_slli_epi32(a.lo,imm8);957res.hi = _mm_slli_epi32(a.hi,imm8);958return res;959}960961962AVX2NEON_ABI963__m256i __mm256_slli_epi16 (__m256i a, int imm8)964{965__m256i res;966res.lo = _mm_slli_epi16(a.lo,imm8);967res.hi = _mm_slli_epi16(a.hi,imm8);968return res;969}970971972AVX2NEON_ABI973__m256i _mm256_srli_epi32 (__m256i a, int imm8)974{975__m256i res;976res.lo = 
_mm_srli_epi32(a.lo,imm8);977res.hi = _mm_srli_epi32(a.hi,imm8);978return res;979}980981AVX2NEON_ABI982__m256i __mm256_srli_epi16 (__m256i a, int imm8)983{984__m256i res;985res.lo = _mm_srli_epi16(a.lo,imm8);986res.hi = _mm_srli_epi16(a.hi,imm8);987return res;988}989990AVX2NEON_ABI991__m256i _mm256_cvtepu16_epi32(__m128i a)992{993__m256i res;994res.lo = vmovl_u16(vget_low_u16(a));995res.hi = vmovl_high_u16(a);996return res;997}998999AVX2NEON_ABI1000__m256i _mm256_cvtepu8_epi16(__m128i a)1001{1002__m256i res;1003res.lo = vmovl_u8(vget_low_u8(a));1004res.hi = vmovl_high_u8(a);1005return res;1006}100710081009AVX2NEON_ABI1010__m256i _mm256_srai_epi32 (__m256i a, int imm8)1011{1012__m256i res;1013res.lo = _mm_srai_epi32(a.lo,imm8);1014res.hi = _mm_srai_epi32(a.hi,imm8);1015return res;1016}10171018AVX2NEON_ABI1019__m256i _mm256_srai_epi16 (__m256i a, int imm8)1020{1021__m256i res;1022res.lo = _mm_srai_epi16(a.lo,imm8);1023res.hi = _mm_srai_epi16(a.hi,imm8);1024return res;1025}102610271028AVX2NEON_ABI1029__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)1030{1031__m256i res;1032res.lo = vshlq_s32(a.lo,count.lo);1033res.hi = vshlq_s32(a.hi,count.hi);1034return res;10351036}103710381039AVX2NEON_ABI1040__m256i _mm256_srav_epi32 (__m256i a, __m256i count)1041{1042__m256i res;1043res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));1044res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));1045return res;10461047}10481049AVX2NEON_ABI1050__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)1051{1052__m256i res;1053res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));1054res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));1055return res;10561057}105810591060AVX2NEON_ABI1061__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)1062{1063return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));1064}106510661067AVX2NEON_ABI1068__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)1069{1070if (imm8 & 1) return a.hi;1071return 
a.lo;1072}10731074AVX2NEON_ABI1075__m256 _mm256_set1_ps(float x)1076{1077__m256 res;1078res.lo = res.hi = vdupq_n_f32(x);1079return res;1080}10811082AVX2NEON_ABI1083__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)1084{1085__m256 res;1086res.lo = _mm_set_ps(e3,e2,e1,e0);1087res.hi = _mm_set_ps(e7,e6,e5,e4);1088return res;1089}10901091AVX2NEON_ABI1092__m256 _mm256_broadcast_ps (__m128 const * mem_addr)1093{1094__m256 res;1095res.lo = res.hi = *mem_addr;1096return res;1097}10981099AVX2NEON_ABI1100__m256 _mm256_cvtepi32_ps (__m256i a)1101{1102__m256 res;1103res.lo = _mm_cvtepi32_ps(a.lo);1104res.hi = _mm_cvtepi32_ps(a.hi);1105return res;1106}1107AVX2NEON_ABI1108void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)1109{1110uint32x4_t mask_lo = mask.lo;1111uint32x4_t mask_hi = mask.hi;1112float32x4_t a_lo = a.lo;1113float32x4_t a_hi = a.hi;11141115for (int i=0;i<4;i++) {1116if (mask_lo[i] & 0x80000000) mem_addr[i] = a_lo[i];1117if (mask_hi[i] & 0x80000000) mem_addr[i+4] = a_hi[i];1118}1119}11201121AVX2NEON_ABI1122__m256d _mm256_andnot_pd (__m256d a, __m256d b)1123{1124__m256d res;1125res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));1126res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));1127return res;1128}11291130AVX2NEON_ABI1131__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)1132{1133__m256 res;1134res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);1135res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);1136return res;11371138}113911401141AVX2NEON_ABI1142__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)1143{1144return __m256i(_mm256_blend_ps(__m256(a),__m256(b),imm8));11451146}11471148AVX2NEON_ABI1149__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)1150{1151__m256i res;1152res.lo = _mm_blend_epi16(a.lo,b.lo,imm8);1153res.hi = _mm_blend_epi16(a.hi,b.hi,imm8);1154return res;1155}1156115711581159AVX2NEON_ABI1160__m256i _mm256_i32gather_epi32 (int const* 
base_addr, __m256i vindex, const int scale)1161{1162int32x4_t vindex_lo = vindex.lo;1163int32x4_t vindex_hi = vindex.hi;1164int32x4_t lo,hi;1165for (int i=0;i<4;i++)1166{1167lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale));1168hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale));1169}11701171__m256i res;1172res.lo = lo; res.hi = hi;1173return res;1174}117511761177AVX2NEON_ABI1178__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)1179{1180uint32x4_t mask_lo = mask.lo;1181uint32x4_t mask_hi = mask.hi;1182int32x4_t vindex_lo = vindex.lo;1183int32x4_t vindex_hi = vindex.hi;1184int32x4_t lo,hi;1185lo = hi = _mm_setzero_si128();1186for (int i=0;i<4;i++)1187{1188if (mask_lo[i] >> 31) lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale));1189if (mask_hi[i] >> 31) hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale));1190}11911192__m256i res;1193res.lo = lo; res.hi = hi;1194return res;1195}119611971198