Path: blob/master/thirdparty/embree/common/simd/arm/sse2neon.h
#ifndef SSE2NEON_H
#define SSE2NEON_H

// This header file provides a simple API translation layer
// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
//
// Contributors to this work are:
//   John W. Ratcliff <[email protected]>
//   Brandon Rowlett <[email protected]>
//   Ken Fast <[email protected]>
//   Eric van Beurden <[email protected]>
//   Alexander Potylitsin <[email protected]>
//   Hasindu Gamaarachchi <[email protected]>
//   Jim Huang <[email protected]>
//   Mark Cheng <[email protected]>
//   Malcolm James MacLeod <[email protected]>
//   Devin Hussey (easyaspi314) <[email protected]>
//   Sebastian Pop <[email protected]>
//   Developer Ecosystem Engineering <[email protected]>
//   Danila Kutenin <[email protected]>
//   François Turban (JishinMaster) <[email protected]>
//   Pei-Hsuan Hung <[email protected]>
//   Yang-Hao Yuan <[email protected]>
//   Syoyo Fujita <[email protected]>
//   Brecht Van Lommel <[email protected]>
//   Jonathan Hue <[email protected]>
//   Cuda Chen <[email protected]>
//   Aymen Qader <[email protected]>

/*
 * sse2neon is freely redistributable under the MIT License.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Tunable configurations */

/* Enable precise implementation of math operations
 * This would slow down the computation a bit, but gives consistent result with
 * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
 */
/* _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif

/* compiler specific definitions */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#else /* non-GNU / non-clang compilers */
#warning "Macro name collisions may happen with unsupported compiler."
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#endif

/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <stdint.h>
#include <stdlib.h>

#if defined(_WIN32) && !defined(__MINGW32__)
/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
 * from both MinGW-w64 and MSVC.
 */
#define SSE2NEON_ALLOC_DEFINED
#endif

/* If using MSVC */
#ifdef _MSC_VER
#include <intrin.h>
#if (defined(_M_AMD64) || defined(__x86_64__)) || \
    (defined(_M_ARM) || defined(__arm__))
#define SSE2NEON_HAS_BITSCAN64
#endif
#endif

/* Compiler barrier */
#define SSE2NEON_BARRIER()                     \
    do {                                       \
        __asm__ __volatile__("" ::: "memory"); \
        (void) 0;                              \
    } while (0)

/* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
 * semantics.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif

FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
    /* FIXME: MSVC support */
#endif
}

/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__)
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif

#include <arm_neon.h>
#if !defined(__aarch64__) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif

/* Apple Silicon cache lines are twice the size commonly used by Intel, AMD
 * and other Arm microarchitectures.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif

/* Rounding functions require either Aarch64 instructions or a libm fallback */
#if !defined(__aarch64__)
#include <math.h>
#endif

/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or even not accessible in user mode.
 * To write to or read these registers in user mode,
 * we have to perform a syscall instead.
 */
#if !defined(__aarch64__)
#include <sys/time.h>
#endif

/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1

// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif

#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif

/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 in the result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
#if defined(__aarch64__)
#define _MN_SHUFFLE(fp3, fp2, fp1, fp0)                                       \
    ((uint8x16_t){(((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), \
                  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), \
                  (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), \
                  (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3)})
#define _MF_SHUFFLE(fp3, fp2, fp1, fp0)                                       \
    ((uint8x16_t){(((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), \
                  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), \
                  (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2),       \
                  (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1),       \
                  (((fp0)*4)+16+2), (((fp0)*4)+16+3)})
#endif

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
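
/* As a usage sketch (illustrative values only): _MM_SHUFFLE packs four 2-bit
 * lane selectors into a single immediate, e.g.
 *   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4   // identity selection
 *   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B   // reverse the four lanes
 * so a call such as
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 * keeps a's two low lanes and b's two high lanes in their original positions.
 */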
#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
    __builtin_shufflevector(a, b, __VA_ARGS__)
#elif __has_builtin(__builtin_shuffle)
#define _sse2neon_shuffle(type, a, b, ...) \
    __extension__({                        \
        type tmp = {__VA_ARGS__};          \
        __builtin_shuffle(a, b, tmp);      \
    })
#endif

#ifdef _sse2neon_shuffle
#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
#endif

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000
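
/* Usage sketch for the control constants above (illustrative only): the
 * _MM_ROUND_* and _MM_FLUSH_ZERO_* values are passed to the mode accessors,
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 * while the _MM_FROUND_* values select per-call rounding, e.g.
 *   __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */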
An integer vector type can contain any type311* of integer, from chars to shorts to unsigned long longs.312*/313typedef int64x1_t __m64;314typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */315// On ARM 32-bit architecture, the float64x2_t is not supported.316// The data type __m128d should be represented in a different way for related317// intrinsic conversion.318#if defined(__aarch64__)319typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */320#else321typedef float32x4_t __m128d;322#endif323typedef int64x2_t __m128i; /* 128-bit vector containing integers */324325// __int64 is defined in the Intrinsics Guide which maps to different datatype326// in different data model327#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))328#if (defined(__x86_64__) || defined(__i386__))329#define __int64 long long330#else331#define __int64 int64_t332#endif333#endif334335/* type-safe casting between types */336337#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)338#define vreinterpretq_m128_f32(x) (x)339#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)340341#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)342#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)343#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)344#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)345346#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)347#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)348#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)349#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)350351#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)352#define vreinterpretq_f32_m128(x) (x)353#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)354355#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)356#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)357#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)358#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)359360#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)361#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)362#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)363#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)364365#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)366#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)367#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)368#define vreinterpretq_m128i_s64(x) (x)369370#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)371#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)372#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)373#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)374375#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)376#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)377378#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)379#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)380#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)381#define vreinterpretq_s64_m128i(x) (x)382383#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)384#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)385#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)386#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)387388#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)389#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)390#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)391#define vreinterpret_m64_s64(x) 
(x)392393#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)394#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)395#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)396#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)397398#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)399#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)400#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)401402#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)403#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)404#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)405#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)406407#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)408#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)409#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)410#define vreinterpret_s64_m64(x) (x)411412#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)413414#if defined(__aarch64__)415#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)416#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)417418#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)419420#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)421#define vreinterpretq_m128d_f64(x) (x)422423#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)424425#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)426#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)427428#define vreinterpretq_f64_m128d(x) (x)429#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)430#else431#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)432#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)433434#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)435#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)436437#define vreinterpretq_m128d_f32(x) (x)438439#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)440441#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)442#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)443444#define vreinterpretq_f32_m128d(x) (x)445#endif446447// A struct is defined in this header file called 'SIMDVec' which can be used448// by applications which attempt to access the contents of an __m128 struct449// directly. It is important to note that accessing the __m128 struct directly450// is bad coding practice by Microsoft: @see:451// https://docs.microsoft.com/en-us/cpp/cpp/m128452//453// However, some legacy source code may try to access the contents of an __m128454// struct directly so the developer can use the SIMDVec as an alias for it. Any455// casting must be done manually by the developer, as you cannot cast or456// otherwise alias the base NEON data type for intrinsic operations.457//458// union intended to allow direct access to an __m128 variable using the names459// that the MSVC compiler provides. This union should really only be used when460// trying to access the members of the vector as integer values. GCC/clang461// allow native access to the float members through a simple array access462// operator (in C since 4.6, in C++ since 4.8).463//464// Ideally direct accesses to SIMD vectors should not be used since it can cause465// a performance hit. If it really is needed however, the original __m128466// variable can be aliased with a pointer to this union and used to access467// individual components. 
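
/* Usage sketch (illustrative only): with the casting helpers above, a single
 * lane of an __m128i can be read without first storing the vector to memory:
 *   __m128i v = _mm_set1_epi32(42);
 *   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); // 42
 * Keeping such accesses behind these macros is preferable to aliasing the
 * underlying NEON types by hand.
 */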
The use of this union should be hidden behind a macro468// that is used throughout the codebase to access the members instead of always469// declaring this type of variable.470typedef union ALIGN_STRUCT(16) SIMDVec {471float m128_f32[4]; // as floats - DON'T USE. Added for convenience.472int8_t m128_i8[16]; // as signed 8-bit integers.473int16_t m128_i16[8]; // as signed 16-bit integers.474int32_t m128_i32[4]; // as signed 32-bit integers.475int64_t m128_i64[2]; // as signed 64-bit integers.476uint8_t m128_u8[16]; // as unsigned 8-bit integers.477uint16_t m128_u16[8]; // as unsigned 16-bit integers.478uint32_t m128_u32[4]; // as unsigned 32-bit integers.479uint64_t m128_u64[2]; // as unsigned 64-bit integers.480} SIMDVec;481482// casting using SIMDVec483#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])484#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])485#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])486487/* SSE macros */488#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode489#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode490#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode491#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode492493// Function declaration494// SSE495FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();496FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);497FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);498FORCE_INLINE __m128 _mm_set_ps1(float);499FORCE_INLINE __m128 _mm_setzero_ps(void);500// SSE2501FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);502FORCE_INLINE __m128i _mm_castps_si128(__m128);503FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);504FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);505FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);506FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);507FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);508FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);509FORCE_INLINE __m128d _mm_set_pd(double, double);510FORCE_INLINE __m128i _mm_set1_epi32(int);511FORCE_INLINE __m128i _mm_setzero_si128();512// SSE4.1513FORCE_INLINE __m128d _mm_ceil_pd(__m128d);514FORCE_INLINE __m128 _mm_ceil_ps(__m128);515FORCE_INLINE __m128d _mm_floor_pd(__m128d);516FORCE_INLINE __m128 _mm_floor_ps(__m128);517FORCE_INLINE __m128d _mm_round_pd(__m128d, int);518FORCE_INLINE __m128 _mm_round_ps(__m128, int);519// SSE4.2520FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);521522/* Backwards compatibility for compilers with lack of specific type support */523524// Older gcc does not define vld1q_u8_x4 type525#if defined(__GNUC__) && !defined(__clang__) && \526((__GNUC__ <= 12 && defined(__arm__)) || \527(__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \528(__GNUC__ <= 9 && defined(__aarch64__)))529FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)530{531uint8x16x4_t ret;532ret.val[0] = vld1q_u8(p + 0);533ret.val[1] = vld1q_u8(p + 16);534ret.val[2] = vld1q_u8(p + 32);535ret.val[3] = vld1q_u8(p + 48);536return ret;537}538#else539// Wraps vld1q_u8_x4540FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)541{542return vld1q_u8_x4(p);543}544#endif545546#if !defined(__aarch64__)547/* emulate vaddv u8 variant */548FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)549{550const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));551return vget_lane_u8(vreinterpret_u8_u64(v1), 0);552}553#else554// Wraps vaddv_u8555FORCE_INLINE uint8_t 
_sse2neon_vaddv_u8(uint8x8_t v8)556{557return vaddv_u8(v8);558}559#endif560561#if !defined(__aarch64__)562/* emulate vaddvq u8 variant */563FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)564{565uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));566uint8_t res = 0;567for (int i = 0; i < 8; ++i)568res += tmp[i];569return res;570}571#else572// Wraps vaddvq_u8573FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)574{575return vaddvq_u8(a);576}577#endif578579#if !defined(__aarch64__)580/* emulate vaddvq u16 variant */581FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)582{583uint32x4_t m = vpaddlq_u16(a);584uint64x2_t n = vpaddlq_u32(m);585uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);586587return vget_lane_u32((uint32x2_t) o, 0);588}589#else590// Wraps vaddvq_u16591FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)592{593return vaddvq_u16(a);594}595#endif596597/* Function Naming Conventions598* The naming convention of SSE intrinsics is straightforward. A generic SSE599* intrinsic function is given as follows:600* _mm_<name>_<data_type>601*602* The parts of this format are given as follows:603* 1. <name> describes the operation performed by the intrinsic604* 2. <data_type> identifies the data type of the function's primary arguments605*606* This last part, <data_type>, is a little complicated. It identifies the607* content of the input values, and can be set to any of the following values:608* + ps - vectors contain floats (ps stands for packed single-precision)609* + pd - vectors cantain doubles (pd stands for packed double-precision)610* + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit611* signed integers612* + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit613* unsigned integers614* + si128 - unspecified 128-bit vector or 256-bit vector615* + m128/m128i/m128d - identifies input vector types when they are different616* than the type of the returned vector617*618* For example, _mm_setzero_ps. The _mm implies that the function returns619* a 128-bit vector. The _ps at the end implies that the argument vectors620* contain floats.621*622* A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)623* // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits624* __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);625* // Set packed 8-bit integers626* // 128 bits, 16 chars, per 8 bits627* __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,628* 4, 5, 12, 13, 6, 7, 14, 15);629* // Shuffle packed 8-bit integers630* __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb631*632* Data (Number, Binary, Byte Index):633+------+------+-------------+------+------+-------------+634| 1 | 2 | 3 | 4 | Number635+------+------+------+------+------+------+------+------+636| 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary637+------+------+------+------+------+------+------+------+638| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index639+------+------+------+------+------+------+------+------+640641+------+------+------+------+------+------+------+------+642| 5 | 6 | 7 | 8 | Number643+------+------+------+------+------+------+------+------+644| 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary645+------+------+------+------+------+------+------+------+646| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index647+------+------+------+------+------+------+------+------+648* Index (Byte Index):649+------+------+------+------+------+------+------+------+650| 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |651+------+------+------+------+------+------+------+------+652653+------+------+------+------+------+------+------+------+654| 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |655+------+------+------+------+------+------+------+------+656* Result:657+------+------+------+------+------+------+------+------+658| 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index659+------+------+------+------+------+------+------+------+660| 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary661+------+------+------+------+------+------+------+------+662| 256 | 2 | 5 | 6 | Number663+------+------+------+------+------+------+------+------+664665+------+------+------+------+------+------+------+------+666| 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index667+------+------+------+------+------+------+------+------+668| 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary669+------+------+------+------+------+------+------+------+670| 3 | 7 | 4 | 8 | Number671+------+------+------+------+------+------+-------------+672*/673674/* Constants for use with _mm_prefetch. 
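
/* Applying the convention to functions defined further below: in _mm_add_ps,
 * "add" names the operation and "ps" marks packed single-precision floats,
 * while in _mm_avg_pu16 the "pu16" suffix marks packed unsigned 16-bit
 * integers held in an __m64.
 */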
/* Constants for use with _mm_prefetch. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};

// The bit field mapping to the FPCR (floating-point control register)
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t bit24 : 1;
    uint8_t res2 : 7;
#if defined(__aarch64__)
    uint32_t res3;
#endif
} fpcr_bitfield;

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// takes the lower two 32-bit values from a and swaps them and places in high
// end of result takes the higher two 32 bit values from b and swaps them and
// places in low end of result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}

// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
// high
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}

// Kahan summation for accurate summation of floating-point numbers.
// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
{
    y -= *c;
    float t = *sum + y;
    *c = (t - *sum) - y;
    *sum = t;
}
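
/* Usage sketch (illustrative only): the helper above keeps small contributions
 * that plain float addition would discard, e.g.
 *   float sum = 1.0f, comp = 0.0f;
 *   for (int i = 0; i < 1000; i++)
 *       _sse2neon_kadd_f32(&sum, &comp, 1e-8f);
 * leaves sum (plus the compensation in comp) close to 1.00001f, whereas a
 * plain "sum += 1e-8f" loop would leave sum at exactly 1.0f.
 */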
#if defined(__ARM_FEATURE_CRYPTO) && \
    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// takes the lower two 32-bit values from a and swaps them and places in low
// end of result takes the higher two 32 bit values from a and swaps them and
// places in high end of result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}

// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}

// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif

// NEON does not support a general purpose permute intrinsic.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })

// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })

// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })

/* MMX */

// _mm_empty is a no-op on arm
FORCE_INLINE void _mm_empty(void) {}

/* SSE */

// Adds the four single-precision, floating-point values of a and b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}

// Computes the bitwise AND of the four single-precision, floating-point values
// of a and b.
//
//   r0 := a0 & b0
//   r1 := a1 & b1
//   r2 := a2 & b2
//   r3 := a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise AND-NOT of the four single-precision, floating-point
// values of a and b.
//
//   r0 := ~a0 & b0
//   r1 := ~a1 & b1
//   r2 := ~a2 & b2
//   r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 3
//     i := j*16
//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//   ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(
        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 7
//     i := j*8
//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//   ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compares for equality.
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for equality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
}

// Compares for greater than or equal.
// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
}

// Compares for greater than.
//
//   r0 := (a0 > b0) ? 0xffffffff : 0x0
//   r1 := (a1 > b1) ? 0xffffffff : 0x0
//   r2 := (a2 > b2) ? 0xffffffff : 0x0
//   r3 := (a3 > b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
}

// Compares for less than or equal.
//
//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmple_ps(a, b));
}

// Compares for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
}

// Compares for inequality.
// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for inequality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
}

// Compares for not greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
}

// Compares for not greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
}

// Compares for not less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
}

// Compares for not less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
}

// Compares the four 32-bit floats in a and b to check if any values are NaN.
// Ordered compare between each value returns true for "orderable" and false
// for "not orderable" (NaN).
// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
// also:
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    // Note: NEON does not have an ordered compare builtin.
    // Need to compare a eq a and b eq b to check for NaN,
    // then AND the results to get the final mask.
    uint32x4_t ceqaa =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t ceqbb =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
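
/* Worked example (illustrative values): with a = {1.0f, NAN, 2.0f, NAN} and
 * b = {1.0f, 1.0f, NAN, NAN}, the two self-comparisons above are all-ones only
 * in lanes whose input is not NaN, so _mm_cmpord_ps(a, b) yields
 * {0xFFFFFFFF, 0, 0, 0}: only lane 0 has two ordered (non-NaN) operands.
 */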
:1470// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx1471FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)1472{1473uint32x4_t a_gt_b =1474vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));1475return vgetq_lane_u32(a_gt_b, 0) & 0x1;1476}14771478// Compares the lower single-precision floating point scalar values of a and b1479// using a less than or equal operation. :1480// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx1481FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)1482{1483uint32x4_t a_le_b =1484vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));1485return vgetq_lane_u32(a_le_b, 0) & 0x1;1486}14871488// Compares the lower single-precision floating point scalar values of a and b1489// using a less than operation. :1490// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important1491// note!! The documentation on MSDN is incorrect! If either of the values is a1492// NAN the docs say you will get a one, but in fact, it will return a zero!!1493FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)1494{1495uint32x4_t a_lt_b =1496vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));1497return vgetq_lane_u32(a_lt_b, 0) & 0x1;1498}14991500// Compares the lower single-precision floating point scalar values of a and b1501// using an inequality operation. :1502// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx1503FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)1504{1505return !_mm_comieq_ss(a, b);1506}15071508// Convert packed signed 32-bit integers in b to packed single-precision1509// (32-bit) floating-point elements, store the results in the lower 2 elements1510// of dst, and copy the upper 2 packed elements from a to the upper elements of1511// dst.1512//1513// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1514// dst[63:32] := Convert_Int32_To_FP32(b[63:32])1515// dst[95:64] := a[95:64]1516// dst[127:96] := a[127:96]1517//1518// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps1519FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)1520{1521return vreinterpretq_m128_f32(1522vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),1523vget_high_f32(vreinterpretq_f32_m128(a))));1524}15251526// Convert packed single-precision (32-bit) floating-point elements in a to1527// packed 32-bit integers, and store the results in dst.1528//1529// FOR j := 0 to 11530// i := 32*j1531// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])1532// ENDFOR1533//1534// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi1535FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)1536{1537#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)1538return vreinterpret_m64_s32(1539vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));1540#else1541return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(1542vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));1543#endif1544}15451546// Convert the signed 32-bit integer b to a single-precision (32-bit)1547// floating-point element, store the result in the lower element of dst, and1548// copy the upper 3 packed elements from a to the upper elements of dst.1549//1550// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1551// dst[127:32] := a[127:32]1552//1553// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss1554FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)1555{1556return vreinterpretq_m128_f32(1557vsetq_lane_f32((float) b, 
vreinterpretq_f32_m128(a), 0));1558}15591560// Convert the lower single-precision (32-bit) floating-point element in a to a1561// 32-bit integer, and store the result in dst.1562// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si1563FORCE_INLINE int _mm_cvt_ss2si(__m128 a)1564{1565#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)1566return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),15670);1568#else1569float32_t data = vgetq_lane_f32(1570vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);1571return (int32_t) data;1572#endif1573}15741575// Convert packed 16-bit integers in a to packed single-precision (32-bit)1576// floating-point elements, and store the results in dst.1577//1578// FOR j := 0 to 31579// i := j*161580// m := j*321581// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])1582// ENDFOR1583//1584// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps1585FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)1586{1587return vreinterpretq_m128_f32(1588vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));1589}15901591// Convert packed 32-bit integers in b to packed single-precision (32-bit)1592// floating-point elements, store the results in the lower 2 elements of dst,1593// and copy the upper 2 packed elements from a to the upper elements of dst.1594//1595// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1596// dst[63:32] := Convert_Int32_To_FP32(b[63:32])1597// dst[95:64] := a[95:64]1598// dst[127:96] := a[127:96]1599//1600// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps1601FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)1602{1603return vreinterpretq_m128_f32(1604vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),1605vget_high_f32(vreinterpretq_f32_m128(a))));1606}16071608// Convert packed signed 32-bit integers in a to packed single-precision1609// (32-bit) floating-point elements, store the results in the lower 2 elements1610// of dst, then convert the packed signed 32-bit integers in b to1611// single-precision (32-bit) floating-point element, and store the results in1612// the upper 2 elements of dst.1613//1614// dst[31:0] := Convert_Int32_To_FP32(a[31:0])1615// dst[63:32] := Convert_Int32_To_FP32(a[63:32])1616// dst[95:64] := Convert_Int32_To_FP32(b[31:0])1617// dst[127:96] := Convert_Int32_To_FP32(b[63:32])1618//1619// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps1620FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)1621{1622return vreinterpretq_m128_f32(vcvtq_f32_s32(1623vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));1624}16251626// Convert the lower packed 8-bit integers in a to packed single-precision1627// (32-bit) floating-point elements, and store the results in dst.1628//1629// FOR j := 0 to 31630// i := j*81631// m := j*321632// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])1633// ENDFOR1634//1635// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps1636FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)1637{1638return vreinterpretq_m128_f32(vcvtq_f32_s32(1639vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));1640}16411642// Convert packed single-precision (32-bit) floating-point elements in a to1643// packed 16-bit integers, and store the results in dst. 
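// Usage note for the scalar conversions above: _mm_cvt_ss2si honours the
// current rounding mode, so with the default round-to-nearest-even setting
// 2.5f converts to 2 and 3.5f converts to 4. A minimal sketch (illustrative
// addition, not part of the original sse2neon API; the helper name is
// hypothetical):
static inline int _sse2neon_example_float_to_int(float x)
{
    // Broadcast x with plain NEON, then convert the low lane.
    __m128 v = vreinterpretq_m128_f32(vdupq_n_f32(x));
    return _mm_cvt_ss2si(v);
}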
Note: this intrinsic1644// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and1645// 0x7FFFFFFF.1646//1647// FOR j := 0 to 31648// i := 16*j1649// k := 32*j1650// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)1651// dst[i+15:i] := 0x7FFF1652// ELSE1653// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])1654// FI1655// ENDFOR1656//1657// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi161658FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)1659{1660return vreinterpret_m64_s16(1661vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));1662}16631664// Convert packed single-precision (32-bit) floating-point elements in a to1665// packed 32-bit integers, and store the results in dst.1666//1667// FOR j := 0 to 11668// i := 32*j1669// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])1670// ENDFOR1671//1672// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi321673#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)16741675// Convert packed single-precision (32-bit) floating-point elements in a to1676// packed 8-bit integers, and store the results in lower 4 elements of dst.1677// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values1678// between 0x7F and 0x7FFFFFFF.1679//1680// FOR j := 0 to 31681// i := 8*j1682// k := 32*j1683// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)1684// dst[i+7:i] := 0x7F1685// ELSE1686// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])1687// FI1688// ENDFOR1689//1690// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi81691FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)1692{1693return vreinterpret_m64_s8(vqmovn_s16(1694vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));1695}16961697// Convert packed unsigned 16-bit integers in a to packed single-precision1698// (32-bit) floating-point elements, and store the results in dst.1699//1700// FOR j := 0 to 31701// i := j*161702// m := j*321703// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])1704// ENDFOR1705//1706// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps1707FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)1708{1709return vreinterpretq_m128_f32(1710vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));1711}17121713// Convert the lower packed unsigned 8-bit integers in a to packed1714// single-precision (32-bit) floating-point elements, and store the results in1715// dst.1716//1717// FOR j := 0 to 31718// i := j*81719// m := j*321720// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])1721// ENDFOR1722//1723// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps1724FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)1725{1726return vreinterpretq_m128_f32(vcvtq_f32_u32(1727vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));1728}17291730// Convert the signed 32-bit integer b to a single-precision (32-bit)1731// floating-point element, store the result in the lower element of dst, and1732// copy the upper 3 packed elements from a to the upper elements of dst.1733//1734// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1735// dst[127:32] := a[127:32]1736//1737// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss1738#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)17391740// Convert the signed 64-bit integer b to a single-precision (32-bit)1741// floating-point element, store the result in the lower element of dst, and1742// 
copy the upper 3 packed elements from a to the upper elements of dst.1743//1744// dst[31:0] := Convert_Int64_To_FP32(b[63:0])1745// dst[127:32] := a[127:32]1746//1747// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss1748FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)1749{1750return vreinterpretq_m128_f32(1751vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));1752}17531754// Copy the lower single-precision (32-bit) floating-point element of a to dst.1755//1756// dst[31:0] := a[31:0]1757//1758// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f321759FORCE_INLINE float _mm_cvtss_f32(__m128 a)1760{1761return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);1762}17631764// Convert the lower single-precision (32-bit) floating-point element in a to a1765// 32-bit integer, and store the result in dst.1766//1767// dst[31:0] := Convert_FP32_To_Int32(a[31:0])1768//1769// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si321770#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)17711772// Convert the lower single-precision (32-bit) floating-point element in a to a1773// 64-bit integer, and store the result in dst.1774//1775// dst[63:0] := Convert_FP32_To_Int64(a[31:0])1776//1777// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si641778FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)1779{1780#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)1781return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);1782#else1783float32_t data = vgetq_lane_f32(1784vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);1785return (int64_t) data;1786#endif1787}17881789// Convert packed single-precision (32-bit) floating-point elements in a to1790// packed 32-bit integers with truncation, and store the results in dst.1791//1792// FOR j := 0 to 11793// i := 32*j1794// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])1795// ENDFOR1796//1797// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi1798FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)1799{1800return vreinterpret_m64_s32(1801vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));1802}18031804// Convert the lower single-precision (32-bit) floating-point element in a to a1805// 32-bit integer with truncation, and store the result in dst.1806//1807// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])1808//1809// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si1810FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)1811{1812return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);1813}18141815// Convert packed single-precision (32-bit) floating-point elements in a to1816// packed 32-bit integers with truncation, and store the results in dst.1817//1818// FOR j := 0 to 11819// i := 32*j1820// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])1821// ENDFOR1822//1823// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi321824#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)18251826// Convert the lower single-precision (32-bit) floating-point element in a to a1827// 32-bit integer with truncation, and store the result in dst.1828//1829// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])1830//1831// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si321832#define _mm_cvttss_si32(a) 
_mm_cvtt_ss2si(a)18331834// Convert the lower single-precision (32-bit) floating-point element in a to a1835// 64-bit integer with truncation, and store the result in dst.1836//1837// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])1838//1839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si641840FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)1841{1842return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);1843}18441845// Divides the four single-precision, floating-point values of a and b.1846//1847// r0 := a0 / b01848// r1 := a1 / b11849// r2 := a2 / b21850// r3 := a3 / b31851//1852// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx1853FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)1854{1855#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV1856return vreinterpretq_m128_f32(1857vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));1858#else1859float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));1860recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));1861#if SSE2NEON_PRECISE_DIV1862// Additional Netwon-Raphson iteration for accuracy1863recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));1864#endif1865return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));1866#endif1867}18681869// Divides the scalar single-precision floating point value of a by b.1870// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx1871FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)1872{1873float32_t value =1874vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);1875return vreinterpretq_m128_f32(1876vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));1877}18781879// Extract a 16-bit integer from a, selected with imm8, and store the result in1880// the lower element of dst.1881// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi161882#define _mm_extract_pi16(a, imm) \1883(int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))18841885// Free aligned memory that was allocated with _mm_malloc.1886// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free1887#if !defined(SSE2NEON_ALLOC_DEFINED)1888FORCE_INLINE void _mm_free(void *addr)1889{1890#if defined(_WIN32)1891_aligned_free(addr);1892#else1893free(addr);1894#endif1895}1896#endif18971898// Macro: Get the flush zero bits from the MXCSR control and status register.1899// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or1900// _MM_FLUSH_ZERO_OFF1901// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE1902FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()1903{1904union {1905fpcr_bitfield field;1906#if defined(__aarch64__)1907uint64_t value;1908#else1909uint32_t value;1910#endif1911} r;19121913#if defined(__aarch64__)1914__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */1915#else1916__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */1917#endif19181919return r.field.bit24 ? 
_MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;1920}19211922// Macro: Get the rounding mode bits from the MXCSR control and status register.1923// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,1924// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO1925// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE1926FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()1927{1928union {1929fpcr_bitfield field;1930#if defined(__aarch64__)1931uint64_t value;1932#else1933uint32_t value;1934#endif1935} r;19361937#if defined(__aarch64__)1938__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */1939#else1940__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */1941#endif19421943if (r.field.bit22) {1944return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;1945} else {1946return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;1947}1948}19491950// Copy a to dst, and insert the 16-bit integer i into dst at the location1951// specified by imm8.1952// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi161953#define _mm_insert_pi16(a, b, imm) \1954__extension__({ \1955vreinterpret_m64_s16( \1956vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \1957})19581959// Loads four single-precision, floating-point values.1960// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx1961FORCE_INLINE __m128 _mm_load_ps(const float *p)1962{1963return vreinterpretq_m128_f32(vld1q_f32(p));1964}19651966// Load a single-precision (32-bit) floating-point element from memory into all1967// elements of dst.1968//1969// dst[31:0] := MEM[mem_addr+31:mem_addr]1970// dst[63:32] := MEM[mem_addr+31:mem_addr]1971// dst[95:64] := MEM[mem_addr+31:mem_addr]1972// dst[127:96] := MEM[mem_addr+31:mem_addr]1973//1974// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps11975#define _mm_load_ps1 _mm_load1_ps19761977// Loads an single - precision, floating - point value into the low word and1978// clears the upper three words.1979// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx1980FORCE_INLINE __m128 _mm_load_ss(const float *p)1981{1982return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));1983}19841985// Loads a single single-precision, floating-point value, copying it into all1986// four words1987// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx1988FORCE_INLINE __m128 _mm_load1_ps(const float *p)1989{1990return vreinterpretq_m128_f32(vld1q_dup_f32(p));1991}19921993// Sets the upper two single-precision, floating-point values with 641994// bits of data loaded from the address p; the lower two values are passed1995// through from a.1996//1997// r0 := a01998// r1 := a11999// r2 := *p02000// r3 := *p12001//2002// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx2003FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)2004{2005return vreinterpretq_m128_f32(2006vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));2007}20082009// Sets the lower two single-precision, floating-point values with 642010// bits of data loaded from the address p; the upper two values are passed2011// through from a.2012//2013// Return Value2014// r0 := *p02015// r1 := *p12016// r2 := a22017// r3 := a32018//2019// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx2020FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)2021{2022return 
vreinterpretq_m128_f32(2023vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));2024}20252026// Load 4 single-precision (32-bit) floating-point elements from memory into dst2027// in reverse order. mem_addr must be aligned on a 16-byte boundary or a2028// general-protection exception may be generated.2029//2030// dst[31:0] := MEM[mem_addr+127:mem_addr+96]2031// dst[63:32] := MEM[mem_addr+95:mem_addr+64]2032// dst[95:64] := MEM[mem_addr+63:mem_addr+32]2033// dst[127:96] := MEM[mem_addr+31:mem_addr]2034//2035// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps2036FORCE_INLINE __m128 _mm_loadr_ps(const float *p)2037{2038float32x4_t v = vrev64q_f32(vld1q_f32(p));2039return vreinterpretq_m128_f32(vextq_f32(v, v, 2));2040}20412042// Loads four single-precision, floating-point values.2043// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx2044FORCE_INLINE __m128 _mm_loadu_ps(const float *p)2045{2046// for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are2047// equivalent for neon2048return vreinterpretq_m128_f32(vld1q_f32(p));2049}20502051// Load unaligned 16-bit integer from memory into the first element of dst.2052//2053// dst[15:0] := MEM[mem_addr+15:mem_addr]2054// dst[MAX:16] := 02055//2056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si162057FORCE_INLINE __m128i _mm_loadu_si16(const void *p)2058{2059return vreinterpretq_m128i_s16(2060vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));2061}20622063// Load unaligned 64-bit integer from memory into the first element of dst.2064//2065// dst[63:0] := MEM[mem_addr+63:mem_addr]2066// dst[MAX:64] := 02067//2068// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si642069FORCE_INLINE __m128i _mm_loadu_si64(const void *p)2070{2071return vreinterpretq_m128i_s64(2072vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));2073}20742075// Allocate aligned blocks of memory.2076// https://software.intel.com/en-us/2077// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks2078#if !defined(SSE2NEON_ALLOC_DEFINED)2079FORCE_INLINE void *_mm_malloc(size_t size, size_t align)2080{2081void *ptr;2082if (align == 1)2083return malloc(size);2084if (align == 2 || (sizeof(void *) == 8 && align == 4))2085align = sizeof(void *);2086#if defined(_WIN32)2087ptr = _aligned_malloc(size, align);2088if (ptr)2089return ptr;2090#else2091if (!posix_memalign(&ptr, align, size))2092return ptr;2093#endif2094return NULL;2095}2096#endif20972098// Conditionally store 8-bit integer elements from a into memory using mask2099// (elements are not stored when the highest bit is not set in the corresponding2100// element) and a non-temporal memory hint.2101// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si642102FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)2103{2104int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);2105__m128 b = _mm_load_ps((const float *) mem_addr);2106int8x8_t masked =2107vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),2108vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));2109vst1_s8((int8_t *) mem_addr, masked);2110}21112112// Conditionally store 8-bit integer elements from a into memory using mask2113// (elements are not stored when the highest bit is not set in the corresponding2114// element) and a non-temporal memory hint.2115// 
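// Typical pairing of _mm_malloc/_mm_free defined above for 16-byte-aligned
// vector data, as an illustrative sketch (not part of the original sse2neon
// API; the helper name is hypothetical):
static inline float *_sse2neon_example_alloc_vec4(void)
{
    float *p = (float *) _mm_malloc(4 * sizeof(float), 16); // 16-byte aligned
    if (p)
        p[0] = p[1] = p[2] = p[3] = 0.0f;
    return p; // release with _mm_free(p) when done
}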
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq2116#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)21172118// Compare packed signed 16-bit integers in a and b, and store packed maximum2119// values in dst.2120//2121// FOR j := 0 to 32122// i := j*162123// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])2124// ENDFOR2125//2126// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi162127FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)2128{2129return vreinterpret_m64_s16(2130vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));2131}21322133// Computes the maximums of the four single-precision, floating-point values of2134// a and b.2135// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx2136FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)2137{2138#if SSE2NEON_PRECISE_MINMAX2139float32x4_t _a = vreinterpretq_f32_m128(a);2140float32x4_t _b = vreinterpretq_f32_m128(b);2141return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));2142#else2143return vreinterpretq_m128_f32(2144vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2145#endif2146}21472148// Compare packed unsigned 8-bit integers in a and b, and store packed maximum2149// values in dst.2150//2151// FOR j := 0 to 72152// i := j*82153// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])2154// ENDFOR2155//2156// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu82157FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)2158{2159return vreinterpret_m64_u8(2160vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));2161}21622163// Computes the maximum of the two lower scalar single-precision floating point2164// values of a and b.2165// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx2166FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)2167{2168float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);2169return vreinterpretq_m128_f32(2170vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));2171}21722173// Compare packed signed 16-bit integers in a and b, and store packed minimum2174// values in dst.2175//2176// FOR j := 0 to 32177// i := j*162178// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])2179// ENDFOR2180//2181// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi162182FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)2183{2184return vreinterpret_m64_s16(2185vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));2186}21872188// Computes the minima of the four single-precision, floating-point values of a2189// and b.2190// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx2191FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)2192{2193#if SSE2NEON_PRECISE_MINMAX2194float32x4_t _a = vreinterpretq_f32_m128(a);2195float32x4_t _b = vreinterpretq_f32_m128(b);2196return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));2197#else2198return vreinterpretq_m128_f32(2199vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2200#endif2201}22022203// Compare packed unsigned 8-bit integers in a and b, and store packed minimum2204// values in dst.2205//2206// FOR j := 0 to 72207// i := j*82208// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])2209// ENDFOR2210//2211// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu82212FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)2213{2214return vreinterpret_m64_u8(2215vmin_u8(vreinterpret_u8_m64(a), 
vreinterpret_u8_m64(b)));2216}22172218// Computes the minimum of the two lower scalar single-precision floating point2219// values of a and b.2220// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx2221FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)2222{2223float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);2224return vreinterpretq_m128_f32(2225vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));2226}22272228// Sets the low word to the single-precision, floating-point value of b2229// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)2230FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)2231{2232return vreinterpretq_m128_f32(2233vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),2234vreinterpretq_f32_m128(a), 0));2235}22362237// Moves the upper two values of B into the lower two values of A.2238//2239// r3 := a32240// r2 := a22241// r1 := b32242// r0 := b22243FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)2244{2245float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));2246float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));2247return vreinterpretq_m128_f32(vcombine_f32(b32, a32));2248}22492250// Moves the lower two values of B into the upper two values of A.2251//2252// r3 := b12253// r2 := b02254// r1 := a12255// r0 := a02256FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)2257{2258float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));2259float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));2260return vreinterpretq_m128_f32(vcombine_f32(a10, b10));2261}22622263// Create mask from the most significant bit of each 8-bit element in a, and2264// store the result in dst.2265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi82266FORCE_INLINE int _mm_movemask_pi8(__m64 a)2267{2268uint8x8_t input = vreinterpret_u8_m64(a);2269#if defined(__aarch64__)2270static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};2271uint8x8_t tmp = vshr_n_u8(input, 7);2272return vaddv_u8(vshl_u8(tmp, shift));2273#else2274// Refer the implementation of `_mm_movemask_epi8`2275uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));2276uint32x2_t paired16 =2277vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));2278uint8x8_t paired32 =2279vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));2280return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);2281#endif2282}22832284// NEON does not provide this method2285// Creates a 4-bit mask from the most significant bits of the four2286// single-precision, floating-point values.2287// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx2288FORCE_INLINE int _mm_movemask_ps(__m128 a)2289{2290uint32x4_t input = vreinterpretq_u32_m128(a);2291#if defined(__aarch64__)2292static const int32x4_t shift = {0, 1, 2, 3};2293uint32x4_t tmp = vshrq_n_u32(input, 31);2294return vaddvq_u32(vshlq_u32(tmp, shift));2295#else2296// Uses the exact same method as _mm_movemask_epi8, see that for details.2297// Shift out everything but the sign bits with a 32-bit unsigned shift2298// right.2299uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));2300// Merge the two pairs together with a 64-bit unsigned shift right + add.2301uint8x16_t paired =2302vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));2303// Extract the result.2304return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);2305#endif2306}23072308// Multiplies the four 
single-precision, floating-point values of a and b.2309//2310// r0 := a0 * b02311// r1 := a1 * b12312// r2 := a2 * b22313// r3 := a3 * b32314//2315// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx2316FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)2317{2318return vreinterpretq_m128_f32(2319vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2320}23212322// Multiply the lower single-precision (32-bit) floating-point element in a and2323// b, store the result in the lower element of dst, and copy the upper 3 packed2324// elements from a to the upper elements of dst.2325//2326// dst[31:0] := a[31:0] * b[31:0]2327// dst[127:32] := a[127:32]2328//2329// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss2330FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)2331{2332return _mm_move_ss(a, _mm_mul_ps(a, b));2333}23342335// Multiply the packed unsigned 16-bit integers in a and b, producing2336// intermediate 32-bit integers, and store the high 16 bits of the intermediate2337// integers in dst.2338// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu162339FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)2340{2341return vreinterpret_m64_u16(vshrn_n_u32(2342vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));2343}23442345// Computes the bitwise OR of the four single-precision, floating-point values2346// of a and b.2347// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx2348FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)2349{2350return vreinterpretq_m128_s32(2351vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));2352}23532354// Average packed unsigned 8-bit integers in a and b, and store the results in2355// dst.2356//2357// FOR j := 0 to 72358// i := j*82359// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 12360// ENDFOR2361//2362// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb2363#define _m_pavgb(a, b) _mm_avg_pu8(a, b)23642365// Average packed unsigned 16-bit integers in a and b, and store the results in2366// dst.2367//2368// FOR j := 0 to 32369// i := j*162370// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 12371// ENDFOR2372//2373// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw2374#define _m_pavgw(a, b) _mm_avg_pu16(a, b)23752376// Extract a 16-bit integer from a, selected with imm8, and store the result in2377// the lower element of dst.2378// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw2379#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)23802381// Copy a to dst, and insert the 16-bit integer i into dst at the location2382// specified by imm8.2383// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw2384#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)23852386// Compare packed signed 16-bit integers in a and b, and store packed maximum2387// values in dst.2388// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw2389#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)23902391// Compare packed unsigned 8-bit integers in a and b, and store packed maximum2392// values in dst.2393// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub2394#define _m_pmaxub(a, b) _mm_max_pu8(a, b)23952396// Compare packed signed 16-bit integers in a and b, and store packed minimum2397// values in dst.2398// 
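// The 4-bit result of _mm_movemask_ps above gathers the IEEE-754 sign bits,
// lane 0 into bit 0 through lane 3 into bit 3. A scalar equivalent as an
// illustrative sketch (not part of the original sse2neon API; the helper name
// is hypothetical):
static inline int _sse2neon_example_movemask_ps_scalar(const float f[4])
{
    int mask = 0;
    for (int i = 0; i < 4; i++) {
        union { float f32; uint32_t u32; } v;
        v.f32 = f[i];
        mask |= (int) (v.u32 >> 31) << i; // sign bit of lane i -> bit i
    }
    return mask;
}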
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Fetch the line of data from memory that contains address p to a location in
// the cache hierarchy specified by the locality hint i.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
FORCE_INLINE void _mm_prefetch(char const *p, int i)
{
    switch (i) {
    case _MM_HINT_NTA:
        __builtin_prefetch(p, 0, 0);
        break;
    case _MM_HINT_T0:
        __builtin_prefetch(p, 0, 3);
        break;
    case _MM_HINT_T1:
        __builtin_prefetch(p, 0, 2);
        break;
    case _MM_HINT_T2:
        __builtin_prefetch(p, 0, 1);
        break;
    }
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)

// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#endif
    return vreinterpretq_m128_f32(recip);
}

// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
//
// dst[31:0] := (1.0 / a[31:0])
// dst[127:32] := a[127:32]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
    return _mm_move_ss(a, _mm_rcp_ps(a));
}

// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// The current precision is 1% error.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
#if SSE2NEON_PRECISE_SQRT
    // Additional Newton-Raphson iteration for accuracy
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif
    return vreinterpretq_m128_f32(out);
}

// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
    uint64x1_t t = vpaddl_u32(vpaddl_u16(
        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
    return vreinterpret_m64_u16(
        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}

// Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. 
The flush zero may contain any of the2521// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF2522// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE2523FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)2524{2525// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,2526// regardless of the value of the FZ bit.2527union {2528fpcr_bitfield field;2529#if defined(__aarch64__)2530uint64_t value;2531#else2532uint32_t value;2533#endif2534} r;25352536#if defined(__aarch64__)2537__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */2538#else2539__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */2540#endif25412542r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;25432544#if defined(__aarch64__)2545__asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */2546#else2547__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */2548#endif2549}25502551// Sets the four single-precision, floating-point values to the four inputs.2552// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx2553FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)2554{2555float ALIGN_STRUCT(16) data[4] = {x, y, z, w};2556return vreinterpretq_m128_f32(vld1q_f32(data));2557}25582559// Sets the four single-precision, floating-point values to w.2560// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx2561FORCE_INLINE __m128 _mm_set_ps1(float _w)2562{2563return vreinterpretq_m128_f32(vdupq_n_f32(_w));2564}25652566// Macro: Set the rounding mode bits of the MXCSR control and status register to2567// the value in unsigned 32-bit integer a. The rounding mode may contain any of2568// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,2569// _MM_ROUND_TOWARD_ZERO2570// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE2571FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)2572{2573union {2574fpcr_bitfield field;2575#if defined(__aarch64__)2576uint64_t value;2577#else2578uint32_t value;2579#endif2580} r;25812582#if defined(__aarch64__)2583__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */2584#else2585__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */2586#endif25872588switch (rounding) {2589case _MM_ROUND_TOWARD_ZERO:2590r.field.bit22 = 1;2591r.field.bit23 = 1;2592break;2593case _MM_ROUND_DOWN:2594r.field.bit22 = 0;2595r.field.bit23 = 1;2596break;2597case _MM_ROUND_UP:2598r.field.bit22 = 1;2599r.field.bit23 = 0;2600break;2601default: //_MM_ROUND_NEAREST2602r.field.bit22 = 0;2603r.field.bit23 = 0;2604}26052606#if defined(__aarch64__)2607__asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */2608#else2609__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */2610#endif2611}26122613// Copy single-precision (32-bit) floating-point element a to the lower element2614// of dst, and zero the upper 3 elements.2615// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss2616FORCE_INLINE __m128 _mm_set_ss(float a)2617{2618return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));2619}26202621// Sets the four single-precision, floating-point values to w.2622//2623// r0 := r1 := r2 := r3 := w2624//2625// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx2626FORCE_INLINE __m128 _mm_set1_ps(float _w)2627{2628return vreinterpretq_m128_f32(vdupq_n_f32(_w));2629}26302631// FIXME: _mm_setcsr() 
implementation supports changing the rounding mode only.2632FORCE_INLINE void _mm_setcsr(unsigned int a)2633{2634_MM_SET_ROUNDING_MODE(a);2635}26362637// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.2638FORCE_INLINE unsigned int _mm_getcsr()2639{2640return _MM_GET_ROUNDING_MODE();2641}26422643// Sets the four single-precision, floating-point values to the four inputs in2644// reverse order.2645// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx2646FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)2647{2648float ALIGN_STRUCT(16) data[4] = {w, z, y, x};2649return vreinterpretq_m128_f32(vld1q_f32(data));2650}26512652// Clears the four single-precision, floating-point values.2653// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx2654FORCE_INLINE __m128 _mm_setzero_ps(void)2655{2656return vreinterpretq_m128_f32(vdupq_n_f32(0));2657}26582659// Shuffle 16-bit integers in a using the control in imm8, and store the results2660// in dst.2661// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi162662#ifdef _sse2neon_shuffle2663#define _mm_shuffle_pi16(a, imm) \2664__extension__({ \2665vreinterpret_m64_s16(vshuffle_s16( \2666vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \2667((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \2668})2669#else2670#define _mm_shuffle_pi16(a, imm) \2671__extension__({ \2672int16x4_t ret; \2673ret = \2674vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \2675ret = vset_lane_s16( \2676vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \26771); \2678ret = vset_lane_s16( \2679vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \26802); \2681ret = vset_lane_s16( \2682vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \26833); \2684vreinterpret_m64_s16(ret); \2685})2686#endif26872688// Perform a serializing operation on all store-to-memory instructions that were2689// issued prior to this instruction. Guarantees that every store instruction2690// that precedes, in program order, is globally visible before any store2691// instruction which follows the fence in program order.2692// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence2693FORCE_INLINE void _mm_sfence(void)2694{2695_sse2neon_smp_mb();2696}26972698// Perform a serializing operation on all load-from-memory and store-to-memory2699// instructions that were issued prior to this instruction. Guarantees that2700// every memory access that precedes, in program order, the memory fence2701// instruction is globally visible before any memory instruction which follows2702// the fence in program order.2703// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence2704FORCE_INLINE void _mm_mfence(void)2705{2706_sse2neon_smp_mb();2707}27082709// Perform a serializing operation on all load-from-memory instructions that2710// were issued prior to this instruction. 
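// Typical pattern for the emulated MXCSR helpers above: temporarily override
// the rounding mode, use it, then restore the caller's mode. An illustrative
// sketch (not part of the original sse2neon API; the helper name is
// hypothetical):
static inline int _sse2neon_example_cvt_toward_zero(__m128 v)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    int r = _mm_cvt_ss2si(v); // now truncates, like _mm_cvtt_ss2si
    _MM_SET_ROUNDING_MODE((int) saved); // restore the caller's mode
    return r;
}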
Guarantees that every load instruction2711// that precedes, in program order, is globally visible before any load2712// instruction which follows the fence in program order.2713// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence2714FORCE_INLINE void _mm_lfence(void)2715{2716_sse2neon_smp_mb();2717}27182719// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)2720// int imm)2721#ifdef _sse2neon_shuffle2722#define _mm_shuffle_ps(a, b, imm) \2723__extension__({ \2724float32x4_t _input1 = vreinterpretq_f32_m128(a); \2725float32x4_t _input2 = vreinterpretq_f32_m128(b); \2726float32x4_t _shuf = \2727vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \2728(((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \2729vreinterpretq_m128_f32(_shuf); \2730})2731#else // generic2732#define _mm_shuffle_ps(a, b, imm) \2733__extension__({ \2734__m128 ret; \2735switch (imm) { \2736case _MM_SHUFFLE(1, 0, 3, 2): \2737ret = _mm_shuffle_ps_1032((a), (b)); \2738break; \2739case _MM_SHUFFLE(2, 3, 0, 1): \2740ret = _mm_shuffle_ps_2301((a), (b)); \2741break; \2742case _MM_SHUFFLE(0, 3, 2, 1): \2743ret = _mm_shuffle_ps_0321((a), (b)); \2744break; \2745case _MM_SHUFFLE(2, 1, 0, 3): \2746ret = _mm_shuffle_ps_2103((a), (b)); \2747break; \2748case _MM_SHUFFLE(1, 0, 1, 0): \2749ret = _mm_movelh_ps((a), (b)); \2750break; \2751case _MM_SHUFFLE(1, 0, 0, 1): \2752ret = _mm_shuffle_ps_1001((a), (b)); \2753break; \2754case _MM_SHUFFLE(0, 1, 0, 1): \2755ret = _mm_shuffle_ps_0101((a), (b)); \2756break; \2757case _MM_SHUFFLE(3, 2, 1, 0): \2758ret = _mm_shuffle_ps_3210((a), (b)); \2759break; \2760case _MM_SHUFFLE(0, 0, 1, 1): \2761ret = _mm_shuffle_ps_0011((a), (b)); \2762break; \2763case _MM_SHUFFLE(0, 0, 2, 2): \2764ret = _mm_shuffle_ps_0022((a), (b)); \2765break; \2766case _MM_SHUFFLE(2, 2, 0, 0): \2767ret = _mm_shuffle_ps_2200((a), (b)); \2768break; \2769case _MM_SHUFFLE(3, 2, 0, 2): \2770ret = _mm_shuffle_ps_3202((a), (b)); \2771break; \2772case _MM_SHUFFLE(3, 2, 3, 2): \2773ret = _mm_movehl_ps((b), (a)); \2774break; \2775case _MM_SHUFFLE(1, 1, 3, 3): \2776ret = _mm_shuffle_ps_1133((a), (b)); \2777break; \2778case _MM_SHUFFLE(2, 0, 1, 0): \2779ret = _mm_shuffle_ps_2010((a), (b)); \2780break; \2781case _MM_SHUFFLE(2, 0, 0, 1): \2782ret = _mm_shuffle_ps_2001((a), (b)); \2783break; \2784case _MM_SHUFFLE(2, 0, 3, 2): \2785ret = _mm_shuffle_ps_2032((a), (b)); \2786break; \2787default: \2788ret = _mm_shuffle_ps_default((a), (b), (imm)); \2789break; \2790} \2791ret; \2792})2793#endif27942795// Computes the approximations of square roots of the four single-precision,2796// floating-point values of a. 
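// The _MM_SHUFFLE(fp3, fp2, fp1, fp0) immediate used with _mm_shuffle_ps above
// works as follows: fp0 and fp1 select result lanes 0-1 from a, while fp2 and
// fp3 select result lanes 2-3 from b. A minimal sketch (illustrative addition,
// not part of the original sse2neon API; the helper name is hypothetical):
static inline __m128 _sse2neon_example_broadcast_lane0(__m128 a)
{
    // Every selector is 0, so all four result lanes hold a[0].
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
}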
First computes reciprocal square roots and then2797// reciprocals of the four values.2798//2799// r0 := sqrt(a0)2800// r1 := sqrt(a1)2801// r2 := sqrt(a2)2802// r3 := sqrt(a3)2803//2804// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx2805FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)2806{2807#if SSE2NEON_PRECISE_SQRT2808float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));28092810// Test for vrsqrteq_f32(0) -> positive infinity case.2811// Change to zero, so that s * 1/sqrt(s) result is zero too.2812const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);2813const uint32x4_t div_by_zero =2814vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));2815recip = vreinterpretq_f32_u32(2816vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));28172818// Additional Netwon-Raphson iteration for accuracy2819recip = vmulq_f32(2820vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),2821recip);2822recip = vmulq_f32(2823vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),2824recip);28252826// sqrt(s) = s * 1/sqrt(s)2827return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));2828#elif defined(__aarch64__)2829return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));2830#else2831float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));2832float32x4_t sq = vrecpeq_f32(recipsq);2833return vreinterpretq_m128_f32(sq);2834#endif2835}28362837// Computes the approximation of the square root of the scalar single-precision2838// floating point value of in.2839// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx2840FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)2841{2842float32_t value =2843vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);2844return vreinterpretq_m128_f32(2845vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));2846}28472848// Stores four single-precision, floating-point values.2849// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx2850FORCE_INLINE void _mm_store_ps(float *p, __m128 a)2851{2852vst1q_f32(p, vreinterpretq_f32_m128(a));2853}28542855// Store the lower single-precision (32-bit) floating-point element from a into2856// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte2857// boundary or a general-protection exception may be generated.2858//2859// MEM[mem_addr+31:mem_addr] := a[31:0]2860// MEM[mem_addr+63:mem_addr+32] := a[31:0]2861// MEM[mem_addr+95:mem_addr+64] := a[31:0]2862// MEM[mem_addr+127:mem_addr+96] := a[31:0]2863//2864// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps12865FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)2866{2867float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);2868vst1q_f32(p, vdupq_n_f32(a0));2869}28702871// Stores the lower single - precision, floating - point value.2872// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx2873FORCE_INLINE void _mm_store_ss(float *p, __m128 a)2874{2875vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);2876}28772878// Store the lower single-precision (32-bit) floating-point element from a into2879// 4 contiguous elements in memory. 
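// Example round trip through the aligned store above: spill a vector to a
// 16-byte-aligned scratch array and reduce it in scalar code. On NEON the
// alignment requirement is relaxed, but honouring the x86 contract keeps the
// code portable back to SSE. An illustrative sketch (not part of the original
// sse2neon API; the helper name is hypothetical):
static inline float _sse2neon_example_hsum_ps(__m128 v)
{
    float ALIGN_STRUCT(16) tmp[4];
    _mm_store_ps(tmp, v);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}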
mem_addr must be aligned on a 16-byte2880// boundary or a general-protection exception may be generated.2881//2882// MEM[mem_addr+31:mem_addr] := a[31:0]2883// MEM[mem_addr+63:mem_addr+32] := a[31:0]2884// MEM[mem_addr+95:mem_addr+64] := a[31:0]2885// MEM[mem_addr+127:mem_addr+96] := a[31:0]2886//2887// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps2888#define _mm_store1_ps _mm_store_ps128892890// Stores the upper two single-precision, floating-point values of a to the2891// address p.2892//2893// *p0 := a22894// *p1 := a32895//2896// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx2897FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)2898{2899*p = vreinterpret_m64_f32(vget_high_f32(a));2900}29012902// Stores the lower two single-precision floating point values of a to the2903// address p.2904//2905// *p0 := a02906// *p1 := a12907//2908// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx2909FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)2910{2911*p = vreinterpret_m64_f32(vget_low_f32(a));2912}29132914// Store 4 single-precision (32-bit) floating-point elements from a into memory2915// in reverse order. mem_addr must be aligned on a 16-byte boundary or a2916// general-protection exception may be generated.2917//2918// MEM[mem_addr+31:mem_addr] := a[127:96]2919// MEM[mem_addr+63:mem_addr+32] := a[95:64]2920// MEM[mem_addr+95:mem_addr+64] := a[63:32]2921// MEM[mem_addr+127:mem_addr+96] := a[31:0]2922//2923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps2924FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)2925{2926float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));2927float32x4_t rev = vextq_f32(tmp, tmp, 2);2928vst1q_f32(p, rev);2929}29302931// Stores four single-precision, floating-point values.2932// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx2933FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)2934{2935vst1q_f32(p, vreinterpretq_f32_m128(a));2936}29372938// Stores 16-bits of integer data a at the address p.2939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si162940FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)2941{2942vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);2943}29442945// Stores 64-bits of integer data a at the address p.2946// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si642947FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)2948{2949vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);2950}29512952// Store 64-bits of integer data from a into memory using a non-temporal memory2953// hint.2954// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi2955FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)2956{2957vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));2958}29592960// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-2961// point elements) from a into memory using a non-temporal memory hint.2962// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps2963FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)2964{2965#if __has_builtin(__builtin_nontemporal_store)2966__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);2967#else2968vst1q_f32(p, vreinterpretq_f32_m128(a));2969#endif2970}29712972// Subtracts the four single-precision, floating-point values of a and b.2973//2974// r0 
:= a0 - b02975// r1 := a1 - b12976// r2 := a2 - b22977// r3 := a3 - b32978//2979// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx2980FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)2981{2982return vreinterpretq_m128_f32(2983vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2984}29852986// Subtract the lower single-precision (32-bit) floating-point element in b from2987// the lower single-precision (32-bit) floating-point element in a, store the2988// result in the lower element of dst, and copy the upper 3 packed elements from2989// a to the upper elements of dst.2990//2991// dst[31:0] := a[31:0] - b[31:0]2992// dst[127:32] := a[127:32]2993//2994// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss2995FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)2996{2997return _mm_move_ss(a, _mm_sub_ps(a, b));2998}29993000// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision3001// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the3002// transposed matrix in these vectors (row0 now contains column 0, etc.).3003// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS3004#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \3005do { \3006float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \3007float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \3008row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \3009vget_low_f32(ROW23.val[0])); \3010row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \3011vget_low_f32(ROW23.val[1])); \3012row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \3013vget_high_f32(ROW23.val[0])); \3014row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \3015vget_high_f32(ROW23.val[1])); \3016} while (0)30173018// according to the documentation, these intrinsics behave the same as the3019// non-'u' versions. 
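// Typical use of _MM_TRANSPOSE4_PS above: transpose a row-major 4x4 matrix in
// place by loading its rows, transposing the registers, and storing them back.
// An illustrative sketch (not part of the original sse2neon API; the helper
// name is hypothetical):
static inline void _sse2neon_example_transpose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // r0..r3 now hold columns 0..3
    _mm_storeu_ps(m + 0, r0);
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}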
We'll just alias them here.3020#define _mm_ucomieq_ss _mm_comieq_ss3021#define _mm_ucomige_ss _mm_comige_ss3022#define _mm_ucomigt_ss _mm_comigt_ss3023#define _mm_ucomile_ss _mm_comile_ss3024#define _mm_ucomilt_ss _mm_comilt_ss3025#define _mm_ucomineq_ss _mm_comineq_ss30263027// Return vector of type __m128i with undefined elements.3028// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si1283029FORCE_INLINE __m128i _mm_undefined_si128(void)3030{3031#if defined(__GNUC__) || defined(__clang__)3032#pragma GCC diagnostic push3033#pragma GCC diagnostic ignored "-Wuninitialized"3034#endif3035__m128i a;3036return a;3037#if defined(__GNUC__) || defined(__clang__)3038#pragma GCC diagnostic pop3039#endif3040}30413042// Return vector of type __m128 with undefined elements.3043// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps3044FORCE_INLINE __m128 _mm_undefined_ps(void)3045{3046#if defined(__GNUC__) || defined(__clang__)3047#pragma GCC diagnostic push3048#pragma GCC diagnostic ignored "-Wuninitialized"3049#endif3050__m128 a;3051return a;3052#if defined(__GNUC__) || defined(__clang__)3053#pragma GCC diagnostic pop3054#endif3055}30563057// Selects and interleaves the upper two single-precision, floating-point values3058// from a and b.3059//3060// r0 := a23061// r1 := b23062// r2 := a33063// r3 := b33064//3065// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx3066FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)3067{3068#if defined(__aarch64__)3069return vreinterpretq_m128_f32(3070vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));3071#else3072float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));3073float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));3074float32x2x2_t result = vzip_f32(a1, b1);3075return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));3076#endif3077}30783079// Selects and interleaves the lower two single-precision, floating-point values3080// from a and b.3081//3082// r0 := a03083// r1 := b03084// r2 := a13085// r3 := b13086//3087// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx3088FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)3089{3090#if defined(__aarch64__)3091return vreinterpretq_m128_f32(3092vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));3093#else3094float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));3095float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));3096float32x2x2_t result = vzip_f32(a1, b1);3097return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));3098#endif3099}31003101// Computes bitwise EXOR (exclusive-or) of the four single-precision,3102// floating-point values of a and b.3103// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx3104FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)3105{3106return vreinterpretq_m128_s32(3107veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));3108}31093110/* SSE2 */31113112// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or3113// unsigned 16-bit integers in b.3114// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx3115FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)3116{3117return vreinterpretq_m128i_s16(3118vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3119}31203121// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or3122// unsigned 32-bit integers in b.3123//3124// r0 := a0 + b03125// r1 

/* SSE2 */

// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
//
// dst[63:0] := a[63:0] + b[63:0]
// dst[127:64] := a[127:64]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b, and store the result in dst.
//
// dst[63:0] := a[63:0] + b[63:0]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
// and saturates.
//
// r0 := SignedSaturate(a0 + b0)
// r1 := SignedSaturate(a1 + b1)
// ...
// r7 := SignedSaturate(a7 + b7)
//
// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}
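
// Illustrative sketch (not part of sse2neon itself; the values and variable
// names are hypothetical): the difference between the wrapping 16-bit add
// (_mm_add_epi16) and the saturating 16-bit add (_mm_adds_epi16) above.
//
//   __m128i x = _mm_set1_epi16(30000);
//   __m128i y = _mm_set1_epi16(10000);
//   __m128i wrap = _mm_add_epi16(x, y);   // each lane wraps around to -25536
//   __m128i sat = _mm_adds_epi16(x, y);   // each lane saturates to 32767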

// Add packed signed 8-bit integers in a and b using saturation, and store the
// results in dst.
//
// FOR j := 0 to 15
//   i := j*8
//   dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
// ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed unsigned 16-bit integers in a and b using saturation, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Compute the bitwise AND of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := a[i+63:i] AND b[i+63:i]
// ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
// b.
//
// r := a & b
//
// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
// ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    // *NOTE* argument swap
    return vreinterpretq_m128d_s64(
        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
}

// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
// 128-bit value in a.
//
// r := (~a) & b
//
// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
}

// Computes the average of the 8 unsigned 16-bit integers in a and the 8
// unsigned 16-bit integers in b and rounds.
//
// r0 := (a0 + b0) / 2
// r1 := (a1 + b1) / 2
// ...
// r7 := (a7 + b7) / 2
//
// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                 vreinterpretq_u16_m128i(b));
}

// Computes the average of the 16 unsigned 8-bit integers in a and the 16
// unsigned 8-bit integers in b and rounds.
//
// r0 := (a0 + b0) / 2
// r1 := (a1 + b1) / 2
// ...
// r15 := (a15 + b15) / 2
//
//
https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx3333FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)3334{3335return vreinterpretq_m128i_u8(3336vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));3337}33383339// Shift a left by imm8 bytes while shifting in zeros, and store the results in3340// dst.3341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si1283342#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)33433344// Shift a right by imm8 bytes while shifting in zeros, and store the results in3345// dst.3346// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si1283347#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)33483349// Cast vector of type __m128d to type __m128. This intrinsic is only used for3350// compilation and does not generate any instructions, thus it has zero latency.3351// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps3352FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)3353{3354return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));3355}33563357// Cast vector of type __m128d to type __m128i. This intrinsic is only used for3358// compilation and does not generate any instructions, thus it has zero latency.3359// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si1283360FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)3361{3362return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));3363}33643365// Cast vector of type __m128 to type __m128d. This intrinsic is only used for3366// compilation and does not generate any instructions, thus it has zero latency.3367// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd3368FORCE_INLINE __m128d _mm_castps_pd(__m128 a)3369{3370return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));3371}33723373// Applies a type cast to reinterpret four 32-bit floating point values passed3374// in as a 128-bit parameter as packed 32-bit integers.3375// https://msdn.microsoft.com/en-us/library/bb514099.aspx3376FORCE_INLINE __m128i _mm_castps_si128(__m128 a)3377{3378return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));3379}33803381// Cast vector of type __m128i to type __m128d. 
This intrinsic is only used for3382// compilation and does not generate any instructions, thus it has zero latency.3383// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd3384FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)3385{3386#if defined(__aarch64__)3387return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));3388#else3389return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));3390#endif3391}33923393// Applies a type cast to reinterpret four 32-bit integers passed in as a3394// 128-bit parameter as packed 32-bit floating point values.3395// https://msdn.microsoft.com/en-us/library/bb514029.aspx3396FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)3397{3398return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));3399}34003401// Invalidate and flush the cache line that contains p from all levels of the3402// cache hierarchy.3403// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush3404#if defined(__APPLE__)3405#include <libkern/OSCacheControl.h>3406#endif3407FORCE_INLINE void _mm_clflush(void const *p)3408{3409(void) p;34103411/* sys_icache_invalidate is supported since macOS 10.5.3412* However, it does not work on non-jailbroken iOS devices, although the3413* compilation is successful.3414*/3415#if defined(__APPLE__)3416sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);3417#elif defined(__GNUC__) || defined(__clang__)3418uintptr_t ptr = (uintptr_t) p;3419__builtin___clear_cache((char *) ptr,3420(char *) ptr + SSE2NEON_CACHELINE_SIZE);3421#else3422/* FIXME: MSVC support */3423#endif3424}34253426// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or3427// unsigned 16-bit integers in b for equality.3428// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx3429FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)3430{3431return vreinterpretq_m128i_u16(3432vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3433}34343435// Compare packed 32-bit integers in a and b for equality, and store the results3436// in dst3437FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)3438{3439return vreinterpretq_m128i_u32(3440vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));3441}34423443// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or3444// unsigned 8-bit integers in b for equality.3445// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx3446FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)3447{3448return vreinterpretq_m128i_u8(3449vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));3450}34513452// Compare packed double-precision (64-bit) floating-point elements in a and b3453// for equality, and store the results in dst.3454// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd3455FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)3456{3457#if defined(__aarch64__)3458return vreinterpretq_m128d_u64(3459vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3460#else3461// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)3462uint32x4_t cmp =3463vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));3464uint32x4_t swapped = vrev64q_u32(cmp);3465return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));3466#endif3467}34683469// Compare the lower double-precision (64-bit) floating-point elements in a and3470// b for equality, store the result in the lower element of dst, and copy the3471// 
upper element from a to the upper element of dst.3472// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd3473FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)3474{3475return _mm_move_sd(a, _mm_cmpeq_pd(a, b));3476}34773478// Compare packed double-precision (64-bit) floating-point elements in a and b3479// for greater-than-or-equal, and store the results in dst.3480// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd3481FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)3482{3483#if defined(__aarch64__)3484return vreinterpretq_m128d_u64(3485vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3486#else3487uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3488uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3489uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3490uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3491uint64_t d[2];3492d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3493d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);34943495return vreinterpretq_m128d_u64(vld1q_u64(d));3496#endif3497}34983499// Compare the lower double-precision (64-bit) floating-point elements in a and3500// b for greater-than-or-equal, store the result in the lower element of dst,3501// and copy the upper element from a to the upper element of dst.3502// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd3503FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)3504{3505#if defined(__aarch64__)3506return _mm_move_sd(a, _mm_cmpge_pd(a, b));3507#else3508// expand "_mm_cmpge_pd()" to reduce unnecessary operations3509uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3510uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3511uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3512uint64_t d[2];3513d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3514d[1] = a1;35153516return vreinterpretq_m128d_u64(vld1q_u64(d));3517#endif3518}35193520// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers3521// in b for greater than.3522//3523// r0 := (a0 > b0) ? 0xffff : 0x03524// r1 := (a1 > b1) ? 0xffff : 0x03525// ...3526// r7 := (a7 > b7) ? 0xffff : 0x03527//3528// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx3529FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)3530{3531return vreinterpretq_m128i_u16(3532vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3533}35343535// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers3536// in b for greater than.3537// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx3538FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)3539{3540return vreinterpretq_m128i_u32(3541vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));3542}35433544// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers3545// in b for greater than.3546//3547// r0 := (a0 > b0) ? 0xff : 0x03548// r1 := (a1 > b1) ? 0xff : 0x03549// ...3550// r15 := (a15 > b15) ? 
0xff : 0x03551//3552// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx3553FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)3554{3555return vreinterpretq_m128i_u8(3556vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));3557}35583559// Compare packed double-precision (64-bit) floating-point elements in a and b3560// for greater-than, and store the results in dst.3561// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd3562FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)3563{3564#if defined(__aarch64__)3565return vreinterpretq_m128d_u64(3566vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3567#else3568uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3569uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3570uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3571uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3572uint64_t d[2];3573d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3574d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);35753576return vreinterpretq_m128d_u64(vld1q_u64(d));3577#endif3578}35793580// Compare the lower double-precision (64-bit) floating-point elements in a and3581// b for greater-than, store the result in the lower element of dst, and copy3582// the upper element from a to the upper element of dst.3583// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd3584FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)3585{3586#if defined(__aarch64__)3587return _mm_move_sd(a, _mm_cmpgt_pd(a, b));3588#else3589// expand "_mm_cmpge_pd()" to reduce unnecessary operations3590uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3591uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3592uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3593uint64_t d[2];3594d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3595d[1] = a1;35963597return vreinterpretq_m128d_u64(vld1q_u64(d));3598#endif3599}36003601// Compare packed double-precision (64-bit) floating-point elements in a and b3602// for less-than-or-equal, and store the results in dst.3603// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd3604FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)3605{3606#if defined(__aarch64__)3607return vreinterpretq_m128d_u64(3608vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3609#else3610uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3611uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3612uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3613uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3614uint64_t d[2];3615d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3616d[1] = (*(double *) &a1) <= (*(double *) &b1) ? 
~UINT64_C(0) : UINT64_C(0);36173618return vreinterpretq_m128d_u64(vld1q_u64(d));3619#endif3620}36213622// Compare the lower double-precision (64-bit) floating-point elements in a and3623// b for less-than-or-equal, store the result in the lower element of dst, and3624// copy the upper element from a to the upper element of dst.3625// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd3626FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)3627{3628#if defined(__aarch64__)3629return _mm_move_sd(a, _mm_cmple_pd(a, b));3630#else3631// expand "_mm_cmpge_pd()" to reduce unnecessary operations3632uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3633uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3634uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3635uint64_t d[2];3636d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3637d[1] = a1;36383639return vreinterpretq_m128d_u64(vld1q_u64(d));3640#endif3641}36423643// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers3644// in b for less than.3645//3646// r0 := (a0 < b0) ? 0xffff : 0x03647// r1 := (a1 < b1) ? 0xffff : 0x03648// ...3649// r7 := (a7 < b7) ? 0xffff : 0x03650//3651// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx3652FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)3653{3654return vreinterpretq_m128i_u16(3655vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3656}365736583659// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers3660// in b for less than.3661// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx3662FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)3663{3664return vreinterpretq_m128i_u32(3665vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));3666}36673668// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers3669// in b for lesser than.3670// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx3671FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)3672{3673return vreinterpretq_m128i_u8(3674vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));3675}36763677// Compare packed double-precision (64-bit) floating-point elements in a and b3678// for less-than, and store the results in dst.3679// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd3680FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)3681{3682#if defined(__aarch64__)3683return vreinterpretq_m128d_u64(3684vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3685#else3686uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3687uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3688uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3689uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3690uint64_t d[2];3691d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3692d[1] = (*(double *) &a1) < (*(double *) &b1) ? 
~UINT64_C(0) : UINT64_C(0);36933694return vreinterpretq_m128d_u64(vld1q_u64(d));3695#endif3696}36973698// Compare the lower double-precision (64-bit) floating-point elements in a and3699// b for less-than, store the result in the lower element of dst, and copy the3700// upper element from a to the upper element of dst.3701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd3702FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)3703{3704#if defined(__aarch64__)3705return _mm_move_sd(a, _mm_cmplt_pd(a, b));3706#else3707uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3708uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3709uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3710uint64_t d[2];3711d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3712d[1] = a1;37133714return vreinterpretq_m128d_u64(vld1q_u64(d));3715#endif3716}37173718// Compare packed double-precision (64-bit) floating-point elements in a and b3719// for not-equal, and store the results in dst.3720// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd3721FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)3722{3723#if defined(__aarch64__)3724return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(3725vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));3726#else3727// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)3728uint32x4_t cmp =3729vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));3730uint32x4_t swapped = vrev64q_u32(cmp);3731return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));3732#endif3733}37343735// Compare the lower double-precision (64-bit) floating-point elements in a and3736// b for not-equal, store the result in the lower element of dst, and copy the3737// upper element from a to the upper element of dst.3738// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd3739FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)3740{3741return _mm_move_sd(a, _mm_cmpneq_pd(a, b));3742}37433744// Compare packed double-precision (64-bit) floating-point elements in a and b3745// for not-greater-than-or-equal, and store the results in dst.3746// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd3747FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)3748{3749#if defined(__aarch64__)3750return vreinterpretq_m128d_u64(veorq_u64(3751vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3752vdupq_n_u64(UINT64_MAX)));3753#else3754uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3755uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3756uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3757uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3758uint64_t d[2];3759d[0] =3760!((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3761d[1] =3762!((*(double *) &a1) >= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0);37633764return vreinterpretq_m128d_u64(vld1q_u64(d));3765#endif3766}37673768// Compare the lower double-precision (64-bit) floating-point elements in a and3769// b for not-greater-than-or-equal, store the result in the lower element of3770// dst, and copy the upper element from a to the upper element of dst.3771// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd3772FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)3773{3774return _mm_move_sd(a, _mm_cmpnge_pd(a, b));3775}37763777// Compare packed double-precision (64-bit) floating-point elements in a and b3778// for not-greater-than, and store the results in dst.3779// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd3780FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)3781{3782#if defined(__aarch64__)3783return vreinterpretq_m128d_u64(veorq_u64(3784vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3785vdupq_n_u64(UINT64_MAX)));3786#else3787uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3788uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3789uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3790uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3791uint64_t d[2];3792d[0] =3793!((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3794d[1] =3795!((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);37963797return vreinterpretq_m128d_u64(vld1q_u64(d));3798#endif3799}38003801// Compare the lower double-precision (64-bit) floating-point elements in a and3802// b for not-greater-than, store the result in the lower element of dst, and3803// copy the upper element from a to the upper element of dst.3804// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd3805FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)3806{3807return _mm_move_sd(a, _mm_cmpngt_pd(a, b));3808}38093810// Compare packed double-precision (64-bit) floating-point elements in a and b3811// for not-less-than-or-equal, and store the results in dst.3812// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd3813FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)3814{3815#if defined(__aarch64__)3816return vreinterpretq_m128d_u64(veorq_u64(3817vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3818vdupq_n_u64(UINT64_MAX)));3819#else3820uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3821uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3822uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3823uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3824uint64_t d[2];3825d[0] =3826!((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3827d[1] =3828!((*(double *) &a1) <= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0);38293830return vreinterpretq_m128d_u64(vld1q_u64(d));3831#endif3832}38333834// Compare the lower double-precision (64-bit) floating-point elements in a and3835// b for not-less-than-or-equal, store the result in the lower element of dst,3836// and copy the upper element from a to the upper element of dst.3837// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd3838FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)3839{3840return _mm_move_sd(a, _mm_cmpnle_pd(a, b));3841}38423843// Compare packed double-precision (64-bit) floating-point elements in a and b3844// for not-less-than, and store the results in dst.3845// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd3846FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)3847{3848#if defined(__aarch64__)3849return vreinterpretq_m128d_u64(veorq_u64(3850vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3851vdupq_n_u64(UINT64_MAX)));3852#else3853uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3854uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3855uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3856uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3857uint64_t d[2];3858d[0] =3859!((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3860d[1] =3861!((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);38623863return vreinterpretq_m128d_u64(vld1q_u64(d));3864#endif3865}38663867// Compare the lower double-precision (64-bit) floating-point elements in a and3868// b for not-less-than, store the result in the lower element of dst, and copy3869// the upper element from a to the upper element of dst.3870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd3871FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)3872{3873return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));3874}38753876// Compare packed double-precision (64-bit) floating-point elements in a and b3877// to see if neither is NaN, and store the results in dst.3878// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd3879FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)3880{3881#if defined(__aarch64__)3882// Excluding NaNs, any two floating point numbers can be compared.3883uint64x2_t not_nan_a =3884vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));3885uint64x2_t not_nan_b =3886vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));3887return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));3888#else3889uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3890uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3891uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3892uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3893uint64_t d[2];3894d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3895(*(double *) &b0) == (*(double *) &b0))3896? ~UINT64_C(0)3897: UINT64_C(0);3898d[1] = ((*(double *) &a1) == (*(double *) &a1) &&3899(*(double *) &b1) == (*(double *) &b1))3900? 
~UINT64_C(0)3901: UINT64_C(0);39023903return vreinterpretq_m128d_u64(vld1q_u64(d));3904#endif3905}39063907// Compare the lower double-precision (64-bit) floating-point elements in a and3908// b to see if neither is NaN, store the result in the lower element of dst, and3909// copy the upper element from a to the upper element of dst.3910// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd3911FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)3912{3913#if defined(__aarch64__)3914return _mm_move_sd(a, _mm_cmpord_pd(a, b));3915#else3916uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3917uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3918uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3919uint64_t d[2];3920d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3921(*(double *) &b0) == (*(double *) &b0))3922? ~UINT64_C(0)3923: UINT64_C(0);3924d[1] = a1;39253926return vreinterpretq_m128d_u64(vld1q_u64(d));3927#endif3928}39293930// Compare packed double-precision (64-bit) floating-point elements in a and b3931// to see if either is NaN, and store the results in dst.3932// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd3933FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)3934{3935#if defined(__aarch64__)3936// Two NaNs are not equal in comparison operation.3937uint64x2_t not_nan_a =3938vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));3939uint64x2_t not_nan_b =3940vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));3941return vreinterpretq_m128d_s32(3942vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));3943#else3944uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3945uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3946uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3947uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3948uint64_t d[2];3949d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3950(*(double *) &b0) == (*(double *) &b0))3951? UINT64_C(0)3952: ~UINT64_C(0);3953d[1] = ((*(double *) &a1) == (*(double *) &a1) &&3954(*(double *) &b1) == (*(double *) &b1))3955? UINT64_C(0)3956: ~UINT64_C(0);39573958return vreinterpretq_m128d_u64(vld1q_u64(d));3959#endif3960}39613962// Compare the lower double-precision (64-bit) floating-point elements in a and3963// b to see if either is NaN, store the result in the lower element of dst, and3964// copy the upper element from a to the upper element of dst.3965// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd3966FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)3967{3968#if defined(__aarch64__)3969return _mm_move_sd(a, _mm_cmpunord_pd(a, b));3970#else3971uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3972uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3973uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3974uint64_t d[2];3975d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3976(*(double *) &b0) == (*(double *) &b0))3977? 
UINT64_C(0)3978: ~UINT64_C(0);3979d[1] = a1;39803981return vreinterpretq_m128d_u64(vld1q_u64(d));3982#endif3983}39843985// Compare the lower double-precision (64-bit) floating-point element in a and b3986// for greater-than-or-equal, and return the boolean result (0 or 1).3987// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd3988FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)3989{3990#if defined(__aarch64__)3991return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;3992#else3993uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3994uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));39953996return (*(double *) &a0 >= *(double *) &b0);3997#endif3998}39994000// Compare the lower double-precision (64-bit) floating-point element in a and b4001// for greater-than, and return the boolean result (0 or 1).4002// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd4003FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)4004{4005#if defined(__aarch64__)4006return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;4007#else4008uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4009uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));40104011return (*(double *) &a0 > *(double *) &b0);4012#endif4013}40144015// Compare the lower double-precision (64-bit) floating-point element in a and b4016// for less-than-or-equal, and return the boolean result (0 or 1).4017// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd4018FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)4019{4020#if defined(__aarch64__)4021return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;4022#else4023uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4024uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));40254026return (*(double *) &a0 <= *(double *) &b0);4027#endif4028}40294030// Compare the lower double-precision (64-bit) floating-point element in a and b4031// for less-than, and return the boolean result (0 or 1).4032// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd4033FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)4034{4035#if defined(__aarch64__)4036return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;4037#else4038uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4039uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));40404041return (*(double *) &a0 < *(double *) &b0);4042#endif4043}40444045// Compare the lower double-precision (64-bit) floating-point element in a and b4046// for equality, and return the boolean result (0 or 1).4047// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd4048FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)4049{4050#if defined(__aarch64__)4051return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;4052#else4053uint32x4_t a_not_nan =4054vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));4055uint32x4_t b_not_nan =4056vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));4057uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);4058uint32x4_t a_eq_b =4059vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));4060uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),4061vreinterpretq_u64_u32(a_eq_b));4062return vgetq_lane_u64(and_results, 0) & 0x1;4063#endif4064}40654066// Compare the lower double-precision (64-bit) floating-point element in a 
and b4067// for not-equal, and return the boolean result (0 or 1).4068// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd4069FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)4070{4071return !_mm_comieq_sd(a, b);4072}40734074// Convert packed signed 32-bit integers in a to packed double-precision4075// (64-bit) floating-point elements, and store the results in dst.4076//4077// FOR j := 0 to 14078// i := j*324079// m := j*644080// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])4081// ENDFOR4082//4083// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd4084FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)4085{4086#if defined(__aarch64__)4087return vreinterpretq_m128d_f64(4088vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));4089#else4090double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);4091double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);4092return _mm_set_pd(a1, a0);4093#endif4094}40954096// Converts the four signed 32-bit integer values of a to single-precision,4097// floating-point values4098// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx4099FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)4100{4101return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));4102}41034104// Convert packed double-precision (64-bit) floating-point elements in a to4105// packed 32-bit integers, and store the results in dst.4106//4107// FOR j := 0 to 14108// i := 32*j4109// k := 64*j4110// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])4111// ENDFOR4112//4113// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi324114FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)4115{4116// vrnd32xq_f64 not supported on clang4117#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)4118float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));4119int64x2_t integers = vcvtq_s64_f64(rounded);4120return vreinterpretq_m128i_s32(4121vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));4122#else4123__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4124double d0 = ((double *) &rnd)[0];4125double d1 = ((double *) &rnd)[1];4126return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);4127#endif4128}41294130// Convert packed double-precision (64-bit) floating-point elements in a to4131// packed 32-bit integers, and store the results in dst.4132//4133// FOR j := 0 to 14134// i := 32*j4135// k := 64*j4136// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])4137// ENDFOR4138//4139// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi324140FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)4141{4142__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4143double d0 = ((double *) &rnd)[0];4144double d1 = ((double *) &rnd)[1];4145int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};4146return vreinterpret_m64_s32(vld1_s32(data));4147}41484149// Convert packed double-precision (64-bit) floating-point elements in a to4150// packed single-precision (32-bit) floating-point elements, and store the4151// results in dst.4152//4153// FOR j := 0 to 14154// i := 32*j4155// k := 64*j4156// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])4157// ENDFOR4158// dst[127:64] := 04159//4160// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps4161FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)4162{4163#if defined(__aarch64__)4164float32x2_t tmp = 
vcvt_f32_f64(vreinterpretq_f64_m128d(a));4165return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));4166#else4167float a0 = (float) ((double *) &a)[0];4168float a1 = (float) ((double *) &a)[1];4169return _mm_set_ps(0, 0, a1, a0);4170#endif4171}41724173// Convert packed signed 32-bit integers in a to packed double-precision4174// (64-bit) floating-point elements, and store the results in dst.4175//4176// FOR j := 0 to 14177// i := j*324178// m := j*644179// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])4180// ENDFOR4181//4182// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd4183FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)4184{4185#if defined(__aarch64__)4186return vreinterpretq_m128d_f64(4187vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));4188#else4189double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);4190double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);4191return _mm_set_pd(a1, a0);4192#endif4193}41944195// Converts the four single-precision, floating-point values of a to signed4196// 32-bit integer values.4197//4198// r0 := (int) a04199// r1 := (int) a14200// r2 := (int) a24201// r3 := (int) a34202//4203// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx4204// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A4205// does not support! It is supported on ARMv8-A however.4206FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)4207{4208#if defined(__ARM_FEATURE_FRINT)4209return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));4210#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)4211switch (_MM_GET_ROUNDING_MODE()) {4212case _MM_ROUND_NEAREST:4213return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));4214case _MM_ROUND_DOWN:4215return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));4216case _MM_ROUND_UP:4217return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));4218default: // _MM_ROUND_TOWARD_ZERO4219return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));4220}4221#else4222float *f = (float *) &a;4223switch (_MM_GET_ROUNDING_MODE()) {4224case _MM_ROUND_NEAREST: {4225uint32x4_t signmask = vdupq_n_u32(0x80000000);4226float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),4227vdupq_n_f32(0.5f)); /* +/- 0.5 */4228int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(4229vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/4230int32x4_t r_trunc = vcvtq_s32_f32(4231vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */4232int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(4233vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */4234int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),4235vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */4236float32x4_t delta = vsubq_f32(4237vreinterpretq_f32_m128(a),4238vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */4239uint32x4_t is_delta_half =4240vceqq_f32(delta, half); /* delta == +/- 0.5 */4241return vreinterpretq_m128i_s32(4242vbslq_s32(is_delta_half, r_even, r_normal));4243}4244case _MM_ROUND_DOWN:4245return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),4246floorf(f[0]));4247case _MM_ROUND_UP:4248return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),4249ceilf(f[0]));4250default: // _MM_ROUND_TOWARD_ZERO4251return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],4252(int32_t) f[0]);4253}4254#endif4255}42564257// Convert packed single-precision (32-bit) floating-point elements in a to4258// packed double-precision (64-bit) floating-point elements, and store 
the4259// results in dst.4260//4261// FOR j := 0 to 14262// i := 64*j4263// k := 32*j4264// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])4265// ENDFOR4266//4267// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd4268FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)4269{4270#if defined(__aarch64__)4271return vreinterpretq_m128d_f64(4272vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));4273#else4274double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);4275double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);4276return _mm_set_pd(a1, a0);4277#endif4278}42794280// Copy the lower double-precision (64-bit) floating-point element of a to dst.4281//4282// dst[63:0] := a[63:0]4283//4284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f644285FORCE_INLINE double _mm_cvtsd_f64(__m128d a)4286{4287#if defined(__aarch64__)4288return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);4289#else4290return ((double *) &a)[0];4291#endif4292}42934294// Convert the lower double-precision (64-bit) floating-point element in a to a4295// 32-bit integer, and store the result in dst.4296//4297// dst[31:0] := Convert_FP64_To_Int32(a[63:0])4298//4299// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si324300FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)4301{4302#if defined(__aarch64__)4303return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);4304#else4305__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4306double ret = ((double *) &rnd)[0];4307return (int32_t) ret;4308#endif4309}43104311// Convert the lower double-precision (64-bit) floating-point element in a to a4312// 64-bit integer, and store the result in dst.4313//4314// dst[63:0] := Convert_FP64_To_Int64(a[63:0])4315//4316// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si644317FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)4318{4319#if defined(__aarch64__)4320return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);4321#else4322__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4323double ret = ((double *) &rnd)[0];4324return (int64_t) ret;4325#endif4326}43274328// Convert the lower double-precision (64-bit) floating-point element in a to a4329// 64-bit integer, and store the result in dst.4330//4331// dst[63:0] := Convert_FP64_To_Int64(a[63:0])4332//4333// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x4334#define _mm_cvtsd_si64x _mm_cvtsd_si6443354336// Convert the lower double-precision (64-bit) floating-point element in b to a4337// single-precision (32-bit) floating-point element, store the result in the4338// lower element of dst, and copy the upper 3 packed elements from a to the4339// upper elements of dst.4340// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss4341FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)4342{4343#if defined(__aarch64__)4344return vreinterpretq_m128_f32(vsetq_lane_f32(4345vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),4346vreinterpretq_f32_m128(a), 0));4347#else4348return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],4349vreinterpretq_f32_m128(a), 0));4350#endif4351}43524353// Copy the lower 32-bit integer in a to dst.4354//4355// dst[31:0] := a[31:0]4356//4357// 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si324358FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)4359{4360return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);4361}43624363// Copy the lower 64-bit integer in a to dst.4364//4365// dst[63:0] := a[63:0]4366//4367// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si644368FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)4369{4370return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);4371}43724373// Copy the lower 64-bit integer in a to dst.4374// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x4375#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)43764377// Convert the signed 32-bit integer b to a double-precision (64-bit)4378// floating-point element, store the result in the lower element of dst, and4379// copy the upper element from a to the upper element of dst.4380// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd4381FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)4382{4383#if defined(__aarch64__)4384return vreinterpretq_m128d_f64(4385vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));4386#else4387double bf = (double) b;4388return vreinterpretq_m128d_s64(4389vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));4390#endif4391}43924393// Copy the lower 64-bit integer in a to dst.4394//4395// dst[63:0] := a[63:0]4396//4397// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x4398#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)43994400// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,4401// zero extending the upper bits.4402//4403// r0 := a4404// r1 := 0x04405// r2 := 0x04406// r3 := 0x04407//4408// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx4409FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)4410{4411return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));4412}44134414// Convert the signed 64-bit integer b to a double-precision (64-bit)4415// floating-point element, store the result in the lower element of dst, and4416// copy the upper element from a to the upper element of dst.4417// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd4418FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)4419{4420#if defined(__aarch64__)4421return vreinterpretq_m128d_f64(4422vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));4423#else4424double bf = (double) b;4425return vreinterpretq_m128d_s64(4426vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));4427#endif4428}44294430// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,4431// zero extending the upper bits.4432//4433// r0 := a4434// r1 := 0x04435FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)4436{4437return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));4438}44394440// Copy 64-bit integer a to the lower element of dst, and zero the upper4441// element.4442// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si1284443#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)44444445// Convert the signed 64-bit integer b to a double-precision (64-bit)4446// floating-point element, store the result in the lower element of dst, and4447// copy the upper element from a to the upper element of dst.4448// 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd4449#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)44504451// Convert the lower single-precision (32-bit) floating-point element in b to a4452// double-precision (64-bit) floating-point element, store the result in the4453// lower element of dst, and copy the upper element from a to the upper element4454// of dst.4455//4456// dst[63:0] := Convert_FP32_To_FP64(b[31:0])4457// dst[127:64] := a[127:64]4458//4459// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd4460FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)4461{4462double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);4463#if defined(__aarch64__)4464return vreinterpretq_m128d_f64(4465vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));4466#else4467return vreinterpretq_m128d_s64(4468vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));4469#endif4470}44714472// Convert packed double-precision (64-bit) floating-point elements in a to4473// packed 32-bit integers with truncation, and store the results in dst.4474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi324475FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)4476{4477double a0 = ((double *) &a)[0];4478double a1 = ((double *) &a)[1];4479return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);4480}44814482// Convert packed double-precision (64-bit) floating-point elements in a to4483// packed 32-bit integers with truncation, and store the results in dst.4484// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi324485FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)4486{4487double a0 = ((double *) &a)[0];4488double a1 = ((double *) &a)[1];4489int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};4490return vreinterpret_m64_s32(vld1_s32(data));4491}44924493// Converts the four single-precision, floating-point values of a to signed4494// 32-bit integer values using truncate.4495// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx4496FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)4497{4498return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));4499}45004501// Convert the lower double-precision (64-bit) floating-point element in a to a4502// 32-bit integer with truncation, and store the result in dst.4503//4504// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])4505//4506// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si324507FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)4508{4509double ret = *((double *) &a);4510return (int32_t) ret;4511}45124513// Convert the lower double-precision (64-bit) floating-point element in a to a4514// 64-bit integer with truncation, and store the result in dst.4515//4516// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])4517//4518// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si644519FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)4520{4521#if defined(__aarch64__)4522return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);4523#else4524double ret = *((double *) &a);4525return (int64_t) ret;4526#endif4527}45284529// Convert the lower double-precision (64-bit) floating-point element in a to a4530// 64-bit integer with truncation, and store the result in dst.4531//4532// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])4533//4534// 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x4535#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)45364537// Divide packed double-precision (64-bit) floating-point elements in a by4538// packed elements in b, and store the results in dst.4539//4540// FOR j := 0 to 14541// i := 64*j4542// dst[i+63:i] := a[i+63:i] / b[i+63:i]4543// ENDFOR4544//4545// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd4546FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)4547{4548#if defined(__aarch64__)4549return vreinterpretq_m128d_f64(4550vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));4551#else4552double *da = (double *) &a;4553double *db = (double *) &b;4554double c[2];4555c[0] = da[0] / db[0];4556c[1] = da[1] / db[1];4557return vld1q_f32((float32_t *) c);4558#endif4559}45604561// Divide the lower double-precision (64-bit) floating-point element in a by the4562// lower double-precision (64-bit) floating-point element in b, store the result4563// in the lower element of dst, and copy the upper element from a to the upper4564// element of dst.4565// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd4566FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)4567{4568#if defined(__aarch64__)4569float64x2_t tmp =4570vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));4571return vreinterpretq_m128d_f64(4572vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));4573#else4574return _mm_move_sd(a, _mm_div_pd(a, b));4575#endif4576}45774578// Extracts the selected signed or unsigned 16-bit integer from a and zero4579// extends.4580// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx4581// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)4582#define _mm_extract_epi16(a, imm) \4583vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))45844585// Inserts the least significant 16 bits of b into the selected 16-bit integer4586// of a.4587// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx4588// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,4589// __constrange(0,8) int imm)4590#define _mm_insert_epi16(a, b, imm) \4591__extension__({ \4592vreinterpretq_m128i_s16( \4593vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \4594})45954596// Loads two double-precision from 16-byte aligned memory, floating-point4597// values.4598//4599// dst[127:0] := MEM[mem_addr+127:mem_addr]4600//4601// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd4602FORCE_INLINE __m128d _mm_load_pd(const double *p)4603{4604#if defined(__aarch64__)4605return vreinterpretq_m128d_f64(vld1q_f64(p));4606#else4607const float *fp = (const float *) p;4608float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};4609return vreinterpretq_m128d_f32(vld1q_f32(data));4610#endif4611}46124613// Load a double-precision (64-bit) floating-point element from memory into both4614// elements of dst.4615//4616// dst[63:0] := MEM[mem_addr+63:mem_addr]4617// dst[127:64] := MEM[mem_addr+63:mem_addr]4618//4619// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd14620#define _mm_load_pd1 _mm_load1_pd46214622// Load a double-precision (64-bit) floating-point element from memory into the4623// lower of dst, and zero the upper element. 
mem_addr does not need to be4624// aligned on any particular boundary.4625//4626// dst[63:0] := MEM[mem_addr+63:mem_addr]4627// dst[127:64] := 04628//4629// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd4630FORCE_INLINE __m128d _mm_load_sd(const double *p)4631{4632#if defined(__aarch64__)4633return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));4634#else4635const float *fp = (const float *) p;4636float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};4637return vreinterpretq_m128d_f32(vld1q_f32(data));4638#endif4639}46404641// Loads 128-bit value. :4642// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx4643FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)4644{4645return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));4646}46474648// Load a double-precision (64-bit) floating-point element from memory into both4649// elements of dst.4650//4651// dst[63:0] := MEM[mem_addr+63:mem_addr]4652// dst[127:64] := MEM[mem_addr+63:mem_addr]4653//4654// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd4655FORCE_INLINE __m128d _mm_load1_pd(const double *p)4656{4657#if defined(__aarch64__)4658return vreinterpretq_m128d_f64(vld1q_dup_f64(p));4659#else4660return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));4661#endif4662}46634664// Load a double-precision (64-bit) floating-point element from memory into the4665// upper element of dst, and copy the lower element from a to dst. mem_addr does4666// not need to be aligned on any particular boundary.4667//4668// dst[63:0] := a[63:0]4669// dst[127:64] := MEM[mem_addr+63:mem_addr]4670//4671// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd4672FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)4673{4674#if defined(__aarch64__)4675return vreinterpretq_m128d_f64(4676vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));4677#else4678return vreinterpretq_m128d_f32(vcombine_f32(4679vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));4680#endif4681}46824683// Load 64-bit integer from memory into the first element of dst.4684// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi644685FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)4686{4687/* Load the lower 64 bits of the value pointed to by p into the4688* lower 64 bits of the result, zeroing the upper 64 bits of the result.4689*/4690return vreinterpretq_m128i_s32(4691vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));4692}46934694// Load a double-precision (64-bit) floating-point element from memory into the4695// lower element of dst, and copy the upper element from a to dst. mem_addr does4696// not need to be aligned on any particular boundary.4697//4698// dst[63:0] := MEM[mem_addr+63:mem_addr]4699// dst[127:64] := a[127:64]4700//4701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd4702FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)4703{4704#if defined(__aarch64__)4705return vreinterpretq_m128d_f64(4706vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));4707#else4708return vreinterpretq_m128d_f32(4709vcombine_f32(vld1_f32((const float *) p),4710vget_high_f32(vreinterpretq_f32_m128d(a))));4711#endif4712}47134714// Load 2 double-precision (64-bit) floating-point elements from memory into dst4715// in reverse order. 
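// --- Illustrative sketch (not from the original header) ---
// _mm_load_sd and _mm_loadh_pd above can be combined to assemble a __m128d
// from two separate scalars; on AArch64 this maps to vsetq_lane_f64 followed
// by vcombine_f64. The helper name is hypothetical.
FORCE_INLINE __m128d _sse2neon_demo_load_pair_pd(const double *lo,
                                                 const double *hi)
{
    __m128d v = _mm_load_sd(lo); // v = { *lo, 0.0 }
    return _mm_loadh_pd(v, hi);  // v = { *lo, *hi }
}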
mem_addr must be aligned on a 16-byte boundary or a4716// general-protection exception may be generated.4717//4718// dst[63:0] := MEM[mem_addr+127:mem_addr+64]4719// dst[127:64] := MEM[mem_addr+63:mem_addr]4720//4721// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd4722FORCE_INLINE __m128d _mm_loadr_pd(const double *p)4723{4724#if defined(__aarch64__)4725float64x2_t v = vld1q_f64(p);4726return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));4727#else4728int64x2_t v = vld1q_s64((const int64_t *) p);4729return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));4730#endif4731}47324733// Loads two double-precision from unaligned memory, floating-point values.4734// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd4735FORCE_INLINE __m128d _mm_loadu_pd(const double *p)4736{4737return _mm_load_pd(p);4738}47394740// Loads 128-bit value. :4741// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx4742FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)4743{4744return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));4745}47464747// Load unaligned 32-bit integer from memory into the first element of dst.4748//4749// dst[31:0] := MEM[mem_addr+31:mem_addr]4750// dst[MAX:32] := 04751//4752// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si324753FORCE_INLINE __m128i _mm_loadu_si32(const void *p)4754{4755return vreinterpretq_m128i_s32(4756vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));4757}47584759// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit4760// integers from b.4761//4762// r0 := (a0 * b0) + (a1 * b1)4763// r1 := (a2 * b2) + (a3 * b3)4764// r2 := (a4 * b4) + (a5 * b5)4765// r3 := (a6 * b6) + (a7 * b7)4766// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx4767FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)4768{4769int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),4770vget_low_s16(vreinterpretq_s16_m128i(b)));4771#if defined(__aarch64__)4772int32x4_t high =4773vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));47744775return vreinterpretq_m128i_s32(vpaddq_s32(low, high));4776#else4777int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),4778vget_high_s16(vreinterpretq_s16_m128i(b)));47794780int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));4781int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));47824783return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));4784#endif4785}47864787// Conditionally store 8-bit integer elements from a into memory using mask4788// (elements are not stored when the highest bit is not set in the corresponding4789// element) and a non-temporal memory hint. 
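// --- Illustrative sketch (not from the original header) ---
// _mm_madd_epi16 above leaves four 32-bit pairwise sums
// (a[2j]*b[2j] + a[2j+1]*b[2j+1]); reducing them gives the dot product of the
// eight 16-bit lanes. The helper name is hypothetical.
FORCE_INLINE int32_t _sse2neon_demo_dot_epi16(__m128i a, __m128i b)
{
    int32x4_t sums = vreinterpretq_s32_m128i(_mm_madd_epi16(a, b));
#if defined(__aarch64__)
    return vaddvq_s32(sums); // horizontal add of the four lanes
#else
    int32x2_t s = vadd_s32(vget_low_s32(sums), vget_high_s32(sums));
    return vget_lane_s32(vpadd_s32(s, s), 0);
#endif
}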
mem_addr does not need to be aligned4790// on any particular boundary.4791// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si1284792FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)4793{4794int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);4795__m128 b = _mm_load_ps((const float *) mem_addr);4796int8x16_t masked =4797vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),4798vreinterpretq_s8_m128(b));4799vst1q_s8((int8_t *) mem_addr, masked);4800}48014802// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 84803// signed 16-bit integers from b.4804// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx4805FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)4806{4807return vreinterpretq_m128i_s16(4808vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));4809}48104811// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the4812// 16 unsigned 8-bit integers from b.4813// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx4814FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)4815{4816return vreinterpretq_m128i_u8(4817vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));4818}48194820// Compare packed double-precision (64-bit) floating-point elements in a and b,4821// and store packed maximum values in dst.4822// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd4823FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)4824{4825#if defined(__aarch64__)4826#if SSE2NEON_PRECISE_MINMAX4827float64x2_t _a = vreinterpretq_f64_m128d(a);4828float64x2_t _b = vreinterpretq_f64_m128d(b);4829return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));4830#else4831return vreinterpretq_m128d_f64(4832vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));4833#endif4834#else4835uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4836uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));4837uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));4838uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));4839uint64_t d[2];4840d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;4841d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;48424843return vreinterpretq_m128d_u64(vld1q_u64(d));4844#endif4845}48464847// Compare the lower double-precision (64-bit) floating-point elements in a and4848// b, store the maximum value in the lower element of dst, and copy the upper4849// element from a to the upper element of dst.4850// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd4851FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)4852{4853#if defined(__aarch64__)4854return _mm_move_sd(a, _mm_max_pd(a, b));4855#else4856double *da = (double *) &a;4857double *db = (double *) &b;4858double c[2] = {da[0] > db[0] ? 
da[0] : db[0], da[1]};4859return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));4860#endif4861}48624863// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 84864// signed 16-bit integers from b.4865// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx4866FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)4867{4868return vreinterpretq_m128i_s16(4869vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));4870}48714872// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the4873// 16 unsigned 8-bit integers from b.4874// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx4875FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)4876{4877return vreinterpretq_m128i_u8(4878vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));4879}48804881// Compare packed double-precision (64-bit) floating-point elements in a and b,4882// and store packed minimum values in dst.4883// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd4884FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)4885{4886#if defined(__aarch64__)4887#if SSE2NEON_PRECISE_MINMAX4888float64x2_t _a = vreinterpretq_f64_m128d(a);4889float64x2_t _b = vreinterpretq_f64_m128d(b);4890return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));4891#else4892return vreinterpretq_m128d_f64(4893vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));4894#endif4895#else4896uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4897uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));4898uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));4899uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));4900uint64_t d[2];4901d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;4902d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;4903return vreinterpretq_m128d_u64(vld1q_u64(d));4904#endif4905}49064907// Compare the lower double-precision (64-bit) floating-point elements in a and4908// b, store the minimum value in the lower element of dst, and copy the upper4909// element from a to the upper element of dst.4910// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd4911FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)4912{4913#if defined(__aarch64__)4914return _mm_move_sd(a, _mm_min_pd(a, b));4915#else4916double *da = (double *) &a;4917double *db = (double *) &b;4918double c[2] = {da[0] < db[0] ? 
da[0] : db[0], da[1]};4919return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));4920#endif4921}49224923// Copy the lower 64-bit integer in a to the lower element of dst, and zero the4924// upper element.4925//4926// dst[63:0] := a[63:0]4927// dst[127:64] := 04928//4929// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi644930FORCE_INLINE __m128i _mm_move_epi64(__m128i a)4931{4932return vreinterpretq_m128i_s64(4933vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));4934}49354936// Move the lower double-precision (64-bit) floating-point element from b to the4937// lower element of dst, and copy the upper element from a to the upper element4938// of dst.4939//4940// dst[63:0] := b[63:0]4941// dst[127:64] := a[127:64]4942//4943// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd4944FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)4945{4946return vreinterpretq_m128d_f32(4947vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),4948vget_high_f32(vreinterpretq_f32_m128d(a))));4949}49504951// NEON does not provide a version of this function.4952// Creates a 16-bit mask from the most significant bits of the 16 signed or4953// unsigned 8-bit integers in a and zero extends the upper bits.4954// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx4955FORCE_INLINE int _mm_movemask_epi8(__m128i a)4956{4957// Use increasingly wide shifts+adds to collect the sign bits4958// together.4959// Since the widening shifts would be rather confusing to follow in little4960// endian, everything will be illustrated in big endian order instead. This4961// has a different result - the bits would actually be reversed on a big4962// endian machine.49634964// Starting input (only half the elements are shown):4965// 89 ff 1d c0 00 10 99 334966uint8x16_t input = vreinterpretq_u8_m128i(a);49674968// Shift out everything but the sign bits with an unsigned shift right.4969//4970// Bytes of the vector::4971// 89 ff 1d c0 00 10 99 334972// \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)4973// | | | | | | | |4974// 01 01 00 01 00 00 01 004975//4976// Bits of first important lane(s):4977// 10001001 (89)4978// \______4979// |4980// 00000001 (01)4981uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));49824983// Merge the even lanes together with a 16-bit unsigned shift right + add.4984// 'xx' represents garbage data which will be ignored in the final result.4985// In the important bytes, the add functions like a binary OR.4986//4987// 01 01 00 01 00 00 01 004988// \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))4989// \| \| \| \|4990// xx 03 xx 01 xx 00 xx 024991//4992// 00000001 00000001 (01 01)4993// \_______ |4994// \|4995// xxxxxxxx xxxxxx11 (xx 03)4996uint32x4_t paired16 =4997vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));49984999// Repeat with a wider 32-bit shift + add.5000// xx 03 xx 01 xx 00 xx 025001// \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>5002// 14))5003// \| \|5004// xx xx xx 0d xx xx xx 025005//5006// 00000011 00000001 (03 01)5007// \\_____ ||5008// '----.\||5009// xxxxxxxx xxxx1101 (xx 0d)5010uint64x2_t paired32 =5011vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));50125013// Last, an even wider 64-bit shift + add to get our result in the low 8 bit5014// lanes. 
xx xx xx 0d xx xx xx 025015// \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>5016// 28))5017// \|5018// xx xx xx xx xx xx xx d25019//5020// 00001101 00000010 (0d 02)5021// \ \___ | |5022// '---. \| |5023// xxxxxxxx 11010010 (xx d2)5024uint8x16_t paired64 =5025vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));50265027// Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.5028// xx xx xx xx xx xx xx d25029// || return paired64[0]5030// d25031// Note: Little endian would return the correct value 4b (01001011) instead.5032return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);5033}50345035// Set each bit of mask dst based on the most significant bit of the5036// corresponding packed double-precision (64-bit) floating-point element in a.5037// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd5038FORCE_INLINE int _mm_movemask_pd(__m128d a)5039{5040uint64x2_t input = vreinterpretq_u64_m128d(a);5041uint64x2_t high_bits = vshrq_n_u64(input, 63);5042return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);5043}50445045// Copy the lower 64-bit integer in a to dst.5046//5047// dst[63:0] := a[63:0]5048//5049// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi645050FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)5051{5052return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));5053}50545055// Copy the 64-bit integer a to the lower element of dst, and zero the upper5056// element.5057//5058// dst[63:0] := a[63:0]5059// dst[127:64] := 05060//5061// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi645062FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)5063{5064return vreinterpretq_m128i_s64(5065vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));5066}50675068// Multiply the low unsigned 32-bit integers from each packed 64-bit element in5069// a and b, and store the unsigned 64-bit results in dst.5070//5071// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)5072// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)5073FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)5074{5075// vmull_u32 upcasts instead of masking, so we downcast.5076uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));5077uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));5078return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));5079}50805081// Multiply packed double-precision (64-bit) floating-point elements in a and b,5082// and store the results in dst.5083// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd5084FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)5085{5086#if defined(__aarch64__)5087return vreinterpretq_m128d_f64(5088vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));5089#else5090double *da = (double *) &a;5091double *db = (double *) &b;5092double c[2];5093c[0] = da[0] * db[0];5094c[1] = da[1] * db[1];5095return vld1q_f32((float32_t *) c);5096#endif5097}50985099// Multiply the lower double-precision (64-bit) floating-point element in a and5100// b, store the result in the lower element of dst, and copy the upper element5101// from a to the upper element of dst.5102// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd5103FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)5104{5105return _mm_move_sd(a, _mm_mul_pd(a, b));5106}51075108// Multiply the low unsigned 32-bit integers from a and b, and store the5109// 
unsigned 64-bit result in dst.5110//5111// dst[63:0] := a[31:0] * b[31:0]5112//5113// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su325114FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)5115{5116return vreinterpret_m64_u64(vget_low_u64(5117vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));5118}51195120// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit5121// integers from b.5122//5123// r0 := (a0 * b0)[31:16]5124// r1 := (a1 * b1)[31:16]5125// ...5126// r7 := (a7 * b7)[31:16]5127//5128// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx5129FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)5130{5131/* FIXME: issue with large values because of result saturation */5132// int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),5133// vreinterpretq_s16_m128i(b)); /* =2*a*b */ return5134// vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));5135int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));5136int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));5137int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */5138int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));5139int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));5140int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */5141uint16x8x2_t r =5142vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));5143return vreinterpretq_m128i_u16(r.val[1]);5144}51455146// Multiply the packed unsigned 16-bit integers in a and b, producing5147// intermediate 32-bit integers, and store the high 16 bits of the intermediate5148// integers in dst.5149// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu165150FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)5151{5152uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));5153uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));5154uint32x4_t ab3210 = vmull_u16(a3210, b3210);5155#if defined(__aarch64__)5156uint32x4_t ab7654 =5157vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));5158uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),5159vreinterpretq_u16_u32(ab7654));5160return vreinterpretq_m128i_u16(r);5161#else5162uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));5163uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));5164uint32x4_t ab7654 = vmull_u16(a7654, b7654);5165uint16x8x2_t r =5166vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));5167return vreinterpretq_m128i_u16(r.val[1]);5168#endif5169}51705171// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or5172// unsigned 16-bit integers from b.5173//5174// r0 := (a0 * b0)[15:0]5175// r1 := (a1 * b1)[15:0]5176// ...5177// r7 := (a7 * b7)[15:0]5178//5179// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx5180FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)5181{5182return vreinterpretq_m128i_s16(5183vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));5184}51855186// Compute the bitwise OR of packed double-precision (64-bit) floating-point5187// elements in a and b, and store the results in dst.5188// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd5189FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)5190{5191return vreinterpretq_m128d_s64(5192vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));5193}51945195// Computes the bitwise OR of the 
128-bit value in a and the 128-bit value in b.5196//5197// r := a | b5198//5199// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx5200FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)5201{5202return vreinterpretq_m128i_s32(5203vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));5204}52055206// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and5207// saturates.5208// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx5209FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)5210{5211return vreinterpretq_m128i_s8(5212vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),5213vqmovn_s16(vreinterpretq_s16_m128i(b))));5214}52155216// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers5217// and saturates.5218//5219// r0 := SignedSaturate(a0)5220// r1 := SignedSaturate(a1)5221// r2 := SignedSaturate(a2)5222// r3 := SignedSaturate(a3)5223// r4 := SignedSaturate(b0)5224// r5 := SignedSaturate(b1)5225// r6 := SignedSaturate(b2)5226// r7 := SignedSaturate(b3)5227//5228// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx5229FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)5230{5231return vreinterpretq_m128i_s16(5232vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),5233vqmovn_s32(vreinterpretq_s32_m128i(b))));5234}52355236// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned5237// integers and saturates.5238//5239// r0 := UnsignedSaturate(a0)5240// r1 := UnsignedSaturate(a1)5241// ...5242// r7 := UnsignedSaturate(a7)5243// r8 := UnsignedSaturate(b0)5244// r9 := UnsignedSaturate(b1)5245// ...5246// r15 := UnsignedSaturate(b7)5247//5248// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx5249FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)5250{5251return vreinterpretq_m128i_u8(5252vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),5253vqmovun_s16(vreinterpretq_s16_m128i(b))));5254}52555256// Pause the processor. This is typically used in spin-wait loops and depending5257// on the x86 processor typical values are in the 40-100 cycle range. The5258// 'yield' instruction isn't a good fit because it's effectively a nop on most5259// Arm cores. 
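// --- Illustrative sketch (not from the original header) ---
// The saturating packs above clamp before narrowing: vqmovn_s16 limits each
// 16-bit lane to [-128, 127], which matches the SSE2 signed-saturation
// behaviour. The helper name is hypothetical.
FORCE_INLINE __m128i _sse2neon_demo_packs_saturation(void)
{
    __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(300));  // clamps to 127
    __m128i b = vreinterpretq_m128i_s16(vdupq_n_s16(-300)); // clamps to -128
    return _mm_packs_epi16(a, b); // low 8 bytes = 127, high 8 bytes = -128
}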
// Experience with several databases has shown an 'isb' is
// a reasonable approximation.
FORCE_INLINE void _mm_pause()
{
    __asm__ __volatile__("isb\n");
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce two
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of 64-bit elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
}

// Sets the 8 signed 16-bit integer values.
// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
                                   short i6,
                                   short i5,
                                   short i4,
                                   short i3,
                                   short i2,
                                   short i1,
                                   short i0)
{
    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
    return vreinterpretq_m128i_s16(vld1q_s16(data));
}

// Sets the 4 signed 32-bit integer values.
// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}

// Returns the __m128i structure with its two 64-bit integer values
// initialized to the values of the two 64-bit integers passed in.
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
{
    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
}

// Returns the __m128i structure with its two 64-bit integer values
// initialized to the values of the two 64-bit integers passed in.
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
}

// Sets the 16 signed 8-bit integer values.
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                  signed char b14,
                                  signed char b13,
                                  signed char b12,
                                  signed char b11,
                                  signed char b10,
                                  signed char b9,
                                  signed char b8,
                                  signed char b7,
                                  signed char b6,
                                  signed char b5,
                                  signed char b4,
                                  signed char b3,
                                  signed char b2,
                                  signed char b1,
                                  signed char b0)
{
    int8_t ALIGN_STRUCT(16)
        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}

// Set packed double-precision (64-bit) floating-point elements in dst with the
// supplied values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
{
    double ALIGN_STRUCT(16) data[2] = {e0, e1};
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
#else
    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
#endif
}

// Broadcast double-precision (64-bit) floating-point value a to all elements
of5358// dst.5359// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd15360#define _mm_set_pd1 _mm_set1_pd53615362// Copy double-precision (64-bit) floating-point element a to the lower element5363// of dst, and zero the upper element.5364// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd5365FORCE_INLINE __m128d _mm_set_sd(double a)5366{5367#if defined(__aarch64__)5368return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));5369#else5370return _mm_set_pd(0, a);5371#endif5372}53735374// Sets the 8 signed 16-bit integer values to w.5375//5376// r0 := w5377// r1 := w5378// ...5379// r7 := w5380//5381// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx5382FORCE_INLINE __m128i _mm_set1_epi16(short w)5383{5384return vreinterpretq_m128i_s16(vdupq_n_s16(w));5385}53865387// Sets the 4 signed 32-bit integer values to i.5388//5389// r0 := i5390// r1 := i5391// r2 := i5392// r3 := I5393//5394// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx5395FORCE_INLINE __m128i _mm_set1_epi32(int _i)5396{5397return vreinterpretq_m128i_s32(vdupq_n_s32(_i));5398}53995400// Sets the 2 signed 64-bit integer values to i.5401// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)5402FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)5403{5404return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));5405}54065407// Sets the 2 signed 64-bit integer values to i.5408// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x5409FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)5410{5411return vreinterpretq_m128i_s64(vdupq_n_s64(_i));5412}54135414// Sets the 16 signed 8-bit integer values to b.5415//5416// r0 := b5417// r1 := b5418// ...5419// r15 := b5420//5421// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx5422FORCE_INLINE __m128i _mm_set1_epi8(signed char w)5423{5424return vreinterpretq_m128i_s8(vdupq_n_s8(w));5425}54265427// Broadcast double-precision (64-bit) floating-point value a to all elements of5428// dst.5429// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd5430FORCE_INLINE __m128d _mm_set1_pd(double d)5431{5432#if defined(__aarch64__)5433return vreinterpretq_m128d_f64(vdupq_n_f64(d));5434#else5435return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));5436#endif5437}54385439// Sets the 8 signed 16-bit integer values in reverse order.5440//5441// Return Value5442// r0 := w05443// r1 := w15444// ...5445// r7 := w75446FORCE_INLINE __m128i _mm_setr_epi16(short w0,5447short w1,5448short w2,5449short w3,5450short w4,5451short w5,5452short w6,5453short w7)5454{5455int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};5456return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));5457}54585459// Sets the 4 signed 32-bit integer values in reverse order5460// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx5461FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)5462{5463int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};5464return vreinterpretq_m128i_s32(vld1q_s32(data));5465}54665467// Set packed 64-bit integers in dst with the supplied values in reverse order.5468// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi645469FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)5470{5471return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));5472}54735474// Sets 
the 16 signed 8-bit integer values in reverse order.5475// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx5476FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,5477signed char b1,5478signed char b2,5479signed char b3,5480signed char b4,5481signed char b5,5482signed char b6,5483signed char b7,5484signed char b8,5485signed char b9,5486signed char b10,5487signed char b11,5488signed char b12,5489signed char b13,5490signed char b14,5491signed char b15)5492{5493int8_t ALIGN_STRUCT(16)5494data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,5495(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,5496(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,5497(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};5498return (__m128i) vld1q_s8(data);5499}55005501// Set packed double-precision (64-bit) floating-point elements in dst with the5502// supplied values in reverse order.5503// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd5504FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)5505{5506return _mm_set_pd(e0, e1);5507}55085509// Return vector of type __m128d with all elements set to zero.5510// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd5511FORCE_INLINE __m128d _mm_setzero_pd(void)5512{5513#if defined(__aarch64__)5514return vreinterpretq_m128d_f64(vdupq_n_f64(0));5515#else5516return vreinterpretq_m128d_f32(vdupq_n_f32(0));5517#endif5518}55195520// Sets the 128-bit value to zero5521// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx5522FORCE_INLINE __m128i _mm_setzero_si128(void)5523{5524return vreinterpretq_m128i_s32(vdupq_n_s32(0));5525}55265527// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.5528// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx5529// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,5530// __constrange(0,255) int imm)5531#ifdef _sse2neon_shuffle5532#define _mm_shuffle_epi32(a, imm) \5533__extension__({ \5534int32x4_t _input = vreinterpretq_s32_m128i(a); \5535int32x4_t _shuf = \5536vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \5537((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \5538vreinterpretq_m128i_s32(_shuf); \5539})5540#else // generic5541#define _mm_shuffle_epi32(a, imm) \5542__extension__({ \5543__m128i ret; \5544switch (imm) { \5545case _MM_SHUFFLE(1, 0, 3, 2): \5546ret = _mm_shuffle_epi_1032((a)); \5547break; \5548case _MM_SHUFFLE(2, 3, 0, 1): \5549ret = _mm_shuffle_epi_2301((a)); \5550break; \5551case _MM_SHUFFLE(0, 3, 2, 1): \5552ret = _mm_shuffle_epi_0321((a)); \5553break; \5554case _MM_SHUFFLE(2, 1, 0, 3): \5555ret = _mm_shuffle_epi_2103((a)); \5556break; \5557case _MM_SHUFFLE(1, 0, 1, 0): \5558ret = _mm_shuffle_epi_1010((a)); \5559break; \5560case _MM_SHUFFLE(1, 0, 0, 1): \5561ret = _mm_shuffle_epi_1001((a)); \5562break; \5563case _MM_SHUFFLE(0, 1, 0, 1): \5564ret = _mm_shuffle_epi_0101((a)); \5565break; \5566case _MM_SHUFFLE(2, 2, 1, 1): \5567ret = _mm_shuffle_epi_2211((a)); \5568break; \5569case _MM_SHUFFLE(0, 1, 2, 2): \5570ret = _mm_shuffle_epi_0122((a)); \5571break; \5572case _MM_SHUFFLE(3, 3, 3, 2): \5573ret = _mm_shuffle_epi_3332((a)); \5574break; \5575case _MM_SHUFFLE(0, 0, 0, 0): \5576ret = _mm_shuffle_epi32_splat((a), 0); \5577break; \5578case _MM_SHUFFLE(1, 1, 1, 1): \5579ret = _mm_shuffle_epi32_splat((a), 1); \5580break; \5581case _MM_SHUFFLE(2, 2, 2, 2): \5582ret = _mm_shuffle_epi32_splat((a), 2); \5583break; \5584case _MM_SHUFFLE(3, 3, 3, 3): 
\5585ret = _mm_shuffle_epi32_splat((a), 3); \5586break; \5587default: \5588ret = _mm_shuffle_epi32_default((a), (imm)); \5589break; \5590} \5591ret; \5592})5593#endif55945595// Shuffle double-precision (64-bit) floating-point elements using the control5596// in imm8, and store the results in dst.5597//5598// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]5599// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]5600//5601// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd5602#ifdef _sse2neon_shuffle5603#define _mm_shuffle_pd(a, b, imm8) \5604vreinterpretq_m128d_s64( \5605vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \5606imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))5607#else5608#define _mm_shuffle_pd(a, b, imm8) \5609_mm_castsi128_pd(_mm_set_epi64x( \5610vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \5611vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))5612#endif56135614// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,5615// __constrange(0,255) int imm)5616#ifdef _sse2neon_shuffle5617#define _mm_shufflehi_epi16(a, imm) \5618__extension__({ \5619int16x8_t _input = vreinterpretq_s16_m128i(a); \5620int16x8_t _shuf = \5621vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \5622(((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \5623(((imm) >> 6) & 0x3) + 4); \5624vreinterpretq_m128i_s16(_shuf); \5625})5626#else // generic5627#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))5628#endif56295630// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,5631// __constrange(0,255) int imm)5632#ifdef _sse2neon_shuffle5633#define _mm_shufflelo_epi16(a, imm) \5634__extension__({ \5635int16x8_t _input = vreinterpretq_s16_m128i(a); \5636int16x8_t _shuf = vshuffleq_s16( \5637_input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \5638(((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \5639vreinterpretq_m128i_s16(_shuf); \5640})5641#else // generic5642#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))5643#endif56445645// Shift packed 16-bit integers in a left by count while shifting in zeros, and5646// store the results in dst.5647//5648// FOR j := 0 to 75649// i := j*165650// IF count[63:0] > 155651// dst[i+15:i] := 05652// ELSE5653// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])5654// FI5655// ENDFOR5656//5657// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi165658FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)5659{5660uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5661if (_sse2neon_unlikely(c & ~15))5662return _mm_setzero_si128();56635664int16x8_t vc = vdupq_n_s16((int16_t) c);5665return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));5666}56675668// Shift packed 32-bit integers in a left by count while shifting in zeros, and5669// store the results in dst.5670//5671// FOR j := 0 to 35672// i := j*325673// IF count[63:0] > 315674// dst[i+31:i] := 05675// ELSE5676// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])5677// FI5678// ENDFOR5679//5680// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi325681FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)5682{5683uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5684if (_sse2neon_unlikely(c & ~31))5685return _mm_setzero_si128();56865687int32x4_t vc = vdupq_n_s32((int32_t) c);5688return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), 
vc));5689}56905691// Shift packed 64-bit integers in a left by count while shifting in zeros, and5692// store the results in dst.5693//5694// FOR j := 0 to 15695// i := j*645696// IF count[63:0] > 635697// dst[i+63:i] := 05698// ELSE5699// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])5700// FI5701// ENDFOR5702//5703// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi645704FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)5705{5706uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5707if (_sse2neon_unlikely(c & ~63))5708return _mm_setzero_si128();57095710int64x2_t vc = vdupq_n_s64((int64_t) c);5711return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));5712}57135714// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and5715// store the results in dst.5716//5717// FOR j := 0 to 75718// i := j*165719// IF imm8[7:0] > 155720// dst[i+15:i] := 05721// ELSE5722// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])5723// FI5724// ENDFOR5725//5726// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi165727FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)5728{5729if (_sse2neon_unlikely(imm & ~15))5730return _mm_setzero_si128();5731return vreinterpretq_m128i_s16(5732vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));5733}57345735// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and5736// store the results in dst.5737//5738// FOR j := 0 to 35739// i := j*325740// IF imm8[7:0] > 315741// dst[i+31:i] := 05742// ELSE5743// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])5744// FI5745// ENDFOR5746//5747// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi325748FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)5749{5750if (_sse2neon_unlikely(imm & ~31))5751return _mm_setzero_si128();5752return vreinterpretq_m128i_s32(5753vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));5754}57555756// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and5757// store the results in dst.5758//5759// FOR j := 0 to 15760// i := j*645761// IF imm8[7:0] > 635762// dst[i+63:i] := 05763// ELSE5764// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])5765// FI5766// ENDFOR5767//5768// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi645769FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)5770{5771if (_sse2neon_unlikely(imm & ~63))5772return _mm_setzero_si128();5773return vreinterpretq_m128i_s64(5774vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));5775}57765777// Shift a left by imm8 bytes while shifting in zeros, and store the results in5778// dst.5779//5780// tmp := imm8[7:0]5781// IF tmp > 155782// tmp := 165783// FI5784// dst[127:0] := a[127:0] << (tmp*8)5785//5786// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si1285787#define _mm_slli_si128(a, imm) \5788__extension__({ \5789int8x16_t ret; \5790if (_sse2neon_unlikely(imm == 0)) \5791ret = vreinterpretq_s8_m128i(a); \5792else if (_sse2neon_unlikely((imm) & ~15)) \5793ret = vdupq_n_s8(0); \5794else \5795ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), \5796((imm <= 0 || imm > 15) ? 
0 : (16 - imm))); \5797vreinterpretq_m128i_s8(ret); \5798})57995800// Compute the square root of packed double-precision (64-bit) floating-point5801// elements in a, and store the results in dst.5802// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd5803FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)5804{5805#if defined(__aarch64__)5806return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));5807#else5808double a0 = sqrt(((double *) &a)[0]);5809double a1 = sqrt(((double *) &a)[1]);5810return _mm_set_pd(a1, a0);5811#endif5812}58135814// Compute the square root of the lower double-precision (64-bit) floating-point5815// element in b, store the result in the lower element of dst, and copy the5816// upper element from a to the upper element of dst.5817// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd5818FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)5819{5820#if defined(__aarch64__)5821return _mm_move_sd(a, _mm_sqrt_pd(b));5822#else5823return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));5824#endif5825}58265827// Shift packed 16-bit integers in a right by count while shifting in sign bits,5828// and store the results in dst.5829//5830// FOR j := 0 to 75831// i := j*165832// IF count[63:0] > 155833// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)5834// ELSE5835// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])5836// FI5837// ENDFOR5838//5839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi165840FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)5841{5842int64_t c = (int64_t) vget_low_s64((int64x2_t) count);5843if (_sse2neon_unlikely(c & ~15))5844return _mm_cmplt_epi16(a, _mm_setzero_si128());5845return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));5846}58475848// Shift packed 32-bit integers in a right by count while shifting in sign bits,5849// and store the results in dst.5850//5851// FOR j := 0 to 35852// i := j*325853// IF count[63:0] > 315854// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)5855// ELSE5856// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])5857// FI5858// ENDFOR5859//5860// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi325861FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)5862{5863int64_t c = (int64_t) vget_low_s64((int64x2_t) count);5864if (_sse2neon_unlikely(c & ~31))5865return _mm_cmplt_epi32(a, _mm_setzero_si128());5866return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));5867}58685869// Shift packed 16-bit integers in a right by imm8 while shifting in sign5870// bits, and store the results in dst.5871//5872// FOR j := 0 to 75873// i := j*165874// IF imm8[7:0] > 155875// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)5876// ELSE5877// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])5878// FI5879// ENDFOR5880//5881// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi165882FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)5883{5884const int count = (imm & ~15) ? 15 : imm;5885return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));5886}58875888// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,5889// and store the results in dst.5890//5891// FOR j := 0 to 35892// i := j*325893// IF imm8[7:0] > 315894// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0)5895// ELSE5896// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])5897// FI5898// ENDFOR5899//5900// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi325901// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)5902#define _mm_srai_epi32(a, imm) \5903__extension__({ \5904__m128i ret; \5905if (_sse2neon_unlikely((imm) == 0)) { \5906ret = a; \5907} else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \5908ret = vreinterpretq_m128i_s32( \5909vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \5910} else { \5911ret = vreinterpretq_m128i_s32( \5912vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \5913} \5914ret; \5915})59165917// Shift packed 16-bit integers in a right by count while shifting in zeros, and5918// store the results in dst.5919//5920// FOR j := 0 to 75921// i := j*165922// IF count[63:0] > 155923// dst[i+15:i] := 05924// ELSE5925// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])5926// FI5927// ENDFOR5928//5929// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi165930FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)5931{5932uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5933if (_sse2neon_unlikely(c & ~15))5934return _mm_setzero_si128();59355936int16x8_t vc = vdupq_n_s16(-(int16_t) c);5937return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));5938}59395940// Shift packed 32-bit integers in a right by count while shifting in zeros, and5941// store the results in dst.5942//5943// FOR j := 0 to 35944// i := j*325945// IF count[63:0] > 315946// dst[i+31:i] := 05947// ELSE5948// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])5949// FI5950// ENDFOR5951//5952// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi325953FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)5954{5955uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5956if (_sse2neon_unlikely(c & ~31))5957return _mm_setzero_si128();59585959int32x4_t vc = vdupq_n_s32(-(int32_t) c);5960return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));5961}59625963// Shift packed 64-bit integers in a right by count while shifting in zeros, and5964// store the results in dst.5965//5966// FOR j := 0 to 15967// i := j*645968// IF count[63:0] > 635969// dst[i+63:i] := 05970// ELSE5971// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])5972// FI5973// ENDFOR5974//5975// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi645976FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)5977{5978uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5979if (_sse2neon_unlikely(c & ~63))5980return _mm_setzero_si128();59815982int64x2_t vc = vdupq_n_s64(-(int64_t) c);5983return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));5984}59855986// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and5987// store the results in dst.5988//5989// FOR j := 0 to 75990// i := j*165991// IF imm8[7:0] > 155992// dst[i+15:i] := 05993// ELSE5994// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])5995// FI5996// ENDFOR5997//5998// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi165999#define _mm_srli_epi16(a, imm) \6000__extension__({ \6001__m128i ret; \6002if (_sse2neon_unlikely((imm) & ~15)) { \6003ret = _mm_setzero_si128(); \6004} else { \6005ret = vreinterpretq_m128i_u16( 
\6006vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \6007} \6008ret; \6009})60106011// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and6012// store the results in dst.6013//6014// FOR j := 0 to 36015// i := j*326016// IF imm8[7:0] > 316017// dst[i+31:i] := 06018// ELSE6019// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])6020// FI6021// ENDFOR6022//6023// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi326024// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)6025#define _mm_srli_epi32(a, imm) \6026__extension__({ \6027__m128i ret; \6028if (_sse2neon_unlikely((imm) & ~31)) { \6029ret = _mm_setzero_si128(); \6030} else { \6031ret = vreinterpretq_m128i_u32( \6032vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \6033} \6034ret; \6035})60366037// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and6038// store the results in dst.6039//6040// FOR j := 0 to 16041// i := j*646042// IF imm8[7:0] > 636043// dst[i+63:i] := 06044// ELSE6045// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])6046// FI6047// ENDFOR6048//6049// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi646050#define _mm_srli_epi64(a, imm) \6051__extension__({ \6052__m128i ret; \6053if (_sse2neon_unlikely((imm) & ~63)) { \6054ret = _mm_setzero_si128(); \6055} else { \6056ret = vreinterpretq_m128i_u64( \6057vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \6058} \6059ret; \6060})60616062// Shift a right by imm8 bytes while shifting in zeros, and store the results in6063// dst.6064//6065// tmp := imm8[7:0]6066// IF tmp > 156067// tmp := 166068// FI6069// dst[127:0] := a[127:0] >> (tmp*8)6070//6071// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si1286072#define _mm_srli_si128(a, imm) \6073__extension__({ \6074int8x16_t ret; \6075if (_sse2neon_unlikely((imm) & ~15)) \6076ret = vdupq_n_s8(0); \6077else \6078ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \6079(imm > 15 ? 0 : imm)); \6080vreinterpretq_m128i_s8(ret); \6081})60826083// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point6084// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary6085// or a general-protection exception may be generated.6086// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd6087FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)6088{6089#if defined(__aarch64__)6090vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));6091#else6092vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));6093#endif6094}60956096// Store the lower double-precision (64-bit) floating-point element from a into6097// 2 contiguous elements in memory. 
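// --- Illustrative sketch (not from the original header) ---
// Unlike NEON's vshlq, the SSE shift-by-register intrinsics produce zero as
// soon as the 64-bit count exceeds the element width, which is why the
// helpers above test `c & ~15`, `c & ~31`, and so on. The helper name is
// hypothetical.
FORCE_INLINE __m128i _sse2neon_demo_srl_oversized(__m128i a)
{
    __m128i count = _mm_set_epi64x(0, 17); // shift count 17 for 16-bit lanes
    return _mm_srl_epi16(a, count);        // every lane becomes zero
}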
mem_addr must be aligned on a 16-byte6098// boundary or a general-protection exception may be generated.6099// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd16100FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)6101{6102#if defined(__aarch64__)6103float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));6104vst1q_f64((float64_t *) mem_addr,6105vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));6106#else6107float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));6108vst1q_f32((float32_t *) mem_addr,6109vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));6110#endif6111}61126113// Store the lower double-precision (64-bit) floating-point element from a into6114// memory. mem_addr does not need to be aligned on any particular boundary.6115// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd6116FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)6117{6118#if defined(__aarch64__)6119vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));6120#else6121vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));6122#endif6123}61246125// Stores four 32-bit integer values as (as a __m128i value) at the address p.6126// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx6127FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)6128{6129vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));6130}61316132// Store the lower double-precision (64-bit) floating-point element from a into6133// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte6134// boundary or a general-protection exception may be generated.6135// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd6136#define _mm_store1_pd _mm_store_pd161376138// Store the upper double-precision (64-bit) floating-point element from a into6139// memory.6140//6141// MEM[mem_addr+63:mem_addr] := a[127:64]6142//6143// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd6144FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)6145{6146#if defined(__aarch64__)6147vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));6148#else6149vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));6150#endif6151}61526153// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.6154// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx6155FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)6156{6157vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));6158}61596160// Store the lower double-precision (64-bit) floating-point element from a into6161// memory.6162//6163// MEM[mem_addr+63:mem_addr] := a[63:0]6164//6165// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd6166FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)6167{6168#if defined(__aarch64__)6169vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));6170#else6171vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));6172#endif6173}61746175// Store 2 double-precision (64-bit) floating-point elements from a into memory6176// in reverse order. 
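// --- Illustrative sketch (not from the original header) ---
// _mm_storel_pd/_mm_storeh_pd above split a __m128d back into two scalars,
// mirroring the _mm_load_sd/_mm_loadh_pd pair earlier in this file. The
// helper name and parameters are hypothetical.
FORCE_INLINE void _sse2neon_demo_split_pd(__m128d v, double *lo, double *hi)
{
    _mm_storel_pd(lo, v); // *lo = v[63:0]
    _mm_storeh_pd(hi, v); // *hi = v[127:64]
}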
mem_addr must be aligned on a 16-byte boundary or a6177// general-protection exception may be generated.6178//6179// MEM[mem_addr+63:mem_addr] := a[127:64]6180// MEM[mem_addr+127:mem_addr+64] := a[63:0]6181//6182// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd6183FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)6184{6185float32x4_t f = vreinterpretq_f32_m128d(a);6186_mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));6187}61886189// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point6190// elements) from a into memory. mem_addr does not need to be aligned on any6191// particular boundary.6192// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd6193FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)6194{6195_mm_store_pd(mem_addr, a);6196}61976198// Stores 128-bits of integer data a at the address p.6199// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si1286200FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)6201{6202vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));6203}62046205// Stores 32-bits of integer data a at the address p.6206// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si326207FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)6208{6209vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);6210}62116212// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point6213// elements) from a into memory using a non-temporal memory hint. mem_addr must6214// be aligned on a 16-byte boundary or a general-protection exception may be6215// generated.6216// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd6217FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)6218{6219#if __has_builtin(__builtin_nontemporal_store)6220__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);6221#elif defined(__aarch64__)6222vst1q_f64(p, vreinterpretq_f64_m128d(a));6223#else6224vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));6225#endif6226}62276228// Stores the data in a to the address p without polluting the caches. If the6229// cache line containing address p is already in the cache, the cache will be6230// updated.6231// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx6232FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)6233{6234#if __has_builtin(__builtin_nontemporal_store)6235__builtin_nontemporal_store(a, p);6236#else6237vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));6238#endif6239}62406241// Store 32-bit integer a into memory using a non-temporal hint to minimize6242// cache pollution. If the cache line containing address mem_addr is already in6243// the cache, the cache will be updated.6244// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si326245FORCE_INLINE void _mm_stream_si32(int *p, int a)6246{6247vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);6248}62496250// Store 64-bit integer a into memory using a non-temporal hint to minimize6251// cache pollution. 
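// --- Illustrative sketch (not from the original header) ---
// Filling a buffer with the non-temporal stores above. On Arm the hint may
// degrade to a plain vst1q store, so correctness must not depend on it; a
// barrier afterwards preserves the ordering callers typically pair with
// streaming stores on x86. The helper name is hypothetical.
FORCE_INLINE void _sse2neon_demo_stream_fill(__m128i *dst,
                                             __m128i value,
                                             size_t n)
{
    for (size_t i = 0; i < n; i++)
        _mm_stream_si128(dst + i, value);
    _sse2neon_smp_mb(); // make the stores visible before continuing
}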
If the cache line containing address mem_addr is already in6252// the cache, the cache will be updated.6253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si646254FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)6255{6256vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));6257}62586259// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and6260// store the results in dst.6261// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi166262FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)6263{6264return vreinterpretq_m128i_s16(6265vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));6266}62676268// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or6269// unsigned 32-bit integers of a.6270//6271// r0 := a0 - b06272// r1 := a1 - b16273// r2 := a2 - b26274// r3 := a3 - b36275//6276// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx6277FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)6278{6279return vreinterpretq_m128i_s32(6280vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6281}62826283// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,6284// and store the results in dst.6285// r0 := a0 - b06286// r1 := a1 - b16287FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)6288{6289return vreinterpretq_m128i_s64(6290vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));6291}62926293// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and6294// store the results in dst.6295// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi86296FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)6297{6298return vreinterpretq_m128i_s8(6299vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));6300}63016302// Subtract packed double-precision (64-bit) floating-point elements in b from6303// packed double-precision (64-bit) floating-point elements in a, and store the6304// results in dst.6305//6306// FOR j := 0 to 16307// i := j*646308// dst[i+63:i] := a[i+63:i] - b[i+63:i]6309// ENDFOR6310//6311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd6312FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)6313{6314#if defined(__aarch64__)6315return vreinterpretq_m128d_f64(6316vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6317#else6318double *da = (double *) &a;6319double *db = (double *) &b;6320double c[2];6321c[0] = da[0] - db[0];6322c[1] = da[1] - db[1];6323return vld1q_f32((float32_t *) c);6324#endif6325}63266327// Subtract the lower double-precision (64-bit) floating-point element in b from6328// the lower double-precision (64-bit) floating-point element in a, store the6329// result in the lower element of dst, and copy the upper element from a to the6330// upper element of dst.6331// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd6332FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)6333{6334return _mm_move_sd(a, _mm_sub_pd(a, b));6335}63366337// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.6338//6339// dst[63:0] := a[63:0] - b[63:0]6340//6341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si646342FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)6343{6344return vreinterpret_m64_s64(6345vsub_s64(vreinterpret_s64_m64(a), 
        vreinterpret_s64_m64(b)));
}

// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
// of a and saturates.
//
//   r0 := SignedSaturate(a0 - b0)
//   r1 := SignedSaturate(a1 - b1)
//   ...
//   r7 := SignedSaturate(a7 - b7)
//
// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
// of a and saturates.
//
//   r0 := SignedSaturate(a0 - b0)
//   r1 := SignedSaturate(a1 - b1)
//   ...
//   r15 := SignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
// integers of a and saturates.
//
//   r0 := UnsignedSaturate(a0 - b0)
//   r1 := UnsignedSaturate(a1 - b1)
//   ...
//   r15 := UnsignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Return vector of type __m128d with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128d a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
// upper 4 signed or unsigned 16-bit integers in b.
//
//   r0 := a4
//   r1 := b4
//   r2 := a5
//   r3 := b5
//   r4 := a6
//   r5 := b6
//   r6 := a7
//   r7 := b7
//
// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
// upper 2 signed or
unsigned 32-bit integers in b.6452// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx6453FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)6454{6455#if defined(__aarch64__)6456return vreinterpretq_m128i_s32(6457vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6458#else6459int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));6460int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));6461int32x2x2_t result = vzip_s32(a1, b1);6462return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));6463#endif6464}64656466// Interleaves the upper signed or unsigned 64-bit integer in a with the6467// upper signed or unsigned 64-bit integer in b.6468//6469// r0 := a16470// r1 := b16471FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)6472{6473int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));6474int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));6475return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));6476}64776478// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper6479// 8 signed or unsigned 8-bit integers in b.6480//6481// r0 := a86482// r1 := b86483// r2 := a96484// r3 := b96485// ...6486// r14 := a156487// r15 := b156488//6489// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx6490FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)6491{6492#if defined(__aarch64__)6493return vreinterpretq_m128i_s8(6494vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));6495#else6496int8x8_t a1 =6497vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));6498int8x8_t b1 =6499vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));6500int8x8x2_t result = vzip_s8(a1, b1);6501return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));6502#endif6503}65046505// Unpack and interleave double-precision (64-bit) floating-point elements from6506// the high half of a and b, and store the results in dst.6507//6508// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {6509// dst[63:0] := src1[127:64]6510// dst[127:64] := src2[127:64]6511// RETURN dst[127:0]6512// }6513// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])6514//6515// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd6516FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)6517{6518#if defined(__aarch64__)6519return vreinterpretq_m128d_f64(6520vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6521#else6522return vreinterpretq_m128d_s64(6523vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),6524vget_high_s64(vreinterpretq_s64_m128d(b))));6525#endif6526}65276528// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the6529// lower 4 signed or unsigned 16-bit integers in b.6530//6531// r0 := a06532// r1 := b06533// r2 := a16534// r3 := b16535// r4 := a26536// r5 := b26537// r6 := a36538// r7 := b36539//6540// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx6541FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)6542{6543#if defined(__aarch64__)6544return vreinterpretq_m128i_s16(6545vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));6546#else6547int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));6548int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));6549int16x4x2_t result = vzip_s16(a1, b1);6550return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));6551#endif6552}65536554// Interleaves the lower 2 signed or unsigned 
32 - bit integers in a with the6555// lower 2 signed or unsigned 32 - bit integers in b.6556//6557// r0 := a06558// r1 := b06559// r2 := a16560// r3 := b16561//6562// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx6563FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)6564{6565#if defined(__aarch64__)6566return vreinterpretq_m128i_s32(6567vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6568#else6569int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));6570int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));6571int32x2x2_t result = vzip_s32(a1, b1);6572return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));6573#endif6574}65756576FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)6577{6578int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));6579int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));6580return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));6581}65826583// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower6584// 8 signed or unsigned 8-bit integers in b.6585//6586// r0 := a06587// r1 := b06588// r2 := a16589// r3 := b16590// ...6591// r14 := a76592// r15 := b76593//6594// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx6595FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)6596{6597#if defined(__aarch64__)6598return vreinterpretq_m128i_s8(6599vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));6600#else6601int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));6602int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));6603int8x8x2_t result = vzip_s8(a1, b1);6604return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));6605#endif6606}66076608// Unpack and interleave double-precision (64-bit) floating-point elements from6609// the low half of a and b, and store the results in dst.6610//6611// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {6612// dst[63:0] := src1[63:0]6613// dst[127:64] := src2[63:0]6614// RETURN dst[127:0]6615// }6616// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])6617//6618// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd6619FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)6620{6621#if defined(__aarch64__)6622return vreinterpretq_m128d_f64(6623vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6624#else6625return vreinterpretq_m128d_s64(6626vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),6627vget_low_s64(vreinterpretq_s64_m128d(b))));6628#endif6629}66306631// Compute the bitwise XOR of packed double-precision (64-bit) floating-point6632// elements in a and b, and store the results in dst.6633//6634// FOR j := 0 to 16635// i := j*646636// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]6637// ENDFOR6638//6639// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd6640FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)6641{6642return vreinterpretq_m128d_s64(6643veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));6644}66456646// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in6647// b. 
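// A common use of the bitwise XOR on floating-point data is flipping sign
// bits, for example (a sketch, identifiers arbitrary):
//
//   __m128d v = _mm_set_pd(3.0, -2.0);
//   __m128d negated = _mm_xor_pd(v, _mm_set1_pd(-0.0)); // low 2.0, high -3.0
//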
https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx6648FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)6649{6650return vreinterpretq_m128i_s32(6651veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6652}66536654/* SSE3 */66556656// Alternatively add and subtract packed double-precision (64-bit)6657// floating-point elements in a to/from packed elements in b, and store the6658// results in dst.6659//6660// FOR j := 0 to 16661// i := j*646662// IF ((j & 1) == 0)6663// dst[i+63:i] := a[i+63:i] - b[i+63:i]6664// ELSE6665// dst[i+63:i] := a[i+63:i] + b[i+63:i]6666// FI6667// ENDFOR6668//6669// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd6670FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)6671{6672_sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);6673#if defined(__aarch64__)6674return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),6675vreinterpretq_f64_m128d(b),6676vreinterpretq_f64_m128d(mask)));6677#else6678return _mm_add_pd(_mm_mul_pd(b, mask), a);6679#endif6680}66816682// Alternatively add and subtract packed single-precision (32-bit)6683// floating-point elements in a to/from packed elements in b, and store the6684// results in dst.6685// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps6686FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)6687{6688_sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);6689#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */6690return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),6691vreinterpretq_f32_m128(mask),6692vreinterpretq_f32_m128(b)));6693#else6694return _mm_add_ps(_mm_mul_ps(b, mask), a);6695#endif6696}66976698// Horizontally add adjacent pairs of double-precision (64-bit) floating-point6699// elements in a and b, and pack the results in dst.6700// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd6701FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)6702{6703#if defined(__aarch64__)6704return vreinterpretq_m128d_f64(6705vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6706#else6707double *da = (double *) &a;6708double *db = (double *) &b;6709double c[] = {da[0] + da[1], db[0] + db[1]};6710return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));6711#endif6712}67136714// Computes pairwise add of each argument as single-precision, floating-point6715// values a and b.6716// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx6717FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)6718{6719#if defined(__aarch64__)6720return vreinterpretq_m128_f32(6721vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));6722#else6723float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));6724float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));6725float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));6726float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));6727return vreinterpretq_m128_f32(6728vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));6729#endif6730}67316732// Horizontally subtract adjacent pairs of double-precision (64-bit)6733// floating-point elements in a and b, and pack the results in dst.6734// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd6735FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)6736{6737#if defined(__aarch64__)6738float64x2_t a = vreinterpretq_f64_m128d(_a);6739float64x2_t b = 
vreinterpretq_f64_m128d(_b);6740return vreinterpretq_m128d_f64(6741vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));6742#else6743double *da = (double *) &_a;6744double *db = (double *) &_b;6745double c[] = {da[0] - da[1], db[0] - db[1]};6746return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));6747#endif6748}67496750// Horizontally subtract adjacent pairs of single-precision (32-bit)6751// floating-point elements in a and b, and pack the results in dst.6752// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps6753FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)6754{6755float32x4_t a = vreinterpretq_f32_m128(_a);6756float32x4_t b = vreinterpretq_f32_m128(_b);6757#if defined(__aarch64__)6758return vreinterpretq_m128_f32(6759vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));6760#else6761float32x4x2_t c = vuzpq_f32(a, b);6762return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));6763#endif6764}67656766// Load 128-bits of integer data from unaligned memory into dst. This intrinsic6767// may perform better than _mm_loadu_si128 when the data crosses a cache line6768// boundary.6769//6770// dst[127:0] := MEM[mem_addr+127:mem_addr]6771//6772// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si1286773#define _mm_lddqu_si128 _mm_loadu_si12867746775// Load a double-precision (64-bit) floating-point element from memory into both6776// elements of dst.6777//6778// dst[63:0] := MEM[mem_addr+63:mem_addr]6779// dst[127:64] := MEM[mem_addr+63:mem_addr]6780//6781// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd6782#define _mm_loaddup_pd _mm_load1_pd67836784// Duplicate the low double-precision (64-bit) floating-point element from a,6785// and store the results in dst.6786// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd6787FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)6788{6789#if defined(__aarch64__)6790return vreinterpretq_m128d_f64(6791vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));6792#else6793return vreinterpretq_m128d_u64(6794vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));6795#endif6796}67976798// Duplicate odd-indexed single-precision (32-bit) floating-point elements6799// from a, and store the results in dst.6800// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps6801FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)6802{6803#if defined(__aarch64__)6804return vreinterpretq_m128_f32(6805vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));6806#elif defined(_sse2neon_shuffle)6807return vreinterpretq_m128_f32(vshuffleq_s32(6808vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));6809#else6810float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);6811float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);6812float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};6813return vreinterpretq_m128_f32(vld1q_f32(data));6814#endif6815}68166817// Duplicate even-indexed single-precision (32-bit) floating-point elements6818// from a, and store the results in dst.6819// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps6820FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)6821{6822#if defined(__aarch64__)6823return vreinterpretq_m128_f32(6824vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));6825#elif defined(_sse2neon_shuffle)6826return vreinterpretq_m128_f32(vshuffleq_s32(6827vreinterpretq_f32_m128(a), 
vreinterpretq_f32_m128(a), 0, 0, 2, 2));6828#else6829float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);6830float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);6831float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};6832return vreinterpretq_m128_f32(vld1q_f32(data));6833#endif6834}68356836/* SSSE3 */68376838// Compute the absolute value of packed signed 16-bit integers in a, and store6839// the unsigned results in dst.6840//6841// FOR j := 0 to 76842// i := j*166843// dst[i+15:i] := ABS(a[i+15:i])6844// ENDFOR6845//6846// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi166847FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)6848{6849return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));6850}68516852// Compute the absolute value of packed signed 32-bit integers in a, and store6853// the unsigned results in dst.6854//6855// FOR j := 0 to 36856// i := j*326857// dst[i+31:i] := ABS(a[i+31:i])6858// ENDFOR6859//6860// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi326861FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)6862{6863return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));6864}68656866// Compute the absolute value of packed signed 8-bit integers in a, and store6867// the unsigned results in dst.6868//6869// FOR j := 0 to 156870// i := j*86871// dst[i+7:i] := ABS(a[i+7:i])6872// ENDFOR6873//6874// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi86875FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)6876{6877return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));6878}68796880// Compute the absolute value of packed signed 16-bit integers in a, and store6881// the unsigned results in dst.6882//6883// FOR j := 0 to 36884// i := j*166885// dst[i+15:i] := ABS(a[i+15:i])6886// ENDFOR6887//6888// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi166889FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)6890{6891return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));6892}68936894// Compute the absolute value of packed signed 32-bit integers in a, and store6895// the unsigned results in dst.6896//6897// FOR j := 0 to 16898// i := j*326899// dst[i+31:i] := ABS(a[i+31:i])6900// ENDFOR6901//6902// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi326903FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)6904{6905return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));6906}69076908// Compute the absolute value of packed signed 8-bit integers in a, and store6909// the unsigned results in dst.6910//6911// FOR j := 0 to 76912// i := j*86913// dst[i+7:i] := ABS(a[i+7:i])6914// ENDFOR6915//6916// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi86917FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)6918{6919return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));6920}69216922// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift6923// the result right by imm8 bytes, and store the low 16 bytes in dst.6924//6925// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)6926// dst[127:0] := tmp[127:0]6927//6928// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi86929#define _mm_alignr_epi8(a, b, imm) \6930__extension__({ \6931uint8x16_t _a = vreinterpretq_u8_m128i(a); \6932uint8x16_t _b = vreinterpretq_u8_m128i(b); \6933__m128i ret; \6934if (_sse2neon_unlikely((imm) & 
~31)) \6935ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \6936else if (imm >= 16) \6937ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \6938else \6939ret = \6940vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \6941ret; \6942})69436944// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift6945// the result right by imm8 bytes, and store the low 8 bytes in dst.6946//6947// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)6948// dst[63:0] := tmp[63:0]6949//6950// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi86951#define _mm_alignr_pi8(a, b, imm) \6952__extension__({ \6953__m64 ret; \6954if (_sse2neon_unlikely((imm) >= 16)) { \6955ret = vreinterpret_m64_s8(vdup_n_s8(0)); \6956} else { \6957uint8x8_t tmp_low, tmp_high; \6958if ((imm) >= 8) { \6959const int idx = (imm) -8; \6960tmp_low = vreinterpret_u8_m64(a); \6961tmp_high = vdup_n_u8(0); \6962ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \6963} else { \6964const int idx = (imm); \6965tmp_low = vreinterpret_u8_m64(b); \6966tmp_high = vreinterpret_u8_m64(a); \6967ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \6968} \6969} \6970ret; \6971})69726973// Computes pairwise add of each argument as a 16-bit signed or unsigned integer6974// values a and b.6975FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)6976{6977int16x8_t a = vreinterpretq_s16_m128i(_a);6978int16x8_t b = vreinterpretq_s16_m128i(_b);6979#if defined(__aarch64__)6980return vreinterpretq_m128i_s16(vpaddq_s16(a, b));6981#else6982return vreinterpretq_m128i_s16(6983vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),6984vpadd_s16(vget_low_s16(b), vget_high_s16(b))));6985#endif6986}69876988// Computes pairwise add of each argument as a 32-bit signed or unsigned integer6989// values a and b.6990FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)6991{6992int32x4_t a = vreinterpretq_s32_m128i(_a);6993int32x4_t b = vreinterpretq_s32_m128i(_b);6994#if defined(__aarch64__)6995return vreinterpretq_m128i_s32(vpaddq_s32(a, b));6996#else6997return vreinterpretq_m128i_s32(6998vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),6999vpadd_s32(vget_low_s32(b), vget_high_s32(b))));7000#endif7001}70027003// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the7004// signed 16-bit results in dst.7005// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi167006FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)7007{7008return vreinterpret_m64_s16(7009vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));7010}70117012// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the7013// signed 32-bit results in dst.7014// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi327015FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)7016{7017return vreinterpret_m64_s32(7018vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));7019}70207021// Computes saturated pairwise sub of each argument as a 16-bit signed7022// integer values a and b.7023FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)7024{7025#if defined(__aarch64__)7026int16x8_t a = vreinterpretq_s16_m128i(_a);7027int16x8_t b = vreinterpretq_s16_m128i(_b);7028return vreinterpretq_s64_s16(7029vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));7030#else7031int32x4_t a = vreinterpretq_s32_m128i(_a);7032int32x4_t b = vreinterpretq_s32_m128i(_b);7033// Interleave using 
vshrn/vmovn7034// [a0|a2|a4|a6|b0|b2|b4|b6]7035// [a1|a3|a5|a7|b1|b3|b5|b7]7036int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));7037int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));7038// Saturated add7039return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));7040#endif7041}70427043// Horizontally add adjacent pairs of signed 16-bit integers in a and b using7044// saturation, and pack the signed 16-bit results in dst.7045// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi167046FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)7047{7048int16x4_t a = vreinterpret_s16_m64(_a);7049int16x4_t b = vreinterpret_s16_m64(_b);7050#if defined(__aarch64__)7051return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));7052#else7053int16x4x2_t res = vuzp_s16(a, b);7054return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));7055#endif7056}70577058// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack7059// the signed 16-bit results in dst.7060// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi167061FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)7062{7063int16x8_t a = vreinterpretq_s16_m128i(_a);7064int16x8_t b = vreinterpretq_s16_m128i(_b);7065#if defined(__aarch64__)7066return vreinterpretq_m128i_s16(7067vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));7068#else7069int16x8x2_t c = vuzpq_s16(a, b);7070return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));7071#endif7072}70737074// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack7075// the signed 32-bit results in dst.7076// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi327077FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)7078{7079int32x4_t a = vreinterpretq_s32_m128i(_a);7080int32x4_t b = vreinterpretq_s32_m128i(_b);7081#if defined(__aarch64__)7082return vreinterpretq_m128i_s32(7083vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));7084#else7085int32x4x2_t c = vuzpq_s32(a, b);7086return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));7087#endif7088}70897090// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack7091// the signed 16-bit results in dst.7092// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi167093FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)7094{7095int16x4_t a = vreinterpret_s16_m64(_a);7096int16x4_t b = vreinterpret_s16_m64(_b);7097#if defined(__aarch64__)7098return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));7099#else7100int16x4x2_t c = vuzp_s16(a, b);7101return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));7102#endif7103}71047105// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack7106// the signed 32-bit results in dst.7107// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi327108FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)7109{7110int32x2_t a = vreinterpret_s32_m64(_a);7111int32x2_t b = vreinterpret_s32_m64(_b);7112#if defined(__aarch64__)7113return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));7114#else7115int32x2x2_t c = vuzp_s32(a, b);7116return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));7117#endif7118}71197120// Computes saturated pairwise difference of each argument as a 16-bit signed7121// integer values a and b.7122// 
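// The pairing matches the other horizontal operations above: with
// a = [a0..a7] and b = [b0..b7] (16-bit lanes listed low to high), dst is
//
//   [sat(a0-a1), sat(a2-a3), sat(a4-a5), sat(a6-a7),
//    sat(b0-b1), sat(b2-b3), sat(b4-b5), sat(b6-b7)]
//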
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi167123FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)7124{7125int16x8_t a = vreinterpretq_s16_m128i(_a);7126int16x8_t b = vreinterpretq_s16_m128i(_b);7127#if defined(__aarch64__)7128return vreinterpretq_m128i_s16(7129vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));7130#else7131int16x8x2_t c = vuzpq_s16(a, b);7132return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));7133#endif7134}71357136// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b7137// using saturation, and pack the signed 16-bit results in dst.7138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi167139FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)7140{7141int16x4_t a = vreinterpret_s16_m64(_a);7142int16x4_t b = vreinterpret_s16_m64(_b);7143#if defined(__aarch64__)7144return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));7145#else7146int16x4x2_t c = vuzp_s16(a, b);7147return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));7148#endif7149}71507151// Vertically multiply each unsigned 8-bit integer from a with the corresponding7152// signed 8-bit integer from b, producing intermediate signed 16-bit integers.7153// Horizontally add adjacent pairs of intermediate signed 16-bit integers,7154// and pack the saturated results in dst.7155//7156// FOR j := 0 to 77157// i := j*167158// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +7159// a[i+7:i]*b[i+7:i] )7160// ENDFOR7161FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)7162{7163#if defined(__aarch64__)7164uint8x16_t a = vreinterpretq_u8_m128i(_a);7165int8x16_t b = vreinterpretq_s8_m128i(_b);7166int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),7167vmovl_s8(vget_low_s8(b)));7168int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),7169vmovl_s8(vget_high_s8(b)));7170return vreinterpretq_m128i_s16(7171vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));7172#else7173// This would be much simpler if x86 would choose to zero extend OR sign7174// extend, not both. 
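// Concretely, each byte of a is treated as unsigned and each byte of b as
// signed; a byte pair with a = 0xFF (255) and b = 0xFF (-1) contributes
// 255 * -1 = -255 to its 16-bit sum.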
This could probably be optimized better.7175uint16x8_t a = vreinterpretq_u16_m128i(_a);7176int16x8_t b = vreinterpretq_s16_m128i(_b);71777178// Zero extend a7179int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));7180int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));71817182// Sign extend by shifting left then shifting right.7183int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);7184int16x8_t b_odd = vshrq_n_s16(b, 8);71857186// multiply7187int16x8_t prod1 = vmulq_s16(a_even, b_even);7188int16x8_t prod2 = vmulq_s16(a_odd, b_odd);71897190// saturated add7191return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));7192#endif7193}71947195// Vertically multiply each unsigned 8-bit integer from a with the corresponding7196// signed 8-bit integer from b, producing intermediate signed 16-bit integers.7197// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and7198// pack the saturated results in dst.7199// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi167200FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)7201{7202uint16x4_t a = vreinterpret_u16_m64(_a);7203int16x4_t b = vreinterpret_s16_m64(_b);72047205// Zero extend a7206int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));7207int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));72087209// Sign extend by shifting left then shifting right.7210int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);7211int16x4_t b_odd = vshr_n_s16(b, 8);72127213// multiply7214int16x4_t prod1 = vmul_s16(a_even, b_even);7215int16x4_t prod2 = vmul_s16(a_odd, b_odd);72167217// saturated add7218return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));7219}72207221// Multiply packed signed 16-bit integers in a and b, producing intermediate7222// signed 32-bit integers. Shift right by 15 bits while rounding up, and store7223// the packed 16-bit integers in dst.7224//7225// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)7226// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)7227// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)7228// ...7229// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)7230FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)7231{7232// Has issues due to saturation7233// return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));72347235// Multiply7236int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),7237vget_low_s16(vreinterpretq_s16_m128i(b)));7238int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),7239vget_high_s16(vreinterpretq_s16_m128i(b)));72407241// Rounding narrowing shift right7242// narrow = (int16_t)((mul + 16384) >> 15);7243int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);7244int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);72457246// Join together7247return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));7248}72497250// Multiply packed signed 16-bit integers in a and b, producing intermediate7251// signed 32-bit integers. 
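// For instance (the rounding here matches _mm_mulhrs_epi16 above):
// a0 = 0x4000 (0.5 in Q15) times b0 = 0x2000 (0.25 in Q15) gives the 32-bit
// product 0x08000000; adding the 0x4000 rounding bias and shifting right by
// 15 yields 0x1000, i.e. 0.125 in Q15.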
Truncate each intermediate integer to the 18 most7252// significant bits, round by adding 1, and store bits [16:1] to dst.7253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi167254FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)7255{7256int32x4_t mul_extend =7257vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));72587259// Rounding narrowing shift right7260return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));7261}72627263// Shuffle packed 8-bit integers in a according to shuffle control mask in the7264// corresponding 8-bit element of b, and store the results in dst.7265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi87266FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)7267{7268int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a7269uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b7270uint8x16_t idx_masked =7271vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits7272#if defined(__aarch64__)7273return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));7274#elif defined(__GNUC__)7275int8x16_t ret;7276// %e and %f represent the even and odd D registers7277// respectively.7278__asm__ __volatile__(7279"vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"7280"vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"7281: [ret] "=&w"(ret)7282: [tbl] "w"(tbl), [idx] "w"(idx_masked));7283return vreinterpretq_m128i_s8(ret);7284#else7285// use this line if testing on aarch647286int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};7287return vreinterpretq_m128i_s8(7288vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),7289vtbl2_s8(a_split, vget_high_u8(idx_masked))));7290#endif7291}72927293// Shuffle packed 8-bit integers in a according to shuffle control mask in the7294// corresponding 8-bit element of b, and store the results in dst.7295//7296// FOR j := 0 to 77297// i := j*87298// IF b[i+7] == 17299// dst[i+7:i] := 07300// ELSE7301// index[2:0] := b[i+2:i]7302// dst[i+7:i] := a[index*8+7:index*8]7303// FI7304// ENDFOR7305//7306// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi87307FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)7308{7309const int8x8_t controlMask =7310vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));7311int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);7312return vreinterpret_m64_s8(res);7313}73147315// Negate packed 16-bit integers in a when the corresponding signed7316// 16-bit integer in b is negative, and store the results in dst.7317// Element in dst are zeroed out when the corresponding element7318// in b is zero.7319//7320// for i in 0..77321// if b[i] < 07322// r[i] := -a[i]7323// else if b[i] == 07324// r[i] := 07325// else7326// r[i] := a[i]7327// fi7328// done7329FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)7330{7331int16x8_t a = vreinterpretq_s16_m128i(_a);7332int16x8_t b = vreinterpretq_s16_m128i(_b);73337334// signed shift right: faster than vclt7335// (b < 0) ? 0xFFFF : 07336uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));7337// (b == 0) ? 
0xFFFF : 07338#if defined(__aarch64__)7339int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));7340#else7341int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));7342#endif73437344// bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative7345// 'a') based on ltMask7346int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);7347// res = masked & (~zeroMask)7348int16x8_t res = vbicq_s16(masked, zeroMask);7349return vreinterpretq_m128i_s16(res);7350}73517352// Negate packed 32-bit integers in a when the corresponding signed7353// 32-bit integer in b is negative, and store the results in dst.7354// Element in dst are zeroed out when the corresponding element7355// in b is zero.7356//7357// for i in 0..37358// if b[i] < 07359// r[i] := -a[i]7360// else if b[i] == 07361// r[i] := 07362// else7363// r[i] := a[i]7364// fi7365// done7366FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)7367{7368int32x4_t a = vreinterpretq_s32_m128i(_a);7369int32x4_t b = vreinterpretq_s32_m128i(_b);73707371// signed shift right: faster than vclt7372// (b < 0) ? 0xFFFFFFFF : 07373uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));73747375// (b == 0) ? 0xFFFFFFFF : 07376#if defined(__aarch64__)7377int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));7378#else7379int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));7380#endif73817382// bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative7383// 'a') based on ltMask7384int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);7385// res = masked & (~zeroMask)7386int32x4_t res = vbicq_s32(masked, zeroMask);7387return vreinterpretq_m128i_s32(res);7388}73897390// Negate packed 8-bit integers in a when the corresponding signed7391// 8-bit integer in b is negative, and store the results in dst.7392// Element in dst are zeroed out when the corresponding element7393// in b is zero.7394//7395// for i in 0..157396// if b[i] < 07397// r[i] := -a[i]7398// else if b[i] == 07399// r[i] := 07400// else7401// r[i] := a[i]7402// fi7403// done7404FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)7405{7406int8x16_t a = vreinterpretq_s8_m128i(_a);7407int8x16_t b = vreinterpretq_s8_m128i(_b);74087409// signed shift right: faster than vclt7410// (b < 0) ? 0xFF : 07411uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));74127413// (b == 0) ? 0xFF : 07414#if defined(__aarch64__)7415int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));7416#else7417int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));7418#endif74197420// bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')7421// based on ltMask7422int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);7423// res = masked & (~zeroMask)7424int8x16_t res = vbicq_s8(masked, zeroMask);74257426return vreinterpretq_m128i_s8(res);7427}74287429// Negate packed 16-bit integers in a when the corresponding signed 16-bit7430// integer in b is negative, and store the results in dst. 
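// For example (lane values chosen arbitrarily): with a = [1, -2, 3, 4] and
// b = [-1, 5, 0, -7], dst is [-1, -2, 0, -4].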
Element in dst are7431// zeroed out when the corresponding element in b is zero.7432//7433// FOR j := 0 to 37434// i := j*167435// IF b[i+15:i] < 07436// dst[i+15:i] := -(a[i+15:i])7437// ELSE IF b[i+15:i] == 07438// dst[i+15:i] := 07439// ELSE7440// dst[i+15:i] := a[i+15:i]7441// FI7442// ENDFOR7443//7444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi167445FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)7446{7447int16x4_t a = vreinterpret_s16_m64(_a);7448int16x4_t b = vreinterpret_s16_m64(_b);74497450// signed shift right: faster than vclt7451// (b < 0) ? 0xFFFF : 07452uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));74537454// (b == 0) ? 0xFFFF : 07455#if defined(__aarch64__)7456int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));7457#else7458int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));7459#endif74607461// bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')7462// based on ltMask7463int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);7464// res = masked & (~zeroMask)7465int16x4_t res = vbic_s16(masked, zeroMask);74667467return vreinterpret_m64_s16(res);7468}74697470// Negate packed 32-bit integers in a when the corresponding signed 32-bit7471// integer in b is negative, and store the results in dst. Element in dst are7472// zeroed out when the corresponding element in b is zero.7473//7474// FOR j := 0 to 17475// i := j*327476// IF b[i+31:i] < 07477// dst[i+31:i] := -(a[i+31:i])7478// ELSE IF b[i+31:i] == 07479// dst[i+31:i] := 07480// ELSE7481// dst[i+31:i] := a[i+31:i]7482// FI7483// ENDFOR7484//7485// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi327486FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)7487{7488int32x2_t a = vreinterpret_s32_m64(_a);7489int32x2_t b = vreinterpret_s32_m64(_b);74907491// signed shift right: faster than vclt7492// (b < 0) ? 0xFFFFFFFF : 07493uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));74947495// (b == 0) ? 0xFFFFFFFF : 07496#if defined(__aarch64__)7497int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));7498#else7499int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));7500#endif75017502// bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')7503// based on ltMask7504int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);7505// res = masked & (~zeroMask)7506int32x2_t res = vbic_s32(masked, zeroMask);75077508return vreinterpret_m64_s32(res);7509}75107511// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer7512// in b is negative, and store the results in dst. Element in dst are zeroed out7513// when the corresponding element in b is zero.7514//7515// FOR j := 0 to 77516// i := j*87517// IF b[i+7:i] < 07518// dst[i+7:i] := -(a[i+7:i])7519// ELSE IF b[i+7:i] == 07520// dst[i+7:i] := 07521// ELSE7522// dst[i+7:i] := a[i+7:i]7523// FI7524// ENDFOR7525//7526// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi87527FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)7528{7529int8x8_t a = vreinterpret_s8_m64(_a);7530int8x8_t b = vreinterpret_s8_m64(_b);75317532// signed shift right: faster than vclt7533// (b < 0) ? 0xFF : 07534uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));75357536// (b == 0) ? 
0xFF : 07537#if defined(__aarch64__)7538int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));7539#else7540int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));7541#endif75427543// bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')7544// based on ltMask7545int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);7546// res = masked & (~zeroMask)7547int8x8_t res = vbic_s8(masked, zeroMask);75487549return vreinterpret_m64_s8(res);7550}75517552/* SSE4.1 */75537554// Blend packed 16-bit integers from a and b using control mask imm8, and store7555// the results in dst.7556//7557// FOR j := 0 to 77558// i := j*167559// IF imm8[j]7560// dst[i+15:i] := b[i+15:i]7561// ELSE7562// dst[i+15:i] := a[i+15:i]7563// FI7564// ENDFOR7565// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,7566// __constrange(0,255) int imm)7567#define _mm_blend_epi16(a, b, imm) \7568__extension__({ \7569const uint16_t ones = 0xffff; \7570const uint16_t zeros = 0x0000; \7571const uint16_t _mask[8] = {((imm) & (1 << 0)) ? ones : zeros, \7572((imm) & (1 << 1)) ? ones : zeros, \7573((imm) & (1 << 2)) ? ones : zeros, \7574((imm) & (1 << 3)) ? ones : zeros, \7575((imm) & (1 << 4)) ? ones : zeros, \7576((imm) & (1 << 5)) ? ones : zeros, \7577((imm) & (1 << 6)) ? ones : zeros, \7578((imm) & (1 << 7)) ? ones : zeros}; \7579uint16x8_t _mask_vec = vld1q_u16(_mask); \7580uint16x8_t _a = vreinterpretq_u16_m128i(a); \7581uint16x8_t _b = vreinterpretq_u16_m128i(b); \7582vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \7583})75847585// Blend packed double-precision (64-bit) floating-point elements from a and b7586// using control mask imm8, and store the results in dst.7587// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd7588#define _mm_blend_pd(a, b, imm) \7589__extension__({ \7590const uint64_t _mask[2] = { \7591((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \7592((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \7593uint64x2_t _mask_vec = vld1q_u64(_mask); \7594uint64x2_t _a = vreinterpretq_u64_m128d(a); \7595uint64x2_t _b = vreinterpretq_u64_m128d(b); \7596vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \7597})75987599// Blend packed single-precision (32-bit) floating-point elements from a and b7600// using mask, and store the results in dst.7601// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps7602FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)7603{7604const uint32_t ALIGN_STRUCT(16)7605data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,7606((imm8) & (1 << 1)) ? UINT32_MAX : 0,7607((imm8) & (1 << 2)) ? UINT32_MAX : 0,7608((imm8) & (1 << 3)) ? 
UINT32_MAX : 0};7609uint32x4_t mask = vld1q_u32(data);7610float32x4_t a = vreinterpretq_f32_m128(_a);7611float32x4_t b = vreinterpretq_f32_m128(_b);7612return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));7613}76147615// Blend packed 8-bit integers from a and b using mask, and store the results in7616// dst.7617//7618// FOR j := 0 to 157619// i := j*87620// IF mask[i+7]7621// dst[i+7:i] := b[i+7:i]7622// ELSE7623// dst[i+7:i] := a[i+7:i]7624// FI7625// ENDFOR7626FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)7627{7628// Use a signed shift right to create a mask with the sign bit7629uint8x16_t mask =7630vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));7631uint8x16_t a = vreinterpretq_u8_m128i(_a);7632uint8x16_t b = vreinterpretq_u8_m128i(_b);7633return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));7634}76357636// Blend packed double-precision (64-bit) floating-point elements from a and b7637// using mask, and store the results in dst.7638// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd7639FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)7640{7641uint64x2_t mask =7642vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));7643#if defined(__aarch64__)7644float64x2_t a = vreinterpretq_f64_m128d(_a);7645float64x2_t b = vreinterpretq_f64_m128d(_b);7646return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));7647#else7648uint64x2_t a = vreinterpretq_u64_m128d(_a);7649uint64x2_t b = vreinterpretq_u64_m128d(_b);7650return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));7651#endif7652}76537654// Blend packed single-precision (32-bit) floating-point elements from a and b7655// using mask, and store the results in dst.7656// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps7657FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)7658{7659// Use a signed shift right to create a mask with the sign bit7660uint32x4_t mask =7661vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));7662float32x4_t a = vreinterpretq_f32_m128(_a);7663float32x4_t b = vreinterpretq_f32_m128(_b);7664return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));7665}76667667// Round the packed double-precision (64-bit) floating-point elements in a up7668// to an integer value, and store the results as packed double-precision7669// floating-point elements in dst.7670// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd7671FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)7672{7673#if defined(__aarch64__)7674return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));7675#else7676double *f = (double *) &a;7677return _mm_set_pd(ceil(f[1]), ceil(f[0]));7678#endif7679}76807681// Round the packed single-precision (32-bit) floating-point elements in a up to7682// an integer value, and store the results as packed single-precision7683// floating-point elements in dst.7684// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps7685FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)7686{7687#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)7688return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));7689#else7690float *f = (float *) &a;7691return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));7692#endif7693}76947695// Round the lower double-precision (64-bit) floating-point element in b up to7696// an integer value, store the result as a 
double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_ceil_pd(b));
}

// Round the lower single-precision (32-bit) floating-point element in b up to
// an integer value, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst.
//
//   dst[31:0] := CEIL(b[31:0])
//   dst[127:32] := a[127:32]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_ceil_ps(b));
}

// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
#endif
}

// Converts the four signed 16-bit integers in the lower 64 bits to four signed
// 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
}

// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
{
    int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}

// Converts the two signed 32-bit integers in the lower 64 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
{
    return vreinterpretq_m128i_s64(
        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
}

// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    return vreinterpretq_m128i_s16(s16x8);
}

// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_s32(s32x4);
}

// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA
*/7786int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */7787int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */7788int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */7789return vreinterpretq_m128i_s64(s64x2);7790}77917792// Converts the four unsigned 16-bit integers in the lower 64 bits to four7793// unsigned 32-bit integers.7794FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)7795{7796return vreinterpretq_m128i_u32(7797vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));7798}77997800// Converts the two unsigned 16-bit integers in the lower 32 bits to two7801// unsigned 64-bit integers.7802FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)7803{7804uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */7805uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */7806uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */7807return vreinterpretq_m128i_u64(u64x2);7808}78097810// Converts the two unsigned 32-bit integers in the lower 64 bits to two7811// unsigned 64-bit integers.7812FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)7813{7814return vreinterpretq_m128i_u64(7815vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));7816}78177818// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,7819// and store the results in dst.7820// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi167821FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)7822{7823uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */7824uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */7825return vreinterpretq_m128i_u16(u16x8);7826}78277828// Converts the four unsigned 8-bit integers in the lower 32 bits to four7829// unsigned 32-bit integers.7830// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx7831FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)7832{7833uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */7834uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */7835uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */7836return vreinterpretq_m128i_u32(u32x4);7837}78387839// Converts the two unsigned 8-bit integers in the lower 16 bits to two7840// unsigned 64-bit integers.7841FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)7842{7843uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */7844uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */7845uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */7846uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */7847return vreinterpretq_m128i_u64(u64x2);7848}78497850// Conditionally multiply the packed double-precision (64-bit) floating-point7851// elements in a and b using the high 4 bits in imm8, sum the four products, and7852// conditionally store the sum in dst using the low 4 bits of imm8.7853// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd7854FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)7855{7856// Generate mask value from constant immediate bit value7857const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;7858const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;7859#if !SSE2NEON_PRECISE_DP7860const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;7861const int64_t bit5Mask = imm & 0x20 ? 
UINT64_MAX : 0;7862#endif7863// Conditional multiplication7864#if !SSE2NEON_PRECISE_DP7865__m128d mul = _mm_mul_pd(a, b);7866const __m128d mulMask =7867_mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));7868__m128d tmp = _mm_and_pd(mul, mulMask);7869#else7870#if defined(__aarch64__)7871double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *7872vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)7873: 0;7874double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *7875vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)7876: 0;7877#else7878double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;7879double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;7880#endif7881__m128d tmp = _mm_set_pd(d1, d0);7882#endif7883// Sum the products7884#if defined(__aarch64__)7885double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));7886#else7887double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);7888#endif7889// Conditionally store the sum7890const __m128d sumMask =7891_mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));7892__m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);7893return res;7894}78957896// Conditionally multiply the packed single-precision (32-bit) floating-point7897// elements in a and b using the high 4 bits in imm8, sum the four products,7898// and conditionally store the sum in dst using the low 4 bits of imm.7899// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps7900FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)7901{7902#if defined(__aarch64__)7903/* shortcuts */7904if (imm == 0xFF) {7905return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));7906}7907if (imm == 0x7F) {7908float32x4_t m = _mm_mul_ps(a, b);7909m[3] = 0;7910return _mm_set1_ps(vaddvq_f32(m));7911}7912#endif79137914float s = 0, c = 0;7915float32x4_t f32a = vreinterpretq_f32_m128(a);7916float32x4_t f32b = vreinterpretq_f32_m128(b);79177918/* To improve the accuracy of floating-point summation, Kahan algorithm7919* is used for each operation.7920*/7921if (imm & (1 << 4))7922_sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);7923if (imm & (1 << 5))7924_sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);7925if (imm & (1 << 6))7926_sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);7927if (imm & (1 << 7))7928_sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);7929s += c;79307931float32x4_t res = {7932(imm & 0x1) ? s : 0,7933(imm & 0x2) ? s : 0,7934(imm & 0x4) ? s : 0,7935(imm & 0x8) ? 
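// The _sse2neon_kadd_f32 calls above apply Kahan (compensated) summation: a
// running compensation term captures the low-order bits an ordinary add would
// discard, reducing the rounding error of the four-term sum. A scalar sketch
// of the standard algorithm (illustrative only; the helper defined earlier in
// this file follows the same idea):
//
//   void kahan_add(float *sum, float *comp, float y)
//   {
//       float t = *sum + (y - *comp);
//       *comp = (t - *sum) - (y - *comp);
//       *sum = t;
//   }
//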
s : 0,7936};7937return vreinterpretq_m128_f32(res);7938}79397940// Extracts the selected signed or unsigned 32-bit integer from a and zero7941// extends.7942// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)7943#define _mm_extract_epi32(a, imm) \7944vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))79457946// Extracts the selected signed or unsigned 64-bit integer from a and zero7947// extends.7948// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)7949#define _mm_extract_epi64(a, imm) \7950vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))79517952// Extracts the selected signed or unsigned 8-bit integer from a and zero7953// extends.7954// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)7955// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi87956#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))79577958// Extracts the selected single-precision (32-bit) floating-point from a.7959// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)7960#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))79617962// Round the packed double-precision (64-bit) floating-point elements in a down7963// to an integer value, and store the results as packed double-precision7964// floating-point elements in dst.7965// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd7966FORCE_INLINE __m128d _mm_floor_pd(__m128d a)7967{7968#if defined(__aarch64__)7969return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));7970#else7971double *f = (double *) &a;7972return _mm_set_pd(floor(f[1]), floor(f[0]));7973#endif7974}79757976// Round the packed single-precision (32-bit) floating-point elements in a down7977// to an integer value, and store the results as packed single-precision7978// floating-point elements in dst.7979// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps7980FORCE_INLINE __m128 _mm_floor_ps(__m128 a)7981{7982#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)7983return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));7984#else7985float *f = (float *) &a;7986return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));7987#endif7988}79897990// Round the lower double-precision (64-bit) floating-point element in b down to7991// an integer value, store the result as a double-precision floating-point7992// element in the lower element of dst, and copy the upper element from a to the7993// upper element of dst.7994// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd7995FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)7996{7997return _mm_move_sd(a, _mm_floor_pd(b));7998}79998000// Round the lower single-precision (32-bit) floating-point element in b down to8001// an integer value, store the result as a single-precision floating-point8002// element in the lower element of dst, and copy the upper 3 packed elements8003// from a to the upper elements of dst.8004//8005// dst[31:0] := FLOOR(b[31:0])8006// dst[127:32] := a[127:32]8007//8008// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss8009FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)8010{8011return _mm_move_ss(a, _mm_floor_ps(b));8012}80138014// Inserts the least significant 32 bits of b into the selected 32-bit integer8015// of a.8016// FORCE_INLINE 
__m128i _mm_insert_epi32(__m128i a, int b,8017// __constrange(0,4) int imm)8018#define _mm_insert_epi32(a, b, imm) \8019__extension__({ \8020vreinterpretq_m128i_s32( \8021vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \8022})80238024// Inserts the least significant 64 bits of b into the selected 64-bit integer8025// of a.8026// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,8027// __constrange(0,2) int imm)8028#define _mm_insert_epi64(a, b, imm) \8029__extension__({ \8030vreinterpretq_m128i_s64( \8031vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \8032})80338034// Inserts the least significant 8 bits of b into the selected 8-bit integer8035// of a.8036// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,8037// __constrange(0,16) int imm)8038#define _mm_insert_epi8(a, b, imm) \8039__extension__({ \8040vreinterpretq_m128i_s8( \8041vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \8042})80438044// Copy a to tmp, then insert a single-precision (32-bit) floating-point8045// element from b into tmp using the control in imm8. Store tmp to dst using8046// the mask in imm8 (elements are zeroed out when the corresponding bit is set).8047// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps8048#define _mm_insert_ps(a, b, imm8) \8049__extension__({ \8050float32x4_t tmp1 = \8051vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \8052vreinterpretq_f32_m128(a), 0); \8053float32x4_t tmp2 = \8054vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \8055((imm8 >> 4) & 0x3)); \8056const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \8057((imm8) & (1 << 1)) ? UINT32_MAX : 0, \8058((imm8) & (1 << 2)) ? UINT32_MAX : 0, \8059((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \8060uint32x4_t mask = vld1q_u32(data); \8061float32x4_t all_zeros = vdupq_n_f32(0); \8062\8063vreinterpretq_m128_f32( \8064vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \8065})80668067// epi versions of min/max8068// Computes the pariwise maximums of the four signed 32-bit integer values of a8069// and b.8070//8071// A 128-bit parameter that can be defined with the following equations:8072// r0 := (a0 > b0) ? a0 : b08073// r1 := (a1 > b1) ? a1 : b18074// r2 := (a2 > b2) ? a2 : b28075// r3 := (a3 > b3) ? 
a3 : b38076//8077// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx8078FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)8079{8080return vreinterpretq_m128i_s32(8081vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));8082}80838084// Compare packed signed 8-bit integers in a and b, and store packed maximum8085// values in dst.8086// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi88087FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)8088{8089return vreinterpretq_m128i_s8(8090vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));8091}80928093// Compare packed unsigned 16-bit integers in a and b, and store packed maximum8094// values in dst.8095// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu168096FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)8097{8098return vreinterpretq_m128i_u16(8099vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));8100}81018102// Compare packed unsigned 32-bit integers in a and b, and store packed maximum8103// values in dst.8104// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu328105FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)8106{8107return vreinterpretq_m128i_u32(8108vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));8109}81108111// Computes the pariwise minima of the four signed 32-bit integer values of a8112// and b.8113//8114// A 128-bit parameter that can be defined with the following equations:8115// r0 := (a0 < b0) ? a0 : b08116// r1 := (a1 < b1) ? a1 : b18117// r2 := (a2 < b2) ? a2 : b28118// r3 := (a3 < b3) ? a3 : b38119//8120// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx8121FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)8122{8123return vreinterpretq_m128i_s32(8124vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));8125}81268127// Compare packed signed 8-bit integers in a and b, and store packed minimum8128// values in dst.8129// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi88130FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)8131{8132return vreinterpretq_m128i_s8(8133vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));8134}81358136// Compare packed unsigned 16-bit integers in a and b, and store packed minimum8137// values in dst.8138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu168139FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)8140{8141return vreinterpretq_m128i_u16(8142vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));8143}81448145// Compare packed unsigned 32-bit integers in a and b, and store packed minimum8146// values in dst.8147// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu328148FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)8149{8150return vreinterpretq_m128i_u32(8151vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));8152}81538154// Horizontally compute the minimum amongst the packed unsigned 16-bit integers8155// in a, store the minimum and index in dst, and zero the remaining bits in dst.8156//8157// index[2:0] := 08158// min[15:0] := a[15:0]8159// FOR j := 0 to 78160// i := j*168161// IF a[i+15:i] < min[15:0]8162// index[2:0] := j8163// min[15:0] := a[i+15:i]8164// FI8165// ENDFOR8166// dst[15:0] := min[15:0]8167// dst[18:16] := index[2:0]8168// 
dst[127:19] := 08169//8170// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu168171FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)8172{8173__m128i dst;8174uint16_t min, idx = 0;8175#if defined(__aarch64__)8176// Find the minimum value8177min = vminvq_u16(vreinterpretq_u16_m128i(a));81788179// Get the index of the minimum value8180static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};8181uint16x8_t minv = vdupq_n_u16(min);8182uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));8183idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));8184#else8185// Find the minimum value8186__m64 tmp;8187tmp = vreinterpret_m64_u16(8188vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),8189vget_high_u16(vreinterpretq_u16_m128i(a))));8190tmp = vreinterpret_m64_u16(8191vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));8192tmp = vreinterpret_m64_u16(8193vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));8194min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);8195// Get the index of the minimum value8196int i;8197for (i = 0; i < 8; i++) {8198if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {8199idx = (uint16_t) i;8200break;8201}8202a = _mm_srli_si128(a, 2);8203}8204#endif8205// Generate result8206dst = _mm_setzero_si128();8207dst = vreinterpretq_m128i_u16(8208vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));8209dst = vreinterpretq_m128i_u16(8210vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));8211return dst;8212}82138214// Compute the sum of absolute differences (SADs) of quadruplets of unsigned8215// 8-bit integers in a compared to those in b, and store the 16-bit results in8216// dst. Eight SADs are performed using one quadruplet from b and eight8217// quadruplets from a. One quadruplet is selected from b starting at on the8218// offset specified in imm8. 
Eight quadruplets are formed from sequential 8-bit8219// integers selected from a starting at the offset specified in imm8.8220// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu88221FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)8222{8223uint8x16_t _a, _b;82248225switch (imm & 0x4) {8226case 0:8227// do nothing8228_a = vreinterpretq_u8_m128i(a);8229break;8230case 4:8231_a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),8232vreinterpretq_u32_m128i(a), 1));8233break;8234default:8235#if defined(__GNUC__) || defined(__clang__)8236__builtin_unreachable();8237#endif8238break;8239}82408241switch (imm & 0x3) {8242case 0:8243_b = vreinterpretq_u8_u32(8244vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));8245break;8246case 1:8247_b = vreinterpretq_u8_u32(8248vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));8249break;8250case 2:8251_b = vreinterpretq_u8_u32(8252vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));8253break;8254case 3:8255_b = vreinterpretq_u8_u32(8256vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));8257break;8258default:8259#if defined(__GNUC__) || defined(__clang__)8260__builtin_unreachable();8261#endif8262break;8263}82648265int16x8_t c04, c15, c26, c37;8266uint8x8_t low_b = vget_low_u8(_b);8267c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));8268uint8x16_t _a_1 = vextq_u8(_a, _a, 1);8269c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));8270uint8x16_t _a_2 = vextq_u8(_a, _a, 2);8271c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));8272uint8x16_t _a_3 = vextq_u8(_a, _a, 3);8273c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));8274#if defined(__aarch64__)8275// |0|4|2|6|8276c04 = vpaddq_s16(c04, c26);8277// |1|5|3|7|8278c15 = vpaddq_s16(c15, c37);82798280int32x4_t trn1_c =8281vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));8282int32x4_t trn2_c =8283vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));8284return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),8285vreinterpretq_s16_s32(trn2_c)));8286#else8287int16x4_t c01, c23, c45, c67;8288c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));8289c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));8290c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));8291c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));82928293return vreinterpretq_m128i_s16(8294vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));8295#endif8296}82978298// Multiply the low signed 32-bit integers from each packed 64-bit element in8299// a and b, and store the signed 64-bit results in dst.8300//8301// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b08302// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b28303FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)8304{8305// vmull_s32 upcasts instead of masking, so we downcast.8306int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));8307int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));8308return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));8309}83108311// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or8312// unsigned 32-bit integers from b.8313// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx8314FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)8315{8316return vreinterpretq_m128i_s32(8317vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));8318}83198320// Packs the 8 unsigned 32-bit integers 
from a and b into unsigned 16-bit8321// integers and saturates.8322//8323// r0 := UnsignedSaturate(a0)8324// r1 := UnsignedSaturate(a1)8325// r2 := UnsignedSaturate(a2)8326// r3 := UnsignedSaturate(a3)8327// r4 := UnsignedSaturate(b0)8328// r5 := UnsignedSaturate(b1)8329// r6 := UnsignedSaturate(b2)8330// r7 := UnsignedSaturate(b3)8331FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)8332{8333return vreinterpretq_m128i_u16(8334vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),8335vqmovun_s32(vreinterpretq_s32_m128i(b))));8336}83378338// Round the packed double-precision (64-bit) floating-point elements in a using8339// the rounding parameter, and store the results as packed double-precision8340// floating-point elements in dst.8341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd8342FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)8343{8344#if defined(__aarch64__)8345switch (rounding) {8346case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):8347return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));8348case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):8349return _mm_floor_pd(a);8350case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):8351return _mm_ceil_pd(a);8352case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):8353return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));8354default: //_MM_FROUND_CUR_DIRECTION8355return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));8356}8357#else8358double *v_double = (double *) &a;83598360if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||8361(rounding == _MM_FROUND_CUR_DIRECTION &&8362_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {8363double res[2], tmp;8364for (int i = 0; i < 2; i++) {8365tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];8366double roundDown = floor(tmp); // Round down value8367double roundUp = ceil(tmp); // Round up value8368double diffDown = tmp - roundDown;8369double diffUp = roundUp - tmp;8370if (diffDown < diffUp) {8371/* If it's closer to the round down value, then use it */8372res[i] = roundDown;8373} else if (diffDown > diffUp) {8374/* If it's closer to the round up value, then use it */8375res[i] = roundUp;8376} else {8377/* If it's equidistant between round up and round down value,8378* pick the one which is an even number */8379double half = roundDown / 2;8380if (half != floor(half)) {8381/* If the round down value is odd, return the round up value8382*/8383res[i] = roundUp;8384} else {8385/* If the round up value is odd, return the round down value8386*/8387res[i] = roundDown;8388}8389}8390res[i] = (v_double[i] < 0) ? -res[i] : res[i];8391}8392return _mm_set_pd(res[1], res[0]);8393} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||8394(rounding == _MM_FROUND_CUR_DIRECTION &&8395_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {8396return _mm_floor_pd(a);8397} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||8398(rounding == _MM_FROUND_CUR_DIRECTION &&8399_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {8400return _mm_ceil_pd(a);8401}8402return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),8403v_double[0] > 0 ? 
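// Note: the nearest-int branch above emulates SSE's default "round to
// nearest, ties to even" mode without relying on vrndnq_f64. The magnitude is
// rounded and, when the value sits exactly halfway between two integers, the
// even neighbour is chosen: 2.5 rounds to 2.0, 3.5 rounds to 4.0, and -2.5
// rounds to -2.0 because the sign is stripped first and re-applied at the end.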
floor(v_double[0]) : ceil(v_double[0]));8404#endif8405}84068407// Round the packed single-precision (32-bit) floating-point elements in a using8408// the rounding parameter, and store the results as packed single-precision8409// floating-point elements in dst.8410// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps8411FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)8412{8413#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)8414switch (rounding) {8415case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):8416return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));8417case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):8418return _mm_floor_ps(a);8419case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):8420return _mm_ceil_ps(a);8421case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):8422return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));8423default: //_MM_FROUND_CUR_DIRECTION8424return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));8425}8426#else8427float *v_float = (float *) &a;84288429if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||8430(rounding == _MM_FROUND_CUR_DIRECTION &&8431_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {8432uint32x4_t signmask = vdupq_n_u32(0x80000000);8433float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),8434vdupq_n_f32(0.5f)); /* +/- 0.5 */8435int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(8436vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/8437int32x4_t r_trunc = vcvtq_s32_f32(8438vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */8439int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(8440vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */8441int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),8442vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */8443float32x4_t delta = vsubq_f32(8444vreinterpretq_f32_m128(a),8445vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */8446uint32x4_t is_delta_half =8447vceqq_f32(delta, half); /* delta == +/- 0.5 */8448return vreinterpretq_m128_f32(8449vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));8450} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||8451(rounding == _MM_FROUND_CUR_DIRECTION &&8452_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {8453return _mm_floor_ps(a);8454} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||8455(rounding == _MM_FROUND_CUR_DIRECTION &&8456_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {8457return _mm_ceil_ps(a);8458}8459return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),8460v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),8461v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),8462v_float[0] > 0 ? 
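// Walking the nearest-even path above with a = 2.5f: r_trunc = 2, plusone = 1
// (taken from the sign bit of -r_trunc), so r_even = (2 + 1) & ~1 = 2, while
// r_normal = (int) (2.5f + 0.5f) = 3. Since delta equals +0.5f, the final
// select picks r_even and the lane rounds to 2.0f, matching SSE's
// ties-to-even behaviour; non-halfway inputs simply fall through to r_normal.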
floorf(v_float[0]) : ceilf(v_float[0]));8463#endif8464}84658466// Round the lower double-precision (64-bit) floating-point element in b using8467// the rounding parameter, store the result as a double-precision floating-point8468// element in the lower element of dst, and copy the upper element from a to the8469// upper element of dst.8470// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd8471FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)8472{8473return _mm_move_sd(a, _mm_round_pd(b, rounding));8474}84758476// Round the lower single-precision (32-bit) floating-point element in b using8477// the rounding parameter, store the result as a single-precision floating-point8478// element in the lower element of dst, and copy the upper 3 packed elements8479// from a to the upper elements of dst. Rounding is done according to the8480// rounding[3:0] parameter, which can be one of:8481// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and8482// suppress exceptions8483// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and8484// suppress exceptions8485// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress8486// exceptions8487// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress8488// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see8489// _MM_SET_ROUNDING_MODE8490// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss8491FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)8492{8493return _mm_move_ss(a, _mm_round_ps(b, rounding));8494}84958496// Load 128-bits of integer data from memory into dst using a non-temporal8497// memory hint. mem_addr must be aligned on a 16-byte boundary or a8498// general-protection exception may be generated.8499//8500// dst[127:0] := MEM[mem_addr+127:mem_addr]8501//8502// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si1288503FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)8504{8505#if __has_builtin(__builtin_nontemporal_store)8506return __builtin_nontemporal_load(p);8507#else8508return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));8509#endif8510}85118512// Compute the bitwise NOT of a and then AND with a 128-bit vector containing8513// all 1's, and return 1 if the result is zero, otherwise return 0.8514// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones8515FORCE_INLINE int _mm_test_all_ones(__m128i a)8516{8517return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==8518~(uint64_t) 0;8519}85208521// Compute the bitwise AND of 128 bits (representing integer data) in a and8522// mask, and return 1 if the result is zero, otherwise return 0.8523// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros8524FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)8525{8526int64x2_t a_and_mask =8527vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));8528return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));8529}85308531// Compute the bitwise AND of 128 bits (representing integer data) in a and8532// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute8533// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is8534// zero, otherwise set CF to 0. 
Return 1 if both the ZF and CF values are zero,8535// otherwise return 0.8536// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero8537FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)8538{8539uint64x2_t zf =8540vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));8541uint64x2_t cf =8542vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));8543uint64x2_t result = vandq_u64(zf, cf);8544return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));8545}85468547// Compute the bitwise AND of 128 bits (representing integer data) in a and b,8548// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the8549// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,8550// otherwise set CF to 0. Return the CF value.8551// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si1288552FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)8553{8554int64x2_t s64 =8555vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));8556return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));8557}85588559// Compute the bitwise AND of 128 bits (representing integer data) in a and b,8560// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the8561// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,8562// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,8563// otherwise return 0.8564// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si1288565#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)85668567// Compute the bitwise AND of 128 bits (representing integer data) in a and b,8568// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the8569// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,8570// otherwise set CF to 0. 
Return the ZF value.8571// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si1288572FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)8573{8574int64x2_t s64 =8575vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));8576return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));8577}85788579/* SSE4.2 */85808581const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {85820x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,8583};8584const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {85850x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,85860x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,8587};85888589/* specify the source data format */8590#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */8591#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */8592#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */8593#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */85948595/* specify the comparison operation */8596#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */8597#define _SIDD_CMP_RANGES 0x04 /* compare ranges */8598#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */8599#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */86008601/* specify the polarity */8602#define _SIDD_POSITIVE_POLARITY 0x008603#define _SIDD_MASKED_POSITIVE_POLARITY 0x208604#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */8605#define _SIDD_MASKED_NEGATIVE_POLARITY \86060x30 /* negate results only before end of string */86078608/* specify the output selection in _mm_cmpXstri */8609#define _SIDD_LEAST_SIGNIFICANT 0x008610#define _SIDD_MOST_SIGNIFICANT 0x4086118612/* specify the output selection in _mm_cmpXstrm */8613#define _SIDD_BIT_MASK 0x008614#define _SIDD_UNIT_MASK 0x4086158616/* Pattern Matching for C macros.8617* https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms8618*/86198620/* catenate */8621#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__8622#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)86238624#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)8625/* run the 2nd parameter */8626#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__8627/* run the 1st parameter */8628#define SSE2NEON_IIF_1(t, ...) t86298630#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)8631#define SSE2NEON_COMPL_0 18632#define SSE2NEON_COMPL_1 086338634#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)8635#define SSE2NEON_DEC_1 08636#define SSE2NEON_DEC_2 18637#define SSE2NEON_DEC_3 28638#define SSE2NEON_DEC_4 38639#define SSE2NEON_DEC_5 48640#define SSE2NEON_DEC_6 58641#define SSE2NEON_DEC_7 68642#define SSE2NEON_DEC_8 78643#define SSE2NEON_DEC_9 88644#define SSE2NEON_DEC_10 98645#define SSE2NEON_DEC_11 108646#define SSE2NEON_DEC_12 118647#define SSE2NEON_DEC_13 128648#define SSE2NEON_DEC_14 138649#define SSE2NEON_DEC_15 148650#define SSE2NEON_DEC_16 1586518652/* detection */8653#define SSE2NEON_CHECK_N(x, n, ...) n8654#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )8655#define SSE2NEON_PROBE(x) x, 1,86568657#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))8658#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)86598660#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))8661#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))86628663#define SSE2NEON_EAT(...)8664#define SSE2NEON_EXPAND(...) 
__VA_ARGS__8665#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)86668667/* recursion */8668/* deferred expression */8669#define SSE2NEON_EMPTY()8670#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()8671#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()8672#define SSE2NEON_EXPAND(...) __VA_ARGS__86738674#define SSE2NEON_EVAL(...) \8675SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))8676#define SSE2NEON_EVAL1(...) \8677SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))8678#define SSE2NEON_EVAL2(...) \8679SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))8680#define SSE2NEON_EVAL3(...) __VA_ARGS__86818682#define SSE2NEON_REPEAT(count, macro, ...) \8683SSE2NEON_WHEN(count) \8684(SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \8685SSE2NEON_DEC(count), macro, \8686__VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \8687__VA_ARGS__))8688#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT86898690#define SSE2NEON_SIZE_OF_byte 88691#define SSE2NEON_NUMBER_OF_LANES_byte 168692#define SSE2NEON_SIZE_OF_word 168693#define SSE2NEON_NUMBER_OF_LANES_word 886948695#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \8696mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \8697vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \8698vreinterpretq_##type##_m128i(a)));86998700#define SSE2NEON_FILL_LANE(i, type) \8701vec_b[i] = \8702vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));87038704#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \8705number_of_lanes, byte_or_word) \8706do { \8707SSE2NEON_CAT( \8708data_type_prefix, \8709SSE2NEON_CAT(size, \8710SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \8711vec_b[number_of_lanes]; \8712__m128i mask = SSE2NEON_IIF(byte_or_word)( \8713vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \8714vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \8715SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \8716SSE2NEON_CAT(type_prefix, size))) \8717for (int i = 0; i < number_of_lanes; i++) { \8718mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \8719size)(SSE2NEON_CAT(vbslq_u, size)( \8720SSE2NEON_CAT(vreinterpretq_u, \8721SSE2NEON_CAT(size, _m128i))(mask), \8722SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \8723vec_b[i], \8724SSE2NEON_CAT( \8725vreinterpretq_, \8726SSE2NEON_CAT(type_prefix, \8727SSE2NEON_CAT(size, _m128i(a))))), \8728SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \8729vec_b[i], \8730SSE2NEON_CAT( \8731vreinterpretq_, \8732SSE2NEON_CAT(type_prefix, \8733SSE2NEON_CAT(size, _m128i(a))))))); \8734} \8735} while (0)87368737#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \8738do { \8739SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \8740SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \8741SSE2NEON_CAT(u, size))) \8742} while (0)87438744#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \8745static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \8746int lb) \8747{ \8748__m128i mtx[16]; \8749PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8750SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \8751return SSE2NEON_CAT( \8752_sse2neon_aggregate_equal_any_, \8753SSE2NEON_CAT( \8754SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8755SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \8756type))))(la, lb, mtx); \8757}87588759#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \8760static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \8761int lb) \8762{ 
\8763__m128i mtx[16]; \8764PCMPSTR_RANGES( \8765a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8766SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \8767return SSE2NEON_CAT( \8768_sse2neon_aggregate_ranges_, \8769SSE2NEON_CAT( \8770SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8771SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \8772type))))(la, lb, mtx); \8773}87748775#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \8776static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \8777__m128i b, int lb) \8778{ \8779__m128i mtx[16]; \8780PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8781SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \8782return SSE2NEON_CAT( \8783_sse2neon_aggregate_equal_ordered_, \8784SSE2NEON_CAT( \8785SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8786SSE2NEON_CAT(x, \8787SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \8788SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \8789}87908791static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])8792{8793int res = 0;8794int m = (1 << la) - 1;8795uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);8796uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);8797uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);8798uint8x16_t vec = vcombine_u8(t_lo, t_hi);8799for (int j = 0; j < lb; j++) {8800mtx[j] = vreinterpretq_m128i_u8(8801vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));8802mtx[j] = vreinterpretq_m128i_u8(8803vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));8804int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;8805res |= (tmp << j);8806}8807return res;8808}88098810static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])8811{8812int res = 0;8813int m = (1 << la) - 1;8814uint16x8_t vec =8815vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));8816for (int j = 0; j < lb; j++) {8817mtx[j] = vreinterpretq_m128i_u16(8818vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));8819mtx[j] = vreinterpretq_m128i_u16(8820vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));8821int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;8822res |= (tmp << j);8823}8824return res;8825}88268827/* clang-format off */8828#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \8829prefix##IMPL(byte) \8830prefix##IMPL(word)8831/* clang-format on */88328833SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)88348835static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])8836{8837int res = 0;8838int m = (1 << la) - 1;8839uint16x8_t vec =8840vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));8841for (int j = 0; j < lb; j++) {8842mtx[j] = vreinterpretq_m128i_u16(8843vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));8844mtx[j] = vreinterpretq_m128i_u16(8845vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));8846__m128i tmp = vreinterpretq_m128i_u32(8847vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));8848uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),8849vreinterpretq_u32_m128i(tmp));8850#if defined(__aarch64__)8851int t = vaddvq_u32(vec_res) ? 
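// In the ranges aggregation here, operand a supplies (low, high) pairs in
// adjacent lanes and every character of b is tested against each pair: after
// PCMPSTR_RANGES the "b >= low" and "b <= high" results land in neighbouring
// lanes, so shifting each 32-bit group right by 16 (by 8 in the byte variant
// below) and AND-ing folds them into a single in-range bit per pair. For
// instance, with _SIDD_CMP_RANGES and a = {'a', 'z', 0, ...} this reduces to
// a lowercase-ASCII [a-z] test on each character of b.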
1 : 0;8852#else8853uint64x2_t sumh = vpaddlq_u32(vec_res);8854int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);8855#endif8856res |= (t << j);8857}8858return res;8859}88608861static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])8862{8863int res = 0;8864int m = (1 << la) - 1;8865uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);8866uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);8867uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);8868uint8x16_t vec = vcombine_u8(t_lo, t_hi);8869for (int j = 0; j < lb; j++) {8870mtx[j] = vreinterpretq_m128i_u8(8871vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));8872mtx[j] = vreinterpretq_m128i_u8(8873vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));8874__m128i tmp = vreinterpretq_m128i_u16(8875vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));8876uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),8877vreinterpretq_u16_m128i(tmp));8878int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;8879res |= (t << j);8880}8881return res;8882}88838884#define SSE2NEON_CMP_RANGES_IS_BYTE 18885#define SSE2NEON_CMP_RANGES_IS_WORD 088868887/* clang-format off */8888#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \8889prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \8890prefix##IMPL(byte, int, s, prefix##IS_BYTE) \8891prefix##IMPL(word, uint, u, prefix##IS_WORD) \8892prefix##IMPL(word, int, s, prefix##IS_WORD)8893/* clang-format on */88948895SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)88968897#undef SSE2NEON_CMP_RANGES_IS_BYTE8898#undef SSE2NEON_CMP_RANGES_IS_WORD88998900static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)8901{8902uint8x16_t mtx =8903vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));8904int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));8905int m1 = 0x10000 - (1 << la);8906int tb = 0x10000 - (1 << lb);8907uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;8908uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;8909vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);8910vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);8911vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);8912vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);8913vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);8914tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);8915tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);89168917res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));8918res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));8919res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);8920res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);8921res_lo = vand_u8(res_lo, vec_mask);8922res_hi = vand_u8(res_hi, vec_mask);89238924int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);8925return res;8926}89278928static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)8929{8930uint16x8_t mtx =8931vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));8932int m0 = (la < lb) ? 
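// As in the byte variant above, the m0/m1/tb masks implement the PCMPESTR
// "equal each" (strcmp-style) rule for lanes that lie past the end of a
// string: lanes beyond both la and lb are forced to compare equal, lanes
// beyond only one of the two lengths are forced to compare unequal, and only
// lanes below min(la, lb) keep the real vceqq result.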
0 : ((1 << la) - (1 << lb));8933int m1 = 0x100 - (1 << la);8934int tb = 0x100 - (1 << lb);8935uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);8936uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);8937uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);8938uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);8939mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);8940mtx = vbslq_u16(vec1, tmp, mtx);8941mtx = vandq_u16(mtx, vec_mask);8942return _sse2neon_vaddvq_u16(mtx);8943}89448945#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 18946#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 089478948#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \8949static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \8950int bound, int la, int lb, __m128i mtx[16]) \8951{ \8952int res = 0; \8953int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \8954uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \8955vld1_u##size(_sse2neon_cmpestr_mask##size##b), \8956vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \8957uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \8958vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \8959vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \8960vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \8961uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \8962uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \8963for (int j = 0; j < lb; j++) { \8964mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \8965vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \8966} \8967for (int j = lb; j < bound; j++) { \8968mtx[j] = vreinterpretq_m128i_u##size( \8969vbslq_u##size(vec1, vec_minusone, vec_zero)); \8970} \8971unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \8972(unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \8973for (int i = 0; i < bound; i++) { \8974int val = 1; \8975for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \8976val &= ptr[k * bound + j]; \8977res += val << i; \8978} \8979return res; \8980}89818982/* clang-format off */8983#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \8984prefix##IMPL(8, 16, prefix##IS_UBYTE) \8985prefix##IMPL(16, 8, prefix##IS_UWORD)8986/* clang-format on */89878988SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)89898990#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE8991#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD89928993/* clang-format off */8994#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \8995prefix##IMPL(byte) \8996prefix##IMPL(word)8997/* clang-format on */89988999SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)90009001#define SSE2NEON_CMPESTR_LIST \9002_(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \9003_(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \9004_(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \9005_(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \9006_(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \9007_(CMP_UWORD_RANGES, cmp_uword_ranges) \9008_(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \9009_(CMP_SWORD_RANGES, cmp_sword_ranges) \9010_(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \9011_(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \9012_(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \9013_(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \9014_(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \9015_(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \9016_(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \9017_(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)90189019enum 
{
#define _(name, func_suffix) name,
    SSE2NEON_CMPESTR_LIST
#undef _
};
typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
#define _(name, func_suffix) _sse2neon_##func_suffix,
    SSE2NEON_CMPESTR_LIST
#undef _
};

FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
{
    switch (imm8 & 0x30) {
    case _SIDD_NEGATIVE_POLARITY:
        res ^= 0xffffffff;
        break;
    case _SIDD_MASKED_NEGATIVE_POLARITY:
        res ^= (1 << lb) - 1;
        break;
    default:
        break;
    }

    return res & ((bound == 8) ? 0xFF : 0xFFFF);
}

FORCE_INLINE int _sse2neon_clz(unsigned int x)
{
#if _MSC_VER
    /* _BitScanReverse returns the index of the highest set bit, so the
     * leading-zero count is 31 minus that index. */
    DWORD cnt = 0;
    if (_BitScanReverse(&cnt, x))
        return 31 - cnt;
    return 32;
#else
    return x != 0 ? __builtin_clz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctz(unsigned int x)
{
#if _MSC_VER
    /* _BitScanForward returns the index of the lowest set bit, which is
     * exactly the trailing-zero count. */
    DWORD cnt = 0;
    if (_BitScanForward(&cnt, x))
        return cnt;
    return 32;
#else
    return x != 0 ? __builtin_ctz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
{
#if _MSC_VER
    unsigned long cnt;
#if defined(SSE2NEON_HAS_BITSCAN64) && \
    (defined(_M_AMD64) || defined(__x86_64__))
    if (_BitScanForward64(&cnt, x))
        return (int) (cnt);
#else
    if (_BitScanForward(&cnt, (unsigned long) (x)))
        return (int) cnt;
    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
        return (int) (cnt + 32);
#endif
    return 64;
#else
    return x != 0 ? __builtin_ctzll(x) : 64;
#endif
}

#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)

#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
    const int var = (imm & 0x01) ? 8 : 16

#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
    int tmp1 = la ^ (la >> 31);                  \
    la = tmp1 - (la >> 31);                      \
    int tmp2 = lb ^ (lb >> 31);                  \
    lb = tmp2 - (lb >> 31);                      \
    la = SSE2NEON_MIN(la, bound);                \
    lb = SSE2NEON_MIN(lb, bound)

// Compare all pairs of characters in strings a and b, then aggregate the
// result. As the only difference between PCMPESTR* and PCMPISTR* is the way
// the string lengths are obtained, we use SSE2NEON_CMP{I,E}STRX_LEN_PAIR to
// get the lengths of a and b.
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)

#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
    return (r2 == 0) ? bound                            \
                     : ((imm8 & 0x40) ? 
(31 - _sse2neon_clz(r2)) \9118: _sse2neon_ctz(r2))91199120#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \9121__m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \9122if (imm8 & 0x40) { \9123if (bound == 8) { \9124uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \9125vld1q_u16(_sse2neon_cmpestr_mask16b)); \9126dst = vreinterpretq_m128i_u16(vbslq_u16( \9127tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \9128} else { \9129uint8x16_t vec_r2 = \9130vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \9131uint8x16_t tmp = \9132vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \9133dst = vreinterpretq_m128i_u8( \9134vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \9135} \9136} else { \9137if (bound == 16) { \9138dst = vreinterpretq_m128i_u16( \9139vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \9140} else { \9141dst = vreinterpretq_m128i_u8( \9142vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \9143} \9144} \9145return dst91469147// Compare packed strings in a and b with lengths la and lb using the control9148// in imm8, and returns 1 if b did not contain a null character and the9149// resulting mask was zero, and 0 otherwise.9150// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra9151FORCE_INLINE int _mm_cmpestra(__m128i a,9152int la,9153__m128i b,9154int lb,9155const int imm8)9156{9157int lb_cpy = lb;9158SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9159return !r2 & (lb_cpy > bound);9160}91619162// Compare packed strings in a and b with lengths la and lb using the control in9163// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.9164// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc9165FORCE_INLINE int _mm_cmpestrc(__m128i a,9166int la,9167__m128i b,9168int lb,9169const int imm8)9170{9171SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9172return r2 != 0;9173}91749175// Compare packed strings in a and b with lengths la and lb using the control9176// in imm8, and store the generated index in dst.9177// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri9178FORCE_INLINE int _mm_cmpestri(__m128i a,9179int la,9180__m128i b,9181int lb,9182const int imm8)9183{9184SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9185SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);9186}91879188// Compare packed strings in a and b with lengths la and lb using the control9189// in imm8, and store the generated mask in dst.9190// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm9191FORCE_INLINE __m128i9192_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)9193{9194SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9195SSE2NEON_CMPSTR_GENERATE_MASK(dst);9196}91979198// Compare packed strings in a and b with lengths la and lb using the control in9199// imm8, and returns bit 0 of the resulting bit mask.9200// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro9201FORCE_INLINE int _mm_cmpestro(__m128i a,9202int la,9203__m128i b,9204int lb,9205const int imm8)9206{9207SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9208return r2 & 1;9209}92109211// Compare packed strings in a and b with lengths la and lb using the control in9212// imm8, and returns 1 if any character in a was null, and 0 otherwise.9213// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs9214FORCE_INLINE int _mm_cmpestrs(__m128i a,9215int 
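// A typical use of _mm_cmpestri above is a bounded "find any of" scan. A
// hypothetical sketch (the names `vowels` and `text` are illustrative, and
// `text` is assumed to point at 16 readable bytes):
//
//   const char vowels[16] = "aeiou";
//   __m128i set = _mm_loadu_si128((const __m128i *) vowels);
//   __m128i chunk = _mm_loadu_si128((const __m128i *) text);
//   int idx = _mm_cmpestri(set, 5, chunk, 16,
//                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
//                          _SIDD_LEAST_SIGNIFICANT);
//   // idx is the offset of the first vowel in the chunk, or 16 if none
//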
la,9216__m128i b,9217int lb,9218const int imm8)9219{9220SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9221return la <= (bound - 1);9222}92239224// Compare packed strings in a and b with lengths la and lb using the control in9225// imm8, and returns 1 if any character in b was null, and 0 otherwise.9226// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz9227FORCE_INLINE int _mm_cmpestrz(__m128i a,9228int la,9229__m128i b,9230int lb,9231const int imm8)9232{9233SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9234return lb <= (bound - 1);9235}92369237#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \9238do { \9239if (imm8 & 0x01) { \9240uint16x8_t equal_mask_##str = \9241vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \9242uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \9243uint64_t matches_##str = \9244vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \9245len = _sse2neon_ctzll(matches_##str) >> 3; \9246} else { \9247uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \9248vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \9249uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \9250uint64_t matches_##str = \9251vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \9252len = _sse2neon_ctzll(matches_##str) >> 2; \9253} \9254} while (0)92559256#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \9257int la, lb; \9258do { \9259SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \9260SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \9261} while (0)92629263// Compare packed strings with implicit lengths in a and b using the control in9264// imm8, and returns 1 if b did not contain a null character and the resulting9265// mask was zero, and 0 otherwise.9266// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra9267FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)9268{9269SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9270return !r2 & (lb >= bound);9271}92729273// Compare packed strings with implicit lengths in a and b using the control in9274// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.9275// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc9276FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)9277{9278SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9279return r2 != 0;9280}92819282// Compare packed strings with implicit lengths in a and b using the control in9283// imm8, and store the generated index in dst.9284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri9285FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)9286{9287SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9288SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);9289}92909291// Compare packed strings with implicit lengths in a and b using the control in9292// imm8, and store the generated mask in dst.9293// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm9294FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)9295{9296SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9297SSE2NEON_CMPSTR_GENERATE_MASK(dst);9298}92999300// Compare packed strings with implicit lengths in a and b using the control in9301// imm8, and returns bit 0 of the resulting bit mask.9302// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro9303FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)9304{9305SSE2NEON_COMP_AGG(a, b, la, lb, 
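// SSE2NEON_CMPISTRX_LENGTH (expanded here through SSE2NEON_COMP_AGG) finds
// the implicit string length with the NEON narrowing-shift movemask trick:
// vceqq sets a whole lane per NUL character, vshrn_n_u16(..., 4) packs that
// into 4 bits per byte (8 bits per 16-bit character) of a single 64-bit
// value, and the trailing-zero count divided by 4 (or 8) is the index of the
// first terminator. E.g. for the bytes "ab\0..." the packed mask has its
// lowest set bit at position 8, so the computed length is 8 >> 2 == 2.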
imm8, CMPISTRX);9306return r2 & 1;9307}93089309// Compare packed strings with implicit lengths in a and b using the control in9310// imm8, and returns 1 if any character in a was null, and 0 otherwise.9311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs9312FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)9313{9314SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9315int la;9316SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);9317return la <= (bound - 1);9318}93199320// Compare packed strings with implicit lengths in a and b using the control in9321// imm8, and returns 1 if any character in b was null, and 0 otherwise.9322// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz9323FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)9324{9325SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9326int lb;9327SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);9328return lb <= (bound - 1);9329}93309331// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers9332// in b for greater than.9333FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)9334{9335#if defined(__aarch64__)9336return vreinterpretq_m128i_u64(9337vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));9338#else9339return vreinterpretq_m128i_s64(vshrq_n_s64(9340vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),934163));9342#endif9343}93449345// Starting with the initial value in crc, accumulates a CRC32 value for9346// unsigned 16-bit integer v.9347// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)9348FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)9349{9350#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9351__asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"9352: [c] "+r"(crc)9353: [v] "r"(v));9354#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)9355crc = __crc32ch(crc, v);9356#else9357crc = _mm_crc32_u8(crc, v & 0xff);9358crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);9359#endif9360return crc;9361}93629363// Starting with the initial value in crc, accumulates a CRC32 value for9364// unsigned 32-bit integer v.9365// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)9366FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)9367{9368#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9369__asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"9370: [c] "+r"(crc)9371: [v] "r"(v));9372#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)9373crc = __crc32cw(crc, v);9374#else9375crc = _mm_crc32_u16(crc, v & 0xffff);9376crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);9377#endif9378return crc;9379}93809381// Starting with the initial value in crc, accumulates a CRC32 value for9382// unsigned 64-bit integer v.9383// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)9384FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)9385{9386#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9387__asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"9388: [c] "+r"(crc)9389: [v] "r"(v));9390#else9391crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);9392crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);9393#endif9394return crc;9395}93969397// Starting with the initial value in crc, accumulates a CRC32 value for9398// unsigned 8-bit integer v.9399// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)9400FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)9401{9402#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9403__asm__ 
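// The constant 0x82f63b78 in the bit-by-bit fallback below is the reflected
// form of the CRC-32C (Castagnoli) polynomial 0x1EDC6F41, i.e. the polynomial
// the SSE4.2 crc32 instruction implements. A sketch of typical usage, with
// the ~0 seed and final inversion used by iSCSI-style CRC-32C (other
// protocols may differ):
//
//   uint32_t crc = ~0u;
//   for (size_t i = 0; i < len; i++)
//       crc = _mm_crc32_u8(crc, buf[i]);
//   crc = ~crc;
//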

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_SBOX(w) \
    { \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16) \
    }
#define SSE2NEON_AES_RSBOX(w) \
    { \
        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
        w(0x55), w(0x21), w(0x0c), w(0x7d) \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

/* x_time function and matrix multiply function */
#if !defined(__aarch64__)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define SSE2NEON_MULTIPLY(x, y)                                  \
    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif

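// Worked example (informative only): in GF(2^8) with the AES reduction
// polynomial, {57} * {13} = {fe} (FIPS-197, section 4.2.1). With the helpers
// above, SSE2NEON_XT(0x57) evaluates to 0xae, and
// SSE2NEON_MULTIPLY(0x57, 0x13) evaluates to 0x57 ^ 0xae ^ 0x07 = 0xfe in its
// low eight bits. Note that the macros do not mask intermediate results to
// eight bits; the callers below store the results into uint8_t, which
// discards the upper bits.
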
// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // The 256-byte S-box is split into four 64-byte tables. The first lookup
    // uses vqtbl4q_u8(); the remaining ones use vqtbx4q_u8(), which leaves
    // lanes whose index is out of range untouched. Because each successive
    // table starts 64 bytes further into the S-box, the index vector is
    // biased down by the same offset (e.g. 'w - 0x40') so that only the bytes
    // belonging to that table are in range.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // this generates a table containing every possible permutation of
    // shift_rows() and sub_bytes() with mix_columns().
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // finish the modulo addition step in mix_columns()
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // inverse mix columns
    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimize for NEON */
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    // inverse mix columns
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

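// Usage sketch (illustrative, not part of sse2neon): encrypting one 16-byte
// block with an already expanded AES-128 key schedule (11 round keys), exactly
// as with the x86 AES-NI intrinsics. The helper name and the 'rk' array are
// hypothetical; _mm_aesenclast_si128(), used for the final round, is defined
// just below.
#if 0
static __m128i sse2neon_example_aes128_encrypt_block(__m128i block,
                                                     const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]); /* initial AddRoundKey */
    for (int round = 1; round < 10; round++)
        block = _mm_aesenc_si128(block, rk[round]); /* rounds 1..9 */
    return _mm_aesenclast_si128(block, rk[10]); /* final round, no MixColumns */
}
#endif
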
// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A implementation */
    uint8_t v[16] = {
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
    };

    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimize for NEON */
    uint8_t v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (int i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

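// Usage sketch (illustrative, not part of sse2neon): decrypting one 16-byte
// block with the AES "Equivalent Inverse Cipher", mirroring x86 AES-NI usage.
// The hypothetical 'drk' array holds the encryption round keys in reverse
// order, with keys 1..9 first passed through _mm_aesimc_si128() (defined just
// below): drk[0] = rk[10], drk[i] = _mm_aesimc_si128(rk[10 - i]) for
// i = 1..9, and drk[10] = rk[0].
#if 0
static __m128i sse2neon_example_aes128_decrypt_block(__m128i block,
                                                     const __m128i drk[11])
{
    block = _mm_xor_si128(block, drk[0]); /* initial AddRoundKey */
    for (int round = 1; round < 10; round++)
        block = _mm_aesdec_si128(block, drk[round]);
    return _mm_aesdeclast_si128(block, drk[10]);
}
#endif
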
// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
#if defined(__aarch64__)
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
    uint8x16_t v = vreinterpretq_u8_m128i(a);
    uint8x16_t w;

    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    return vreinterpretq_m128i_u8(w);

#else /* ARMv7-A NEON implementation */
    uint8_t i, e, f, g, h, v[4][4];
    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
#endif
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
#if defined(__aarch64__)
    uint8x16_t _a = vreinterpretq_u8_m128i(a);
    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

    uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
    uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
    uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
                             vreinterpretq_u32_u8(v));
    uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
    uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

    return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));

#else /* ARMv7-A NEON implementation */
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if !defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
#endif

#else /* __ARM_FEATURE_CRYPTO */
// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
// and AESMC, then manually applying the real round key as an XOR operation.
// This unfortunately costs one extra XOR; the compiler should be able to
// optimize it away for repeated calls, however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(
               vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
           vreinterpretq_u8_m128i(RoundKey);
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
#endif

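// Usage sketch (illustrative, not part of sse2neon): expanding a 128-bit AES
// key into the 11 round keys consumed by the block-cipher sketches above,
// following the usual x86 AES-NI key-expansion pattern. The names are
// hypothetical; the round constant must be a compile-time immediate, hence
// the macro wrapper.
#if 0
static __m128i sse2neon_example_key_step(__m128i key, __m128i keygened)
{
    /* broadcast the word that aeskeygenassist produced for X3 ... */
    keygened = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3, 3, 3, 3));
    /* ... and fold it into the previous round key */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, keygened);
}
#define SSE2NEON_EXAMPLE_EXPAND(k, rcon) \
    sse2neon_example_key_step((k), _mm_aeskeygenassist_si128((k), (rcon)))
/* rk[0] = user key, rk[1] = SSE2NEON_EXAMPLE_EXPAND(rk[0], 0x01),
 * rk[2] = SSE2NEON_EXAMPLE_EXPAND(rk[1], 0x02), ..., rk[10] uses rcon 0x36. */
#endif
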
/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}

FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__)
    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
#else
__volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */10070#endif10071}1007210073// Return the current 64-bit value of the processor's time-stamp counter.10074// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc1007510076FORCE_INLINE uint64_t _rdtsc(void)10077{10078#if defined(__aarch64__)10079uint64_t val;1008010081/* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the10082* system counter is at least 56 bits wide; from Armv8.6, the counter10083* must be 64 bits wide. So the system counter could be less than 6410084* bits wide and it is attributed with the flag 'cap_user_time_short'10085* is true.10086*/10087__asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));1008810089return val;10090#else10091uint32_t pmccntr, pmuseren, pmcntenset;10092// Read the user mode Performance Monitoring Unit (PMU)10093// User Enable Register (PMUSERENR) access permissions.10094__asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));10095if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.10096__asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));10097if (pmcntenset & 0x80000000UL) { // Is it counting?10098__asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));10099// The counter is set up to count every 64th cycle10100return (uint64_t) (pmccntr) << 6;10101}10102}1010310104// Fallback to syscall as we can't enable PMUSERENR in user mode.10105struct timeval tv;10106gettimeofday(&tv, NULL);10107return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;10108#endif10109}1011010111#if defined(__GNUC__) || defined(__clang__)10112#pragma pop_macro("ALIGN_STRUCT")10113#pragma pop_macro("FORCE_INLINE")10114#endif1011510116#if defined(__GNUC__) && !defined(__clang__)10117#pragma GCC pop_options10118#endif1011910120#endif101211012210123