Path: blob/master/tools/android-sdk/renderscript/clang-include/avxintrin.h
496 views
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===1*2* Permission is hereby granted, free of charge, to any person obtaining a copy3* of this software and associated documentation files (the "Software"), to deal4* in the Software without restriction, including without limitation the rights5* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell6* copies of the Software, and to permit persons to whom the Software is7* furnished to do so, subject to the following conditions:8*9* The above copyright notice and this permission notice shall be included in10* all copies or substantial portions of the Software.11*12* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR13* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,14* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE15* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER16* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,17* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN18* THE SOFTWARE.19*20*===-----------------------------------------------------------------------===21*/2223#ifndef __IMMINTRIN_H24#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."25#endif2627#ifndef __AVXINTRIN_H28#define __AVXINTRIN_H2930typedef double __v4df __attribute__ ((__vector_size__ (32)));31typedef float __v8sf __attribute__ ((__vector_size__ (32)));32typedef long long __v4di __attribute__ ((__vector_size__ (32)));33typedef int __v8si __attribute__ ((__vector_size__ (32)));34typedef short __v16hi __attribute__ ((__vector_size__ (32)));35typedef char __v32qi __attribute__ ((__vector_size__ (32)));3637/* Unsigned types */38typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));39typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));40typedef unsigned short __v16hu __attribute__ ((__vector_size__ 
(32)));41typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));4243/* We need an explicitly signed variant for char. Note that this shouldn't44* appear in the interface though. */45typedef signed char __v32qs __attribute__((__vector_size__(32)));4647typedef float __m256 __attribute__ ((__vector_size__ (32)));48typedef double __m256d __attribute__((__vector_size__(32)));49typedef long long __m256i __attribute__((__vector_size__(32)));5051/* Define the default attributes for the functions in this file. */52#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))5354/* Arithmetic */55/// \brief Adds two 256-bit vectors of [4 x double].56///57/// \headerfile <x86intrin.h>58///59/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.60///61/// \param __a62/// A 256-bit vector of [4 x double] containing one of the source operands.63/// \param __b64/// A 256-bit vector of [4 x double] containing one of the source operands.65/// \returns A 256-bit vector of [4 x double] containing the sums of both66/// operands.67static __inline __m256d __DEFAULT_FN_ATTRS68_mm256_add_pd(__m256d __a, __m256d __b)69{70return (__m256d)((__v4df)__a+(__v4df)__b);71}7273/// \brief Adds two 256-bit vectors of [8 x float].74///75/// \headerfile <x86intrin.h>76///77/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.78///79/// \param __a80/// A 256-bit vector of [8 x float] containing one of the source operands.81/// \param __b82/// A 256-bit vector of [8 x float] containing one of the source operands.83/// \returns A 256-bit vector of [8 x float] containing the sums of both84/// operands.85static __inline __m256 __DEFAULT_FN_ATTRS86_mm256_add_ps(__m256 __a, __m256 __b)87{88return (__m256)((__v8sf)__a+(__v8sf)__b);89}9091/// \brief Subtracts two 256-bit vectors of [4 x double].92///93/// \headerfile <x86intrin.h>94///95/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.96///97/// \param __a98/// A 
256-bit vector of [4 x double] containing the minuend.99/// \param __b100/// A 256-bit vector of [4 x double] containing the subtrahend.101/// \returns A 256-bit vector of [4 x double] containing the differences between102/// both operands.103static __inline __m256d __DEFAULT_FN_ATTRS104_mm256_sub_pd(__m256d __a, __m256d __b)105{106return (__m256d)((__v4df)__a-(__v4df)__b);107}108109/// \brief Subtracts two 256-bit vectors of [8 x float].110///111/// \headerfile <x86intrin.h>112///113/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.114///115/// \param __a116/// A 256-bit vector of [8 x float] containing the minuend.117/// \param __b118/// A 256-bit vector of [8 x float] containing the subtrahend.119/// \returns A 256-bit vector of [8 x float] containing the differences between120/// both operands.121static __inline __m256 __DEFAULT_FN_ATTRS122_mm256_sub_ps(__m256 __a, __m256 __b)123{124return (__m256)((__v8sf)__a-(__v8sf)__b);125}126127/// \brief Adds the even-indexed values and subtracts the odd-indexed values of128/// two 256-bit vectors of [4 x double].129///130/// \headerfile <x86intrin.h>131///132/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.133///134/// \param __a135/// A 256-bit vector of [4 x double] containing the left source operand.136/// \param __b137/// A 256-bit vector of [4 x double] containing the right source operand.138/// \returns A 256-bit vector of [4 x double] containing the alternating sums139/// and differences between both operands.140static __inline __m256d __DEFAULT_FN_ATTRS141_mm256_addsub_pd(__m256d __a, __m256d __b)142{143return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);144}145146/// \brief Adds the even-indexed values and subtracts the odd-indexed values of147/// two 256-bit vectors of [8 x float].148///149/// \headerfile <x86intrin.h>150///151/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.152///153/// \param __a154/// A 256-bit vector of [8 
x float] containing the left source operand.155/// \param __b156/// A 256-bit vector of [8 x float] containing the right source operand.157/// \returns A 256-bit vector of [8 x float] containing the alternating sums and158/// differences between both operands.159static __inline __m256 __DEFAULT_FN_ATTRS160_mm256_addsub_ps(__m256 __a, __m256 __b)161{162return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);163}164165/// \brief Divides two 256-bit vectors of [4 x double].166///167/// \headerfile <x86intrin.h>168///169/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.170///171/// \param __a172/// A 256-bit vector of [4 x double] containing the dividend.173/// \param __b174/// A 256-bit vector of [4 x double] containing the divisor.175/// \returns A 256-bit vector of [4 x double] containing the quotients of both176/// operands.177static __inline __m256d __DEFAULT_FN_ATTRS178_mm256_div_pd(__m256d __a, __m256d __b)179{180return (__m256d)((__v4df)__a/(__v4df)__b);181}182183/// \brief Divides two 256-bit vectors of [8 x float].184///185/// \headerfile <x86intrin.h>186///187/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.188///189/// \param __a190/// A 256-bit vector of [8 x float] containing the dividend.191/// \param __b192/// A 256-bit vector of [8 x float] containing the divisor.193/// \returns A 256-bit vector of [8 x float] containing the quotients of both194/// operands.195static __inline __m256 __DEFAULT_FN_ATTRS196_mm256_div_ps(__m256 __a, __m256 __b)197{198return (__m256)((__v8sf)__a/(__v8sf)__b);199}200201/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater202/// of each pair of values.203///204/// \headerfile <x86intrin.h>205///206/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.207///208/// \param __a209/// A 256-bit vector of [4 x double] containing one of the operands.210/// \param __b211/// A 256-bit vector of [4 x double] containing one of the operands.212/// 
\returns A 256-bit vector of [4 x double] containing the maximum values213/// between both operands.214static __inline __m256d __DEFAULT_FN_ATTRS215_mm256_max_pd(__m256d __a, __m256d __b)216{217return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);218}219220/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater221/// of each pair of values.222///223/// \headerfile <x86intrin.h>224///225/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.226///227/// \param __a228/// A 256-bit vector of [8 x float] containing one of the operands.229/// \param __b230/// A 256-bit vector of [8 x float] containing one of the operands.231/// \returns A 256-bit vector of [8 x float] containing the maximum values232/// between both operands.233static __inline __m256 __DEFAULT_FN_ATTRS234_mm256_max_ps(__m256 __a, __m256 __b)235{236return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);237}238239/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser240/// of each pair of values.241///242/// \headerfile <x86intrin.h>243///244/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.245///246/// \param __a247/// A 256-bit vector of [4 x double] containing one of the operands.248/// \param __b249/// A 256-bit vector of [4 x double] containing one of the operands.250/// \returns A 256-bit vector of [4 x double] containing the minimum values251/// between both operands.252static __inline __m256d __DEFAULT_FN_ATTRS253_mm256_min_pd(__m256d __a, __m256d __b)254{255return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);256}257258/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser259/// of each pair of values.260///261/// \headerfile <x86intrin.h>262///263/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.264///265/// \param __a266/// A 256-bit vector of [8 x float] containing one of the operands.267/// \param __b268/// A 256-bit vector of [8 x float] 
containing one of the operands.269/// \returns A 256-bit vector of [8 x float] containing the minimum values270/// between both operands.271static __inline __m256 __DEFAULT_FN_ATTRS272_mm256_min_ps(__m256 __a, __m256 __b)273{274return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);275}276277/// \brief Multiplies two 256-bit vectors of [4 x double].278///279/// \headerfile <x86intrin.h>280///281/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.282///283/// \param __a284/// A 256-bit vector of [4 x double] containing one of the operands.285/// \param __b286/// A 256-bit vector of [4 x double] containing one of the operands.287/// \returns A 256-bit vector of [4 x double] containing the products of both288/// operands.289static __inline __m256d __DEFAULT_FN_ATTRS290_mm256_mul_pd(__m256d __a, __m256d __b)291{292return (__m256d)((__v4df)__a * (__v4df)__b);293}294295/// \brief Multiplies two 256-bit vectors of [8 x float].296///297/// \headerfile <x86intrin.h>298///299/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.300///301/// \param __a302/// A 256-bit vector of [8 x float] containing one of the operands.303/// \param __b304/// A 256-bit vector of [8 x float] containing one of the operands.305/// \returns A 256-bit vector of [8 x float] containing the products of both306/// operands.307static __inline __m256 __DEFAULT_FN_ATTRS308_mm256_mul_ps(__m256 __a, __m256 __b)309{310return (__m256)((__v8sf)__a * (__v8sf)__b);311}312313/// \brief Calculates the square roots of the values in a 256-bit vector of314/// [4 x double].315///316/// \headerfile <x86intrin.h>317///318/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.319///320/// \param __a321/// A 256-bit vector of [4 x double].322/// \returns A 256-bit vector of [4 x double] containing the square roots of the323/// values in the operand.324static __inline __m256d __DEFAULT_FN_ATTRS325_mm256_sqrt_pd(__m256d __a)326{327return 
(__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);328}329330/// \brief Calculates the square roots of the values in a 256-bit vector of331/// [8 x float].332///333/// \headerfile <x86intrin.h>334///335/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.336///337/// \param __a338/// A 256-bit vector of [8 x float].339/// \returns A 256-bit vector of [8 x float] containing the square roots of the340/// values in the operand.341static __inline __m256 __DEFAULT_FN_ATTRS342_mm256_sqrt_ps(__m256 __a)343{344return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);345}346347/// \brief Calculates the reciprocal square roots of the values in a 256-bit348/// vector of [8 x float].349///350/// \headerfile <x86intrin.h>351///352/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.353///354/// \param __a355/// A 256-bit vector of [8 x float].356/// \returns A 256-bit vector of [8 x float] containing the reciprocal square357/// roots of the values in the operand.358static __inline __m256 __DEFAULT_FN_ATTRS359_mm256_rsqrt_ps(__m256 __a)360{361return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);362}363364/// \brief Calculates the reciprocals of the values in a 256-bit vector of365/// [8 x float].366///367/// \headerfile <x86intrin.h>368///369/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.370///371/// \param __a372/// A 256-bit vector of [8 x float].373/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the374/// values in the operand.375static __inline __m256 __DEFAULT_FN_ATTRS376_mm256_rcp_ps(__m256 __a)377{378return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);379}380381/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified382/// by the byte operand. 
///    The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Macro form: V is cast to [4 x double] and M is forwarded unchanged to the
 * builtin inside a statement expression. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })

/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand.
///    The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Macro form: V is cast to [8 x float] and M is forwarded unchanged to the
 * builtin inside a statement expression. */
#define _mm256_round_ps(V, M) __extension__ ({ \
  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded up to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [8 x float].
The498/// source values are rounded down to integer values and returned as499/// floating-point values.500///501/// \headerfile <x86intrin.h>502///503/// \code504/// __m256 _mm256_floor_ps(__m256 V);505/// \endcode506///507/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.508///509/// \param V510/// A 256-bit vector of [8 x float].511/// \returns A 256-bit vector of [8 x float] containing the rounded down values.512#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)513514/* Logical */515/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].516///517/// \headerfile <x86intrin.h>518///519/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.520///521/// \param __a522/// A 256-bit vector of [4 x double] containing one of the source operands.523/// \param __b524/// A 256-bit vector of [4 x double] containing one of the source operands.525/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the526/// values between both operands.527static __inline __m256d __DEFAULT_FN_ATTRS528_mm256_and_pd(__m256d __a, __m256d __b)529{530return (__m256d)((__v4du)__a & (__v4du)__b);531}532533/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].534///535/// \headerfile <x86intrin.h>536///537/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.538///539/// \param __a540/// A 256-bit vector of [8 x float] containing one of the source operands.541/// \param __b542/// A 256-bit vector of [8 x float] containing one of the source operands.543/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the544/// values between both operands.545static __inline __m256 __DEFAULT_FN_ATTRS546_mm256_and_ps(__m256 __a, __m256 __b)547{548return (__m256)((__v8su)__a & (__v8su)__b);549}550551/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using552/// the one's complement of the values contained in the first source operand.553///554/// 
\headerfile <x86intrin.h>555///556/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.557///558/// \param __a559/// A 256-bit vector of [4 x double] containing the left source operand. The560/// one's complement of this value is used in the bitwise AND.561/// \param __b562/// A 256-bit vector of [4 x double] containing the right source operand.563/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the564/// values of the second operand and the one's complement of the first565/// operand.566static __inline __m256d __DEFAULT_FN_ATTRS567_mm256_andnot_pd(__m256d __a, __m256d __b)568{569return (__m256d)(~(__v4du)__a & (__v4du)__b);570}571572/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using573/// the one's complement of the values contained in the first source operand.574///575/// \headerfile <x86intrin.h>576///577/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.578///579/// \param __a580/// A 256-bit vector of [8 x float] containing the left source operand. 
The581/// one's complement of this value is used in the bitwise AND.582/// \param __b583/// A 256-bit vector of [8 x float] containing the right source operand.584/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the585/// values of the second operand and the one's complement of the first586/// operand.587static __inline __m256 __DEFAULT_FN_ATTRS588_mm256_andnot_ps(__m256 __a, __m256 __b)589{590return (__m256)(~(__v8su)__a & (__v8su)__b);591}592593/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].594///595/// \headerfile <x86intrin.h>596///597/// This intrinsic corresponds to the \c VORPD / ORPD instruction.598///599/// \param __a600/// A 256-bit vector of [4 x double] containing one of the source operands.601/// \param __b602/// A 256-bit vector of [4 x double] containing one of the source operands.603/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the604/// values between both operands.605static __inline __m256d __DEFAULT_FN_ATTRS606_mm256_or_pd(__m256d __a, __m256d __b)607{608return (__m256d)((__v4du)__a | (__v4du)__b);609}610611/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].612///613/// \headerfile <x86intrin.h>614///615/// This intrinsic corresponds to the \c VORPS / ORPS instruction.616///617/// \param __a618/// A 256-bit vector of [8 x float] containing one of the source operands.619/// \param __b620/// A 256-bit vector of [8 x float] containing one of the source operands.621/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the622/// values between both operands.623static __inline __m256 __DEFAULT_FN_ATTRS624_mm256_or_ps(__m256 __a, __m256 __b)625{626return (__m256)((__v8su)__a | (__v8su)__b);627}628629/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].630///631/// \headerfile <x86intrin.h>632///633/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.634///635/// \param __a636/// A 256-bit vector of [4 
x double] containing one of the source operands.637/// \param __b638/// A 256-bit vector of [4 x double] containing one of the source operands.639/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the640/// values between both operands.641static __inline __m256d __DEFAULT_FN_ATTRS642_mm256_xor_pd(__m256d __a, __m256d __b)643{644return (__m256d)((__v4du)__a ^ (__v4du)__b);645}646647/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].648///649/// \headerfile <x86intrin.h>650///651/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.652///653/// \param __a654/// A 256-bit vector of [8 x float] containing one of the source operands.655/// \param __b656/// A 256-bit vector of [8 x float] containing one of the source operands.657/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the658/// values between both operands.659static __inline __m256 __DEFAULT_FN_ATTRS660_mm256_xor_ps(__m256 __a, __m256 __b)661{662return (__m256)((__v8su)__a ^ (__v8su)__b);663}664665/* Horizontal arithmetic */666/// \brief Horizontally adds the adjacent pairs of values contained in two667/// 256-bit vectors of [4 x double].668///669/// \headerfile <x86intrin.h>670///671/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.672///673/// \param __a674/// A 256-bit vector of [4 x double] containing one of the source operands.675/// The horizontal sums of the values are returned in the even-indexed676/// elements of a vector of [4 x double].677/// \param __b678/// A 256-bit vector of [4 x double] containing one of the source operands.679/// The horizontal sums of the values are returned in the odd-indexed680/// elements of a vector of [4 x double].681/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of682/// both operands.683static __inline __m256d __DEFAULT_FN_ATTRS684_mm256_hadd_pd(__m256d __a, __m256d __b)685{686return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, 
(__v4df)__b);687}688689/// \brief Horizontally adds the adjacent pairs of values contained in two690/// 256-bit vectors of [8 x float].691///692/// \headerfile <x86intrin.h>693///694/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.695///696/// \param __a697/// A 256-bit vector of [8 x float] containing one of the source operands.698/// The horizontal sums of the values are returned in the elements with699/// index 0, 1, 4, 5 of a vector of [8 x float].700/// \param __b701/// A 256-bit vector of [8 x float] containing one of the source operands.702/// The horizontal sums of the values are returned in the elements with703/// index 2, 3, 6, 7 of a vector of [8 x float].704/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of705/// both operands.706static __inline __m256 __DEFAULT_FN_ATTRS707_mm256_hadd_ps(__m256 __a, __m256 __b)708{709return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);710}711712/// \brief Horizontally subtracts the adjacent pairs of values contained in two713/// 256-bit vectors of [4 x double].714///715/// \headerfile <x86intrin.h>716///717/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.718///719/// \param __a720/// A 256-bit vector of [4 x double] containing one of the source operands.721/// The horizontal differences between the values are returned in the722/// even-indexed elements of a vector of [4 x double].723/// \param __b724/// A 256-bit vector of [4 x double] containing one of the source operands.725/// The horizontal differences between the values are returned in the726/// odd-indexed elements of a vector of [4 x double].727/// \returns A 256-bit vector of [4 x double] containing the horizontal728/// differences of both operands.729static __inline __m256d __DEFAULT_FN_ATTRS730_mm256_hsub_pd(__m256d __a, __m256d __b)731{732return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);733}734735/// \brief Horizontally subtracts the adjacent pairs of values 
contained in two736/// 256-bit vectors of [8 x float].737///738/// \headerfile <x86intrin.h>739///740/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.741///742/// \param __a743/// A 256-bit vector of [8 x float] containing one of the source operands.744/// The horizontal differences between the values are returned in the745/// elements with index 0, 1, 4, 5 of a vector of [8 x float].746/// \param __b747/// A 256-bit vector of [8 x float] containing one of the source operands.748/// The horizontal differences between the values are returned in the749/// elements with index 2, 3, 6, 7 of a vector of [8 x float].750/// \returns A 256-bit vector of [8 x float] containing the horizontal751/// differences of both operands.752static __inline __m256 __DEFAULT_FN_ATTRS753_mm256_hsub_ps(__m256 __a, __m256 __b)754{755return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);756}757758/* Vector permutations */759/// \brief Copies the values in a 128-bit vector of [2 x double] as specified760/// by the 128-bit integer vector operand.761///762/// \headerfile <x86intrin.h>763///764/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.765///766/// \param __a767/// A 128-bit vector of [2 x double].768/// \param __c769/// A 128-bit integer vector operand specifying how the values are to be770/// copied.771/// Bit [1]:772/// 0: Bits [63:0] of the source are copied to bits [63:0] of the773/// returned vector.774/// 1: Bits [127:64] of the source are copied to bits [63:0] of the775/// returned vector.776/// Bit [65]:777/// 0: Bits [63:0] of the source are copied to bits [127:64] of the778/// returned vector.779/// 1: Bits [127:64] of the source are copied to bits [127:64] of the780/// returned vector.781/// \returns A 128-bit vector of [2 x double] containing the copied values.782static __inline __m128d __DEFAULT_FN_ATTRS783_mm_permutevar_pd(__m128d __a, __m128i __c)784{785return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, 
(__v2di)__c);786}787788/// \brief Copies the values in a 256-bit vector of [4 x double] as789/// specified by the 256-bit integer vector operand.790///791/// \headerfile <x86intrin.h>792///793/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.794///795/// \param __a796/// A 256-bit vector of [4 x double].797/// \param __c798/// A 256-bit integer vector operand specifying how the values are to be799/// copied.800/// Bit [1]:801/// 0: Bits [63:0] of the source are copied to bits [63:0] of the802/// returned vector.803/// 1: Bits [127:64] of the source are copied to bits [63:0] of the804/// returned vector.805/// Bit [65]:806/// 0: Bits [63:0] of the source are copied to bits [127:64] of the807/// returned vector.808/// 1: Bits [127:64] of the source are copied to bits [127:64] of the809/// returned vector.810/// Bit [129]:811/// 0: Bits [191:128] of the source are copied to bits [191:128] of the812/// returned vector.813/// 1: Bits [255:192] of the source are copied to bits [191:128] of the814/// returned vector.815/// Bit [193]:816/// 0: Bits [191:128] of the source are copied to bits [255:192] of the817/// returned vector.818/// 1: Bits [255:192] of the source are copied to bits [255:192] of the819/// returned vector.820/// \returns A 256-bit vector of [4 x double] containing the copied values.821static __inline __m256d __DEFAULT_FN_ATTRS822_mm256_permutevar_pd(__m256d __a, __m256i __c)823{824return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);825}826827/// \brief Copies the values stored in a 128-bit vector of [4 x float] as828/// specified by the 128-bit integer vector operand.829///830/// \headerfile <x86intrin.h>831///832/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.833///834/// \param __a835/// A 128-bit vector of [4 x float].836/// \param __c837/// A 128-bit integer vector operand specifying how the values are to be838/// copied.839/// Bits [1:0]:840/// 00: Bits [31:0] of the source are 
copied to bits [31:0] of the841/// returned vector.842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the843/// returned vector.844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the845/// returned vector.846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the847/// returned vector.848/// Bits [33:32]:849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the850/// returned vector.851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the852/// returned vector.853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the854/// returned vector.855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the856/// returned vector.857/// Bits [65:64]:858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the859/// returned vector.860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the861/// returned vector.862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the863/// returned vector.864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the865/// returned vector.866/// Bits [97:96]:867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the868/// returned vector.869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the870/// returned vector.871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the872/// returned vector.873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the874/// returned vector.875/// \returns A 128-bit vector of [4 x float] containing the copied values.876static __inline __m128 __DEFAULT_FN_ATTRS877_mm_permutevar_ps(__m128 __a, __m128i __c)878{879return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);880}881882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as883/// specified by the 256-bit integer vector operand.884///885/// \headerfile <x86intrin.h>886///887/// This intrinsic corresponds to the \c 
VPERMILPS / PERMILPS instruction.888///889/// \param __a890/// A 256-bit vector of [8 x float].891/// \param __c892/// A 256-bit integer vector operand specifying how the values are to be893/// copied.894/// Bits [1:0]:895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the896/// returned vector.897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the898/// returned vector.899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the900/// returned vector.901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the902/// returned vector.903/// Bits [33:32]:904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the905/// returned vector.906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the907/// returned vector.908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the909/// returned vector.910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the911/// returned vector.912/// Bits [65:64]:913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the914/// returned vector.915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the916/// returned vector.917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the918/// returned vector.919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the920/// returned vector.921/// Bits [97:96]:922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the923/// returned vector.924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the925/// returned vector.926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the927/// returned vector.928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the929/// returned vector.930/// Bits [129:128]:931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the932/// returned vector.933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the934/// returned 
vector.935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the936/// returned vector.937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the938/// returned vector.939/// Bits [161:160]:940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the941/// returned vector.942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the943/// returned vector.944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the945/// returned vector.946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the947/// returned vector.948/// Bits [193:192]:949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the950/// returned vector.951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the952/// returned vector.953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the954/// returned vector.955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the956/// returned vector.957/// Bits [225:224]:958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the959/// returned vector.960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the961/// returned vector.962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the963/// returned vector.964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the965/// returned vector.966/// \returns A 256-bit vector of [8 x float] containing the copied values.967static __inline __m256 __DEFAULT_FN_ATTRS968_mm256_permutevar_ps(__m256 __a, __m256i __c)969{970return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);971}972973/// \brief Copies the values in a 128-bit vector of [2 x double] as974/// specified by the immediate integer operand.975///976/// \headerfile <x86intrin.h>977///978/// \code979/// __m128d _mm_permute_pd(__m128d A, const int C);980/// \endcode981///982/// This intrinsic corresponds to the \c 
VPERMILPD / PERMILPD instruction.983///984/// \param A985/// A 128-bit vector of [2 x double].986/// \param C987/// An immediate integer operand specifying how the values are to be copied.988/// Bit [0]:989/// 0: Bits [63:0] of the source are copied to bits [63:0] of the990/// returned vector.991/// 1: Bits [127:64] of the source are copied to bits [63:0] of the992/// returned vector.993/// Bit [1]:994/// 0: Bits [63:0] of the source are copied to bits [127:64] of the995/// returned vector.996/// 1: Bits [127:64] of the source are copied to bits [127:64] of the997/// returned vector.998/// \returns A 128-bit vector of [2 x double] containing the copied values.999#define _mm_permute_pd(A, C) __extension__ ({ \1000(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \1001(__v2df)_mm_undefined_pd(), \1002((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })10031004/// \brief Copies the values in a 256-bit vector of [4 x double] as1005/// specified by the immediate integer operand.1006///1007/// \headerfile <x86intrin.h>1008///1009/// \code1010/// __m256d _mm256_permute_pd(__m256d A, const int C);1011/// \endcode1012///1013/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.1014///1015/// \param A1016/// A 256-bit vector of [4 x double].1017/// \param C1018/// An immediate integer operand specifying how the values are to be copied.1019/// Bit [0]:1020/// 0: Bits [63:0] of the source are copied to bits [63:0] of the1021/// returned vector.1022/// 1: Bits [127:64] of the source are copied to bits [63:0] of the1023/// returned vector.1024/// Bit [1]:1025/// 0: Bits [63:0] of the source are copied to bits [127:64] of the1026/// returned vector.1027/// 1: Bits [127:64] of the source are copied to bits [127:64] of the1028/// returned vector.1029/// Bit [2]:1030/// 0: Bits [191:128] of the source are copied to bits [191:128] of the1031/// returned vector.1032/// 1: Bits [255:192] of the source are copied to bits [191:128] of the1033/// returned vector.1034/// Bit 
[3]:1035/// 0: Bits [191:128] of the source are copied to bits [255:192] of the1036/// returned vector.1037/// 1: Bits [255:192] of the source are copied to bits [255:192] of the1038/// returned vector.1039/// \returns A 256-bit vector of [4 x double] containing the copied values.1040#define _mm256_permute_pd(A, C) __extension__ ({ \1041(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \1042(__v4df)_mm256_undefined_pd(), \10430 + (((C) >> 0) & 0x1), \10440 + (((C) >> 1) & 0x1), \10452 + (((C) >> 2) & 0x1), \10462 + (((C) >> 3) & 0x1)); })10471048/// \brief Copies the values in a 128-bit vector of [4 x float] as1049/// specified by the immediate integer operand.1050///1051/// \headerfile <x86intrin.h>1052///1053/// \code1054/// __m128 _mm_permute_ps(__m128 A, const int C);1055/// \endcode1056///1057/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.1058///1059/// \param A1060/// A 128-bit vector of [4 x float].1061/// \param C1062/// An immediate integer operand specifying how the values are to be copied.1063/// Bits [1:0]:1064/// 00: Bits [31:0] of the source are copied to bits [31:0] of the1065/// returned vector.1066/// 01: Bits [63:32] of the source are copied to bits [31:0] of the1067/// returned vector.1068/// 10: Bits [95:64] of the source are copied to bits [31:0] of the1069/// returned vector.1070/// 11: Bits [127:96] of the source are copied to bits [31:0] of the1071/// returned vector.1072/// Bits [3:2]:1073/// 00: Bits [31:0] of the source are copied to bits [63:32] of the1074/// returned vector.1075/// 01: Bits [63:32] of the source are copied to bits [63:32] of the1076/// returned vector.1077/// 10: Bits [95:64] of the source are copied to bits [63:32] of the1078/// returned vector.1079/// 11: Bits [127:96] of the source are copied to bits [63:32] of the1080/// returned vector.1081/// Bits [5:4]:1082/// 00: Bits [31:0] of the source are copied to bits [95:64] of the1083/// returned vector.1084/// 01: Bits [63:32] of the 
source are copied to bits [95:64] of the1085/// returned vector.1086/// 10: Bits [95:64] of the source are copied to bits [95:64] of the1087/// returned vector.1088/// 11: Bits [127:96] of the source are copied to bits [95:64] of the1089/// returned vector.1090/// Bits [7:6]:1091/// 00: Bits [31:0] of the source are copied to bits [127:96] of the1092/// returned vector.1093/// 01: Bits [63:32] of the source are copied to bits [127:96] of the1094/// returned vector.1095/// 10: Bits [95:64] of the source are copied to bits [127:96] of the1096/// returned vector.1097/// 11: Bits [127:96] of the source are copied to bits [127:96] of the1098/// returned vector.1099/// \returns A 128-bit vector of [4 x float] containing the copied values.1100#define _mm_permute_ps(A, C) __extension__ ({ \1101(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \1102(__v4sf)_mm_undefined_ps(), \1103((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \1104((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })11051106/// \brief Copies the values in a 256-bit vector of [8 x float] as1107/// specified by the immediate integer operand.1108///1109/// \headerfile <x86intrin.h>1110///1111/// \code1112/// __m256 _mm256_permute_ps(__m256 A, const int C);1113/// \endcode1114///1115/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.1116///1117/// \param A1118/// A 256-bit vector of [8 x float].1119/// \param C1120/// An immediate integer operand specifying how the values are to be copied.1121/// Bits [1:0]:1122/// 00: Bits [31:0] of the source are copied to bits [31:0] of the1123/// returned vector.1124/// 01: Bits [63:32] of the source are copied to bits [31:0] of the1125/// returned vector.1126/// 10: Bits [95:64] of the source are copied to bits [31:0] of the1127/// returned vector.1128/// 11: Bits [127:96] of the source are copied to bits [31:0] of the1129/// returned vector.1130/// Bits [3:2]:1131/// 00: Bits [31:0] of the source are copied to bits [63:32] of the1132/// returned vector.1133/// 01: Bits 
[63:32] of the source are copied to bits [63:32] of the1134/// returned vector.1135/// 10: Bits [95:64] of the source are copied to bits [63:32] of the1136/// returned vector.1137/// 11: Bits [127:96] of the source are copied to bits [63:32] of the1138/// returned vector.1139/// Bits [5:4]:1140/// 00: Bits [31:0] of the source are copied to bits [95:64] of the1141/// returned vector.1142/// 01: Bits [63:32] of the source are copied to bits [95:64] of the1143/// returned vector.1144/// 10: Bits [95:64] of the source are copied to bits [95:64] of the1145/// returned vector.1146/// 11: Bits [127:96] of the source are copied to bits [95:64] of the1147/// returned vector.1148/// Bits [7:6]:1149/// 00: Bits [31:0] of the source are copied to bits [127:96] of the1150/// returned vector.1151/// 01: Bits [63:32] of the source are copied to bits [127:96] of the1152/// returned vector.1153/// 10: Bits [95:64] of the source are copied to bits [127:96] of the1154/// returned vector.1155/// 11: Bits [127:96] of the source are copied to bits [127:96] of the1156/// returned vector.1157/// Bits [1:0]:1158/// 00: Bits [159:128] of the source are copied to bits [159:128] of the1159/// returned vector.1160/// 01: Bits [191:160] of the source are copied to bits [159:128] of the1161/// returned vector.1162/// 10: Bits [223:192] of the source are copied to bits [159:128] of the1163/// returned vector.1164/// 11: Bits [255:224] of the source are copied to bits [159:128] of the1165/// returned vector.1166/// Bits [3:2]:1167/// 00: Bits [159:128] of the source are copied to bits [191:160] of the1168/// returned vector.1169/// 01: Bits [191:160] of the source are copied to bits [191:160] of the1170/// returned vector.1171/// 10: Bits [223:192] of the source are copied to bits [191:160] of the1172/// returned vector.1173/// 11: Bits [255:224] of the source are copied to bits [191:160] of the1174/// returned vector.1175/// Bits [5:4]:1176/// 00: Bits [159:128] of the source are copied to bits 
[223:192] of the1177/// returned vector.1178/// 01: Bits [191:160] of the source are copied to bits [223:192] of the1179/// returned vector.1180/// 10: Bits [223:192] of the source are copied to bits [223:192] of the1181/// returned vector.1182/// 11: Bits [255:224] of the source are copied to bits [223:192] of the1183/// returned vector.1184/// Bits [7:6]:1185/// 00: Bits [159:128] of the source are copied to bits [255:224] of the1186/// returned vector.1187/// 01: Bits [191:160] of the source are copied to bits [255:224] of the1188/// returned vector.1189/// 10: Bits [223:192] of the source are copied to bits [255:224] of the1190/// returned vector.1191/// 11: Bits [255:224] of the source are copied to bits [255:224] of the1192/// returned vector.1193/// \returns A 256-bit vector of [8 x float] containing the copied values.1194#define _mm256_permute_ps(A, C) __extension__ ({ \1195(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \1196(__v8sf)_mm256_undefined_ps(), \11970 + (((C) >> 0) & 0x3), \11980 + (((C) >> 2) & 0x3), \11990 + (((C) >> 4) & 0x3), \12000 + (((C) >> 6) & 0x3), \12014 + (((C) >> 0) & 0x3), \12024 + (((C) >> 2) & 0x3), \12034 + (((C) >> 4) & 0x3), \12044 + (((C) >> 6) & 0x3)); })12051206/// \brief Permutes 128-bit data values stored in two 256-bit vectors of1207/// [4 x double], as specified by the immediate integer operand.1208///1209/// \headerfile <x86intrin.h>1210///1211/// \code1212/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);1213/// \endcode1214///1215/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.1216///1217/// \param V11218/// A 256-bit vector of [4 x double].1219/// \param V21220/// A 256-bit vector of [4 x double].1221/// \param M1222/// An immediate integer operand specifying how the values are to be1223/// permuted.1224/// Bits [1:0]:1225/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the1226/// destination.1227/// 01: Bits [255:128] of operand V1 are copied to 
bits [127:0] of the1228/// destination.1229/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the1230/// destination.1231/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the1232/// destination.1233/// Bits [5:4]:1234/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the1235/// destination.1236/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the1237/// destination.1238/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the1239/// destination.1240/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the1241/// destination.1242/// \returns A 256-bit vector of [4 x double] containing the copied values.1243#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \1244(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \1245(__v4df)(__m256d)(V2), (M)); })12461247/// \brief Permutes 128-bit data values stored in two 256-bit vectors of1248/// [8 x float], as specified by the immediate integer operand.1249///1250/// \headerfile <x86intrin.h>1251///1252/// \code1253/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);1254/// \endcode1255///1256/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.1257///1258/// \param V11259/// A 256-bit vector of [8 x float].1260/// \param V21261/// A 256-bit vector of [8 x float].1262/// \param M1263/// An immediate integer operand specifying how the values are to be1264/// permuted.1265/// Bits [1:0]:1266/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the1267/// destination.1268/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the1269/// destination.1270/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the1271/// destination.1272/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the1273/// destination.1274/// Bits [5:4]:1275/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the1276/// destination.1277/// 01: 
Bits [255:128] of operand V1 are copied to bits [255:128] of the1278/// destination.1279/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the1280/// destination.1281/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the1282/// destination.1283/// \returns A 256-bit vector of [8 x float] containing the copied values.1284#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \1285(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \1286(__v8sf)(__m256)(V2), (M)); })12871288/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,1289/// as specified by the immediate integer operand.1290///1291/// \headerfile <x86intrin.h>1292///1293/// \code1294/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);1295/// \endcode1296///1297/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.1298///1299/// \param V11300/// A 256-bit integer vector.1301/// \param V21302/// A 256-bit integer vector.1303/// \param M1304/// An immediate integer operand specifying how the values are to be copied.1305/// Bits [1:0]:1306/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the1307/// destination.1308/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the1309/// destination.1310/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the1311/// destination.1312/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the1313/// destination.1314/// Bits [5:4]:1315/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the1316/// destination.1317/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the1318/// destination.1319/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the1320/// destination.1321/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the1322/// destination.1323/// \returns A 256-bit integer vector containing the copied values.1324#define 
_mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \1325(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \1326(__v8si)(__m256i)(V2), (M)); })13271328/* Vector Blend */1329/// \brief Merges 64-bit double-precision data values stored in either of the1330/// two 256-bit vectors of [4 x double], as specified by the immediate1331/// integer operand.1332///1333/// \headerfile <x86intrin.h>1334///1335/// \code1336/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);1337/// \endcode1338///1339/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction.1340///1341/// \param V11342/// A 256-bit vector of [4 x double].1343/// \param V21344/// A 256-bit vector of [4 x double].1345/// \param M1346/// An immediate integer operand, with mask bits [3:0] specifying how the1347/// values are to be copied. The position of the mask bit corresponds to the1348/// index of a copied value. When a mask bit is 0, the corresponding 64-bit1349/// element in operand V1 is copied to the same position in the destination.1350/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is1351/// copied to the same position in the destination.1352/// \returns A 256-bit vector of [4 x double] containing the copied values.1353#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \1354(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \1355(__v4df)(__m256d)(V2), \1356(((M) & 0x01) ? 4 : 0), \1357(((M) & 0x02) ? 5 : 1), \1358(((M) & 0x04) ? 6 : 2), \1359(((M) & 0x08) ? 
7 : 3)); })13601361/// \brief Merges 32-bit single-precision data values stored in either of the1362/// two 256-bit vectors of [8 x float], as specified by the immediate1363/// integer operand.1364///1365/// \headerfile <x86intrin.h>1366///1367/// \code1368/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);1369/// \endcode1370///1371/// This intrinsic corresponds to the \c VBLENDPS / BLENDPS instruction.1372///1373/// \param V11374/// A 256-bit vector of [8 x float].1375/// \param V21376/// A 256-bit vector of [8 x float].1377/// \param M1378/// An immediate integer operand, with mask bits [7:0] specifying how the1379/// values are to be copied. The position of the mask bit corresponds to the1380/// index of a copied value. When a mask bit is 0, the corresponding 32-bit1381/// element in operand V1 is copied to the same position in the destination.1382/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is1383/// copied to the same position in the destination.1384/// \returns A 256-bit vector of [8 x float] containing the copied values.1385#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \1386(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \1387(__v8sf)(__m256)(V2), \1388(((M) & 0x01) ? 8 : 0), \1389(((M) & 0x02) ? 9 : 1), \1390(((M) & 0x04) ? 10 : 2), \1391(((M) & 0x08) ? 11 : 3), \1392(((M) & 0x10) ? 12 : 4), \1393(((M) & 0x20) ? 13 : 5), \1394(((M) & 0x40) ? 14 : 6), \1395(((M) & 0x80) ? 
15 : 7)); })13961397/// \brief Merges 64-bit double-precision data values stored in either of the1398/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector1399/// operand.1400///1401/// \headerfile <x86intrin.h>1402///1403/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.1404///1405/// \param __a1406/// A 256-bit vector of [4 x double].1407/// \param __b1408/// A 256-bit vector of [4 x double].1409/// \param __c1410/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying1411/// how the values are to be copied. The position of the mask bit corresponds1412/// to the most significant bit of a copied value. When a mask bit is 0, the1413/// corresponding 64-bit element in operand __a is copied to the same1414/// position in the destination. When a mask bit is 1, the corresponding1415/// 64-bit element in operand __b is copied to the same position in the1416/// destination.1417/// \returns A 256-bit vector of [4 x double] containing the copied values.1418static __inline __m256d __DEFAULT_FN_ATTRS1419_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)1420{1421return (__m256d)__builtin_ia32_blendvpd256(1422(__v4df)__a, (__v4df)__b, (__v4df)__c);1423}14241425/// \brief Merges 32-bit single-precision data values stored in either of the1426/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector1427/// operand.1428///1429/// \headerfile <x86intrin.h>1430///1431/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.1432///1433/// \param __a1434/// A 256-bit vector of [8 x float].1435/// \param __b1436/// A 256-bit vector of [8 x float].1437/// \param __c1438/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,1439/// and 31 specifying how the values are to be copied. The position of the1440/// mask bit corresponds to the most significant bit of a copied value. 
When1441/// a mask bit is 0, the corresponding 32-bit element in operand __a is1442/// copied to the same position in the destination. When a mask bit is 1, the1443/// corresponding 32-bit element in operand __b is copied to the same1444/// position in the destination.1445/// \returns A 256-bit vector of [8 x float] containing the copied values.1446static __inline __m256 __DEFAULT_FN_ATTRS1447_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)1448{1449return (__m256)__builtin_ia32_blendvps256(1450(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);1451}14521453/* Vector Dot Product */1454/// \brief Computes two dot products in parallel, using the lower and upper1455/// halves of two [8 x float] vectors as input to the two computations, and1456/// returning the two dot products in the lower and upper halves of the1457/// [8 x float] result. The immediate integer operand controls which1458/// input elements will contribute to the dot product, and where the final1459/// results are returned. In general, for each dot product, the four1460/// corresponding elements of the input vectors are multiplied; the first1461/// two and second two products are summed, then the two sums are added to1462/// form the final result.1463///1464/// \headerfile <x86intrin.h>1465///1466/// \code1467/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);1468/// \endcode1469///1470/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.1471///1472/// \param V11473/// A vector of [8 x float] values, treated as two [4 x float] vectors.1474/// \param V21475/// A vector of [8 x float] values, treated as two [4 x float] vectors.1476/// \param M1477/// An immediate integer argument. Bits [7:4] determine which elements of1478/// the input vectors are used, with bit [4] corresponding to the lowest1479/// element and bit [7] corresponding to the highest element of each [4 x1480/// float] subvector. 
If a bit is set, the corresponding elements from the1481/// two input vectors are used as an input for dot product; otherwise that1482/// input is treated as zero. Bits [3:0] determine which elements of the1483/// result will receive a copy of the final dot product, with bit [0]1484/// corresponding to the lowest element and bit [3] corresponding to the1485/// highest element of each [4 x float] subvector. If a bit is set, the dot1486/// product is returned in the corresponding element; otherwise that element1487/// is set to zero. The bitmask is applied in the same way to each of the1488/// two parallel dot product computations.1489/// \returns A 256-bit vector of [8 x float] containing the two dot products.1490#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \1491(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \1492(__v8sf)(__m256)(V2), (M)); })14931494/* Vector shuffle */1495/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as1496/// specified by the immediate value operand. The four selected elements in1497/// each operand are copied to the destination according to the bits1498/// specified in the immediate operand. The selected elements from the first1499/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the1500/// destination, and the selected elements from the second 256-bit operand1501/// are copied to bits [127:64] and bits [255:192] of the destination. For1502/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,1503/// the 256-bit destination vector would contain the following values: b[7],1504/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].1505///1506/// \headerfile <x86intrin.h>1507///1508/// \code1509/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);1510/// \endcode1511///1512/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.1513///1514/// \param a1515/// A 256-bit vector of [8 x float]. 
The four selected elements in this1516/// operand are copied to bits [63:0] and bits [191:128] in the destination,1517/// according to the bits specified in the immediate operand.1518/// \param b1519/// A 256-bit vector of [8 x float]. The four selected elements in this1520/// operand are copied to bits [127:64] and bits [255:192] in the1521/// destination, according to the bits specified in the immediate operand.1522/// \param mask1523/// An immediate value containing an 8-bit value specifying which elements to1524/// copy from a and b. Bits [3:0] specify the values copied from operand a.1525/// Bits [7:4] specify the values copied from operand b.1526/// The destinations within the 256-bit destination are assigned values as1527/// follows, according to the bit value assignments described below:1528/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the1529/// destination.1530/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the1531/// destination.1532/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the1533/// destination.1534/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in1535/// the destination.1536/// Bit value assignments:1537/// 00: Bits [31:0] and [159:128] are copied from the selected operand.1538/// 01: Bits [63:32] and [191:160] are copied from the selected operand.1539/// 10: Bits [95:64] and [223:192] are copied from the selected operand.1540/// 11: Bits [127:96] and [255:224] are copied from the selected operand.1541/// \returns A 256-bit vector of [8 x float] containing the shuffled values.1542#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \1543(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \1544(__v8sf)(__m256)(b), \15450 + (((mask) >> 0) & 0x3), \15460 + (((mask) >> 2) & 0x3), \15478 + (((mask) >> 4) & 0x3), \15488 + (((mask) >> 6) & 0x3), \15494 + (((mask) >> 0) & 0x3), \15504 + (((mask) >> 2) & 0x3), \155112 + (((mask) >> 4) & 0x3), \155212 
+ (((mask) >> 6) & 0x3)); })15531554/// \brief Selects four double-precision values from the 256-bit operands of1555/// [4 x double], as specified by the immediate value operand. The selected1556/// elements from the first 256-bit operand are copied to bits [63:0] and1557/// bits [191:128] in the destination, and the selected elements from the1558/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in1559/// the destination. For example, if bits [3:0] of the immediate operand1560/// contain a value of 0xF, the 256-bit destination vector would contain the1561/// following values: b[3], a[3], b[1], a[1].1562///1563/// \headerfile <x86intrin.h>1564///1565/// \code1566/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);1567/// \endcode1568///1569/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.1570///1571/// \param a1572/// A 256-bit vector of [4 x double].1573/// \param b1574/// A 256-bit vector of [4 x double].1575/// \param mask1576/// An immediate value containing 8-bit values specifying which elements to1577/// copy from a and b:1578/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the1579/// destination.1580/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the1581/// destination.1582/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the1583/// destination.1584/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the1585/// destination.1586/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the1587/// destination.1588/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the1589/// destination.1590/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the1591/// destination.1592/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the1593/// destination.1594/// \returns A 256-bit vector of [4 x double] containing the shuffled values.1595#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ 
\1596(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \1597(__v4df)(__m256d)(b), \15980 + (((mask) >> 0) & 0x1), \15994 + (((mask) >> 1) & 0x1), \16002 + (((mask) >> 2) & 0x1), \16016 + (((mask) >> 3) & 0x1)); })16021603/* Compare */1604#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */1605#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */1606#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */1607#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */1608#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */1609#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */1610#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */1611#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */1612#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */1613#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */1614#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */1615#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */1616#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */1617#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */1618#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */1619#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */1620#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */1621#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */1622#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */1623#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */1624#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */1625#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */1626#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */1627#define _CMP_ORD_S 0x17 /* Ordered (signaling) */1628#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */1629#define _CMP_NGE_UQ 0x19 /* 
Not-greater-than-or-equal (unord, non-sign) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each of the 32 encodings selects a distinct predicate;
///    they are named by the _CMP_* constants in this header, from 0x00
///    (_CMP_EQ_OQ) through 0x1F (_CMP_TRUE_US). Encodings that share their low
///    three bits are not interchangeable: for example 0x03 is _CMP_UNORD_Q but
///    0x0B is _CMP_FALSE_OQ, and 0x07 is _CMP_ORD_Q but 0x0F is _CMP_TRUE_UQ.
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each of the 32 encodings selects a distinct predicate;
///    they are named by the _CMP_* constants in this header, from 0x00
///    (_CMP_EQ_OQ) through 0x1F (_CMP_TRUE_US). Encodings that share their low
///    three bits are not interchangeable: for example 0x03 is _CMP_UNORD_Q but
///    0x0B is _CMP_FALSE_OQ, and 0x07 is _CMP_ORD_Q but 0x0F is _CMP_TRUE_UQ.
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///    Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each of the 32 encodings selects a distinct predicate;
///    they are named by the _CMP_* constants in this header, from 0x00
///    (_CMP_EQ_OQ) through 0x1F (_CMP_TRUE_US). Encodings that share their low
///    three bits are not interchangeable: for example 0x03 is _CMP_UNORD_Q but
///    0x0B is _CMP_FALSE_OQ, and 0x07 is _CMP_ORD_Q but 0x0F is _CMP_TRUE_UQ.
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///    Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each of the 32 encodings selects a distinct predicate;
///    they are named by the _CMP_* constants in this header, from 0x00
///    (_CMP_EQ_OQ) through 0x1F (_CMP_TRUE_US). Encodings that share their low
///    three bits are not interchangeable: for example 0x03 is _CMP_UNORD_Q but
///    0x0B is _CMP_FALSE_OQ, and 0x07 is _CMP_ORD_Q but 0x0F is _CMP_TRUE_UQ.
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///    If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each of the 32 encodings selects a distinct predicate;
///    they are named by the _CMP_* constants in this header, from 0x00
///    (_CMP_EQ_OQ) through 0x1F (_CMP_TRUE_US). Encodings that share their low
///    three bits are not interchangeable: for example 0x03 is _CMP_UNORD_Q but
///    0x0B is _CMP_FALSE_OQ, and 0x07 is _CMP_ORD_Q but 0x0F is _CMP_TRUE_UQ.
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
If the result is true, all 32 bits of the destination1819/// vector are set; otherwise they are cleared.1820///1821/// \headerfile <x86intrin.h>1822///1823/// \code1824/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);1825/// \endcode1826///1827/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.1828///1829/// \param a1830/// A 128-bit vector of [4 x float].1831/// \param b1832/// A 128-bit vector of [4 x float].1833/// \param c1834/// An immediate integer operand, with bits [4:0] specifying which comparison1835/// operation to use:1836/// 00h, 08h, 10h, 18h: Equal1837/// 01h, 09h, 11h, 19h: Less than1838/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped1839/// operands)1840/// 03h, 0Bh, 13h, 1Bh: Unordered1841/// 04h, 0Ch, 14h, 1Ch: Not equal1842/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)1843/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal1844/// (swapped operands)1845/// 07h, 0Fh, 17h, 1Fh: Ordered1846/// \returns A 128-bit vector of [4 x float] containing the comparison results.1847#define _mm_cmp_ss(a, b, c) __extension__ ({ \1848(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \1849(__v4sf)(__m128)(b), (c)); })18501851/// \brief Takes a [8 x i32] vector and returns the vector element value1852/// indexed by the immediate constant operand.1853///1854/// \headerfile <x86intrin.h>1855///1856/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /1857/// EXTRACTF128+COMPOSITE instruction.1858///1859/// \param __a1860/// A 256-bit vector of [8 x i32].1861/// \param __imm1862/// An immediate integer operand with bits [2:0] determining which vector1863/// element is extracted and returned.1864/// \returns A 32-bit integer containing the extracted 32 bits of extended1865/// packed data.1866static __inline int __DEFAULT_FN_ATTRS1867_mm256_extract_epi32(__m256i __a, const int __imm)1868{1869__v8si __b = (__v8si)__a;1870return __b[__imm & 7];1871}18721873/// 
\brief Takes a [16 x i16] vector and returns the vector element value1874/// indexed by the immediate constant operand.1875///1876/// \headerfile <x86intrin.h>1877///1878/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /1879/// EXTRACTF128+COMPOSITE instruction.1880///1881/// \param __a1882/// A 256-bit integer vector of [16 x i16].1883/// \param __imm1884/// An immediate integer operand with bits [3:0] determining which vector1885/// element is extracted and returned.1886/// \returns A 32-bit integer containing the extracted 16 bits of zero extended1887/// packed data.1888static __inline int __DEFAULT_FN_ATTRS1889_mm256_extract_epi16(__m256i __a, const int __imm)1890{1891__v16hi __b = (__v16hi)__a;1892return (unsigned short)__b[__imm & 15];1893}18941895/// \brief Takes a [32 x i8] vector and returns the vector element value1896/// indexed by the immediate constant operand.1897///1898/// \headerfile <x86intrin.h>1899///1900/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /1901/// EXTRACTF128+COMPOSITE instruction.1902///1903/// \param __a1904/// A 256-bit integer vector of [32 x i8].1905/// \param __imm1906/// An immediate integer operand with bits [4:0] determining which vector1907/// element is extracted and returned.1908/// \returns A 32-bit integer containing the extracted 8 bits of zero extended1909/// packed data.1910static __inline int __DEFAULT_FN_ATTRS1911_mm256_extract_epi8(__m256i __a, const int __imm)1912{1913__v32qi __b = (__v32qi)__a;1914return (unsigned char)__b[__imm & 31];1915}19161917#ifdef __x86_64__1918/// \brief Takes a [4 x i64] vector and returns the vector element value1919/// indexed by the immediate constant operand.1920///1921/// \headerfile <x86intrin.h>1922///1923/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /1924/// EXTRACTF128+COMPOSITE instruction.1925///1926/// \param __a1927/// A 256-bit integer vector of [4 x i64].1928/// \param __imm1929/// An immediate integer operand with bits 
[1:0] determining which vector1930/// element is extracted and returned.1931/// \returns A 64-bit integer containing the extracted 64 bits of extended1932/// packed data.1933static __inline long long __DEFAULT_FN_ATTRS1934_mm256_extract_epi64(__m256i __a, const int __imm)1935{1936__v4di __b = (__v4di)__a;1937return __b[__imm & 3];1938}1939#endif19401941/// \brief Takes a [8 x i32] vector and replaces the vector element value1942/// indexed by the immediate constant operand by a new value. Returns the1943/// modified vector.1944///1945/// \headerfile <x86intrin.h>1946///1947/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /1948/// INSERTF128+COMPOSITE instruction.1949///1950/// \param __a1951/// A vector of [8 x i32] to be used by the insert operation.1952/// \param __b1953/// An integer value. The replacement value for the insert operation.1954/// \param __imm1955/// An immediate integer specifying the index of the vector element to be1956/// replaced.1957/// \returns A copy of vector __a, after replacing its element indexed by __imm1958/// with __b.1959static __inline __m256i __DEFAULT_FN_ATTRS1960_mm256_insert_epi32(__m256i __a, int __b, int const __imm)1961{1962__v8si __c = (__v8si)__a;1963__c[__imm & 7] = __b;1964return (__m256i)__c;1965}196619671968/// \brief Takes a [16 x i16] vector and replaces the vector element value1969/// indexed by the immediate constant operand with a new value. Returns the1970/// modified vector.1971///1972/// \headerfile <x86intrin.h>1973///1974/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /1975/// INSERTF128+COMPOSITE instruction.1976///1977/// \param __a1978/// A vector of [16 x i16] to be used by the insert operation.1979/// \param __b1980/// An i16 integer value. 
The replacement value for the insert operation.1981/// \param __imm1982/// An immediate integer specifying the index of the vector element to be1983/// replaced.1984/// \returns A copy of vector __a, after replacing its element indexed by __imm1985/// with __b.1986static __inline __m256i __DEFAULT_FN_ATTRS1987_mm256_insert_epi16(__m256i __a, int __b, int const __imm)1988{1989__v16hi __c = (__v16hi)__a;1990__c[__imm & 15] = __b;1991return (__m256i)__c;1992}19931994/// \brief Takes a [32 x i8] vector and replaces the vector element value1995/// indexed by the immediate constant operand with a new value. Returns the1996/// modified vector.1997///1998/// \headerfile <x86intrin.h>1999///2000/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /2001/// INSERTF128+COMPOSITE instruction.2002///2003/// \param __a2004/// A vector of [32 x i8] to be used by the insert operation.2005/// \param __b2006/// An i8 integer value. The replacement value for the insert operation.2007/// \param __imm2008/// An immediate integer specifying the index of the vector element to be2009/// replaced.2010/// \returns A copy of vector __a, after replacing its element indexed by __imm2011/// with __b.2012static __inline __m256i __DEFAULT_FN_ATTRS2013_mm256_insert_epi8(__m256i __a, int __b, int const __imm)2014{2015__v32qi __c = (__v32qi)__a;2016__c[__imm & 31] = __b;2017return (__m256i)__c;2018}20192020#ifdef __x86_64__2021/// \brief Takes a [4 x i64] vector and replaces the vector element value2022/// indexed by the immediate constant operand with a new value. Returns the2023/// modified vector.2024///2025/// \headerfile <x86intrin.h>2026///2027/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /2028/// INSERTF128+COMPOSITE instruction.2029///2030/// \param __a2031/// A vector of [4 x i64] to be used by the insert operation.2032/// \param __b2033/// A 64-bit integer value. 
The replacement value for the insert operation.2034/// \param __imm2035/// An immediate integer specifying the index of the vector element to be2036/// replaced.2037/// \returns A copy of vector __a, after replacing its element indexed by __imm2038/// with __b.2039static __inline __m256i __DEFAULT_FN_ATTRS2040_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)2041{2042__v4di __c = (__v4di)__a;2043__c[__imm & 3] = __b;2044return (__m256i)__c;2045}2046#endif20472048/* Conversion */2049/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].2050///2051/// \headerfile <x86intrin.h>2052///2053/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.2054///2055/// \param __a2056/// A 128-bit integer vector of [4 x i32].2057/// \returns A 256-bit vector of [4 x double] containing the converted values.2058static __inline __m256d __DEFAULT_FN_ATTRS2059_mm256_cvtepi32_pd(__m128i __a)2060{2061return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);2062}20632064/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].2065///2066/// \headerfile <x86intrin.h>2067///2068/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.2069///2070/// \param __a2071/// A 256-bit integer vector.2072/// \returns A 256-bit vector of [8 x float] containing the converted values.2073static __inline __m256 __DEFAULT_FN_ATTRS2074_mm256_cvtepi32_ps(__m256i __a)2075{2076return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);2077}20782079/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of2080/// [4 x float].2081///2082/// \headerfile <x86intrin.h>2083///2084/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.2085///2086/// \param __a2087/// A 256-bit vector of [4 x double].2088/// \returns A 128-bit vector of [4 x float] containing the converted values.2089static __inline __m128 __DEFAULT_FN_ATTRS2090_mm256_cvtpd_ps(__m256d __a)2091{2092return 
(__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);2093}20942095/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].2096///2097/// \headerfile <x86intrin.h>2098///2099/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.2100///2101/// \param __a2102/// A 256-bit vector of [8 x float].2103/// \returns A 256-bit integer vector containing the converted values.2104static __inline __m256i __DEFAULT_FN_ATTRS2105_mm256_cvtps_epi32(__m256 __a)2106{2107return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);2108}21092110static __inline __m256d __DEFAULT_FN_ATTRS2111_mm256_cvtps_pd(__m128 __a)2112{2113return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);2114}21152116static __inline __m128i __DEFAULT_FN_ATTRS2117_mm256_cvttpd_epi32(__m256d __a)2118{2119return (__m128i)__builtin_convertvector((__v4df) __a, __v4si);2120}21212122static __inline __m128i __DEFAULT_FN_ATTRS2123_mm256_cvtpd_epi32(__m256d __a)2124{2125return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);2126}21272128static __inline __m256i __DEFAULT_FN_ATTRS2129_mm256_cvttps_epi32(__m256 __a)2130{2131return (__m256i)__builtin_convertvector((__v8sf) __a, __v8si);2132}21332134static __inline double __DEFAULT_FN_ATTRS2135_mm256_cvtsd_f64(__m256d __a)2136{2137return __a[0];2138}21392140static __inline int __DEFAULT_FN_ATTRS2141_mm256_cvtsi256_si32(__m256i __a)2142{2143__v8si __b = (__v8si)__a;2144return __b[0];2145}21462147static __inline float __DEFAULT_FN_ATTRS2148_mm256_cvtss_f32(__m256 __a)2149{2150return __a[0];2151}21522153/* Vector replicate */2154static __inline __m256 __DEFAULT_FN_ATTRS2155_mm256_movehdup_ps(__m256 __a)2156{2157return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);2158}21592160static __inline __m256 __DEFAULT_FN_ATTRS2161_mm256_moveldup_ps(__m256 __a)2162{2163return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);2164}21652166static __inline __m256d 
__DEFAULT_FN_ATTRS2167_mm256_movedup_pd(__m256d __a)2168{2169return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);2170}21712172/* Unpack and Interleave */2173static __inline __m256d __DEFAULT_FN_ATTRS2174_mm256_unpackhi_pd(__m256d __a, __m256d __b)2175{2176return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);2177}21782179static __inline __m256d __DEFAULT_FN_ATTRS2180_mm256_unpacklo_pd(__m256d __a, __m256d __b)2181{2182return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);2183}21842185static __inline __m256 __DEFAULT_FN_ATTRS2186_mm256_unpackhi_ps(__m256 __a, __m256 __b)2187{2188return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);2189}21902191static __inline __m256 __DEFAULT_FN_ATTRS2192_mm256_unpacklo_ps(__m256 __a, __m256 __b)2193{2194return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);2195}21962197/* Bit Test */2198static __inline int __DEFAULT_FN_ATTRS2199_mm_testz_pd(__m128d __a, __m128d __b)2200{2201return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);2202}22032204static __inline int __DEFAULT_FN_ATTRS2205_mm_testc_pd(__m128d __a, __m128d __b)2206{2207return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);2208}22092210static __inline int __DEFAULT_FN_ATTRS2211_mm_testnzc_pd(__m128d __a, __m128d __b)2212{2213return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);2214}22152216static __inline int __DEFAULT_FN_ATTRS2217_mm_testz_ps(__m128 __a, __m128 __b)2218{2219return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);2220}22212222static __inline int __DEFAULT_FN_ATTRS2223_mm_testc_ps(__m128 __a, __m128 __b)2224{2225return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);2226}22272228static __inline int __DEFAULT_FN_ATTRS2229_mm_testnzc_ps(__m128 __a, __m128 __b)2230{2231return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);2232}22332234static __inline int __DEFAULT_FN_ATTRS2235_mm256_testz_pd(__m256d __a, __m256d 
__b)2236{2237return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);2238}22392240static __inline int __DEFAULT_FN_ATTRS2241_mm256_testc_pd(__m256d __a, __m256d __b)2242{2243return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);2244}22452246static __inline int __DEFAULT_FN_ATTRS2247_mm256_testnzc_pd(__m256d __a, __m256d __b)2248{2249return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);2250}22512252static __inline int __DEFAULT_FN_ATTRS2253_mm256_testz_ps(__m256 __a, __m256 __b)2254{2255return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);2256}22572258static __inline int __DEFAULT_FN_ATTRS2259_mm256_testc_ps(__m256 __a, __m256 __b)2260{2261return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);2262}22632264static __inline int __DEFAULT_FN_ATTRS2265_mm256_testnzc_ps(__m256 __a, __m256 __b)2266{2267return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);2268}22692270static __inline int __DEFAULT_FN_ATTRS2271_mm256_testz_si256(__m256i __a, __m256i __b)2272{2273return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);2274}22752276static __inline int __DEFAULT_FN_ATTRS2277_mm256_testc_si256(__m256i __a, __m256i __b)2278{2279return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);2280}22812282static __inline int __DEFAULT_FN_ATTRS2283_mm256_testnzc_si256(__m256i __a, __m256i __b)2284{2285return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);2286}22872288/* Vector extract sign mask */2289static __inline int __DEFAULT_FN_ATTRS2290_mm256_movemask_pd(__m256d __a)2291{2292return __builtin_ia32_movmskpd256((__v4df)__a);2293}22942295static __inline int __DEFAULT_FN_ATTRS2296_mm256_movemask_ps(__m256 __a)2297{2298return __builtin_ia32_movmskps256((__v8sf)__a);2299}23002301/* Vector __zero */2302static __inline void __DEFAULT_FN_ATTRS2303_mm256_zeroall(void)2304{2305__builtin_ia32_vzeroall();2306}23072308static __inline void __DEFAULT_FN_ATTRS2309_mm256_zeroupper(void)2310{2311__builtin_ia32_vzeroupper();2312}23132314/* Vector load 
with broadcast */2315static __inline __m128 __DEFAULT_FN_ATTRS2316_mm_broadcast_ss(float const *__a)2317{2318float __f = *__a;2319return (__m128)(__v4sf){ __f, __f, __f, __f };2320}23212322static __inline __m256d __DEFAULT_FN_ATTRS2323_mm256_broadcast_sd(double const *__a)2324{2325double __d = *__a;2326return (__m256d)(__v4df){ __d, __d, __d, __d };2327}23282329static __inline __m256 __DEFAULT_FN_ATTRS2330_mm256_broadcast_ss(float const *__a)2331{2332float __f = *__a;2333return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };2334}23352336static __inline __m256d __DEFAULT_FN_ATTRS2337_mm256_broadcast_pd(__m128d const *__a)2338{2339return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);2340}23412342static __inline __m256 __DEFAULT_FN_ATTRS2343_mm256_broadcast_ps(__m128 const *__a)2344{2345return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);2346}23472348/* SIMD load ops */2349static __inline __m256d __DEFAULT_FN_ATTRS2350_mm256_load_pd(double const *__p)2351{2352return *(__m256d *)__p;2353}23542355static __inline __m256 __DEFAULT_FN_ATTRS2356_mm256_load_ps(float const *__p)2357{2358return *(__m256 *)__p;2359}23602361static __inline __m256d __DEFAULT_FN_ATTRS2362_mm256_loadu_pd(double const *__p)2363{2364struct __loadu_pd {2365__m256d __v;2366} __attribute__((__packed__, __may_alias__));2367return ((struct __loadu_pd*)__p)->__v;2368}23692370static __inline __m256 __DEFAULT_FN_ATTRS2371_mm256_loadu_ps(float const *__p)2372{2373struct __loadu_ps {2374__m256 __v;2375} __attribute__((__packed__, __may_alias__));2376return ((struct __loadu_ps*)__p)->__v;2377}23782379static __inline __m256i __DEFAULT_FN_ATTRS2380_mm256_load_si256(__m256i const *__p)2381{2382return *__p;2383}23842385static __inline __m256i __DEFAULT_FN_ATTRS2386_mm256_loadu_si256(__m256i const *__p)2387{2388struct __loadu_si256 {2389__m256i __v;2390} __attribute__((__packed__, __may_alias__));2391return ((struct __loadu_si256*)__p)->__v;2392}23932394static 
__inline __m256i __DEFAULT_FN_ATTRS2395_mm256_lddqu_si256(__m256i const *__p)2396{2397return (__m256i)__builtin_ia32_lddqu256((char const *)__p);2398}23992400/* SIMD store ops */2401static __inline void __DEFAULT_FN_ATTRS2402_mm256_store_pd(double *__p, __m256d __a)2403{2404*(__m256d *)__p = __a;2405}24062407static __inline void __DEFAULT_FN_ATTRS2408_mm256_store_ps(float *__p, __m256 __a)2409{2410*(__m256 *)__p = __a;2411}24122413static __inline void __DEFAULT_FN_ATTRS2414_mm256_storeu_pd(double *__p, __m256d __a)2415{2416struct __storeu_pd {2417__m256d __v;2418} __attribute__((__packed__, __may_alias__));2419((struct __storeu_pd*)__p)->__v = __a;2420}24212422static __inline void __DEFAULT_FN_ATTRS2423_mm256_storeu_ps(float *__p, __m256 __a)2424{2425struct __storeu_ps {2426__m256 __v;2427} __attribute__((__packed__, __may_alias__));2428((struct __storeu_ps*)__p)->__v = __a;2429}24302431static __inline void __DEFAULT_FN_ATTRS2432_mm256_store_si256(__m256i *__p, __m256i __a)2433{2434*__p = __a;2435}24362437static __inline void __DEFAULT_FN_ATTRS2438_mm256_storeu_si256(__m256i *__p, __m256i __a)2439{2440struct __storeu_si256 {2441__m256i __v;2442} __attribute__((__packed__, __may_alias__));2443((struct __storeu_si256*)__p)->__v = __a;2444}24452446/* Conditional load ops */2447static __inline __m128d __DEFAULT_FN_ATTRS2448_mm_maskload_pd(double const *__p, __m128i __m)2449{2450return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);2451}24522453static __inline __m256d __DEFAULT_FN_ATTRS2454_mm256_maskload_pd(double const *__p, __m256i __m)2455{2456return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,2457(__v4di)__m);2458}24592460static __inline __m128 __DEFAULT_FN_ATTRS2461_mm_maskload_ps(float const *__p, __m128i __m)2462{2463return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);2464}24652466static __inline __m256 __DEFAULT_FN_ATTRS2467_mm256_maskload_ps(float const *__p, __m256i __m)2468{2469return 
(__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);2470}24712472/* Conditional store ops */2473static __inline void __DEFAULT_FN_ATTRS2474_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)2475{2476__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);2477}24782479static __inline void __DEFAULT_FN_ATTRS2480_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)2481{2482__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);2483}24842485static __inline void __DEFAULT_FN_ATTRS2486_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)2487{2488__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);2489}24902491static __inline void __DEFAULT_FN_ATTRS2492_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)2493{2494__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);2495}24962497/* Cacheability support ops */2498static __inline void __DEFAULT_FN_ATTRS2499_mm256_stream_si256(__m256i *__a, __m256i __b)2500{2501__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);2502}25032504static __inline void __DEFAULT_FN_ATTRS2505_mm256_stream_pd(double *__a, __m256d __b)2506{2507__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);2508}25092510static __inline void __DEFAULT_FN_ATTRS2511_mm256_stream_ps(float *__p, __m256 __a)2512{2513__builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);2514}25152516/* Create vectors */2517static __inline__ __m256d __DEFAULT_FN_ATTRS2518_mm256_undefined_pd(void)2519{2520return (__m256d)__builtin_ia32_undef256();2521}25222523static __inline__ __m256 __DEFAULT_FN_ATTRS2524_mm256_undefined_ps(void)2525{2526return (__m256)__builtin_ia32_undef256();2527}25282529static __inline__ __m256i __DEFAULT_FN_ATTRS2530_mm256_undefined_si256(void)2531{2532return (__m256i)__builtin_ia32_undef256();2533}25342535static __inline __m256d __DEFAULT_FN_ATTRS2536_mm256_set_pd(double __a, double __b, double __c, double __d)2537{2538return (__m256d){ __d, __c, __b, __a 
};2539}25402541static __inline __m256 __DEFAULT_FN_ATTRS2542_mm256_set_ps(float __a, float __b, float __c, float __d,2543float __e, float __f, float __g, float __h)2544{2545return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };2546}25472548static __inline __m256i __DEFAULT_FN_ATTRS2549_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,2550int __i4, int __i5, int __i6, int __i7)2551{2552return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };2553}25542555static __inline __m256i __DEFAULT_FN_ATTRS2556_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,2557short __w11, short __w10, short __w09, short __w08,2558short __w07, short __w06, short __w05, short __w04,2559short __w03, short __w02, short __w01, short __w00)2560{2561return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,2562__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };2563}25642565static __inline __m256i __DEFAULT_FN_ATTRS2566_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,2567char __b27, char __b26, char __b25, char __b24,2568char __b23, char __b22, char __b21, char __b20,2569char __b19, char __b18, char __b17, char __b16,2570char __b15, char __b14, char __b13, char __b12,2571char __b11, char __b10, char __b09, char __b08,2572char __b07, char __b06, char __b05, char __b04,2573char __b03, char __b02, char __b01, char __b00)2574{2575return (__m256i)(__v32qi){2576__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,2577__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,2578__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,2579__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b312580};2581}25822583static __inline __m256i __DEFAULT_FN_ATTRS2584_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)2585{2586return (__m256i)(__v4di){ __d, __c, __b, __a };2587}25882589/* Create vectors with elements in reverse order */2590static __inline __m256d __DEFAULT_FN_ATTRS2591_mm256_setr_pd(double 
__a, double __b, double __c, double __d)2592{2593return (__m256d){ __a, __b, __c, __d };2594}25952596static __inline __m256 __DEFAULT_FN_ATTRS2597_mm256_setr_ps(float __a, float __b, float __c, float __d,2598float __e, float __f, float __g, float __h)2599{2600return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };2601}26022603static __inline __m256i __DEFAULT_FN_ATTRS2604_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,2605int __i4, int __i5, int __i6, int __i7)2606{2607return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };2608}26092610static __inline __m256i __DEFAULT_FN_ATTRS2611_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,2612short __w11, short __w10, short __w09, short __w08,2613short __w07, short __w06, short __w05, short __w04,2614short __w03, short __w02, short __w01, short __w00)2615{2616return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,2617__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };2618}26192620static __inline __m256i __DEFAULT_FN_ATTRS2621_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,2622char __b27, char __b26, char __b25, char __b24,2623char __b23, char __b22, char __b21, char __b20,2624char __b19, char __b18, char __b17, char __b16,2625char __b15, char __b14, char __b13, char __b12,2626char __b11, char __b10, char __b09, char __b08,2627char __b07, char __b06, char __b05, char __b04,2628char __b03, char __b02, char __b01, char __b00)2629{2630return (__m256i)(__v32qi){2631__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,2632__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,2633__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,2634__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };2635}26362637static __inline __m256i __DEFAULT_FN_ATTRS2638_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)2639{2640return (__m256i)(__v4di){ __a, __b, __c, __d };2641}26422643/* Create vectors with repeated 
elements */2644static __inline __m256d __DEFAULT_FN_ATTRS2645_mm256_set1_pd(double __w)2646{2647return (__m256d){ __w, __w, __w, __w };2648}26492650static __inline __m256 __DEFAULT_FN_ATTRS2651_mm256_set1_ps(float __w)2652{2653return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };2654}26552656static __inline __m256i __DEFAULT_FN_ATTRS2657_mm256_set1_epi32(int __i)2658{2659return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };2660}26612662static __inline __m256i __DEFAULT_FN_ATTRS2663_mm256_set1_epi16(short __w)2664{2665return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,2666__w, __w, __w, __w, __w, __w };2667}26682669static __inline __m256i __DEFAULT_FN_ATTRS2670_mm256_set1_epi8(char __b)2671{2672return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,2673__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,2674__b, __b, __b, __b, __b, __b, __b };2675}26762677static __inline __m256i __DEFAULT_FN_ATTRS2678_mm256_set1_epi64x(long long __q)2679{2680return (__m256i)(__v4di){ __q, __q, __q, __q };2681}26822683/* Create __zeroed vectors */2684static __inline __m256d __DEFAULT_FN_ATTRS2685_mm256_setzero_pd(void)2686{2687return (__m256d){ 0, 0, 0, 0 };2688}26892690static __inline __m256 __DEFAULT_FN_ATTRS2691_mm256_setzero_ps(void)2692{2693return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };2694}26952696static __inline __m256i __DEFAULT_FN_ATTRS2697_mm256_setzero_si256(void)2698{2699return (__m256i){ 0LL, 0LL, 0LL, 0LL };2700}27012702/* Cast between vector types */2703static __inline __m256 __DEFAULT_FN_ATTRS2704_mm256_castpd_ps(__m256d __a)2705{2706return (__m256)__a;2707}27082709static __inline __m256i __DEFAULT_FN_ATTRS2710_mm256_castpd_si256(__m256d __a)2711{2712return (__m256i)__a;2713}27142715static __inline __m256d __DEFAULT_FN_ATTRS2716_mm256_castps_pd(__m256 __a)2717{2718return (__m256d)__a;2719}27202721static __inline __m256i __DEFAULT_FN_ATTRS2722_mm256_castps_si256(__m256 
__a)2723{2724return (__m256i)__a;2725}27262727static __inline __m256 __DEFAULT_FN_ATTRS2728_mm256_castsi256_ps(__m256i __a)2729{2730return (__m256)__a;2731}27322733static __inline __m256d __DEFAULT_FN_ATTRS2734_mm256_castsi256_pd(__m256i __a)2735{2736return (__m256d)__a;2737}27382739static __inline __m128d __DEFAULT_FN_ATTRS2740_mm256_castpd256_pd128(__m256d __a)2741{2742return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);2743}27442745static __inline __m128 __DEFAULT_FN_ATTRS2746_mm256_castps256_ps128(__m256 __a)2747{2748return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);2749}27502751static __inline __m128i __DEFAULT_FN_ATTRS2752_mm256_castsi256_si128(__m256i __a)2753{2754return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);2755}27562757static __inline __m256d __DEFAULT_FN_ATTRS2758_mm256_castpd128_pd256(__m128d __a)2759{2760return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);2761}27622763static __inline __m256 __DEFAULT_FN_ATTRS2764_mm256_castps128_ps256(__m128 __a)2765{2766return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);2767}27682769static __inline __m256i __DEFAULT_FN_ATTRS2770_mm256_castsi128_si256(__m128i __a)2771{2772return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);2773}27742775/*2776Vector insert.2777We use macros rather than inlines because we only want to accept2778invocations where the immediate M is a constant expression.2779*/2780#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \2781(__m256)__builtin_shufflevector( \2782(__v8sf)(__m256)(V1), \2783(__v8sf)_mm256_castps128_ps256((__m128)(V2)), \2784(((M) & 1) ? 0 : 8), \2785(((M) & 1) ? 1 : 9), \2786(((M) & 1) ? 2 : 10), \2787(((M) & 1) ? 3 : 11), \2788(((M) & 1) ? 8 : 4), \2789(((M) & 1) ? 9 : 5), \2790(((M) & 1) ? 10 : 6), \2791(((M) & 1) ? 
11 : 7) );})27922793#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \2794(__m256d)__builtin_shufflevector( \2795(__v4df)(__m256d)(V1), \2796(__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \2797(((M) & 1) ? 0 : 4), \2798(((M) & 1) ? 1 : 5), \2799(((M) & 1) ? 4 : 2), \2800(((M) & 1) ? 5 : 3) );})28012802#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \2803(__m256i)__builtin_shufflevector( \2804(__v4di)(__m256i)(V1), \2805(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \2806(((M) & 1) ? 0 : 4), \2807(((M) & 1) ? 1 : 5), \2808(((M) & 1) ? 4 : 2), \2809(((M) & 1) ? 5 : 3) );})28102811/*2812Vector extract.2813We use macros rather than inlines because we only want to accept2814invocations where the immediate M is a constant expression.2815*/2816#define _mm256_extractf128_ps(V, M) __extension__ ({ \2817(__m128)__builtin_shufflevector( \2818(__v8sf)(__m256)(V), \2819(__v8sf)(_mm256_undefined_ps()), \2820(((M) & 1) ? 4 : 0), \2821(((M) & 1) ? 5 : 1), \2822(((M) & 1) ? 6 : 2), \2823(((M) & 1) ? 7 : 3) );})28242825#define _mm256_extractf128_pd(V, M) __extension__ ({ \2826(__m128d)__builtin_shufflevector( \2827(__v4df)(__m256d)(V), \2828(__v4df)(_mm256_undefined_pd()), \2829(((M) & 1) ? 2 : 0), \2830(((M) & 1) ? 3 : 1) );})28312832#define _mm256_extractf128_si256(V, M) __extension__ ({ \2833(__m128i)__builtin_shufflevector( \2834(__v4di)(__m256i)(V), \2835(__v4di)(_mm256_undefined_si256()), \2836(((M) & 1) ? 2 : 0), \2837(((M) & 1) ? 
3 : 1) );})28382839/* SIMD load ops (unaligned) */2840static __inline __m256 __DEFAULT_FN_ATTRS2841_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)2842{2843__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));2844return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);2845}28462847static __inline __m256d __DEFAULT_FN_ATTRS2848_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)2849{2850__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));2851return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);2852}28532854static __inline __m256i __DEFAULT_FN_ATTRS2855_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)2856{2857__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));2858return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);2859}28602861/* SIMD store ops (unaligned) */2862static __inline void __DEFAULT_FN_ATTRS2863_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)2864{2865__m128 __v128;28662867__v128 = _mm256_castps256_ps128(__a);2868_mm_storeu_ps(__addr_lo, __v128);2869__v128 = _mm256_extractf128_ps(__a, 1);2870_mm_storeu_ps(__addr_hi, __v128);2871}28722873static __inline void __DEFAULT_FN_ATTRS2874_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)2875{2876__m128d __v128;28772878__v128 = _mm256_castpd256_pd128(__a);2879_mm_storeu_pd(__addr_lo, __v128);2880__v128 = _mm256_extractf128_pd(__a, 1);2881_mm_storeu_pd(__addr_hi, __v128);2882}28832884static __inline void __DEFAULT_FN_ATTRS2885_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)2886{2887__m128i __v128;28882889__v128 = _mm256_castsi256_si128(__a);2890_mm_storeu_si128(__addr_lo, __v128);2891__v128 = _mm256_extractf128_si256(__a, 1);2892_mm_storeu_si128(__addr_hi, __v128);2893}28942895static __inline __m256 __DEFAULT_FN_ATTRS2896_mm256_set_m128 (__m128 __hi, __m128 __lo) {2897return (__m256) __builtin_shufflevector((__v4sf)__lo, 
(__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);2898}28992900static __inline __m256d __DEFAULT_FN_ATTRS2901_mm256_set_m128d (__m128d __hi, __m128d __lo) {2902return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);2903}29042905static __inline __m256i __DEFAULT_FN_ATTRS2906_mm256_set_m128i (__m128i __hi, __m128i __lo) {2907return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);2908}29092910static __inline __m256 __DEFAULT_FN_ATTRS2911_mm256_setr_m128 (__m128 __lo, __m128 __hi) {2912return _mm256_set_m128(__hi, __lo);2913}29142915static __inline __m256d __DEFAULT_FN_ATTRS2916_mm256_setr_m128d (__m128d __lo, __m128d __hi) {2917return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);2918}29192920static __inline __m256i __DEFAULT_FN_ATTRS2921_mm256_setr_m128i (__m128i __lo, __m128i __hi) {2922return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);2923}29242925#undef __DEFAULT_FN_ATTRS29262927#endif /* __AVXINTRIN_H */292829292930