/* Path: blob/main/contrib/llvm-project/clang/lib/Headers/avx2intrin.h
 * 35233 views
 */
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===-----------------------------------------------------------------------===7*/89#ifndef __IMMINTRIN_H10#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."11#endif1213#ifndef __AVX2INTRIN_H14#define __AVX2INTRIN_H1516/* Define the default attributes for the functions in this file. */17#define __DEFAULT_FN_ATTRS256 \18__attribute__((__always_inline__, __nodebug__, \19__target__("avx2,no-evex512"), __min_vector_width__(256)))20#define __DEFAULT_FN_ATTRS128 \21__attribute__((__always_inline__, __nodebug__, \22__target__("avx2,no-evex512"), __min_vector_width__(128)))2324/* SSE4 Multiple Packed Sums of Absolute Difference. */25/// Computes sixteen sum of absolute difference (SAD) operations on sets of26/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and27/// \a Y.28///29/// Eight SAD results are computed using the lower half of the input30/// vectors, and another eight using the upper half. These 16-bit values31/// are returned in the lower and upper halves of the 256-bit result,32/// respectively.33///34/// A single SAD operation selects four bytes from \a X and four bytes from35/// \a Y as input. It computes the differences between each \a X byte and36/// the corresponding \a Y byte, takes the absolute value of each37/// difference, and sums these four values to form one 16-bit result. The38/// intrinsic computes 16 of these results with different sets of input39/// bytes.40///41/// For each set of eight results, the SAD operations use the same four42/// bytes from \a Y; the starting bit position for these four bytes is43/// specified by \a M[1:0] times 32. 
The eight operations use successive44/// sets of four bytes from \a X; the starting bit position for the first45/// set of four bytes is specified by \a M[2] times 32. These bit positions46/// are all relative to the 128-bit lane for each set of eight operations.47///48/// \code{.operation}49/// r := 050/// FOR i := 0 TO 151/// j := i*352/// Ybase := M[j+1:j]*32 + i*12853/// Xbase := M[j+2]*32 + i*12854/// FOR k := 0 TO 355/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])56/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])57/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])58/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])59/// result[r+15:r] := temp0 + temp1 + temp2 + temp360/// Xbase := Xbase + 861/// r := r + 1662/// ENDFOR63/// ENDFOR64/// \endcode65///66/// \headerfile <immintrin.h>67///68/// \code69/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);70/// \endcode71///72/// This intrinsic corresponds to the \c VMPSADBW instruction.73///74/// \param X75/// A 256-bit integer vector containing one of the inputs.76/// \param Y77/// A 256-bit integer vector containing one of the inputs.78/// \param M79/// An unsigned immediate value specifying the starting positions of the80/// bytes to operate on.81/// \returns A 256-bit vector of [16 x i16] containing the result.82#define _mm256_mpsadbw_epu8(X, Y, M) \83((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \84(__v32qi)(__m256i)(Y), (int)(M)))8586/// Computes the absolute value of each signed byte in the 256-bit integer87/// vector \a __a and returns each value in the corresponding byte of88/// the result.89///90/// \headerfile <immintrin.h>91///92/// This intrinsic corresponds to the \c VPABSB instruction.93///94/// \param __a95/// A 256-bit integer vector.96/// \returns A 256-bit integer vector containing the result.97static __inline__ __m256i __DEFAULT_FN_ATTRS25698_mm256_abs_epi8(__m256i __a)99{100return 
(__m256i)__builtin_elementwise_abs((__v32qs)__a);101}102103/// Computes the absolute value of each signed 16-bit element in the 256-bit104/// vector of [16 x i16] in \a __a and returns each value in the105/// corresponding element of the result.106///107/// \headerfile <immintrin.h>108///109/// This intrinsic corresponds to the \c VPABSW instruction.110///111/// \param __a112/// A 256-bit vector of [16 x i16].113/// \returns A 256-bit vector of [16 x i16] containing the result.114static __inline__ __m256i __DEFAULT_FN_ATTRS256115_mm256_abs_epi16(__m256i __a)116{117return (__m256i)__builtin_elementwise_abs((__v16hi)__a);118}119120/// Computes the absolute value of each signed 32-bit element in the 256-bit121/// vector of [8 x i32] in \a __a and returns each value in the122/// corresponding element of the result.123///124/// \headerfile <immintrin.h>125///126/// This intrinsic corresponds to the \c VPABSD instruction.127///128/// \param __a129/// A 256-bit vector of [8 x i32].130/// \returns A 256-bit vector of [8 x i32] containing the result.131static __inline__ __m256i __DEFAULT_FN_ATTRS256132_mm256_abs_epi32(__m256i __a)133{134return (__m256i)__builtin_elementwise_abs((__v8si)__a);135}136137/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit138/// integers using signed saturation, and returns the 256-bit result.139///140/// \code{.operation}141/// FOR i := 0 TO 7142/// j := i*16143/// k := i*8144/// result[7+k:k] := SATURATE8(__a[15+j:j])145/// result[71+k:64+k] := SATURATE8(__b[15+j:j])146/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])147/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])148/// ENDFOR149/// \endcode150///151/// \headerfile <immintrin.h>152///153/// This intrinsic corresponds to the \c VPACKSSWB instruction.154///155/// \param __a156/// A 256-bit vector of [16 x i16] used to generate result[63:0] and157/// result[191:128].158/// \param __b159/// A 256-bit vector of [16 x i16] used to generate result[127:64] and160/// 
result[255:192].161/// \returns A 256-bit integer vector containing the result.162static __inline__ __m256i __DEFAULT_FN_ATTRS256163_mm256_packs_epi16(__m256i __a, __m256i __b)164{165return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);166}167168/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit169/// integers using signed saturation, and returns the resulting 256-bit170/// vector of [16 x i16].171///172/// \code{.operation}173/// FOR i := 0 TO 3174/// j := i*32175/// k := i*16176/// result[15+k:k] := SATURATE16(__a[31+j:j])177/// result[79+k:64+k] := SATURATE16(__b[31+j:j])178/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])179/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])180/// ENDFOR181/// \endcode182///183/// \headerfile <immintrin.h>184///185/// This intrinsic corresponds to the \c VPACKSSDW instruction.186///187/// \param __a188/// A 256-bit vector of [8 x i32] used to generate result[63:0] and189/// result[191:128].190/// \param __b191/// A 256-bit vector of [8 x i32] used to generate result[127:64] and192/// result[255:192].193/// \returns A 256-bit vector of [16 x i16] containing the result.194static __inline__ __m256i __DEFAULT_FN_ATTRS256195_mm256_packs_epi32(__m256i __a, __m256i __b)196{197return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);198}199200/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers201/// using unsigned saturation, and returns the 256-bit result.202///203/// \code{.operation}204/// FOR i := 0 TO 7205/// j := i*16206/// k := i*8207/// result[7+k:k] := SATURATE8U(__a[15+j:j])208/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])209/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])210/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])211/// ENDFOR212/// \endcode213///214/// \headerfile <immintrin.h>215///216/// This intrinsic corresponds to the \c VPACKUSWB instruction.217///218/// \param __a219/// A 256-bit vector of [16 x i16] used 
to generate result[63:0] and220/// result[191:128].221/// \param __b222/// A 256-bit vector of [16 x i16] used to generate result[127:64] and223/// result[255:192].224/// \returns A 256-bit integer vector containing the result.225static __inline__ __m256i __DEFAULT_FN_ATTRS256226_mm256_packus_epi16(__m256i __a, __m256i __b)227{228return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);229}230231/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers232/// using unsigned saturation, and returns the resulting 256-bit vector of233/// [16 x i16].234///235/// \code{.operation}236/// FOR i := 0 TO 3237/// j := i*32238/// k := i*16239/// result[15+k:k] := SATURATE16U(__V1[31+j:j])240/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])241/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])242/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])243/// ENDFOR244/// \endcode245///246/// \headerfile <immintrin.h>247///248/// This intrinsic corresponds to the \c VPACKUSDW instruction.249///250/// \param __V1251/// A 256-bit vector of [8 x i32] used to generate result[63:0] and252/// result[191:128].253/// \param __V2254/// A 256-bit vector of [8 x i32] used to generate result[127:64] and255/// result[255:192].256/// \returns A 256-bit vector of [16 x i16] containing the result.257static __inline__ __m256i __DEFAULT_FN_ATTRS256258_mm256_packus_epi32(__m256i __V1, __m256i __V2)259{260return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);261}262263/// Adds 8-bit integers from corresponding bytes of two 256-bit integer264/// vectors and returns the lower 8 bits of each sum in the corresponding265/// byte of the 256-bit integer vector result (overflow is ignored).266///267/// \headerfile <immintrin.h>268///269/// This intrinsic corresponds to the \c VPADDB instruction.270///271/// \param __a272/// A 256-bit integer vector containing one of the source operands.273/// \param __b274/// A 256-bit integer vector containing 
one of the source operands.275/// \returns A 256-bit integer vector containing the sums.276static __inline__ __m256i __DEFAULT_FN_ATTRS256277_mm256_add_epi8(__m256i __a, __m256i __b)278{279return (__m256i)((__v32qu)__a + (__v32qu)__b);280}281282/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of283/// [16 x i16] and returns the lower 16 bits of each sum in the284/// corresponding element of the [16 x i16] result (overflow is ignored).285///286/// \headerfile <immintrin.h>287///288/// This intrinsic corresponds to the \c VPADDW instruction.289///290/// \param __a291/// A 256-bit vector of [16 x i16] containing one of the source operands.292/// \param __b293/// A 256-bit vector of [16 x i16] containing one of the source operands.294/// \returns A 256-bit vector of [16 x i16] containing the sums.295static __inline__ __m256i __DEFAULT_FN_ATTRS256296_mm256_add_epi16(__m256i __a, __m256i __b)297{298return (__m256i)((__v16hu)__a + (__v16hu)__b);299}300301/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of302/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding303/// element of the [8 x i32] result (overflow is ignored).304///305/// \headerfile <immintrin.h>306///307/// This intrinsic corresponds to the \c VPADDD instruction.308///309/// \param __a310/// A 256-bit vector of [8 x i32] containing one of the source operands.311/// \param __b312/// A 256-bit vector of [8 x i32] containing one of the source operands.313/// \returns A 256-bit vector of [8 x i32] containing the sums.314static __inline__ __m256i __DEFAULT_FN_ATTRS256315_mm256_add_epi32(__m256i __a, __m256i __b)316{317return (__m256i)((__v8su)__a + (__v8su)__b);318}319320/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of321/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding322/// element of the [4 x i64] result (overflow is ignored).323///324/// \headerfile <immintrin.h>325///326/// This 
intrinsic corresponds to the \c VPADDQ instruction.327///328/// \param __a329/// A 256-bit vector of [4 x i64] containing one of the source operands.330/// \param __b331/// A 256-bit vector of [4 x i64] containing one of the source operands.332/// \returns A 256-bit vector of [4 x i64] containing the sums.333static __inline__ __m256i __DEFAULT_FN_ATTRS256334_mm256_add_epi64(__m256i __a, __m256i __b)335{336return (__m256i)((__v4du)__a + (__v4du)__b);337}338339/// Adds 8-bit integers from corresponding bytes of two 256-bit integer340/// vectors using signed saturation, and returns each sum in the341/// corresponding byte of the 256-bit integer vector result.342///343/// \headerfile <immintrin.h>344///345/// This intrinsic corresponds to the \c VPADDSB instruction.346///347/// \param __a348/// A 256-bit integer vector containing one of the source operands.349/// \param __b350/// A 256-bit integer vector containing one of the source operands.351/// \returns A 256-bit integer vector containing the sums.352static __inline__ __m256i __DEFAULT_FN_ATTRS256353_mm256_adds_epi8(__m256i __a, __m256i __b)354{355return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);356}357358/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of359/// [16 x i16] using signed saturation, and returns the [16 x i16] result.360///361/// \headerfile <immintrin.h>362///363/// This intrinsic corresponds to the \c VPADDSW instruction.364///365/// \param __a366/// A 256-bit vector of [16 x i16] containing one of the source operands.367/// \param __b368/// A 256-bit vector of [16 x i16] containing one of the source operands.369/// \returns A 256-bit vector of [16 x i16] containing the sums.370static __inline__ __m256i __DEFAULT_FN_ATTRS256371_mm256_adds_epi16(__m256i __a, __m256i __b)372{373return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);374}375376/// Adds 8-bit integers from corresponding bytes of two 256-bit integer377/// vectors using 
unsigned saturation, and returns each sum in the378/// corresponding byte of the 256-bit integer vector result.379///380/// \headerfile <immintrin.h>381///382/// This intrinsic corresponds to the \c VPADDUSB instruction.383///384/// \param __a385/// A 256-bit integer vector containing one of the source operands.386/// \param __b387/// A 256-bit integer vector containing one of the source operands.388/// \returns A 256-bit integer vector containing the sums.389static __inline__ __m256i __DEFAULT_FN_ATTRS256390_mm256_adds_epu8(__m256i __a, __m256i __b)391{392return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);393}394395/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of396/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.397///398/// \headerfile <immintrin.h>399///400/// This intrinsic corresponds to the \c VPADDUSW instruction.401///402/// \param __a403/// A 256-bit vector of [16 x i16] containing one of the source operands.404/// \param __b405/// A 256-bit vector of [16 x i16] containing one of the source operands.406/// \returns A 256-bit vector of [16 x i16] containing the sums.407static __inline__ __m256i __DEFAULT_FN_ATTRS256408_mm256_adds_epu16(__m256i __a, __m256i __b)409{410return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);411}412413/// Uses the lower half of the 256-bit vector \a a as the upper half of a414/// temporary 256-bit value, and the lower half of the 256-bit vector \a b415/// as the lower half of the temporary value. Right-shifts the temporary416/// value by \a n bytes, and uses the lower 16 bytes of the shifted value417/// as the lower 16 bytes of the result. 
Uses the upper halves of \a a and418/// \a b to make another temporary value, right shifts by \a n, and uses419/// the lower 16 bytes of the shifted value as the upper 16 bytes of the420/// result.421///422/// \headerfile <immintrin.h>423///424/// \code425/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);426/// \endcode427///428/// This intrinsic corresponds to the \c VPALIGNR instruction.429///430/// \param a431/// A 256-bit integer vector containing source values.432/// \param b433/// A 256-bit integer vector containing source values.434/// \param n435/// An immediate value specifying the number of bytes to shift.436/// \returns A 256-bit integer vector containing the result.437#define _mm256_alignr_epi8(a, b, n) \438((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \439(__v32qi)(__m256i)(b), (n)))440441/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and442/// \a __b.443///444/// \headerfile <immintrin.h>445///446/// This intrinsic corresponds to the \c VPAND instruction.447///448/// \param __a449/// A 256-bit integer vector.450/// \param __b451/// A 256-bit integer vector.452/// \returns A 256-bit integer vector containing the result.453static __inline__ __m256i __DEFAULT_FN_ATTRS256454_mm256_and_si256(__m256i __a, __m256i __b)455{456return (__m256i)((__v4du)__a & (__v4du)__b);457}458459/// Computes the bitwise AND of the 256-bit integer vector in \a __b with460/// the bitwise NOT of the 256-bit integer vector in \a __a.461///462/// \headerfile <immintrin.h>463///464/// This intrinsic corresponds to the \c VPANDN instruction.465///466/// \param __a467/// A 256-bit integer vector.468/// \param __b469/// A 256-bit integer vector.470/// \returns A 256-bit integer vector containing the result.471static __inline__ __m256i __DEFAULT_FN_ATTRS256472_mm256_andnot_si256(__m256i __a, __m256i __b)473{474return (__m256i)(~(__v4du)__a & (__v4du)__b);475}476477/// Computes the averages of the corresponding unsigned bytes in the 
two478/// 256-bit integer vectors in \a __a and \a __b and returns each479/// average in the corresponding byte of the 256-bit result.480///481/// \code{.operation}482/// FOR i := 0 TO 31483/// j := i*8484/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1485/// ENDFOR486/// \endcode487///488/// \headerfile <immintrin.h>489///490/// This intrinsic corresponds to the \c VPAVGB instruction.491///492/// \param __a493/// A 256-bit integer vector.494/// \param __b495/// A 256-bit integer vector.496/// \returns A 256-bit integer vector containing the result.497static __inline__ __m256i __DEFAULT_FN_ATTRS256498_mm256_avg_epu8(__m256i __a, __m256i __b)499{500return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);501}502503/// Computes the averages of the corresponding unsigned 16-bit integers in504/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns505/// each average in the corresponding element of the 256-bit result.506///507/// \code{.operation}508/// FOR i := 0 TO 15509/// j := i*16510/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1511/// ENDFOR512/// \endcode513///514/// \headerfile <immintrin.h>515///516/// This intrinsic corresponds to the \c VPAVGW instruction.517///518/// \param __a519/// A 256-bit vector of [16 x i16].520/// \param __b521/// A 256-bit vector of [16 x i16].522/// \returns A 256-bit vector of [16 x i16] containing the result.523static __inline__ __m256i __DEFAULT_FN_ATTRS256524_mm256_avg_epu16(__m256i __a, __m256i __b)525{526return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);527}528529/// Merges 8-bit integer values from either of the two 256-bit vectors530/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns531/// the resulting 256-bit integer vector.532///533/// \code{.operation}534/// FOR i := 0 TO 31535/// j := i*8536/// IF __M[7+i] == 0537/// result[7+j:j] := __V1[7+j:j]538/// ELSE539/// result[7+j:j] := __V2[7+j:j]540/// FI541/// ENDFOR542/// 
\endcode543///544/// \headerfile <immintrin.h>545///546/// This intrinsic corresponds to the \c VPBLENDVB instruction.547///548/// \param __V1549/// A 256-bit integer vector containing source values.550/// \param __V2551/// A 256-bit integer vector containing source values.552/// \param __M553/// A 256-bit integer vector, with bit [7] of each byte specifying the554/// source for each corresponding byte of the result. When the mask bit555/// is 0, the byte is copied from \a __V1; otherwise, it is copied from556/// \a __V2.557/// \returns A 256-bit integer vector containing the result.558static __inline__ __m256i __DEFAULT_FN_ATTRS256559_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)560{561return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,562(__v32qi)__M);563}564565/// Merges 16-bit integer values from either of the two 256-bit vectors566/// \a V1 or \a V2, as specified by the immediate integer operand \a M,567/// and returns the resulting 256-bit vector of [16 x i16].568///569/// \code{.operation}570/// FOR i := 0 TO 7571/// j := i*16572/// IF M[i] == 0573/// result[7+j:j] := V1[7+j:j]574/// result[135+j:128+j] := V1[135+j:128+j]575/// ELSE576/// result[7+j:j] := V2[7+j:j]577/// result[135+j:128+j] := V2[135+j:128+j]578/// FI579/// ENDFOR580/// \endcode581///582/// \headerfile <immintrin.h>583///584/// \code585/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);586/// \endcode587///588/// This intrinsic corresponds to the \c VPBLENDW instruction.589///590/// \param V1591/// A 256-bit vector of [16 x i16] containing source values.592/// \param V2593/// A 256-bit vector of [16 x i16] containing source values.594/// \param M595/// An immediate 8-bit integer operand, with bits [7:0] specifying the596/// source for each element of the result. The position of the mask bit597/// corresponds to the index of a copied value. 
When a mask bit is 0, the598/// element is copied from \a V1; otherwise, it is copied from \a V2.599/// \a M[0] determines the source for elements 0 and 8, \a M[1] for600/// elements 1 and 9, and so forth.601/// \returns A 256-bit vector of [16 x i16] containing the result.602#define _mm256_blend_epi16(V1, V2, M) \603((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \604(__v16hi)(__m256i)(V2), (int)(M)))605606/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and607/// \a __b for equality and returns the outcomes in the corresponding608/// bytes of the 256-bit result.609///610/// \code{.operation}611/// FOR i := 0 TO 31612/// j := i*8613/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0614/// ENDFOR615/// \endcode616///617/// \headerfile <immintrin.h>618///619/// This intrinsic corresponds to the \c VPCMPEQB instruction.620///621/// \param __a622/// A 256-bit integer vector containing one of the inputs.623/// \param __b624/// A 256-bit integer vector containing one of the inputs.625/// \returns A 256-bit integer vector containing the result.626static __inline__ __m256i __DEFAULT_FN_ATTRS256627_mm256_cmpeq_epi8(__m256i __a, __m256i __b)628{629return (__m256i)((__v32qi)__a == (__v32qi)__b);630}631632/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in633/// \a __a and \a __b for equality and returns the outcomes in the634/// corresponding elements of the 256-bit result.635///636/// \code{.operation}637/// FOR i := 0 TO 15638/// j := i*16639/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 
0xFFFF : 0640/// ENDFOR641/// \endcode642///643/// \headerfile <immintrin.h>644///645/// This intrinsic corresponds to the \c VPCMPEQW instruction.646///647/// \param __a648/// A 256-bit vector of [16 x i16] containing one of the inputs.649/// \param __b650/// A 256-bit vector of [16 x i16] containing one of the inputs.651/// \returns A 256-bit vector of [16 x i16] containing the result.652static __inline__ __m256i __DEFAULT_FN_ATTRS256653_mm256_cmpeq_epi16(__m256i __a, __m256i __b)654{655return (__m256i)((__v16hi)__a == (__v16hi)__b);656}657658/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in659/// \a __a and \a __b for equality and returns the outcomes in the660/// corresponding elements of the 256-bit result.661///662/// \code{.operation}663/// FOR i := 0 TO 7664/// j := i*32665/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0666/// ENDFOR667/// \endcode668///669/// \headerfile <immintrin.h>670///671/// This intrinsic corresponds to the \c VPCMPEQD instruction.672///673/// \param __a674/// A 256-bit vector of [8 x i32] containing one of the inputs.675/// \param __b676/// A 256-bit vector of [8 x i32] containing one of the inputs.677/// \returns A 256-bit vector of [8 x i32] containing the result.678static __inline__ __m256i __DEFAULT_FN_ATTRS256679_mm256_cmpeq_epi32(__m256i __a, __m256i __b)680{681return (__m256i)((__v8si)__a == (__v8si)__b);682}683684/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in685/// \a __a and \a __b for equality and returns the outcomes in the686/// corresponding elements of the 256-bit result.687///688/// \code{.operation}689/// FOR i := 0 TO 3690/// j := i*64691/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 
0xFFFFFFFFFFFFFFFF : 0692/// ENDFOR693/// \endcode694///695/// \headerfile <immintrin.h>696///697/// This intrinsic corresponds to the \c VPCMPEQQ instruction.698///699/// \param __a700/// A 256-bit vector of [4 x i64] containing one of the inputs.701/// \param __b702/// A 256-bit vector of [4 x i64] containing one of the inputs.703/// \returns A 256-bit vector of [4 x i64] containing the result.704static __inline__ __m256i __DEFAULT_FN_ATTRS256705_mm256_cmpeq_epi64(__m256i __a, __m256i __b)706{707return (__m256i)((__v4di)__a == (__v4di)__b);708}709710/// Compares corresponding signed bytes in the 256-bit integer vectors in711/// \a __a and \a __b for greater-than and returns the outcomes in the712/// corresponding bytes of the 256-bit result.713///714/// \code{.operation}715/// FOR i := 0 TO 31716/// j := i*8717/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0718/// ENDFOR719/// \endcode720///721/// \headerfile <immintrin.h>722///723/// This intrinsic corresponds to the \c VPCMPGTB instruction.724///725/// \param __a726/// A 256-bit integer vector containing one of the inputs.727/// \param __b728/// A 256-bit integer vector containing one of the inputs.729/// \returns A 256-bit integer vector containing the result.730static __inline__ __m256i __DEFAULT_FN_ATTRS256731_mm256_cmpgt_epi8(__m256i __a, __m256i __b)732{733/* This function always performs a signed comparison, but __v32qi is a char734which may be signed or unsigned, so use __v32qs. */735return (__m256i)((__v32qs)__a > (__v32qs)__b);736}737738/// Compares corresponding signed elements in the 256-bit vectors of739/// [16 x i16] in \a __a and \a __b for greater-than and returns the740/// outcomes in the corresponding elements of the 256-bit result.741///742/// \code{.operation}743/// FOR i := 0 TO 15744/// j := i*16745/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 
0xFFFF : 0746/// ENDFOR747/// \endcode748///749/// \headerfile <immintrin.h>750///751/// This intrinsic corresponds to the \c VPCMPGTW instruction.752///753/// \param __a754/// A 256-bit vector of [16 x i16] containing one of the inputs.755/// \param __b756/// A 256-bit vector of [16 x i16] containing one of the inputs.757/// \returns A 256-bit vector of [16 x i16] containing the result.758static __inline__ __m256i __DEFAULT_FN_ATTRS256759_mm256_cmpgt_epi16(__m256i __a, __m256i __b)760{761return (__m256i)((__v16hi)__a > (__v16hi)__b);762}763764/// Compares corresponding signed elements in the 256-bit vectors of765/// [8 x i32] in \a __a and \a __b for greater-than and returns the766/// outcomes in the corresponding elements of the 256-bit result.767///768/// \code{.operation}769/// FOR i := 0 TO 7770/// j := i*32771/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0772/// ENDFOR773/// \endcode774///775/// \headerfile <immintrin.h>776///777/// This intrinsic corresponds to the \c VPCMPGTD instruction.778///779/// \param __a780/// A 256-bit vector of [8 x i32] containing one of the inputs.781/// \param __b782/// A 256-bit vector of [8 x i32] containing one of the inputs.783/// \returns A 256-bit vector of [8 x i32] containing the result.784static __inline__ __m256i __DEFAULT_FN_ATTRS256785_mm256_cmpgt_epi32(__m256i __a, __m256i __b)786{787return (__m256i)((__v8si)__a > (__v8si)__b);788}789790/// Compares corresponding signed elements in the 256-bit vectors of791/// [4 x i64] in \a __a and \a __b for greater-than and returns the792/// outcomes in the corresponding elements of the 256-bit result.793///794/// \code{.operation}795/// FOR i := 0 TO 3796/// j := i*64797/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 
0xFFFFFFFFFFFFFFFF : 0798/// ENDFOR799/// \endcode800///801/// \headerfile <immintrin.h>802///803/// This intrinsic corresponds to the \c VPCMPGTQ instruction.804///805/// \param __a806/// A 256-bit vector of [4 x i64] containing one of the inputs.807/// \param __b808/// A 256-bit vector of [4 x i64] containing one of the inputs.809/// \returns A 256-bit vector of [4 x i64] containing the result.810static __inline__ __m256i __DEFAULT_FN_ATTRS256811_mm256_cmpgt_epi64(__m256i __a, __m256i __b)812{813return (__m256i)((__v4di)__a > (__v4di)__b);814}815816/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit817/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an818/// element of the [16 x i16] result (overflow is ignored). Sums from819/// \a __a are returned in the lower 64 bits of each 128-bit half of the820/// result; sums from \a __b are returned in the upper 64 bits of each821/// 128-bit half of the result.822///823/// \code{.operation}824/// FOR i := 0 TO 1825/// j := i*128826/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]827/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]828/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]829/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]830/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]831/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]832/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]833/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]834/// ENDFOR835/// \endcode836///837/// \headerfile <immintrin.h>838///839/// This intrinsic corresponds to the \c VPHADDW instruction.840///841/// \param __a842/// A 256-bit vector of [16 x i16] containing one of the source operands.843/// \param __b844/// A 256-bit vector of [16 x i16] containing one of the source operands.845/// \returns A 256-bit vector of [16 x i16] containing the sums.846static __inline__ __m256i __DEFAULT_FN_ATTRS256847_mm256_hadd_epi16(__m256i __a, __m256i 
__b)848{849return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);850}851852/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit853/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an854/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a855/// are returned in the lower 64 bits of each 128-bit half of the result;856/// sums from \a __b are returned in the upper 64 bits of each 128-bit half857/// of the result.858///859/// \code{.operation}860/// FOR i := 0 TO 1861/// j := i*128862/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]863/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]864/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]865/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]866/// ENDFOR867/// \endcode868///869/// \headerfile <immintrin.h>870///871/// This intrinsic corresponds to the \c VPHADDD instruction.872///873/// \param __a874/// A 256-bit vector of [8 x i32] containing one of the source operands.875/// \param __b876/// A 256-bit vector of [8 x i32] containing one of the source operands.877/// \returns A 256-bit vector of [8 x i32] containing the sums.878static __inline__ __m256i __DEFAULT_FN_ATTRS256879_mm256_hadd_epi32(__m256i __a, __m256i __b)880{881return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);882}883884/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit885/// vectors of [16 x i16] using signed saturation and returns each sum in886/// an element of the [16 x i16] result. 
Sums from \a __a are returned in887/// the lower 64 bits of each 128-bit half of the result; sums from \a __b888/// are returned in the upper 64 bits of each 128-bit half of the result.889///890/// \code{.operation}891/// FOR i := 0 TO 1892/// j := i*128893/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])894/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])895/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])896/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])897/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])898/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])899/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])900/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])901/// ENDFOR902/// \endcode903///904/// \headerfile <immintrin.h>905///906/// This intrinsic corresponds to the \c VPHADDSW instruction.907///908/// \param __a909/// A 256-bit vector of [16 x i16] containing one of the source operands.910/// \param __b911/// A 256-bit vector of [16 x i16] containing one of the source operands.912/// \returns A 256-bit vector of [16 x i16] containing the sums.913static __inline__ __m256i __DEFAULT_FN_ATTRS256914_mm256_hadds_epi16(__m256i __a, __m256i __b)915{916return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);917}918919/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit920/// vectors of [16 x i16] and returns the lower 16 bits of each difference921/// in an element of the [16 x i16] result (overflow is ignored).922/// Differences from \a __a are returned in the lower 64 bits of each923/// 128-bit half of the result; differences from \a __b are returned in the924/// upper 64 bits of each 128-bit half of the result.925///926/// \code{.operation}927/// FOR i := 0 TO 1928/// j := i*128929/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]930/// result[j+31:j+16] := __a[j+47:j+32] - 
__a[j+63:j+48]931/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]932/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]933/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]934/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]935/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]936/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]937/// ENDFOR938/// \endcode939///940/// \headerfile <immintrin.h>941///942/// This intrinsic corresponds to the \c VPHSUBW instruction.943///944/// \param __a945/// A 256-bit vector of [16 x i16] containing one of the source operands.946/// \param __b947/// A 256-bit vector of [16 x i16] containing one of the source operands.948/// \returns A 256-bit vector of [16 x i16] containing the differences.949static __inline__ __m256i __DEFAULT_FN_ATTRS256950_mm256_hsub_epi16(__m256i __a, __m256i __b)951{952return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);953}954955/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit956/// vectors of [8 x i32] and returns the lower 32 bits of each difference in957/// an element of the [8 x i32] result (overflow is ignored). 
Differences958/// from \a __a are returned in the lower 64 bits of each 128-bit half of959/// the result; differences from \a __b are returned in the upper 64 bits960/// of each 128-bit half of the result.961///962/// \code{.operation}963/// FOR i := 0 TO 1964/// j := i*128965/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]966/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]967/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]968/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]969/// ENDFOR970/// \endcode971///972/// \headerfile <immintrin.h>973///974/// This intrinsic corresponds to the \c VPHSUBD instruction.975///976/// \param __a977/// A 256-bit vector of [8 x i32] containing one of the source operands.978/// \param __b979/// A 256-bit vector of [8 x i32] containing one of the source operands.980/// \returns A 256-bit vector of [8 x i32] containing the differences.981static __inline__ __m256i __DEFAULT_FN_ATTRS256982_mm256_hsub_epi32(__m256i __a, __m256i __b)983{984return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);985}986987/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit988/// vectors of [16 x i16] using signed saturation and returns each difference in989/// an element of the [16 x i16] result. 
Differences from \a __a are990/// returned in the lower 64 bits of each 128-bit half of the result;991/// differences from \a __b are returned in the upper 64 bits of each992/// 128-bit half of the result.993///994/// \code{.operation}995/// FOR i := 0 TO 1996/// j := i*128997/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])998/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])999/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])1000/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])1001/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])1002/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])1003/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])1004/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])1005/// ENDFOR1006/// \endcode1007///1008/// \headerfile <immintrin.h>1009///1010/// This intrinsic corresponds to the \c VPHSUBSW instruction.1011///1012/// \param __a1013/// A 256-bit vector of [16 x i16] containing one of the source operands.1014/// \param __b1015/// A 256-bit vector of [16 x i16] containing one of the source operands.1016/// \returns A 256-bit vector of [16 x i16] containing the differences.1017static __inline__ __m256i __DEFAULT_FN_ATTRS2561018_mm256_hsubs_epi16(__m256i __a, __m256i __b)1019{1020return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);1021}10221023/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a1024/// with the corresponding signed byte from the 256-bit integer vector in1025/// \a __b, forming signed 16-bit intermediate products. 
Adds adjacent1026/// pairs of those products using signed saturation to form 16-bit sums1027/// returned as elements of the [16 x i16] result.1028///1029/// \code{.operation}1030/// FOR i := 0 TO 151031/// j := i*161032/// temp1 := __a[j+7:j] * __b[j+7:j]1033/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]1034/// result[j+15:j] := SATURATE16(temp1 + temp2)1035/// ENDFOR1036/// \endcode1037///1038/// \headerfile <immintrin.h>1039///1040/// This intrinsic corresponds to the \c VPMADDUBSW instruction.1041///1042/// \param __a1043/// A 256-bit vector containing one of the source operands.1044/// \param __b1045/// A 256-bit vector containing one of the source operands.1046/// \returns A 256-bit vector of [16 x i16] containing the result.1047static __inline__ __m256i __DEFAULT_FN_ATTRS2561048_mm256_maddubs_epi16(__m256i __a, __m256i __b)1049{1050return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);1051}10521053/// Multiplies corresponding 16-bit elements of two 256-bit vectors of1054/// [16 x i16], forming 32-bit intermediate products, and adds pairs of1055/// those products to form 32-bit sums returned as elements of the1056/// [8 x i32] result.1057///1058/// There is only one wraparound case: when all four of the 16-bit sources1059/// are \c 0x8000, the result will be \c 0x80000000.1060///1061/// \code{.operation}1062/// FOR i := 0 TO 71063/// j := i*321064/// temp1 := __a[j+15:j] * __b[j+15:j]1065/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]1066/// result[j+31:j] := temp1 + temp21067/// ENDFOR1068/// \endcode1069///1070/// \headerfile <immintrin.h>1071///1072/// This intrinsic corresponds to the \c VPMADDWD instruction.1073///1074/// \param __a1075/// A 256-bit vector of [16 x i16] containing one of the source operands.1076/// \param __b1077/// A 256-bit vector of [16 x i16] containing one of the source operands.1078/// \returns A 256-bit vector of [8 x i32] containing the result.1079static __inline__ __m256i 
__DEFAULT_FN_ATTRS2561080_mm256_madd_epi16(__m256i __a, __m256i __b)1081{1082return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);1083}10841085/// Compares the corresponding signed bytes in the two 256-bit integer vectors1086/// in \a __a and \a __b and returns the larger of each pair in the1087/// corresponding byte of the 256-bit result.1088///1089/// \headerfile <immintrin.h>1090///1091/// This intrinsic corresponds to the \c VPMAXSB instruction.1092///1093/// \param __a1094/// A 256-bit integer vector.1095/// \param __b1096/// A 256-bit integer vector.1097/// \returns A 256-bit integer vector containing the result.1098static __inline__ __m256i __DEFAULT_FN_ATTRS2561099_mm256_max_epi8(__m256i __a, __m256i __b)1100{1101return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);1102}11031104/// Compares the corresponding signed 16-bit integers in the two 256-bit1105/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of1106/// each pair in the corresponding element of the 256-bit result.1107///1108/// \headerfile <immintrin.h>1109///1110/// This intrinsic corresponds to the \c VPMAXSW instruction.1111///1112/// \param __a1113/// A 256-bit vector of [16 x i16].1114/// \param __b1115/// A 256-bit vector of [16 x i16].1116/// \returns A 256-bit vector of [16 x i16] containing the result.1117static __inline__ __m256i __DEFAULT_FN_ATTRS2561118_mm256_max_epi16(__m256i __a, __m256i __b)1119{1120return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);1121}11221123/// Compares the corresponding signed 32-bit integers in the two 256-bit1124/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of1125/// each pair in the corresponding element of the 256-bit result.1126///1127/// \headerfile <immintrin.h>1128///1129/// This intrinsic corresponds to the \c VPMAXSD instruction.1130///1131/// \param __a1132/// A 256-bit vector of [8 x i32].1133/// \param __b1134/// A 256-bit vector of [8 x i32].1135/// 
\returns A 256-bit vector of [8 x i32] containing the result.1136static __inline__ __m256i __DEFAULT_FN_ATTRS2561137_mm256_max_epi32(__m256i __a, __m256i __b)1138{1139return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);1140}11411142/// Compares the corresponding unsigned bytes in the two 256-bit integer1143/// vectors in \a __a and \a __b and returns the larger of each pair in1144/// the corresponding byte of the 256-bit result.1145///1146/// \headerfile <immintrin.h>1147///1148/// This intrinsic corresponds to the \c VPMAXUB instruction.1149///1150/// \param __a1151/// A 256-bit integer vector.1152/// \param __b1153/// A 256-bit integer vector.1154/// \returns A 256-bit integer vector containing the result.1155static __inline__ __m256i __DEFAULT_FN_ATTRS2561156_mm256_max_epu8(__m256i __a, __m256i __b)1157{1158return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);1159}11601161/// Compares the corresponding unsigned 16-bit integers in the two 256-bit1162/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of1163/// each pair in the corresponding element of the 256-bit result.1164///1165/// \headerfile <immintrin.h>1166///1167/// This intrinsic corresponds to the \c VPMAXUW instruction.1168///1169/// \param __a1170/// A 256-bit vector of [16 x i16].1171/// \param __b1172/// A 256-bit vector of [16 x i16].1173/// \returns A 256-bit vector of [16 x i16] containing the result.1174static __inline__ __m256i __DEFAULT_FN_ATTRS2561175_mm256_max_epu16(__m256i __a, __m256i __b)1176{1177return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);1178}11791180/// Compares the corresponding unsigned 32-bit integers in the two 256-bit1181/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of1182/// each pair in the corresponding element of the 256-bit result.1183///1184/// \headerfile <immintrin.h>1185///1186/// This intrinsic corresponds to the \c VPMAXUD instruction.1187///1188/// \param __a1189/// A 
256-bit vector of [8 x i32].1190/// \param __b1191/// A 256-bit vector of [8 x i32].1192/// \returns A 256-bit vector of [8 x i32] containing the result.1193static __inline__ __m256i __DEFAULT_FN_ATTRS2561194_mm256_max_epu32(__m256i __a, __m256i __b)1195{1196return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);1197}11981199/// Compares the corresponding signed bytes in the two 256-bit integer vectors1200/// in \a __a and \a __b and returns the smaller of each pair in the1201/// corresponding byte of the 256-bit result.1202///1203/// \headerfile <immintrin.h>1204///1205/// This intrinsic corresponds to the \c VPMINSB instruction.1206///1207/// \param __a1208/// A 256-bit integer vector.1209/// \param __b1210/// A 256-bit integer vector.1211/// \returns A 256-bit integer vector containing the result.1212static __inline__ __m256i __DEFAULT_FN_ATTRS2561213_mm256_min_epi8(__m256i __a, __m256i __b)1214{1215return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);1216}12171218/// Compares the corresponding signed 16-bit integers in the two 256-bit1219/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of1220/// each pair in the corresponding element of the 256-bit result.1221///1222/// \headerfile <immintrin.h>1223///1224/// This intrinsic corresponds to the \c VPMINSW instruction.1225///1226/// \param __a1227/// A 256-bit vector of [16 x i16].1228/// \param __b1229/// A 256-bit vector of [16 x i16].1230/// \returns A 256-bit vector of [16 x i16] containing the result.1231static __inline__ __m256i __DEFAULT_FN_ATTRS2561232_mm256_min_epi16(__m256i __a, __m256i __b)1233{1234return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);1235}12361237/// Compares the corresponding signed 32-bit integers in the two 256-bit1238/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of1239/// each pair in the corresponding element of the 256-bit result.1240///1241/// \headerfile <immintrin.h>1242///1243/// This 
intrinsic corresponds to the \c VPMINSD instruction.1244///1245/// \param __a1246/// A 256-bit vector of [8 x i32].1247/// \param __b1248/// A 256-bit vector of [8 x i32].1249/// \returns A 256-bit vector of [8 x i32] containing the result.1250static __inline__ __m256i __DEFAULT_FN_ATTRS2561251_mm256_min_epi32(__m256i __a, __m256i __b)1252{1253return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);1254}12551256/// Compares the corresponding unsigned bytes in the two 256-bit integer1257/// vectors in \a __a and \a __b and returns the smaller of each pair in1258/// the corresponding byte of the 256-bit result.1259///1260/// \headerfile <immintrin.h>1261///1262/// This intrinsic corresponds to the \c VPMINUB instruction.1263///1264/// \param __a1265/// A 256-bit integer vector.1266/// \param __b1267/// A 256-bit integer vector.1268/// \returns A 256-bit integer vector containing the result.1269static __inline__ __m256i __DEFAULT_FN_ATTRS2561270_mm256_min_epu8(__m256i __a, __m256i __b)1271{1272return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);1273}12741275/// Compares the corresponding unsigned 16-bit integers in the two 256-bit1276/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of1277/// each pair in the corresponding element of the 256-bit result.1278///1279/// \headerfile <immintrin.h>1280///1281/// This intrinsic corresponds to the \c VPMINUW instruction.1282///1283/// \param __a1284/// A 256-bit vector of [16 x i16].1285/// \param __b1286/// A 256-bit vector of [16 x i16].1287/// \returns A 256-bit vector of [16 x i16] containing the result.1288static __inline__ __m256i __DEFAULT_FN_ATTRS2561289_mm256_min_epu16(__m256i __a, __m256i __b)1290{1291return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);1292}12931294/// Compares the corresponding unsigned 32-bit integers in the two 256-bit1295/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of1296/// each pair in the 
corresponding element of the 256-bit result.1297///1298/// \headerfile <immintrin.h>1299///1300/// This intrinsic corresponds to the \c VPMINUD instruction.1301///1302/// \param __a1303/// A 256-bit vector of [8 x i32].1304/// \param __b1305/// A 256-bit vector of [8 x i32].1306/// \returns A 256-bit vector of [8 x i32] containing the result.1307static __inline__ __m256i __DEFAULT_FN_ATTRS2561308_mm256_min_epu32(__m256i __a, __m256i __b)1309{1310return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);1311}13121313/// Creates a 32-bit integer mask from the most significant bit of each byte1314/// in the 256-bit integer vector in \a __a and returns the result.1315///1316/// \code{.operation}1317/// FOR i := 0 TO 311318/// j := i*81319/// result[i] := __a[j+7]1320/// ENDFOR1321/// \endcode1322///1323/// \headerfile <immintrin.h>1324///1325/// This intrinsic corresponds to the \c VPMOVMSKB instruction.1326///1327/// \param __a1328/// A 256-bit integer vector containing the source bytes.1329/// \returns The 32-bit integer mask.1330static __inline__ int __DEFAULT_FN_ATTRS2561331_mm256_movemask_epi8(__m256i __a)1332{1333return __builtin_ia32_pmovmskb256((__v32qi)__a);1334}13351336/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns1337/// the 16-bit values in the corresponding elements of a 256-bit vector1338/// of [16 x i16].1339///1340/// \code{.operation}1341/// FOR i := 0 TO 151342/// j := i*81343/// k := i*161344/// result[k+15:k] := SignExtend(__V[j+7:j])1345/// ENDFOR1346/// \endcode1347///1348/// \headerfile <immintrin.h>1349///1350/// This intrinsic corresponds to the \c VPMOVSXBW instruction.1351///1352/// \param __V1353/// A 128-bit integer vector containing the source bytes.1354/// \returns A 256-bit vector of [16 x i16] containing the sign-extended1355/// values.1356static __inline__ __m256i __DEFAULT_FN_ATTRS2561357_mm256_cvtepi8_epi16(__m128i __V)1358{1359/* This function always performs a signed extension, but __v16qi is 
a char1360which may be signed or unsigned, so use __v16qs. */1361return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);1362}13631364/// Sign-extends bytes from the lower half of the 128-bit integer vector in1365/// \a __V and returns the 32-bit values in the corresponding elements of a1366/// 256-bit vector of [8 x i32].1367///1368/// \code{.operation}1369/// FOR i := 0 TO 71370/// j := i*81371/// k := i*321372/// result[k+31:k] := SignExtend(__V[j+7:j])1373/// ENDFOR1374/// \endcode1375///1376/// \headerfile <immintrin.h>1377///1378/// This intrinsic corresponds to the \c VPMOVSXBD instruction.1379///1380/// \param __V1381/// A 128-bit integer vector containing the source bytes.1382/// \returns A 256-bit vector of [8 x i32] containing the sign-extended1383/// values.1384static __inline__ __m256i __DEFAULT_FN_ATTRS2561385_mm256_cvtepi8_epi32(__m128i __V)1386{1387/* This function always performs a signed extension, but __v16qi is a char1388which may be signed or unsigned, so use __v16qs. 
*/1389return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);1390}13911392/// Sign-extends the first four bytes from the 128-bit integer vector in1393/// \a __V and returns the 64-bit values in the corresponding elements of a1394/// 256-bit vector of [4 x i64].1395///1396/// \code{.operation}1397/// result[63:0] := SignExtend(__V[7:0])1398/// result[127:64] := SignExtend(__V[15:8])1399/// result[191:128] := SignExtend(__V[23:16])1400/// result[255:192] := SignExtend(__V[31:24])1401/// \endcode1402///1403/// \headerfile <immintrin.h>1404///1405/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.1406///1407/// \param __V1408/// A 128-bit integer vector containing the source bytes.1409/// \returns A 256-bit vector of [4 x i64] containing the sign-extended1410/// values.1411static __inline__ __m256i __DEFAULT_FN_ATTRS2561412_mm256_cvtepi8_epi64(__m128i __V)1413{1414/* This function always performs a signed extension, but __v16qi is a char1415which may be signed or unsigned, so use __v16qs. 
*/1416return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);1417}14181419/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in1420/// \a __V and returns the 32-bit values in the corresponding elements of a1421/// 256-bit vector of [8 x i32].1422///1423/// \code{.operation}1424/// FOR i := 0 TO 71425/// j := i*161426/// k := i*321427/// result[k+31:k] := SignExtend(__V[j+15:j])1428/// ENDFOR1429/// \endcode1430///1431/// \headerfile <immintrin.h>1432///1433/// This intrinsic corresponds to the \c VPMOVSXWD instruction.1434///1435/// \param __V1436/// A 128-bit vector of [8 x i16] containing the source values.1437/// \returns A 256-bit vector of [8 x i32] containing the sign-extended1438/// values.1439static __inline__ __m256i __DEFAULT_FN_ATTRS2561440_mm256_cvtepi16_epi32(__m128i __V)1441{1442return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);1443}14441445/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of1446/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding1447/// elements of a 256-bit vector of [4 x i64].1448///1449/// \code{.operation}1450/// result[63:0] := SignExtend(__V[15:0])1451/// result[127:64] := SignExtend(__V[31:16])1452/// result[191:128] := SignExtend(__V[47:32])1453/// result[255:192] := SignExtend(__V[63:48])1454/// \endcode1455///1456/// \headerfile <immintrin.h>1457///1458/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.1459///1460/// \param __V1461/// A 128-bit vector of [8 x i16] containing the source values.1462/// \returns A 256-bit vector of [4 x i64] containing the sign-extended1463/// values.1464static __inline__ __m256i __DEFAULT_FN_ATTRS2561465_mm256_cvtepi16_epi64(__m128i __V)1466{1467return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);1468}14691470/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in1471/// \a 
__V and returns the 64-bit values in the corresponding elements of a1472/// 256-bit vector of [4 x i64].1473///1474/// \code{.operation}1475/// result[63:0] := SignExtend(__V[31:0])1476/// result[127:64] := SignExtend(__V[63:32])1477/// result[191:128] := SignExtend(__V[95:64])1478/// result[255:192] := SignExtend(__V[127:96])1479/// \endcode1480///1481/// \headerfile <immintrin.h>1482///1483/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.1484///1485/// \param __V1486/// A 128-bit vector of [4 x i32] containing the source values.1487/// \returns A 256-bit vector of [4 x i64] containing the sign-extended1488/// values.1489static __inline__ __m256i __DEFAULT_FN_ATTRS2561490_mm256_cvtepi32_epi64(__m128i __V)1491{1492return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);1493}14941495/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns1496/// the 16-bit values in the corresponding elements of a 256-bit vector1497/// of [16 x i16].1498///1499/// \code{.operation}1500/// FOR i := 0 TO 151501/// j := i*81502/// k := i*161503/// result[k+15:k] := ZeroExtend(__V[j+7:j])1504/// ENDFOR1505/// \endcode1506///1507/// \headerfile <immintrin.h>1508///1509/// This intrinsic corresponds to the \c VPMOVZXBW instruction.1510///1511/// \param __V1512/// A 128-bit integer vector containing the source bytes.1513/// \returns A 256-bit vector of [16 x i16] containing the zero-extended1514/// values.1515static __inline__ __m256i __DEFAULT_FN_ATTRS2561516_mm256_cvtepu8_epi16(__m128i __V)1517{1518return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);1519}15201521/// Zero-extends bytes from the lower half of the 128-bit integer vector in1522/// \a __V and returns the 32-bit values in the corresponding elements of a1523/// 256-bit vector of [8 x i32].1524///1525/// \code{.operation}1526/// FOR i := 0 TO 71527/// j := i*81528/// k := i*321529/// result[k+31:k] := ZeroExtend(__V[j+7:j])1530/// ENDFOR1531/// \endcode1532///1533/// \headerfile 
<immintrin.h>1534///1535/// This intrinsic corresponds to the \c VPMOVZXBD instruction.1536///1537/// \param __V1538/// A 128-bit integer vector containing the source bytes.1539/// \returns A 256-bit vector of [8 x i32] containing the zero-extended1540/// values.1541static __inline__ __m256i __DEFAULT_FN_ATTRS2561542_mm256_cvtepu8_epi32(__m128i __V)1543{1544return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);1545}15461547/// Zero-extends the first four bytes from the 128-bit integer vector in1548/// \a __V and returns the 64-bit values in the corresponding elements of a1549/// 256-bit vector of [4 x i64].1550///1551/// \code{.operation}1552/// result[63:0] := ZeroExtend(__V[7:0])1553/// result[127:64] := ZeroExtend(__V[15:8])1554/// result[191:128] := ZeroExtend(__V[23:16])1555/// result[255:192] := ZeroExtend(__V[31:24])1556/// \endcode1557///1558/// \headerfile <immintrin.h>1559///1560/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.1561///1562/// \param __V1563/// A 128-bit integer vector containing the source bytes.1564/// \returns A 256-bit vector of [4 x i64] containing the zero-extended1565/// values.1566static __inline__ __m256i __DEFAULT_FN_ATTRS2561567_mm256_cvtepu8_epi64(__m128i __V)1568{1569return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);1570}15711572/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in1573/// \a __V and returns the 32-bit values in the corresponding elements of a1574/// 256-bit vector of [8 x i32].1575///1576/// \code{.operation}1577/// FOR i := 0 TO 71578/// j := i*161579/// k := i*321580/// result[k+31:k] := ZeroExtend(__V[j+15:j])1581/// ENDFOR1582/// \endcode1583///1584/// \headerfile <immintrin.h>1585///1586/// This intrinsic corresponds to the \c VPMOVZXWD instruction.1587///1588/// \param __V1589/// A 128-bit vector of [8 x i16] containing the source 
values.1590/// \returns A 256-bit vector of [8 x i32] containing the zero-extended1591/// values.1592static __inline__ __m256i __DEFAULT_FN_ATTRS2561593_mm256_cvtepu16_epi32(__m128i __V)1594{1595return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);1596}15971598/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of1599/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding1600/// elements of a 256-bit vector of [4 x i64].1601///1602/// \code{.operation}1603/// result[63:0] := ZeroExtend(__V[15:0])1604/// result[127:64] := ZeroExtend(__V[31:16])1605/// result[191:128] := ZeroExtend(__V[47:32])1606/// result[255:192] := ZeroExtend(__V[63:48])1607/// \endcode1608///1609/// \headerfile <immintrin.h>1610///1611/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.1612///1613/// \param __V1614/// A 128-bit vector of [8 x i16] containing the source values.1615/// \returns A 256-bit vector of [4 x i64] containing the zero-extended1616/// values.1617static __inline__ __m256i __DEFAULT_FN_ATTRS2561618_mm256_cvtepu16_epi64(__m128i __V)1619{1620return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);1621}16221623/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in1624/// \a __V and returns the 64-bit values in the corresponding elements of a1625/// 256-bit vector of [4 x i64].1626///1627/// \code{.operation}1628/// result[63:0] := ZeroExtend(__V[31:0])1629/// result[127:64] := ZeroExtend(__V[63:32])1630/// result[191:128] := ZeroExtend(__V[95:64])1631/// result[255:192] := ZeroExtend(__V[127:96])1632/// \endcode1633///1634/// \headerfile <immintrin.h>1635///1636/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.1637///1638/// \param __V1639/// A 128-bit vector of [4 x i32] containing the source values.1640/// \returns A 256-bit vector of [4 x i64] containing the zero-extended1641/// values.1642static __inline__ __m256i 
__DEFAULT_FN_ATTRS2561643_mm256_cvtepu32_epi64(__m128i __V)1644{1645return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);1646}16471648/// Multiplies signed 32-bit integers from even-numbered elements of two1649/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the1650/// [4 x i64] result.1651///1652/// \code{.operation}1653/// result[63:0] := __a[31:0] * __b[31:0]1654/// result[127:64] := __a[95:64] * __b[95:64]1655/// result[191:128] := __a[159:128] * __b[159:128]1656/// result[255:192] := __a[223:192] * __b[223:192]1657/// \endcode1658///1659/// \headerfile <immintrin.h>1660///1661/// This intrinsic corresponds to the \c VPMULDQ instruction.1662///1663/// \param __a1664/// A 256-bit vector of [8 x i32] containing one of the source operands.1665/// \param __b1666/// A 256-bit vector of [8 x i32] containing one of the source operands.1667/// \returns A 256-bit vector of [4 x i64] containing the products.1668static __inline__ __m256i __DEFAULT_FN_ATTRS2561669_mm256_mul_epi32(__m256i __a, __m256i __b)1670{1671return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);1672}16731674/// Multiplies signed 16-bit integer elements of two 256-bit vectors of1675/// [16 x i16], truncates the 32-bit results to the most significant 181676/// bits, rounds by adding 1, and returns bits [16:1] of each rounded1677/// product in the [16 x i16] result.1678///1679/// \code{.operation}1680/// FOR i := 0 TO 151681/// j := i*161682/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 11683/// result[j+15:j] := temp[16:1]/// ENDFOR1684/// \endcode1685///1686/// \headerfile <immintrin.h>1687///1688/// This intrinsic corresponds to the \c VPMULHRSW instruction.1689///1690/// \param __a1691/// A 256-bit vector of [16 x i16] containing one of the source operands.1692/// \param __b1693/// A 256-bit vector of [16 x i16] containing one of the source operands.1694/// \returns A 256-bit vector of [16 x i16] containing the rounded products.1695static __inline__ __m256i 
__DEFAULT_FN_ATTRS2561696_mm256_mulhrs_epi16(__m256i __a, __m256i __b)1697{1698return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);1699}17001701/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of1702/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the1703/// [16 x i16] result.1704///1705/// \headerfile <immintrin.h>1706///1707/// This intrinsic corresponds to the \c VPMULHUW instruction.1708///1709/// \param __a1710/// A 256-bit vector of [16 x i16] containing one of the source operands.1711/// \param __b1712/// A 256-bit vector of [16 x i16] containing one of the source operands.1713/// \returns A 256-bit vector of [16 x i16] containing the products.1714static __inline__ __m256i __DEFAULT_FN_ATTRS2561715_mm256_mulhi_epu16(__m256i __a, __m256i __b)1716{1717return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);1718}17191720/// Multiplies signed 16-bit integer elements of two 256-bit vectors of1721/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the1722/// [16 x i16] result.1723///1724/// \headerfile <immintrin.h>1725///1726/// This intrinsic corresponds to the \c VPMULHW instruction.1727///1728/// \param __a1729/// A 256-bit vector of [16 x i16] containing one of the source operands.1730/// \param __b1731/// A 256-bit vector of [16 x i16] containing one of the source operands.1732/// \returns A 256-bit vector of [16 x i16] containing the products.1733static __inline__ __m256i __DEFAULT_FN_ATTRS2561734_mm256_mulhi_epi16(__m256i __a, __m256i __b)1735{1736return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);1737}17381739/// Multiplies signed 16-bit integer elements of two 256-bit vectors of1740/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the1741/// [16 x i16] result.1742///1743/// \headerfile <immintrin.h>1744///1745/// This intrinsic corresponds to the \c VPMULLW instruction.1746///1747/// \param __a1748/// A 256-bit vector 
of [16 x i16] containing one of the source operands.1749/// \param __b1750/// A 256-bit vector of [16 x i16] containing one of the source operands.1751/// \returns A 256-bit vector of [16 x i16] containing the products.1752static __inline__ __m256i __DEFAULT_FN_ATTRS2561753_mm256_mullo_epi16(__m256i __a, __m256i __b)1754{1755return (__m256i)((__v16hu)__a * (__v16hu)__b);1756}17571758/// Multiplies signed 32-bit integer elements of two 256-bit vectors of1759/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the1760/// [8 x i32] result.1761///1762/// \headerfile <immintrin.h>1763///1764/// This intrinsic corresponds to the \c VPMULLD instruction.1765///1766/// \param __a1767/// A 256-bit vector of [8 x i32] containing one of the source operands.1768/// \param __b1769/// A 256-bit vector of [8 x i32] containing one of the source operands.1770/// \returns A 256-bit vector of [8 x i32] containing the products.1771static __inline__ __m256i __DEFAULT_FN_ATTRS2561772_mm256_mullo_epi32 (__m256i __a, __m256i __b)1773{1774return (__m256i)((__v8su)__a * (__v8su)__b);1775}17761777/// Multiplies unsigned 32-bit integers from even-numbered elements of two1778/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the1779/// [4 x i64] result.1780///1781/// \code{.operation}1782/// result[63:0] := __a[31:0] * __b[31:0]1783/// result[127:64] := __a[95:64] * __b[95:64]1784/// result[191:128] := __a[159:128] * __b[159:128]1785/// result[255:192] := __a[223:192] * __b[223:192]1786/// \endcode1787///1788/// \headerfile <immintrin.h>1789///1790/// This intrinsic corresponds to the \c VPMULUDQ instruction.1791///1792/// \param __a1793/// A 256-bit vector of [8 x i32] containing one of the source operands.1794/// \param __b1795/// A 256-bit vector of [8 x i32] containing one of the source operands.1796/// \returns A 256-bit vector of [4 x i64] containing the products.1797static __inline__ __m256i __DEFAULT_FN_ATTRS2561798_mm256_mul_epu32(__m256i __a, __m256i 
__b)1799{1800return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);1801}18021803/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and1804/// \a __b.1805///1806/// \headerfile <immintrin.h>1807///1808/// This intrinsic corresponds to the \c VPOR instruction.1809///1810/// \param __a1811/// A 256-bit integer vector.1812/// \param __b1813/// A 256-bit integer vector.1814/// \returns A 256-bit integer vector containing the result.1815static __inline__ __m256i __DEFAULT_FN_ATTRS2561816_mm256_or_si256(__m256i __a, __m256i __b)1817{1818return (__m256i)((__v4du)__a | (__v4du)__b);1819}18201821/// Computes four sum of absolute difference (SAD) operations on sets of eight1822/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and1823/// \a __b.1824///1825/// One SAD result is computed for each set of eight bytes from \a __a and1826/// eight bytes from \a __b. The zero-extended SAD value is returned in the1827/// corresponding 64-bit element of the result.1828///1829/// A single SAD operation takes the differences between the corresponding1830/// bytes of \a __a and \a __b, takes the absolute value of each difference,1831/// and sums these eight values to form one 16-bit result. 
This operation1832/// is repeated four times with successive sets of eight bytes.1833///1834/// \code{.operation}1835/// FOR i := 0 TO 31836/// j := i*641837/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])1838/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])1839/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])1840/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])1841/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])1842/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])1843/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])1844/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])1845/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +1846/// temp4 + temp5 + temp6 + temp71847/// result[j+63:j+16] := 01848/// ENDFOR1849/// \endcode1850///1851/// \headerfile <immintrin.h>1852///1853/// This intrinsic corresponds to the \c VPSADBW instruction.1854///1855/// \param __a1856/// A 256-bit integer vector.1857/// \param __b1858/// A 256-bit integer vector.1859/// \returns A 256-bit integer vector containing the result.1860static __inline__ __m256i __DEFAULT_FN_ATTRS2561861_mm256_sad_epu8(__m256i __a, __m256i __b)1862{1863return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);1864}18651866/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according1867/// to control information in the 256-bit integer vector \a __b, and1868/// returns the 256-bit result. 
In effect there are two separate 128-bit1869/// shuffles in the lower and upper halves.1870///1871/// \code{.operation}1872/// FOR i := 0 TO 311873/// j := i*81874/// IF __b[j+7] == 11875/// result[j+7:j] := 01876/// ELSE1877/// k := __b[j+3:j] * 81878/// IF i > 151879/// k := k + 1281880/// FI1881/// result[j+7:j] := __a[k+7:k]1882/// FI1883/// ENDFOR1884/// \endcode1885///1886/// \headerfile <immintrin.h>1887///1888/// This intrinsic corresponds to the \c VPSHUFB instruction.1889///1890/// \param __a1891/// A 256-bit integer vector containing source values.1892/// \param __b1893/// A 256-bit integer vector containing control information to determine1894/// what goes into the corresponding byte of the result. If bit 7 of the1895/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the1896/// control byte specify the index (within the same 128-bit half) of \a __a1897/// to copy to the result byte.1898/// \returns A 256-bit integer vector containing the result.1899static __inline__ __m256i __DEFAULT_FN_ATTRS2561900_mm256_shuffle_epi8(__m256i __a, __m256i __b)1901{1902return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);1903}19041905/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a1906/// according to control information in the integer literal \a imm, and1907/// returns the 256-bit result. 
In effect there are two parallel 128-bit1908/// shuffles in the lower and upper halves.1909///1910/// \code{.operation}1911/// FOR i := 0 to 31912/// j := i*321913/// k := (imm >> i*2)[1:0] * 321914/// result[j+31:j] := a[k+31:k]1915/// result[128+j+31:128+j] := a[128+k+31:128+k]1916/// ENDFOR1917/// \endcode1918///1919/// \headerfile <immintrin.h>1920///1921/// \code1922/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);1923/// \endcode1924///1925/// This intrinsic corresponds to the \c VPSHUFD instruction.1926///1927/// \param a1928/// A 256-bit vector of [8 x i32] containing source values.1929/// \param imm1930/// An immediate 8-bit value specifying which elements to copy from \a a.1931/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the1932/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so1933/// forth.1934/// \returns A 256-bit vector of [8 x i32] containing the result.1935#define _mm256_shuffle_epi32(a, imm) \1936((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))19371938/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a1939/// according to control information in the integer literal \a imm, and1940/// returns the 256-bit result. 
The upper 64 bits of each 128-bit half1941/// are shuffled in parallel; the lower 64 bits of each 128-bit half are1942/// copied from \a a unchanged.1943///1944/// \code{.operation}1945/// result[63:0] := a[63:0]1946/// result[191:128] := a[191:128]1947/// FOR i := 0 TO 31948/// j := i * 16 + 641949/// k := (imm >> i*2)[1:0] * 16 + 641950/// result[j+15:j] := a[k+15:k]1951/// result[128+j+15:128+j] := a[128+k+15:128+k]1952/// ENDFOR1953/// \endcode1954///1955/// \headerfile <immintrin.h>1956///1957/// \code1958/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);1959/// \endcode1960///1961/// This intrinsic corresponds to the \c VPSHUFHW instruction.1962///1963/// \param a1964/// A 256-bit vector of [16 x i16] containing source values.1965/// \param imm1966/// An immediate 8-bit value specifying which elements to copy from \a a.1967/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the1968/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so1969/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).1970/// \returns A 256-bit vector of [16 x i16] containing the result.1971#define _mm256_shufflehi_epi16(a, imm) \1972((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))19731974/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a1975/// according to control information in the integer literal \a imm, and1976/// returns the 256-bit [16 x i16] result. 
The lower 64 bits of each1977/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are1978/// copied from \a a unchanged.1979///1980/// \code{.operation}1981/// result[127:64] := a[127:64]1982/// result[255:192] := a[255:192]1983/// FOR i := 0 TO 31984/// j := i * 161985/// k := (imm >> i*2)[1:0] * 161986/// result[j+15:j] := a[k+15:k]1987/// result[128+j+15:128+j] := a[128+k+15:128+k]1988/// ENDFOR1989/// \endcode1990///1991/// \headerfile <immintrin.h>1992///1993/// \code1994/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);1995/// \endcode1996///1997/// This intrinsic corresponds to the \c VPSHUFLW instruction.1998///1999/// \param a2000/// A 256-bit vector of [16 x i16] to use as a source of data for the2001/// result.2002/// \param imm2003/// An immediate 8-bit value specifying which elements to copy from \a a.2004/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the2005/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so2006/// forth.2007/// \returns A 256-bit vector of [16 x i16] containing the result.2008#define _mm256_shufflelo_epi16(a, imm) \2009((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))20102011/// Sets each byte of the result to the corresponding byte of the 256-bit2012/// integer vector in \a __a, the negative of that byte, or zero, depending2013/// on whether the corresponding byte of the 256-bit integer vector in2014/// \a __b is greater than zero, less than zero, or equal to zero,2015/// respectively.2016///2017/// \headerfile <immintrin.h>2018///2019/// This intrinsic corresponds to the \c VPSIGNB instruction.2020///2021/// \param __a2022/// A 256-bit integer vector.2023/// \param __b2024/// A 256-bit integer vector.2025/// \returns A 256-bit integer vector containing the result.2026static __inline__ __m256i __DEFAULT_FN_ATTRS2562027_mm256_sign_epi8(__m256i __a, __m256i __b)2028{2029return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, 
(__v32qi)__b);2030}20312032/// Sets each element of the result to the corresponding element of the2033/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,2034/// or zero, depending on whether the corresponding element of the 256-bit2035/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or2036/// equal to zero, respectively.2037///2038/// \headerfile <immintrin.h>2039///2040/// This intrinsic corresponds to the \c VPSIGNW instruction.2041///2042/// \param __a2043/// A 256-bit vector of [16 x i16].2044/// \param __b2045/// A 256-bit vector of [16 x i16].2046/// \returns A 256-bit vector of [16 x i16] containing the result.2047static __inline__ __m256i __DEFAULT_FN_ATTRS2562048_mm256_sign_epi16(__m256i __a, __m256i __b)2049{2050return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);2051}20522053/// Sets each element of the result to the corresponding element of the2054/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or2055/// zero, depending on whether the corresponding element of the 256-bit2056/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or2057/// equal to zero, respectively.2058///2059/// \headerfile <immintrin.h>2060///2061/// This intrinsic corresponds to the \c VPSIGND instruction.2062///2063/// \param __a2064/// A 256-bit vector of [8 x i32].2065/// \param __b2066/// A 256-bit vector of [8 x i32].2067/// \returns A 256-bit vector of [8 x i32] containing the result.2068static __inline__ __m256i __DEFAULT_FN_ATTRS2562069_mm256_sign_epi32(__m256i __a, __m256i __b)2070{2071return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);2072}20732074/// Shifts each 128-bit half of the 256-bit integer vector \a a left by2075/// \a imm bytes, shifting in zero bytes, and returns the result. 
If \a imm2076/// is greater than 15, the returned result is all zeroes.2077///2078/// \headerfile <immintrin.h>2079///2080/// \code2081/// __m256i _mm256_slli_si256(__m256i a, const int imm);2082/// \endcode2083///2084/// This intrinsic corresponds to the \c VPSLLDQ instruction.2085///2086/// \param a2087/// A 256-bit integer vector to be shifted.2088/// \param imm2089/// An unsigned immediate value specifying the shift count (in bytes).2090/// \returns A 256-bit integer vector containing the result.2091#define _mm256_slli_si256(a, imm) \2092((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))20932094/// Shifts each 128-bit half of the 256-bit integer vector \a a left by2095/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm2096/// is greater than 15, the returned result is all zeroes.2097///2098/// \headerfile <immintrin.h>2099///2100/// \code2101/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);2102/// \endcode2103///2104/// This intrinsic corresponds to the \c VPSLLDQ instruction.2105///2106/// \param a2107/// A 256-bit integer vector to be shifted.2108/// \param imm2109/// An unsigned immediate value specifying the shift count (in bytes).2110/// \returns A 256-bit integer vector containing the result.2111#define _mm256_bslli_epi128(a, imm) \2112((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))21132114/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a2115/// left by \a __count bits, shifting in zero bits, and returns the result.2116/// If \a __count is greater than 15, the returned result is all zeroes.2117///2118/// \headerfile <immintrin.h>2119///2120/// This intrinsic corresponds to the \c VPSLLW instruction.2121///2122/// \param __a2123/// A 256-bit vector of [16 x i16] to be shifted.2124/// \param __count2125/// An unsigned integer value specifying the shift count (in bits).2126/// \returns A 256-bit vector of [16 x i16] containing the 
result.2127static __inline__ __m256i __DEFAULT_FN_ATTRS2562128_mm256_slli_epi16(__m256i __a, int __count)2129{2130return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);2131}21322133/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a2134/// left by the number of bits specified by the lower 64 bits of \a __count,2135/// shifting in zero bits, and returns the result. If \a __count is greater2136/// than 15, the returned result is all zeroes.2137///2138/// \headerfile <immintrin.h>2139///2140/// This intrinsic corresponds to the \c VPSLLW instruction.2141///2142/// \param __a2143/// A 256-bit vector of [16 x i16] to be shifted.2144/// \param __count2145/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2146/// shift count (in bits). The upper element is ignored.2147/// \returns A 256-bit vector of [16 x i16] containing the result.2148static __inline__ __m256i __DEFAULT_FN_ATTRS2562149_mm256_sll_epi16(__m256i __a, __m128i __count)2150{2151return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);2152}21532154/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a2155/// left by \a __count bits, shifting in zero bits, and returns the result.2156/// If \a __count is greater than 31, the returned result is all zeroes.2157///2158/// \headerfile <immintrin.h>2159///2160/// This intrinsic corresponds to the \c VPSLLD instruction.2161///2162/// \param __a2163/// A 256-bit vector of [8 x i32] to be shifted.2164/// \param __count2165/// An unsigned integer value specifying the shift count (in bits).2166/// \returns A 256-bit vector of [8 x i32] containing the result.2167static __inline__ __m256i __DEFAULT_FN_ATTRS2562168_mm256_slli_epi32(__m256i __a, int __count)2169{2170return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);2171}21722173/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a2174/// left by the number of bits given in the lower 64 bits of \a 
__count,2175/// shifting in zero bits, and returns the result. If \a __count is greater2176/// than 31, the returned result is all zeroes.2177///2178/// \headerfile <immintrin.h>2179///2180/// This intrinsic corresponds to the \c VPSLLD instruction.2181///2182/// \param __a2183/// A 256-bit vector of [8 x i32] to be shifted.2184/// \param __count2185/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2186/// shift count (in bits). The upper element is ignored.2187/// \returns A 256-bit vector of [8 x i32] containing the result.2188static __inline__ __m256i __DEFAULT_FN_ATTRS2562189_mm256_sll_epi32(__m256i __a, __m128i __count)2190{2191return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);2192}21932194/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a2195/// left by \a __count bits, shifting in zero bits, and returns the result.2196/// If \a __count is greater than 63, the returned result is all zeroes.2197///2198/// \headerfile <immintrin.h>2199///2200/// This intrinsic corresponds to the \c VPSLLQ instruction.2201///2202/// \param __a2203/// A 256-bit vector of [4 x i64] to be shifted.2204/// \param __count2205/// An unsigned integer value specifying the shift count (in bits).2206/// \returns A 256-bit vector of [4 x i64] containing the result.2207static __inline__ __m256i __DEFAULT_FN_ATTRS2562208_mm256_slli_epi64(__m256i __a, int __count)2209{2210return __builtin_ia32_psllqi256((__v4di)__a, __count);2211}22122213/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a2214/// left by the number of bits given in the lower 64 bits of \a __count,2215/// shifting in zero bits, and returns the result. 
If \a __count is greater2216/// than 63, the returned result is all zeroes.2217///2218/// \headerfile <immintrin.h>2219///2220/// This intrinsic corresponds to the \c VPSLLQ instruction.2221///2222/// \param __a2223/// A 256-bit vector of [4 x i64] to be shifted.2224/// \param __count2225/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2226/// shift count (in bits). The upper element is ignored.2227/// \returns A 256-bit vector of [4 x i64] containing the result.2228static __inline__ __m256i __DEFAULT_FN_ATTRS2562229_mm256_sll_epi64(__m256i __a, __m128i __count)2230{2231return __builtin_ia32_psllq256((__v4di)__a, __count);2232}22332234/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a2235/// right by \a __count bits, shifting in sign bits, and returns the result.2236/// If \a __count is greater than 15, each element of the result is either2237/// 0 or -1 according to the corresponding input sign bit.2238///2239/// \headerfile <immintrin.h>2240///2241/// This intrinsic corresponds to the \c VPSRAW instruction.2242///2243/// \param __a2244/// A 256-bit vector of [16 x i16] to be shifted.2245/// \param __count2246/// An unsigned integer value specifying the shift count (in bits).2247/// \returns A 256-bit vector of [16 x i16] containing the result.2248static __inline__ __m256i __DEFAULT_FN_ATTRS2562249_mm256_srai_epi16(__m256i __a, int __count)2250{2251return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);2252}22532254/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a2255/// right by the number of bits given in the lower 64 bits of \a __count,2256/// shifting in sign bits, and returns the result. 
If \a __count is greater2257/// than 15, each element of the result is either 0 or -1 according to the2258/// corresponding input sign bit.2259///2260/// \headerfile <immintrin.h>2261///2262/// This intrinsic corresponds to the \c VPSRAW instruction.2263///2264/// \param __a2265/// A 256-bit vector of [16 x i16] to be shifted.2266/// \param __count2267/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2268/// shift count (in bits). The upper element is ignored.2269/// \returns A 256-bit vector of [16 x i16] containing the result.2270static __inline__ __m256i __DEFAULT_FN_ATTRS2562271_mm256_sra_epi16(__m256i __a, __m128i __count)2272{2273return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);2274}22752276/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a2277/// right by \a __count bits, shifting in sign bits, and returns the result.2278/// If \a __count is greater than 31, each element of the result is either2279/// 0 or -1 according to the corresponding input sign bit.2280///2281/// \headerfile <immintrin.h>2282///2283/// This intrinsic corresponds to the \c VPSRAD instruction.2284///2285/// \param __a2286/// A 256-bit vector of [8 x i32] to be shifted.2287/// \param __count2288/// An unsigned integer value specifying the shift count (in bits).2289/// \returns A 256-bit vector of [8 x i32] containing the result.2290static __inline__ __m256i __DEFAULT_FN_ATTRS2562291_mm256_srai_epi32(__m256i __a, int __count)2292{2293return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);2294}22952296/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a2297/// right by the number of bits given in the lower 64 bits of \a __count,2298/// shifting in sign bits, and returns the result. 
If \a __count is greater2299/// than 31, each element of the result is either 0 or -1 according to the2300/// corresponding input sign bit.2301///2302/// \headerfile <immintrin.h>2303///2304/// This intrinsic corresponds to the \c VPSRAD instruction.2305///2306/// \param __a2307/// A 256-bit vector of [8 x i32] to be shifted.2308/// \param __count2309/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2310/// shift count (in bits). The upper element is ignored.2311/// \returns A 256-bit vector of [8 x i32] containing the result.2312static __inline__ __m256i __DEFAULT_FN_ATTRS2562313_mm256_sra_epi32(__m256i __a, __m128i __count)2314{2315return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);2316}23172318/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by2319/// \a imm bytes, shifting in zero bytes, and returns the result. If2320/// \a imm is greater than 15, the returned result is all zeroes.2321///2322/// \headerfile <immintrin.h>2323///2324/// \code2325/// __m256i _mm256_srli_si256(__m256i a, const int imm);2326/// \endcode2327///2328/// This intrinsic corresponds to the \c VPSRLDQ instruction.2329///2330/// \param a2331/// A 256-bit integer vector to be shifted.2332/// \param imm2333/// An unsigned immediate value specifying the shift count (in bytes).2334/// \returns A 256-bit integer vector containing the result.2335#define _mm256_srli_si256(a, imm) \2336((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))23372338/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by2339/// \a imm bytes, shifting in zero bytes, and returns the result. 
If2340/// \a imm is greater than 15, the returned result is all zeroes.2341///2342/// \headerfile <immintrin.h>2343///2344/// \code2345/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);2346/// \endcode2347///2348/// This intrinsic corresponds to the \c VPSRLDQ instruction.2349///2350/// \param a2351/// A 256-bit integer vector to be shifted.2352/// \param imm2353/// An unsigned immediate value specifying the shift count (in bytes).2354/// \returns A 256-bit integer vector containing the result.2355#define _mm256_bsrli_epi128(a, imm) \2356((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))23572358/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a2359/// right by \a __count bits, shifting in zero bits, and returns the result.2360/// If \a __count is greater than 15, the returned result is all zeroes.2361///2362/// \headerfile <immintrin.h>2363///2364/// This intrinsic corresponds to the \c VPSRLW instruction.2365///2366/// \param __a2367/// A 256-bit vector of [16 x i16] to be shifted.2368/// \param __count2369/// An unsigned integer value specifying the shift count (in bits).2370/// \returns A 256-bit vector of [16 x i16] containing the result.2371static __inline__ __m256i __DEFAULT_FN_ATTRS2562372_mm256_srli_epi16(__m256i __a, int __count)2373{2374return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);2375}23762377/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a2378/// right by the number of bits given in the lower 64 bits of \a __count,2379/// shifting in zero bits, and returns the result. 
If \a __count is greater2380/// than 15, the returned result is all zeroes.2381///2382/// \headerfile <immintrin.h>2383///2384/// This intrinsic corresponds to the \c VPSRLW instruction.2385///2386/// \param __a2387/// A 256-bit vector of [16 x i16] to be shifted.2388/// \param __count2389/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2390/// shift count (in bits). The upper element is ignored.2391/// \returns A 256-bit vector of [16 x i16] containing the result.2392static __inline__ __m256i __DEFAULT_FN_ATTRS2562393_mm256_srl_epi16(__m256i __a, __m128i __count)2394{2395return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);2396}23972398/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a2399/// right by \a __count bits, shifting in zero bits, and returns the result.2400/// If \a __count is greater than 31, the returned result is all zeroes.2401///2402/// \headerfile <immintrin.h>2403///2404/// This intrinsic corresponds to the \c VPSRLD instruction.2405///2406/// \param __a2407/// A 256-bit vector of [8 x i32] to be shifted.2408/// \param __count2409/// An unsigned integer value specifying the shift count (in bits).2410/// \returns A 256-bit vector of [8 x i32] containing the result.2411static __inline__ __m256i __DEFAULT_FN_ATTRS2562412_mm256_srli_epi32(__m256i __a, int __count)2413{2414return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);2415}24162417/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a2418/// right by the number of bits given in the lower 64 bits of \a __count,2419/// shifting in zero bits, and returns the result. 
If \a __count is greater2420/// than 31, the returned result is all zeroes.2421///2422/// \headerfile <immintrin.h>2423///2424/// This intrinsic corresponds to the \c VPSRLD instruction.2425///2426/// \param __a2427/// A 256-bit vector of [8 x i32] to be shifted.2428/// \param __count2429/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2430/// shift count (in bits). The upper element is ignored.2431/// \returns A 256-bit vector of [8 x i32] containing the result.2432static __inline__ __m256i __DEFAULT_FN_ATTRS2562433_mm256_srl_epi32(__m256i __a, __m128i __count)2434{2435return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);2436}24372438/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a2439/// right by \a __count bits, shifting in zero bits, and returns the result.2440/// If \a __count is greater than 63, the returned result is all zeroes.2441///2442/// \headerfile <immintrin.h>2443///2444/// This intrinsic corresponds to the \c VPSRLQ instruction.2445///2446/// \param __a2447/// A 256-bit vector of [4 x i64] to be shifted.2448/// \param __count2449/// An unsigned integer value specifying the shift count (in bits).2450/// \returns A 256-bit vector of [4 x i64] containing the result.2451static __inline__ __m256i __DEFAULT_FN_ATTRS2562452_mm256_srli_epi64(__m256i __a, int __count)2453{2454return __builtin_ia32_psrlqi256((__v4di)__a, __count);2455}24562457/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a2458/// right by the number of bits given in the lower 64 bits of \a __count,2459/// shifting in zero bits, and returns the result. 
If \a __count is greater2460/// than 63, the returned result is all zeroes.2461///2462/// \headerfile <immintrin.h>2463///2464/// This intrinsic corresponds to the \c VPSRLQ instruction.2465///2466/// \param __a2467/// A 256-bit vector of [4 x i64] to be shifted.2468/// \param __count2469/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned2470/// shift count (in bits). The upper element is ignored.2471/// \returns A 256-bit vector of [4 x i64] containing the result.2472static __inline__ __m256i __DEFAULT_FN_ATTRS2562473_mm256_srl_epi64(__m256i __a, __m128i __count)2474{2475return __builtin_ia32_psrlq256((__v4di)__a, __count);2476}24772478/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer2479/// vectors. Returns the lower 8 bits of each difference in the2480/// corresponding byte of the 256-bit integer vector result (overflow is2481/// ignored).2482///2483/// \code{.operation}2484/// FOR i := 0 TO 312485/// j := i*82486/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]2487/// ENDFOR2488/// \endcode2489///2490/// \headerfile <immintrin.h>2491///2492/// This intrinsic corresponds to the \c VPSUBB instruction.2493///2494/// \param __a2495/// A 256-bit integer vector containing the minuends.2496/// \param __b2497/// A 256-bit integer vector containing the subtrahends.2498/// \returns A 256-bit integer vector containing the differences.2499static __inline__ __m256i __DEFAULT_FN_ATTRS2562500_mm256_sub_epi8(__m256i __a, __m256i __b)2501{2502return (__m256i)((__v32qu)__a - (__v32qu)__b);2503}25042505/// Subtracts 16-bit integers from corresponding elements of two 256-bit2506/// vectors of [16 x i16]. 
Returns the lower 16 bits of each difference in2507/// the corresponding element of the [16 x i16] result (overflow is2508/// ignored).2509///2510/// \code{.operation}2511/// FOR i := 0 TO 152512/// j := i*162513/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]2514/// ENDFOR2515/// \endcode2516///2517/// \headerfile <immintrin.h>2518///2519/// This intrinsic corresponds to the \c VPSUBW instruction.2520///2521/// \param __a2522/// A 256-bit vector of [16 x i16] containing the minuends.2523/// \param __b2524/// A 256-bit vector of [16 x i16] containing the subtrahends.2525/// \returns A 256-bit vector of [16 x i16] containing the differences.2526static __inline__ __m256i __DEFAULT_FN_ATTRS2562527_mm256_sub_epi16(__m256i __a, __m256i __b)2528{2529return (__m256i)((__v16hu)__a - (__v16hu)__b);2530}25312532/// Subtracts 32-bit integers from corresponding elements of two 256-bit2533/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in2534/// the corresponding element of the [8 x i32] result (overflow is ignored).2535///2536/// \code{.operation}2537/// FOR i := 0 TO 72538/// j := i*322539/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]2540/// ENDFOR2541/// \endcode2542///2543/// \headerfile <immintrin.h>2544///2545/// This intrinsic corresponds to the \c VPSUBD instruction.2546///2547/// \param __a2548/// A 256-bit vector of [8 x i32] containing the minuends.2549/// \param __b2550/// A 256-bit vector of [8 x i32] containing the subtrahends.2551/// \returns A 256-bit vector of [8 x i32] containing the differences.2552static __inline__ __m256i __DEFAULT_FN_ATTRS2562553_mm256_sub_epi32(__m256i __a, __m256i __b)2554{2555return (__m256i)((__v8su)__a - (__v8su)__b);2556}25572558/// Subtracts 64-bit integers from corresponding elements of two 256-bit2559/// vectors of [4 x i64]. 
Returns the lower 64 bits of each difference in2560/// the corresponding element of the [4 x i64] result (overflow is ignored).2561///2562/// \code{.operation}2563/// FOR i := 0 TO 32564/// j := i*642565/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]2566/// ENDFOR2567/// \endcode2568///2569/// \headerfile <immintrin.h>2570///2571/// This intrinsic corresponds to the \c VPSUBQ instruction.2572///2573/// \param __a2574/// A 256-bit vector of [4 x i64] containing the minuends.2575/// \param __b2576/// A 256-bit vector of [4 x i64] containing the subtrahends.2577/// \returns A 256-bit vector of [4 x i64] containing the differences.2578static __inline__ __m256i __DEFAULT_FN_ATTRS2562579_mm256_sub_epi64(__m256i __a, __m256i __b)2580{2581return (__m256i)((__v4du)__a - (__v4du)__b);2582}25832584/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer2585/// vectors using signed saturation, and returns each difference in the2586/// corresponding byte of the 256-bit integer vector result.2587///2588/// \code{.operation}2589/// FOR i := 0 TO 312590/// j := i*82591/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])2592/// ENDFOR2593/// \endcode2594///2595/// \headerfile <immintrin.h>2596///2597/// This intrinsic corresponds to the \c VPSUBSB instruction.2598///2599/// \param __a2600/// A 256-bit integer vector containing the minuends.2601/// \param __b2602/// A 256-bit integer vector containing the subtrahends.2603/// \returns A 256-bit integer vector containing the differences.2604static __inline__ __m256i __DEFAULT_FN_ATTRS2562605_mm256_subs_epi8(__m256i __a, __m256i __b)2606{2607return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);2608}26092610/// Subtracts 16-bit integers from corresponding elements of two 256-bit2611/// vectors of [16 x i16] using signed saturation, and returns each2612/// difference in the corresponding element of the [16 x i16] result.2613///2614/// \code{.operation}2615/// FOR i := 0 TO 152616/// j := 
i*162617/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])2618/// ENDFOR2619/// \endcode2620///2621/// \headerfile <immintrin.h>2622///2623/// This intrinsic corresponds to the \c VPSUBSW instruction.2624///2625/// \param __a2626/// A 256-bit vector of [16 x i16] containing the minuends.2627/// \param __b2628/// A 256-bit vector of [16 x i16] containing the subtrahends.2629/// \returns A 256-bit vector of [16 x i16] containing the differences.2630static __inline__ __m256i __DEFAULT_FN_ATTRS2562631_mm256_subs_epi16(__m256i __a, __m256i __b)2632{2633return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);2634}26352636/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer2637/// vectors using unsigned saturation, and returns each difference in the2638/// corresponding byte of the 256-bit integer vector result. For each byte,2639/// computes <c> result = __a - __b </c>.2640///2641/// \code{.operation}2642/// FOR i := 0 TO 312643/// j := i*82644/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])2645/// ENDFOR2646/// \endcode2647///2648/// \headerfile <immintrin.h>2649///2650/// This intrinsic corresponds to the \c VPSUBUSB instruction.2651///2652/// \param __a2653/// A 256-bit integer vector containing the minuends.2654/// \param __b2655/// A 256-bit integer vector containing the subtrahends.2656/// \returns A 256-bit integer vector containing the differences.2657static __inline__ __m256i __DEFAULT_FN_ATTRS2562658_mm256_subs_epu8(__m256i __a, __m256i __b)2659{2660return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);2661}26622663/// Subtracts 16-bit integers from corresponding elements of two 256-bit2664/// vectors of [16 x i16] using unsigned saturation, and returns each2665/// difference in the corresponding element of the [16 x i16] result.2666///2667/// \code{.operation}2668/// FOR i := 0 TO 152669/// j := i*162670/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])2671/// ENDFOR2672/// 
\endcode2673///2674/// \headerfile <immintrin.h>2675///2676/// This intrinsic corresponds to the \c VPSUBUSW instruction.2677///2678/// \param __a2679/// A 256-bit vector of [16 x i16] containing the minuends.2680/// \param __b2681/// A 256-bit vector of [16 x i16] containing the subtrahends.2682/// \returns A 256-bit vector of [16 x i16] containing the differences.2683static __inline__ __m256i __DEFAULT_FN_ATTRS2562684_mm256_subs_epu16(__m256i __a, __m256i __b)2685{2686return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);2687}26882689/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer2690/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,2691/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as2692/// input; other bits in these parameters are ignored.2693///2694/// \code{.operation}2695/// result[7:0] := __a[71:64]2696/// result[15:8] := __b[71:64]2697/// result[23:16] := __a[79:72]2698/// result[31:24] := __b[79:72]2699/// . . .2700/// result[127:120] := __b[127:120]2701/// result[135:128] := __a[199:192]2702/// . . 
.2703/// result[255:248] := __b[255:248]2704/// \endcode2705///2706/// \headerfile <immintrin.h>2707///2708/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.2709///2710/// \param __a2711/// A 256-bit integer vector used as the source for the even-numbered bytes2712/// of the result.2713/// \param __b2714/// A 256-bit integer vector used as the source for the odd-numbered bytes2715/// of the result.2716/// \returns A 256-bit integer vector containing the result.2717static __inline__ __m256i __DEFAULT_FN_ATTRS2562718_mm256_unpackhi_epi8(__m256i __a, __m256i __b)2719{2720return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);2721}27222723/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors2724/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit2725/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each2726/// 128-bit half of \a __a and \a __b as input; other bits in these2727/// parameters are ignored.2728///2729/// \code{.operation}2730/// result[15:0] := __a[79:64]2731/// result[31:16] := __b[79:64]2732/// result[47:32] := __a[95:80]2733/// result[63:48] := __b[95:80]2734/// . . .2735/// result[127:112] := __b[127:112]2736/// result[143:128] := __a[211:196]2737/// . . 
.2738/// result[255:240] := __b[255:240]2739/// \endcode2740///2741/// \headerfile <immintrin.h>2742///2743/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.2744///2745/// \param __a2746/// A 256-bit vector of [16 x i16] used as the source for the even-numbered2747/// elements of the result.2748/// \param __b2749/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered2750/// elements of the result.2751/// \returns A 256-bit vector of [16 x i16] containing the result.2752static __inline__ __m256i __DEFAULT_FN_ATTRS2562753_mm256_unpackhi_epi16(__m256i __a, __m256i __b)2754{2755return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);2756}27572758/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors2759/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector2760/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half2761/// of \a __a and \a __b as input; other bits in these parameters are2762/// ignored.2763///2764/// \code{.operation}2765/// result[31:0] := __a[95:64]2766/// result[63:32] := __b[95:64]2767/// result[95:64] := __a[127:96]2768/// result[127:96] := __b[127:96]2769/// result[159:128] := __a[223:192]2770/// result[191:160] := __b[223:192]2771/// result[223:192] := __a[255:224]2772/// result[255:224] := __b[255:224]2773/// \endcode2774///2775/// \headerfile <immintrin.h>2776///2777/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.2778///2779/// \param __a2780/// A 256-bit vector of [8 x i32] used as the source for the even-numbered2781/// elements of the result.2782/// \param __b2783/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered2784/// elements of the result.2785/// \returns A 256-bit vector of [8 x i32] containing the result.2786static __inline__ __m256i __DEFAULT_FN_ATTRS2562787_mm256_unpackhi_epi32(__m256i __a, __m256i 
__b)2788{2789return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);2790}27912792/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors2793/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector2794/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half2795/// of \a __a and \a __b as input; other bits in these parameters are2796/// ignored.2797///2798/// \code{.operation}2799/// result[63:0] := __a[127:64]2800/// result[127:64] := __b[127:64]2801/// result[191:128] := __a[255:192]2802/// result[255:192] := __b[255:192]2803/// \endcode2804///2805/// \headerfile <immintrin.h>2806///2807/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.2808///2809/// \param __a2810/// A 256-bit vector of [4 x i64] used as the source for the even-numbered2811/// elements of the result.2812/// \param __b2813/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered2814/// elements of the result.2815/// \returns A 256-bit vector of [4 x i64] containing the result.2816static __inline__ __m256i __DEFAULT_FN_ATTRS2562817_mm256_unpackhi_epi64(__m256i __a, __m256i __b)2818{2819return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);2820}28212822/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer2823/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,2824/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as2825/// input; other bits in these parameters are ignored.2826///2827/// \code{.operation}2828/// result[7:0] := __a[7:0]2829/// result[15:8] := __b[7:0]2830/// result[23:16] := __a[15:8]2831/// result[31:24] := __b[15:8]2832/// . . .2833/// result[127:120] := __b[63:56]2834/// result[135:128] := __a[135:128]2835/// . . 
.2836/// result[255:248] := __b[191:184]2837/// \endcode2838///2839/// \headerfile <immintrin.h>2840///2841/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.2842///2843/// \param __a2844/// A 256-bit integer vector used as the source for the even-numbered bytes2845/// of the result.2846/// \param __b2847/// A 256-bit integer vector used as the source for the odd-numbered bytes2848/// of the result.2849/// \returns A 256-bit integer vector containing the result.2850static __inline__ __m256i __DEFAULT_FN_ATTRS2562851_mm256_unpacklo_epi8(__m256i __a, __m256i __b)2852{2853return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);2854}28552856/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors2857/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit2858/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each2859/// 128-bit half of \a __a and \a __b as input; other bits in these2860/// parameters are ignored.2861///2862/// \code{.operation}2863/// result[15:0] := __a[15:0]2864/// result[31:16] := __b[15:0]2865/// result[47:32] := __a[31:16]2866/// result[63:48] := __b[31:16]2867/// . . .2868/// result[127:112] := __b[63:48]2869/// result[143:128] := __a[143:128]2870/// . . 
.2871/// result[255:239] := __b[191:176]2872/// \endcode2873///2874/// \headerfile <immintrin.h>2875///2876/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.2877///2878/// \param __a2879/// A 256-bit vector of [16 x i16] used as the source for the even-numbered2880/// elements of the result.2881/// \param __b2882/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered2883/// elements of the result.2884/// \returns A 256-bit vector of [16 x i16] containing the result.2885static __inline__ __m256i __DEFAULT_FN_ATTRS2562886_mm256_unpacklo_epi16(__m256i __a, __m256i __b)2887{2888return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);2889}28902891/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors2892/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector2893/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half2894/// of \a __a and \a __b as input; other bits in these parameters are2895/// ignored.2896///2897/// \code{.operation}2898/// result[31:0] := __a[31:0]2899/// result[63:32] := __b[31:0]2900/// result[95:64] := __a[63:32]2901/// result[127:96] := __b[63:32]2902/// result[159:128] := __a[159:128]2903/// result[191:160] := __b[159:128]2904/// result[223:192] := __a[191:160]2905/// result[255:224] := __b[191:190]2906/// \endcode2907///2908/// \headerfile <immintrin.h>2909///2910/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.2911///2912/// \param __a2913/// A 256-bit vector of [8 x i32] used as the source for the even-numbered2914/// elements of the result.2915/// \param __b2916/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered2917/// elements of the result.2918/// \returns A 256-bit vector of [8 x i32] containing the result.2919static __inline__ __m256i __DEFAULT_FN_ATTRS2562920_mm256_unpacklo_epi32(__m256i __a, __m256i __b)2921{2922return 
(__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);2923}29242925/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors2926/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector2927/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half2928/// of \a __a and \a __b as input; other bits in these parameters are2929/// ignored.2930///2931/// \code{.operation}2932/// result[63:0] := __a[63:0]2933/// result[127:64] := __b[63:0]2934/// result[191:128] := __a[191:128]2935/// result[255:192] := __b[191:128]2936/// \endcode2937///2938/// \headerfile <immintrin.h>2939///2940/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.2941///2942/// \param __a2943/// A 256-bit vector of [4 x i64] used as the source for the even-numbered2944/// elements of the result.2945/// \param __b2946/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered2947/// elements of the result.2948/// \returns A 256-bit vector of [4 x i64] containing the result.2949static __inline__ __m256i __DEFAULT_FN_ATTRS2562950_mm256_unpacklo_epi64(__m256i __a, __m256i __b)2951{2952return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);2953}29542955/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and2956/// \a __b.2957///2958/// \headerfile <immintrin.h>2959///2960/// This intrinsic corresponds to the \c VPXOR instruction.2961///2962/// \param __a2963/// A 256-bit integer vector.2964/// \param __b2965/// A 256-bit integer vector.2966/// \returns A 256-bit integer vector containing the result.2967static __inline__ __m256i __DEFAULT_FN_ATTRS2562968_mm256_xor_si256(__m256i __a, __m256i __b)2969{2970return (__m256i)((__v4du)__a ^ (__v4du)__b);2971}29722973/// Loads the 256-bit integer vector from memory \a __V using a non-temporal2974/// memory hint and returns the vector. 
\a __V must be aligned on a 32-byte2975/// boundary.2976///2977/// \headerfile <immintrin.h>2978///2979/// This intrinsic corresponds to the \c VMOVNTDQA instruction.2980///2981/// \param __V2982/// A pointer to the 32-byte aligned memory containing the vector to load.2983/// \returns A 256-bit integer vector loaded from memory.2984static __inline__ __m256i __DEFAULT_FN_ATTRS2562985_mm256_stream_load_si256(const void *__V)2986{2987typedef __v4di __v4di_aligned __attribute__((aligned(32)));2988return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);2989}29902991/// Broadcasts the 32-bit floating-point value from the low element of the2992/// 128-bit vector of [4 x float] in \a __X to all elements of the result's2993/// 128-bit vector of [4 x float].2994///2995/// \headerfile <immintrin.h>2996///2997/// This intrinsic corresponds to the \c VBROADCASTSS instruction.2998///2999/// \param __X3000/// A 128-bit vector of [4 x float] whose low element will be broadcast.3001/// \returns A 128-bit vector of [4 x float] containing the result.3002static __inline__ __m128 __DEFAULT_FN_ATTRS1283003_mm_broadcastss_ps(__m128 __X)3004{3005return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);3006}30073008/// Broadcasts the 64-bit floating-point value from the low element of the3009/// 128-bit vector of [2 x double] in \a __a to both elements of the3010/// result's 128-bit vector of [2 x double].3011///3012/// \headerfile <immintrin.h>3013///3014/// This intrinsic corresponds to the \c MOVDDUP instruction.3015///3016/// \param __a3017/// A 128-bit vector of [2 x double] whose low element will be broadcast.3018/// \returns A 128-bit vector of [2 x double] containing the result.3019static __inline__ __m128d __DEFAULT_FN_ATTRS1283020_mm_broadcastsd_pd(__m128d __a)3021{3022return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);3023}30243025/// Broadcasts the 32-bit floating-point value from the low element of the3026/// 128-bit vector of [4 
x float] in \a __X to all elements of the3027/// result's 256-bit vector of [8 x float].3028///3029/// \headerfile <immintrin.h>3030///3031/// This intrinsic corresponds to the \c VBROADCASTSS instruction.3032///3033/// \param __X3034/// A 128-bit vector of [4 x float] whose low element will be broadcast.3035/// \returns A 256-bit vector of [8 x float] containing the result.3036static __inline__ __m256 __DEFAULT_FN_ATTRS2563037_mm256_broadcastss_ps(__m128 __X)3038{3039return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);3040}30413042/// Broadcasts the 64-bit floating-point value from the low element of the3043/// 128-bit vector of [2 x double] in \a __X to all elements of the3044/// result's 256-bit vector of [4 x double].3045///3046/// \headerfile <immintrin.h>3047///3048/// This intrinsic corresponds to the \c VBROADCASTSD instruction.3049///3050/// \param __X3051/// A 128-bit vector of [2 x double] whose low element will be broadcast.3052/// \returns A 256-bit vector of [4 x double] containing the result.3053static __inline__ __m256d __DEFAULT_FN_ATTRS2563054_mm256_broadcastsd_pd(__m128d __X)3055{3056return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);3057}30583059/// Broadcasts the 128-bit integer data from \a __X to both the lower and3060/// upper halves of the 256-bit result.3061///3062/// \headerfile <immintrin.h>3063///3064/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.3065///3066/// \param __X3067/// A 128-bit integer vector to be broadcast.3068/// \returns A 256-bit integer vector containing the result.3069static __inline__ __m256i __DEFAULT_FN_ATTRS2563070_mm256_broadcastsi128_si256(__m128i __X)3071{3072return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);3073}30743075#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)30763077/// Merges 32-bit integer elements from either of the two 128-bit vectors of3078/// [4 x i32] in \a V1 or \a 
V2 to the result's 128-bit vector of [4 x i32],3079/// as specified by the immediate integer operand \a M.3080///3081/// \code{.operation}3082/// FOR i := 0 TO 33083/// j := i*323084/// IF M[i] == 03085/// result[31+j:j] := V1[31+j:j]3086/// ELSE3087/// result[31+j:j] := V2[32+j:j]3088/// FI3089/// ENDFOR3090/// \endcode3091///3092/// \headerfile <immintrin.h>3093///3094/// \code3095/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);3096/// \endcode3097///3098/// This intrinsic corresponds to the \c VPBLENDDD instruction.3099///3100/// \param V13101/// A 128-bit vector of [4 x i32] containing source values.3102/// \param V23103/// A 128-bit vector of [4 x i32] containing source values.3104/// \param M3105/// An immediate 8-bit integer operand, with bits [3:0] specifying the3106/// source for each element of the result. The position of the mask bit3107/// corresponds to the index of a copied value. When a mask bit is 0, the3108/// element is copied from \a V1; otherwise, it is copied from \a V2.3109/// \returns A 128-bit vector of [4 x i32] containing the result.3110#define _mm_blend_epi32(V1, V2, M) \3111((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \3112(__v4si)(__m128i)(V2), (int)(M)))31133114/// Merges 32-bit integer elements from either of the two 256-bit vectors of3115/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],3116/// as specified by the immediate integer operand \a M.3117///3118/// \code{.operation}3119/// FOR i := 0 TO 73120/// j := i*323121/// IF M[i] == 03122/// result[31+j:j] := V1[31+j:j]3123/// ELSE3124/// result[31+j:j] := V2[32+j:j]3125/// FI3126/// ENDFOR3127/// \endcode3128///3129/// \headerfile <immintrin.h>3130///3131/// \code3132/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);3133/// \endcode3134///3135/// This intrinsic corresponds to the \c VPBLENDDD instruction.3136///3137/// \param V13138/// A 256-bit vector of [8 x i32] containing source values.3139/// \param 
V23140/// A 256-bit vector of [8 x i32] containing source values.3141/// \param M3142/// An immediate 8-bit integer operand, with bits [7:0] specifying the3143/// source for each element of the result. The position of the mask bit3144/// corresponds to the index of a copied value. When a mask bit is 0, the3145/// element is copied from \a V1; otherwise, it is is copied from \a V2.3146/// \returns A 256-bit vector of [8 x i32] containing the result.3147#define _mm256_blend_epi32(V1, V2, M) \3148((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \3149(__v8si)(__m256i)(V2), (int)(M)))31503151/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all3152/// bytes of the 256-bit result.3153///3154/// \headerfile <immintrin.h>3155///3156/// This intrinsic corresponds to the \c VPBROADCASTB instruction.3157///3158/// \param __X3159/// A 128-bit integer vector whose low byte will be broadcast.3160/// \returns A 256-bit integer vector containing the result.3161static __inline__ __m256i __DEFAULT_FN_ATTRS2563162_mm256_broadcastb_epi8(__m128i __X)3163{3164return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);3165}31663167/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X3168/// to all elements of the result's 256-bit vector of [16 x i16].3169///3170/// \headerfile <immintrin.h>3171///3172/// This intrinsic corresponds to the \c VPBROADCASTW instruction.3173///3174/// \param __X3175/// A 128-bit vector of [8 x i16] whose low element will be broadcast.3176/// \returns A 256-bit vector of [16 x i16] containing the result.3177static __inline__ __m256i __DEFAULT_FN_ATTRS2563178_mm256_broadcastw_epi16(__m128i __X)3179{3180return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);3181}31823183/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a 
__X3184/// to all elements of the result's 256-bit vector of [8 x i32].3185///3186/// \headerfile <immintrin.h>3187///3188/// This intrinsic corresponds to the \c VPBROADCASTD instruction.3189///3190/// \param __X3191/// A 128-bit vector of [4 x i32] whose low element will be broadcast.3192/// \returns A 256-bit vector of [8 x i32] containing the result.3193static __inline__ __m256i __DEFAULT_FN_ATTRS2563194_mm256_broadcastd_epi32(__m128i __X)3195{3196return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);3197}31983199/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X3200/// to all elements of the result's 256-bit vector of [4 x i64].3201///3202/// \headerfile <immintrin.h>3203///3204/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.3205///3206/// \param __X3207/// A 128-bit vector of [2 x i64] whose low element will be broadcast.3208/// \returns A 256-bit vector of [4 x i64] containing the result.3209static __inline__ __m256i __DEFAULT_FN_ATTRS2563210_mm256_broadcastq_epi64(__m128i __X)3211{3212return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);3213}32143215/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all3216/// bytes of the 128-bit result.3217///3218/// \headerfile <immintrin.h>3219///3220/// This intrinsic corresponds to the \c VPBROADCASTB instruction.3221///3222/// \param __X3223/// A 128-bit integer vector whose low byte will be broadcast.3224/// \returns A 128-bit integer vector containing the result.3225static __inline__ __m128i __DEFAULT_FN_ATTRS1283226_mm_broadcastb_epi8(__m128i __X)3227{3228return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);3229}32303231/// Broadcasts the low element from the 128-bit vector of [8 x i16] in3232/// \a __X to all elements of the result's 128-bit vector of [8 x i16].3233///3234/// \headerfile <immintrin.h>3235///3236/// This 
intrinsic corresponds to the \c VPBROADCASTW instruction.3237///3238/// \param __X3239/// A 128-bit vector of [8 x i16] whose low element will be broadcast.3240/// \returns A 128-bit vector of [8 x i16] containing the result.3241static __inline__ __m128i __DEFAULT_FN_ATTRS1283242_mm_broadcastw_epi16(__m128i __X)3243{3244return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);3245}32463247/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X3248/// to all elements of the result's vector of [4 x i32].3249///3250/// \headerfile <immintrin.h>3251///3252/// This intrinsic corresponds to the \c VPBROADCASTD instruction.3253///3254/// \param __X3255/// A 128-bit vector of [4 x i32] whose low element will be broadcast.3256/// \returns A 128-bit vector of [4 x i32] containing the result.3257static __inline__ __m128i __DEFAULT_FN_ATTRS1283258_mm_broadcastd_epi32(__m128i __X)3259{3260return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);3261}32623263/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X3264/// to both elements of the result's 128-bit vector of [2 x i64].3265///3266/// \headerfile <immintrin.h>3267///3268/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.3269///3270/// \param __X3271/// A 128-bit vector of [2 x i64] whose low element will be broadcast.3272/// \returns A 128-bit vector of [2 x i64] containing the result.3273static __inline__ __m128i __DEFAULT_FN_ATTRS1283274_mm_broadcastq_epi64(__m128i __X)3275{3276return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);3277}32783279/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the3280/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the3281/// elements of the 256-bit vector of [8 x i32] in \a __b.3282///3283/// \code{.operation}3284/// FOR i := 0 TO 73285/// j := i*323286/// k := __b[j+2:j] * 323287/// result[j+31:j] := 
__a[k+31:k]3288/// ENDFOR3289/// \endcode3290///3291/// \headerfile <immintrin.h>3292///3293/// This intrinsic corresponds to the \c VPERMD instruction.3294///3295/// \param __a3296/// A 256-bit vector of [8 x i32] containing the source values.3297/// \param __b3298/// A 256-bit vector of [8 x i32] containing indexes of values to use from3299/// \a __a.3300/// \returns A 256-bit vector of [8 x i32] containing the result.3301static __inline__ __m256i __DEFAULT_FN_ATTRS2563302_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)3303{3304return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);3305}33063307/// Sets the result's 256-bit vector of [4 x double] to copies of elements of3308/// the 256-bit vector of [4 x double] in \a V as specified by the3309/// immediate value \a M.3310///3311/// \code{.operation}3312/// FOR i := 0 TO 33313/// j := i*643314/// k := (M >> i*2)[1:0] * 643315/// result[j+63:j] := V[k+63:k]3316/// ENDFOR3317/// \endcode3318///3319/// \headerfile <immintrin.h>3320///3321/// \code3322/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);3323/// \endcode3324///3325/// This intrinsic corresponds to the \c VPERMPD instruction.3326///3327/// \param V3328/// A 256-bit vector of [4 x double] containing the source values.3329/// \param M3330/// An immediate 8-bit value specifying which elements to copy from \a V.3331/// \a M[1:0] specifies the index in \a a for element 0 of the result,3332/// \a M[3:2] specifies the index for element 1, and so forth.3333/// \returns A 256-bit vector of [4 x double] containing the result.3334#define _mm256_permute4x64_pd(V, M) \3335((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))33363337/// Sets the result's 256-bit vector of [8 x float] to copies of elements of3338/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in3339/// the elements of the 256-bit vector of [8 x i32] in \a __b.3340///3341/// \code{.operation}3342/// FOR i := 0 TO 73343/// j := i*323344/// k 
:= __b[j+2:j] * 323345/// result[j+31:j] := __a[k+31:k]3346/// ENDFOR3347/// \endcode3348///3349/// \headerfile <immintrin.h>3350///3351/// This intrinsic corresponds to the \c VPERMPS instruction.3352///3353/// \param __a3354/// A 256-bit vector of [8 x float] containing the source values.3355/// \param __b3356/// A 256-bit vector of [8 x i32] containing indexes of values to use from3357/// \a __a.3358/// \returns A 256-bit vector of [8 x float] containing the result.3359static __inline__ __m256 __DEFAULT_FN_ATTRS2563360_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)3361{3362return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);3363}33643365/// Sets the result's 256-bit vector of [4 x i64] result to copies of elements3366/// of the 256-bit vector of [4 x i64] in \a V as specified by the3367/// immediate value \a M.3368///3369/// \code{.operation}3370/// FOR i := 0 TO 33371/// j := i*643372/// k := (M >> i*2)[1:0] * 643373/// result[j+63:j] := V[k+63:k]3374/// ENDFOR3375/// \endcode3376///3377/// \headerfile <immintrin.h>3378///3379/// \code3380/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);3381/// \endcode3382///3383/// This intrinsic corresponds to the \c VPERMQ instruction.3384///3385/// \param V3386/// A 256-bit vector of [4 x i64] containing the source values.3387/// \param M3388/// An immediate 8-bit value specifying which elements to copy from \a V.3389/// \a M[1:0] specifies the index in \a a for element 0 of the result,3390/// \a M[3:2] specifies the index for element 1, and so forth.3391/// \returns A 256-bit vector of [4 x i64] containing the result.3392#define _mm256_permute4x64_epi64(V, M) \3393((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))33943395/// Sets each half of the 256-bit result either to zero or to one of the3396/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,3397/// as specified by the immediate value \a M.3398///3399/// \code{.operation}3400/// FOR i := 0 TO 
13401/// j := i*1283402/// k := M >> (i*4)3403/// IF k[3] == 03404/// CASE (k[1:0]) OF3405/// 0: result[127+j:j] := V1[127:0]3406/// 1: result[127+j:j] := V1[255:128]3407/// 2: result[127+j:j] := V2[127:0]3408/// 3: result[127+j:j] := V2[255:128]3409/// ESAC3410/// ELSE3411/// result[127+j:j] := 03412/// FI3413/// ENDFOR3414/// \endcode3415///3416/// \headerfile <immintrin.h>3417///3418/// \code3419/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);3420/// \endcode3421///3422/// This intrinsic corresponds to the \c VPERM2I128 instruction.3423///3424/// \param V13425/// A 256-bit integer vector containing source values.3426/// \param V23427/// A 256-bit integer vector containing source values.3428/// \param M3429/// An immediate value specifying how to form the result. Bits [3:0]3430/// control the lower half of the result, bits [7:4] control the upper half.3431/// Within each 4-bit control value, if bit 3 is 1, the result is zero,3432/// otherwise bits [1:0] determine the source as follows. \n3433/// 0: the lower half of \a V1 \n3434/// 1: the upper half of \a V1 \n3435/// 2: the lower half of \a V2 \n3436/// 3: the upper half of \a V23437/// \returns A 256-bit integer vector containing the result.3438#define _mm256_permute2x128_si256(V1, V2, M) \3439((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))34403441/// Extracts half of the 256-bit vector \a V to the 128-bit result. 
If bit 03442/// of the immediate \a M is zero, extracts the lower half of the result;3443/// otherwise, extracts the upper half.3444///3445/// \headerfile <immintrin.h>3446///3447/// \code3448/// __m128i _mm256_extracti128_si256(__m256i V, const int M);3449/// \endcode3450///3451/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.3452///3453/// \param V3454/// A 256-bit integer vector containing the source values.3455/// \param M3456/// An immediate value specifying which half of \a V to extract.3457/// \returns A 128-bit integer vector containing the result.3458#define _mm256_extracti128_si256(V, M) \3459((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))34603461/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the3462/// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M3463/// is zero, overwrites the lower half of the result; otherwise,3464/// overwrites the upper half.3465///3466/// \headerfile <immintrin.h>3467///3468/// \code3469/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);3470/// \endcode3471///3472/// This intrinsic corresponds to the \c VINSERTI128 instruction.3473///3474/// \param V13475/// A 256-bit integer vector containing a source value.3476/// \param V23477/// A 128-bit integer vector containing a source value.3478/// \param M3479/// An immediate value specifying where to put \a V2 in the result.3480/// \returns A 256-bit integer vector containing the result.3481#define _mm256_inserti128_si256(V1, V2, M) \3482((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \3483(__v2di)(__m128i)(V2), (int)(M)))34843485/// Conditionally loads eight 32-bit integer elements from memory \a __X, if3486/// the most significant bit of the corresponding element in the mask3487/// \a __M is set; otherwise, sets that element of the result to zero.3488/// Returns the 256-bit [8 x i32] result.3489///3490/// \code{.operation}3491/// FOR i := 0 TO 73492/// j := 
i*323493/// IF __M[j+31] == 13494/// result[j+31:j] := Load32(__X+(i*4))3495/// ELSE3496/// result[j+31:j] := 03497/// FI3498/// ENDFOR3499/// \endcode3500///3501/// \headerfile <immintrin.h>3502///3503/// This intrinsic corresponds to the \c VPMASKMOVD instruction.3504///3505/// \param __X3506/// A pointer to the memory used for loading values.3507/// \param __M3508/// A 256-bit vector of [8 x i32] containing the mask bits.3509/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed3510/// elements.3511static __inline__ __m256i __DEFAULT_FN_ATTRS2563512_mm256_maskload_epi32(int const *__X, __m256i __M)3513{3514return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);3515}35163517/// Conditionally loads four 64-bit integer elements from memory \a __X, if3518/// the most significant bit of the corresponding element in the mask3519/// \a __M is set; otherwise, sets that element of the result to zero.3520/// Returns the 256-bit [4 x i64] result.3521///3522/// \code{.operation}3523/// FOR i := 0 TO 33524/// j := i*643525/// IF __M[j+63] == 13526/// result[j+63:j] := Load64(__X+(i*8))3527/// ELSE3528/// result[j+63:j] := 03529/// FI3530/// ENDFOR3531/// \endcode3532///3533/// \headerfile <immintrin.h>3534///3535/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.3536///3537/// \param __X3538/// A pointer to the memory used for loading values.3539/// \param __M3540/// A 256-bit vector of [4 x i64] containing the mask bits.3541/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed3542/// elements.3543static __inline__ __m256i __DEFAULT_FN_ATTRS2563544_mm256_maskload_epi64(long long const *__X, __m256i __M)3545{3546return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);3547}35483549/// Conditionally loads four 32-bit integer elements from memory \a __X, if3550/// the most significant bit of the corresponding element in the mask3551/// \a __M is set; otherwise, sets that element of 
the result to zero.3552/// Returns the 128-bit [4 x i32] result.3553///3554/// \code{.operation}3555/// FOR i := 0 TO 33556/// j := i*323557/// IF __M[j+31] == 13558/// result[j+31:j] := Load32(__X+(i*4))3559/// ELSE3560/// result[j+31:j] := 03561/// FI3562/// ENDFOR3563/// \endcode3564///3565/// \headerfile <immintrin.h>3566///3567/// This intrinsic corresponds to the \c VPMASKMOVD instruction.3568///3569/// \param __X3570/// A pointer to the memory used for loading values.3571/// \param __M3572/// A 128-bit vector of [4 x i32] containing the mask bits.3573/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed3574/// elements.3575static __inline__ __m128i __DEFAULT_FN_ATTRS1283576_mm_maskload_epi32(int const *__X, __m128i __M)3577{3578return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);3579}35803581/// Conditionally loads two 64-bit integer elements from memory \a __X, if3582/// the most significant bit of the corresponding element in the mask3583/// \a __M is set; otherwise, sets that element of the result to zero.3584/// Returns the 128-bit [2 x i64] result.3585///3586/// \code{.operation}3587/// FOR i := 0 TO 13588/// j := i*643589/// IF __M[j+63] == 13590/// result[j+63:j] := Load64(__X+(i*8))3591/// ELSE3592/// result[j+63:j] := 03593/// FI3594/// ENDFOR3595/// \endcode3596///3597/// \headerfile <immintrin.h>3598///3599/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.3600///3601/// \param __X3602/// A pointer to the memory used for loading values.3603/// \param __M3604/// A 128-bit vector of [2 x i64] containing the mask bits.3605/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed3606/// elements.3607static __inline__ __m128i __DEFAULT_FN_ATTRS1283608_mm_maskload_epi64(long long const *__X, __m128i __M)3609{3610return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);3611}36123613/// Conditionally stores eight 32-bit integer elements from the 256-bit 
vector3614/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of3615/// the corresponding element in the mask \a __M is set; otherwise, the3616/// memory element is unchanged.3617///3618/// \code{.operation}3619/// FOR i := 0 TO 73620/// j := i*323621/// IF __M[j+31] == 13622/// Store32(__X+(i*4), __Y[j+31:j])3623/// FI3624/// ENDFOR3625/// \endcode3626///3627/// \headerfile <immintrin.h>3628///3629/// This intrinsic corresponds to the \c VPMASKMOVD instruction.3630///3631/// \param __X3632/// A pointer to the memory used for storing values.3633/// \param __M3634/// A 256-bit vector of [8 x i32] containing the mask bits.3635/// \param __Y3636/// A 256-bit vector of [8 x i32] containing the values to store.3637static __inline__ void __DEFAULT_FN_ATTRS2563638_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)3639{3640__builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);3641}36423643/// Conditionally stores four 64-bit integer elements from the 256-bit vector3644/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of3645/// the corresponding element in the mask \a __M is set; otherwise, the3646/// memory element is unchanged.3647///3648/// \code{.operation}3649/// FOR i := 0 TO 33650/// j := i*643651/// IF __M[j+63] == 13652/// Store64(__X+(i*8), __Y[j+63:j])3653/// FI3654/// ENDFOR3655/// \endcode3656///3657/// \headerfile <immintrin.h>3658///3659/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.3660///3661/// \param __X3662/// A pointer to the memory used for storing values.3663/// \param __M3664/// A 256-bit vector of [4 x i64] containing the mask bits.3665/// \param __Y3666/// A 256-bit vector of [4 x i64] containing the values to store.3667static __inline__ void __DEFAULT_FN_ATTRS2563668_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)3669{3670__builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);3671}36723673/// Conditionally stores four 32-bit integer 
elements from the 128-bit vector3674/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of3675/// the corresponding element in the mask \a __M is set; otherwise, the3676/// memory element is unchanged.3677///3678/// \code{.operation}3679/// FOR i := 0 TO 33680/// j := i*323681/// IF __M[j+31] == 13682/// Store32(__X+(i*4), __Y[j+31:j])3683/// FI3684/// ENDFOR3685/// \endcode3686///3687/// \headerfile <immintrin.h>3688///3689/// This intrinsic corresponds to the \c VPMASKMOVD instruction.3690///3691/// \param __X3692/// A pointer to the memory used for storing values.3693/// \param __M3694/// A 128-bit vector of [4 x i32] containing the mask bits.3695/// \param __Y3696/// A 128-bit vector of [4 x i32] containing the values to store.3697static __inline__ void __DEFAULT_FN_ATTRS1283698_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)3699{3700__builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);3701}37023703/// Conditionally stores two 64-bit integer elements from the 128-bit vector3704/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of3705/// the corresponding element in the mask \a __M is set; otherwise, the3706/// memory element is unchanged.3707///3708/// \code{.operation}3709/// FOR i := 0 TO 13710/// j := i*643711/// IF __M[j+63] == 13712/// Store64(__X+(i*8), __Y[j+63:j])3713/// FI3714/// ENDFOR3715/// \endcode3716///3717/// \headerfile <immintrin.h>3718///3719/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.3720///3721/// \param __X3722/// A pointer to the memory used for storing values.3723/// \param __M3724/// A 128-bit vector of [2 x i64] containing the mask bits.3725/// \param __Y3726/// A 128-bit vector of [2 x i64] containing the values to store.3727static __inline__ void __DEFAULT_FN_ATTRS1283728_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)3729{3730__builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);3731}37323733/// Shifts each 32-bit element 
of the 256-bit vector of [8 x i32] in \a __X3734/// left by the number of bits given in the corresponding element of the3735/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and3736/// returns the result. If the shift count for any element is greater than3737/// 31, the result for that element is zero.3738///3739/// \headerfile <immintrin.h>3740///3741/// This intrinsic corresponds to the \c VPSLLVD instruction.3742///3743/// \param __X3744/// A 256-bit vector of [8 x i32] to be shifted.3745/// \param __Y3746/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in3747/// bits).3748/// \returns A 256-bit vector of [8 x i32] containing the result.3749static __inline__ __m256i __DEFAULT_FN_ATTRS2563750_mm256_sllv_epi32(__m256i __X, __m256i __Y)3751{3752return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);3753}37543755/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X3756/// left by the number of bits given in the corresponding element of the3757/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and3758/// returns the result. 
If the shift count for any element is greater than3759/// 31, the result for that element is zero.3760///3761/// \headerfile <immintrin.h>3762///3763/// This intrinsic corresponds to the \c VPSLLVD instruction.3764///3765/// \param __X3766/// A 128-bit vector of [4 x i32] to be shifted.3767/// \param __Y3768/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in3769/// bits).3770/// \returns A 128-bit vector of [4 x i32] containing the result.3771static __inline__ __m128i __DEFAULT_FN_ATTRS1283772_mm_sllv_epi32(__m128i __X, __m128i __Y)3773{3774return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);3775}37763777/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X3778/// left by the number of bits given in the corresponding element of the3779/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and3780/// returns the result. If the shift count for any element is greater than3781/// 63, the result for that element is zero.3782///3783/// \headerfile <immintrin.h>3784///3785/// This intrinsic corresponds to the \c VPSLLVQ instruction.3786///3787/// \param __X3788/// A 256-bit vector of [4 x i64] to be shifted.3789/// \param __Y3790/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in3791/// bits).3792/// \returns A 256-bit vector of [4 x i64] containing the result.3793static __inline__ __m256i __DEFAULT_FN_ATTRS2563794_mm256_sllv_epi64(__m256i __X, __m256i __Y)3795{3796return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);3797}37983799/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X3800/// left by the number of bits given in the corresponding element of the3801/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and3802/// returns the result. 
If the shift count for any element is greater than3803/// 63, the result for that element is zero.3804///3805/// \headerfile <immintrin.h>3806///3807/// This intrinsic corresponds to the \c VPSLLVQ instruction.3808///3809/// \param __X3810/// A 128-bit vector of [2 x i64] to be shifted.3811/// \param __Y3812/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in3813/// bits).3814/// \returns A 128-bit vector of [2 x i64] containing the result.3815static __inline__ __m128i __DEFAULT_FN_ATTRS1283816_mm_sllv_epi64(__m128i __X, __m128i __Y)3817{3818return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);3819}38203821/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X3822/// right by the number of bits given in the corresponding element of the3823/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and3824/// returns the result. If the shift count for any element is greater than3825/// 31, the result for that element is 0 or -1 according to the sign bit3826/// for that element.3827///3828/// \headerfile <immintrin.h>3829///3830/// This intrinsic corresponds to the \c VPSRAVD instruction.3831///3832/// \param __X3833/// A 256-bit vector of [8 x i32] to be shifted.3834/// \param __Y3835/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in3836/// bits).3837/// \returns A 256-bit vector of [8 x i32] containing the result.3838static __inline__ __m256i __DEFAULT_FN_ATTRS2563839_mm256_srav_epi32(__m256i __X, __m256i __Y)3840{3841return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);3842}38433844/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X3845/// right by the number of bits given in the corresponding element of the3846/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and3847/// returns the result. 
If the shift count for any element is greater than3848/// 31, the result for that element is 0 or -1 according to the sign bit3849/// for that element.3850///3851/// \headerfile <immintrin.h>3852///3853/// This intrinsic corresponds to the \c VPSRAVD instruction.3854///3855/// \param __X3856/// A 128-bit vector of [4 x i32] to be shifted.3857/// \param __Y3858/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in3859/// bits).3860/// \returns A 128-bit vector of [4 x i32] containing the result.3861static __inline__ __m128i __DEFAULT_FN_ATTRS1283862_mm_srav_epi32(__m128i __X, __m128i __Y)3863{3864return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);3865}38663867/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X3868/// right by the number of bits given in the corresponding element of the3869/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and3870/// returns the result. If the shift count for any element is greater than3871/// 31, the result for that element is zero.3872///3873/// \headerfile <immintrin.h>3874///3875/// This intrinsic corresponds to the \c VPSRLVD instruction.3876///3877/// \param __X3878/// A 256-bit vector of [8 x i32] to be shifted.3879/// \param __Y3880/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in3881/// bits).3882/// \returns A 256-bit vector of [8 x i32] containing the result.3883static __inline__ __m256i __DEFAULT_FN_ATTRS2563884_mm256_srlv_epi32(__m256i __X, __m256i __Y)3885{3886return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);3887}38883889/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X3890/// right by the number of bits given in the corresponding element of the3891/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and3892/// returns the result. 
If the shift count for any element is greater than3893/// 31, the result for that element is zero.3894///3895/// \headerfile <immintrin.h>3896///3897/// This intrinsic corresponds to the \c VPSRLVD instruction.3898///3899/// \param __X3900/// A 128-bit vector of [4 x i32] to be shifted.3901/// \param __Y3902/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in3903/// bits).3904/// \returns A 128-bit vector of [4 x i32] containing the result.3905static __inline__ __m128i __DEFAULT_FN_ATTRS1283906_mm_srlv_epi32(__m128i __X, __m128i __Y)3907{3908return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);3909}39103911/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X3912/// right by the number of bits given in the corresponding element of the3913/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and3914/// returns the result. If the shift count for any element is greater than3915/// 63, the result for that element is zero.3916///3917/// \headerfile <immintrin.h>3918///3919/// This intrinsic corresponds to the \c VPSRLVQ instruction.3920///3921/// \param __X3922/// A 256-bit vector of [4 x i64] to be shifted.3923/// \param __Y3924/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in3925/// bits).3926/// \returns A 256-bit vector of [4 x i64] containing the result.3927static __inline__ __m256i __DEFAULT_FN_ATTRS2563928_mm256_srlv_epi64(__m256i __X, __m256i __Y)3929{3930return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);3931}39323933/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X3934/// right by the number of bits given in the corresponding element of the3935/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and3936/// returns the result. 
If the shift count for any element is greater than3937/// 63, the result for that element is zero.3938///3939/// \headerfile <immintrin.h>3940///3941/// This intrinsic corresponds to the \c VPSRLVQ instruction.3942///3943/// \param __X3944/// A 128-bit vector of [2 x i64] to be shifted.3945/// \param __Y3946/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in3947/// bits).3948/// \returns A 128-bit vector of [2 x i64] containing the result.3949static __inline__ __m128i __DEFAULT_FN_ATTRS1283950_mm_srlv_epi64(__m128i __X, __m128i __Y)3951{3952return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);3953}39543955/// Conditionally gathers two 64-bit floating-point values, either from the3956/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled3957/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector3958/// of [2 x double] in \a mask determines the source for each element.3959///3960/// \code{.operation}3961/// FOR element := 0 to 13962/// j := element*643963/// k := element*323964/// IF mask[j+63] == 03965/// result[j+63:j] := a[j+63:j]3966/// ELSE3967/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)3968/// FI3969/// ENDFOR3970/// \endcode3971///3972/// \headerfile <immintrin.h>3973///3974/// \code3975/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,3976/// __m128d mask, const int s);3977/// \endcode3978///3979/// This intrinsic corresponds to the \c VGATHERDPD instruction.3980///3981/// \param a3982/// A 128-bit vector of [2 x double] used as the source when a mask bit is3983/// zero.3984/// \param m3985/// A pointer to the memory used for loading values.3986/// \param i3987/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only3988/// the first two elements are used.3989/// \param mask3990/// A 128-bit vector of [2 x double] containing the mask. 
///    The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding element of \a a is copied
///    to the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
/// of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers two 64-bit floating-point values, either from the
/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
/// of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
/// of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
/// of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers eight 32-bit floating-point values, either from the
/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
/// of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))

/// Conditionally gathers two 32-bit floating-point values, either from the
/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
/// of [4 x float] in \a mask determines the source for the lower two
/// elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory. Only the first
///    two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
/// of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding element of \a a is copied to
///    the result; otherwise the element is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
/// of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits.
If a4379/// mask bit is zero, the corresponding value from vector \a a is gathered;4380/// otherwise the value is loaded from memory.4381/// \param s4382/// A literal constant scale factor for the indexes in \a i. Must be4383/// 1, 2, 4, or 8.4384/// \returns A 128-bit vector of [4 x i32] containing the gathered values.4385#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \4386((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \4387(int const *)(m), \4388(__v4si)(__m128i)(i), \4389(__v4si)(__m128i)(mask), (s)))43904391/// Conditionally gathers eight 32-bit integer values, either from the4392/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled4393/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector4394/// of [8 x i32] in \a mask determines the source for each element.4395///4396/// \code{.operation}4397/// FOR element := 0 to 74398/// j := element*324399/// k := element*324400/// IF mask[j+31] == 04401/// result[j+31:j] := a[j+31:j]4402/// ELSE4403/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)4404/// FI4405/// ENDFOR4406/// \endcode4407///4408/// \headerfile <immintrin.h>4409///4410/// \code4411/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,4412/// __m256i mask, const int s);4413/// \endcode4414///4415/// This intrinsic corresponds to the \c VPGATHERDD instruction.4416///4417/// \param a4418/// A 256-bit vector of [8 x i32] used as the source when a mask bit is4419/// zero.4420/// \param m4421/// A pointer to the memory used for loading values.4422/// \param i4423/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.4424/// \param mask4425/// A 256-bit vector of [8 x i32] containing the mask. The most significant4426/// bit of each element in the mask vector represents the mask bits. 
If a4427/// mask bit is zero, the corresponding value from vector \a a is gathered;4428/// otherwise the value is loaded from memory.4429/// \param s4430/// A literal constant scale factor for the indexes in \a i. Must be4431/// 1, 2, 4, or 8.4432/// \returns A 256-bit vector of [8 x i32] containing the gathered values.4433#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \4434((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \4435(int const *)(m), \4436(__v8si)(__m256i)(i), \4437(__v8si)(__m256i)(mask), (s)))44384439/// Conditionally gathers two 32-bit integer values, either from the4440/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled4441/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector4442/// of [4 x i32] in \a mask determines the source for the lower two4443/// elements. The upper two elements of the result are zeroed.4444///4445/// \code{.operation}4446/// FOR element := 0 to 14447/// j := element*324448/// k := element*644449/// IF mask[j+31] == 04450/// result[j+31:j] := a[j+31:j]4451/// ELSE4452/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)4453/// FI4454/// ENDFOR4455/// result[127:64] := 04456/// \endcode4457///4458/// \headerfile <immintrin.h>4459///4460/// \code4461/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,4462/// __m128i mask, const int s);4463/// \endcode4464///4465/// This intrinsic corresponds to the \c VPGATHERQD instruction.4466///4467/// \param a4468/// A 128-bit vector of [4 x i32] used as the source when a mask bit is4469/// zero. Only the first two elements are used.4470/// \param m4471/// A pointer to the memory used for loading values.4472/// \param i4473/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.4474/// \param mask4475/// A 128-bit vector of [4 x i32] containing the mask. The most significant4476/// bit of each element in the mask vector represents the mask bits. 
If a4477/// mask bit is zero, the corresponding value from vector \a a is gathered;4478/// otherwise the value is loaded from memory. Only the first two elements4479/// are used.4480/// \param s4481/// A literal constant scale factor for the indexes in \a i. Must be4482/// 1, 2, 4, or 8.4483/// \returns A 128-bit vector of [4 x i32] containing the gathered values.4484#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \4485((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \4486(int const *)(m), \4487(__v2di)(__m128i)(i), \4488(__v4si)(__m128i)(mask), (s)))44894490/// Conditionally gathers four 32-bit integer values, either from the4491/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled4492/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector4493/// of [4 x i32] in \a mask determines the source for each element.4494///4495/// \code{.operation}4496/// FOR element := 0 to 34497/// j := element*324498/// k := element*644499/// IF mask[j+31] == 04500/// result[j+31:j] := a[j+31:j]4501/// ELSE4502/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)4503/// FI4504/// ENDFOR4505/// \endcode4506///4507/// \headerfile <immintrin.h>4508///4509/// \code4510/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,4511/// __m128i mask, const int s);4512/// \endcode4513///4514/// This intrinsic corresponds to the \c VPGATHERQD instruction.4515///4516/// \param a4517/// A 128-bit vector of [4 x i32] used as the source when a mask bit is4518/// zero.4519/// \param m4520/// A pointer to the memory used for loading values.4521/// \param i4522/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.4523/// \param mask4524/// A 128-bit vector of [4 x i32] containing the mask. The most significant4525/// bit of each element in the mask vector represents the mask bits. 
If a4526/// mask bit is zero, the corresponding value from vector \a a is gathered;4527/// otherwise the value is loaded from memory.4528/// \param s4529/// A literal constant scale factor for the indexes in \a i. Must be4530/// 1, 2, 4, or 8.4531/// \returns A 128-bit vector of [4 x i32] containing the gathered values.4532#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \4533((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \4534(int const *)(m), \4535(__v4di)(__m256i)(i), \4536(__v4si)(__m128i)(mask), (s)))45374538/// Conditionally gathers two 64-bit integer values, either from the4539/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled4540/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector4541/// of [2 x i64] in \a mask determines the source for each element.4542///4543/// \code{.operation}4544/// FOR element := 0 to 14545/// j := element*644546/// k := element*324547/// IF mask[j+63] == 04548/// result[j+63:j] := a[j+63:j]4549/// ELSE4550/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)4551/// FI4552/// ENDFOR4553/// \endcode4554///4555/// \headerfile <immintrin.h>4556///4557/// \code4558/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,4559/// __m128i mask, const int s);4560/// \endcode4561///4562/// This intrinsic corresponds to the \c VPGATHERDQ instruction.4563///4564/// \param a4565/// A 128-bit vector of [2 x i64] used as the source when a mask bit is4566/// zero.4567/// \param m4568/// A pointer to the memory used for loading values.4569/// \param i4570/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only4571/// the first two elements are used.4572/// \param mask4573/// A 128-bit vector of [2 x i64] containing the mask. The most significant4574/// bit of each element in the mask vector represents the mask bits. 
If a4575/// mask bit is zero, the corresponding value from vector \a a is gathered;4576/// otherwise the value is loaded from memory.4577/// \param s4578/// A literal constant scale factor for the indexes in \a i. Must be4579/// 1, 2, 4, or 8.4580/// \returns A 128-bit vector of [2 x i64] containing the gathered values.4581#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \4582((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \4583(long long const *)(m), \4584(__v4si)(__m128i)(i), \4585(__v2di)(__m128i)(mask), (s)))45864587/// Conditionally gathers four 64-bit integer values, either from the4588/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled4589/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector4590/// of [4 x i64] in \a mask determines the source for each element.4591///4592/// \code{.operation}4593/// FOR element := 0 to 34594/// j := element*644595/// k := element*324596/// IF mask[j+63] == 04597/// result[j+63:j] := a[j+63:j]4598/// ELSE4599/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)4600/// FI4601/// ENDFOR4602/// \endcode4603///4604/// \headerfile <immintrin.h>4605///4606/// \code4607/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,4608/// __m128i i, __m256i mask, const int s);4609/// \endcode4610///4611/// This intrinsic corresponds to the \c VPGATHERDQ instruction.4612///4613/// \param a4614/// A 256-bit vector of [4 x i64] used as the source when a mask bit is4615/// zero.4616/// \param m4617/// A pointer to the memory used for loading values.4618/// \param i4619/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.4620/// \param mask4621/// A 256-bit vector of [4 x i64] containing the mask. The most significant4622/// bit of each element in the mask vector represents the mask bits. 
If a4623/// mask bit is zero, the corresponding value from vector \a a is gathered;4624/// otherwise the value is loaded from memory.4625/// \param s4626/// A literal constant scale factor for the indexes in \a i. Must be4627/// 1, 2, 4, or 8.4628/// \returns A 256-bit vector of [4 x i64] containing the gathered values.4629#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \4630((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \4631(long long const *)(m), \4632(__v4si)(__m128i)(i), \4633(__v4di)(__m256i)(mask), (s)))46344635/// Conditionally gathers two 64-bit integer values, either from the4636/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled4637/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector4638/// of [2 x i64] in \a mask determines the source for each element.4639///4640/// \code{.operation}4641/// FOR element := 0 to 14642/// j := element*644643/// k := element*644644/// IF mask[j+63] == 04645/// result[j+63:j] := a[j+63:j]4646/// ELSE4647/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)4648/// FI4649/// ENDFOR4650/// \endcode4651///4652/// \headerfile <immintrin.h>4653///4654/// \code4655/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,4656/// __m128i mask, const int s);4657/// \endcode4658///4659/// This intrinsic corresponds to the \c VPGATHERQQ instruction.4660///4661/// \param a4662/// A 128-bit vector of [2 x i64] used as the source when a mask bit is4663/// zero.4664/// \param m4665/// A pointer to the memory used for loading values.4666/// \param i4667/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.4668/// \param mask4669/// A 128-bit vector of [2 x i64] containing the mask. The most significant4670/// bit of each element in the mask vector represents the mask bits. 
If a4671/// mask bit is zero, the corresponding value from vector \a a is gathered;4672/// otherwise the value is loaded from memory.4673/// \param s4674/// A literal constant scale factor for the indexes in \a i. Must be4675/// 1, 2, 4, or 8.4676/// \returns A 128-bit vector of [2 x i64] containing the gathered values.4677#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \4678((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \4679(long long const *)(m), \4680(__v2di)(__m128i)(i), \4681(__v2di)(__m128i)(mask), (s)))46824683/// Conditionally gathers four 64-bit integer values, either from the4684/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled4685/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector4686/// of [4 x i64] in \a mask determines the source for each element.4687///4688/// \code{.operation}4689/// FOR element := 0 to 34690/// j := element*644691/// k := element*644692/// IF mask[j+63] == 04693/// result[j+63:j] := a[j+63:j]4694/// ELSE4695/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)4696/// FI4697/// ENDFOR4698/// \endcode4699///4700/// \headerfile <immintrin.h>4701///4702/// \code4703/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,4704/// __m256i i, __m256i mask, const int s);4705/// \endcode4706///4707/// This intrinsic corresponds to the \c VPGATHERQQ instruction.4708///4709/// \param a4710/// A 256-bit vector of [4 x i64] used as the source when a mask bit is4711/// zero.4712/// \param m4713/// A pointer to the memory used for loading values.4714/// \param i4715/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.4716/// \param mask4717/// A 256-bit vector of [4 x i64] containing the mask. The most significant4718/// bit of each element in the mask vector represents the mask bits. 
If a4719/// mask bit is zero, the corresponding value from vector \a a is gathered;4720/// otherwise the value is loaded from memory.4721/// \param s4722/// A literal constant scale factor for the indexes in \a i. Must be4723/// 1, 2, 4, or 8.4724/// \returns A 256-bit vector of [4 x i64] containing the gathered values.4725#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \4726((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \4727(long long const *)(m), \4728(__v4di)(__m256i)(i), \4729(__v4di)(__m256i)(mask), (s)))47304731/// Gathers two 64-bit floating-point values from memory \a m using scaled4732/// indexes from the 128-bit vector of [4 x i32] in \a i.4733///4734/// \code{.operation}4735/// FOR element := 0 to 14736/// j := element*644737/// k := element*324738/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)4739/// ENDFOR4740/// \endcode4741///4742/// \headerfile <immintrin.h>4743///4744/// \code4745/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);4746/// \endcode4747///4748/// This intrinsic corresponds to the \c VGATHERDPD instruction.4749///4750/// \param m4751/// A pointer to the memory used for loading values.4752/// \param i4753/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only4754/// the first two elements are used.4755/// \param s4756/// A literal constant scale factor for the indexes in \a i. 
Must be4757/// 1, 2, 4, or 8.4758/// \returns A 128-bit vector of [2 x double] containing the gathered values.4759#define _mm_i32gather_pd(m, i, s) \4760((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \4761(double const *)(m), \4762(__v4si)(__m128i)(i), \4763(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \4764_mm_setzero_pd()), \4765(s)))47664767/// Gathers four 64-bit floating-point values from memory \a m using scaled4768/// indexes from the 128-bit vector of [4 x i32] in \a i.4769///4770/// \code{.operation}4771/// FOR element := 0 to 34772/// j := element*644773/// k := element*324774/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)4775/// ENDFOR4776/// \endcode4777///4778/// \headerfile <immintrin.h>4779///4780/// \code4781/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);4782/// \endcode4783///4784/// This intrinsic corresponds to the \c VGATHERDPD instruction.4785///4786/// \param m4787/// A pointer to the memory used for loading values.4788/// \param i4789/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.4790/// \param s4791/// A literal constant scale factor for the indexes in \a i. 
Must be4792/// 1, 2, 4, or 8.4793/// \returns A 256-bit vector of [4 x double] containing the gathered values.4794#define _mm256_i32gather_pd(m, i, s) \4795((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \4796(double const *)(m), \4797(__v4si)(__m128i)(i), \4798(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \4799_mm256_setzero_pd(), \4800_CMP_EQ_OQ), \4801(s)))48024803/// Gathers two 64-bit floating-point values from memory \a m using scaled4804/// indexes from the 128-bit vector of [2 x i64] in \a i.4805///4806/// \code{.operation}4807/// FOR element := 0 to 14808/// j := element*644809/// k := element*644810/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)4811/// ENDFOR4812/// \endcode4813///4814/// \headerfile <immintrin.h>4815///4816/// \code4817/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);4818/// \endcode4819///4820/// This intrinsic corresponds to the \c VGATHERQPD instruction.4821///4822/// \param m4823/// A pointer to the memory used for loading values.4824/// \param i4825/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.4826/// \param s4827/// A literal constant scale factor for the indexes in \a i. 
Must be4828/// 1, 2, 4, or 8.4829/// \returns A 128-bit vector of [2 x double] containing the gathered values.4830#define _mm_i64gather_pd(m, i, s) \4831((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \4832(double const *)(m), \4833(__v2di)(__m128i)(i), \4834(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \4835_mm_setzero_pd()), \4836(s)))48374838/// Gathers four 64-bit floating-point values from memory \a m using scaled4839/// indexes from the 256-bit vector of [4 x i64] in \a i.4840///4841/// \code{.operation}4842/// FOR element := 0 to 34843/// j := element*644844/// k := element*644845/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)4846/// ENDFOR4847/// \endcode4848///4849/// \headerfile <immintrin.h>4850///4851/// \code4852/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);4853/// \endcode4854///4855/// This intrinsic corresponds to the \c VGATHERQPD instruction.4856///4857/// \param m4858/// A pointer to the memory used for loading values.4859/// \param i4860/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.4861/// \param s4862/// A literal constant scale factor for the indexes in \a i. 
Must be4863/// 1, 2, 4, or 8.4864/// \returns A 256-bit vector of [4 x double] containing the gathered values.4865#define _mm256_i64gather_pd(m, i, s) \4866((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \4867(double const *)(m), \4868(__v4di)(__m256i)(i), \4869(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \4870_mm256_setzero_pd(), \4871_CMP_EQ_OQ), \4872(s)))48734874/// Gathers four 32-bit floating-point values from memory \a m using scaled4875/// indexes from the 128-bit vector of [4 x i32] in \a i.4876///4877/// \code{.operation}4878/// FOR element := 0 to 34879/// j := element*324880/// k := element*324881/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)4882/// ENDFOR4883/// \endcode4884///4885/// \headerfile <immintrin.h>4886///4887/// \code4888/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);4889/// \endcode4890///4891/// This intrinsic corresponds to the \c VGATHERDPS instruction.4892///4893/// \param m4894/// A pointer to the memory used for loading values.4895/// \param i4896/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.4897/// \param s4898/// A literal constant scale factor for the indexes in \a i. 
Must be4899/// 1, 2, 4, or 8.4900/// \returns A 128-bit vector of [4 x float] containing the gathered values.4901#define _mm_i32gather_ps(m, i, s) \4902((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \4903(float const *)(m), \4904(__v4si)(__m128i)(i), \4905(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \4906_mm_setzero_ps()), \4907(s)))49084909/// Gathers eight 32-bit floating-point values from memory \a m using scaled4910/// indexes from the 256-bit vector of [8 x i32] in \a i.4911///4912/// \code{.operation}4913/// FOR element := 0 to 74914/// j := element*324915/// k := element*324916/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)4917/// ENDFOR4918/// \endcode4919///4920/// \headerfile <immintrin.h>4921///4922/// \code4923/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);4924/// \endcode4925///4926/// This intrinsic corresponds to the \c VGATHERDPS instruction.4927///4928/// \param m4929/// A pointer to the memory used for loading values.4930/// \param i4931/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.4932/// \param s4933/// A literal constant scale factor for the indexes in \a i. Must be4934/// 1, 2, 4, or 8.4935/// \returns A 256-bit vector of [8 x float] containing the gathered values.4936#define _mm256_i32gather_ps(m, i, s) \4937((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \4938(float const *)(m), \4939(__v8si)(__m256i)(i), \4940(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \4941_mm256_setzero_ps(), \4942_CMP_EQ_OQ), \4943(s)))49444945/// Gathers two 32-bit floating-point values from memory \a m using scaled4946/// indexes from the 128-bit vector of [2 x i64] in \a i. 
The upper two4947/// elements of the result are zeroed.4948///4949/// \code{.operation}4950/// FOR element := 0 to 14951/// j := element*324952/// k := element*644953/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)4954/// ENDFOR4955/// result[127:64] := 04956/// \endcode4957///4958/// \headerfile <immintrin.h>4959///4960/// \code4961/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);4962/// \endcode4963///4964/// This intrinsic corresponds to the \c VGATHERQPS instruction.4965///4966/// \param m4967/// A pointer to the memory used for loading values.4968/// \param i4969/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.4970/// \param s4971/// A literal constant scale factor for the indexes in \a i. Must be4972/// 1, 2, 4, or 8.4973/// \returns A 128-bit vector of [4 x float] containing the gathered values.4974#define _mm_i64gather_ps(m, i, s) \4975((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \4976(float const *)(m), \4977(__v2di)(__m128i)(i), \4978(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \4979_mm_setzero_ps()), \4980(s)))49814982/// Gathers four 32-bit floating-point values from memory \a m using scaled4983/// indexes from the 256-bit vector of [4 x i64] in \a i.4984///4985/// \code{.operation}4986/// FOR element := 0 to 34987/// j := element*324988/// k := element*644989/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)4990/// ENDFOR4991/// \endcode4992///4993/// \headerfile <immintrin.h>4994///4995/// \code4996/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);4997/// \endcode4998///4999/// This intrinsic corresponds to the \c VGATHERQPS instruction.5000///5001/// \param m5002/// A pointer to the memory used for loading values.5003/// \param i5004/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.5005/// \param s5006/// A literal constant scale factor for the indexes in \a i. 
Must be5007/// 1, 2, 4, or 8.5008/// \returns A 128-bit vector of [4 x float] containing the gathered values.5009#define _mm256_i64gather_ps(m, i, s) \5010((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \5011(float const *)(m), \5012(__v4di)(__m256i)(i), \5013(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \5014_mm_setzero_ps()), \5015(s)))50165017/// Gathers four 32-bit integer values from memory \a m using scaled5018/// indexes from the 128-bit vector of [4 x i32] in \a i.5019///5020/// \code{.operation}5021/// FOR element := 0 to 35022/// j := element*325023/// k := element*325024/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)5025/// ENDFOR5026/// \endcode5027///5028/// \headerfile <immintrin.h>5029///5030/// \code5031/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);5032/// \endcode5033///5034/// This intrinsic corresponds to the \c VPGATHERDD instruction.5035///5036/// \param m5037/// A pointer to the memory used for loading values.5038/// \param i5039/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.5040/// \param s5041/// A literal constant scale factor for the indexes in \a i. 
Must be5042/// 1, 2, 4, or 8.5043/// \returns A 128-bit vector of [4 x i32] containing the gathered values.5044#define _mm_i32gather_epi32(m, i, s) \5045((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \5046(int const *)(m), (__v4si)(__m128i)(i), \5047(__v4si)_mm_set1_epi32(-1), (s)))50485049/// Gathers eight 32-bit integer values from memory \a m using scaled5050/// indexes from the 256-bit vector of [8 x i32] in \a i.5051///5052/// \code{.operation}5053/// FOR element := 0 to 75054/// j := element*325055/// k := element*325056/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)5057/// ENDFOR5058/// \endcode5059///5060/// \headerfile <immintrin.h>5061///5062/// \code5063/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);5064/// \endcode5065///5066/// This intrinsic corresponds to the \c VPGATHERDD instruction.5067///5068/// \param m5069/// A pointer to the memory used for loading values.5070/// \param i5071/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.5072/// \param s5073/// A literal constant scale factor for the indexes in \a i. Must be5074/// 1, 2, 4, or 8.5075/// \returns A 256-bit vector of [8 x i32] containing the gathered values.5076#define _mm256_i32gather_epi32(m, i, s) \5077((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \5078(int const *)(m), (__v8si)(__m256i)(i), \5079(__v8si)_mm256_set1_epi32(-1), (s)))50805081/// Gathers two 32-bit integer values from memory \a m using scaled indexes5082/// from the 128-bit vector of [2 x i64] in \a i. 
The upper two elements5083/// of the result are zeroed.5084///5085/// \code{.operation}5086/// FOR element := 0 to 15087/// j := element*325088/// k := element*645089/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)5090/// ENDFOR5091/// result[127:64] := 05092/// \endcode5093///5094/// \headerfile <immintrin.h>5095///5096/// \code5097/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);5098/// \endcode5099///5100/// This intrinsic corresponds to the \c VPGATHERQD instruction.5101///5102/// \param m5103/// A pointer to the memory used for loading values.5104/// \param i5105/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.5106/// \param s5107/// A literal constant scale factor for the indexes in \a i. Must be5108/// 1, 2, 4, or 8.5109/// \returns A 128-bit vector of [4 x i32] containing the gathered values.5110#define _mm_i64gather_epi32(m, i, s) \5111((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \5112(int const *)(m), (__v2di)(__m128i)(i), \5113(__v4si)_mm_set1_epi32(-1), (s)))51145115/// Gathers four 32-bit integer values from memory \a m using scaled indexes5116/// from the 256-bit vector of [4 x i64] in \a i.5117///5118/// \code{.operation}5119/// FOR element := 0 to 35120/// j := element*325121/// k := element*645122/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)5123/// ENDFOR5124/// \endcode5125///5126/// \headerfile <immintrin.h>5127///5128/// \code5129/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);5130/// \endcode5131///5132/// This intrinsic corresponds to the \c VPGATHERQD instruction.5133///5134/// \param m5135/// A pointer to the memory used for loading values.5136/// \param i5137/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.5138/// \param s5139/// A literal constant scale factor for the indexes in \a i. 
Must be5140/// 1, 2, 4, or 8.5141/// \returns A 128-bit vector of [4 x i32] containing the gathered values.5142#define _mm256_i64gather_epi32(m, i, s) \5143((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \5144(int const *)(m), (__v4di)(__m256i)(i), \5145(__v4si)_mm_set1_epi32(-1), (s)))51465147/// Gathers two 64-bit integer values from memory \a m using scaled indexes5148/// from the 128-bit vector of [4 x i32] in \a i.5149///5150/// \code{.operation}5151/// FOR element := 0 to 15152/// j := element*645153/// k := element*325154/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)5155/// ENDFOR5156/// \endcode5157///5158/// \headerfile <immintrin.h>5159///5160/// \code5161/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);5162/// \endcode5163///5164/// This intrinsic corresponds to the \c VPGATHERDQ instruction.5165///5166/// \param m5167/// A pointer to the memory used for loading values.5168/// \param i5169/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only5170/// the first two elements are used.5171/// \param s5172/// A literal constant scale factor for the indexes in \a i. 
Must be5173/// 1, 2, 4, or 8.5174/// \returns A 128-bit vector of [2 x i64] containing the gathered values.5175#define _mm_i32gather_epi64(m, i, s) \5176((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \5177(long long const *)(m), \5178(__v4si)(__m128i)(i), \5179(__v2di)_mm_set1_epi64x(-1), (s)))51805181/// Gathers four 64-bit integer values from memory \a m using scaled indexes5182/// from the 128-bit vector of [4 x i32] in \a i.5183///5184/// \code{.operation}5185/// FOR element := 0 to 35186/// j := element*645187/// k := element*325188/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)5189/// ENDFOR5190/// \endcode5191///5192/// \headerfile <immintrin.h>5193///5194/// \code5195/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);5196/// \endcode5197///5198/// This intrinsic corresponds to the \c VPGATHERDQ instruction.5199///5200/// \param m5201/// A pointer to the memory used for loading values.5202/// \param i5203/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.5204/// \param s5205/// A literal constant scale factor for the indexes in \a i. 
Must be5206/// 1, 2, 4, or 8.5207/// \returns A 256-bit vector of [4 x i64] containing the gathered values.5208#define _mm256_i32gather_epi64(m, i, s) \5209((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \5210(long long const *)(m), \5211(__v4si)(__m128i)(i), \5212(__v4di)_mm256_set1_epi64x(-1), (s)))52135214/// Gathers two 64-bit integer values from memory \a m using scaled indexes5215/// from the 128-bit vector of [2 x i64] in \a i.5216///5217/// \code{.operation}5218/// FOR element := 0 to 15219/// j := element*645220/// k := element*645221/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)5222/// ENDFOR5223/// \endcode5224///5225/// \headerfile <immintrin.h>5226///5227/// \code5228/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);5229/// \endcode5230///5231/// This intrinsic corresponds to the \c VPGATHERQQ instruction.5232///5233/// \param m5234/// A pointer to the memory used for loading values.5235/// \param i5236/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.5237/// \param s5238/// A literal constant scale factor for the indexes in \a i. 
Must be5239/// 1, 2, 4, or 8.5240/// \returns A 128-bit vector of [2 x i64] containing the gathered values.5241#define _mm_i64gather_epi64(m, i, s) \5242((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \5243(long long const *)(m), \5244(__v2di)(__m128i)(i), \5245(__v2di)_mm_set1_epi64x(-1), (s)))52465247/// Gathers four 64-bit integer values from memory \a m using scaled indexes5248/// from the 256-bit vector of [4 x i64] in \a i.5249///5250/// \code{.operation}5251/// FOR element := 0 to 35252/// j := element*645253/// k := element*645254/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)5255/// ENDFOR5256/// \endcode5257///5258/// \headerfile <immintrin.h>5259///5260/// \code5261/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);5262/// \endcode5263///5264/// This intrinsic corresponds to the \c VPGATHERQQ instruction.5265///5266/// \param m5267/// A pointer to the memory used for loading values.5268/// \param i5269/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.5270/// \param s5271/// A literal constant scale factor for the indexes in \a i. Must be5272/// 1, 2, 4, or 8.5273/// \returns A 256-bit vector of [4 x i64] containing the gathered values.5274#define _mm256_i64gather_epi64(m, i, s) \5275((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \5276(long long const *)(m), \5277(__v4di)(__m256i)(i), \5278(__v4di)_mm256_set1_epi64x(-1), (s)))52795280#undef __DEFAULT_FN_ATTRS2565281#undef __DEFAULT_FN_ATTRS12852825283#endif /* __AVX2INTRIN_H */528452855286