CoCalc -- avx2intrin.h

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Headers/avx2intrin.h
³⁵²³³ views
1
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9

10
/* Reject direct inclusion: __IMMINTRIN_H must already be defined when this
   header is processed, otherwise compilation stops with the message below. */
#ifndef __IMMINTRIN_H
#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif
13

14
#ifndef __AVX2INTRIN_H
15
#define __AVX2INTRIN_H
16

17
/* Define the default attributes for the functions in this file. */
18
/* Attribute set applied to the 256-bit intrinsics in this header: always
   inline, no debug info, target "avx2,no-evex512", and a declared minimum
   vector width of 256. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
21
/* Same attribute set as the 256-bit variant, but with a declared minimum
   vector width of 128. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
24

25
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
26
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
27
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
28
///    \a Y.
29
///
30
///    Eight SAD results are computed using the lower half of the input
31
///    vectors, and another eight using the upper half. These 16-bit values
32
///    are returned in the lower and upper halves of the 256-bit result,
33
///    respectively.
34
///
35
///    A single SAD operation selects four bytes from \a X and four bytes from
36
///    \a Y as input. It computes the differences between each \a X byte and
37
///    the corresponding \a Y byte, takes the absolute value of each
38
///    difference, and sums these four values to form one 16-bit result. The
39
///    intrinsic computes 16 of these results with different sets of input
40
///    bytes.
41
///
42
///    For each set of eight results, the SAD operations use the same four
43
///    bytes from \a Y; the starting bit position for these four bytes is
44
///    specified by \a M[1:0] times 32. The eight operations use successive
45
///    sets of four bytes from \a X; the starting bit position for the first
46
///    set of four bytes is specified by \a M[2] times 32. These bit positions
47
///    are all relative to the 128-bit lane for each set of eight operations.
48
///
49
/// \code{.operation}
50
/// r := 0
51
/// FOR i := 0 TO 1
52
///   j := i*3
53
///   Ybase := M[j+1:j]*32 + i*128
54
///   Xbase := M[j+2]*32 + i*128
55
///   FOR k := 0 TO 7
56
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
57
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
58
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
59
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
60
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
61
///     Xbase := Xbase + 8
62
///     r := r + 16
63
///   ENDFOR
64
/// ENDFOR
65
/// \endcode
66
///
67
/// \headerfile <immintrin.h>
68
///
69
/// \code
70
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
71
/// \endcode
72
///
73
/// This intrinsic corresponds to the \c VMPSADBW instruction.
74
///
75
/// \param X
76
///    A 256-bit integer vector containing one of the inputs.
77
/// \param Y
78
///    A 256-bit integer vector containing one of the inputs.
79
/// \param M
80
///     An unsigned immediate value specifying the starting positions of the
81
///     bytes to operate on.
82
/// \returns A 256-bit vector of [16 x i16] containing the result.
83
/* Implemented as a macro; both vectors are passed through the __v32qi view
   and the selector M is cast to int before reaching the builtin. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))
86

87
/// Computes the absolute value of each signed byte in the 256-bit integer
88
///    vector \a __a and returns each value in the corresponding byte of
89
///    the result.
90
///
91
/// \headerfile <immintrin.h>
92
///
93
/// This intrinsic corresponds to the \c VPABSB instruction.
94
///
95
/// \param __a
96
///    A 256-bit integer vector.
97
/// \returns A 256-bit integer vector containing the result.
98
static __inline__ __m256i __DEFAULT_FN_ATTRS256
99
_mm256_abs_epi8(__m256i __a)
100
{
101
    return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
102
}
103

104
/// Computes the absolute value of each signed 16-bit element in the 256-bit
105
///    vector of [16 x i16] in \a __a and returns each value in the
106
///    corresponding element of the result.
107
///
108
/// \headerfile <immintrin.h>
109
///
110
/// This intrinsic corresponds to the \c VPABSW instruction.
111
///
112
/// \param __a
113
///    A 256-bit vector of [16 x i16].
114
/// \returns A 256-bit vector of [16 x i16] containing the result.
115
static __inline__ __m256i __DEFAULT_FN_ATTRS256
116
_mm256_abs_epi16(__m256i __a)
117
{
118
    return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
119
}
120

121
/// Computes the absolute value of each signed 32-bit element in the 256-bit
122
///    vector of [8 x i32] in \a __a and returns each value in the
123
///    corresponding element of the result.
124
///
125
/// \headerfile <immintrin.h>
126
///
127
/// This intrinsic corresponds to the \c VPABSD instruction.
128
///
129
/// \param __a
130
///    A 256-bit vector of [8 x i32].
131
/// \returns A 256-bit vector of [8 x i32] containing the result.
132
static __inline__ __m256i __DEFAULT_FN_ATTRS256
133
_mm256_abs_epi32(__m256i __a)
134
{
135
    return (__m256i)__builtin_elementwise_abs((__v8si)__a);
136
}
137

138
/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139
///    integers using signed saturation, and returns the 256-bit result.
140
///
141
/// \code{.operation}
142
/// FOR i := 0 TO 7
143
///   j := i*16
144
///   k := i*8
145
///   result[7+k:k] := SATURATE8(__a[15+j:j])
146
///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
147
///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
148
///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
149
/// ENDFOR
150
/// \endcode
151
///
152
/// \headerfile <immintrin.h>
153
///
154
/// This intrinsic corresponds to the \c VPACKSSWB instruction.
155
///
156
/// \param __a
157
///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
158
///    result[191:128].
159
/// \param __b
160
///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
161
///    result[255:192].
162
/// \returns A 256-bit integer vector containing the result.
163
static __inline__ __m256i __DEFAULT_FN_ATTRS256
164
_mm256_packs_epi16(__m256i __a, __m256i __b)
165
{
166
  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
167
}
168

169
/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170
///    integers using signed saturation, and returns the resulting 256-bit
171
///    vector of [16 x i16].
172
///
173
/// \code{.operation}
174
/// FOR i := 0 TO 3
175
///   j := i*32
176
///   k := i*16
177
///   result[15+k:k] := SATURATE16(__a[31+j:j])
178
///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
179
///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
180
///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
181
/// ENDFOR
182
/// \endcode
183
///
184
/// \headerfile <immintrin.h>
185
///
186
/// This intrinsic corresponds to the \c VPACKSSDW instruction.
187
///
188
/// \param __a
189
///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
190
///    result[191:128].
191
/// \param __b
192
///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
193
///    result[255:192].
194
/// \returns A 256-bit vector of [16 x i16] containing the result.
195
static __inline__ __m256i __DEFAULT_FN_ATTRS256
196
_mm256_packs_epi32(__m256i __a, __m256i __b)
197
{
198
  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
199
}
200

201
/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202
///    using unsigned saturation, and returns the 256-bit result.
203
///
204
/// \code{.operation}
205
/// FOR i := 0 TO 7
206
///   j := i*16
207
///   k := i*8
208
///   result[7+k:k] := SATURATE8U(__a[15+j:j])
209
///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
210
///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
211
///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
212
/// ENDFOR
213
/// \endcode
214
///
215
/// \headerfile <immintrin.h>
216
///
217
/// This intrinsic corresponds to the \c VPACKUSWB instruction.
218
///
219
/// \param __a
220
///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
221
///    result[191:128].
222
/// \param __b
223
///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
224
///    result[255:192].
225
/// \returns A 256-bit integer vector containing the result.
226
static __inline__ __m256i __DEFAULT_FN_ATTRS256
227
_mm256_packus_epi16(__m256i __a, __m256i __b)
228
{
229
  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
230
}
231

232
/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233
///    using unsigned saturation, and returns the resulting 256-bit vector of
234
///    [16 x i16].
235
///
236
/// \code{.operation}
237
/// FOR i := 0 TO 3
238
///   j := i*32
239
///   k := i*16
240
///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
241
///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
242
///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
243
///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
244
/// ENDFOR
245
/// \endcode
246
///
247
/// \headerfile <immintrin.h>
248
///
249
/// This intrinsic corresponds to the \c VPACKUSDW instruction.
250
///
251
/// \param __V1
252
///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
253
///    result[191:128].
254
/// \param __V2
255
///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
256
///    result[255:192].
257
/// \returns A 256-bit vector of [16 x i16] containing the result.
258
static __inline__ __m256i __DEFAULT_FN_ATTRS256
259
_mm256_packus_epi32(__m256i __V1, __m256i __V2)
260
{
261
  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
262
}
263

264
/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
265
///    vectors and returns the lower 8 bits of each sum in the corresponding
266
///    byte of the 256-bit integer vector result (overflow is ignored).
267
///
268
/// \headerfile <immintrin.h>
269
///
270
/// This intrinsic corresponds to the \c VPADDB instruction.
271
///
272
/// \param __a
273
///    A 256-bit integer vector containing one of the source operands.
274
/// \param __b
275
///    A 256-bit integer vector containing one of the source operands.
276
/// \returns A 256-bit integer vector containing the sums.
277
static __inline__ __m256i __DEFAULT_FN_ATTRS256
278
_mm256_add_epi8(__m256i __a, __m256i __b)
279
{
280
  return (__m256i)((__v32qu)__a + (__v32qu)__b);
281
}
282

283
/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284
///    [16 x i16] and returns the lower 16 bits of each sum in the
285
///    corresponding element of the [16 x i16] result (overflow is ignored).
286
///
287
/// \headerfile <immintrin.h>
288
///
289
/// This intrinsic corresponds to the \c VPADDW instruction.
290
///
291
/// \param __a
292
///    A 256-bit vector of [16 x i16] containing one of the source operands.
293
/// \param __b
294
///    A 256-bit vector of [16 x i16] containing one of the source operands.
295
/// \returns A 256-bit vector of [16 x i16] containing the sums.
296
static __inline__ __m256i __DEFAULT_FN_ATTRS256
297
_mm256_add_epi16(__m256i __a, __m256i __b)
298
{
299
  return (__m256i)((__v16hu)__a + (__v16hu)__b);
300
}
301

302
/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
303
///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
304
///    element of the [8 x i32] result (overflow is ignored).
305
///
306
/// \headerfile <immintrin.h>
307
///
308
/// This intrinsic corresponds to the \c VPADDD instruction.
309
///
310
/// \param __a
311
///    A 256-bit vector of [8 x i32] containing one of the source operands.
312
/// \param __b
313
///    A 256-bit vector of [8 x i32] containing one of the source operands.
314
/// \returns A 256-bit vector of [8 x i32] containing the sums.
315
static __inline__ __m256i __DEFAULT_FN_ATTRS256
316
_mm256_add_epi32(__m256i __a, __m256i __b)
317
{
318
  return (__m256i)((__v8su)__a + (__v8su)__b);
319
}
320

321
/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
322
///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
323
///    element of the [4 x i64] result (overflow is ignored).
324
///
325
/// \headerfile <immintrin.h>
326
///
327
/// This intrinsic corresponds to the \c VPADDQ instruction.
328
///
329
/// \param __a
330
///    A 256-bit vector of [4 x i64] containing one of the source operands.
331
/// \param __b
332
///    A 256-bit vector of [4 x i64] containing one of the source operands.
333
/// \returns A 256-bit vector of [4 x i64] containing the sums.
334
static __inline__ __m256i __DEFAULT_FN_ATTRS256
335
_mm256_add_epi64(__m256i __a, __m256i __b)
336
{
337
  return (__m256i)((__v4du)__a + (__v4du)__b);
338
}
339

340
/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
341
///    vectors using signed saturation, and returns each sum in the
342
///    corresponding byte of the 256-bit integer vector result.
343
///
344
/// \headerfile <immintrin.h>
345
///
346
/// This intrinsic corresponds to the \c VPADDSB instruction.
347
///
348
/// \param __a
349
///    A 256-bit integer vector containing one of the source operands.
350
/// \param __b
351
///    A 256-bit integer vector containing one of the source operands.
352
/// \returns A 256-bit integer vector containing the sums.
353
static __inline__ __m256i __DEFAULT_FN_ATTRS256
354
_mm256_adds_epi8(__m256i __a, __m256i __b)
355
{
356
  return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
357
}
358

359
/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360
///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
361
///
362
/// \headerfile <immintrin.h>
363
///
364
/// This intrinsic corresponds to the \c VPADDSW instruction.
365
///
366
/// \param __a
367
///    A 256-bit vector of [16 x i16] containing one of the source operands.
368
/// \param __b
369
///    A 256-bit vector of [16 x i16] containing one of the source operands.
370
/// \returns A 256-bit vector of [16 x i16] containing the sums.
371
static __inline__ __m256i __DEFAULT_FN_ATTRS256
372
_mm256_adds_epi16(__m256i __a, __m256i __b)
373
{
374
  return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
375
}
376

377
/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
378
///    vectors using unsigned saturation, and returns each sum in the
379
///    corresponding byte of the 256-bit integer vector result.
380
///
381
/// \headerfile <immintrin.h>
382
///
383
/// This intrinsic corresponds to the \c VPADDUSB instruction.
384
///
385
/// \param __a
386
///    A 256-bit integer vector containing one of the source operands.
387
/// \param __b
388
///    A 256-bit integer vector containing one of the source operands.
389
/// \returns A 256-bit integer vector containing the sums.
390
static __inline__ __m256i __DEFAULT_FN_ATTRS256
391
_mm256_adds_epu8(__m256i __a, __m256i __b)
392
{
393
  return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
394
}
395

396
/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397
///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
398
///
399
/// \headerfile <immintrin.h>
400
///
401
/// This intrinsic corresponds to the \c VPADDUSW instruction.
402
///
403
/// \param __a
404
///    A 256-bit vector of [16 x i16] containing one of the source operands.
405
/// \param __b
406
///    A 256-bit vector of [16 x i16] containing one of the source operands.
407
/// \returns A 256-bit vector of [16 x i16] containing the sums.
408
static __inline__ __m256i __DEFAULT_FN_ATTRS256
409
_mm256_adds_epu16(__m256i __a, __m256i __b)
410
{
411
  return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
412
}
413

414
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
415
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
416
///    as the lower half of the temporary value. Right-shifts the temporary
417
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
418
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
419
///    \a b to make another temporary value, right shifts by \a n, and uses
420
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
421
///    result.
422
///
423
/// \headerfile <immintrin.h>
424
///
425
/// \code
426
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
427
/// \endcode
428
///
429
/// This intrinsic corresponds to the \c VPALIGNR instruction.
430
///
431
/// \param a
432
///    A 256-bit integer vector containing source values.
433
/// \param b
434
///    A 256-bit integer vector containing source values.
435
/// \param n
436
///    An immediate value specifying the number of bytes to shift.
437
/// \returns A 256-bit integer vector containing the result.
438
/* Implemented as a macro; both vectors are passed through the __v32qi view.
   The byte count n is cast to int, matching how the other immediate-operand
   macros in this header forward their immediates. */
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (int)(n)))
441

442
/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
443
///    \a __b.
444
///
445
/// \headerfile <immintrin.h>
446
///
447
/// This intrinsic corresponds to the \c VPAND instruction.
448
///
449
/// \param __a
450
///    A 256-bit integer vector.
451
/// \param __b
452
///    A 256-bit integer vector.
453
/// \returns A 256-bit integer vector containing the result.
454
static __inline__ __m256i __DEFAULT_FN_ATTRS256
455
_mm256_and_si256(__m256i __a, __m256i __b)
456
{
457
  return (__m256i)((__v4du)__a & (__v4du)__b);
458
}
459

460
/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461
///    the bitwise NOT of the 256-bit integer vector in \a __a.
462
///
463
/// \headerfile <immintrin.h>
464
///
465
/// This intrinsic corresponds to the \c VPANDN instruction.
466
///
467
/// \param __a
468
///    A 256-bit integer vector.
469
/// \param __b
470
///    A 256-bit integer vector.
471
/// \returns A 256-bit integer vector containing the result.
472
static __inline__ __m256i __DEFAULT_FN_ATTRS256
473
_mm256_andnot_si256(__m256i __a, __m256i __b)
474
{
475
  return (__m256i)(~(__v4du)__a & (__v4du)__b);
476
}
477

478
/// Computes the averages of the corresponding unsigned bytes in the two
479
///    256-bit integer vectors in \a __a and \a __b and returns each
480
///    average in the corresponding byte of the 256-bit result.
481
///
482
/// \code{.operation}
483
/// FOR i := 0 TO 31
484
///   j := i*8
485
///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
486
/// ENDFOR
487
/// \endcode
488
///
489
/// \headerfile <immintrin.h>
490
///
491
/// This intrinsic corresponds to the \c VPAVGB instruction.
492
///
493
/// \param __a
494
///    A 256-bit integer vector.
495
/// \param __b
496
///    A 256-bit integer vector.
497
/// \returns A 256-bit integer vector containing the result.
498
static __inline__ __m256i __DEFAULT_FN_ATTRS256
499
_mm256_avg_epu8(__m256i __a, __m256i __b)
500
{
501
  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
502
}
503

504
/// Computes the averages of the corresponding unsigned 16-bit integers in
505
///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506
///    each average in the corresponding element of the 256-bit result.
507
///
508
/// \code{.operation}
509
/// FOR i := 0 TO 15
510
///   j := i*16
511
///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
512
/// ENDFOR
513
/// \endcode
514
///
515
/// \headerfile <immintrin.h>
516
///
517
/// This intrinsic corresponds to the \c VPAVGW instruction.
518
///
519
/// \param __a
520
///    A 256-bit vector of [16 x i16].
521
/// \param __b
522
///    A 256-bit vector of [16 x i16].
523
/// \returns A 256-bit vector of [16 x i16] containing the result.
524
static __inline__ __m256i __DEFAULT_FN_ATTRS256
525
_mm256_avg_epu16(__m256i __a, __m256i __b)
526
{
527
  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
528
}
529

530
/// Merges 8-bit integer values from either of the two 256-bit vectors
531
///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532
///    the resulting 256-bit integer vector.
533
///
534
/// \code{.operation}
535
/// FOR i := 0 TO 31
536
///   j := i*8
537
///   IF __M[7+j] == 0
538
///     result[7+j:j] := __V1[7+j:j]
539
///   ELSE
540
///     result[7+j:j] := __V2[7+j:j]
541
///   FI
542
/// ENDFOR
543
/// \endcode
544
///
545
/// \headerfile <immintrin.h>
546
///
547
/// This intrinsic corresponds to the \c VPBLENDVB instruction.
548
///
549
/// \param __V1
550
///    A 256-bit integer vector containing source values.
551
/// \param __V2
552
///    A 256-bit integer vector containing source values.
553
/// \param __M
554
///    A 256-bit integer vector, with bit [7] of each byte specifying the
555
///    source for each corresponding byte of the result. When the mask bit
556
///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
557
///    \a __V2.
558
/// \returns A 256-bit integer vector containing the result.
559
static __inline__ __m256i __DEFAULT_FN_ATTRS256
560
_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
561
{
562
  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
563
                                              (__v32qi)__M);
564
}
565

566
/// Merges 16-bit integer values from either of the two 256-bit vectors
567
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
568
///    and returns the resulting 256-bit vector of [16 x i16].
569
///
570
/// \code{.operation}
571
/// FOR i := 0 TO 7
572
///   j := i*16
573
///   IF M[i] == 0
574
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
579
///   FI
580
/// ENDFOR
581
/// \endcode
582
///
583
/// \headerfile <immintrin.h>
584
///
585
/// \code
586
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
587
/// \endcode
588
///
589
/// This intrinsic corresponds to the \c VPBLENDW instruction.
590
///
591
/// \param V1
592
///    A 256-bit vector of [16 x i16] containing source values.
593
/// \param V2
594
///    A 256-bit vector of [16 x i16] containing source values.
595
/// \param M
596
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
597
///    source for each element of the result. The position of the mask bit
598
///    corresponds to the index of a copied value. When a mask bit is 0, the
599
///    element is copied from \a V1; otherwise, it is copied from \a V2.
600
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
601
///    elements 1 and 9, and so forth.
602
/// \returns A 256-bit vector of [16 x i16] containing the result.
603
/* Implemented as a macro; both vectors are passed through the __v16hi view
   and the selector M is cast to int before reaching the builtin. */
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))
606

607
/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
608
///    \a __b for equality and returns the outcomes in the corresponding
609
///    bytes of the 256-bit result.
610
///
611
/// \code{.operation}
612
/// FOR i := 0 TO 31
613
///   j := i*8
614
///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
615
/// ENDFOR
616
/// \endcode
617
///
618
/// \headerfile <immintrin.h>
619
///
620
/// This intrinsic corresponds to the \c VPCMPEQB instruction.
621
///
622
/// \param __a
623
///    A 256-bit integer vector containing one of the inputs.
624
/// \param __b
625
///    A 256-bit integer vector containing one of the inputs.
626
/// \returns A 256-bit integer vector containing the result.
627
static __inline__ __m256i __DEFAULT_FN_ATTRS256
628
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
629
{
630
  return (__m256i)((__v32qi)__a == (__v32qi)__b);
631
}
632

633
/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
634
///    \a __a and \a __b for equality and returns the outcomes in the
635
///    corresponding elements of the 256-bit result.
636
///
637
/// \code{.operation}
638
/// FOR i := 0 TO 15
639
///   j := i*16
640
///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
641
/// ENDFOR
642
/// \endcode
643
///
644
/// \headerfile <immintrin.h>
645
///
646
/// This intrinsic corresponds to the \c VPCMPEQW instruction.
647
///
648
/// \param __a
649
///    A 256-bit vector of [16 x i16] containing one of the inputs.
650
/// \param __b
651
///    A 256-bit vector of [16 x i16] containing one of the inputs.
652
/// \returns A 256-bit vector of [16 x i16] containing the result.
653
static __inline__ __m256i __DEFAULT_FN_ATTRS256
654
_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
655
{
656
  return (__m256i)((__v16hi)__a == (__v16hi)__b);
657
}
658

659
/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
660
///    \a __a and \a __b for equality and returns the outcomes in the
661
///    corresponding elements of the 256-bit result.
662
///
663
/// \code{.operation}
664
/// FOR i := 0 TO 7
665
///   j := i*32
666
///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
667
/// ENDFOR
668
/// \endcode
669
///
670
/// \headerfile <immintrin.h>
671
///
672
/// This intrinsic corresponds to the \c VPCMPEQD instruction.
673
///
674
/// \param __a
675
///    A 256-bit vector of [8 x i32] containing one of the inputs.
676
/// \param __b
677
///    A 256-bit vector of [8 x i32] containing one of the inputs.
678
/// \returns A 256-bit vector of [8 x i32] containing the result.
679
static __inline__ __m256i __DEFAULT_FN_ATTRS256
680
_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
681
{
682
  return (__m256i)((__v8si)__a == (__v8si)__b);
683
}
684

685
/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
686
///    \a __a and \a __b for equality and returns the outcomes in the
687
///    corresponding elements of the 256-bit result.
688
///
689
/// \code{.operation}
690
/// FOR i := 0 TO 3
691
///   j := i*64
692
///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
693
/// ENDFOR
694
/// \endcode
695
///
696
/// \headerfile <immintrin.h>
697
///
698
/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
699
///
700
/// \param __a
701
///    A 256-bit vector of [4 x i64] containing one of the inputs.
702
/// \param __b
703
///    A 256-bit vector of [4 x i64] containing one of the inputs.
704
/// \returns A 256-bit vector of [4 x i64] containing the result.
705
static __inline__ __m256i __DEFAULT_FN_ATTRS256
706
_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
707
{
708
  return (__m256i)((__v4di)__a == (__v4di)__b);
709
}
710

711
/// Compares corresponding signed bytes in the 256-bit integer vectors in
712
///    \a __a and \a __b for greater-than and returns the outcomes in the
713
///    corresponding bytes of the 256-bit result.
714
///
715
/// \code{.operation}
716
/// FOR i := 0 TO 31
717
///   j := i*8
718
///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
719
/// ENDFOR
720
/// \endcode
721
///
722
/// \headerfile <immintrin.h>
723
///
724
/// This intrinsic corresponds to the \c VPCMPGTB instruction.
725
///
726
/// \param __a
727
///    A 256-bit integer vector containing one of the inputs.
728
/// \param __b
729
///    A 256-bit integer vector containing one of the inputs.
730
/// \returns A 256-bit integer vector containing the result.
731
static __inline__ __m256i __DEFAULT_FN_ATTRS256
732
_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
733
{
734
  /* This function always performs a signed comparison, but __v32qi is a char
735
     which may be signed or unsigned, so use __v32qs. */
736
  return (__m256i)((__v32qs)__a > (__v32qs)__b);
737
}
738

739
/// Compares corresponding signed elements in the 256-bit vectors of
740
///    [16 x i16] in \a __a and \a __b for greater-than and returns the
741
///    outcomes in the corresponding elements of the 256-bit result.
742
///
743
/// \code{.operation}
744
/// FOR i := 0 TO 15
745
///   j := i*16
746
///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
747
/// ENDFOR
748
/// \endcode
749
///
750
/// \headerfile <immintrin.h>
751
///
752
/// This intrinsic corresponds to the \c VPCMPGTW instruction.
753
///
754
/// \param __a
755
///    A 256-bit vector of [16 x i16] containing one of the inputs.
756
/// \param __b
757
///    A 256-bit vector of [16 x i16] containing one of the inputs.
758
/// \returns A 256-bit vector of [16 x i16] containing the result.
759
static __inline__ __m256i __DEFAULT_FN_ATTRS256
760
_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
761
{
762
  return (__m256i)((__v16hi)__a > (__v16hi)__b);
763
}
764

765
/// Compares corresponding signed elements in the 256-bit vectors of
766
///    [8 x i32] in \a __a and \a __b for greater-than and returns the
767
///    outcomes in the corresponding elements of the 256-bit result.
768
///
769
/// \code{.operation}
770
/// FOR i := 0 TO 7
771
///   j := i*32
772
///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
773
/// ENDFOR
774
/// \endcode
775
///
776
/// \headerfile <immintrin.h>
777
///
778
/// This intrinsic corresponds to the \c VPCMPGTD instruction.
779
///
780
/// \param __a
781
///    A 256-bit vector of [8 x i32] containing one of the inputs.
782
/// \param __b
783
///    A 256-bit vector of [8 x i32] containing one of the inputs.
784
/// \returns A 256-bit vector of [8 x i32] containing the result.
785
static __inline__ __m256i __DEFAULT_FN_ATTRS256
786
_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
787
{
788
  return (__m256i)((__v8si)__a > (__v8si)__b);
789
}
790

791
/// Compares corresponding signed elements in the 256-bit vectors of
792
///    [4 x i64] in \a __a and \a __b for greater-than and returns the
793
///    outcomes in the corresponding elements of the 256-bit result.
794
///
795
/// \code{.operation}
796
/// FOR i := 0 TO 3
797
///   j := i*64
798
///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
799
/// ENDFOR
800
/// \endcode
801
///
802
/// \headerfile <immintrin.h>
803
///
804
/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
805
///
806
/// \param __a
807
///    A 256-bit vector of [4 x i64] containing one of the inputs.
808
/// \param __b
809
///    A 256-bit vector of [4 x i64] containing one of the inputs.
810
/// \returns A 256-bit vector of [4 x i64] containing the result.
811
static __inline__ __m256i __DEFAULT_FN_ATTRS256
812
_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
813
{
814
  return (__m256i)((__v4di)__a > (__v4di)__b);
815
}
816

817
/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818
///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819
///    element of the [16 x i16] result (overflow is ignored). Sums from
820
///    \a __a are returned in the lower 64 bits of each 128-bit half of the
821
///    result; sums from \a __b are returned in the upper 64 bits of each
822
///    128-bit half of the result.
823
///
824
/// \code{.operation}
825
/// FOR i := 0 TO 1
826
///   j := i*128
827
///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828
///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
829
///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
830
///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
831
///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
832
///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
833
///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
834
///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
835
/// ENDFOR
836
/// \endcode
837
///
838
/// \headerfile <immintrin.h>
839
///
840
/// This intrinsic corresponds to the \c VPHADDW instruction.
841
///
842
/// \param __a
843
///    A 256-bit vector of [16 x i16] containing one of the source operands.
844
/// \param __b
845
///    A 256-bit vector of [16 x i16] containing one of the source operands.
846
/// \returns A 256-bit vector of [16 x i16] containing the sums.
847
static __inline__ __m256i __DEFAULT_FN_ATTRS256
848
_mm256_hadd_epi16(__m256i __a, __m256i __b)
849
{
850
    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
851
}
852

853
/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
854
///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
855
///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
856
///    are returned in the lower 64 bits of each 128-bit half of the result;
857
///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
858
///    of the result.
859
///
860
/// \code{.operation}
861
/// FOR i := 0 TO 1
862
///   j := i*128
863
///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
864
///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
865
///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
866
///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
867
/// ENDFOR
868
/// \endcode
869
///
870
/// \headerfile <immintrin.h>
871
///
872
/// This intrinsic corresponds to the \c VPHADDD instruction.
873
///
874
/// \param __a
875
///    A 256-bit vector of [8 x i32] containing one of the source operands.
876
/// \param __b
877
///    A 256-bit vector of [8 x i32] containing one of the source operands.
878
/// \returns A 256-bit vector of [8 x i32] containing the sums.
879
static __inline__ __m256i __DEFAULT_FN_ATTRS256
880
_mm256_hadd_epi32(__m256i __a, __m256i __b)
881
{
882
    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
883
}
884

885
/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886
///    vectors of [16 x i16] using signed saturation and returns each sum in
887
///    an element of the [16 x i16] result. Sums from \a __a are returned in
888
///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
889
///    are returned in the upper 64 bits of each 128-bit half of the result.
890
///
891
/// \code{.operation}
892
/// FOR i := 0 TO 1
893
///   j := i*128
894
///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895
///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
896
///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
897
///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
898
///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
899
///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
900
///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
901
///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
902
/// ENDFOR
903
/// \endcode
904
///
905
/// \headerfile <immintrin.h>
906
///
907
/// This intrinsic corresponds to the \c VPHADDSW instruction.
908
///
909
/// \param __a
910
///    A 256-bit vector of [16 x i16] containing one of the source operands.
911
/// \param __b
912
///    A 256-bit vector of [16 x i16] containing one of the source operands.
913
/// \returns A 256-bit vector of [16 x i16] containing the sums.
914
static __inline__ __m256i __DEFAULT_FN_ATTRS256
915
_mm256_hadds_epi16(__m256i __a, __m256i __b)
916
{
917
    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
918
}
919

920
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921
///    vectors of [16 x i16] and returns the lower 16 bits of each difference
922
///    in an element of the [16 x i16] result (overflow is ignored).
923
///    Differences from \a __a are returned in the lower 64 bits of each
924
///    128-bit half of the result; differences from \a __b are returned in the
925
///    upper 64 bits of each 128-bit half of the result.
926
///
927
/// \code{.operation}
928
/// FOR i := 0 TO 1
929
///   j := i*128
930
///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931
///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932
///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933
///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934
///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935
///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936
///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937
///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
938
/// ENDFOR
939
/// \endcode
940
///
941
/// \headerfile <immintrin.h>
942
///
943
/// This intrinsic corresponds to the \c VPHSUBW instruction.
944
///
945
/// \param __a
946
///    A 256-bit vector of [16 x i16] containing one of the source operands.
947
/// \param __b
948
///    A 256-bit vector of [16 x i16] containing one of the source operands.
949
/// \returns A 256-bit vector of [16 x i16] containing the differences.
950
static __inline__ __m256i __DEFAULT_FN_ATTRS256
951
_mm256_hsub_epi16(__m256i __a, __m256i __b)
952
{
953
    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
954
}
955

956
/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
957
///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
958
///    an element of the [8 x i32] result (overflow is ignored). Differences
959
///    from \a __a are returned in the lower 64 bits of each 128-bit half of
960
///    the result; differences from \a __b are returned in the upper 64 bits
961
///    of each 128-bit half of the result.
962
///
963
/// \code{.operation}
964
/// FOR i := 0 TO 1
965
///   j := i*128
966
///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967
///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968
///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969
///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
970
/// ENDFOR
971
/// \endcode
972
///
973
/// \headerfile <immintrin.h>
974
///
975
/// This intrinsic corresponds to the \c VPHSUBD instruction.
976
///
977
/// \param __a
978
///    A 256-bit vector of [8 x i32] containing one of the source operands.
979
/// \param __b
980
///    A 256-bit vector of [8 x i32] containing one of the source operands.
981
/// \returns A 256-bit vector of [8 x i32] containing the differences.
982
static __inline__ __m256i __DEFAULT_FN_ATTRS256
983
_mm256_hsub_epi32(__m256i __a, __m256i __b)
984
{
985
    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
986
}
987

988
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
989
///    vectors of [16 x i16] using signed saturation and returns each sum in
990
///    an element of the [16 x i16] result. Differences from \a __a are
991
///    returned in the lower 64 bits of each 128-bit half of the result;
992
///    differences from \a __b are returned in the upper 64 bits of each
993
///    128-bit half of the result.
994
///
995
/// \code{.operation}
996
/// FOR i := 0 TO 1
997
///   j := i*128
998
///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999
///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000
///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001
///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002
///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003
///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004
///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005
///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1006
/// ENDFOR
1007
/// \endcode
1008
///
1009
/// \headerfile <immintrin.h>
1010
///
1011
/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1012
///
1013
/// \param __a
1014
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1015
/// \param __b
1016
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1017
/// \returns A 256-bit vector of [16 x i16] containing the differences.
1018
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019
_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1020
{
1021
    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1022
}
1023

1024
/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025
///    with the corresponding signed byte from the 256-bit integer vector in
1026
///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027
///    pairs of those products using signed saturation to form 16-bit sums
1028
///    returned as elements of the [16 x i16] result.
1029
///
1030
/// \code{.operation}
1031
/// FOR i := 0 TO 15
1032
///   j := i*16
1033
///   temp1 := __a[j+7:j] * __b[j+7:j]
1034
///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035
///   result[j+15:j] := SATURATE16(temp1 + temp2)
1036
/// ENDFOR
1037
/// \endcode
1038
///
1039
/// \headerfile <immintrin.h>
1040
///
1041
/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1042
///
1043
/// \param __a
1044
///    A 256-bit vector containing one of the source operands.
1045
/// \param __b
1046
///    A 256-bit vector containing one of the source operands.
1047
/// \returns A 256-bit vector of [16 x i16] containing the result.
1048
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1049
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
1050
{
1051
    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1052
}
1053

1054
/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055
///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056
///    those products to form 32-bit sums returned as elements of the
1057
///    [8 x i32] result.
1058
///
1059
///    There is only one wraparound case: when all four of the 16-bit sources
1060
///    are \c 0x8000, the result will be \c 0x80000000.
1061
///
1062
/// \code{.operation}
1063
/// FOR i := 0 TO 7
1064
///   j := i*32
1065
///   temp1 := __a[j+15:j] * __b[j+15:j]
1066
///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067
///   result[j+31:j] := temp1 + temp2
1068
/// ENDFOR
1069
/// \endcode
1070
///
1071
/// \headerfile <immintrin.h>
1072
///
1073
/// This intrinsic corresponds to the \c VPMADDWD instruction.
1074
///
1075
/// \param __a
1076
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1077
/// \param __b
1078
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1079
/// \returns A 256-bit vector of [8 x i32] containing the result.
1080
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081
_mm256_madd_epi16(__m256i __a, __m256i __b)
1082
{
1083
  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1084
}
1085

1086
/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087
///     in \a __a and \a __b and returns the larger of each pair in the
1088
///     corresponding byte of the 256-bit result.
1089
///
1090
/// \headerfile <immintrin.h>
1091
///
1092
/// This intrinsic corresponds to the \c VPMAXSB instruction.
1093
///
1094
/// \param __a
1095
///    A 256-bit integer vector.
1096
/// \param __b
1097
///    A 256-bit integer vector.
1098
/// \returns A 256-bit integer vector containing the result.
1099
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100
_mm256_max_epi8(__m256i __a, __m256i __b)
1101
{
1102
  return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1103
}
1104

1105
/// Compares the corresponding signed 16-bit integers in the two 256-bit
1106
///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107
///    each pair in the corresponding element of the 256-bit result.
1108
///
1109
/// \headerfile <immintrin.h>
1110
///
1111
/// This intrinsic corresponds to the \c VPMAXSW instruction.
1112
///
1113
/// \param __a
1114
///    A 256-bit vector of [16 x i16].
1115
/// \param __b
1116
///    A 256-bit vector of [16 x i16].
1117
/// \returns A 256-bit vector of [16 x i16] containing the result.
1118
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119
_mm256_max_epi16(__m256i __a, __m256i __b)
1120
{
1121
  return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1122
}
1123

1124
/// Compares the corresponding signed 32-bit integers in the two 256-bit
1125
///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126
///    each pair in the corresponding element of the 256-bit result.
1127
///
1128
/// \headerfile <immintrin.h>
1129
///
1130
/// This intrinsic corresponds to the \c VPMAXSD instruction.
1131
///
1132
/// \param __a
1133
///    A 256-bit vector of [8 x i32].
1134
/// \param __b
1135
///    A 256-bit vector of [8 x i32].
1136
/// \returns A 256-bit vector of [8 x i32] containing the result.
1137
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138
_mm256_max_epi32(__m256i __a, __m256i __b)
1139
{
1140
  return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1141
}
1142

1143
/// Compares the corresponding unsigned bytes in the two 256-bit integer
1144
///     vectors in \a __a and \a __b and returns the larger of each pair in
1145
///     the corresponding byte of the 256-bit result.
1146
///
1147
/// \headerfile <immintrin.h>
1148
///
1149
/// This intrinsic corresponds to the \c VPMAXUB instruction.
1150
///
1151
/// \param __a
1152
///    A 256-bit integer vector.
1153
/// \param __b
1154
///    A 256-bit integer vector.
1155
/// \returns A 256-bit integer vector containing the result.
1156
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157
_mm256_max_epu8(__m256i __a, __m256i __b)
1158
{
1159
  return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1160
}
1161

1162
/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163
///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164
///    each pair in the corresponding element of the 256-bit result.
1165
///
1166
/// \headerfile <immintrin.h>
1167
///
1168
/// This intrinsic corresponds to the \c VPMAXUW instruction.
1169
///
1170
/// \param __a
1171
///    A 256-bit vector of [16 x i16].
1172
/// \param __b
1173
///    A 256-bit vector of [16 x i16].
1174
/// \returns A 256-bit vector of [16 x i16] containing the result.
1175
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176
_mm256_max_epu16(__m256i __a, __m256i __b)
1177
{
1178
  return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1179
}
1180

1181
/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182
///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183
///    each pair in the corresponding element of the 256-bit result.
1184
///
1185
/// \headerfile <immintrin.h>
1186
///
1187
/// This intrinsic corresponds to the \c VPMAXUD instruction.
1188
///
1189
/// \param __a
1190
///    A 256-bit vector of [8 x i32].
1191
/// \param __b
1192
///    A 256-bit vector of [8 x i32].
1193
/// \returns A 256-bit vector of [8 x i32] containing the result.
1194
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195
_mm256_max_epu32(__m256i __a, __m256i __b)
1196
{
1197
  return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1198
}
1199

1200
/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201
///     in \a __a and \a __b and returns the smaller of each pair in the
1202
///     corresponding byte of the 256-bit result.
1203
///
1204
/// \headerfile <immintrin.h>
1205
///
1206
/// This intrinsic corresponds to the \c VPMINSB instruction.
1207
///
1208
/// \param __a
1209
///    A 256-bit integer vector.
1210
/// \param __b
1211
///    A 256-bit integer vector.
1212
/// \returns A 256-bit integer vector containing the result.
1213
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214
_mm256_min_epi8(__m256i __a, __m256i __b)
1215
{
1216
  return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1217
}
1218

1219
/// Compares the corresponding signed 16-bit integers in the two 256-bit
1220
///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221
///    each pair in the corresponding element of the 256-bit result.
1222
///
1223
/// \headerfile <immintrin.h>
1224
///
1225
/// This intrinsic corresponds to the \c VPMINSW instruction.
1226
///
1227
/// \param __a
1228
///    A 256-bit vector of [16 x i16].
1229
/// \param __b
1230
///    A 256-bit vector of [16 x i16].
1231
/// \returns A 256-bit vector of [16 x i16] containing the result.
1232
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233
_mm256_min_epi16(__m256i __a, __m256i __b)
1234
{
1235
  return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1236
}
1237

1238
/// Compares the corresponding signed 32-bit integers in the two 256-bit
1239
///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240
///    each pair in the corresponding element of the 256-bit result.
1241
///
1242
/// \headerfile <immintrin.h>
1243
///
1244
/// This intrinsic corresponds to the \c VPMINSD instruction.
1245
///
1246
/// \param __a
1247
///    A 256-bit vector of [8 x i32].
1248
/// \param __b
1249
///    A 256-bit vector of [8 x i32].
1250
/// \returns A 256-bit vector of [8 x i32] containing the result.
1251
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252
_mm256_min_epi32(__m256i __a, __m256i __b)
1253
{
1254
  return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1255
}
1256

1257
/// Compares the corresponding unsigned bytes in the two 256-bit integer
1258
///     vectors in \a __a and \a __b and returns the smaller of each pair in
1259
///     the corresponding byte of the 256-bit result.
1260
///
1261
/// \headerfile <immintrin.h>
1262
///
1263
/// This intrinsic corresponds to the \c VPMINUB instruction.
1264
///
1265
/// \param __a
1266
///    A 256-bit integer vector.
1267
/// \param __b
1268
///    A 256-bit integer vector.
1269
/// \returns A 256-bit integer vector containing the result.
1270
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271
_mm256_min_epu8(__m256i __a, __m256i __b)
1272
{
1273
  return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1274
}
1275

1276
/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277
///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278
///    each pair in the corresponding element of the 256-bit result.
1279
///
1280
/// \headerfile <immintrin.h>
1281
///
1282
/// This intrinsic corresponds to the \c VPMINUW instruction.
1283
///
1284
/// \param __a
1285
///    A 256-bit vector of [16 x i16].
1286
/// \param __b
1287
///    A 256-bit vector of [16 x i16].
1288
/// \returns A 256-bit vector of [16 x i16] containing the result.
1289
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290
_mm256_min_epu16(__m256i __a, __m256i __b)
1291
{
1292
  return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1293
}
1294

1295
/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296
///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297
///    each pair in the corresponding element of the 256-bit result.
1298
///
1299
/// \headerfile <immintrin.h>
1300
///
1301
/// This intrinsic corresponds to the \c VPMINUD instruction.
1302
///
1303
/// \param __a
1304
///    A 256-bit vector of [8 x i32].
1305
/// \param __b
1306
///    A 256-bit vector of [8 x i32].
1307
/// \returns A 256-bit vector of [8 x i32] containing the result.
1308
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309
_mm256_min_epu32(__m256i __a, __m256i __b)
1310
{
1311
  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1312
}
1313

1314
/// Creates a 32-bit integer mask from the most significant bit of each byte
1315
///    in the 256-bit integer vector in \a __a and returns the result.
1316
///
1317
/// \code{.operation}
1318
/// FOR i := 0 TO 31
1319
///   j := i*8
1320
///   result[i] := __a[j+7]
1321
/// ENDFOR
1322
/// \endcode
1323
///
1324
/// \headerfile <immintrin.h>
1325
///
1326
/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1327
///
1328
/// \param __a
1329
///    A 256-bit integer vector containing the source bytes.
1330
/// \returns The 32-bit integer mask.
1331
static __inline__ int __DEFAULT_FN_ATTRS256
1332
_mm256_movemask_epi8(__m256i __a)
1333
{
1334
  return __builtin_ia32_pmovmskb256((__v32qi)__a);
1335
}
1336

1337
/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338
///    the 16-bit values in the corresponding elements of a 256-bit vector
1339
///    of [16 x i16].
1340
///
1341
/// \code{.operation}
1342
/// FOR i := 0 TO 15
1343
///   j := i*8
1344
///   k := i*16
1345
///   result[k+15:k] := SignExtend(__V[j+7:j])
1346
/// ENDFOR
1347
/// \endcode
1348
///
1349
/// \headerfile <immintrin.h>
1350
///
1351
/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1352
///
1353
/// \param __V
1354
///    A 128-bit integer vector containing the source bytes.
1355
/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1356
///    values.
1357
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1358
_mm256_cvtepi8_epi16(__m128i __V)
1359
{
1360
  /* This function always performs a signed extension, but __v16qi is a char
1361
     which may be signed or unsigned, so use __v16qs. */
1362
  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1363
}
1364

1365
/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366
///    \a __V and returns the 32-bit values in the corresponding elements of a
1367
///    256-bit vector of [8 x i32].
1368
///
1369
/// \code{.operation}
1370
/// FOR i := 0 TO 7
1371
///   j := i*8
1372
///   k := i*32
1373
///   result[k+31:k] := SignExtend(__V[j+7:j])
1374
/// ENDFOR
1375
/// \endcode
1376
///
1377
/// \headerfile <immintrin.h>
1378
///
1379
/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1380
///
1381
/// \param __V
1382
///    A 128-bit integer vector containing the source bytes.
1383
/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1384
///    values.
1385
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1386
_mm256_cvtepi8_epi32(__m128i __V)
1387
{
1388
  /* This function always performs a signed extension, but __v16qi is a char
1389
     which may be signed or unsigned, so use __v16qs. */
1390
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1391
}
1392

1393
/// Sign-extends the first four bytes from the 128-bit integer vector in
1394
///    \a __V and returns the 64-bit values in the corresponding elements of a
1395
///    256-bit vector of [4 x i64].
1396
///
1397
/// \code{.operation}
1398
/// result[63:0] := SignExtend(__V[7:0])
1399
/// result[127:64] := SignExtend(__V[15:8])
1400
/// result[191:128] := SignExtend(__V[23:16])
1401
/// result[255:192] := SignExtend(__V[31:24])
1402
/// \endcode
1403
///
1404
/// \headerfile <immintrin.h>
1405
///
1406
/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1407
///
1408
/// \param __V
1409
///    A 128-bit integer vector containing the source bytes.
1410
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1411
///    values.
1412
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1413
_mm256_cvtepi8_epi64(__m128i __V)
1414
{
1415
  /* This function always performs a signed extension, but __v16qi is a char
1416
     which may be signed or unsigned, so use __v16qs. */
1417
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1418
}
1419

1420
/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421
///    \a __V and returns the 32-bit values in the corresponding elements of a
1422
///    256-bit vector of [8 x i32].
1423
///
1424
/// \code{.operation}
1425
/// FOR i := 0 TO 7
1426
///   j := i*16
1427
///   k := i*32
1428
///   result[k+31:k] := SignExtend(__V[j+15:j])
1429
/// ENDFOR
1430
/// \endcode
1431
///
1432
/// \headerfile <immintrin.h>
1433
///
1434
/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1435
///
1436
/// \param __V
1437
///    A 128-bit vector of [8 x i16] containing the source values.
1438
/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1439
///    values.
1440
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1441
_mm256_cvtepi16_epi32(__m128i __V)
1442
{
1443
  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1444
}
1445

1446
/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447
///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448
///    elements of a 256-bit vector of [4 x i64].
1449
///
1450
/// \code{.operation}
1451
/// result[63:0] := SignExtend(__V[15:0])
1452
/// result[127:64] := SignExtend(__V[31:16])
1453
/// result[191:128] := SignExtend(__V[47:32])
1454
/// result[255:192] := SignExtend(__V[64:48])
1455
/// \endcode
1456
///
1457
/// \headerfile <immintrin.h>
1458
///
1459
/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1460
///
1461
/// \param __V
1462
///    A 128-bit vector of [8 x i16] containing the source values.
1463
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1464
///    values.
1465
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1466
_mm256_cvtepi16_epi64(__m128i __V)
1467
{
1468
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1469
}
1470

1471
/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472
///    \a __V and returns the 64-bit values in the corresponding elements of a
1473
///    256-bit vector of [4 x i64].
1474
///
1475
/// \code{.operation}
1476
/// result[63:0] := SignExtend(__V[31:0])
1477
/// result[127:64] := SignExtend(__V[63:32])
1478
/// result[191:128] := SignExtend(__V[95:64])
1479
/// result[255:192] := SignExtend(__V[127:96])
1480
/// \endcode
1481
///
1482
/// \headerfile <immintrin.h>
1483
///
1484
/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1485
///
1486
/// \param __V
1487
///    A 128-bit vector of [4 x i32] containing the source values.
1488
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1489
///    values.
1490
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1491
_mm256_cvtepi32_epi64(__m128i __V)
1492
{
1493
  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1494
}
1495

1496
/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497
///    the 16-bit values in the corresponding elements of a 256-bit vector
1498
///    of [16 x i16].
1499
///
1500
/// \code{.operation}
1501
/// FOR i := 0 TO 15
1502
///   j := i*8
1503
///   k := i*16
1504
///   result[k+15:k] := ZeroExtend(__V[j+7:j])
1505
/// ENDFOR
1506
/// \endcode
1507
///
1508
/// \headerfile <immintrin.h>
1509
///
1510
/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1511
///
1512
/// \param __V
1513
///    A 128-bit integer vector containing the source bytes.
1514
/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1515
///    values.
1516
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1517
_mm256_cvtepu8_epi16(__m128i __V)
1518
{
1519
  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1520
}
1521

1522
/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523
///    \a __V and returns the 32-bit values in the corresponding elements of a
1524
///    256-bit vector of [8 x i32].
1525
///
1526
/// \code{.operation}
1527
/// FOR i := 0 TO 7
1528
///   j := i*8
1529
///   k := i*32
1530
///   result[k+31:k] := ZeroExtend(__V[j+7:j])
1531
/// ENDFOR
1532
/// \endcode
1533
///
1534
/// \headerfile <immintrin.h>
1535
///
1536
/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1537
///
1538
/// \param __V
1539
///    A 128-bit integer vector containing the source bytes.
1540
/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1541
///    values.
1542
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1543
_mm256_cvtepu8_epi32(__m128i __V)
1544
{
1545
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1546
}
1547

1548
/// Zero-extends the first four bytes from the 128-bit integer vector in
1549
///    \a __V and returns the 64-bit values in the corresponding elements of a
1550
///    256-bit vector of [4 x i64].
1551
///
1552
/// \code{.operation}
1553
/// result[63:0] := ZeroExtend(__V[7:0])
1554
/// result[127:64] := ZeroExtend(__V[15:8])
1555
/// result[191:128] := ZeroExtend(__V[23:16])
1556
/// result[255:192] := ZeroExtend(__V[31:24])
1557
/// \endcode
1558
///
1559
/// \headerfile <immintrin.h>
1560
///
1561
/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1562
///
1563
/// \param __V
1564
///    A 128-bit integer vector containing the source bytes.
1565
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1566
///    values.
1567
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1568
_mm256_cvtepu8_epi64(__m128i __V)
1569
{
1570
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1571
}
1572

1573
/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574
///    \a __V and returns the 32-bit values in the corresponding elements of a
1575
///    256-bit vector of [8 x i32].
1576
///
1577
/// \code{.operation}
1578
/// FOR i := 0 TO 7
1579
///   j := i*16
1580
///   k := i*32
1581
///   result[k+31:k] := ZeroExtend(__V[j+15:j])
1582
/// ENDFOR
1583
/// \endcode
1584
///
1585
/// \headerfile <immintrin.h>
1586
///
1587
/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1588
///
1589
/// \param __V
1590
///    A 128-bit vector of [8 x i16] containing the source values.
1591
/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1592
///    values.
1593
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1594
_mm256_cvtepu16_epi32(__m128i __V)
1595
{
1596
  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1597
}
1598

1599
/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600
///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601
///    elements of a 256-bit vector of [4 x i64].
1602
///
1603
/// \code{.operation}
1604
/// result[63:0] := ZeroExtend(__V[15:0])
1605
/// result[127:64] := ZeroExtend(__V[31:16])
1606
/// result[191:128] := ZeroExtend(__V[47:32])
1607
/// result[255:192] := ZeroExtend(__V[64:48])
1608
/// \endcode
1609
///
1610
/// \headerfile <immintrin.h>
1611
///
1612
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1613
///
1614
/// \param __V
1615
///    A 128-bit vector of [8 x i16] containing the source values.
1616
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617
///    values.
1618
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1619
_mm256_cvtepu16_epi64(__m128i __V)
1620
{
1621
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1622
}
1623

1624
/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625
///    \a __V and returns the 64-bit values in the corresponding elements of a
1626
///    256-bit vector of [4 x i64].
1627
///
1628
/// \code{.operation}
1629
/// result[63:0] := ZeroExtend(__V[31:0])
1630
/// result[127:64] := ZeroExtend(__V[63:32])
1631
/// result[191:128] := ZeroExtend(__V[95:64])
1632
/// result[255:192] := ZeroExtend(__V[127:96])
1633
/// \endcode
1634
///
1635
/// \headerfile <immintrin.h>
1636
///
1637
/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1638
///
1639
/// \param __V
1640
///    A 128-bit vector of [4 x i32] containing the source values.
1641
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1642
///    values.
1643
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1644
_mm256_cvtepu32_epi64(__m128i __V)
1645
{
1646
  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1647
}
1648

1649
/// Multiplies signed 32-bit integers from even-numbered elements of two
1650
///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651
///    [4 x i64] result.
1652
///
1653
/// \code{.operation}
1654
/// result[63:0] := __a[31:0] * __b[31:0]
1655
/// result[127:64] := __a[95:64] * __b[95:64]
1656
/// result[191:128] := __a[159:128] * __b[159:128]
1657
/// result[255:192] := __a[223:192] * __b[223:192]
1658
/// \endcode
1659
///
1660
/// \headerfile <immintrin.h>
1661
///
1662
/// This intrinsic corresponds to the \c VPMULDQ instruction.
1663
///
1664
/// \param __a
1665
///    A 256-bit vector of [8 x i32] containing one of the source operands.
1666
/// \param __b
1667
///    A 256-bit vector of [8 x i32] containing one of the source operands.
1668
/// \returns A 256-bit vector of [4 x i64] containing the products.
1669
static __inline__  __m256i __DEFAULT_FN_ATTRS256
1670
_mm256_mul_epi32(__m256i __a, __m256i __b)
1671
{
1672
  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1673
}
1674

1675
/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676
///    [16 x i16], truncates the 32-bit results to the most significant 18
1677
///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678
///    product in the [16 x i16] result.
1679
///
1680
/// \code{.operation}
1681
/// FOR i := 0 TO 15
1682
///   j := i*16
1683
///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1684
///   result[j+15:j] := temp[16:1]
/// ENDFOR
1685
/// \endcode
1686
///
1687
/// \headerfile <immintrin.h>
1688
///
1689
/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1690
///
1691
/// \param __a
1692
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1693
/// \param __b
1694
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1695
/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697
_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1698
{
1699
  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1700
}
1701

1702
/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703
///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704
///    [16 x i16] result.
1705
///
1706
/// \headerfile <immintrin.h>
1707
///
1708
/// This intrinsic corresponds to the \c VPMULHUW instruction.
1709
///
1710
/// \param __a
1711
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1712
/// \param __b
1713
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1714
/// \returns A 256-bit vector of [16 x i16] containing the products.
1715
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716
_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1717
{
1718
  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1719
}
1720

1721
/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722
///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723
///    [16 x i16] result.
1724
///
1725
/// \headerfile <immintrin.h>
1726
///
1727
/// This intrinsic corresponds to the \c VPMULHW instruction.
1728
///
1729
/// \param __a
1730
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1731
/// \param __b
1732
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1733
/// \returns A 256-bit vector of [16 x i16] containing the products.
1734
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735
_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1736
{
1737
  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1738
}
1739

1740
/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741
///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742
///    [16 x i16] result.
1743
///
1744
/// \headerfile <immintrin.h>
1745
///
1746
/// This intrinsic corresponds to the \c VPMULLW instruction.
1747
///
1748
/// \param __a
1749
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1750
/// \param __b
1751
///    A 256-bit vector of [16 x i16] containing one of the source operands.
1752
/// \returns A 256-bit vector of [16 x i16] containing the products.
1753
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754
_mm256_mullo_epi16(__m256i __a, __m256i __b)
1755
{
1756
  return (__m256i)((__v16hu)__a * (__v16hu)__b);
1757
}
1758

1759
/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760
///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761
///    [8 x i32] result.
1762
///
1763
/// \headerfile <immintrin.h>
1764
///
1765
/// This intrinsic corresponds to the \c VPMULLD instruction.
1766
///
1767
/// \param __a
1768
///    A 256-bit vector of [8 x i32] containing one of the source operands.
1769
/// \param __b
1770
///    A 256-bit vector of [8 x i32] containing one of the source operands.
1771
/// \returns A 256-bit vector of [8 x i32] containing the products.
1772
static __inline__  __m256i __DEFAULT_FN_ATTRS256
1773
_mm256_mullo_epi32 (__m256i __a, __m256i __b)
1774
{
1775
  return (__m256i)((__v8su)__a * (__v8su)__b);
1776
}
1777

1778
/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1779
///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780
///    [4 x i64] result.
1781
///
1782
/// \code{.operation}
1783
/// result[63:0] := __a[31:0] * __b[31:0]
1784
/// result[127:64] := __a[95:64] * __b[95:64]
1785
/// result[191:128] := __a[159:128] * __b[159:128]
1786
/// result[255:192] := __a[223:192] * __b[223:192]
1787
/// \endcode
1788
///
1789
/// \headerfile <immintrin.h>
1790
///
1791
/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1792
///
1793
/// \param __a
1794
///    A 256-bit vector of [8 x i32] containing one of the source operands.
1795
/// \param __b
1796
///    A 256-bit vector of [8 x i32] containing one of the source operands.
1797
/// \returns A 256-bit vector of [4 x i64] containing the products.
1798
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799
_mm256_mul_epu32(__m256i __a, __m256i __b)
1800
{
1801
  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1802
}
1803

1804
/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1805
///    \a __b.
1806
///
1807
/// \headerfile <immintrin.h>
1808
///
1809
/// This intrinsic corresponds to the \c VPOR instruction.
1810
///
1811
/// \param __a
1812
///    A 256-bit integer vector.
1813
/// \param __b
1814
///    A 256-bit integer vector.
1815
/// \returns A 256-bit integer vector containing the result.
1816
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817
_mm256_or_si256(__m256i __a, __m256i __b)
1818
{
1819
  return (__m256i)((__v4du)__a | (__v4du)__b);
1820
}
1821

1822
/// Computes four sum of absolute difference (SAD) operations on sets of eight
1823
///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1824
///    \a __b.
1825
///
1826
///    One SAD result is computed for each set of eight bytes from \a __a and
1827
///    eight bytes from \a __b. The zero-extended SAD value is returned in the
1828
///    corresponding 64-bit element of the result.
1829
///
1830
///    A single SAD operation takes the differences between the corresponding
1831
///    bytes of \a __a and \a __b, takes the absolute value of each difference,
1832
///    and sums these eight values to form one 16-bit result. This operation
1833
///    is repeated four times with successive sets of eight bytes.
1834
///
1835
/// \code{.operation}
1836
/// FOR i := 0 TO 3
1837
///   j := i*64
1838
///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839
///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840
///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841
///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842
///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843
///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844
///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845
///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846
///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847
///                     temp4 + temp5 + temp6 + temp7
1848
///   result[j+63:j+16] := 0
1849
/// ENDFOR
1850
/// \endcode
1851
///
1852
/// \headerfile <immintrin.h>
1853
///
1854
/// This intrinsic corresponds to the \c VPSADBW instruction.
1855
///
1856
/// \param __a
1857
///    A 256-bit integer vector.
1858
/// \param __b
1859
///    A 256-bit integer vector.
1860
/// \returns A 256-bit integer vector containing the result.
1861
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862
_mm256_sad_epu8(__m256i __a, __m256i __b)
1863
{
1864
  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1865
}
1866

1867
/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868
///    to control information in the 256-bit integer vector \a __b, and
1869
///    returns the 256-bit result. In effect there are two separate 128-bit
1870
///    shuffles in the lower and upper halves.
1871
///
1872
/// \code{.operation}
1873
/// FOR i := 0 TO 31
1874
///   j := i*8
1875
///   IF __b[j+7] == 1
1876
///     result[j+7:j] := 0
1877
///   ELSE
1878
///     k := __b[j+3:j] * 8
1879
///     IF i > 15
1880
///       k := k + 128
1881
///     FI
1882
///     result[j+7:j] := __a[k+7:k]
1883
///   FI
1884
/// ENDFOR
1885
/// \endcode
1886
///
1887
/// \headerfile <immintrin.h>
1888
///
1889
/// This intrinsic corresponds to the \c VPSHUFB instruction.
1890
///
1891
/// \param __a
1892
///    A 256-bit integer vector containing source values.
1893
/// \param __b
1894
///    A 256-bit integer vector containing control information to determine
1895
///    what goes into the corresponding byte of the result. If bit 7 of the
1896
///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897
///    control byte specify the index (within the same 128-bit half) of \a __a
1898
///    to copy to the result byte.
1899
/// \returns A 256-bit integer vector containing the result.
1900
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901
_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1902
{
1903
  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1904
}
1905

1906
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1907
///    according to control information in the integer literal \a imm, and
1908
///    returns the 256-bit result. In effect there are two parallel 128-bit
1909
///    shuffles in the lower and upper halves.
1910
///
1911
/// \code{.operation}
1912
/// FOR i := 0 TO 3
1913
///   j := i*32
1914
///   k := (imm >> i*2)[1:0] * 32
1915
///   result[j+31:j] := a[k+31:k]
1916
///   result[128+j+31:128+j] := a[128+k+31:128+k]
1917
/// ENDFOR
1918
/// \endcode
1919
///
1920
/// \headerfile <immintrin.h>
1921
///
1922
/// \code
1923
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1924
/// \endcode
1925
///
1926
/// This intrinsic corresponds to the \c VPSHUFD instruction.
1927
///
1928
/// \param a
1929
///    A 256-bit vector of [8 x i32] containing source values.
1930
/// \param imm
1931
///    An immediate 8-bit value specifying which elements to copy from \a a.
1932
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1933
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1934
///    forth.
1935
/// \returns A 256-bit vector of [8 x i32] containing the result.
1936
#define _mm256_shuffle_epi32(a, imm)                                           \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a),                     \
                                     (int)(imm)))
1938

1939
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1940
///    according to control information in the integer literal \a imm, and
1941
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
1942
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
1943
///    copied from \a a unchanged.
1944
///
1945
/// \code{.operation}
1946
/// result[63:0] := a[63:0]
1947
/// result[191:128] := a[191:128]
1948
/// FOR i := 0 TO 3
1949
///   j := i * 16 + 64
1950
///   k := (imm >> i*2)[1:0] * 16 + 64
1951
///   result[j+15:j] := a[k+15:k]
1952
///   result[128+j+15:128+j] := a[128+k+15:128+k]
1953
/// ENDFOR
1954
/// \endcode
1955
///
1956
/// \headerfile <immintrin.h>
1957
///
1958
/// \code
1959
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1960
/// \endcode
1961
///
1962
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1963
///
1964
/// \param a
1965
///    A 256-bit vector of [16 x i16] containing source values.
1966
/// \param imm
1967
///    An immediate 8-bit value specifying which elements to copy from \a a.
1968
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1970
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1971
/// \returns A 256-bit vector of [16 x i16] containing the result.
1972
#define _mm256_shufflehi_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a),                   \
                                      (int)(imm)))
1974

1975
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1976
///    according to control information in the integer literal \a imm, and
1977
///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
1978
///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1979
///    copied from \a a unchanged.
1980
///
1981
/// \code{.operation}
1982
/// result[127:64] := a[127:64]
1983
/// result[255:192] := a[255:192]
1984
/// FOR i := 0 TO 3
1985
///   j := i * 16
1986
///   k := (imm >> i*2)[1:0] * 16
1987
///   result[j+15:j] := a[k+15:k]
1988
///   result[128+j+15:128+j] := a[128+k+15:128+k]
1989
/// ENDFOR
1990
/// \endcode
1991
///
1992
/// \headerfile <immintrin.h>
1993
///
1994
/// \code
1995
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1996
/// \endcode
1997
///
1998
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1999
///
2000
/// \param a
2001
///    A 256-bit vector of [16 x i16] to use as a source of data for the
2002
///    result.
2003
/// \param imm
2004
///    An immediate 8-bit value specifying which elements to copy from \a a.
2005
///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2006
///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2007
///    forth.
2008
/// \returns A 256-bit vector of [16 x i16] containing the result.
2009
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a),                   \
                                      (int)(imm)))
2011

2012
/// Sets each byte of the result to the corresponding byte of the 256-bit
2013
///    integer vector in \a __a, the negative of that byte, or zero, depending
2014
///    on whether the corresponding byte of the 256-bit integer vector in
2015
///    \a __b is greater than zero, less than zero, or equal to zero,
2016
///    respectively.
2017
///
2018
/// \headerfile <immintrin.h>
2019
///
2020
/// This intrinsic corresponds to the \c VPSIGNB instruction.
2021
///
2022
/// \param __a
2023
///    A 256-bit integer vector.
2024
/// \param __b
2025
///    A 256-bit integer vector.
2026
/// \returns A 256-bit integer vector containing the result.
2027
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028
_mm256_sign_epi8(__m256i __a, __m256i __b)
2029
{
2030
    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2031
}
2032

2033
/// Sets each element of the result to the corresponding element of the
2034
///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035
///    or zero, depending on whether the corresponding element of the 256-bit
2036
///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037
///    equal to zero, respectively.
2038
///
2039
/// \headerfile <immintrin.h>
2040
///
2041
/// This intrinsic corresponds to the \c VPSIGNW instruction.
2042
///
2043
/// \param __a
2044
///    A 256-bit vector of [16 x i16].
2045
/// \param __b
2046
///    A 256-bit vector of [16 x i16].
2047
/// \returns A 256-bit vector of [16 x i16] containing the result.
2048
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049
_mm256_sign_epi16(__m256i __a, __m256i __b)
2050
{
2051
    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2052
}
2053

2054
/// Sets each element of the result to the corresponding element of the
2055
///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056
///    zero, depending on whether the corresponding element of the 256-bit
2057
///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058
///    equal to zero, respectively.
2059
///
2060
/// \headerfile <immintrin.h>
2061
///
2062
/// This intrinsic corresponds to the \c VPSIGND instruction.
2063
///
2064
/// \param __a
2065
///    A 256-bit vector of [8 x i32].
2066
/// \param __b
2067
///    A 256-bit vector of [8 x i32].
2068
/// \returns A 256-bit vector of [8 x i32] containing the result.
2069
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070
_mm256_sign_epi32(__m256i __a, __m256i __b)
2071
{
2072
    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2073
}
2074

2075
/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2076
///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2077
///    is greater than 15, the returned result is all zeroes.
2078
///
2079
/// \headerfile <immintrin.h>
2080
///
2081
/// \code
2082
/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2083
/// \endcode
2084
///
2085
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2086
///
2087
/// \param a
2088
///    A 256-bit integer vector to be shifted.
2089
/// \param imm
2090
///    An unsigned immediate value specifying the shift count (in bytes).
2091
/// \returns A 256-bit integer vector containing the result.
2092
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2094

2095
/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2096
///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2097
///    is greater than 15, the returned result is all zeroes.
2098
///
2099
/// \headerfile <immintrin.h>
2100
///
2101
/// \code
2102
/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2103
/// \endcode
2104
///
2105
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2106
///
2107
/// \param a
2108
///    A 256-bit integer vector to be shifted.
2109
/// \param imm
2110
///    An unsigned immediate value specifying the shift count (in bytes).
2111
/// \returns A 256-bit integer vector containing the result.
2112
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2114

2115
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116
///    left by \a __count bits, shifting in zero bits, and returns the result.
2117
///    If \a __count is greater than 15, the returned result is all zeroes.
2118
///
2119
/// \headerfile <immintrin.h>
2120
///
2121
/// This intrinsic corresponds to the \c VPSLLW instruction.
2122
///
2123
/// \param __a
2124
///    A 256-bit vector of [16 x i16] to be shifted.
2125
/// \param __count
2126
///    An unsigned integer value specifying the shift count (in bits).
2127
/// \returns A 256-bit vector of [16 x i16] containing the result.
2128
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129
_mm256_slli_epi16(__m256i __a, int __count)
2130
{
2131
  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2132
}
2133

2134
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135
///    left by the number of bits specified by the lower 64 bits of \a __count,
2136
///    shifting in zero bits, and returns the result. If \a __count is greater
2137
///    than 15, the returned result is all zeroes.
2138
///
2139
/// \headerfile <immintrin.h>
2140
///
2141
/// This intrinsic corresponds to the \c VPSLLW instruction.
2142
///
2143
/// \param __a
2144
///    A 256-bit vector of [16 x i16] to be shifted.
2145
/// \param __count
2146
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147
///    shift count (in bits). The upper element is ignored.
2148
/// \returns A 256-bit vector of [16 x i16] containing the result.
2149
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150
_mm256_sll_epi16(__m256i __a, __m128i __count)
2151
{
2152
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2153
}
2154

2155
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156
///    left by \a __count bits, shifting in zero bits, and returns the result.
2157
///    If \a __count is greater than 31, the returned result is all zeroes.
2158
///
2159
/// \headerfile <immintrin.h>
2160
///
2161
/// This intrinsic corresponds to the \c VPSLLD instruction.
2162
///
2163
/// \param __a
2164
///    A 256-bit vector of [8 x i32] to be shifted.
2165
/// \param __count
2166
///    An unsigned integer value specifying the shift count (in bits).
2167
/// \returns A 256-bit vector of [8 x i32] containing the result.
2168
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169
_mm256_slli_epi32(__m256i __a, int __count)
2170
{
2171
  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2172
}
2173

2174
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175
///    left by the number of bits given in the lower 64 bits of \a __count,
2176
///    shifting in zero bits, and returns the result. If \a __count is greater
2177
///    than 31, the returned result is all zeroes.
2178
///
2179
/// \headerfile <immintrin.h>
2180
///
2181
/// This intrinsic corresponds to the \c VPSLLD instruction.
2182
///
2183
/// \param __a
2184
///    A 256-bit vector of [8 x i32] to be shifted.
2185
/// \param __count
2186
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187
///    shift count (in bits). The upper element is ignored.
2188
/// \returns A 256-bit vector of [8 x i32] containing the result.
2189
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190
_mm256_sll_epi32(__m256i __a, __m128i __count)
2191
{
2192
  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2193
}
2194

2195
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196
///    left by \a __count bits, shifting in zero bits, and returns the result.
2197
///    If \a __count is greater than 63, the returned result is all zeroes.
2198
///
2199
/// \headerfile <immintrin.h>
2200
///
2201
/// This intrinsic corresponds to the \c VPSLLQ instruction.
2202
///
2203
/// \param __a
2204
///    A 256-bit vector of [4 x i64] to be shifted.
2205
/// \param __count
2206
///    An unsigned integer value specifying the shift count (in bits).
2207
/// \returns A 256-bit vector of [4 x i64] containing the result.
2208
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209
_mm256_slli_epi64(__m256i __a, int __count)
2210
{
2211
  return __builtin_ia32_psllqi256((__v4di)__a, __count);
2212
}
2213

2214
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215
///    left by the number of bits given in the lower 64 bits of \a __count,
2216
///    shifting in zero bits, and returns the result. If \a __count is greater
2217
///    than 63, the returned result is all zeroes.
2218
///
2219
/// \headerfile <immintrin.h>
2220
///
2221
/// This intrinsic corresponds to the \c VPSLLQ instruction.
2222
///
2223
/// \param __a
2224
///    A 256-bit vector of [4 x i64] to be shifted.
2225
/// \param __count
2226
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227
///    shift count (in bits). The upper element is ignored.
2228
/// \returns A 256-bit vector of [4 x i64] containing the result.
2229
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230
_mm256_sll_epi64(__m256i __a, __m128i __count)
2231
{
2232
  return __builtin_ia32_psllq256((__v4di)__a, __count);
2233
}
2234

2235
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236
///    right by \a __count bits, shifting in sign bits, and returns the result.
2237
///    If \a __count is greater than 15, each element of the result is either
2238
///    0 or -1 according to the corresponding input sign bit.
2239
///
2240
/// \headerfile <immintrin.h>
2241
///
2242
/// This intrinsic corresponds to the \c VPSRAW instruction.
2243
///
2244
/// \param __a
2245
///    A 256-bit vector of [16 x i16] to be shifted.
2246
/// \param __count
2247
///    An unsigned integer value specifying the shift count (in bits).
2248
/// \returns A 256-bit vector of [16 x i16] containing the result.
2249
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250
_mm256_srai_epi16(__m256i __a, int __count)
2251
{
2252
  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2253
}
2254

2255
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256
///    right by the number of bits given in the lower 64 bits of \a __count,
2257
///    shifting in sign bits, and returns the result. If \a __count is greater
2258
///    than 15, each element of the result is either 0 or -1 according to the
2259
///    corresponding input sign bit.
2260
///
2261
/// \headerfile <immintrin.h>
2262
///
2263
/// This intrinsic corresponds to the \c VPSRAW instruction.
2264
///
2265
/// \param __a
2266
///    A 256-bit vector of [16 x i16] to be shifted.
2267
/// \param __count
2268
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269
///    shift count (in bits). The upper element is ignored.
2270
/// \returns A 256-bit vector of [16 x i16] containing the result.
2271
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272
_mm256_sra_epi16(__m256i __a, __m128i __count)
2273
{
2274
  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2275
}
2276

2277
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278
///    right by \a __count bits, shifting in sign bits, and returns the result.
2279
///    If \a __count is greater than 31, each element of the result is either
2280
///    0 or -1 according to the corresponding input sign bit.
2281
///
2282
/// \headerfile <immintrin.h>
2283
///
2284
/// This intrinsic corresponds to the \c VPSRAD instruction.
2285
///
2286
/// \param __a
2287
///    A 256-bit vector of [8 x i32] to be shifted.
2288
/// \param __count
2289
///    An unsigned integer value specifying the shift count (in bits).
2290
/// \returns A 256-bit vector of [8 x i32] containing the result.
2291
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292
_mm256_srai_epi32(__m256i __a, int __count)
2293
{
2294
  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2295
}
2296

2297
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298
///    right by the number of bits given in the lower 64 bits of \a __count,
2299
///    shifting in sign bits, and returns the result. If \a __count is greater
2300
///    than 31, each element of the result is either 0 or -1 according to the
2301
///    corresponding input sign bit.
2302
///
2303
/// \headerfile <immintrin.h>
2304
///
2305
/// This intrinsic corresponds to the \c VPSRAD instruction.
2306
///
2307
/// \param __a
2308
///    A 256-bit vector of [8 x i32] to be shifted.
2309
/// \param __count
2310
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311
///    shift count (in bits). The upper element is ignored.
2312
/// \returns A 256-bit vector of [8 x i32] containing the result.
2313
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314
_mm256_sra_epi32(__m256i __a, __m128i __count)
2315
{
2316
  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2317
}
2318

2319
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2320
///    \a imm bytes, shifting in zero bytes, and returns the result. If
2321
///    \a imm is greater than 15, the returned result is all zeroes.
2322
///
2323
/// \headerfile <immintrin.h>
2324
///
2325
/// \code
2326
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2327
/// \endcode
2328
///
2329
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2330
///
2331
/// \param a
2332
///    A 256-bit integer vector to be shifted.
2333
/// \param imm
2334
///    An unsigned immediate value specifying the shift count (in bytes).
2335
/// \returns A 256-bit integer vector containing the result.
2336
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2338

2339
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2340
///    \a imm bytes, shifting in zero bytes, and returns the result. If
2341
///    \a imm is greater than 15, the returned result is all zeroes.
2342
///
2343
/// \headerfile <immintrin.h>
2344
///
2345
/// \code
2346
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2347
/// \endcode
2348
///
2349
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2350
///
2351
/// \param a
2352
///    A 256-bit integer vector to be shifted.
2353
/// \param imm
2354
///    An unsigned immediate value specifying the shift count (in bytes).
2355
/// \returns A 256-bit integer vector containing the result.
2356
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2358

2359
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360
///    right by \a __count bits, shifting in zero bits, and returns the result.
2361
///    If \a __count is greater than 15, the returned result is all zeroes.
2362
///
2363
/// \headerfile <immintrin.h>
2364
///
2365
/// This intrinsic corresponds to the \c VPSRLW instruction.
2366
///
2367
/// \param __a
2368
///    A 256-bit vector of [16 x i16] to be shifted.
2369
/// \param __count
2370
///    An unsigned integer value specifying the shift count (in bits).
2371
/// \returns A 256-bit vector of [16 x i16] containing the result.
2372
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373
_mm256_srli_epi16(__m256i __a, int __count)
2374
{
2375
  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2376
}
2377

2378
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379
///    right by the number of bits given in the lower 64 bits of \a __count,
2380
///    shifting in zero bits, and returns the result. If \a __count is greater
2381
///    than 15, the returned result is all zeroes.
2382
///
2383
/// \headerfile <immintrin.h>
2384
///
2385
/// This intrinsic corresponds to the \c VPSRLW instruction.
2386
///
2387
/// \param __a
2388
///    A 256-bit vector of [16 x i16] to be shifted.
2389
/// \param __count
2390
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391
///    shift count (in bits). The upper element is ignored.
2392
/// \returns A 256-bit vector of [16 x i16] containing the result.
2393
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394
_mm256_srl_epi16(__m256i __a, __m128i __count)
2395
{
2396
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2397
}
2398

2399
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400
///    right by \a __count bits, shifting in zero bits, and returns the result.
2401
///    If \a __count is greater than 31, the returned result is all zeroes.
2402
///
2403
/// \headerfile <immintrin.h>
2404
///
2405
/// This intrinsic corresponds to the \c VPSRLD instruction.
2406
///
2407
/// \param __a
2408
///    A 256-bit vector of [8 x i32] to be shifted.
2409
/// \param __count
2410
///    An unsigned integer value specifying the shift count (in bits).
2411
/// \returns A 256-bit vector of [8 x i32] containing the result.
2412
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413
_mm256_srli_epi32(__m256i __a, int __count)
2414
{
2415
  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2416
}
2417

2418
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419
///    right by the number of bits given in the lower 64 bits of \a __count,
2420
///    shifting in zero bits, and returns the result. If \a __count is greater
2421
///    than 31, the returned result is all zeroes.
2422
///
2423
/// \headerfile <immintrin.h>
2424
///
2425
/// This intrinsic corresponds to the \c VPSRLD instruction.
2426
///
2427
/// \param __a
2428
///    A 256-bit vector of [8 x i32] to be shifted.
2429
/// \param __count
2430
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431
///    shift count (in bits). The upper element is ignored.
2432
/// \returns A 256-bit vector of [8 x i32] containing the result.
2433
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434
_mm256_srl_epi32(__m256i __a, __m128i __count)
2435
{
2436
  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2437
}
2438

2439
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440
///    right by \a __count bits, shifting in zero bits, and returns the result.
2441
///    If \a __count is greater than 63, the returned result is all zeroes.
2442
///
2443
/// \headerfile <immintrin.h>
2444
///
2445
/// This intrinsic corresponds to the \c VPSRLQ instruction.
2446
///
2447
/// \param __a
2448
///    A 256-bit vector of [4 x i64] to be shifted.
2449
/// \param __count
2450
///    An unsigned integer value specifying the shift count (in bits).
2451
/// \returns A 256-bit vector of [4 x i64] containing the result.
2452
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453
_mm256_srli_epi64(__m256i __a, int __count)
2454
{
2455
  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2456
}
2457

2458
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459
///    right by the number of bits given in the lower 64 bits of \a __count,
2460
///    shifting in zero bits, and returns the result. If \a __count is greater
2461
///    than 63, the returned result is all zeroes.
2462
///
2463
/// \headerfile <immintrin.h>
2464
///
2465
/// This intrinsic corresponds to the \c VPSRLQ instruction.
2466
///
2467
/// \param __a
2468
///    A 256-bit vector of [4 x i64] to be shifted.
2469
/// \param __count
2470
///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471
///    shift count (in bits). The upper element is ignored.
2472
/// \returns A 256-bit vector of [4 x i64] containing the result.
2473
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474
_mm256_srl_epi64(__m256i __a, __m128i __count)
2475
{
2476
  return __builtin_ia32_psrlq256((__v4di)__a, __count);
2477
}
2478

2479
/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480
///    vectors. Returns the lower 8 bits of each difference in the
2481
///    corresponding byte of the 256-bit integer vector result (overflow is
2482
///    ignored).
2483
///
2484
/// \code{.operation}
2485
/// FOR i := 0 TO 31
2486
///   j := i*8
2487
///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2488
/// ENDFOR
2489
/// \endcode
2490
///
2491
/// \headerfile <immintrin.h>
2492
///
2493
/// This intrinsic corresponds to the \c VPSUBB instruction.
2494
///
2495
/// \param __a
2496
///    A 256-bit integer vector containing the minuends.
2497
/// \param __b
2498
///    A 256-bit integer vector containing the subtrahends.
2499
/// \returns A 256-bit integer vector containing the differences.
2500
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501
_mm256_sub_epi8(__m256i __a, __m256i __b)
2502
{
2503
  return (__m256i)((__v32qu)__a - (__v32qu)__b);
2504
}
2505

2506
/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507
///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508
///    the corresponding element of the [16 x i16] result (overflow is
2509
///    ignored).
2510
///
2511
/// \code{.operation}
2512
/// FOR i := 0 TO 15
2513
///   j := i*16
2514
///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2515
/// ENDFOR
2516
/// \endcode
2517
///
2518
/// \headerfile <immintrin.h>
2519
///
2520
/// This intrinsic corresponds to the \c VPSUBW instruction.
2521
///
2522
/// \param __a
2523
///    A 256-bit vector of [16 x i16] containing the minuends.
2524
/// \param __b
2525
///    A 256-bit vector of [16 x i16] containing the subtrahends.
2526
/// \returns A 256-bit vector of [16 x i16] containing the differences.
2527
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528
_mm256_sub_epi16(__m256i __a, __m256i __b)
2529
{
2530
  return (__m256i)((__v16hu)__a - (__v16hu)__b);
2531
}
2532

2533
/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534
///    vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535
///    the corresponding element of the [8 x i32] result (overflow is ignored).
2536
///
2537
/// \code{.operation}
2538
/// FOR i := 0 TO 7
2539
///   j := i*32
2540
///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2541
/// ENDFOR
2542
/// \endcode
2543
///
2544
/// \headerfile <immintrin.h>
2545
///
2546
/// This intrinsic corresponds to the \c VPSUBD instruction.
2547
///
2548
/// \param __a
2549
///    A 256-bit vector of [8 x i32] containing the minuends.
2550
/// \param __b
2551
///    A 256-bit vector of [8 x i32] containing the subtrahends.
2552
/// \returns A 256-bit vector of [8 x i32] containing the differences.
2553
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554
_mm256_sub_epi32(__m256i __a, __m256i __b)
2555
{
2556
  return (__m256i)((__v8su)__a - (__v8su)__b);
2557
}
2558

2559
/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560
///    vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561
///    the corresponding element of the [4 x i64] result (overflow is ignored).
2562
///
2563
/// \code{.operation}
2564
/// FOR i := 0 TO 3
2565
///   j := i*64
2566
///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2567
/// ENDFOR
2568
/// \endcode
2569
///
2570
/// \headerfile <immintrin.h>
2571
///
2572
/// This intrinsic corresponds to the \c VPSUBQ instruction.
2573
///
2574
/// \param __a
2575
///    A 256-bit vector of [4 x i64] containing the minuends.
2576
/// \param __b
2577
///    A 256-bit vector of [4 x i64] containing the subtrahends.
2578
/// \returns A 256-bit vector of [4 x i64] containing the differences.
2579
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580
_mm256_sub_epi64(__m256i __a, __m256i __b)
2581
{
2582
  return (__m256i)((__v4du)__a - (__v4du)__b);
2583
}
2584

2585
/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2586
///    vectors using signed saturation, and returns each differences in the
2587
///    corresponding byte of the 256-bit integer vector result.
2588
///
2589
/// \code{.operation}
2590
/// FOR i := 0 TO 31
2591
///   j := i*8
2592
///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2593
/// ENDFOR
2594
/// \endcode
2595
///
2596
/// \headerfile <immintrin.h>
2597
///
2598
/// This intrinsic corresponds to the \c VPSUBSB instruction.
2599
///
2600
/// \param __a
2601
///    A 256-bit integer vector containing the minuends.
2602
/// \param __b
2603
///    A 256-bit integer vector containing the subtrahends.
2604
/// \returns A 256-bit integer vector containing the differences.
2605
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606
_mm256_subs_epi8(__m256i __a, __m256i __b)
2607
{
2608
  return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2609
}
2610

2611
/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612
///    vectors of [16 x i16] using signed saturation, and returns each
2613
///    difference in the corresponding element of the [16 x i16] result.
2614
///
2615
/// \code{.operation}
2616
/// FOR i := 0 TO 15
2617
///   j := i*16
2618
///   result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2619
/// ENDFOR
2620
/// \endcode
2621
///
2622
/// \headerfile <immintrin.h>
2623
///
2624
/// This intrinsic corresponds to the \c VPSUBSW instruction.
2625
///
2626
/// \param __a
2627
///    A 256-bit vector of [16 x i16] containing the minuends.
2628
/// \param __b
2629
///    A 256-bit vector of [16 x i16] containing the subtrahends.
2630
/// \returns A 256-bit vector of [16 x i16] containing the differences.
2631
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632
_mm256_subs_epi16(__m256i __a, __m256i __b)
2633
{
2634
  return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2635
}
2636

2637
/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638
///    vectors using unsigned saturation, and returns each difference in the
2639
///    corresponding byte of the 256-bit integer vector result. For each byte,
2640
///    computes <c> result = __a - __b </c>.
2641
///
2642
/// \code{.operation}
2643
/// FOR i := 0 TO 31
2644
///   j := i*8
2645
///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2646
/// ENDFOR
2647
/// \endcode
2648
///
2649
/// \headerfile <immintrin.h>
2650
///
2651
/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2652
///
2653
/// \param __a
2654
///    A 256-bit integer vector containing the minuends.
2655
/// \param __b
2656
///    A 256-bit integer vector containing the subtrahends.
2657
/// \returns A 256-bit integer vector containing the differences.
2658
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659
_mm256_subs_epu8(__m256i __a, __m256i __b)
2660
{
2661
  return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2662
}
2663

2664
/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665
///    vectors of [16 x i16] using unsigned saturation, and returns each
2666
///    difference in the corresponding element of the [16 x i16] result.
2667
///
2668
/// \code{.operation}
2669
/// FOR i := 0 TO 15
2670
///   j := i*16
2671
///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2672
/// ENDFOR
2673
/// \endcode
2674
///
2675
/// \headerfile <immintrin.h>
2676
///
2677
/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2678
///
2679
/// \param __a
2680
///    A 256-bit vector of [16 x i16] containing the minuends.
2681
/// \param __b
2682
///    A 256-bit vector of [16 x i16] containing the subtrahends.
2683
/// \returns A 256-bit vector of [16 x i16] containing the differences.
2684
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685
_mm256_subs_epu16(__m256i __a, __m256i __b)
2686
{
2687
  return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2688
}
2689

2690
/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691
///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692
///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693
///    input; other bits in these parameters are ignored.
2694
///
2695
/// \code{.operation}
2696
/// result[7:0] := __a[71:64]
2697
/// result[15:8] := __b[71:64]
2698
/// result[23:16] := __a[79:72]
2699
/// result[31:24] := __b[79:72]
2700
/// . . .
2701
/// result[127:120] := __b[127:120]
2702
/// result[135:128] := __a[199:192]
2703
/// . . .
2704
/// result[255:248] := __b[255:248]
2705
/// \endcode
2706
///
2707
/// \headerfile <immintrin.h>
2708
///
2709
/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2710
///
2711
/// \param __a
2712
///    A 256-bit integer vector used as the source for the even-numbered bytes
2713
///    of the result.
2714
/// \param __b
2715
///    A 256-bit integer vector used as the source for the odd-numbered bytes
2716
///    of the result.
2717
/// \returns A 256-bit integer vector containing the result.
2718
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2719
_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2720
{
2721
  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2722
}
2723

2724
/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725
///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726
///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727
///    128-bit half of \a __a and \a __b as input; other bits in these
2728
///    parameters are ignored.
2729
///
2730
/// \code{.operation}
2731
/// result[15:0] := __a[79:64]
2732
/// result[31:16] := __b[79:64]
2733
/// result[47:32] := __a[95:80]
2734
/// result[63:48] := __b[95:80]
2735
/// . . .
2736
/// result[127:112] := __b[127:112]
2737
/// result[143:128] := __a[211:196]
2738
/// . . .
2739
/// result[255:240] := __b[255:240]
2740
/// \endcode
2741
///
2742
/// \headerfile <immintrin.h>
2743
///
2744
/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2745
///
2746
/// \param __a
2747
///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748
///    elements of the result.
2749
/// \param __b
2750
///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751
///    elements of the result.
2752
/// \returns A 256-bit vector of [16 x i16] containing the result.
2753
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2754
_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2755
{
2756
  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2757
}
2758

2759
/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760
///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761
///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762
///    of \a __a and \a __b as input; other bits in these parameters are
2763
///    ignored.
2764
///
2765
/// \code{.operation}
2766
/// result[31:0] := __a[95:64]
2767
/// result[63:32] := __b[95:64]
2768
/// result[95:64] := __a[127:96]
2769
/// result[127:96] := __b[127:96]
2770
/// result[159:128] := __a[223:192]
2771
/// result[191:160] := __b[223:192]
2772
/// result[223:192] := __a[255:224]
2773
/// result[255:224] := __b[255:224]
2774
/// \endcode
2775
///
2776
/// \headerfile <immintrin.h>
2777
///
2778
/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2779
///
2780
/// \param __a
2781
///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782
///    elements of the result.
2783
/// \param __b
2784
///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785
///    elements of the result.
2786
/// \returns A 256-bit vector of [8 x i32] containing the result.
2787
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2788
_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2789
{
2790
  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2791
}
2792

2793
/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794
///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795
///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796
///    of \a __a and \a __b as input; other bits in these parameters are
2797
///    ignored.
2798
///
2799
/// \code{.operation}
2800
/// result[63:0] := __a[127:64]
2801
/// result[127:64] := __b[127:64]
2802
/// result[191:128] := __a[255:192]
2803
/// result[255:192] := __b[255:192]
2804
/// \endcode
2805
///
2806
/// \headerfile <immintrin.h>
2807
///
2808
/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2809
///
2810
/// \param __a
2811
///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812
///    elements of the result.
2813
/// \param __b
2814
///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815
///    elements of the result.
2816
/// \returns A 256-bit vector of [4 x i64] containing the result.
2817
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2818
_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2819
{
2820
  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2821
}
2822

2823
/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824
///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825
///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826
///    input; other bits in these parameters are ignored.
2827
///
2828
/// \code{.operation}
2829
/// result[7:0] := __a[7:0]
2830
/// result[15:8] := __b[7:0]
2831
/// result[23:16] := __a[15:8]
2832
/// result[31:24] := __b[15:8]
2833
/// . . .
2834
/// result[127:120] := __b[63:56]
2835
/// result[135:128] := __a[135:128]
2836
/// . . .
2837
/// result[255:248] := __b[191:184]
2838
/// \endcode
2839
///
2840
/// \headerfile <immintrin.h>
2841
///
2842
/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2843
///
2844
/// \param __a
2845
///    A 256-bit integer vector used as the source for the even-numbered bytes
2846
///    of the result.
2847
/// \param __b
2848
///    A 256-bit integer vector used as the source for the odd-numbered bytes
2849
///    of the result.
2850
/// \returns A 256-bit integer vector containing the result.
2851
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2852
_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2853
{
2854
  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2855
}
2856

2857
/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858
///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859
///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860
///    128-bit half of \a __a and \a __b as input; other bits in these
2861
///    parameters are ignored.
2862
///
2863
/// \code{.operation}
2864
/// result[15:0] := __a[15:0]
2865
/// result[31:16] := __b[15:0]
2866
/// result[47:32] := __a[31:16]
2867
/// result[63:48] := __b[31:16]
2868
/// . . .
2869
/// result[127:112] := __b[63:48]
2870
/// result[143:128] := __a[143:128]
2871
/// . . .
2872
/// result[255:239] := __b[191:176]
2873
/// \endcode
2874
///
2875
/// \headerfile <immintrin.h>
2876
///
2877
/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2878
///
2879
/// \param __a
2880
///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881
///    elements of the result.
2882
/// \param __b
2883
///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884
///    elements of the result.
2885
/// \returns A 256-bit vector of [16 x i16] containing the result.
2886
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2887
_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2888
{
2889
  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2890
}
2891

2892
/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893
///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894
///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895
///    of \a __a and \a __b as input; other bits in these parameters are
2896
///    ignored.
2897
///
2898
/// \code{.operation}
2899
/// result[31:0] := __a[31:0]
2900
/// result[63:32] := __b[31:0]
2901
/// result[95:64] := __a[63:32]
2902
/// result[127:96] := __b[63:32]
2903
/// result[159:128] := __a[159:128]
2904
/// result[191:160] := __b[159:128]
2905
/// result[223:192] := __a[191:160]
2906
/// result[255:224] := __b[191:190]
2907
/// \endcode
2908
///
2909
/// \headerfile <immintrin.h>
2910
///
2911
/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2912
///
2913
/// \param __a
2914
///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915
///    elements of the result.
2916
/// \param __b
2917
///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918
///    elements of the result.
2919
/// \returns A 256-bit vector of [8 x i32] containing the result.
2920
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2921
_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2922
{
2923
  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2924
}
2925

2926
/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927
///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928
///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929
///    of \a __a and \a __b as input; other bits in these parameters are
2930
///    ignored.
2931
///
2932
/// \code{.operation}
2933
/// result[63:0] := __a[63:0]
2934
/// result[127:64] := __b[63:0]
2935
/// result[191:128] := __a[191:128]
2936
/// result[255:192] := __b[191:128]
2937
/// \endcode
2938
///
2939
/// \headerfile <immintrin.h>
2940
///
2941
/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2942
///
2943
/// \param __a
2944
///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945
///    elements of the result.
2946
/// \param __b
2947
///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948
///    elements of the result.
2949
/// \returns A 256-bit vector of [4 x i64] containing the result.
2950
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2951
_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2952
{
2953
  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2954
}
2955

2956
/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2957
///    \a __b.
2958
///
2959
/// \headerfile <immintrin.h>
2960
///
2961
/// This intrinsic corresponds to the \c VPXOR instruction.
2962
///
2963
/// \param __a
2964
///    A 256-bit integer vector.
2965
/// \param __b
2966
///    A 256-bit integer vector.
2967
/// \returns A 256-bit integer vector containing the result.
2968
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969
_mm256_xor_si256(__m256i __a, __m256i __b)
2970
{
2971
  return (__m256i)((__v4du)__a ^ (__v4du)__b);
2972
}
2973

2974
/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975
///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
2976
///   boundary.
2977
///
2978
/// \headerfile <immintrin.h>
2979
///
2980
/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2981
///
2982
/// \param __V
2983
///    A pointer to the 32-byte aligned memory containing the vector to load.
2984
/// \returns A 256-bit integer vector loaded from memory.
2985
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986
_mm256_stream_load_si256(const void *__V)
2987
{
2988
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2989
  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2990
}
2991

2992
/// Broadcasts the 32-bit floating-point value from the low element of the
2993
///    128-bit vector of [4 x float] in \a __X to all elements of the result's
2994
///    128-bit vector of [4 x float].
2995
///
2996
/// \headerfile <immintrin.h>
2997
///
2998
/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2999
///
3000
/// \param __X
3001
///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3002
/// \returns A 128-bit vector of [4 x float] containing the result.
3003
static __inline__ __m128 __DEFAULT_FN_ATTRS128
3004
_mm_broadcastss_ps(__m128 __X)
3005
{
3006
  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3007
}
3008

3009
/// Broadcasts the 64-bit floating-point value from the low element of the
3010
///    128-bit vector of [2 x double] in \a __a to both elements of the
3011
///    result's 128-bit vector of [2 x double].
3012
///
3013
/// \headerfile <immintrin.h>
3014
///
3015
/// This intrinsic corresponds to the \c MOVDDUP instruction.
3016
///
3017
/// \param __a
3018
///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3019
/// \returns A 128-bit vector of [2 x double] containing the result.
3020
static __inline__ __m128d __DEFAULT_FN_ATTRS128
3021
_mm_broadcastsd_pd(__m128d __a)
3022
{
3023
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3024
}
3025

3026
/// Broadcasts the 32-bit floating-point value from the low element of the
3027
///    128-bit vector of [4 x float] in \a __X to all elements of the
3028
///    result's 256-bit vector of [8 x float].
3029
///
3030
/// \headerfile <immintrin.h>
3031
///
3032
/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3033
///
3034
/// \param __X
3035
///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3036
/// \returns A 256-bit vector of [8 x float] containing the result.
3037
static __inline__ __m256 __DEFAULT_FN_ATTRS256
3038
_mm256_broadcastss_ps(__m128 __X)
3039
{
3040
  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3041
}
3042

3043
/// Broadcasts the 64-bit floating-point value from the low element of the
3044
///    128-bit vector of [2 x double] in \a __X to all elements of the
3045
///    result's 256-bit vector of [4 x double].
3046
///
3047
/// \headerfile <immintrin.h>
3048
///
3049
/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3050
///
3051
/// \param __X
3052
///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3053
/// \returns A 256-bit vector of [4 x double] containing the result.
3054
static __inline__ __m256d __DEFAULT_FN_ATTRS256
3055
_mm256_broadcastsd_pd(__m128d __X)
3056
{
3057
  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3058
}
3059

3060
/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061
///    upper halves of the 256-bit result.
3062
///
3063
/// \headerfile <immintrin.h>
3064
///
3065
/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3066
///
3067
/// \param __X
3068
///    A 128-bit integer vector to be broadcast.
3069
/// \returns A 256-bit integer vector containing the result.
3070
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3071
_mm256_broadcastsi128_si256(__m128i __X)
3072
{
3073
  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3074
}
3075

3076
/* Alternate spelling: expands to _mm256_broadcastsi128_si256(X). */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3077

3078
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1),                   \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3114

3115
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1),                   \
                                      (__v8si)(__m256i)(V2), (int)(M)))
3151

3152
/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153
///    bytes of the 256-bit result.
3154
///
3155
/// \headerfile <immintrin.h>
3156
///
3157
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3158
///
3159
/// \param __X
3160
///    A 128-bit integer vector whose low byte will be broadcast.
3161
/// \returns A 256-bit integer vector containing the result.
3162
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3163
_mm256_broadcastb_epi8(__m128i __X)
3164
{
3165
  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3166
}
3167

3168
/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169
///    to all elements of the result's 256-bit vector of [16 x i16].
3170
///
3171
/// \headerfile <immintrin.h>
3172
///
3173
/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3174
///
3175
/// \param __X
3176
///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177
/// \returns A 256-bit vector of [16 x i16] containing the result.
3178
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3179
_mm256_broadcastw_epi16(__m128i __X)
3180
{
3181
  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3182
}
3183

3184
/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185
///    to all elements of the result's 256-bit vector of [8 x i32].
3186
///
3187
/// \headerfile <immintrin.h>
3188
///
3189
/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3190
///
3191
/// \param __X
3192
///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193
/// \returns A 256-bit vector of [8 x i32] containing the result.
3194
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3195
_mm256_broadcastd_epi32(__m128i __X)
3196
{
3197
  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3198
}
3199

3200
/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201
///    to all elements of the result's 256-bit vector of [4 x i64].
3202
///
3203
/// \headerfile <immintrin.h>
3204
///
3205
/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3206
///
3207
/// \param __X
3208
///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209
/// \returns A 256-bit vector of [4 x i64] containing the result.
3210
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3211
_mm256_broadcastq_epi64(__m128i __X)
3212
{
3213
  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3214
}
3215

3216
/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217
///    bytes of the 128-bit result.
3218
///
3219
/// \headerfile <immintrin.h>
3220
///
3221
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3222
///
3223
/// \param __X
3224
///    A 128-bit integer vector whose low byte will be broadcast.
3225
/// \returns A 128-bit integer vector containing the result.
3226
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3227
_mm_broadcastb_epi8(__m128i __X)
3228
{
3229
  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3230
}
3231

3232
/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233
///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
3234
///
3235
/// \headerfile <immintrin.h>
3236
///
3237
/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3238
///
3239
/// \param __X
3240
///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241
/// \returns A 128-bit vector of [8 x i16] containing the result.
3242
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3243
_mm_broadcastw_epi16(__m128i __X)
3244
{
3245
  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3246
}
3247

3248
/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3249
///    to all elements of the result's vector of [4 x i32].
3250
///
3251
/// \headerfile <immintrin.h>
3252
///
3253
/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3254
///
3255
/// \param __X
3256
///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257
/// \returns A 128-bit vector of [4 x i32] containing the result.
3258
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3259
_mm_broadcastd_epi32(__m128i __X)
3260
{
3261
  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3262
}
3263

3264
/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265
///    to both elements of the result's 128-bit vector of [2 x i64].
3266
///
3267
/// \headerfile <immintrin.h>
3268
///
3269
/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3270
///
3271
/// \param __X
3272
///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273
/// \returns A 128-bit vector of [2 x i64] containing the result.
3274
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3275
_mm_broadcastq_epi64(__m128i __X)
3276
{
3277
  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3278
}
3279

3280
/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281
///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282
///    elements of the 256-bit vector of [8 x i32] in \a __b.
3283
///
3284
/// \code{.operation}
3285
/// FOR i := 0 TO 7
3286
///   j := i*32
3287
///   k := __b[j+2:j] * 32
3288
///   result[j+31:j] := __a[k+31:k]
3289
/// ENDFOR
3290
/// \endcode
3291
///
3292
/// \headerfile <immintrin.h>
3293
///
3294
/// This intrinsic corresponds to the \c VPERMD instruction.
3295
///
3296
/// \param __a
3297
///    A 256-bit vector of [8 x i32] containing the source values.
3298
/// \param __b
3299
///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3300
///    \a __a.
3301
/// \returns A 256-bit vector of [8 x i32] containing the result.
3302
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3303
_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3304
{
3305
  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3306
}
3307

3308
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3309
///    the 256-bit vector of [4 x double] in \a V as specified by the
3310
///    immediate value \a M.
3311
///
3312
/// \code{.operation}
3313
/// FOR i := 0 TO 3
3314
///   j := i*64
3315
///   k := (M >> i*2)[1:0] * 64
3316
///   result[j+63:j] := V[k+63:k]
3317
/// ENDFOR
3318
/// \endcode
3319
///
3320
/// \headerfile <immintrin.h>
3321
///
3322
/// \code
3323
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3324
/// \endcode
3325
///
3326
/// This intrinsic corresponds to the \c VPERMPD instruction.
3327
///
3328
/// \param V
3329
///    A 256-bit vector of [4 x double] containing the source values.
3330
/// \param M
3331
///    An immediate 8-bit value specifying which elements to copy from \a V.
3332
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3333
///    \a M[3:2] specifies the index for element 1, and so forth.
3334
/// \returns A 256-bit vector of [4 x double] containing the result.
3335
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256( \
      (__v4df)(__m256d)(V), /* source, viewed as [4 x double] */ \
      (int)(M)))            /* selector, passed to the builtin as int */
3337

3338
/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339
///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340
///    the elements of the 256-bit vector of [8 x i32] in \a __b.
3341
///
3342
/// \code{.operation}
3343
/// FOR i := 0 TO 7
3344
///   j := i*32
3345
///   k := __b[j+2:j] * 32
3346
///   result[j+31:j] := __a[k+31:k]
3347
/// ENDFOR
3348
/// \endcode
3349
///
3350
/// \headerfile <immintrin.h>
3351
///
3352
/// This intrinsic corresponds to the \c VPERMPS instruction.
3353
///
3354
/// \param __a
3355
///    A 256-bit vector of [8 x float] containing the source values.
3356
/// \param __b
3357
///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3358
///    \a __a.
3359
/// \returns A 256-bit vector of [8 x float] containing the result.
3360
static __inline__ __m256 __DEFAULT_FN_ATTRS256
3361
_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3362
{
3363
  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3364
}
3365

3366
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3367
///    of the 256-bit vector of [4 x i64] in \a V as specified by the
3368
///    immediate value \a M.
3369
///
3370
/// \code{.operation}
3371
/// FOR i := 0 TO 3
3372
///   j := i*64
3373
///   k := (M >> i*2)[1:0] * 64
3374
///   result[j+63:j] := V[k+63:k]
3375
/// ENDFOR
3376
/// \endcode
3377
///
3378
/// \headerfile <immintrin.h>
3379
///
3380
/// \code
3381
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3382
/// \endcode
3383
///
3384
/// This intrinsic corresponds to the \c VPERMQ instruction.
3385
///
3386
/// \param V
3387
///    A 256-bit vector of [4 x i64] containing the source values.
3388
/// \param M
3389
///    An immediate 8-bit value specifying which elements to copy from \a V.
3390
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3391
///    \a M[3:2] specifies the index for element 1, and so forth.
3392
/// \returns A 256-bit vector of [4 x i64] containing the result.
3393
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      (__v4di)(__m256i)(V), /* source, viewed as [4 x i64] */ \
      (int)(M)))            /* selector, passed to the builtin as int */
3395

3396
/// Sets each half of the 256-bit result either to zero or to one of the
3397
///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3398
///    as specified by the immediate value \a M.
3399
///
3400
/// \code{.operation}
3401
/// FOR i := 0 TO 1
3402
///   j := i*128
3403
///   k := M >> (i*4)
3404
///   IF k[3] == 0
3405
///     CASE (k[1:0]) OF
3406
///     0: result[127+j:j] := V1[127:0]
3407
///     1: result[127+j:j] := V1[255:128]
3408
///     2: result[127+j:j] := V2[127:0]
3409
///     3: result[127+j:j] := V2[255:128]
3410
///     ESAC
3411
///   ELSE
3412
///     result[127+j:j] := 0
3413
///   FI
3414
/// ENDFOR
3415
/// \endcode
3416
///
3417
/// \headerfile <immintrin.h>
3418
///
3419
/// \code
3420
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3421
/// \endcode
3422
///
3423
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
3424
///
3425
/// \param V1
3426
///    A 256-bit integer vector containing source values.
3427
/// \param V2
3428
///    A 256-bit integer vector containing source values.
3429
/// \param M
3430
///    An immediate value specifying how to form the result. Bits [3:0]
3431
///    control the lower half of the result, bits [7:4] control the upper half.
3432
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
3433
///    otherwise bits [1:0] determine the source as follows. \n
3434
///    0: the lower half of \a V1 \n
3435
///    1: the upper half of \a V1 \n
3436
///    2: the lower half of \a V2 \n
3437
///    3: the upper half of \a V2
3438
/// \returns A 256-bit integer vector containing the result.
3439
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), /* first 256-bit source */ \
      (__m256i)(V2), /* second 256-bit source */ \
      (int)(M)))     /* control value, passed to the builtin as int */
3441

3442
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3443
///     of the immediate \a M is zero, extracts the lower half of \a V;
3444
///     otherwise, extracts the upper half.
3445
///
3446
/// \headerfile <immintrin.h>
3447
///
3448
/// \code
3449
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3450
/// \endcode
3451
///
3452
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3453
///
3454
/// \param V
3455
///    A 256-bit integer vector containing the source values.
3456
/// \param M
3457
///    An immediate value specifying which half of \a V to extract.
3458
/// \returns A 128-bit integer vector containing the result.
3459
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), /* 256-bit source, viewed as [4 x i64] */ \
      (int)(M)))            /* half selector, passed to the builtin as int */
3461

3462
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3463
///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3464
///     is zero, overwrites the lower half of the result; otherwise,
3465
///     overwrites the upper half.
3466
///
3467
/// \headerfile <immintrin.h>
3468
///
3469
/// \code
3470
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3471
/// \endcode
3472
///
3473
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
3474
///
3475
/// \param V1
3476
///    A 256-bit integer vector containing a source value.
3477
/// \param V2
3478
///    A 128-bit integer vector containing a source value.
3479
/// \param M
3480
///    An immediate value specifying where to put \a V2 in the result.
3481
/// \returns A 256-bit integer vector containing the result.
3482
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), /* 256-bit base value, viewed as [4 x i64] */ \
      (__v2di)(__m128i)(V2), /* 128-bit value to insert, as [2 x i64] */ \
      (int)(M)))             /* position selector, passed as int */
3485

3486
/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487
///    the most significant bit of the corresponding element in the mask
3488
///    \a __M is set; otherwise, sets that element of the result to zero.
3489
///    Returns the 256-bit [8 x i32] result.
3490
///
3491
/// \code{.operation}
3492
/// FOR i := 0 TO 7
3493
///   j := i*32
3494
///   IF __M[j+31] == 1
3495
///     result[j+31:j] := Load32(__X+(i*4))
3496
///   ELSE
3497
///     result[j+31:j] := 0
3498
///   FI
3499
/// ENDFOR
3500
/// \endcode
3501
///
3502
/// \headerfile <immintrin.h>
3503
///
3504
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3505
///
3506
/// \param __X
3507
///    A pointer to the memory used for loading values.
3508
/// \param __M
3509
///    A 256-bit vector of [8 x i32] containing the mask bits.
3510
/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3511
///    elements.
3512
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513
_mm256_maskload_epi32(int const *__X, __m256i __M)
3514
{
3515
  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3516
}
3517

3518
/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519
///    the most significant bit of the corresponding element in the mask
3520
///    \a __M is set; otherwise, sets that element of the result to zero.
3521
///    Returns the 256-bit [4 x i64] result.
3522
///
3523
/// \code{.operation}
3524
/// FOR i := 0 TO 3
3525
///   j := i*64
3526
///   IF __M[j+63] == 1
3527
///     result[j+63:j] := Load64(__X+(i*8))
3528
///   ELSE
3529
///     result[j+63:j] := 0
3530
///   FI
3531
/// ENDFOR
3532
/// \endcode
3533
///
3534
/// \headerfile <immintrin.h>
3535
///
3536
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3537
///
3538
/// \param __X
3539
///    A pointer to the memory used for loading values.
3540
/// \param __M
3541
///    A 256-bit vector of [4 x i64] containing the mask bits.
3542
/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3543
///    elements.
3544
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545
_mm256_maskload_epi64(long long const *__X, __m256i __M)
3546
{
3547
  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3548
}
3549

3550
/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551
///    the most significant bit of the corresponding element in the mask
3552
///    \a __M is set; otherwise, sets that element of the result to zero.
3553
///    Returns the 128-bit [4 x i32] result.
3554
///
3555
/// \code{.operation}
3556
/// FOR i := 0 TO 3
3557
///   j := i*32
3558
///   IF __M[j+31] == 1
3559
///     result[j+31:j] := Load32(__X+(i*4))
3560
///   ELSE
3561
///     result[j+31:j] := 0
3562
///   FI
3563
/// ENDFOR
3564
/// \endcode
3565
///
3566
/// \headerfile <immintrin.h>
3567
///
3568
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3569
///
3570
/// \param __X
3571
///    A pointer to the memory used for loading values.
3572
/// \param __M
3573
///    A 128-bit vector of [4 x i32] containing the mask bits.
3574
/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3575
///    elements.
3576
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577
_mm_maskload_epi32(int const *__X, __m128i __M)
3578
{
3579
  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3580
}
3581

3582
/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583
///    the most significant bit of the corresponding element in the mask
3584
///    \a __M is set; otherwise, sets that element of the result to zero.
3585
///    Returns the 128-bit [2 x i64] result.
3586
///
3587
/// \code{.operation}
3588
/// FOR i := 0 TO 1
3589
///   j := i*64
3590
///   IF __M[j+63] == 1
3591
///     result[j+63:j] := Load64(__X+(i*8))
3592
///   ELSE
3593
///     result[j+63:j] := 0
3594
///   FI
3595
/// ENDFOR
3596
/// \endcode
3597
///
3598
/// \headerfile <immintrin.h>
3599
///
3600
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3601
///
3602
/// \param __X
3603
///    A pointer to the memory used for loading values.
3604
/// \param __M
3605
///    A 128-bit vector of [2 x i64] containing the mask bits.
3606
/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3607
///    elements.
3608
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609
_mm_maskload_epi64(long long const *__X, __m128i __M)
3610
{
3611
  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3612
}
3613

3614
/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615
///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616
///    the corresponding element in the mask \a __M is set; otherwise, the
3617
///    memory element is unchanged.
3618
///
3619
/// \code{.operation}
3620
/// FOR i := 0 TO 7
3621
///   j := i*32
3622
///   IF __M[j+31] == 1
3623
///     Store32(__X+(i*4), __Y[j+31:j])
3624
///   FI
3625
/// ENDFOR
3626
/// \endcode
3627
///
3628
/// \headerfile <immintrin.h>
3629
///
3630
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3631
///
3632
/// \param __X
3633
///    A pointer to the memory used for storing values.
3634
/// \param __M
3635
///    A 256-bit vector of [8 x i32] containing the mask bits.
3636
/// \param __Y
3637
///    A 256-bit vector of [8 x i32] containing the values to store.
3638
static __inline__ void __DEFAULT_FN_ATTRS256
3639
_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3640
{
3641
  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3642
}
3643

3644
/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645
///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646
///    the corresponding element in the mask \a __M is set; otherwise, the
3647
///    memory element is unchanged.
3648
///
3649
/// \code{.operation}
3650
/// FOR i := 0 TO 3
3651
///   j := i*64
3652
///   IF __M[j+63] == 1
3653
///     Store64(__X+(i*8), __Y[j+63:j])
3654
///   FI
3655
/// ENDFOR
3656
/// \endcode
3657
///
3658
/// \headerfile <immintrin.h>
3659
///
3660
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3661
///
3662
/// \param __X
3663
///    A pointer to the memory used for storing values.
3664
/// \param __M
3665
///    A 256-bit vector of [4 x i64] containing the mask bits.
3666
/// \param __Y
3667
///    A 256-bit vector of [4 x i64] containing the values to store.
3668
static __inline__ void __DEFAULT_FN_ATTRS256
3669
_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3670
{
3671
  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3672
}
3673

3674
/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675
///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676
///    the corresponding element in the mask \a __M is set; otherwise, the
3677
///    memory element is unchanged.
3678
///
3679
/// \code{.operation}
3680
/// FOR i := 0 TO 3
3681
///   j := i*32
3682
///   IF __M[j+31] == 1
3683
///     Store32(__X+(i*4), __Y[j+31:j])
3684
///   FI
3685
/// ENDFOR
3686
/// \endcode
3687
///
3688
/// \headerfile <immintrin.h>
3689
///
3690
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3691
///
3692
/// \param __X
3693
///    A pointer to the memory used for storing values.
3694
/// \param __M
3695
///    A 128-bit vector of [4 x i32] containing the mask bits.
3696
/// \param __Y
3697
///    A 128-bit vector of [4 x i32] containing the values to store.
3698
static __inline__ void __DEFAULT_FN_ATTRS128
3699
_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3700
{
3701
  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3702
}
3703

3704
/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705
///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706
///    the corresponding element in the mask \a __M is set; otherwise, the
3707
///    memory element is unchanged.
3708
///
3709
/// \code{.operation}
3710
/// FOR i := 0 TO 1
3711
///   j := i*64
3712
///   IF __M[j+63] == 1
3713
///     Store64(__X+(i*8), __Y[j+63:j])
3714
///   FI
3715
/// ENDFOR
3716
/// \endcode
3717
///
3718
/// \headerfile <immintrin.h>
3719
///
3720
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3721
///
3722
/// \param __X
3723
///    A pointer to the memory used for storing values.
3724
/// \param __M
3725
///    A 128-bit vector of [2 x i64] containing the mask bits.
3726
/// \param __Y
3727
///    A 128-bit vector of [2 x i64] containing the values to store.
3728
static __inline__ void __DEFAULT_FN_ATTRS128
3729
_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3730
{
3731
  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3732
}
3733

3734
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735
///    left by the number of bits given in the corresponding element of the
3736
///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737
///    returns the result. If the shift count for any element is greater than
3738
///    31, the result for that element is zero.
3739
///
3740
/// \headerfile <immintrin.h>
3741
///
3742
/// This intrinsic corresponds to the \c VPSLLVD instruction.
3743
///
3744
/// \param __X
3745
///    A 256-bit vector of [8 x i32] to be shifted.
3746
/// \param __Y
3747
///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3748
///    bits).
3749
/// \returns A 256-bit vector of [8 x i32] containing the result.
3750
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3752
{
3753
  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3754
}
3755

3756
/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757
///    left by the number of bits given in the corresponding element of the
3758
///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759
///    returns the result. If the shift count for any element is greater than
3760
///    31, the result for that element is zero.
3761
///
3762
/// \headerfile <immintrin.h>
3763
///
3764
/// This intrinsic corresponds to the \c VPSLLVD instruction.
3765
///
3766
/// \param __X
3767
///    A 128-bit vector of [4 x i32] to be shifted.
3768
/// \param __Y
3769
///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3770
///    bits).
3771
/// \returns A 128-bit vector of [4 x i32] containing the result.
3772
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773
_mm_sllv_epi32(__m128i __X, __m128i __Y)
3774
{
3775
  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3776
}
3777

3778
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779
///    left by the number of bits given in the corresponding element of the
3780
///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781
///    returns the result. If the shift count for any element is greater than
3782
///    63, the result for that element is zero.
3783
///
3784
/// \headerfile <immintrin.h>
3785
///
3786
/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3787
///
3788
/// \param __X
3789
///    A 256-bit vector of [4 x i64] to be shifted.
3790
/// \param __Y
3791
///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3792
///    bits).
3793
/// \returns A 256-bit vector of [4 x i64] containing the result.
3794
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3796
{
3797
  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3798
}
3799

3800
/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801
///    left by the number of bits given in the corresponding element of the
3802
///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803
///    returns the result. If the shift count for any element is greater than
3804
///    63, the result for that element is zero.
3805
///
3806
/// \headerfile <immintrin.h>
3807
///
3808
/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3809
///
3810
/// \param __X
3811
///    A 128-bit vector of [2 x i64] to be shifted.
3812
/// \param __Y
3813
///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3814
///    bits).
3815
/// \returns A 128-bit vector of [2 x i64] containing the result.
3816
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817
_mm_sllv_epi64(__m128i __X, __m128i __Y)
3818
{
3819
  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3820
}
3821

3822
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823
///    right by the number of bits given in the corresponding element of the
3824
///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825
///    returns the result. If the shift count for any element is greater than
3826
///    31, the result for that element is 0 or -1 according to the sign bit
3827
///    for that element.
3828
///
3829
/// \headerfile <immintrin.h>
3830
///
3831
/// This intrinsic corresponds to the \c VPSRAVD instruction.
3832
///
3833
/// \param __X
3834
///    A 256-bit vector of [8 x i32] to be shifted.
3835
/// \param __Y
3836
///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3837
///    bits).
3838
/// \returns A 256-bit vector of [8 x i32] containing the result.
3839
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840
_mm256_srav_epi32(__m256i __X, __m256i __Y)
3841
{
3842
  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3843
}
3844

3845
/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846
///    right by the number of bits given in the corresponding element of the
3847
///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848
///    returns the result. If the shift count for any element is greater than
3849
///    31, the result for that element is 0 or -1 according to the sign bit
3850
///    for that element.
3851
///
3852
/// \headerfile <immintrin.h>
3853
///
3854
/// This intrinsic corresponds to the \c VPSRAVD instruction.
3855
///
3856
/// \param __X
3857
///    A 128-bit vector of [4 x i32] to be shifted.
3858
/// \param __Y
3859
///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3860
///    bits).
3861
/// \returns A 128-bit vector of [4 x i32] containing the result.
3862
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863
_mm_srav_epi32(__m128i __X, __m128i __Y)
3864
{
3865
  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3866
}
3867

3868
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869
///    right by the number of bits given in the corresponding element of the
3870
///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871
///    returns the result. If the shift count for any element is greater than
3872
///    31, the result for that element is zero.
3873
///
3874
/// \headerfile <immintrin.h>
3875
///
3876
/// This intrinsic corresponds to the \c VPSRLVD instruction.
3877
///
3878
/// \param __X
3879
///    A 256-bit vector of [8 x i32] to be shifted.
3880
/// \param __Y
3881
///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3882
///    bits).
3883
/// \returns A 256-bit vector of [8 x i32] containing the result.
3884
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3886
{
3887
  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3888
}
3889

3890
/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891
///    right by the number of bits given in the corresponding element of the
3892
///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893
///    returns the result. If the shift count for any element is greater than
3894
///    31, the result for that element is zero.
3895
///
3896
/// \headerfile <immintrin.h>
3897
///
3898
/// This intrinsic corresponds to the \c VPSRLVD instruction.
3899
///
3900
/// \param __X
3901
///    A 128-bit vector of [4 x i32] to be shifted.
3902
/// \param __Y
3903
///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3904
///    bits).
3905
/// \returns A 128-bit vector of [4 x i32] containing the result.
3906
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907
_mm_srlv_epi32(__m128i __X, __m128i __Y)
3908
{
3909
  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3910
}
3911

3912
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913
///    right by the number of bits given in the corresponding element of the
3914
///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915
///    returns the result. If the shift count for any element is greater than
3916
///    63, the result for that element is zero.
3917
///
3918
/// \headerfile <immintrin.h>
3919
///
3920
/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3921
///
3922
/// \param __X
3923
///    A 256-bit vector of [4 x i64] to be shifted.
3924
/// \param __Y
3925
///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3926
///    bits).
3927
/// \returns A 256-bit vector of [4 x i64] containing the result.
3928
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3930
{
3931
  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3932
}
3933

3934
/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935
///    right by the number of bits given in the corresponding element of the
3936
///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937
///    returns the result. If the shift count for any element is greater than
3938
///    63, the result for that element is zero.
3939
///
3940
/// \headerfile <immintrin.h>
3941
///
3942
/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3943
///
3944
/// \param __X
3945
///    A 128-bit vector of [2 x i64] to be shifted.
3946
/// \param __Y
3947
///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3948
///    bits).
3949
/// \returns A 128-bit vector of [2 x i64] containing the result.
3950
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951
_mm_srlv_epi64(__m128i __X, __m128i __Y)
3952
{
3953
  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3954
}
3955

3956
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding element of \a a is
///    copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s)                                \
  ((__m128d)__builtin_ia32_gatherd_pd(                                         \
      (__v2df)(__m128d)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
      (__v2df)(__m128d)(mask), (s)))
4004

4005
/// Conditionally gathers four 64-bit floating-point values, either from the
4006
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4007
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4008
///    of [4 x double] in \a mask determines the source for each element.
4009
///
4010
/// \code{.operation}
4011
/// FOR element := 0 to 3
4012
///   j := element*64
4013
///   k := element*32
4014
///   IF mask[j+63] == 0
4015
///     result[j+63:j] := a[j+63:j]
4016
///   ELSE
4017
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4018
///   FI
4019
/// ENDFOR
4020
/// \endcode
4021
///
4022
/// \headerfile <immintrin.h>
4023
///
4024
/// \code
4025
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4026
///                                  __m256d mask, const int s);
4027
/// \endcode
4028
///
4029
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4030
///
4031
/// \param a
4032
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4033
///    zero.
4034
/// \param m
4035
///    A pointer to the memory used for loading values.
4036
/// \param i
4037
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4038
/// \param mask
4039
///    A 256-bit vector of [4 x double] containing the mask. The most
4040
///    significant bit of each element in the mask vector represents the mask
4041
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4042
///    is gathered; otherwise the value is loaded from memory.
4043
/// \param s
4044
///    A literal constant scale factor for the indexes in \a i. Must be
4045
///    1, 2, 4, or 8.
4046
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4047
#define _mm256_mask_i32gather_pd(a, m, i, mask, s)                             \
  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
      (__v4df)(__m256d)(mask), (s)))
4052

4053
/// Conditionally gathers two 64-bit floating-point values, either from the
4054
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056
///    of [2 x double] in \a mask determines the source for each element.
4057
///
4058
/// \code{.operation}
4059
/// FOR element := 0 to 1
4060
///   j := element*64
4061
///   k := element*64
4062
///   IF mask[j+63] == 0
4063
///     result[j+63:j] := a[j+63:j]
4064
///   ELSE
4065
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4066
///   FI
4067
/// ENDFOR
4068
/// \endcode
4069
///
4070
/// \headerfile <immintrin.h>
4071
///
4072
/// \code
4073
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074
///                               __m128d mask, const int s);
4075
/// \endcode
4076
///
4077
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4078
///
4079
/// \param a
4080
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
4081
///    zero.
4082
/// \param m
4083
///    A pointer to the memory used for loading values.
4084
/// \param i
4085
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4086
/// \param mask
4087
///    A 128-bit vector of [2 x double] containing the mask. The most
4088
///    significant bit of each element in the mask vector represents the mask
4089
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4090
///    is gathered; otherwise the value is loaded from memory.
4091
/// \param s
4092
///    A literal constant scale factor for the indexes in \a i. Must be
4093
///    1, 2, 4, or 8.
4094
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4095
#define _mm_mask_i64gather_pd(a, m, i, mask, s)                                \
  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i),         \
      (__v2df)(__m128d)(mask), (s)))
4100

4101
/// Conditionally gathers four 64-bit floating-point values, either from the
4102
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104
///    of [4 x double] in \a mask determines the source for each element.
4105
///
4106
/// \code{.operation}
4107
/// FOR element := 0 to 3
4108
///   j := element*64
4109
///   k := element*64
4110
///   IF mask[j+63] == 0
4111
///     result[j+63:j] := a[j+63:j]
4112
///   ELSE
4113
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4114
///   FI
4115
/// ENDFOR
4116
/// \endcode
4117
///
4118
/// \headerfile <immintrin.h>
4119
///
4120
/// \code
4121
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122
///                                  __m256d mask, const int s);
4123
/// \endcode
4124
///
4125
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4126
///
4127
/// \param a
4128
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4129
///    zero.
4130
/// \param m
4131
///    A pointer to the memory used for loading values.
4132
/// \param i
4133
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4134
/// \param mask
4135
///    A 256-bit vector of [4 x double] containing the mask. The most
4136
///    significant bit of each element in the mask vector represents the mask
4137
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4138
///    is gathered; otherwise the value is loaded from memory.
4139
/// \param s
4140
///    A literal constant scale factor for the indexes in \a i. Must be
4141
///    1, 2, 4, or 8.
4142
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4143
#define _mm256_mask_i64gather_pd(a, m, i, mask, s)                             \
  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i),         \
      (__v4df)(__m256d)(mask), (s)))
4148

4149
/// Conditionally gathers four 32-bit floating-point values, either from the
4150
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152
///    of [4 x float] in \a mask determines the source for each element.
4153
///
4154
/// \code{.operation}
4155
/// FOR element := 0 to 3
4156
///   j := element*32
4157
///   k := element*32
4158
///   IF mask[j+31] == 0
4159
///     result[j+31:j] := a[j+31:j]
4160
///   ELSE
4161
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4162
///   FI
4163
/// ENDFOR
4164
/// \endcode
4165
///
4166
/// \headerfile <immintrin.h>
4167
///
4168
/// \code
4169
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170
///                              __m128 mask, const int s);
4171
/// \endcode
4172
///
4173
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4174
///
4175
/// \param a
4176
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4177
///    zero.
4178
/// \param m
4179
///    A pointer to the memory used for loading values.
4180
/// \param i
4181
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4182
/// \param mask
4183
///    A 128-bit vector of [4 x float] containing the mask. The most
4184
///    significant bit of each element in the mask vector represents the mask
4185
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4186
///    is gathered; otherwise the value is loaded from memory.
4187
/// \param s
4188
///    A literal constant scale factor for the indexes in \a i. Must be
4189
///    1, 2, 4, or 8.
4190
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4191
#define _mm_mask_i32gather_ps(a, m, i, mask, s)                                \
  ((__m128)__builtin_ia32_gatherd_ps(                                          \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4si)(__m128i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4196

4197
/// Conditionally gathers eight 32-bit floating-point values, either from the
4198
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200
///    of [8 x float] in \a mask determines the source for each element.
4201
///
4202
/// \code{.operation}
4203
/// FOR element := 0 to 7
4204
///   j := element*32
4205
///   k := element*32
4206
///   IF mask[j+31] == 0
4207
///     result[j+31:j] := a[j+31:j]
4208
///   ELSE
4209
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4210
///   FI
4211
/// ENDFOR
4212
/// \endcode
4213
///
4214
/// \headerfile <immintrin.h>
4215
///
4216
/// \code
4217
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218
///                                 __m256 mask, const int s);
4219
/// \endcode
4220
///
4221
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4222
///
4223
/// \param a
4224
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
4225
///    zero.
4226
/// \param m
4227
///    A pointer to the memory used for loading values.
4228
/// \param i
4229
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4230
/// \param mask
4231
///    A 256-bit vector of [8 x float] containing the mask. The most
4232
///    significant bit of each element in the mask vector represents the mask
4233
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4234
///    is gathered; otherwise the value is loaded from memory.
4235
/// \param s
4236
///    A literal constant scale factor for the indexes in \a i. Must be
4237
///    1, 2, 4, or 8.
4238
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4239
#define _mm256_mask_i32gather_ps(a, m, i, mask, s)                             \
  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i),           \
      (__v8sf)(__m256)(mask), (s)))
4244

4245
/// Conditionally gathers two 32-bit floating-point values, either from the
4246
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248
///    of [4 x float] in \a mask determines the source for the lower two
4249
///    elements. The upper two elements of the result are zeroed.
4250
///
4251
/// \code{.operation}
4252
/// FOR element := 0 to 1
4253
///   j := element*32
4254
///   k := element*64
4255
///   IF mask[j+31] == 0
4256
///     result[j+31:j] := a[j+31:j]
4257
///   ELSE
4258
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4259
///   FI
4260
/// ENDFOR
4261
/// result[127:64] := 0
4262
/// \endcode
4263
///
4264
/// \headerfile <immintrin.h>
4265
///
4266
/// \code
4267
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268
///                              __m128 mask, const int s);
4269
/// \endcode
4270
///
4271
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4272
///
4273
/// \param a
4274
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4275
///    zero. Only the first two elements are used.
4276
/// \param m
4277
///    A pointer to the memory used for loading values.
4278
/// \param i
4279
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4280
/// \param mask
4281
///    A 128-bit vector of [4 x float] containing the mask. The most
4282
///    significant bit of each element in the mask vector represents the mask
4283
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4284
///    is gathered; otherwise the value is loaded from memory. Only the first
4285
///    two elements are used.
4286
/// \param s
4287
///    A literal constant scale factor for the indexes in \a i. Must be
4288
///    1, 2, 4, or 8.
4289
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4290
#define _mm_mask_i64gather_ps(a, m, i, mask, s)                                \
  ((__m128)__builtin_ia32_gatherq_ps(                                          \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4295

4296
/// Conditionally gathers four 32-bit floating-point values, either from the
4297
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299
///    of [4 x float] in \a mask determines the source for each element.
4300
///
4301
/// \code{.operation}
4302
/// FOR element := 0 to 3
4303
///   j := element*32
4304
///   k := element*64
4305
///   IF mask[j+31] == 0
4306
///     result[j+31:j] := a[j+31:j]
4307
///   ELSE
4308
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4309
///   FI
4310
/// ENDFOR
4311
/// \endcode
4312
///
4313
/// \headerfile <immintrin.h>
4314
///
4315
/// \code
4316
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317
///                                 __m128 mask, const int s);
4318
/// \endcode
4319
///
4320
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4321
///
4322
/// \param a
4323
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4324
///    zero.
4325
/// \param m
4326
///    A pointer to the memory used for loading values.
4327
/// \param i
4328
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4329
/// \param mask
4330
///    A 128-bit vector of [4 x float] containing the mask. The most
4331
///    significant bit of each element in the mask vector represents the mask
4332
///    bits. If a mask bit is zero, the corresponding value from vector \a a
4333
///    is gathered; otherwise the value is loaded from memory.
4334
/// \param s
4335
///    A literal constant scale factor for the indexes in \a i. Must be
4336
///    1, 2, 4, or 8.
4337
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4338
#define _mm256_mask_i64gather_ps(a, m, i, mask, s)                             \
  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4343

4344
/// Conditionally gathers four 32-bit integer values, either from the
4345
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347
///    of [4 x i32] in \a mask determines the source for each element.
4348
///
4349
/// \code{.operation}
4350
/// FOR element := 0 to 3
4351
///   j := element*32
4352
///   k := element*32
4353
///   IF mask[j+31] == 0
4354
///     result[j+31:j] := a[j+31:j]
4355
///   ELSE
4356
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4357
///   FI
4358
/// ENDFOR
4359
/// \endcode
4360
///
4361
/// \headerfile <immintrin.h>
4362
///
4363
/// \code
4364
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365
///                                  __m128i mask, const int s);
4366
/// \endcode
4367
///
4368
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4369
///
4370
/// \param a
4371
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4372
///    zero.
4373
/// \param m
4374
///    A pointer to the memory used for loading values.
4375
/// \param i
4376
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4377
/// \param mask
4378
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4379
///    bit of each element in the mask vector represents the mask bits. If a
4380
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4381
///    otherwise the value is loaded from memory.
4382
/// \param s
4383
///    A literal constant scale factor for the indexes in \a i. Must be
4384
///    1, 2, 4, or 8.
4385
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4386
#define _mm_mask_i32gather_epi32(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherd_d(                                          \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4391

4392
/// Conditionally gathers eight 32-bit integer values, either from the
4393
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395
///    of [8 x i32] in \a mask determines the source for each element.
4396
///
4397
/// \code{.operation}
4398
/// FOR element := 0 to 7
4399
///   j := element*32
4400
///   k := element*32
4401
///   IF mask[j+31] == 0
4402
///     result[j+31:j] := a[j+31:j]
4403
///   ELSE
4404
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4405
///   FI
4406
/// ENDFOR
4407
/// \endcode
4408
///
4409
/// \headerfile <immintrin.h>
4410
///
4411
/// \code
4412
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413
///                                     __m256i mask, const int s);
4414
/// \endcode
4415
///
4416
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4417
///
4418
/// \param a
4419
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
4420
///    zero.
4421
/// \param m
4422
///    A pointer to the memory used for loading values.
4423
/// \param i
4424
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4425
/// \param mask
4426
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
4427
///    bit of each element in the mask vector represents the mask bits. If a
4428
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4429
///    otherwise the value is loaded from memory.
4430
/// \param s
4431
///    A literal constant scale factor for the indexes in \a i. Must be
4432
///    1, 2, 4, or 8.
4433
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4434
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i),            \
      (__v8si)(__m256i)(mask), (s)))
4439

4440
/// Conditionally gathers two 32-bit integer values, either from the
4441
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443
///    of [4 x i32] in \a mask determines the source for the lower two
4444
///    elements. The upper two elements of the result are zeroed.
4445
///
4446
/// \code{.operation}
4447
/// FOR element := 0 to 1
4448
///   j := element*32
4449
///   k := element*64
4450
///   IF mask[j+31] == 0
4451
///     result[j+31:j] := a[j+31:j]
4452
///   ELSE
4453
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4454
///   FI
4455
/// ENDFOR
4456
/// result[127:64] := 0
4457
/// \endcode
4458
///
4459
/// \headerfile <immintrin.h>
4460
///
4461
/// \code
4462
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463
///                                  __m128i mask, const int s);
4464
/// \endcode
4465
///
4466
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4467
///
4468
/// \param a
4469
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470
///    zero. Only the first two elements are used.
4471
/// \param m
4472
///    A pointer to the memory used for loading values.
4473
/// \param i
4474
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4475
/// \param mask
4476
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4477
///    bit of each element in the mask vector represents the mask bits. If a
4478
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4479
///    otherwise the value is loaded from memory. Only the first two elements
4480
///    are used.
4481
/// \param s
4482
///    A literal constant scale factor for the indexes in \a i. Must be
4483
///    1, 2, 4, or 8.
4484
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4485
#define _mm_mask_i64gather_epi32(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherq_d(                                          \
      (__v4si)(__m128i)(a), (int const *)(m), (__v2di)(__m128i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4490

4491
/// Conditionally gathers four 32-bit integer values, either from the
4492
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494
///    of [4 x i32] in \a mask determines the source for each element.
4495
///
4496
/// \code{.operation}
4497
/// FOR element := 0 to 3
4498
///   j := element*32
4499
///   k := element*64
4500
///   IF mask[j+31] == 0
4501
///     result[j+31:j] := a[j+31:j]
4502
///   ELSE
4503
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4504
///   FI
4505
/// ENDFOR
4506
/// \endcode
4507
///
4508
/// \headerfile <immintrin.h>
4509
///
4510
/// \code
4511
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512
///                                     __m128i mask, const int s);
4513
/// \endcode
4514
///
4515
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4516
///
4517
/// \param a
4518
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4519
///    zero.
4520
/// \param m
4521
///    A pointer to the memory used for loading values.
4522
/// \param i
4523
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4524
/// \param mask
4525
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4526
///    bit of each element in the mask vector represents the mask bits. If a
4527
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4528
///    otherwise the value is loaded from memory.
4529
/// \param s
4530
///    A literal constant scale factor for the indexes in \a i. Must be
4531
///    1, 2, 4, or 8.
4532
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4533
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)                          \
  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4538

4539
/// Conditionally gathers two 64-bit integer values, either from the
4540
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542
///    of [2 x i64] in \a mask determines the source for each element.
4543
///
4544
/// \code{.operation}
4545
/// FOR element := 0 to 1
4546
///   j := element*64
4547
///   k := element*32
4548
///   IF mask[j+63] == 0
4549
///     result[j+63:j] := a[j+63:j]
4550
///   ELSE
4551
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4552
///   FI
4553
/// ENDFOR
4554
/// \endcode
4555
///
4556
/// \headerfile <immintrin.h>
4557
///
4558
/// \code
4559
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560
///                                  __m128i mask, const int s);
4561
/// \endcode
4562
///
4563
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4564
///
4565
/// \param a
4566
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4567
///    zero.
4568
/// \param m
4569
///    A pointer to the memory used for loading values.
4570
/// \param i
4571
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572
///    the first two elements are used.
4573
/// \param mask
4574
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4575
///    bit of each element in the mask vector represents the mask bits. If a
4576
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4577
///    otherwise the value is loaded from memory.
4578
/// \param s
4579
///    A literal constant scale factor for the indexes in \a i. Must be
4580
///    1, 2, 4, or 8.
4581
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4582
#define _mm_mask_i32gather_epi64(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherd_q(                                          \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
      (__v2di)(__m128i)(mask), (s)))
4587

4588
/// Conditionally gathers four 64-bit integer values, either from the
4589
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591
///    of [4 x i64] in \a mask determines the source for each element.
4592
///
4593
/// \code{.operation}
4594
/// FOR element := 0 to 3
4595
///   j := element*64
4596
///   k := element*32
4597
///   IF mask[j+63] == 0
4598
///     result[j+63:j] := a[j+63:j]
4599
///   ELSE
4600
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4601
///   FI
4602
/// ENDFOR
4603
/// \endcode
4604
///
4605
/// \headerfile <immintrin.h>
4606
///
4607
/// \code
4608
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609
///                                     __m128i i, __m256i mask, const int s);
4610
/// \endcode
4611
///
4612
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4613
///
4614
/// \param a
4615
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4616
///    zero.
4617
/// \param m
4618
///    A pointer to the memory used for loading values.
4619
/// \param i
4620
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4621
/// \param mask
4622
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4623
///    bit of each element in the mask vector represents the mask bits. If a
4624
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4625
///    otherwise the value is loaded from memory.
4626
/// \param s
4627
///    A literal constant scale factor for the indexes in \a i. Must be
4628
///    1, 2, 4, or 8.
4629
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4630
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
      (__v4di)(__m256i)(mask), (s)))
4635

4636
/// Conditionally gathers two 64-bit integer values, either from the
4637
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4639
///    of [2 x i64] in \a mask determines the source for each element.
4640
///
4641
/// \code{.operation}
4642
/// FOR element := 0 to 1
4643
///   j := element*64
4644
///   k := element*64
4645
///   IF mask[j+63] == 0
4646
///     result[j+63:j] := a[j+63:j]
4647
///   ELSE
4648
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4649
///   FI
4650
/// ENDFOR
4651
/// \endcode
4652
///
4653
/// \headerfile <immintrin.h>
4654
///
4655
/// \code
4656
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4657
///                                  __m128i mask, const int s);
4658
/// \endcode
4659
///
4660
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4661
///
4662
/// \param a
4663
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4664
///    zero.
4665
/// \param m
4666
///    A pointer to the memory used for loading values.
4667
/// \param i
4668
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4669
/// \param mask
4670
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4671
///    bit of each element in the mask vector represents the mask bits. If a
4672
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4673
///    otherwise the value is loaded from memory.
4674
/// \param s
4675
///    A literal constant scale factor for the indexes in \a i. Must be
4676
///    1, 2, 4, or 8.
4677
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4678
/* Forwards all five operands to __builtin_ia32_gatherq_q, viewing the
   source, index and mask vectors as __v2di. */
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), \
      (const long long *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), \
      (s)))
4683

4684
/// Conditionally gathers four 64-bit integer values, either from the
4685
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4687
///    of [4 x i64] in \a mask determines the source for each element.
4688
///
4689
/// \code{.operation}
4690
/// FOR element := 0 to 3
4691
///   j := element*64
4692
///   k := element*64
4693
///   IF mask[j+63] == 0
4694
///     result[j+63:j] := a[j+63:j]
4695
///   ELSE
4696
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4697
///   FI
4698
/// ENDFOR
4699
/// \endcode
4700
///
4701
/// \headerfile <immintrin.h>
4702
///
4703
/// \code
4704
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4705
///                                     __m256i i, __m256i mask, const int s);
4706
/// \endcode
4707
///
4708
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4709
///
4710
/// \param a
4711
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4712
///    zero.
4713
/// \param m
4714
///    A pointer to the memory used for loading values.
4715
/// \param i
4716
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4717
/// \param mask
4718
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4719
///    bit of each element in the mask vector represents the mask bits. If a
4720
///    mask bit is zero, the corresponding value from vector \a a is gathered;
4721
///    otherwise the value is loaded from memory.
4722
/// \param s
4723
///    A literal constant scale factor for the indexes in \a i. Must be
4724
///    1, 2, 4, or 8.
4725
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4726
/* Forwards all five operands to __builtin_ia32_gatherq_q256, viewing the
   source, index and mask vectors as __v4di. */
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), \
      (const long long *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), \
      (s)))
4731

4732
/// Gathers two 64-bit floating-point values from memory \a m using scaled
4733
///    indexes from the 128-bit vector of [4 x i32] in \a i.
4734
///
4735
/// \code{.operation}
4736
/// FOR element := 0 to 1
4737
///   j := element*64
4738
///   k := element*32
4739
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4740
/// ENDFOR
4741
/// \endcode
4742
///
4743
/// \headerfile <immintrin.h>
4744
///
4745
/// \code
4746
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4747
/// \endcode
4748
///
4749
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4750
///
4751
/// \param m
4752
///    A pointer to the memory used for loading values.
4753
/// \param i
4754
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4755
///    the first two elements are used.
4756
/// \param s
4757
///    A literal constant scale factor for the indexes in \a i. Must be
4758
///    1, 2, 4, or 8.
4759
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4760
/* Unmasked form: the pass-through operand is _mm_undefined_pd() and the mask
   is produced by comparing zero with itself for equality. */
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), \
      (const double *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))
4767

4768
/// Gathers four 64-bit floating-point values from memory \a m using scaled
4769
///    indexes from the 128-bit vector of [4 x i32] in \a i.
4770
///
4771
/// \code{.operation}
4772
/// FOR element := 0 to 3
4773
///   j := element*64
4774
///   k := element*32
4775
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4776
/// ENDFOR
4777
/// \endcode
4778
///
4779
/// \headerfile <immintrin.h>
4780
///
4781
/// \code
4782
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4783
/// \endcode
4784
///
4785
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4786
///
4787
/// \param m
4788
///    A pointer to the memory used for loading values.
4789
/// \param i
4790
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4791
/// \param s
4792
///    A literal constant scale factor for the indexes in \a i. Must be
4793
///    1, 2, 4, or 8.
4794
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4795
/* Unmasked form: the pass-through operand is _mm256_undefined_pd() and the
   mask is produced by an ordered-equal compare of zero with itself. */
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), \
      (const double *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4803

4804
/// Gathers two 64-bit floating-point values from memory \a m using scaled
4805
///    indexes from the 128-bit vector of [2 x i64] in \a i.
4806
///
4807
/// \code{.operation}
4808
/// FOR element := 0 to 1
4809
///   j := element*64
4810
///   k := element*64
4811
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4812
/// ENDFOR
4813
/// \endcode
4814
///
4815
/// \headerfile <immintrin.h>
4816
///
4817
/// \code
4818
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4819
/// \endcode
4820
///
4821
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4822
///
4823
/// \param m
4824
///    A pointer to the memory used for loading values.
4825
/// \param i
4826
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4827
/// \param s
4828
///    A literal constant scale factor for the indexes in \a i. Must be
4829
///    1, 2, 4, or 8.
4830
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4831
/* Unmasked form: the pass-through operand is _mm_undefined_pd() and the mask
   is produced by comparing zero with itself for equality. */
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), \
      (const double *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))
4838

4839
/// Gathers four 64-bit floating-point values from memory \a m using scaled
4840
///    indexes from the 256-bit vector of [4 x i64] in \a i.
4841
///
4842
/// \code{.operation}
4843
/// FOR element := 0 to 3
4844
///   j := element*64
4845
///   k := element*64
4846
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4847
/// ENDFOR
4848
/// \endcode
4849
///
4850
/// \headerfile <immintrin.h>
4851
///
4852
/// \code
4853
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4854
/// \endcode
4855
///
4856
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4857
///
4858
/// \param m
4859
///    A pointer to the memory used for loading values.
4860
/// \param i
4861
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4862
/// \param s
4863
///    A literal constant scale factor for the indexes in \a i. Must be
4864
///    1, 2, 4, or 8.
4865
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4866
/* Unmasked form: the pass-through operand is _mm256_undefined_pd() and the
   mask is produced by an ordered-equal compare of zero with itself. */
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), \
      (const double *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4874

4875
/// Gathers four 32-bit floating-point values from memory \a m using scaled
4876
///    indexes from the 128-bit vector of [4 x i32] in \a i.
4877
///
4878
/// \code{.operation}
4879
/// FOR element := 0 to 3
4880
///   j := element*32
4881
///   k := element*32
4882
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4883
/// ENDFOR
4884
/// \endcode
4885
///
4886
/// \headerfile <immintrin.h>
4887
///
4888
/// \code
4889
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4890
/// \endcode
4891
///
4892
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4893
///
4894
/// \param m
4895
///    A pointer to the memory used for loading values.
4896
/// \param i
4897
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4898
/// \param s
4899
///    A literal constant scale factor for the indexes in \a i. Must be
4900
///    1, 2, 4, or 8.
4901
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4902
/* Unmasked form: the pass-through operand is _mm_undefined_ps() and the mask
   is produced by comparing zero with itself for equality. */
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), \
      (const float *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
4909

4910
/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911
///    indexes from the 256-bit vector of [8 x i32] in \a i.
4912
///
4913
/// \code{.operation}
4914
/// FOR element := 0 to 7
4915
///   j := element*32
4916
///   k := element*32
4917
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4918
/// ENDFOR
4919
/// \endcode
4920
///
4921
/// \headerfile <immintrin.h>
4922
///
4923
/// \code
4924
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4925
/// \endcode
4926
///
4927
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4928
///
4929
/// \param m
4930
///    A pointer to the memory used for loading values.
4931
/// \param i
4932
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4933
/// \param s
4934
///    A literal constant scale factor for the indexes in \a i. Must be
4935
///    1, 2, 4, or 8.
4936
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4937
/* Unmasked form: the pass-through operand is _mm256_undefined_ps() and the
   mask is produced by an ordered-equal compare of zero with itself. */
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), \
      (const float *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
4945

4946
/// Gathers two 32-bit floating-point values from memory \a m using scaled
4947
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4948
///    elements of the result are zeroed.
4949
///
4950
/// \code{.operation}
4951
/// FOR element := 0 to 1
4952
///   j := element*32
4953
///   k := element*64
4954
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4955
/// ENDFOR
4956
/// result[127:64] := 0
4957
/// \endcode
4958
///
4959
/// \headerfile <immintrin.h>
4960
///
4961
/// \code
4962
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4963
/// \endcode
4964
///
4965
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4966
///
4967
/// \param m
4968
///    A pointer to the memory used for loading values.
4969
/// \param i
4970
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4971
/// \param s
4972
///    A literal constant scale factor for the indexes in \a i. Must be
4973
///    1, 2, 4, or 8.
4974
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4975
/* Unmasked form: the pass-through operand is _mm_undefined_ps() and the mask
   is produced by comparing zero with itself for equality. */
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), \
      (const float *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
4982

4983
/// Gathers four 32-bit floating-point values from memory \a m using scaled
4984
///    indexes from the 256-bit vector of [4 x i64] in \a i.
4985
///
4986
/// \code{.operation}
4987
/// FOR element := 0 to 3
4988
///   j := element*32
4989
///   k := element*64
4990
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4991
/// ENDFOR
4992
/// \endcode
4993
///
4994
/// \headerfile <immintrin.h>
4995
///
4996
/// \code
4997
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4998
/// \endcode
4999
///
5000
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
5001
///
5002
/// \param m
5003
///    A pointer to the memory used for loading values.
5004
/// \param i
5005
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5006
/// \param s
5007
///    A literal constant scale factor for the indexes in \a i. Must be
5008
///    1, 2, 4, or 8.
5009
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
5010
/* Unmasked form: the index vector is 256 bits wide while the pass-through
   operand (_mm_undefined_ps()), the mask (zero compared with itself for
   equality) and the result are 128-bit [4 x float] vectors. */
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)_mm_undefined_ps(), \
      (const float *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
5017

5018
/// Gathers four 32-bit integer values from memory \a m using scaled
5019
///    indexes from the 128-bit vector of [4 x i32] in \a i.
5020
///
5021
/// \code{.operation}
5022
/// FOR element := 0 to 3
5023
///   j := element*32
5024
///   k := element*32
5025
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5026
/// ENDFOR
5027
/// \endcode
5028
///
5029
/// \headerfile <immintrin.h>
5030
///
5031
/// \code
5032
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5033
/// \endcode
5034
///
5035
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5036
///
5037
/// \param m
5038
///    A pointer to the memory used for loading values.
5039
/// \param i
5040
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5041
/// \param s
5042
///    A literal constant scale factor for the indexes in \a i. Must be
5043
///    1, 2, 4, or 8.
5044
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5045
/* Unmasked form: the pass-through operand is _mm_undefined_si128() and the
   mask is the all-ones vector _mm_set1_epi32(-1). */
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
5049

5050
/// Gathers eight 32-bit integer values from memory \a m using scaled
5051
///    indexes from the 256-bit vector of [8 x i32] in \a i.
5052
///
5053
/// \code{.operation}
5054
/// FOR element := 0 to 7
5055
///   j := element*32
5056
///   k := element*32
5057
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5058
/// ENDFOR
5059
/// \endcode
5060
///
5061
/// \headerfile <immintrin.h>
5062
///
5063
/// \code
5064
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5065
/// \endcode
5066
///
5067
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5068
///
5069
/// \param m
5070
///    A pointer to the memory used for loading values.
5071
/// \param i
5072
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5073
/// \param s
5074
///    A literal constant scale factor for the indexes in \a i. Must be
5075
///    1, 2, 4, or 8.
5076
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5077
/* Unmasked form: the pass-through operand is _mm256_undefined_si256() and the
   mask is the all-ones vector _mm256_set1_epi32(-1). */
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)_mm256_undefined_si256(), \
      (const int *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8si)_mm256_set1_epi32(-1), \
      (s)))
5081

5082
/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083
///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5084
///    of the result are zeroed.
5085
///
5086
/// \code{.operation}
5087
/// FOR element := 0 to 1
5088
///   j := element*32
5089
///   k := element*64
5090
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5091
/// ENDFOR
5092
/// result[127:64] := 0
5093
/// \endcode
5094
///
5095
/// \headerfile <immintrin.h>
5096
///
5097
/// \code
5098
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5099
/// \endcode
5100
///
5101
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5102
///
5103
/// \param m
5104
///    A pointer to the memory used for loading values.
5105
/// \param i
5106
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5107
/// \param s
5108
///    A literal constant scale factor for the indexes in \a i. Must be
5109
///    1, 2, 4, or 8.
5110
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5111
/* Unmasked form: the pass-through operand is _mm_undefined_si128() and the
   mask is the all-ones vector _mm_set1_epi32(-1). */
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
5115

5116
/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5117
///    from the 256-bit vector of [4 x i64] in \a i.
5118
///
5119
/// \code{.operation}
5120
/// FOR element := 0 to 3
5121
///   j := element*32
5122
///   k := element*64
5123
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5124
/// ENDFOR
5125
/// \endcode
5126
///
5127
/// \headerfile <immintrin.h>
5128
///
5129
/// \code
5130
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5131
/// \endcode
5132
///
5133
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5134
///
5135
/// \param m
5136
///    A pointer to the memory used for loading values.
5137
/// \param i
5138
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5139
/// \param s
5140
///    A literal constant scale factor for the indexes in \a i. Must be
5141
///    1, 2, 4, or 8.
5142
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5143
/* Unmasked form: the index vector is 256 bits wide while the pass-through
   operand (_mm_undefined_si128()), the all-ones mask (_mm_set1_epi32(-1)) and
   the result are 128-bit [4 x i32] vectors. */
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
5147

5148
/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5149
///    from the 128-bit vector of [4 x i32] in \a i.
5150
///
5151
/// \code{.operation}
5152
/// FOR element := 0 to 1
5153
///   j := element*64
5154
///   k := element*32
5155
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5156
/// ENDFOR
5157
/// \endcode
5158
///
5159
/// \headerfile <immintrin.h>
5160
///
5161
/// \code
5162
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5163
/// \endcode
5164
///
5165
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5166
///
5167
/// \param m
5168
///    A pointer to the memory used for loading values.
5169
/// \param i
5170
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5171
///    the first two elements are used.
5172
/// \param s
5173
///    A literal constant scale factor for the indexes in \a i. Must be
5174
///    1, 2, 4, or 8.
5175
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5176
/* Unmasked form: the pass-through operand is _mm_undefined_si128() and the
   mask is the all-ones vector _mm_set1_epi64x(-1). */
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)_mm_undefined_si128(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))
5181

5182
/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183
///    from the 128-bit vector of [4 x i32] in \a i.
5184
///
5185
/// \code{.operation}
5186
/// FOR element := 0 to 3
5187
///   j := element*64
5188
///   k := element*32
5189
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5190
/// ENDFOR
5191
/// \endcode
5192
///
5193
/// \headerfile <immintrin.h>
5194
///
5195
/// \code
5196
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5197
/// \endcode
5198
///
5199
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5200
///
5201
/// \param m
5202
///    A pointer to the memory used for loading values.
5203
/// \param i
5204
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5205
/// \param s
5206
///    A literal constant scale factor for the indexes in \a i. Must be
5207
///    1, 2, 4, or 8.
5208
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5209
/* Unmasked form: the pass-through operand is _mm256_undefined_si256() and the
   mask is the all-ones vector _mm256_set1_epi64x(-1). */
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)_mm256_undefined_si256(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))
5214

5215
/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5216
///    from the 128-bit vector of [2 x i64] in \a i.
5217
///
5218
/// \code{.operation}
5219
/// FOR element := 0 to 1
5220
///   j := element*64
5221
///   k := element*64
5222
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5223
/// ENDFOR
5224
/// \endcode
5225
///
5226
/// \headerfile <immintrin.h>
5227
///
5228
/// \code
5229
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5230
/// \endcode
5231
///
5232
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5233
///
5234
/// \param m
5235
///    A pointer to the memory used for loading values.
5236
/// \param i
5237
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5238
/// \param s
5239
///    A literal constant scale factor for the indexes in \a i. Must be
5240
///    1, 2, 4, or 8.
5241
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5242
/* Unmasked form: the pass-through operand is _mm_undefined_si128() and the
   mask is the all-ones vector _mm_set1_epi64x(-1). */
#define _mm_i64gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)_mm_undefined_si128(), \
      (const long long *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))
5247

5248
/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5249
///    from the 256-bit vector of [4 x i64] in \a i.
5250
///
5251
/// \code{.operation}
5252
/// FOR element := 0 to 3
5253
///   j := element*64
5254
///   k := element*64
5255
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5256
/// ENDFOR
5257
/// \endcode
5258
///
5259
/// \headerfile <immintrin.h>
5260
///
5261
/// \code
5262
/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5263
/// \endcode
5264
///
5265
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5266
///
5267
/// \param m
5268
///    A pointer to the memory used for loading values.
5269
/// \param i
5270
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5271
/// \param s
5272
///    A literal constant scale factor for the indexes in \a i. Must be
5273
///    1, 2, 4, or 8.
5274
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5275
/* Unmasked form: the pass-through operand is _mm256_undefined_si256() and the
   mask is the all-ones vector _mm256_set1_epi64x(-1). */
#define _mm256_i64gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)_mm256_undefined_si256(), \
      (const long long *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))
5280

5281
/* Remove the attribute helper macros defined at the top of this file. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __AVX2INTRIN_H */
5285

5286
Product

Resources

Company