/* Source: GitHub repository freebsd/freebsd-src,
 * path contrib/llvm-project/clang/lib/Headers/avx2intrin.h (branch main).
 */
1
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2
*
3
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
* See https://llvm.org/LICENSE.txt for license information.
5
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
*
7
*===-----------------------------------------------------------------------===
8
*/
9
10
/* Reject direct inclusion: __IMMINTRIN_H must already be defined when this
   header is processed (the error text directs users to <immintrin.h>). */
#ifndef __IMMINTRIN_H
#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif
13
14
#ifndef __AVX2INTRIN_H
15
#define __AVX2INTRIN_H
16
17
/* Define the default attributes for the functions in this file. */
18
/* Attribute set applied to the 256-bit intrinsic functions in this file:
   always inlined, no debug info, target "avx2,no-evex512", and
   __min_vector_width__(256). */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
21
/* Same attribute set as __DEFAULT_FN_ATTRS256, except that it requests
   __min_vector_width__(128). */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
24
25
/* SSE4 Multiple Packed Sums of Absolute Difference. */
26
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
27
/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
28
/// \a Y.
29
///
30
/// Eight SAD results are computed using the lower half of the input
31
/// vectors, and another eight using the upper half. These 16-bit values
32
/// are returned in the lower and upper halves of the 256-bit result,
33
/// respectively.
34
///
35
/// A single SAD operation selects four bytes from \a X and four bytes from
36
/// \a Y as input. It computes the differences between each \a X byte and
37
/// the corresponding \a Y byte, takes the absolute value of each
38
/// difference, and sums these four values to form one 16-bit result. The
39
/// intrinsic computes 16 of these results with different sets of input
40
/// bytes.
41
///
42
/// For each set of eight results, the SAD operations use the same four
43
/// bytes from \a Y; the starting bit position for these four bytes is
44
/// specified by \a M[1:0] times 32. The eight operations use successive
45
/// sets of four bytes from \a X; the starting bit position for the first
46
/// set of four bytes is specified by \a M[2] times 32. These bit positions
47
/// are all relative to the 128-bit lane for each set of eight operations.
48
///
49
/// \code{.operation}
50
/// r := 0
51
/// FOR i := 0 TO 1
52
/// j := i*3
53
/// Ybase := M[j+1:j]*32 + i*128
54
/// Xbase := M[j+2]*32 + i*128
55
/// FOR k := 0 TO 7
56
/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
57
/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
58
/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
59
/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
60
/// result[r+15:r] := temp0 + temp1 + temp2 + temp3
61
/// Xbase := Xbase + 8
62
/// r := r + 16
63
/// ENDFOR
64
/// ENDFOR
65
/// \endcode
66
///
67
/// \headerfile <immintrin.h>
68
///
69
/// \code
70
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
71
/// \endcode
72
///
73
/// This intrinsic corresponds to the \c VMPSADBW instruction.
74
///
75
/// \param X
76
/// A 256-bit integer vector containing one of the inputs.
77
/// \param Y
78
/// A 256-bit integer vector containing one of the inputs.
79
/// \param M
80
/// An unsigned immediate value specifying the starting positions of the
81
/// bytes to operate on.
82
/// \returns A 256-bit vector of [16 x i16] containing the result.
83
/* Function-like macro: M is documented as an immediate and is forwarded,
   cast to int, as the builtin's last argument. X and Y are converted to
   __m256i and then viewed as __v32qi. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))
86
87
/// Computes the absolute value of each signed byte in the 256-bit integer
88
/// vector \a __a and returns each value in the corresponding byte of
89
/// the result.
90
///
91
/// \headerfile <immintrin.h>
92
///
93
/// This intrinsic corresponds to the \c VPABSB instruction.
94
///
95
/// \param __a
96
/// A 256-bit integer vector.
97
/// \returns A 256-bit integer vector containing the result.
98
static __inline__ __m256i __DEFAULT_FN_ATTRS256
99
_mm256_abs_epi8(__m256i __a)
100
{
101
return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
102
}
103
104
/// Computes the absolute value of each signed 16-bit element in the 256-bit
105
/// vector of [16 x i16] in \a __a and returns each value in the
106
/// corresponding element of the result.
107
///
108
/// \headerfile <immintrin.h>
109
///
110
/// This intrinsic corresponds to the \c VPABSW instruction.
111
///
112
/// \param __a
113
/// A 256-bit vector of [16 x i16].
114
/// \returns A 256-bit vector of [16 x i16] containing the result.
115
static __inline__ __m256i __DEFAULT_FN_ATTRS256
116
_mm256_abs_epi16(__m256i __a)
117
{
118
return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
119
}
120
121
/// Computes the absolute value of each signed 32-bit element in the 256-bit
122
/// vector of [8 x i32] in \a __a and returns each value in the
123
/// corresponding element of the result.
124
///
125
/// \headerfile <immintrin.h>
126
///
127
/// This intrinsic corresponds to the \c VPABSD instruction.
128
///
129
/// \param __a
130
/// A 256-bit vector of [8 x i32].
131
/// \returns A 256-bit vector of [8 x i32] containing the result.
132
static __inline__ __m256i __DEFAULT_FN_ATTRS256
133
_mm256_abs_epi32(__m256i __a)
134
{
135
return (__m256i)__builtin_elementwise_abs((__v8si)__a);
136
}
137
138
/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139
/// integers using signed saturation, and returns the 256-bit result.
140
///
141
/// \code{.operation}
142
/// FOR i := 0 TO 7
143
/// j := i*16
144
/// k := i*8
145
/// result[7+k:k] := SATURATE8(__a[15+j:j])
146
/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
147
/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
148
/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
149
/// ENDFOR
150
/// \endcode
151
///
152
/// \headerfile <immintrin.h>
153
///
154
/// This intrinsic corresponds to the \c VPACKSSWB instruction.
155
///
156
/// \param __a
157
/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
158
/// result[191:128].
159
/// \param __b
160
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
161
/// result[255:192].
162
/// \returns A 256-bit integer vector containing the result.
163
static __inline__ __m256i __DEFAULT_FN_ATTRS256
164
_mm256_packs_epi16(__m256i __a, __m256i __b)
165
{
166
return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
167
}
168
169
/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170
/// integers using signed saturation, and returns the resulting 256-bit
171
/// vector of [16 x i16].
172
///
173
/// \code{.operation}
174
/// FOR i := 0 TO 3
175
/// j := i*32
176
/// k := i*16
177
/// result[15+k:k] := SATURATE16(__a[31+j:j])
178
/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
179
/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
180
/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
181
/// ENDFOR
182
/// \endcode
183
///
184
/// \headerfile <immintrin.h>
185
///
186
/// This intrinsic corresponds to the \c VPACKSSDW instruction.
187
///
188
/// \param __a
189
/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
190
/// result[191:128].
191
/// \param __b
192
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
193
/// result[255:192].
194
/// \returns A 256-bit vector of [16 x i16] containing the result.
195
static __inline__ __m256i __DEFAULT_FN_ATTRS256
196
_mm256_packs_epi32(__m256i __a, __m256i __b)
197
{
198
return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
199
}
200
201
/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202
/// using unsigned saturation, and returns the 256-bit result.
203
///
204
/// \code{.operation}
205
/// FOR i := 0 TO 7
206
/// j := i*16
207
/// k := i*8
208
/// result[7+k:k] := SATURATE8U(__a[15+j:j])
209
/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
210
/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
211
/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
212
/// ENDFOR
213
/// \endcode
214
///
215
/// \headerfile <immintrin.h>
216
///
217
/// This intrinsic corresponds to the \c VPACKUSWB instruction.
218
///
219
/// \param __a
220
/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
221
/// result[191:128].
222
/// \param __b
223
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
224
/// result[255:192].
225
/// \returns A 256-bit integer vector containing the result.
226
static __inline__ __m256i __DEFAULT_FN_ATTRS256
227
_mm256_packus_epi16(__m256i __a, __m256i __b)
228
{
229
return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
230
}
231
232
/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233
/// using unsigned saturation, and returns the resulting 256-bit vector of
234
/// [16 x i16].
235
///
236
/// \code{.operation}
237
/// FOR i := 0 TO 3
238
/// j := i*32
239
/// k := i*16
240
/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
241
/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
242
/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
243
/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
244
/// ENDFOR
245
/// \endcode
246
///
247
/// \headerfile <immintrin.h>
248
///
249
/// This intrinsic corresponds to the \c VPACKUSDW instruction.
250
///
251
/// \param __V1
252
/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
253
/// result[191:128].
254
/// \param __V2
255
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
256
/// result[255:192].
257
/// \returns A 256-bit vector of [16 x i16] containing the result.
258
static __inline__ __m256i __DEFAULT_FN_ATTRS256
259
_mm256_packus_epi32(__m256i __V1, __m256i __V2)
260
{
261
return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
262
}
263
264
/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
265
/// vectors and returns the lower 8 bits of each sum in the corresponding
266
/// byte of the 256-bit integer vector result (overflow is ignored).
267
///
268
/// \headerfile <immintrin.h>
269
///
270
/// This intrinsic corresponds to the \c VPADDB instruction.
271
///
272
/// \param __a
273
/// A 256-bit integer vector containing one of the source operands.
274
/// \param __b
275
/// A 256-bit integer vector containing one of the source operands.
276
/// \returns A 256-bit integer vector containing the sums.
277
static __inline__ __m256i __DEFAULT_FN_ATTRS256
278
_mm256_add_epi8(__m256i __a, __m256i __b)
279
{
280
return (__m256i)((__v32qu)__a + (__v32qu)__b);
281
}
282
283
/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284
/// [16 x i16] and returns the lower 16 bits of each sum in the
285
/// corresponding element of the [16 x i16] result (overflow is ignored).
286
///
287
/// \headerfile <immintrin.h>
288
///
289
/// This intrinsic corresponds to the \c VPADDW instruction.
290
///
291
/// \param __a
292
/// A 256-bit vector of [16 x i16] containing one of the source operands.
293
/// \param __b
294
/// A 256-bit vector of [16 x i16] containing one of the source operands.
295
/// \returns A 256-bit vector of [16 x i16] containing the sums.
296
static __inline__ __m256i __DEFAULT_FN_ATTRS256
297
_mm256_add_epi16(__m256i __a, __m256i __b)
298
{
299
return (__m256i)((__v16hu)__a + (__v16hu)__b);
300
}
301
302
/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
303
/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
304
/// element of the [8 x i32] result (overflow is ignored).
305
///
306
/// \headerfile <immintrin.h>
307
///
308
/// This intrinsic corresponds to the \c VPADDD instruction.
309
///
310
/// \param __a
311
/// A 256-bit vector of [8 x i32] containing one of the source operands.
312
/// \param __b
313
/// A 256-bit vector of [8 x i32] containing one of the source operands.
314
/// \returns A 256-bit vector of [8 x i32] containing the sums.
315
static __inline__ __m256i __DEFAULT_FN_ATTRS256
316
_mm256_add_epi32(__m256i __a, __m256i __b)
317
{
318
return (__m256i)((__v8su)__a + (__v8su)__b);
319
}
320
321
/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
322
/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
323
/// element of the [4 x i64] result (overflow is ignored).
324
///
325
/// \headerfile <immintrin.h>
326
///
327
/// This intrinsic corresponds to the \c VPADDQ instruction.
328
///
329
/// \param __a
330
/// A 256-bit vector of [4 x i64] containing one of the source operands.
331
/// \param __b
332
/// A 256-bit vector of [4 x i64] containing one of the source operands.
333
/// \returns A 256-bit vector of [4 x i64] containing the sums.
334
static __inline__ __m256i __DEFAULT_FN_ATTRS256
335
_mm256_add_epi64(__m256i __a, __m256i __b)
336
{
337
return (__m256i)((__v4du)__a + (__v4du)__b);
338
}
339
340
/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
341
/// vectors using signed saturation, and returns each sum in the
342
/// corresponding byte of the 256-bit integer vector result.
343
///
344
/// \headerfile <immintrin.h>
345
///
346
/// This intrinsic corresponds to the \c VPADDSB instruction.
347
///
348
/// \param __a
349
/// A 256-bit integer vector containing one of the source operands.
350
/// \param __b
351
/// A 256-bit integer vector containing one of the source operands.
352
/// \returns A 256-bit integer vector containing the sums.
353
static __inline__ __m256i __DEFAULT_FN_ATTRS256
354
_mm256_adds_epi8(__m256i __a, __m256i __b)
355
{
356
return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
357
}
358
359
/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360
/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
361
///
362
/// \headerfile <immintrin.h>
363
///
364
/// This intrinsic corresponds to the \c VPADDSW instruction.
365
///
366
/// \param __a
367
/// A 256-bit vector of [16 x i16] containing one of the source operands.
368
/// \param __b
369
/// A 256-bit vector of [16 x i16] containing one of the source operands.
370
/// \returns A 256-bit vector of [16 x i16] containing the sums.
371
static __inline__ __m256i __DEFAULT_FN_ATTRS256
372
_mm256_adds_epi16(__m256i __a, __m256i __b)
373
{
374
return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
375
}
376
377
/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
378
/// vectors using unsigned saturation, and returns each sum in the
379
/// corresponding byte of the 256-bit integer vector result.
380
///
381
/// \headerfile <immintrin.h>
382
///
383
/// This intrinsic corresponds to the \c VPADDUSB instruction.
384
///
385
/// \param __a
386
/// A 256-bit integer vector containing one of the source operands.
387
/// \param __b
388
/// A 256-bit integer vector containing one of the source operands.
389
/// \returns A 256-bit integer vector containing the sums.
390
static __inline__ __m256i __DEFAULT_FN_ATTRS256
391
_mm256_adds_epu8(__m256i __a, __m256i __b)
392
{
393
return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
394
}
395
396
/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397
/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
398
///
399
/// \headerfile <immintrin.h>
400
///
401
/// This intrinsic corresponds to the \c VPADDUSW instruction.
402
///
403
/// \param __a
404
/// A 256-bit vector of [16 x i16] containing one of the source operands.
405
/// \param __b
406
/// A 256-bit vector of [16 x i16] containing one of the source operands.
407
/// \returns A 256-bit vector of [16 x i16] containing the sums.
408
static __inline__ __m256i __DEFAULT_FN_ATTRS256
409
_mm256_adds_epu16(__m256i __a, __m256i __b)
410
{
411
return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
412
}
413
414
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
415
/// temporary 256-bit value, and the lower half of the 256-bit vector \a b
416
/// as the lower half of the temporary value. Right-shifts the temporary
417
/// value by \a n bytes, and uses the lower 16 bytes of the shifted value
418
/// as the lower 16 bytes of the result. Uses the upper halves of \a a and
419
/// \a b to make another temporary value, right shifts by \a n, and uses
420
/// the lower 16 bytes of the shifted value as the upper 16 bytes of the
421
/// result.
422
///
423
/// \headerfile <immintrin.h>
424
///
425
/// \code
426
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
427
/// \endcode
428
///
429
/// This intrinsic corresponds to the \c VPALIGNR instruction.
430
///
431
/// \param a
432
/// A 256-bit integer vector containing source values.
433
/// \param b
434
/// A 256-bit integer vector containing source values.
435
/// \param n
436
/// An immediate value specifying the number of bytes to shift.
437
/// \returns A 256-bit integer vector containing the result.
438
/* Function-like macro: n is documented as an immediate and is forwarded
   unchanged (parenthesized, no cast) as the builtin's last argument. a and b
   are converted to __m256i and then viewed as __v32qi. */
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (n)))
441
442
/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
443
/// \a __b.
444
///
445
/// \headerfile <immintrin.h>
446
///
447
/// This intrinsic corresponds to the \c VPAND instruction.
448
///
449
/// \param __a
450
/// A 256-bit integer vector.
451
/// \param __b
452
/// A 256-bit integer vector.
453
/// \returns A 256-bit integer vector containing the result.
454
static __inline__ __m256i __DEFAULT_FN_ATTRS256
455
_mm256_and_si256(__m256i __a, __m256i __b)
456
{
457
return (__m256i)((__v4du)__a & (__v4du)__b);
458
}
459
460
/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461
/// the bitwise NOT of the 256-bit integer vector in \a __a.
462
///
463
/// \headerfile <immintrin.h>
464
///
465
/// This intrinsic corresponds to the \c VPANDN instruction.
466
///
467
/// \param __a
468
/// A 256-bit integer vector.
469
/// \param __b
470
/// A 256-bit integer vector.
471
/// \returns A 256-bit integer vector containing the result.
472
static __inline__ __m256i __DEFAULT_FN_ATTRS256
473
_mm256_andnot_si256(__m256i __a, __m256i __b)
474
{
475
return (__m256i)(~(__v4du)__a & (__v4du)__b);
476
}
477
478
/// Computes the averages of the corresponding unsigned bytes in the two
479
/// 256-bit integer vectors in \a __a and \a __b and returns each
480
/// average in the corresponding byte of the 256-bit result.
481
///
482
/// \code{.operation}
483
/// FOR i := 0 TO 31
484
/// j := i*8
485
/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
486
/// ENDFOR
487
/// \endcode
488
///
489
/// \headerfile <immintrin.h>
490
///
491
/// This intrinsic corresponds to the \c VPAVGB instruction.
492
///
493
/// \param __a
494
/// A 256-bit integer vector.
495
/// \param __b
496
/// A 256-bit integer vector.
497
/// \returns A 256-bit integer vector containing the result.
498
static __inline__ __m256i __DEFAULT_FN_ATTRS256
499
_mm256_avg_epu8(__m256i __a, __m256i __b)
500
{
501
return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
502
}
503
504
/// Computes the averages of the corresponding unsigned 16-bit integers in
505
/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506
/// each average in the corresponding element of the 256-bit result.
507
///
508
/// \code{.operation}
509
/// FOR i := 0 TO 15
510
/// j := i*16
511
/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
512
/// ENDFOR
513
/// \endcode
514
///
515
/// \headerfile <immintrin.h>
516
///
517
/// This intrinsic corresponds to the \c VPAVGW instruction.
518
///
519
/// \param __a
520
/// A 256-bit vector of [16 x i16].
521
/// \param __b
522
/// A 256-bit vector of [16 x i16].
523
/// \returns A 256-bit vector of [16 x i16] containing the result.
524
static __inline__ __m256i __DEFAULT_FN_ATTRS256
525
_mm256_avg_epu16(__m256i __a, __m256i __b)
526
{
527
return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
528
}
529
530
/// Merges 8-bit integer values from either of the two 256-bit vectors
531
/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532
/// the resulting 256-bit integer vector.
533
///
534
/// \code{.operation}
535
/// FOR i := 0 TO 31
536
/// j := i*8
537
/// IF __M[7+j] == 0
538
/// result[7+j:j] := __V1[7+j:j]
539
/// ELSE
540
/// result[7+j:j] := __V2[7+j:j]
541
/// FI
542
/// ENDFOR
543
/// \endcode
544
///
545
/// \headerfile <immintrin.h>
546
///
547
/// This intrinsic corresponds to the \c VPBLENDVB instruction.
548
///
549
/// \param __V1
550
/// A 256-bit integer vector containing source values.
551
/// \param __V2
552
/// A 256-bit integer vector containing source values.
553
/// \param __M
554
/// A 256-bit integer vector, with bit [7] of each byte specifying the
555
/// source for each corresponding byte of the result. When the mask bit
556
/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
557
/// \a __V2.
558
/// \returns A 256-bit integer vector containing the result.
559
static __inline__ __m256i __DEFAULT_FN_ATTRS256
560
_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
561
{
562
return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
563
(__v32qi)__M);
564
}
565
566
/// Merges 16-bit integer values from either of the two 256-bit vectors
567
/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
568
/// and returns the resulting 256-bit vector of [16 x i16].
569
///
570
/// \code{.operation}
571
/// FOR i := 0 TO 7
572
/// j := i*16
573
/// IF M[i] == 0
574
/// result[15+j:j] := V1[15+j:j]
/// result[143+j:128+j] := V1[143+j:128+j]
/// ELSE
/// result[15+j:j] := V2[15+j:j]
/// result[143+j:128+j] := V2[143+j:128+j]
579
/// FI
580
/// ENDFOR
581
/// \endcode
582
///
583
/// \headerfile <immintrin.h>
584
///
585
/// \code
586
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
587
/// \endcode
588
///
589
/// This intrinsic corresponds to the \c VPBLENDW instruction.
590
///
591
/// \param V1
592
/// A 256-bit vector of [16 x i16] containing source values.
593
/// \param V2
594
/// A 256-bit vector of [16 x i16] containing source values.
595
/// \param M
596
/// An immediate 8-bit integer operand, with bits [7:0] specifying the
597
/// source for each element of the result. The position of the mask bit
598
/// corresponds to the index of a copied value. When a mask bit is 0, the
599
/// element is copied from \a V1; otherwise, it is copied from \a V2.
600
/// \a M[0] determines the source for elements 0 and 8, \a M[1] for
601
/// elements 1 and 9, and so forth.
602
/// \returns A 256-bit vector of [16 x i16] containing the result.
603
/* Function-like macro: M is documented as an immediate and is forwarded,
   cast to int, as the builtin's last argument. V1 and V2 are converted to
   __m256i and then viewed as __v16hi. */
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))
606
607
/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
608
/// \a __b for equality and returns the outcomes in the corresponding
609
/// bytes of the 256-bit result.
610
///
611
/// \code{.operation}
612
/// FOR i := 0 TO 31
613
/// j := i*8
614
/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
615
/// ENDFOR
616
/// \endcode
617
///
618
/// \headerfile <immintrin.h>
619
///
620
/// This intrinsic corresponds to the \c VPCMPEQB instruction.
621
///
622
/// \param __a
623
/// A 256-bit integer vector containing one of the inputs.
624
/// \param __b
625
/// A 256-bit integer vector containing one of the inputs.
626
/// \returns A 256-bit integer vector containing the result.
627
static __inline__ __m256i __DEFAULT_FN_ATTRS256
628
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
629
{
630
return (__m256i)((__v32qi)__a == (__v32qi)__b);
631
}
632
633
/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
634
/// \a __a and \a __b for equality and returns the outcomes in the
635
/// corresponding elements of the 256-bit result.
636
///
637
/// \code{.operation}
638
/// FOR i := 0 TO 15
639
/// j := i*16
640
/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
641
/// ENDFOR
642
/// \endcode
643
///
644
/// \headerfile <immintrin.h>
645
///
646
/// This intrinsic corresponds to the \c VPCMPEQW instruction.
647
///
648
/// \param __a
649
/// A 256-bit vector of [16 x i16] containing one of the inputs.
650
/// \param __b
651
/// A 256-bit vector of [16 x i16] containing one of the inputs.
652
/// \returns A 256-bit vector of [16 x i16] containing the result.
653
static __inline__ __m256i __DEFAULT_FN_ATTRS256
654
_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
655
{
656
return (__m256i)((__v16hi)__a == (__v16hi)__b);
657
}
658
659
/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
660
/// \a __a and \a __b for equality and returns the outcomes in the
661
/// corresponding elements of the 256-bit result.
662
///
663
/// \code{.operation}
664
/// FOR i := 0 TO 7
665
/// j := i*32
666
/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
667
/// ENDFOR
668
/// \endcode
669
///
670
/// \headerfile <immintrin.h>
671
///
672
/// This intrinsic corresponds to the \c VPCMPEQD instruction.
673
///
674
/// \param __a
675
/// A 256-bit vector of [8 x i32] containing one of the inputs.
676
/// \param __b
677
/// A 256-bit vector of [8 x i32] containing one of the inputs.
678
/// \returns A 256-bit vector of [8 x i32] containing the result.
679
static __inline__ __m256i __DEFAULT_FN_ATTRS256
680
_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
681
{
682
return (__m256i)((__v8si)__a == (__v8si)__b);
683
}
684
685
/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
686
/// \a __a and \a __b for equality and returns the outcomes in the
687
/// corresponding elements of the 256-bit result.
688
///
689
/// \code{.operation}
690
/// FOR i := 0 TO 3
691
/// j := i*64
692
/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
693
/// ENDFOR
694
/// \endcode
695
///
696
/// \headerfile <immintrin.h>
697
///
698
/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
699
///
700
/// \param __a
701
/// A 256-bit vector of [4 x i64] containing one of the inputs.
702
/// \param __b
703
/// A 256-bit vector of [4 x i64] containing one of the inputs.
704
/// \returns A 256-bit vector of [4 x i64] containing the result.
705
static __inline__ __m256i __DEFAULT_FN_ATTRS256
706
_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
707
{
708
return (__m256i)((__v4di)__a == (__v4di)__b);
709
}
710
711
/// Compares corresponding signed bytes in the 256-bit integer vectors in
712
/// \a __a and \a __b for greater-than and returns the outcomes in the
713
/// corresponding bytes of the 256-bit result.
714
///
715
/// \code{.operation}
716
/// FOR i := 0 TO 31
717
/// j := i*8
718
/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
719
/// ENDFOR
720
/// \endcode
721
///
722
/// \headerfile <immintrin.h>
723
///
724
/// This intrinsic corresponds to the \c VPCMPGTB instruction.
725
///
726
/// \param __a
727
/// A 256-bit integer vector containing one of the inputs.
728
/// \param __b
729
/// A 256-bit integer vector containing one of the inputs.
730
/// \returns A 256-bit integer vector containing the result.
731
static __inline__ __m256i __DEFAULT_FN_ATTRS256
732
_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
733
{
734
/* This function always performs a signed comparison, but __v32qi is a char
735
which may be signed or unsigned, so use __v32qs. */
736
return (__m256i)((__v32qs)__a > (__v32qs)__b);
737
}
738
739
/// Compares corresponding signed elements in the 256-bit vectors of
740
/// [16 x i16] in \a __a and \a __b for greater-than and returns the
741
/// outcomes in the corresponding elements of the 256-bit result.
742
///
743
/// \code{.operation}
744
/// FOR i := 0 TO 15
745
/// j := i*16
746
/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
747
/// ENDFOR
748
/// \endcode
749
///
750
/// \headerfile <immintrin.h>
751
///
752
/// This intrinsic corresponds to the \c VPCMPGTW instruction.
753
///
754
/// \param __a
755
/// A 256-bit vector of [16 x i16] containing one of the inputs.
756
/// \param __b
757
/// A 256-bit vector of [16 x i16] containing one of the inputs.
758
/// \returns A 256-bit vector of [16 x i16] containing the result.
759
static __inline__ __m256i __DEFAULT_FN_ATTRS256
760
_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
761
{
762
return (__m256i)((__v16hi)__a > (__v16hi)__b);
763
}
764
765
/// Compares corresponding signed elements in the 256-bit vectors of
766
/// [8 x i32] in \a __a and \a __b for greater-than and returns the
767
/// outcomes in the corresponding elements of the 256-bit result.
768
///
769
/// \code{.operation}
770
/// FOR i := 0 TO 7
771
/// j := i*32
772
/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
773
/// ENDFOR
774
/// \endcode
775
///
776
/// \headerfile <immintrin.h>
777
///
778
/// This intrinsic corresponds to the \c VPCMPGTD instruction.
779
///
780
/// \param __a
781
/// A 256-bit vector of [8 x i32] containing one of the inputs.
782
/// \param __b
783
/// A 256-bit vector of [8 x i32] containing one of the inputs.
784
/// \returns A 256-bit vector of [8 x i32] containing the result.
785
static __inline__ __m256i __DEFAULT_FN_ATTRS256
786
_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
787
{
788
return (__m256i)((__v8si)__a > (__v8si)__b);
789
}
790
791
/// Compares corresponding signed elements in the 256-bit vectors of
792
/// [4 x i64] in \a __a and \a __b for greater-than and returns the
793
/// outcomes in the corresponding elements of the 256-bit result.
794
///
795
/// \code{.operation}
796
/// FOR i := 0 TO 3
797
/// j := i*64
798
/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
799
/// ENDFOR
800
/// \endcode
801
///
802
/// \headerfile <immintrin.h>
803
///
804
/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
805
///
806
/// \param __a
807
/// A 256-bit vector of [4 x i64] containing one of the inputs.
808
/// \param __b
809
/// A 256-bit vector of [4 x i64] containing one of the inputs.
810
/// \returns A 256-bit vector of [4 x i64] containing the result.
811
static __inline__ __m256i __DEFAULT_FN_ATTRS256
812
_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
813
{
814
return (__m256i)((__v4di)__a > (__v4di)__b);
815
}
816
817
/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818
/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819
/// element of the [16 x i16] result (overflow is ignored). Sums from
820
/// \a __a are returned in the lower 64 bits of each 128-bit half of the
821
/// result; sums from \a __b are returned in the upper 64 bits of each
822
/// 128-bit half of the result.
823
///
824
/// \code{.operation}
825
/// FOR i := 0 TO 1
826
/// j := i*128
827
/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828
/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
829
/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
830
/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
831
/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
832
/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
833
/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
834
/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
835
/// ENDFOR
836
/// \endcode
837
///
838
/// \headerfile <immintrin.h>
839
///
840
/// This intrinsic corresponds to the \c VPHADDW instruction.
841
///
842
/// \param __a
843
/// A 256-bit vector of [16 x i16] containing one of the source operands.
844
/// \param __b
845
/// A 256-bit vector of [16 x i16] containing one of the source operands.
846
/// \returns A 256-bit vector of [16 x i16] containing the sums.
847
static __inline__ __m256i __DEFAULT_FN_ATTRS256
848
_mm256_hadd_epi16(__m256i __a, __m256i __b)
849
{
850
return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
851
}
852
853
/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
854
/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
855
/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
856
/// are returned in the lower 64 bits of each 128-bit half of the result;
857
/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
858
/// of the result.
859
///
860
/// \code{.operation}
861
/// FOR i := 0 TO 1
862
/// j := i*128
863
/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
864
/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
865
/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
866
/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
867
/// ENDFOR
868
/// \endcode
869
///
870
/// \headerfile <immintrin.h>
871
///
872
/// This intrinsic corresponds to the \c VPHADDD instruction.
873
///
874
/// \param __a
875
/// A 256-bit vector of [8 x i32] containing one of the source operands.
876
/// \param __b
877
/// A 256-bit vector of [8 x i32] containing one of the source operands.
878
/// \returns A 256-bit vector of [8 x i32] containing the sums.
879
static __inline__ __m256i __DEFAULT_FN_ATTRS256
880
_mm256_hadd_epi32(__m256i __a, __m256i __b)
881
{
882
return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
883
}
884
885
/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886
/// vectors of [16 x i16] using signed saturation and returns each sum in
887
/// an element of the [16 x i16] result. Sums from \a __a are returned in
888
/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
889
/// are returned in the upper 64 bits of each 128-bit half of the result.
890
///
891
/// \code{.operation}
892
/// FOR i := 0 TO 1
893
/// j := i*128
894
/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895
/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
896
/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
897
/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
898
/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
899
/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
900
/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
901
/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
902
/// ENDFOR
903
/// \endcode
904
///
905
/// \headerfile <immintrin.h>
906
///
907
/// This intrinsic corresponds to the \c VPHADDSW instruction.
908
///
909
/// \param __a
910
/// A 256-bit vector of [16 x i16] containing one of the source operands.
911
/// \param __b
912
/// A 256-bit vector of [16 x i16] containing one of the source operands.
913
/// \returns A 256-bit vector of [16 x i16] containing the sums.
914
static __inline__ __m256i __DEFAULT_FN_ATTRS256
915
_mm256_hadds_epi16(__m256i __a, __m256i __b)
916
{
917
return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
918
}
919
920
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921
/// vectors of [16 x i16] and returns the lower 16 bits of each difference
922
/// in an element of the [16 x i16] result (overflow is ignored).
923
/// Differences from \a __a are returned in the lower 64 bits of each
924
/// 128-bit half of the result; differences from \a __b are returned in the
925
/// upper 64 bits of each 128-bit half of the result.
926
///
927
/// \code{.operation}
928
/// FOR i := 0 TO 1
929
/// j := i*128
930
/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931
/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932
/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933
/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934
/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935
/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936
/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937
/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
938
/// ENDFOR
939
/// \endcode
940
///
941
/// \headerfile <immintrin.h>
942
///
943
/// This intrinsic corresponds to the \c VPHSUBW instruction.
944
///
945
/// \param __a
946
/// A 256-bit vector of [16 x i16] containing one of the source operands.
947
/// \param __b
948
/// A 256-bit vector of [16 x i16] containing one of the source operands.
949
/// \returns A 256-bit vector of [16 x i16] containing the differences.
950
static __inline__ __m256i __DEFAULT_FN_ATTRS256
951
_mm256_hsub_epi16(__m256i __a, __m256i __b)
952
{
953
return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
954
}
955
956
/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
957
/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
958
/// an element of the [8 x i32] result (overflow is ignored). Differences
959
/// from \a __a are returned in the lower 64 bits of each 128-bit half of
960
/// the result; differences from \a __b are returned in the upper 64 bits
961
/// of each 128-bit half of the result.
962
///
963
/// \code{.operation}
964
/// FOR i := 0 TO 1
965
/// j := i*128
966
/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967
/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968
/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969
/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
970
/// ENDFOR
971
/// \endcode
972
///
973
/// \headerfile <immintrin.h>
974
///
975
/// This intrinsic corresponds to the \c VPHSUBD instruction.
976
///
977
/// \param __a
978
/// A 256-bit vector of [8 x i32] containing one of the source operands.
979
/// \param __b
980
/// A 256-bit vector of [8 x i32] containing one of the source operands.
981
/// \returns A 256-bit vector of [8 x i32] containing the differences.
982
static __inline__ __m256i __DEFAULT_FN_ATTRS256
983
_mm256_hsub_epi32(__m256i __a, __m256i __b)
984
{
985
return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
986
}
987
988
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
989
/// vectors of [16 x i16] using signed saturation and returns each difference in
990
/// an element of the [16 x i16] result. Differences from \a __a are
991
/// returned in the lower 64 bits of each 128-bit half of the result;
992
/// differences from \a __b are returned in the upper 64 bits of each
993
/// 128-bit half of the result.
994
///
995
/// \code{.operation}
996
/// FOR i := 0 TO 1
997
/// j := i*128
998
/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999
/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000
/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001
/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002
/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003
/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004
/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005
/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1006
/// ENDFOR
1007
/// \endcode
1008
///
1009
/// \headerfile <immintrin.h>
1010
///
1011
/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1012
///
1013
/// \param __a
1014
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1015
/// \param __b
1016
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1017
/// \returns A 256-bit vector of [16 x i16] containing the differences.
1018
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019
_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1020
{
1021
return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1022
}
1023
1024
/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025
/// with the corresponding signed byte from the 256-bit integer vector in
1026
/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027
/// pairs of those products using signed saturation to form 16-bit sums
1028
/// returned as elements of the [16 x i16] result.
1029
///
1030
/// \code{.operation}
1031
/// FOR i := 0 TO 15
1032
/// j := i*16
1033
/// temp1 := __a[j+7:j] * __b[j+7:j]
1034
/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035
/// result[j+15:j] := SATURATE16(temp1 + temp2)
1036
/// ENDFOR
1037
/// \endcode
1038
///
1039
/// \headerfile <immintrin.h>
1040
///
1041
/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1042
///
1043
/// \param __a
1044
/// A 256-bit vector containing one of the source operands.
1045
/// \param __b
1046
/// A 256-bit vector containing one of the source operands.
1047
/// \returns A 256-bit vector of [16 x i16] containing the result.
1048
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1049
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
1050
{
1051
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1052
}
1053
1054
/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055
/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056
/// those products to form 32-bit sums returned as elements of the
1057
/// [8 x i32] result.
1058
///
1059
/// There is only one wraparound case: when all four of the 16-bit sources
1060
/// are \c 0x8000, the result will be \c 0x80000000.
1061
///
1062
/// \code{.operation}
1063
/// FOR i := 0 TO 7
1064
/// j := i*32
1065
/// temp1 := __a[j+15:j] * __b[j+15:j]
1066
/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067
/// result[j+31:j] := temp1 + temp2
1068
/// ENDFOR
1069
/// \endcode
1070
///
1071
/// \headerfile <immintrin.h>
1072
///
1073
/// This intrinsic corresponds to the \c VPMADDWD instruction.
1074
///
1075
/// \param __a
1076
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1077
/// \param __b
1078
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1079
/// \returns A 256-bit vector of [8 x i32] containing the result.
1080
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081
_mm256_madd_epi16(__m256i __a, __m256i __b)
1082
{
1083
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1084
}
1085
1086
/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087
/// in \a __a and \a __b and returns the larger of each pair in the
1088
/// corresponding byte of the 256-bit result.
1089
///
1090
/// \headerfile <immintrin.h>
1091
///
1092
/// This intrinsic corresponds to the \c VPMAXSB instruction.
1093
///
1094
/// \param __a
1095
/// A 256-bit integer vector.
1096
/// \param __b
1097
/// A 256-bit integer vector.
1098
/// \returns A 256-bit integer vector containing the result.
1099
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100
_mm256_max_epi8(__m256i __a, __m256i __b)
1101
{
1102
return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1103
}
1104
1105
/// Compares the corresponding signed 16-bit integers in the two 256-bit
1106
/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107
/// each pair in the corresponding element of the 256-bit result.
1108
///
1109
/// \headerfile <immintrin.h>
1110
///
1111
/// This intrinsic corresponds to the \c VPMAXSW instruction.
1112
///
1113
/// \param __a
1114
/// A 256-bit vector of [16 x i16].
1115
/// \param __b
1116
/// A 256-bit vector of [16 x i16].
1117
/// \returns A 256-bit vector of [16 x i16] containing the result.
1118
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119
_mm256_max_epi16(__m256i __a, __m256i __b)
1120
{
1121
return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1122
}
1123
1124
/// Compares the corresponding signed 32-bit integers in the two 256-bit
1125
/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126
/// each pair in the corresponding element of the 256-bit result.
1127
///
1128
/// \headerfile <immintrin.h>
1129
///
1130
/// This intrinsic corresponds to the \c VPMAXSD instruction.
1131
///
1132
/// \param __a
1133
/// A 256-bit vector of [8 x i32].
1134
/// \param __b
1135
/// A 256-bit vector of [8 x i32].
1136
/// \returns A 256-bit vector of [8 x i32] containing the result.
1137
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138
_mm256_max_epi32(__m256i __a, __m256i __b)
1139
{
1140
return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1141
}
1142
1143
/// Compares the corresponding unsigned bytes in the two 256-bit integer
1144
/// vectors in \a __a and \a __b and returns the larger of each pair in
1145
/// the corresponding byte of the 256-bit result.
1146
///
1147
/// \headerfile <immintrin.h>
1148
///
1149
/// This intrinsic corresponds to the \c VPMAXUB instruction.
1150
///
1151
/// \param __a
1152
/// A 256-bit integer vector.
1153
/// \param __b
1154
/// A 256-bit integer vector.
1155
/// \returns A 256-bit integer vector containing the result.
1156
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157
_mm256_max_epu8(__m256i __a, __m256i __b)
1158
{
1159
return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1160
}
1161
1162
/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163
/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164
/// each pair in the corresponding element of the 256-bit result.
1165
///
1166
/// \headerfile <immintrin.h>
1167
///
1168
/// This intrinsic corresponds to the \c VPMAXUW instruction.
1169
///
1170
/// \param __a
1171
/// A 256-bit vector of [16 x i16].
1172
/// \param __b
1173
/// A 256-bit vector of [16 x i16].
1174
/// \returns A 256-bit vector of [16 x i16] containing the result.
1175
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176
_mm256_max_epu16(__m256i __a, __m256i __b)
1177
{
1178
return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1179
}
1180
1181
/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182
/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183
/// each pair in the corresponding element of the 256-bit result.
1184
///
1185
/// \headerfile <immintrin.h>
1186
///
1187
/// This intrinsic corresponds to the \c VPMAXUD instruction.
1188
///
1189
/// \param __a
1190
/// A 256-bit vector of [8 x i32].
1191
/// \param __b
1192
/// A 256-bit vector of [8 x i32].
1193
/// \returns A 256-bit vector of [8 x i32] containing the result.
1194
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195
_mm256_max_epu32(__m256i __a, __m256i __b)
1196
{
1197
return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1198
}
1199
1200
/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201
/// in \a __a and \a __b and returns the smaller of each pair in the
1202
/// corresponding byte of the 256-bit result.
1203
///
1204
/// \headerfile <immintrin.h>
1205
///
1206
/// This intrinsic corresponds to the \c VPMINSB instruction.
1207
///
1208
/// \param __a
1209
/// A 256-bit integer vector.
1210
/// \param __b
1211
/// A 256-bit integer vector.
1212
/// \returns A 256-bit integer vector containing the result.
1213
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214
_mm256_min_epi8(__m256i __a, __m256i __b)
1215
{
1216
return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1217
}
1218
1219
/// Compares the corresponding signed 16-bit integers in the two 256-bit
1220
/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221
/// each pair in the corresponding element of the 256-bit result.
1222
///
1223
/// \headerfile <immintrin.h>
1224
///
1225
/// This intrinsic corresponds to the \c VPMINSW instruction.
1226
///
1227
/// \param __a
1228
/// A 256-bit vector of [16 x i16].
1229
/// \param __b
1230
/// A 256-bit vector of [16 x i16].
1231
/// \returns A 256-bit vector of [16 x i16] containing the result.
1232
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233
_mm256_min_epi16(__m256i __a, __m256i __b)
1234
{
1235
return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1236
}
1237
1238
/// Compares the corresponding signed 32-bit integers in the two 256-bit
1239
/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240
/// each pair in the corresponding element of the 256-bit result.
1241
///
1242
/// \headerfile <immintrin.h>
1243
///
1244
/// This intrinsic corresponds to the \c VPMINSD instruction.
1245
///
1246
/// \param __a
1247
/// A 256-bit vector of [8 x i32].
1248
/// \param __b
1249
/// A 256-bit vector of [8 x i32].
1250
/// \returns A 256-bit vector of [8 x i32] containing the result.
1251
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252
_mm256_min_epi32(__m256i __a, __m256i __b)
1253
{
1254
return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1255
}
1256
1257
/// Compares the corresponding unsigned bytes in the two 256-bit integer
1258
/// vectors in \a __a and \a __b and returns the smaller of each pair in
1259
/// the corresponding byte of the 256-bit result.
1260
///
1261
/// \headerfile <immintrin.h>
1262
///
1263
/// This intrinsic corresponds to the \c VPMINUB instruction.
1264
///
1265
/// \param __a
1266
/// A 256-bit integer vector.
1267
/// \param __b
1268
/// A 256-bit integer vector.
1269
/// \returns A 256-bit integer vector containing the result.
1270
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271
_mm256_min_epu8(__m256i __a, __m256i __b)
1272
{
1273
return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1274
}
1275
1276
/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277
/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278
/// each pair in the corresponding element of the 256-bit result.
1279
///
1280
/// \headerfile <immintrin.h>
1281
///
1282
/// This intrinsic corresponds to the \c VPMINUW instruction.
1283
///
1284
/// \param __a
1285
/// A 256-bit vector of [16 x i16].
1286
/// \param __b
1287
/// A 256-bit vector of [16 x i16].
1288
/// \returns A 256-bit vector of [16 x i16] containing the result.
1289
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290
_mm256_min_epu16(__m256i __a, __m256i __b)
1291
{
1292
return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1293
}
1294
1295
/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296
/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297
/// each pair in the corresponding element of the 256-bit result.
1298
///
1299
/// \headerfile <immintrin.h>
1300
///
1301
/// This intrinsic corresponds to the \c VPMINUD instruction.
1302
///
1303
/// \param __a
1304
/// A 256-bit vector of [8 x i32].
1305
/// \param __b
1306
/// A 256-bit vector of [8 x i32].
1307
/// \returns A 256-bit vector of [8 x i32] containing the result.
1308
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309
_mm256_min_epu32(__m256i __a, __m256i __b)
1310
{
1311
return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1312
}
1313
1314
/// Creates a 32-bit integer mask from the most significant bit of each byte
1315
/// in the 256-bit integer vector in \a __a and returns the result.
1316
///
1317
/// \code{.operation}
1318
/// FOR i := 0 TO 31
1319
/// j := i*8
1320
/// result[i] := __a[j+7]
1321
/// ENDFOR
1322
/// \endcode
1323
///
1324
/// \headerfile <immintrin.h>
1325
///
1326
/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1327
///
1328
/// \param __a
1329
/// A 256-bit integer vector containing the source bytes.
1330
/// \returns The 32-bit integer mask.
1331
static __inline__ int __DEFAULT_FN_ATTRS256
1332
_mm256_movemask_epi8(__m256i __a)
1333
{
1334
return __builtin_ia32_pmovmskb256((__v32qi)__a);
1335
}
1336
1337
/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338
/// the 16-bit values in the corresponding elements of a 256-bit vector
1339
/// of [16 x i16].
1340
///
1341
/// \code{.operation}
1342
/// FOR i := 0 TO 15
1343
/// j := i*8
1344
/// k := i*16
1345
/// result[k+15:k] := SignExtend(__V[j+7:j])
1346
/// ENDFOR
1347
/// \endcode
1348
///
1349
/// \headerfile <immintrin.h>
1350
///
1351
/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1352
///
1353
/// \param __V
1354
/// A 128-bit integer vector containing the source bytes.
1355
/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1356
/// values.
1357
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1358
_mm256_cvtepi8_epi16(__m128i __V)
1359
{
1360
/* This function always performs a signed extension, but __v16qi is a char
1361
which may be signed or unsigned, so use __v16qs. */
1362
return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1363
}
1364
1365
/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366
/// \a __V and returns the 32-bit values in the corresponding elements of a
1367
/// 256-bit vector of [8 x i32].
1368
///
1369
/// \code{.operation}
1370
/// FOR i := 0 TO 7
1371
/// j := i*8
1372
/// k := i*32
1373
/// result[k+31:k] := SignExtend(__V[j+7:j])
1374
/// ENDFOR
1375
/// \endcode
1376
///
1377
/// \headerfile <immintrin.h>
1378
///
1379
/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1380
///
1381
/// \param __V
1382
/// A 128-bit integer vector containing the source bytes.
1383
/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1384
/// values.
1385
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1386
_mm256_cvtepi8_epi32(__m128i __V)
1387
{
1388
/* This function always performs a signed extension, but __v16qi is a char
1389
which may be signed or unsigned, so use __v16qs. */
1390
return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1391
}
1392
1393
/// Sign-extends the first four bytes from the 128-bit integer vector in
1394
/// \a __V and returns the 64-bit values in the corresponding elements of a
1395
/// 256-bit vector of [4 x i64].
1396
///
1397
/// \code{.operation}
1398
/// result[63:0] := SignExtend(__V[7:0])
1399
/// result[127:64] := SignExtend(__V[15:8])
1400
/// result[191:128] := SignExtend(__V[23:16])
1401
/// result[255:192] := SignExtend(__V[31:24])
1402
/// \endcode
1403
///
1404
/// \headerfile <immintrin.h>
1405
///
1406
/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1407
///
1408
/// \param __V
1409
/// A 128-bit integer vector containing the source bytes.
1410
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1411
/// values.
1412
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1413
_mm256_cvtepi8_epi64(__m128i __V)
1414
{
1415
/* This function always performs a signed extension, but __v16qi is a char
1416
which may be signed or unsigned, so use __v16qs. */
1417
return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1418
}
1419
1420
/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421
/// \a __V and returns the 32-bit values in the corresponding elements of a
1422
/// 256-bit vector of [8 x i32].
1423
///
1424
/// \code{.operation}
1425
/// FOR i := 0 TO 7
1426
/// j := i*16
1427
/// k := i*32
1428
/// result[k+31:k] := SignExtend(__V[j+15:j])
1429
/// ENDFOR
1430
/// \endcode
1431
///
1432
/// \headerfile <immintrin.h>
1433
///
1434
/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1435
///
1436
/// \param __V
1437
/// A 128-bit vector of [8 x i16] containing the source values.
1438
/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1439
/// values.
1440
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1441
_mm256_cvtepi16_epi32(__m128i __V)
1442
{
1443
return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1444
}
1445
1446
/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447
/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448
/// elements of a 256-bit vector of [4 x i64].
1449
///
1450
/// \code{.operation}
1451
/// result[63:0] := SignExtend(__V[15:0])
1452
/// result[127:64] := SignExtend(__V[31:16])
1453
/// result[191:128] := SignExtend(__V[47:32])
1454
/// result[255:192] := SignExtend(__V[63:48])
1455
/// \endcode
1456
///
1457
/// \headerfile <immintrin.h>
1458
///
1459
/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1460
///
1461
/// \param __V
1462
/// A 128-bit vector of [8 x i16] containing the source values.
1463
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1464
/// values.
1465
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1466
_mm256_cvtepi16_epi64(__m128i __V)
1467
{
1468
return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1469
}
1470
1471
/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472
/// \a __V and returns the 64-bit values in the corresponding elements of a
1473
/// 256-bit vector of [4 x i64].
1474
///
1475
/// \code{.operation}
1476
/// result[63:0] := SignExtend(__V[31:0])
1477
/// result[127:64] := SignExtend(__V[63:32])
1478
/// result[191:128] := SignExtend(__V[95:64])
1479
/// result[255:192] := SignExtend(__V[127:96])
1480
/// \endcode
1481
///
1482
/// \headerfile <immintrin.h>
1483
///
1484
/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1485
///
1486
/// \param __V
1487
/// A 128-bit vector of [4 x i32] containing the source values.
1488
/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1489
/// values.
1490
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1491
_mm256_cvtepi32_epi64(__m128i __V)
1492
{
1493
return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1494
}
1495
1496
/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497
/// the 16-bit values in the corresponding elements of a 256-bit vector
1498
/// of [16 x i16].
1499
///
1500
/// \code{.operation}
1501
/// FOR i := 0 TO 15
1502
/// j := i*8
1503
/// k := i*16
1504
/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1505
/// ENDFOR
1506
/// \endcode
1507
///
1508
/// \headerfile <immintrin.h>
1509
///
1510
/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1511
///
1512
/// \param __V
1513
/// A 128-bit integer vector containing the source bytes.
1514
/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1515
/// values.
1516
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1517
_mm256_cvtepu8_epi16(__m128i __V)
1518
{
1519
return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1520
}
1521
1522
/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523
/// \a __V and returns the 32-bit values in the corresponding elements of a
1524
/// 256-bit vector of [8 x i32].
1525
///
1526
/// \code{.operation}
1527
/// FOR i := 0 TO 7
1528
/// j := i*8
1529
/// k := i*32
1530
/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1531
/// ENDFOR
1532
/// \endcode
1533
///
1534
/// \headerfile <immintrin.h>
1535
///
1536
/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1537
///
1538
/// \param __V
1539
/// A 128-bit integer vector containing the source bytes.
1540
/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1541
/// values.
1542
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1543
_mm256_cvtepu8_epi32(__m128i __V)
1544
{
1545
return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1546
}
1547
1548
/// Zero-extends the first four bytes from the 128-bit integer vector in
1549
/// \a __V and returns the 64-bit values in the corresponding elements of a
1550
/// 256-bit vector of [4 x i64].
1551
///
1552
/// \code{.operation}
1553
/// result[63:0] := ZeroExtend(__V[7:0])
1554
/// result[127:64] := ZeroExtend(__V[15:8])
1555
/// result[191:128] := ZeroExtend(__V[23:16])
1556
/// result[255:192] := ZeroExtend(__V[31:24])
1557
/// \endcode
1558
///
1559
/// \headerfile <immintrin.h>
1560
///
1561
/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1562
///
1563
/// \param __V
1564
/// A 128-bit integer vector containing the source bytes.
1565
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1566
/// values.
1567
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1568
_mm256_cvtepu8_epi64(__m128i __V)
1569
{
1570
return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1571
}
1572
1573
/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574
/// \a __V and returns the 32-bit values in the corresponding elements of a
1575
/// 256-bit vector of [8 x i32].
1576
///
1577
/// \code{.operation}
1578
/// FOR i := 0 TO 7
1579
/// j := i*16
1580
/// k := i*32
1581
/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1582
/// ENDFOR
1583
/// \endcode
1584
///
1585
/// \headerfile <immintrin.h>
1586
///
1587
/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1588
///
1589
/// \param __V
1590
/// A 128-bit vector of [8 x i16] containing the source values.
1591
/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1592
/// values.
1593
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1594
_mm256_cvtepu16_epi32(__m128i __V)
1595
{
1596
return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1597
}
1598
1599
/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600
/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601
/// elements of a 256-bit vector of [4 x i64].
1602
///
1603
/// \code{.operation}
1604
/// result[63:0] := ZeroExtend(__V[15:0])
1605
/// result[127:64] := ZeroExtend(__V[31:16])
1606
/// result[191:128] := ZeroExtend(__V[47:32])
1607
/// result[255:192] := ZeroExtend(__V[63:48])
1608
/// \endcode
1609
///
1610
/// \headerfile <immintrin.h>
1611
///
1612
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1613
///
1614
/// \param __V
1615
/// A 128-bit vector of [8 x i16] containing the source values.
1616
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617
/// values.
1618
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1619
_mm256_cvtepu16_epi64(__m128i __V)
1620
{
1621
return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1622
}
1623
1624
/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625
/// \a __V and returns the 64-bit values in the corresponding elements of a
1626
/// 256-bit vector of [4 x i64].
1627
///
1628
/// \code{.operation}
1629
/// result[63:0] := ZeroExtend(__V[31:0])
1630
/// result[127:64] := ZeroExtend(__V[63:32])
1631
/// result[191:128] := ZeroExtend(__V[95:64])
1632
/// result[255:192] := ZeroExtend(__V[127:96])
1633
/// \endcode
1634
///
1635
/// \headerfile <immintrin.h>
1636
///
1637
/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1638
///
1639
/// \param __V
1640
/// A 128-bit vector of [4 x i32] containing the source values.
1641
/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1642
/// values.
1643
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1644
_mm256_cvtepu32_epi64(__m128i __V)
1645
{
1646
return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1647
}
1648
1649
/// Multiplies signed 32-bit integers from even-numbered elements of two
1650
/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651
/// [4 x i64] result.
1652
///
1653
/// \code{.operation}
1654
/// result[63:0] := __a[31:0] * __b[31:0]
1655
/// result[127:64] := __a[95:64] * __b[95:64]
1656
/// result[191:128] := __a[159:128] * __b[159:128]
1657
/// result[255:192] := __a[223:192] * __b[223:192]
1658
/// \endcode
1659
///
1660
/// \headerfile <immintrin.h>
1661
///
1662
/// This intrinsic corresponds to the \c VPMULDQ instruction.
1663
///
1664
/// \param __a
1665
/// A 256-bit vector of [8 x i32] containing one of the source operands.
1666
/// \param __b
1667
/// A 256-bit vector of [8 x i32] containing one of the source operands.
1668
/// \returns A 256-bit vector of [4 x i64] containing the products.
1669
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670
_mm256_mul_epi32(__m256i __a, __m256i __b)
1671
{
1672
return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1673
}
1674
1675
/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676
/// [16 x i16], truncates the 32-bit results to the most significant 18
1677
/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678
/// product in the [16 x i16] result.
1679
///
1680
/// \code{.operation}
1681
/// FOR i := 0 TO 15
1682
/// j := i*16
1683
/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1684
/// result[j+15:j] := temp[16:1]
1685
/// \endcode
1686
///
1687
/// \headerfile <immintrin.h>
1688
///
1689
/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1690
///
1691
/// \param __a
1692
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1693
/// \param __b
1694
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1695
/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697
_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1698
{
1699
return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1700
}
1701
1702
/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703
/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704
/// [16 x i16] result.
1705
///
1706
/// \headerfile <immintrin.h>
1707
///
1708
/// This intrinsic corresponds to the \c VPMULHUW instruction.
1709
///
1710
/// \param __a
1711
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1712
/// \param __b
1713
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1714
/// \returns A 256-bit vector of [16 x i16] containing the products.
1715
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716
_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1717
{
1718
return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1719
}
1720
1721
/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722
/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723
/// [16 x i16] result.
1724
///
1725
/// \headerfile <immintrin.h>
1726
///
1727
/// This intrinsic corresponds to the \c VPMULHW instruction.
1728
///
1729
/// \param __a
1730
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1731
/// \param __b
1732
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1733
/// \returns A 256-bit vector of [16 x i16] containing the products.
1734
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735
_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1736
{
1737
return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1738
}
1739
1740
/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741
/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742
/// [16 x i16] result.
1743
///
1744
/// \headerfile <immintrin.h>
1745
///
1746
/// This intrinsic corresponds to the \c VPMULLW instruction.
1747
///
1748
/// \param __a
1749
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1750
/// \param __b
1751
/// A 256-bit vector of [16 x i16] containing one of the source operands.
1752
/// \returns A 256-bit vector of [16 x i16] containing the products.
1753
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754
_mm256_mullo_epi16(__m256i __a, __m256i __b)
1755
{
1756
return (__m256i)((__v16hu)__a * (__v16hu)__b);
1757
}
1758
1759
/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760
/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761
/// [8 x i32] result.
1762
///
1763
/// \headerfile <immintrin.h>
1764
///
1765
/// This intrinsic corresponds to the \c VPMULLD instruction.
1766
///
1767
/// \param __a
1768
/// A 256-bit vector of [8 x i32] containing one of the source operands.
1769
/// \param __b
1770
/// A 256-bit vector of [8 x i32] containing one of the source operands.
1771
/// \returns A 256-bit vector of [8 x i32] containing the products.
1772
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1773
_mm256_mullo_epi32 (__m256i __a, __m256i __b)
1774
{
1775
return (__m256i)((__v8su)__a * (__v8su)__b);
1776
}
1777
1778
/// Multiplies unsigned 32-bit integers from even-numered elements of two
1779
/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780
/// [4 x i64] result.
1781
///
1782
/// \code{.operation}
1783
/// result[63:0] := __a[31:0] * __b[31:0]
1784
/// result[127:64] := __a[95:64] * __b[95:64]
1785
/// result[191:128] := __a[159:128] * __b[159:128]
1786
/// result[255:192] := __a[223:192] * __b[223:192]
1787
/// \endcode
1788
///
1789
/// \headerfile <immintrin.h>
1790
///
1791
/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1792
///
1793
/// \param __a
1794
/// A 256-bit vector of [8 x i32] containing one of the source operands.
1795
/// \param __b
1796
/// A 256-bit vector of [8 x i32] containing one of the source operands.
1797
/// \returns A 256-bit vector of [4 x i64] containing the products.
1798
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799
_mm256_mul_epu32(__m256i __a, __m256i __b)
1800
{
1801
return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1802
}
1803
1804
/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1805
/// \a __b.
1806
///
1807
/// \headerfile <immintrin.h>
1808
///
1809
/// This intrinsic corresponds to the \c VPOR instruction.
1810
///
1811
/// \param __a
1812
/// A 256-bit integer vector.
1813
/// \param __b
1814
/// A 256-bit integer vector.
1815
/// \returns A 256-bit integer vector containing the result.
1816
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817
_mm256_or_si256(__m256i __a, __m256i __b)
1818
{
1819
return (__m256i)((__v4du)__a | (__v4du)__b);
1820
}
1821
1822
/// Computes four sum of absolute difference (SAD) operations on sets of eight
1823
/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1824
/// \a __b.
1825
///
1826
/// One SAD result is computed for each set of eight bytes from \a __a and
1827
/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1828
/// corresponding 64-bit element of the result.
1829
///
1830
/// A single SAD operation takes the differences between the corresponding
1831
/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1832
/// and sums these eight values to form one 16-bit result. This operation
1833
/// is repeated four times with successive sets of eight bytes.
1834
///
1835
/// \code{.operation}
1836
/// FOR i := 0 TO 3
1837
/// j := i*64
1838
/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839
/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840
/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841
/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842
/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843
/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844
/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845
/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846
/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847
/// temp4 + temp5 + temp6 + temp7
1848
/// result[j+63:j+16] := 0
1849
/// ENDFOR
1850
/// \endcode
1851
///
1852
/// \headerfile <immintrin.h>
1853
///
1854
/// This intrinsic corresponds to the \c VPSADBW instruction.
1855
///
1856
/// \param __a
1857
/// A 256-bit integer vector.
1858
/// \param __b
1859
/// A 256-bit integer vector.
1860
/// \returns A 256-bit integer vector containing the result.
1861
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862
_mm256_sad_epu8(__m256i __a, __m256i __b)
1863
{
1864
return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1865
}
1866
1867
/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868
/// to control information in the 256-bit integer vector \a __b, and
1869
/// returns the 256-bit result. In effect there are two separate 128-bit
1870
/// shuffles in the lower and upper halves.
1871
///
1872
/// \code{.operation}
1873
/// FOR i := 0 TO 31
1874
/// j := i*8
1875
/// IF __b[j+7] == 1
1876
/// result[j+7:j] := 0
1877
/// ELSE
1878
/// k := __b[j+3:j] * 8
1879
/// IF i > 15
1880
/// k := k + 128
1881
/// FI
1882
/// result[j+7:j] := __a[k+7:k]
1883
/// FI
1884
/// ENDFOR
1885
/// \endcode
1886
///
1887
/// \headerfile <immintrin.h>
1888
///
1889
/// This intrinsic corresponds to the \c VPSHUFB instruction.
1890
///
1891
/// \param __a
1892
/// A 256-bit integer vector containing source values.
1893
/// \param __b
1894
/// A 256-bit integer vector containing control information to determine
1895
/// what goes into the corresponding byte of the result. If bit 7 of the
1896
/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897
/// control byte specify the index (within the same 128-bit half) of \a __a
1898
/// to copy to the result byte.
1899
/// \returns A 256-bit integer vector containing the result.
1900
static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901
_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1902
{
1903
return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1904
}
1905
1906
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. In effect there are two parallel 128-bit
///    shuffles in the lower and upper halves.
///
/// \code{.operation}
/// FOR i := 0 to 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1938
1939
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1974
1975
/// Rearranges 16-bit integers of the 256-bit vector of [16 x i16] \a a as
///    directed by the integer literal \a imm and returns the 256-bit
///    [16 x i16] result. Only the lower 64 bits of each 128-bit half are
///    shuffled; the upper 64 bits of each 128-bit half are copied from \a a
///    unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] supplying the data for the result.
/// \param imm
///    An immediate 8-bit value selecting the elements to copy from \a a.
///    \a imm[1:0] gives the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] gives the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a),                   \
                                      (int)(imm)))
2011
2012
/// Sets each byte of the result to the corresponding byte of the 256-bit
2013
/// integer vector in \a __a, the negative of that byte, or zero, depending
2014
/// on whether the corresponding byte of the 256-bit integer vector in
2015
/// \a __b is greater than zero, less than zero, or equal to zero,
2016
/// respectively.
2017
///
2018
/// \headerfile <immintrin.h>
2019
///
2020
/// This intrinsic corresponds to the \c VPSIGNB instruction.
2021
///
2022
/// \param __a
2023
/// A 256-bit integer vector.
2024
/// \param __b
2025
/// A 256-bit integer vector].
2026
/// \returns A 256-bit integer vector containing the result.
2027
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028
_mm256_sign_epi8(__m256i __a, __m256i __b)
2029
{
2030
return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2031
}
2032
2033
/// Sets each element of the result to the corresponding element of the
2034
/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035
/// or zero, depending on whether the corresponding element of the 256-bit
2036
/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037
/// equal to zero, respectively.
2038
///
2039
/// \headerfile <immintrin.h>
2040
///
2041
/// This intrinsic corresponds to the \c VPSIGNW instruction.
2042
///
2043
/// \param __a
2044
/// A 256-bit vector of [16 x i16].
2045
/// \param __b
2046
/// A 256-bit vector of [16 x i16].
2047
/// \returns A 256-bit vector of [16 x i16] containing the result.
2048
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049
_mm256_sign_epi16(__m256i __a, __m256i __b)
2050
{
2051
return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2052
}
2053
2054
/// Sets each element of the result to the corresponding element of the
2055
/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056
/// zero, depending on whether the corresponding element of the 256-bit
2057
/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058
/// equal to zero, respectively.
2059
///
2060
/// \headerfile <immintrin.h>
2061
///
2062
/// This intrinsic corresponds to the \c VPSIGND instruction.
2063
///
2064
/// \param __a
2065
/// A 256-bit vector of [8 x i32].
2066
/// \param __b
2067
/// A 256-bit vector of [8 x i32].
2068
/// \returns A 256-bit vector of [8 x i32] containing the result.
2069
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070
_mm256_sign_epi32(__m256i __a, __m256i __b)
2071
{
2072
return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2073
}
2074
2075
/// Byte-shifts each 128-bit half of the 256-bit integer vector \a a to the
///    left by \a imm bytes, filling with zero bytes, and returns the result.
///    When \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_slli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value giving the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2094
2095
/// Byte-shifts each 128-bit half of the 256-bit integer vector \a a to the
///    left by \a imm bytes, filling with zero bytes, and returns the result.
///    When \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value giving the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2114
2115
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116
/// left by \a __count bits, shifting in zero bits, and returns the result.
2117
/// If \a __count is greater than 15, the returned result is all zeroes.
2118
///
2119
/// \headerfile <immintrin.h>
2120
///
2121
/// This intrinsic corresponds to the \c VPSLLW instruction.
2122
///
2123
/// \param __a
2124
/// A 256-bit vector of [16 x i16] to be shifted.
2125
/// \param __count
2126
/// An unsigned integer value specifying the shift count (in bits).
2127
/// \returns A 256-bit vector of [16 x i16] containing the result.
2128
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129
_mm256_slli_epi16(__m256i __a, int __count)
2130
{
2131
return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2132
}
2133
2134
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135
/// left by the number of bits specified by the lower 64 bits of \a __count,
2136
/// shifting in zero bits, and returns the result. If \a __count is greater
2137
/// than 15, the returned result is all zeroes.
2138
///
2139
/// \headerfile <immintrin.h>
2140
///
2141
/// This intrinsic corresponds to the \c VPSLLW instruction.
2142
///
2143
/// \param __a
2144
/// A 256-bit vector of [16 x i16] to be shifted.
2145
/// \param __count
2146
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147
/// shift count (in bits). The upper element is ignored.
2148
/// \returns A 256-bit vector of [16 x i16] containing the result.
2149
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150
_mm256_sll_epi16(__m256i __a, __m128i __count)
2151
{
2152
return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2153
}
2154
2155
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156
/// left by \a __count bits, shifting in zero bits, and returns the result.
2157
/// If \a __count is greater than 31, the returned result is all zeroes.
2158
///
2159
/// \headerfile <immintrin.h>
2160
///
2161
/// This intrinsic corresponds to the \c VPSLLD instruction.
2162
///
2163
/// \param __a
2164
/// A 256-bit vector of [8 x i32] to be shifted.
2165
/// \param __count
2166
/// An unsigned integer value specifying the shift count (in bits).
2167
/// \returns A 256-bit vector of [8 x i32] containing the result.
2168
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169
_mm256_slli_epi32(__m256i __a, int __count)
2170
{
2171
return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2172
}
2173
2174
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175
/// left by the number of bits given in the lower 64 bits of \a __count,
2176
/// shifting in zero bits, and returns the result. If \a __count is greater
2177
/// than 31, the returned result is all zeroes.
2178
///
2179
/// \headerfile <immintrin.h>
2180
///
2181
/// This intrinsic corresponds to the \c VPSLLD instruction.
2182
///
2183
/// \param __a
2184
/// A 256-bit vector of [8 x i32] to be shifted.
2185
/// \param __count
2186
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187
/// shift count (in bits). The upper element is ignored.
2188
/// \returns A 256-bit vector of [8 x i32] containing the result.
2189
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190
_mm256_sll_epi32(__m256i __a, __m128i __count)
2191
{
2192
return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2193
}
2194
2195
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196
/// left by \a __count bits, shifting in zero bits, and returns the result.
2197
/// If \a __count is greater than 63, the returned result is all zeroes.
2198
///
2199
/// \headerfile <immintrin.h>
2200
///
2201
/// This intrinsic corresponds to the \c VPSLLQ instruction.
2202
///
2203
/// \param __a
2204
/// A 256-bit vector of [4 x i64] to be shifted.
2205
/// \param __count
2206
/// An unsigned integer value specifying the shift count (in bits).
2207
/// \returns A 256-bit vector of [4 x i64] containing the result.
2208
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209
_mm256_slli_epi64(__m256i __a, int __count)
2210
{
2211
return __builtin_ia32_psllqi256((__v4di)__a, __count);
2212
}
2213
2214
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215
/// left by the number of bits given in the lower 64 bits of \a __count,
2216
/// shifting in zero bits, and returns the result. If \a __count is greater
2217
/// than 63, the returned result is all zeroes.
2218
///
2219
/// \headerfile <immintrin.h>
2220
///
2221
/// This intrinsic corresponds to the \c VPSLLQ instruction.
2222
///
2223
/// \param __a
2224
/// A 256-bit vector of [4 x i64] to be shifted.
2225
/// \param __count
2226
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227
/// shift count (in bits). The upper element is ignored.
2228
/// \returns A 256-bit vector of [4 x i64] containing the result.
2229
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230
_mm256_sll_epi64(__m256i __a, __m128i __count)
2231
{
2232
return __builtin_ia32_psllq256((__v4di)__a, __count);
2233
}
2234
2235
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236
/// right by \a __count bits, shifting in sign bits, and returns the result.
2237
/// If \a __count is greater than 15, each element of the result is either
2238
/// 0 or -1 according to the corresponding input sign bit.
2239
///
2240
/// \headerfile <immintrin.h>
2241
///
2242
/// This intrinsic corresponds to the \c VPSRAW instruction.
2243
///
2244
/// \param __a
2245
/// A 256-bit vector of [16 x i16] to be shifted.
2246
/// \param __count
2247
/// An unsigned integer value specifying the shift count (in bits).
2248
/// \returns A 256-bit vector of [16 x i16] containing the result.
2249
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250
_mm256_srai_epi16(__m256i __a, int __count)
2251
{
2252
return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2253
}
2254
2255
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256
/// right by the number of bits given in the lower 64 bits of \a __count,
2257
/// shifting in sign bits, and returns the result. If \a __count is greater
2258
/// than 15, each element of the result is either 0 or -1 according to the
2259
/// corresponding input sign bit.
2260
///
2261
/// \headerfile <immintrin.h>
2262
///
2263
/// This intrinsic corresponds to the \c VPSRAW instruction.
2264
///
2265
/// \param __a
2266
/// A 256-bit vector of [16 x i16] to be shifted.
2267
/// \param __count
2268
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269
/// shift count (in bits). The upper element is ignored.
2270
/// \returns A 256-bit vector of [16 x i16] containing the result.
2271
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272
_mm256_sra_epi16(__m256i __a, __m128i __count)
2273
{
2274
return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2275
}
2276
2277
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278
/// right by \a __count bits, shifting in sign bits, and returns the result.
2279
/// If \a __count is greater than 31, each element of the result is either
2280
/// 0 or -1 according to the corresponding input sign bit.
2281
///
2282
/// \headerfile <immintrin.h>
2283
///
2284
/// This intrinsic corresponds to the \c VPSRAD instruction.
2285
///
2286
/// \param __a
2287
/// A 256-bit vector of [8 x i32] to be shifted.
2288
/// \param __count
2289
/// An unsigned integer value specifying the shift count (in bits).
2290
/// \returns A 256-bit vector of [8 x i32] containing the result.
2291
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292
_mm256_srai_epi32(__m256i __a, int __count)
2293
{
2294
return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2295
}
2296
2297
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298
/// right by the number of bits given in the lower 64 bits of \a __count,
2299
/// shifting in sign bits, and returns the result. If \a __count is greater
2300
/// than 31, each element of the result is either 0 or -1 according to the
2301
/// corresponding input sign bit.
2302
///
2303
/// \headerfile <immintrin.h>
2304
///
2305
/// This intrinsic corresponds to the \c VPSRAD instruction.
2306
///
2307
/// \param __a
2308
/// A 256-bit vector of [8 x i32] to be shifted.
2309
/// \param __count
2310
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311
/// shift count (in bits). The upper element is ignored.
2312
/// \returns A 256-bit vector of [8 x i32] containing the result.
2313
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314
_mm256_sra_epi32(__m256i __a, __m128i __count)
2315
{
2316
return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2317
}
2318
2319
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2338
2339
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2358
2359
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360
/// right by \a __count bits, shifting in zero bits, and returns the result.
2361
/// If \a __count is greater than 15, the returned result is all zeroes.
2362
///
2363
/// \headerfile <immintrin.h>
2364
///
2365
/// This intrinsic corresponds to the \c VPSRLW instruction.
2366
///
2367
/// \param __a
2368
/// A 256-bit vector of [16 x i16] to be shifted.
2369
/// \param __count
2370
/// An unsigned integer value specifying the shift count (in bits).
2371
/// \returns A 256-bit vector of [16 x i16] containing the result.
2372
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373
_mm256_srli_epi16(__m256i __a, int __count)
2374
{
2375
return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2376
}
2377
2378
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379
/// right by the number of bits given in the lower 64 bits of \a __count,
2380
/// shifting in zero bits, and returns the result. If \a __count is greater
2381
/// than 15, the returned result is all zeroes.
2382
///
2383
/// \headerfile <immintrin.h>
2384
///
2385
/// This intrinsic corresponds to the \c VPSRLW instruction.
2386
///
2387
/// \param __a
2388
/// A 256-bit vector of [16 x i16] to be shifted.
2389
/// \param __count
2390
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391
/// shift count (in bits). The upper element is ignored.
2392
/// \returns A 256-bit vector of [16 x i16] containing the result.
2393
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394
_mm256_srl_epi16(__m256i __a, __m128i __count)
2395
{
2396
return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2397
}
2398
2399
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400
/// right by \a __count bits, shifting in zero bits, and returns the result.
2401
/// If \a __count is greater than 31, the returned result is all zeroes.
2402
///
2403
/// \headerfile <immintrin.h>
2404
///
2405
/// This intrinsic corresponds to the \c VPSRLD instruction.
2406
///
2407
/// \param __a
2408
/// A 256-bit vector of [8 x i32] to be shifted.
2409
/// \param __count
2410
/// An unsigned integer value specifying the shift count (in bits).
2411
/// \returns A 256-bit vector of [8 x i32] containing the result.
2412
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413
_mm256_srli_epi32(__m256i __a, int __count)
2414
{
2415
return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2416
}
2417
2418
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419
/// right by the number of bits given in the lower 64 bits of \a __count,
2420
/// shifting in zero bits, and returns the result. If \a __count is greater
2421
/// than 31, the returned result is all zeroes.
2422
///
2423
/// \headerfile <immintrin.h>
2424
///
2425
/// This intrinsic corresponds to the \c VPSRLD instruction.
2426
///
2427
/// \param __a
2428
/// A 256-bit vector of [8 x i32] to be shifted.
2429
/// \param __count
2430
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431
/// shift count (in bits). The upper element is ignored.
2432
/// \returns A 256-bit vector of [8 x i32] containing the result.
2433
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434
_mm256_srl_epi32(__m256i __a, __m128i __count)
2435
{
2436
return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2437
}
2438
2439
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440
/// right by \a __count bits, shifting in zero bits, and returns the result.
2441
/// If \a __count is greater than 63, the returned result is all zeroes.
2442
///
2443
/// \headerfile <immintrin.h>
2444
///
2445
/// This intrinsic corresponds to the \c VPSRLQ instruction.
2446
///
2447
/// \param __a
2448
/// A 256-bit vector of [4 x i64] to be shifted.
2449
/// \param __count
2450
/// An unsigned integer value specifying the shift count (in bits).
2451
/// \returns A 256-bit vector of [4 x i64] containing the result.
2452
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453
_mm256_srli_epi64(__m256i __a, int __count)
2454
{
2455
return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2456
}
2457
2458
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459
/// right by the number of bits given in the lower 64 bits of \a __count,
2460
/// shifting in zero bits, and returns the result. If \a __count is greater
2461
/// than 63, the returned result is all zeroes.
2462
///
2463
/// \headerfile <immintrin.h>
2464
///
2465
/// This intrinsic corresponds to the \c VPSRLQ instruction.
2466
///
2467
/// \param __a
2468
/// A 256-bit vector of [4 x i64] to be shifted.
2469
/// \param __count
2470
/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471
/// shift count (in bits). The upper element is ignored.
2472
/// \returns A 256-bit vector of [4 x i64] containing the result.
2473
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474
_mm256_srl_epi64(__m256i __a, __m128i __count)
2475
{
2476
return __builtin_ia32_psrlq256((__v4di)__a, __count);
2477
}
2478
2479
/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480
/// vectors. Returns the lower 8 bits of each difference in the
2481
/// corresponding byte of the 256-bit integer vector result (overflow is
2482
/// ignored).
2483
///
2484
/// \code{.operation}
2485
/// FOR i := 0 TO 31
2486
/// j := i*8
2487
/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2488
/// ENDFOR
2489
/// \endcode
2490
///
2491
/// \headerfile <immintrin.h>
2492
///
2493
/// This intrinsic corresponds to the \c VPSUBB instruction.
2494
///
2495
/// \param __a
2496
/// A 256-bit integer vector containing the minuends.
2497
/// \param __b
2498
/// A 256-bit integer vector containing the subtrahends.
2499
/// \returns A 256-bit integer vector containing the differences.
2500
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501
_mm256_sub_epi8(__m256i __a, __m256i __b)
2502
{
2503
return (__m256i)((__v32qu)__a - (__v32qu)__b);
2504
}
2505
2506
/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507
/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508
/// the corresponding element of the [16 x i16] result (overflow is
2509
/// ignored).
2510
///
2511
/// \code{.operation}
2512
/// FOR i := 0 TO 15
2513
/// j := i*16
2514
/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2515
/// ENDFOR
2516
/// \endcode
2517
///
2518
/// \headerfile <immintrin.h>
2519
///
2520
/// This intrinsic corresponds to the \c VPSUBW instruction.
2521
///
2522
/// \param __a
2523
/// A 256-bit vector of [16 x i16] containing the minuends.
2524
/// \param __b
2525
/// A 256-bit vector of [16 x i16] containing the subtrahends.
2526
/// \returns A 256-bit vector of [16 x i16] containing the differences.
2527
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528
_mm256_sub_epi16(__m256i __a, __m256i __b)
2529
{
2530
return (__m256i)((__v16hu)__a - (__v16hu)__b);
2531
}
2532
2533
/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534
/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535
/// the corresponding element of the [8 x i32] result (overflow is ignored).
2536
///
2537
/// \code{.operation}
2538
/// FOR i := 0 TO 7
2539
/// j := i*32
2540
/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2541
/// ENDFOR
2542
/// \endcode
2543
///
2544
/// \headerfile <immintrin.h>
2545
///
2546
/// This intrinsic corresponds to the \c VPSUBD instruction.
2547
///
2548
/// \param __a
2549
/// A 256-bit vector of [8 x i32] containing the minuends.
2550
/// \param __b
2551
/// A 256-bit vector of [8 x i32] containing the subtrahends.
2552
/// \returns A 256-bit vector of [8 x i32] containing the differences.
2553
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554
_mm256_sub_epi32(__m256i __a, __m256i __b)
2555
{
2556
return (__m256i)((__v8su)__a - (__v8su)__b);
2557
}
2558
2559
/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560
/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561
/// the corresponding element of the [4 x i64] result (overflow is ignored).
2562
///
2563
/// \code{.operation}
2564
/// FOR i := 0 TO 3
2565
/// j := i*64
2566
/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2567
/// ENDFOR
2568
/// \endcode
2569
///
2570
/// \headerfile <immintrin.h>
2571
///
2572
/// This intrinsic corresponds to the \c VPSUBQ instruction.
2573
///
2574
/// \param __a
2575
/// A 256-bit vector of [4 x i64] containing the minuends.
2576
/// \param __b
2577
/// A 256-bit vector of [4 x i64] containing the subtrahends.
2578
/// \returns A 256-bit vector of [4 x i64] containing the differences.
2579
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580
_mm256_sub_epi64(__m256i __a, __m256i __b)
2581
{
2582
return (__m256i)((__v4du)__a - (__v4du)__b);
2583
}
2584
2585
/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2586
/// vectors using signed saturation, and returns each difference in the
2587
/// corresponding byte of the 256-bit integer vector result.
2588
///
2589
/// \code{.operation}
2590
/// FOR i := 0 TO 31
2591
/// j := i*8
2592
/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2593
/// ENDFOR
2594
/// \endcode
2595
///
2596
/// \headerfile <immintrin.h>
2597
///
2598
/// This intrinsic corresponds to the \c VPSUBSB instruction.
2599
///
2600
/// \param __a
2601
/// A 256-bit integer vector containing the minuends.
2602
/// \param __b
2603
/// A 256-bit integer vector containing the subtrahends.
2604
/// \returns A 256-bit integer vector containing the differences.
2605
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606
_mm256_subs_epi8(__m256i __a, __m256i __b)
2607
{
2608
return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2609
}
2610
2611
/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612
/// vectors of [16 x i16] using signed saturation, and returns each
2613
/// difference in the corresponding element of the [16 x i16] result.
2614
///
2615
/// \code{.operation}
2616
/// FOR i := 0 TO 15
2617
/// j := i*16
2618
/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2619
/// ENDFOR
2620
/// \endcode
2621
///
2622
/// \headerfile <immintrin.h>
2623
///
2624
/// This intrinsic corresponds to the \c VPSUBSW instruction.
2625
///
2626
/// \param __a
2627
/// A 256-bit vector of [16 x i16] containing the minuends.
2628
/// \param __b
2629
/// A 256-bit vector of [16 x i16] containing the subtrahends.
2630
/// \returns A 256-bit vector of [16 x i16] containing the differences.
2631
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632
_mm256_subs_epi16(__m256i __a, __m256i __b)
2633
{
2634
return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2635
}
2636
2637
/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638
/// vectors using unsigned saturation, and returns each difference in the
2639
/// corresponding byte of the 256-bit integer vector result. For each byte,
2640
/// computes <c> result = __a - __b </c>.
2641
///
2642
/// \code{.operation}
2643
/// FOR i := 0 TO 31
2644
/// j := i*8
2645
/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2646
/// ENDFOR
2647
/// \endcode
2648
///
2649
/// \headerfile <immintrin.h>
2650
///
2651
/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2652
///
2653
/// \param __a
2654
/// A 256-bit integer vector containing the minuends.
2655
/// \param __b
2656
/// A 256-bit integer vector containing the subtrahends.
2657
/// \returns A 256-bit integer vector containing the differences.
2658
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659
_mm256_subs_epu8(__m256i __a, __m256i __b)
2660
{
2661
return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2662
}
2663
2664
/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665
/// vectors of [16 x i16] using unsigned saturation, and returns each
2666
/// difference in the corresponding element of the [16 x i16] result.
2667
///
2668
/// \code{.operation}
2669
/// FOR i := 0 TO 15
2670
/// j := i*16
2671
/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2672
/// ENDFOR
2673
/// \endcode
2674
///
2675
/// \headerfile <immintrin.h>
2676
///
2677
/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2678
///
2679
/// \param __a
2680
/// A 256-bit vector of [16 x i16] containing the minuends.
2681
/// \param __b
2682
/// A 256-bit vector of [16 x i16] containing the subtrahends.
2683
/// \returns A 256-bit vector of [16 x i16] containing the differences.
2684
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685
_mm256_subs_epu16(__m256i __a, __m256i __b)
2686
{
2687
return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2688
}
2689
2690
/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691
/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692
/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693
/// input; other bits in these parameters are ignored.
2694
///
2695
/// \code{.operation}
2696
/// result[7:0] := __a[71:64]
2697
/// result[15:8] := __b[71:64]
2698
/// result[23:16] := __a[79:72]
2699
/// result[31:24] := __b[79:72]
2700
/// . . .
2701
/// result[127:120] := __b[127:120]
2702
/// result[135:128] := __a[199:192]
2703
/// . . .
2704
/// result[255:248] := __b[255:248]
2705
/// \endcode
2706
///
2707
/// \headerfile <immintrin.h>
2708
///
2709
/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2710
///
2711
/// \param __a
2712
/// A 256-bit integer vector used as the source for the even-numbered bytes
2713
/// of the result.
2714
/// \param __b
2715
/// A 256-bit integer vector used as the source for the odd-numbered bytes
2716
/// of the result.
2717
/// \returns A 256-bit integer vector containing the result.
2718
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2719
_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2720
{
2721
return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2722
}
2723
2724
/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725
/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726
/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727
/// 128-bit half of \a __a and \a __b as input; other bits in these
2728
/// parameters are ignored.
2729
///
2730
/// \code{.operation}
2731
/// result[15:0] := __a[79:64]
2732
/// result[31:16] := __b[79:64]
2733
/// result[47:32] := __a[95:80]
2734
/// result[63:48] := __b[95:80]
2735
/// . . .
2736
/// result[127:112] := __b[127:112]
2737
/// result[143:128] := __a[207:192]
2738
/// . . .
2739
/// result[255:240] := __b[255:240]
2740
/// \endcode
2741
///
2742
/// \headerfile <immintrin.h>
2743
///
2744
/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2745
///
2746
/// \param __a
2747
/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748
/// elements of the result.
2749
/// \param __b
2750
/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751
/// elements of the result.
2752
/// \returns A 256-bit vector of [16 x i16] containing the result.
2753
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2754
_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2755
{
2756
return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2757
}
2758
2759
/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760
/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761
/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762
/// of \a __a and \a __b as input; other bits in these parameters are
2763
/// ignored.
2764
///
2765
/// \code{.operation}
2766
/// result[31:0] := __a[95:64]
2767
/// result[63:32] := __b[95:64]
2768
/// result[95:64] := __a[127:96]
2769
/// result[127:96] := __b[127:96]
2770
/// result[159:128] := __a[223:192]
2771
/// result[191:160] := __b[223:192]
2772
/// result[223:192] := __a[255:224]
2773
/// result[255:224] := __b[255:224]
2774
/// \endcode
2775
///
2776
/// \headerfile <immintrin.h>
2777
///
2778
/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2779
///
2780
/// \param __a
2781
/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782
/// elements of the result.
2783
/// \param __b
2784
/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785
/// elements of the result.
2786
/// \returns A 256-bit vector of [8 x i32] containing the result.
2787
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2788
_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2789
{
2790
return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2791
}
2792
2793
/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794
/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795
/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796
/// of \a __a and \a __b as input; other bits in these parameters are
2797
/// ignored.
2798
///
2799
/// \code{.operation}
2800
/// result[63:0] := __a[127:64]
2801
/// result[127:64] := __b[127:64]
2802
/// result[191:128] := __a[255:192]
2803
/// result[255:192] := __b[255:192]
2804
/// \endcode
2805
///
2806
/// \headerfile <immintrin.h>
2807
///
2808
/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2809
///
2810
/// \param __a
2811
/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812
/// elements of the result.
2813
/// \param __b
2814
/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815
/// elements of the result.
2816
/// \returns A 256-bit vector of [4 x i64] containing the result.
2817
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2818
_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2819
{
2820
return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2821
}
2822
2823
/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824
/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825
/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826
/// input; other bits in these parameters are ignored.
2827
///
2828
/// \code{.operation}
2829
/// result[7:0] := __a[7:0]
2830
/// result[15:8] := __b[7:0]
2831
/// result[23:16] := __a[15:8]
2832
/// result[31:24] := __b[15:8]
2833
/// . . .
2834
/// result[127:120] := __b[63:56]
2835
/// result[135:128] := __a[135:128]
2836
/// . . .
2837
/// result[255:248] := __b[191:184]
2838
/// \endcode
2839
///
2840
/// \headerfile <immintrin.h>
2841
///
2842
/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2843
///
2844
/// \param __a
2845
/// A 256-bit integer vector used as the source for the even-numbered bytes
2846
/// of the result.
2847
/// \param __b
2848
/// A 256-bit integer vector used as the source for the odd-numbered bytes
2849
/// of the result.
2850
/// \returns A 256-bit integer vector containing the result.
2851
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2852
_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2853
{
2854
return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2855
}
2856
2857
/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858
/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859
/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860
/// 128-bit half of \a __a and \a __b as input; other bits in these
2861
/// parameters are ignored.
2862
///
2863
/// \code{.operation}
2864
/// result[15:0] := __a[15:0]
2865
/// result[31:16] := __b[15:0]
2866
/// result[47:32] := __a[31:16]
2867
/// result[63:48] := __b[31:16]
2868
/// . . .
2869
/// result[127:112] := __b[63:48]
2870
/// result[143:128] := __a[143:128]
2871
/// . . .
2872
/// result[255:240] := __b[191:176]
2873
/// \endcode
2874
///
2875
/// \headerfile <immintrin.h>
2876
///
2877
/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2878
///
2879
/// \param __a
2880
/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881
/// elements of the result.
2882
/// \param __b
2883
/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884
/// elements of the result.
2885
/// \returns A 256-bit vector of [16 x i16] containing the result.
2886
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2887
_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2888
{
2889
return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2890
}
2891
2892
/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893
/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894
/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895
/// of \a __a and \a __b as input; other bits in these parameters are
2896
/// ignored.
2897
///
2898
/// \code{.operation}
2899
/// result[31:0] := __a[31:0]
2900
/// result[63:32] := __b[31:0]
2901
/// result[95:64] := __a[63:32]
2902
/// result[127:96] := __b[63:32]
2903
/// result[159:128] := __a[159:128]
2904
/// result[191:160] := __b[159:128]
2905
/// result[223:192] := __a[191:160]
2906
/// result[255:224] := __b[191:160]
2907
/// \endcode
2908
///
2909
/// \headerfile <immintrin.h>
2910
///
2911
/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2912
///
2913
/// \param __a
2914
/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915
/// elements of the result.
2916
/// \param __b
2917
/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918
/// elements of the result.
2919
/// \returns A 256-bit vector of [8 x i32] containing the result.
2920
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2921
_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2922
{
2923
return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2924
}
2925
2926
/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927
/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928
/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929
/// of \a __a and \a __b as input; other bits in these parameters are
2930
/// ignored.
2931
///
2932
/// \code{.operation}
2933
/// result[63:0] := __a[63:0]
2934
/// result[127:64] := __b[63:0]
2935
/// result[191:128] := __a[191:128]
2936
/// result[255:192] := __b[191:128]
2937
/// \endcode
2938
///
2939
/// \headerfile <immintrin.h>
2940
///
2941
/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2942
///
2943
/// \param __a
2944
/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945
/// elements of the result.
2946
/// \param __b
2947
/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948
/// elements of the result.
2949
/// \returns A 256-bit vector of [4 x i64] containing the result.
2950
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2951
_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2952
{
2953
return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2954
}
2955
2956
/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2957
/// \a __b.
2958
///
2959
/// \headerfile <immintrin.h>
2960
///
2961
/// This intrinsic corresponds to the \c VPXOR instruction.
2962
///
2963
/// \param __a
2964
/// A 256-bit integer vector.
2965
/// \param __b
2966
/// A 256-bit integer vector.
2967
/// \returns A 256-bit integer vector containing the result.
2968
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969
_mm256_xor_si256(__m256i __a, __m256i __b)
2970
{
2971
return (__m256i)((__v4du)__a ^ (__v4du)__b);
2972
}
2973
2974
/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975
/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2976
/// boundary.
2977
///
2978
/// \headerfile <immintrin.h>
2979
///
2980
/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2981
///
2982
/// \param __V
2983
/// A pointer to the 32-byte aligned memory containing the vector to load.
2984
/// \returns A 256-bit integer vector loaded from memory.
2985
static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986
_mm256_stream_load_si256(const void *__V)
2987
{
2988
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2989
return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2990
}
2991
2992
/// Broadcasts the 32-bit floating-point value from the low element of the
2993
/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2994
/// 128-bit vector of [4 x float].
2995
///
2996
/// \headerfile <immintrin.h>
2997
///
2998
/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2999
///
3000
/// \param __X
3001
/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3002
/// \returns A 128-bit vector of [4 x float] containing the result.
3003
static __inline__ __m128 __DEFAULT_FN_ATTRS128
3004
_mm_broadcastss_ps(__m128 __X)
3005
{
3006
return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3007
}
3008
3009
/// Broadcasts the 64-bit floating-point value from the low element of the
3010
/// 128-bit vector of [2 x double] in \a __a to both elements of the
3011
/// result's 128-bit vector of [2 x double].
3012
///
3013
/// \headerfile <immintrin.h>
3014
///
3015
/// This intrinsic corresponds to the \c MOVDDUP instruction.
3016
///
3017
/// \param __a
3018
/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3019
/// \returns A 128-bit vector of [2 x double] containing the result.
3020
static __inline__ __m128d __DEFAULT_FN_ATTRS128
3021
_mm_broadcastsd_pd(__m128d __a)
3022
{
3023
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3024
}
3025
3026
/// Broadcasts the 32-bit floating-point value from the low element of the
3027
/// 128-bit vector of [4 x float] in \a __X to all elements of the
3028
/// result's 256-bit vector of [8 x float].
3029
///
3030
/// \headerfile <immintrin.h>
3031
///
3032
/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3033
///
3034
/// \param __X
3035
/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3036
/// \returns A 256-bit vector of [8 x float] containing the result.
3037
static __inline__ __m256 __DEFAULT_FN_ATTRS256
3038
_mm256_broadcastss_ps(__m128 __X)
3039
{
3040
return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3041
}
3042
3043
/// Broadcasts the 64-bit floating-point value from the low element of the
3044
/// 128-bit vector of [2 x double] in \a __X to all elements of the
3045
/// result's 256-bit vector of [4 x double].
3046
///
3047
/// \headerfile <immintrin.h>
3048
///
3049
/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3050
///
3051
/// \param __X
3052
/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3053
/// \returns A 256-bit vector of [4 x double] containing the result.
3054
static __inline__ __m256d __DEFAULT_FN_ATTRS256
3055
_mm256_broadcastsd_pd(__m128d __X)
3056
{
3057
return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3058
}
3059
3060
/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061
/// upper halves of the 256-bit result.
3062
///
3063
/// \headerfile <immintrin.h>
3064
///
3065
/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3066
///
3067
/// \param __X
3068
/// A 128-bit integer vector to be broadcast.
3069
/// \returns A 256-bit integer vector containing the result.
3070
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3071
_mm256_broadcastsi128_si256(__m128i __X)
3072
{
3073
return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3074
}
3075
3076
/* Alias: expands to a call to _mm256_broadcastsi128_si256 with the same
   argument. */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3077
3078
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
3079
/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3080
/// as specified by the immediate integer operand \a M.
3081
///
3082
/// \code{.operation}
3083
/// FOR i := 0 TO 3
3084
/// j := i*32
3085
/// IF M[i] == 0
3086
/// result[31+j:j] := V1[31+j:j]
3087
/// ELSE
3088
/// result[31+j:j] := V2[31+j:j]
3089
/// FI
3090
/// ENDFOR
3091
/// \endcode
3092
///
3093
/// \headerfile <immintrin.h>
3094
///
3095
/// \code
3096
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3097
/// \endcode
3098
///
3099
/// This intrinsic corresponds to the \c VPBLENDD instruction.
3100
///
3101
/// \param V1
3102
/// A 128-bit vector of [4 x i32] containing source values.
3103
/// \param V2
3104
/// A 128-bit vector of [4 x i32] containing source values.
3105
/// \param M
3106
/// An immediate 8-bit integer operand, with bits [3:0] specifying the
3107
/// source for each element of the result. The position of the mask bit
3108
/// corresponds to the index of a copied value. When a mask bit is 0, the
3109
/// element is copied from \a V1; otherwise, it is copied from \a V2.
3110
/// \returns A 128-bit vector of [4 x i32] containing the result.
3111
/* Function-like macro: V1 and V2 are cast to __m128i and then __v4si, and M
   is cast to int, before being passed to the builtin. */
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128( \
      (__v4si)(__m128i)(V1), \
      (__v4si)(__m128i)(V2), \
      (int)(M)))
3114
3115
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
3116
/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3117
/// as specified by the immediate integer operand \a M.
3118
///
3119
/// \code{.operation}
3120
/// FOR i := 0 TO 7
3121
/// j := i*32
3122
/// IF M[i] == 0
3123
/// result[31+j:j] := V1[31+j:j]
3124
/// ELSE
3125
/// result[31+j:j] := V2[31+j:j]
3126
/// FI
3127
/// ENDFOR
3128
/// \endcode
3129
///
3130
/// \headerfile <immintrin.h>
3131
///
3132
/// \code
3133
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3134
/// \endcode
3135
///
3136
/// This intrinsic corresponds to the \c VPBLENDD instruction.
3137
///
3138
/// \param V1
3139
/// A 256-bit vector of [8 x i32] containing source values.
3140
/// \param V2
3141
/// A 256-bit vector of [8 x i32] containing source values.
3142
/// \param M
3143
/// An immediate 8-bit integer operand, with bits [7:0] specifying the
3144
/// source for each element of the result. The position of the mask bit
3145
/// corresponds to the index of a copied value. When a mask bit is 0, the
3146
/// element is copied from \a V1; otherwise, it is copied from \a V2.
3147
/// \returns A 256-bit vector of [8 x i32] containing the result.
3148
/* Function-like macro: V1 and V2 are cast to __m256i and then __v8si, and M
   is cast to int, before being passed to the builtin. */
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (int)(M)))
3151
3152
/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153
/// bytes of the 256-bit result.
3154
///
3155
/// \headerfile <immintrin.h>
3156
///
3157
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3158
///
3159
/// \param __X
3160
/// A 128-bit integer vector whose low byte will be broadcast.
3161
/// \returns A 256-bit integer vector containing the result.
3162
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3163
_mm256_broadcastb_epi8(__m128i __X)
3164
{
3165
return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3166
}
3167
3168
/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169
/// to all elements of the result's 256-bit vector of [16 x i16].
3170
///
3171
/// \headerfile <immintrin.h>
3172
///
3173
/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3174
///
3175
/// \param __X
3176
/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177
/// \returns A 256-bit vector of [16 x i16] containing the result.
3178
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3179
_mm256_broadcastw_epi16(__m128i __X)
3180
{
3181
return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3182
}
3183
3184
/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185
/// to all elements of the result's 256-bit vector of [8 x i32].
3186
///
3187
/// \headerfile <immintrin.h>
3188
///
3189
/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3190
///
3191
/// \param __X
3192
/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193
/// \returns A 256-bit vector of [8 x i32] containing the result.
3194
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3195
_mm256_broadcastd_epi32(__m128i __X)
3196
{
3197
return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3198
}
3199
3200
/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201
/// to all elements of the result's 256-bit vector of [4 x i64].
3202
///
3203
/// \headerfile <immintrin.h>
3204
///
3205
/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3206
///
3207
/// \param __X
3208
/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209
/// \returns A 256-bit vector of [4 x i64] containing the result.
3210
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3211
_mm256_broadcastq_epi64(__m128i __X)
3212
{
3213
return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3214
}
3215
3216
/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217
/// bytes of the 128-bit result.
3218
///
3219
/// \headerfile <immintrin.h>
3220
///
3221
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3222
///
3223
/// \param __X
3224
/// A 128-bit integer vector whose low byte will be broadcast.
3225
/// \returns A 128-bit integer vector containing the result.
3226
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3227
_mm_broadcastb_epi8(__m128i __X)
3228
{
3229
return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3230
}
3231
3232
/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233
/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3234
///
3235
/// \headerfile <immintrin.h>
3236
///
3237
/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3238
///
3239
/// \param __X
3240
/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241
/// \returns A 128-bit vector of [8 x i16] containing the result.
3242
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3243
_mm_broadcastw_epi16(__m128i __X)
3244
{
3245
return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3246
}
3247
3248
/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3249
/// to all elements of the result's vector of [4 x i32].
3250
///
3251
/// \headerfile <immintrin.h>
3252
///
3253
/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3254
///
3255
/// \param __X
3256
/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257
/// \returns A 128-bit vector of [4 x i32] containing the result.
3258
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3259
_mm_broadcastd_epi32(__m128i __X)
3260
{
3261
return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3262
}
3263
3264
/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265
/// to both elements of the result's 128-bit vector of [2 x i64].
3266
///
3267
/// \headerfile <immintrin.h>
3268
///
3269
/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3270
///
3271
/// \param __X
3272
/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273
/// \returns A 128-bit vector of [2 x i64] containing the result.
3274
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3275
_mm_broadcastq_epi64(__m128i __X)
3276
{
3277
return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3278
}
3279
3280
/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281
/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282
/// elements of the 256-bit vector of [8 x i32] in \a __b.
3283
///
3284
/// \code{.operation}
3285
/// FOR i := 0 TO 7
3286
/// j := i*32
3287
/// k := __b[j+2:j] * 32
3288
/// result[j+31:j] := __a[k+31:k]
3289
/// ENDFOR
3290
/// \endcode
3291
///
3292
/// \headerfile <immintrin.h>
3293
///
3294
/// This intrinsic corresponds to the \c VPERMD instruction.
3295
///
3296
/// \param __a
3297
/// A 256-bit vector of [8 x i32] containing the source values.
3298
/// \param __b
3299
/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3300
/// \a __a.
3301
/// \returns A 256-bit vector of [8 x i32] containing the result.
3302
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3303
_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3304
{
3305
return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3306
}
3307
3308
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3337
3338
/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339
/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340
/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3341
///
3342
/// \code{.operation}
3343
/// FOR i := 0 TO 7
3344
/// j := i*32
3345
/// k := __b[j+2:j] * 32
3346
/// result[j+31:j] := __a[k+31:k]
3347
/// ENDFOR
3348
/// \endcode
3349
///
3350
/// \headerfile <immintrin.h>
3351
///
3352
/// This intrinsic corresponds to the \c VPERMPS instruction.
3353
///
3354
/// \param __a
3355
/// A 256-bit vector of [8 x float] containing the source values.
3356
/// \param __b
3357
/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3358
/// \a __a.
3359
/// \returns A 256-bit vector of [8 x float] containing the result.
3360
static __inline__ __m256 __DEFAULT_FN_ATTRS256
3361
_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3362
{
3363
return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3364
}
3365
3366
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements of
///    the 256-bit vector of [4 x i64] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3395
3396
/// Sets each half of the 256-bit result either to zero or to one of the
///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
///    as specified by the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3441
3442
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3461
3462
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))
3485
3486
/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487
/// the most significant bit of the corresponding element in the mask
3488
/// \a __M is set; otherwise, sets that element of the result to zero.
3489
/// Returns the 256-bit [8 x i32] result.
3490
///
3491
/// \code{.operation}
3492
/// FOR i := 0 TO 7
3493
/// j := i*32
3494
/// IF __M[j+31] == 1
3495
/// result[j+31:j] := Load32(__X+(i*4))
3496
/// ELSE
3497
/// result[j+31:j] := 0
3498
/// FI
3499
/// ENDFOR
3500
/// \endcode
3501
///
3502
/// \headerfile <immintrin.h>
3503
///
3504
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3505
///
3506
/// \param __X
3507
/// A pointer to the memory used for loading values.
3508
/// \param __M
3509
/// A 256-bit vector of [8 x i32] containing the mask bits.
3510
/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3511
/// elements.
3512
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513
_mm256_maskload_epi32(int const *__X, __m256i __M)
3514
{
3515
return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3516
}
3517
3518
/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519
/// the most significant bit of the corresponding element in the mask
3520
/// \a __M is set; otherwise, sets that element of the result to zero.
3521
/// Returns the 256-bit [4 x i64] result.
3522
///
3523
/// \code{.operation}
3524
/// FOR i := 0 TO 3
3525
/// j := i*64
3526
/// IF __M[j+63] == 1
3527
/// result[j+63:j] := Load64(__X+(i*8))
3528
/// ELSE
3529
/// result[j+63:j] := 0
3530
/// FI
3531
/// ENDFOR
3532
/// \endcode
3533
///
3534
/// \headerfile <immintrin.h>
3535
///
3536
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3537
///
3538
/// \param __X
3539
/// A pointer to the memory used for loading values.
3540
/// \param __M
3541
/// A 256-bit vector of [4 x i64] containing the mask bits.
3542
/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3543
/// elements.
3544
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545
_mm256_maskload_epi64(long long const *__X, __m256i __M)
3546
{
3547
return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3548
}
3549
3550
/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551
/// the most significant bit of the corresponding element in the mask
3552
/// \a __M is set; otherwise, sets that element of the result to zero.
3553
/// Returns the 128-bit [4 x i32] result.
3554
///
3555
/// \code{.operation}
3556
/// FOR i := 0 TO 3
3557
/// j := i*32
3558
/// IF __M[j+31] == 1
3559
/// result[j+31:j] := Load32(__X+(i*4))
3560
/// ELSE
3561
/// result[j+31:j] := 0
3562
/// FI
3563
/// ENDFOR
3564
/// \endcode
3565
///
3566
/// \headerfile <immintrin.h>
3567
///
3568
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3569
///
3570
/// \param __X
3571
/// A pointer to the memory used for loading values.
3572
/// \param __M
3573
/// A 128-bit vector of [4 x i32] containing the mask bits.
3574
/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3575
/// elements.
3576
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577
_mm_maskload_epi32(int const *__X, __m128i __M)
3578
{
3579
return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3580
}
3581
3582
/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583
/// the most significant bit of the corresponding element in the mask
3584
/// \a __M is set; otherwise, sets that element of the result to zero.
3585
/// Returns the 128-bit [2 x i64] result.
3586
///
3587
/// \code{.operation}
3588
/// FOR i := 0 TO 1
3589
/// j := i*64
3590
/// IF __M[j+63] == 1
3591
/// result[j+63:j] := Load64(__X+(i*8))
3592
/// ELSE
3593
/// result[j+63:j] := 0
3594
/// FI
3595
/// ENDFOR
3596
/// \endcode
3597
///
3598
/// \headerfile <immintrin.h>
3599
///
3600
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3601
///
3602
/// \param __X
3603
/// A pointer to the memory used for loading values.
3604
/// \param __M
3605
/// A 128-bit vector of [2 x i64] containing the mask bits.
3606
/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3607
/// elements.
3608
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609
_mm_maskload_epi64(long long const *__X, __m128i __M)
3610
{
3611
return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3612
}
3613
3614
/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615
/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616
/// the corresponding element in the mask \a __M is set; otherwise, the
3617
/// memory element is unchanged.
3618
///
3619
/// \code{.operation}
3620
/// FOR i := 0 TO 7
3621
/// j := i*32
3622
/// IF __M[j+31] == 1
3623
/// Store32(__X+(i*4), __Y[j+31:j])
3624
/// FI
3625
/// ENDFOR
3626
/// \endcode
3627
///
3628
/// \headerfile <immintrin.h>
3629
///
3630
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3631
///
3632
/// \param __X
3633
/// A pointer to the memory used for storing values.
3634
/// \param __M
3635
/// A 256-bit vector of [8 x i32] containing the mask bits.
3636
/// \param __Y
3637
/// A 256-bit vector of [8 x i32] containing the values to store.
3638
static __inline__ void __DEFAULT_FN_ATTRS256
3639
_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3640
{
3641
__builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3642
}
3643
3644
/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645
/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646
/// the corresponding element in the mask \a __M is set; otherwise, the
3647
/// memory element is unchanged.
3648
///
3649
/// \code{.operation}
3650
/// FOR i := 0 TO 3
3651
/// j := i*64
3652
/// IF __M[j+63] == 1
3653
/// Store64(__X+(i*8), __Y[j+63:j])
3654
/// FI
3655
/// ENDFOR
3656
/// \endcode
3657
///
3658
/// \headerfile <immintrin.h>
3659
///
3660
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3661
///
3662
/// \param __X
3663
/// A pointer to the memory used for storing values.
3664
/// \param __M
3665
/// A 256-bit vector of [4 x i64] containing the mask bits.
3666
/// \param __Y
3667
/// A 256-bit vector of [4 x i64] containing the values to store.
3668
static __inline__ void __DEFAULT_FN_ATTRS256
3669
_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3670
{
3671
__builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3672
}
3673
3674
/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675
/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676
/// the corresponding element in the mask \a __M is set; otherwise, the
3677
/// memory element is unchanged.
3678
///
3679
/// \code{.operation}
3680
/// FOR i := 0 TO 3
3681
/// j := i*32
3682
/// IF __M[j+31] == 1
3683
/// Store32(__X+(i*4), __Y[j+31:j])
3684
/// FI
3685
/// ENDFOR
3686
/// \endcode
3687
///
3688
/// \headerfile <immintrin.h>
3689
///
3690
/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3691
///
3692
/// \param __X
3693
/// A pointer to the memory used for storing values.
3694
/// \param __M
3695
/// A 128-bit vector of [4 x i32] containing the mask bits.
3696
/// \param __Y
3697
/// A 128-bit vector of [4 x i32] containing the values to store.
3698
static __inline__ void __DEFAULT_FN_ATTRS128
3699
_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3700
{
3701
__builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3702
}
3703
3704
/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705
/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706
/// the corresponding element in the mask \a __M is set; otherwise, the
3707
/// memory element is unchanged.
3708
///
3709
/// \code{.operation}
3710
/// FOR i := 0 TO 1
3711
/// j := i*64
3712
/// IF __M[j+63] == 1
3713
/// Store64(__X+(i*8), __Y[j+63:j])
3714
/// FI
3715
/// ENDFOR
3716
/// \endcode
3717
///
3718
/// \headerfile <immintrin.h>
3719
///
3720
/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3721
///
3722
/// \param __X
3723
/// A pointer to the memory used for storing values.
3724
/// \param __M
3725
/// A 128-bit vector of [2 x i64] containing the mask bits.
3726
/// \param __Y
3727
/// A 128-bit vector of [2 x i64] containing the values to store.
3728
static __inline__ void __DEFAULT_FN_ATTRS128
3729
_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3730
{
3731
__builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3732
}
3733
3734
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735
/// left by the number of bits given in the corresponding element of the
3736
/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737
/// returns the result. If the shift count for any element is greater than
3738
/// 31, the result for that element is zero.
3739
///
3740
/// \headerfile <immintrin.h>
3741
///
3742
/// This intrinsic corresponds to the \c VPSLLVD instruction.
3743
///
3744
/// \param __X
3745
/// A 256-bit vector of [8 x i32] to be shifted.
3746
/// \param __Y
3747
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3748
/// bits).
3749
/// \returns A 256-bit vector of [8 x i32] containing the result.
3750
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3752
{
3753
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3754
}
3755
3756
/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757
/// left by the number of bits given in the corresponding element of the
3758
/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759
/// returns the result. If the shift count for any element is greater than
3760
/// 31, the result for that element is zero.
3761
///
3762
/// \headerfile <immintrin.h>
3763
///
3764
/// This intrinsic corresponds to the \c VPSLLVD instruction.
3765
///
3766
/// \param __X
3767
/// A 128-bit vector of [4 x i32] to be shifted.
3768
/// \param __Y
3769
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3770
/// bits).
3771
/// \returns A 128-bit vector of [4 x i32] containing the result.
3772
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773
_mm_sllv_epi32(__m128i __X, __m128i __Y)
3774
{
3775
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3776
}
3777
3778
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779
/// left by the number of bits given in the corresponding element of the
3780
/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781
/// returns the result. If the shift count for any element is greater than
3782
/// 63, the result for that element is zero.
3783
///
3784
/// \headerfile <immintrin.h>
3785
///
3786
/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3787
///
3788
/// \param __X
3789
/// A 256-bit vector of [4 x i64] to be shifted.
3790
/// \param __Y
3791
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3792
/// bits).
3793
/// \returns A 256-bit vector of [4 x i64] containing the result.
3794
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3796
{
3797
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3798
}
3799
3800
/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801
/// left by the number of bits given in the corresponding element of the
3802
/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803
/// returns the result. If the shift count for any element is greater than
3804
/// 63, the result for that element is zero.
3805
///
3806
/// \headerfile <immintrin.h>
3807
///
3808
/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3809
///
3810
/// \param __X
3811
/// A 128-bit vector of [2 x i64] to be shifted.
3812
/// \param __Y
3813
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3814
/// bits).
3815
/// \returns A 128-bit vector of [2 x i64] containing the result.
3816
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817
_mm_sllv_epi64(__m128i __X, __m128i __Y)
3818
{
3819
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3820
}
3821
3822
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823
/// right by the number of bits given in the corresponding element of the
3824
/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825
/// returns the result. If the shift count for any element is greater than
3826
/// 31, the result for that element is 0 or -1 according to the sign bit
3827
/// for that element.
3828
///
3829
/// \headerfile <immintrin.h>
3830
///
3831
/// This intrinsic corresponds to the \c VPSRAVD instruction.
3832
///
3833
/// \param __X
3834
/// A 256-bit vector of [8 x i32] to be shifted.
3835
/// \param __Y
3836
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3837
/// bits).
3838
/// \returns A 256-bit vector of [8 x i32] containing the result.
3839
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840
_mm256_srav_epi32(__m256i __X, __m256i __Y)
3841
{
3842
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3843
}
3844
3845
/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846
/// right by the number of bits given in the corresponding element of the
3847
/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848
/// returns the result. If the shift count for any element is greater than
3849
/// 31, the result for that element is 0 or -1 according to the sign bit
3850
/// for that element.
3851
///
3852
/// \headerfile <immintrin.h>
3853
///
3854
/// This intrinsic corresponds to the \c VPSRAVD instruction.
3855
///
3856
/// \param __X
3857
/// A 128-bit vector of [4 x i32] to be shifted.
3858
/// \param __Y
3859
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3860
/// bits).
3861
/// \returns A 128-bit vector of [4 x i32] containing the result.
3862
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863
_mm_srav_epi32(__m128i __X, __m128i __Y)
3864
{
3865
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3866
}
3867
3868
/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869
/// right by the number of bits given in the corresponding element of the
3870
/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871
/// returns the result. If the shift count for any element is greater than
3872
/// 31, the result for that element is zero.
3873
///
3874
/// \headerfile <immintrin.h>
3875
///
3876
/// This intrinsic corresponds to the \c VPSRLVD instruction.
3877
///
3878
/// \param __X
3879
/// A 256-bit vector of [8 x i32] to be shifted.
3880
/// \param __Y
3881
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3882
/// bits).
3883
/// \returns A 256-bit vector of [8 x i32] containing the result.
3884
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3886
{
3887
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3888
}
3889
3890
/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891
/// right by the number of bits given in the corresponding element of the
3892
/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893
/// returns the result. If the shift count for any element is greater than
3894
/// 31, the result for that element is zero.
3895
///
3896
/// \headerfile <immintrin.h>
3897
///
3898
/// This intrinsic corresponds to the \c VPSRLVD instruction.
3899
///
3900
/// \param __X
3901
/// A 128-bit vector of [4 x i32] to be shifted.
3902
/// \param __Y
3903
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3904
/// bits).
3905
/// \returns A 128-bit vector of [4 x i32] containing the result.
3906
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907
_mm_srlv_epi32(__m128i __X, __m128i __Y)
3908
{
3909
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3910
}
3911
3912
/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913
/// right by the number of bits given in the corresponding element of the
3914
/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915
/// returns the result. If the shift count for any element is greater than
3916
/// 63, the result for that element is zero.
3917
///
3918
/// \headerfile <immintrin.h>
3919
///
3920
/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3921
///
3922
/// \param __X
3923
/// A 256-bit vector of [4 x i64] to be shifted.
3924
/// \param __Y
3925
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3926
/// bits).
3927
/// \returns A 256-bit vector of [4 x i64] containing the result.
3928
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3930
{
3931
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3932
}
3933
3934
/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935
/// right by the number of bits given in the corresponding element of the
3936
/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937
/// returns the result. If the shift count for any element is greater than
3938
/// 63, the result for that element is zero.
3939
///
3940
/// \headerfile <immintrin.h>
3941
///
3942
/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3943
///
3944
/// \param __X
3945
/// A 128-bit vector of [2 x i64] to be shifted.
3946
/// \param __Y
3947
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3948
/// bits).
3949
/// \returns A 128-bit vector of [2 x i64] containing the result.
3950
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951
_mm_srlv_epi64(__m128i __X, __m128i __Y)
3952
{
3953
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3954
}
3955
3956
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4004
4005
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
4052
4053
/// Conditionally gathers two 64-bit floating-point values, either from the
4054
/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055
/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056
/// of [2 x double] in \a mask determines the source for each element.
4057
///
4058
/// \code{.operation}
4059
/// FOR element := 0 to 1
4060
/// j := element*64
4061
/// k := element*64
4062
/// IF mask[j+63] == 0
4063
/// result[j+63:j] := a[j+63:j]
4064
/// ELSE
4065
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4066
/// FI
4067
/// ENDFOR
4068
/// \endcode
4069
///
4070
/// \headerfile <immintrin.h>
4071
///
4072
/// \code
4073
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074
/// __m128d mask, const int s);
4075
/// \endcode
4076
///
4077
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4078
///
4079
/// \param a
4080
/// A 128-bit vector of [2 x double] used as the source when a mask bit is
4081
/// zero.
4082
/// \param m
4083
/// A pointer to the memory used for loading values.
4084
/// \param i
4085
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4086
/// \param mask
4087
/// A 128-bit vector of [2 x double] containing the mask. The most
4088
/// significant bit of each element in the mask vector represents the mask
4089
/// bits. If a mask bit is zero, the corresponding value from vector \a a
4090
/// is gathered; otherwise the value is loaded from memory.
4091
/// \param s
4092
/// A literal constant scale factor for the indexes in \a i. Must be
4093
/// 1, 2, 4, or 8.
4094
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4095
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_pd call. The
 * vector arguments a, i and mask are each cast to their public vector type and
 * then to the element-typed vector handed to the builtin; m is cast to
 * double const *. The continuation lines must stay contiguous. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4100
4101
/// Conditionally gathers four 64-bit floating-point values, either from the
4102
/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103
/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104
/// of [4 x double] in \a mask determines the source for each element.
4105
///
4106
/// \code{.operation}
4107
/// FOR element := 0 to 3
4108
/// j := element*64
4109
/// k := element*64
4110
/// IF mask[j+63] == 0
4111
/// result[j+63:j] := a[j+63:j]
4112
/// ELSE
4113
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4114
/// FI
4115
/// ENDFOR
4116
/// \endcode
4117
///
4118
/// \headerfile <immintrin.h>
4119
///
4120
/// \code
4121
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122
/// __m256d mask, const int s);
4123
/// \endcode
4124
///
4125
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4126
///
4127
/// \param a
4128
/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4129
/// zero.
4130
/// \param m
4131
/// A pointer to the memory used for loading values.
4132
/// \param i
4133
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4134
/// \param mask
4135
/// A 256-bit vector of [4 x double] containing the mask. The most
4136
/// significant bit of each element in the mask vector represents the mask
4137
/// bits. If a mask bit is zero, the corresponding value from vector \a a
4138
/// is gathered; otherwise the value is loaded from memory.
4139
/// \param s
4140
/// A literal constant scale factor for the indexes in \a i. Must be
4141
/// 1, 2, 4, or 8.
4142
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4143
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_pd256 call. The
 * vector arguments a, i and mask are each cast to their public vector type and
 * then to the element-typed vector handed to the builtin; m is cast to
 * double const *. The continuation lines must stay contiguous. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
4148
4149
/// Conditionally gathers four 32-bit floating-point values, either from the
4150
/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152
/// of [4 x float] in \a mask determines the source for each element.
4153
///
4154
/// \code{.operation}
4155
/// FOR element := 0 to 3
4156
/// j := element*32
4157
/// k := element*32
4158
/// IF mask[j+31] == 0
4159
/// result[j+31:j] := a[j+31:j]
4160
/// ELSE
4161
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4162
/// FI
4163
/// ENDFOR
4164
/// \endcode
4165
///
4166
/// \headerfile <immintrin.h>
4167
///
4168
/// \code
4169
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170
/// __m128 mask, const int s);
4171
/// \endcode
4172
///
4173
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4174
///
4175
/// \param a
4176
/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4177
/// zero.
4178
/// \param m
4179
/// A pointer to the memory used for loading values.
4180
/// \param i
4181
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4182
/// \param mask
4183
/// A 128-bit vector of [4 x float] containing the mask. The most
4184
/// significant bit of each element in the mask vector represents the mask
4185
/// bits. If a mask bit is zero, the corresponding value from vector \a a
4186
/// is gathered; otherwise the value is loaded from memory.
4187
/// \param s
4188
/// A literal constant scale factor for the indexes in \a i. Must be
4189
/// 1, 2, 4, or 8.
4190
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4191
/* Thin wrapper: expands to a single __builtin_ia32_gatherd_ps call. The
 * vector arguments a, i and mask are each cast to their public vector type and
 * then to the element-typed vector handed to the builtin; m is cast to
 * float const *. The continuation lines must stay contiguous. */
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
4196
4197
/// Conditionally gathers eight 32-bit floating-point values, either from the
4198
/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199
/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200
/// of [8 x float] in \a mask determines the source for each element.
4201
///
4202
/// \code{.operation}
4203
/// FOR element := 0 to 7
4204
/// j := element*32
4205
/// k := element*32
4206
/// IF mask[j+31] == 0
4207
/// result[j+31:j] := a[j+31:j]
4208
/// ELSE
4209
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4210
/// FI
4211
/// ENDFOR
4212
/// \endcode
4213
///
4214
/// \headerfile <immintrin.h>
4215
///
4216
/// \code
4217
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218
/// __m256 mask, const int s);
4219
/// \endcode
4220
///
4221
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4222
///
4223
/// \param a
4224
/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4225
/// zero.
4226
/// \param m
4227
/// A pointer to the memory used for loading values.
4228
/// \param i
4229
/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4230
/// \param mask
4231
/// A 256-bit vector of [8 x float] containing the mask. The most
4232
/// significant bit of each element in the mask vector represents the mask
4233
/// bits. If a mask bit is zero, the corresponding value from vector \a a
4234
/// is gathered; otherwise the value is loaded from memory.
4235
/// \param s
4236
/// A literal constant scale factor for the indexes in \a i. Must be
4237
/// 1, 2, 4, or 8.
4238
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4239
/* Thin wrapper: expands to a single __builtin_ia32_gatherd_ps256 call. The
 * vector arguments a, i and mask are each cast to their public vector type and
 * then to the element-typed vector handed to the builtin; m is cast to
 * float const *. The continuation lines must stay contiguous. */
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))
4244
4245
/// Conditionally gathers two 32-bit floating-point values, either from the
4246
/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247
/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248
/// of [4 x float] in \a mask determines the source for the lower two
4249
/// elements. The upper two elements of the result are zeroed.
4250
///
4251
/// \code{.operation}
4252
/// FOR element := 0 to 1
4253
/// j := element*32
4254
/// k := element*64
4255
/// IF mask[j+31] == 0
4256
/// result[j+31:j] := a[j+31:j]
4257
/// ELSE
4258
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4259
/// FI
4260
/// ENDFOR
4261
/// result[127:64] := 0
4262
/// \endcode
4263
///
4264
/// \headerfile <immintrin.h>
4265
///
4266
/// \code
4267
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268
/// __m128 mask, const int s);
4269
/// \endcode
4270
///
4271
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4272
///
4273
/// \param a
4274
/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4275
/// zero. Only the first two elements are used.
4276
/// \param m
4277
/// A pointer to the memory used for loading values.
4278
/// \param i
4279
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4280
/// \param mask
4281
/// A 128-bit vector of [4 x float] containing the mask. The most
4282
/// significant bit of each element in the mask vector represents the mask
4283
/// bits. If a mask bit is zero, the corresponding value from vector \a a
4284
/// is gathered; otherwise the value is loaded from memory. Only the first
4285
/// two elements are used.
4286
/// \param s
4287
/// A literal constant scale factor for the indexes in \a i. Must be
4288
/// 1, 2, 4, or 8.
4289
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4290
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_ps call. a and
 * mask are passed as [4 x float] vectors while the index vector is passed as
 * [2 x i64]; each is cast to its public vector type first. m is cast to
 * float const *. The continuation lines must stay contiguous. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
4295
4296
/// Conditionally gathers four 32-bit floating-point values, either from the
4297
/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298
/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299
/// of [4 x float] in \a mask determines the source for each element.
4300
///
4301
/// \code{.operation}
4302
/// FOR element := 0 to 3
4303
/// j := element*32
4304
/// k := element*64
4305
/// IF mask[j+31] == 0
4306
/// result[j+31:j] := a[j+31:j]
4307
/// ELSE
4308
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4309
/// FI
4310
/// ENDFOR
4311
/// \endcode
4312
///
4313
/// \headerfile <immintrin.h>
4314
///
4315
/// \code
4316
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317
/// __m128 mask, const int s);
4318
/// \endcode
4319
///
4320
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4321
///
4322
/// \param a
4323
/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4324
/// zero.
4325
/// \param m
4326
/// A pointer to the memory used for loading values.
4327
/// \param i
4328
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4329
/// \param mask
4330
/// A 128-bit vector of [4 x float] containing the mask. The most
4331
/// significant bit of each element in the mask vector represents the mask
4332
/// bits. If a mask bit is zero, the corresponding value from vector \a a
4333
/// is gathered; otherwise the value is loaded from memory.
4334
/// \param s
4335
/// A literal constant scale factor for the indexes in \a i. Must be
4336
/// 1, 2, 4, or 8.
4337
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4338
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_ps256 call. The
 * index vector is 256 bits wide ([4 x i64]) while a, mask and the result are
 * 128-bit [4 x float] vectors; each argument is cast to its public vector
 * type first. m is cast to float const *. The continuation lines must stay
 * contiguous. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))
4343
4344
/// Conditionally gathers four 32-bit integer values, either from the
4345
/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347
/// of [4 x i32] in \a mask determines the source for each element.
4348
///
4349
/// \code{.operation}
4350
/// FOR element := 0 to 3
4351
/// j := element*32
4352
/// k := element*32
4353
/// IF mask[j+31] == 0
4354
/// result[j+31:j] := a[j+31:j]
4355
/// ELSE
4356
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4357
/// FI
4358
/// ENDFOR
4359
/// \endcode
4360
///
4361
/// \headerfile <immintrin.h>
4362
///
4363
/// \code
4364
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365
/// __m128i mask, const int s);
4366
/// \endcode
4367
///
4368
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4369
///
4370
/// \param a
4371
/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4372
/// zero.
4373
/// \param m
4374
/// A pointer to the memory used for loading values.
4375
/// \param i
4376
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4377
/// \param mask
4378
/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4379
/// bit of each element in the mask vector represents the mask bits. If a
4380
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4381
/// otherwise the value is loaded from memory.
4382
/// \param s
4383
/// A literal constant scale factor for the indexes in \a i. Must be
4384
/// 1, 2, 4, or 8.
4385
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4386
/* Thin wrapper: expands to a single __builtin_ia32_gatherd_d call. The
 * vector arguments a, i and mask are each cast to __m128i and then to the
 * [4 x i32] vector handed to the builtin; m is cast to int const *. The
 * continuation lines must stay contiguous. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
4391
4392
/// Conditionally gathers eight 32-bit integer values, either from the
4393
/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394
/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395
/// of [8 x i32] in \a mask determines the source for each element.
4396
///
4397
/// \code{.operation}
4398
/// FOR element := 0 to 7
4399
/// j := element*32
4400
/// k := element*32
4401
/// IF mask[j+31] == 0
4402
/// result[j+31:j] := a[j+31:j]
4403
/// ELSE
4404
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4405
/// FI
4406
/// ENDFOR
4407
/// \endcode
4408
///
4409
/// \headerfile <immintrin.h>
4410
///
4411
/// \code
4412
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413
/// __m256i mask, const int s);
4414
/// \endcode
4415
///
4416
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4417
///
4418
/// \param a
4419
/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4420
/// zero.
4421
/// \param m
4422
/// A pointer to the memory used for loading values.
4423
/// \param i
4424
/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4425
/// \param mask
4426
/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4427
/// bit of each element in the mask vector represents the mask bits. If a
4428
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4429
/// otherwise the value is loaded from memory.
4430
/// \param s
4431
/// A literal constant scale factor for the indexes in \a i. Must be
4432
/// 1, 2, 4, or 8.
4433
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4434
/* Thin wrapper: expands to a single __builtin_ia32_gatherd_d256 call. The
 * vector arguments a, i and mask are each cast to __m256i and then to the
 * [8 x i32] vector handed to the builtin; m is cast to int const *. The
 * continuation lines must stay contiguous. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))
4439
4440
/// Conditionally gathers two 32-bit integer values, either from the
4441
/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442
/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443
/// of [4 x i32] in \a mask determines the source for the lower two
4444
/// elements. The upper two elements of the result are zeroed.
4445
///
4446
/// \code{.operation}
4447
/// FOR element := 0 to 1
4448
/// j := element*32
4449
/// k := element*64
4450
/// IF mask[j+31] == 0
4451
/// result[j+31:j] := a[j+31:j]
4452
/// ELSE
4453
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4454
/// FI
4455
/// ENDFOR
4456
/// result[127:64] := 0
4457
/// \endcode
4458
///
4459
/// \headerfile <immintrin.h>
4460
///
4461
/// \code
4462
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463
/// __m128i mask, const int s);
4464
/// \endcode
4465
///
4466
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4467
///
4468
/// \param a
4469
/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470
/// zero. Only the first two elements are used.
4471
/// \param m
4472
/// A pointer to the memory used for loading values.
4473
/// \param i
4474
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4475
/// \param mask
4476
/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4477
/// bit of each element in the mask vector represents the mask bits. If a
4478
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4479
/// otherwise the value is loaded from memory. Only the first two elements
4480
/// are used.
4481
/// \param s
4482
/// A literal constant scale factor for the indexes in \a i. Must be
4483
/// 1, 2, 4, or 8.
4484
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4485
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_d call. a and
 * mask are passed as [4 x i32] vectors while the index vector is passed as
 * [2 x i64]; each is cast to __m128i first. m is cast to int const *. The
 * continuation lines must stay contiguous. */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
4490
4491
/// Conditionally gathers four 32-bit integer values, either from the
4492
/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493
/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494
/// of [4 x i32] in \a mask determines the source for each element.
4495
///
4496
/// \code{.operation}
4497
/// FOR element := 0 to 3
4498
/// j := element*32
4499
/// k := element*64
4500
/// IF mask[j+31] == 0
4501
/// result[j+31:j] := a[j+31:j]
4502
/// ELSE
4503
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4504
/// FI
4505
/// ENDFOR
4506
/// \endcode
4507
///
4508
/// \headerfile <immintrin.h>
4509
///
4510
/// \code
4511
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512
/// __m128i mask, const int s);
4513
/// \endcode
4514
///
4515
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4516
///
4517
/// \param a
4518
/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4519
/// zero.
4520
/// \param m
4521
/// A pointer to the memory used for loading values.
4522
/// \param i
4523
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4524
/// \param mask
4525
/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4526
/// bit of each element in the mask vector represents the mask bits. If a
4527
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4528
/// otherwise the value is loaded from memory.
4529
/// \param s
4530
/// A literal constant scale factor for the indexes in \a i. Must be
4531
/// 1, 2, 4, or 8.
4532
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4533
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_d256 call. The
 * index vector is 256 bits wide ([4 x i64]) while a, mask and the result are
 * 128-bit [4 x i32] vectors; each argument is cast to its public vector type
 * first. m is cast to int const *. The continuation lines must stay
 * contiguous. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))
4538
4539
/// Conditionally gathers two 64-bit integer values, either from the
4540
/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542
/// of [2 x i64] in \a mask determines the source for each element.
4543
///
4544
/// \code{.operation}
4545
/// FOR element := 0 to 1
4546
/// j := element*64
4547
/// k := element*32
4548
/// IF mask[j+63] == 0
4549
/// result[j+63:j] := a[j+63:j]
4550
/// ELSE
4551
/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4552
/// FI
4553
/// ENDFOR
4554
/// \endcode
4555
///
4556
/// \headerfile <immintrin.h>
4557
///
4558
/// \code
4559
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560
/// __m128i mask, const int s);
4561
/// \endcode
4562
///
4563
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4564
///
4565
/// \param a
4566
/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4567
/// zero.
4568
/// \param m
4569
/// A pointer to the memory used for loading values.
4570
/// \param i
4571
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572
/// the first two elements are used.
4573
/// \param mask
4574
/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4575
/// bit of each element in the mask vector represents the mask bits. If a
4576
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4577
/// otherwise the value is loaded from memory.
4578
/// \param s
4579
/// A literal constant scale factor for the indexes in \a i. Must be
4580
/// 1, 2, 4, or 8.
4581
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4582
/* Thin wrapper: expands to a single __builtin_ia32_gatherd_q call. a and
 * mask are passed as [2 x i64] vectors while the index vector is passed as
 * [4 x i32]; each is cast to __m128i first. m is cast to long long const *.
 * The continuation lines must stay contiguous. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
4587
4588
/// Conditionally gathers four 64-bit integer values, either from the
4589
/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590
/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591
/// of [4 x i64] in \a mask determines the source for each element.
4592
///
4593
/// \code{.operation}
4594
/// FOR element := 0 to 3
4595
/// j := element*64
4596
/// k := element*32
4597
/// IF mask[j+63] == 0
4598
/// result[j+63:j] := a[j+63:j]
4599
/// ELSE
4600
/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4601
/// FI
4602
/// ENDFOR
4603
/// \endcode
4604
///
4605
/// \headerfile <immintrin.h>
4606
///
4607
/// \code
4608
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609
/// __m128i i, __m256i mask, const int s);
4610
/// \endcode
4611
///
4612
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4613
///
4614
/// \param a
4615
/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4616
/// zero.
4617
/// \param m
4618
/// A pointer to the memory used for loading values.
4619
/// \param i
4620
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4621
/// \param mask
4622
/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4623
/// bit of each element in the mask vector represents the mask bits. If a
4624
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4625
/// otherwise the value is loaded from memory.
4626
/// \param s
4627
/// A literal constant scale factor for the indexes in \a i. Must be
4628
/// 1, 2, 4, or 8.
4629
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4630
/* Thin wrapper: expands to a single __builtin_ia32_gatherd_q256 call. a and
 * mask are 256-bit [4 x i64] vectors while the index vector is a 128-bit
 * [4 x i32] vector; each argument is cast to its public vector type first.
 * m is cast to long long const *. The continuation lines must stay
 * contiguous. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))
4635
4636
/// Conditionally gathers two 64-bit integer values, either from the
4637
/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638
/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4639
/// of [2 x i64] in \a mask determines the source for each element.
4640
///
4641
/// \code{.operation}
4642
/// FOR element := 0 to 1
4643
/// j := element*64
4644
/// k := element*64
4645
/// IF mask[j+63] == 0
4646
/// result[j+63:j] := a[j+63:j]
4647
/// ELSE
4648
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4649
/// FI
4650
/// ENDFOR
4651
/// \endcode
4652
///
4653
/// \headerfile <immintrin.h>
4654
///
4655
/// \code
4656
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4657
/// __m128i mask, const int s);
4658
/// \endcode
4659
///
4660
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4661
///
4662
/// \param a
4663
/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4664
/// zero.
4665
/// \param m
4666
/// A pointer to the memory used for loading values.
4667
/// \param i
4668
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4669
/// \param mask
4670
/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4671
/// bit of each element in the mask vector represents the mask bits. If a
4672
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4673
/// otherwise the value is loaded from memory.
4674
/// \param s
4675
/// A literal constant scale factor for the indexes in \a i. Must be
4676
/// 1, 2, 4, or 8.
4677
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4678
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_q call. The
 * vector arguments a, i and mask are each cast to __m128i and then to the
 * [2 x i64] vector handed to the builtin; m is cast to long long const *.
 * The continuation lines must stay contiguous. */
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
4683
4684
/// Conditionally gathers four 64-bit integer values, either from the
4685
/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686
/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4687
/// of [4 x i64] in \a mask determines the source for each element.
4688
///
4689
/// \code{.operation}
4690
/// FOR element := 0 to 3
4691
/// j := element*64
4692
/// k := element*64
4693
/// IF mask[j+63] == 0
4694
/// result[j+63:j] := a[j+63:j]
4695
/// ELSE
4696
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4697
/// FI
4698
/// ENDFOR
4699
/// \endcode
4700
///
4701
/// \headerfile <immintrin.h>
4702
///
4703
/// \code
4704
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4705
/// __m256i i, __m256i mask, const int s);
4706
/// \endcode
4707
///
4708
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4709
///
4710
/// \param a
4711
/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4712
/// zero.
4713
/// \param m
4714
/// A pointer to the memory used for loading values.
4715
/// \param i
4716
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4717
/// \param mask
4718
/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4719
/// bit of each element in the mask vector represents the mask bits. If a
4720
/// mask bit is zero, the corresponding value from vector \a a is gathered;
4721
/// otherwise the value is loaded from memory.
4722
/// \param s
4723
/// A literal constant scale factor for the indexes in \a i. Must be
4724
/// 1, 2, 4, or 8.
4725
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4726
/* Thin wrapper: expands to a single __builtin_ia32_gatherq_q256 call. The
 * vector arguments a, i and mask are each cast to __m256i and then to the
 * [4 x i64] vector handed to the builtin; m is cast to long long const *.
 * The continuation lines must stay contiguous. */
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))
4731
4732
/// Gathers two 64-bit floating-point values from memory \a m using scaled
4733
/// indexes from the 128-bit vector of [4 x i32] in \a i.
4734
///
4735
/// \code{.operation}
4736
/// FOR element := 0 to 1
4737
/// j := element*64
4738
/// k := element*32
4739
/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4740
/// ENDFOR
4741
/// \endcode
4742
///
4743
/// \headerfile <immintrin.h>
4744
///
4745
/// \code
4746
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4747
/// \endcode
4748
///
4749
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4750
///
4751
/// \param m
4752
/// A pointer to the memory used for loading values.
4753
/// \param i
4754
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4755
/// the first two elements are used.
4756
/// \param s
4757
/// A literal constant scale factor for the indexes in \a i. Must be
4758
/// 1, 2, 4, or 8.
4759
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4760
/* Unmasked gather expressed through the masked builtin
 * __builtin_ia32_gatherd_pd. The mask operand is produced by comparing a zero
 * vector with itself for equality, which holds in every lane, so every
 * element is requested from memory. The pass-through operand should therefore
 * never be selected, which is why _mm_undefined_pd() is used for it. The
 * continuation lines must stay contiguous. */
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))
4767
4768
/// Gathers four 64-bit floating-point values from memory \a m using scaled
4769
/// indexes from the 128-bit vector of [4 x i32] in \a i.
4770
///
4771
/// \code{.operation}
4772
/// FOR element := 0 to 3
4773
/// j := element*64
4774
/// k := element*32
4775
/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4776
/// ENDFOR
4777
/// \endcode
4778
///
4779
/// \headerfile <immintrin.h>
4780
///
4781
/// \code
4782
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4783
/// \endcode
4784
///
4785
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4786
///
4787
/// \param m
4788
/// A pointer to the memory used for loading values.
4789
/// \param i
4790
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4791
/// \param s
4792
/// A literal constant scale factor for the indexes in \a i. Must be
4793
/// 1, 2, 4, or 8.
4794
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4795
/* Unmasked gather expressed through the masked builtin
 * __builtin_ia32_gatherd_pd256. The mask operand is produced by comparing a
 * zero vector with itself using _CMP_EQ_OQ, which holds in every lane, so
 * every element is requested from memory. The pass-through operand should
 * therefore never be selected, which is why _mm256_undefined_pd() is used for
 * it. The continuation lines must stay contiguous. */
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))
4803
4804
/// Gathers two 64-bit floating-point values from memory \a m using scaled
4805
/// indexes from the 128-bit vector of [2 x i64] in \a i.
4806
///
4807
/// \code{.operation}
4808
/// FOR element := 0 to 1
4809
/// j := element*64
4810
/// k := element*64
4811
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4812
/// ENDFOR
4813
/// \endcode
4814
///
4815
/// \headerfile <immintrin.h>
4816
///
4817
/// \code
4818
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4819
/// \endcode
4820
///
4821
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4822
///
4823
/// \param m
4824
/// A pointer to the memory used for loading values.
4825
/// \param i
4826
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4827
/// \param s
4828
/// A literal constant scale factor for the indexes in \a i. Must be
4829
/// 1, 2, 4, or 8.
4830
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4831
/* Unmasked gather expressed through the masked builtin
 * __builtin_ia32_gatherq_pd. The mask operand is produced by comparing a zero
 * vector with itself for equality, which holds in every lane, so every
 * element is requested from memory. The pass-through operand should therefore
 * never be selected, which is why _mm_undefined_pd() is used for it. The
 * continuation lines must stay contiguous. */
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))
4838
4839
/// Gathers four 64-bit floating-point values from memory \a m using scaled
4840
/// indexes from the 256-bit vector of [4 x i64] in \a i.
4841
///
4842
/// \code{.operation}
4843
/// FOR element := 0 to 3
4844
/// j := element*64
4845
/// k := element*64
4846
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4847
/// ENDFOR
4848
/// \endcode
4849
///
4850
/// \headerfile <immintrin.h>
4851
///
4852
/// \code
4853
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4854
/// \endcode
4855
///
4856
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4857
///
4858
/// \param m
4859
/// A pointer to the memory used for loading values.
4860
/// \param i
4861
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4862
/// \param s
4863
/// A literal constant scale factor for the indexes in \a i. Must be
4864
/// 1, 2, 4, or 8.
4865
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4866
/* Unmasked form: the first builtin operand is an undefined [4 x double]
 * vector and the fourth is an ordered-equal (_CMP_EQ_OQ) comparison of zero
 * with zero, so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i64gather_pd(m, i, s) \
4867
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4868
(double const *)(m), \
4869
(__v4di)(__m256i)(i), \
4870
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4871
_mm256_setzero_pd(), \
4872
_CMP_EQ_OQ), \
4873
(s)))
4874
4875
/// Gathers four 32-bit floating-point values from memory \a m using scaled
4876
/// indexes from the 128-bit vector of [4 x i32] in \a i.
4877
///
4878
/// \code{.operation}
4879
/// FOR element := 0 to 3
4880
/// j := element*32
4881
/// k := element*32
4882
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4883
/// ENDFOR
4884
/// \endcode
4885
///
4886
/// \headerfile <immintrin.h>
4887
///
4888
/// \code
4889
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4890
/// \endcode
4891
///
4892
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4893
///
4894
/// \param m
4895
/// A pointer to the memory used for loading values.
4896
/// \param i
4897
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4898
/// \param s
4899
/// A literal constant scale factor for the indexes in \a i. Must be
4900
/// 1, 2, 4, or 8.
4901
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4902
/* Unmasked form: the first builtin operand is an undefined [4 x float]
 * vector and the fourth is the result of comparing zero with zero for
 * equality, so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm_i32gather_ps(m, i, s) \
4903
((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4904
(float const *)(m), \
4905
(__v4si)(__m128i)(i), \
4906
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4907
_mm_setzero_ps()), \
4908
(s)))
4909
4910
/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911
/// indexes from the 256-bit vector of [8 x i32] in \a i.
4912
///
4913
/// \code{.operation}
4914
/// FOR element := 0 to 7
4915
/// j := element*32
4916
/// k := element*32
4917
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4918
/// ENDFOR
4919
/// \endcode
4920
///
4921
/// \headerfile <immintrin.h>
4922
///
4923
/// \code
4924
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4925
/// \endcode
4926
///
4927
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4928
///
4929
/// \param m
4930
/// A pointer to the memory used for loading values.
4931
/// \param i
4932
/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4933
/// \param s
4934
/// A literal constant scale factor for the indexes in \a i. Must be
4935
/// 1, 2, 4, or 8.
4936
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4937
/* Unmasked form: the first builtin operand is an undefined [8 x float]
 * vector and the fourth is an ordered-equal (_CMP_EQ_OQ) comparison of zero
 * with zero, so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i32gather_ps(m, i, s) \
4938
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4939
(float const *)(m), \
4940
(__v8si)(__m256i)(i), \
4941
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4942
_mm256_setzero_ps(), \
4943
_CMP_EQ_OQ), \
4944
(s)))
4945
4946
/// Gathers two 32-bit floating-point values from memory \a m using scaled
4947
/// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4948
/// elements of the result are zeroed.
4949
///
4950
/// \code{.operation}
4951
/// FOR element := 0 to 1
4952
/// j := element*32
4953
/// k := element*64
4954
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4955
/// ENDFOR
4956
/// result[127:64] := 0
4957
/// \endcode
4958
///
4959
/// \headerfile <immintrin.h>
4960
///
4961
/// \code
4962
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4963
/// \endcode
4964
///
4965
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4966
///
4967
/// \param m
4968
/// A pointer to the memory used for loading values.
4969
/// \param i
4970
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4971
/// \param s
4972
/// A literal constant scale factor for the indexes in \a i. Must be
4973
/// 1, 2, 4, or 8.
4974
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4975
/* Unmasked form. The index operand is cast to [2 x i64] while the first
 * (undefined) operand, the zero==zero comparison operand, and the result are
 * all [4 x float] vectors.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm_i64gather_ps(m, i, s) \
4976
((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4977
(float const *)(m), \
4978
(__v2di)(__m128i)(i), \
4979
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4980
_mm_setzero_ps()), \
4981
(s)))
4982
4983
/// Gathers four 32-bit floating-point values from memory \a m using scaled
4984
/// indexes from the 256-bit vector of [4 x i64] in \a i.
4985
///
4986
/// \code{.operation}
4987
/// FOR element := 0 to 3
4988
/// j := element*32
4989
/// k := element*64
4990
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4991
/// ENDFOR
4992
/// \endcode
4993
///
4994
/// \headerfile <immintrin.h>
4995
///
4996
/// \code
4997
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4998
/// \endcode
4999
///
5000
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
5001
///
5002
/// \param m
5003
/// A pointer to the memory used for loading values.
5004
/// \param i
5005
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5006
/// \param s
5007
/// A literal constant scale factor for the indexes in \a i. Must be
5008
/// 1, 2, 4, or 8.
5009
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
5010
/* Unmasked form. The index operand is a 256-bit [4 x i64] vector, but the
 * first (undefined) operand, the zero==zero comparison operand, and the
 * result are 128-bit [4 x float] vectors, hence the _mm_ (not _mm256_)
 * helpers used below.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i64gather_ps(m, i, s) \
5011
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5012
(float const *)(m), \
5013
(__v4di)(__m256i)(i), \
5014
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5015
_mm_setzero_ps()), \
5016
(s)))
5017
5018
/// Gathers four 32-bit integer values from memory \a m using scaled
5019
/// indexes from the 128-bit vector of [4 x i32] in \a i.
5020
///
5021
/// \code{.operation}
5022
/// FOR element := 0 to 3
5023
/// j := element*32
5024
/// k := element*32
5025
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5026
/// ENDFOR
5027
/// \endcode
5028
///
5029
/// \headerfile <immintrin.h>
5030
///
5031
/// \code
5032
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5033
/// \endcode
5034
///
5035
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5036
///
5037
/// \param m
5038
/// A pointer to the memory used for loading values.
5039
/// \param i
5040
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5041
/// \param s
5042
/// A literal constant scale factor for the indexes in \a i. Must be
5043
/// 1, 2, 4, or 8.
5044
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5045
/* Unmasked form: the first builtin operand is an undefined [4 x i32] vector
 * and the fourth is a vector with every 32-bit element set to -1 (all bits
 * set), so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm_i32gather_epi32(m, i, s) \
5046
((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5047
(int const *)(m), (__v4si)(__m128i)(i), \
5048
(__v4si)_mm_set1_epi32(-1), (s)))
5049
5050
/// Gathers eight 32-bit integer values from memory \a m using scaled
5051
/// indexes from the 256-bit vector of [8 x i32] in \a i.
5052
///
5053
/// \code{.operation}
5054
/// FOR element := 0 to 7
5055
/// j := element*32
5056
/// k := element*32
5057
/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5058
/// ENDFOR
5059
/// \endcode
5060
///
5061
/// \headerfile <immintrin.h>
5062
///
5063
/// \code
5064
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5065
/// \endcode
5066
///
5067
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5068
///
5069
/// \param m
5070
/// A pointer to the memory used for loading values.
5071
/// \param i
5072
/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5073
/// \param s
5074
/// A literal constant scale factor for the indexes in \a i. Must be
5075
/// 1, 2, 4, or 8.
5076
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5077
/* Unmasked form: the first builtin operand is an undefined [8 x i32] vector
 * and the fourth is a vector with every 32-bit element set to -1 (all bits
 * set), so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i32gather_epi32(m, i, s) \
5078
((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5079
(int const *)(m), (__v8si)(__m256i)(i), \
5080
(__v8si)_mm256_set1_epi32(-1), (s)))
5081
5082
/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083
/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5084
/// of the result are zeroed.
5085
///
5086
/// \code{.operation}
5087
/// FOR element := 0 to 1
5088
/// j := element*32
5089
/// k := element*64
5090
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5091
/// ENDFOR
5092
/// result[127:64] := 0
5093
/// \endcode
5094
///
5095
/// \headerfile <immintrin.h>
5096
///
5097
/// \code
5098
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5099
/// \endcode
5100
///
5101
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5102
///
5103
/// \param m
5104
/// A pointer to the memory used for loading values.
5105
/// \param i
5106
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5107
/// \param s
5108
/// A literal constant scale factor for the indexes in \a i. Must be
5109
/// 1, 2, 4, or 8.
5110
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5111
/* Unmasked form. The index operand is cast to [2 x i64] while the first
 * (undefined) operand and the all-ones operand are [4 x i32] vectors.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm_i64gather_epi32(m, i, s) \
5112
((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5113
(int const *)(m), (__v2di)(__m128i)(i), \
5114
(__v4si)_mm_set1_epi32(-1), (s)))
5115
5116
/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5117
/// from the 256-bit vector of [4 x i64] in \a i.
5118
///
5119
/// \code{.operation}
5120
/// FOR element := 0 to 3
5121
/// j := element*32
5122
/// k := element*64
5123
/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5124
/// ENDFOR
5125
/// \endcode
5126
///
5127
/// \headerfile <immintrin.h>
5128
///
5129
/// \code
5130
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5131
/// \endcode
5132
///
5133
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5134
///
5135
/// \param m
5136
/// A pointer to the memory used for loading values.
5137
/// \param i
5138
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5139
/// \param s
5140
/// A literal constant scale factor for the indexes in \a i. Must be
5141
/// 1, 2, 4, or 8.
5142
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5143
/* Unmasked form. The index operand is a 256-bit [4 x i64] vector, but the
 * first (undefined) operand, the all-ones operand, and the result are
 * 128-bit [4 x i32] vectors, hence the _mm_ (not _mm256_) helpers below.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i64gather_epi32(m, i, s) \
5144
((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5145
(int const *)(m), (__v4di)(__m256i)(i), \
5146
(__v4si)_mm_set1_epi32(-1), (s)))
5147
5148
/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5149
/// from the 128-bit vector of [4 x i32] in \a i.
5150
///
5151
/// \code{.operation}
5152
/// FOR element := 0 to 1
5153
/// j := element*64
5154
/// k := element*32
5155
/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5156
/// ENDFOR
5157
/// \endcode
5158
///
5159
/// \headerfile <immintrin.h>
5160
///
5161
/// \code
5162
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5163
/// \endcode
5164
///
5165
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5166
///
5167
/// \param m
5168
/// A pointer to the memory used for loading values.
5169
/// \param i
5170
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5171
/// the first two elements are used.
5172
/// \param s
5173
/// A literal constant scale factor for the indexes in \a i. Must be
5174
/// 1, 2, 4, or 8.
5175
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5176
/* Unmasked form. The index operand is cast to [4 x i32] while the first
 * (undefined) operand and the all-ones operand are [2 x i64] vectors.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm_i32gather_epi64(m, i, s) \
5177
((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5178
(long long const *)(m), \
5179
(__v4si)(__m128i)(i), \
5180
(__v2di)_mm_set1_epi64x(-1), (s)))
5181
5182
/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183
/// from the 128-bit vector of [4 x i32] in \a i.
5184
///
5185
/// \code{.operation}
5186
/// FOR element := 0 to 3
5187
/// j := element*64
5188
/// k := element*32
5189
/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5190
/// ENDFOR
5191
/// \endcode
5192
///
5193
/// \headerfile <immintrin.h>
5194
///
5195
/// \code
5196
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5197
/// \endcode
5198
///
5199
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5200
///
5201
/// \param m
5202
/// A pointer to the memory used for loading values.
5203
/// \param i
5204
/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5205
/// \param s
5206
/// A literal constant scale factor for the indexes in \a i. Must be
5207
/// 1, 2, 4, or 8.
5208
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5209
/* Unmasked form. The index operand is a 128-bit [4 x i32] vector, while the
 * first (undefined) operand, the all-ones operand, and the result are
 * 256-bit [4 x i64] vectors.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i32gather_epi64(m, i, s) \
5210
((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5211
(long long const *)(m), \
5212
(__v4si)(__m128i)(i), \
5213
(__v4di)_mm256_set1_epi64x(-1), (s)))
5214
5215
/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5216
/// from the 128-bit vector of [2 x i64] in \a i.
5217
///
5218
/// \code{.operation}
5219
/// FOR element := 0 to 1
5220
/// j := element*64
5221
/// k := element*64
5222
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5223
/// ENDFOR
5224
/// \endcode
5225
///
5226
/// \headerfile <immintrin.h>
5227
///
5228
/// \code
5229
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5230
/// \endcode
5231
///
5232
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5233
///
5234
/// \param m
5235
/// A pointer to the memory used for loading values.
5236
/// \param i
5237
/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5238
/// \param s
5239
/// A literal constant scale factor for the indexes in \a i. Must be
5240
/// 1, 2, 4, or 8.
5241
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5242
/* Unmasked form: the first builtin operand is an undefined [2 x i64] vector
 * and the fourth is a vector with every 64-bit element set to -1 (all bits
 * set), so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm_i64gather_epi64(m, i, s) \
5243
((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5244
(long long const *)(m), \
5245
(__v2di)(__m128i)(i), \
5246
(__v2di)_mm_set1_epi64x(-1), (s)))
5247
5248
/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5249
/// from the 256-bit vector of [4 x i64] in \a i.
5250
///
5251
/// \code{.operation}
5252
/// FOR element := 0 to 3
5253
/// j := element*64
5254
/// k := element*64
5255
/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5256
/// ENDFOR
5257
/// \endcode
5258
///
5259
/// \headerfile <immintrin.h>
5260
///
5261
/// \code
5262
/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5263
/// \endcode
5264
///
5265
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5266
///
5267
/// \param m
5268
/// A pointer to the memory used for loading values.
5269
/// \param i
5270
/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5271
/// \param s
5272
/// A literal constant scale factor for the indexes in \a i. Must be
5273
/// 1, 2, 4, or 8.
5274
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5275
/* Unmasked form: the first builtin operand is an undefined [4 x i64] vector
 * and the fourth is a vector with every 64-bit element set to -1 (all bits
 * set), so every element is expected to be selected.
 * NOTE(review): operand roles (pass-through, base, index, mask, scale) are
 * inferred from argument order -- confirm against the builtin definition. */
#define _mm256_i64gather_epi64(m, i, s) \
5276
((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5277
(long long const *)(m), \
5278
(__v4di)(__m256i)(i), \
5279
(__v4di)_mm256_set1_epi64x(-1), (s)))
5280
5281
#undef __DEFAULT_FN_ATTRS256
5282
#undef __DEFAULT_FN_ATTRS128
5283
5284
#endif /* __AVX2INTRIN_H */
5285
5286