Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
screetsec
GitHub Repository: screetsec/TheFatRat
Path: blob/master/tools/android-sdk/renderscript/clang-include/emmintrin.h
496 views
1
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2
*
3
* Permission is hereby granted, free of charge, to any person obtaining a copy
4
* of this software and associated documentation files (the "Software"), to deal
5
* in the Software without restriction, including without limitation the rights
6
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
* copies of the Software, and to permit persons to whom the Software is
8
* furnished to do so, subject to the following conditions:
9
*
10
* The above copyright notice and this permission notice shall be included in
11
* all copies or substantial portions of the Software.
12
*
13
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
* THE SOFTWARE.
20
*
21
*===-----------------------------------------------------------------------===
22
*/
23
24
#ifndef __EMMINTRIN_H
25
#define __EMMINTRIN_H
26
27
#include <xmmintrin.h>
28
29
typedef double __m128d __attribute__((__vector_size__(16)));
30
typedef long long __m128i __attribute__((__vector_size__(16)));
31
32
/* Type defines. */
33
typedef double __v2df __attribute__ ((__vector_size__ (16)));
34
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
35
typedef short __v8hi __attribute__((__vector_size__(16)));
36
typedef char __v16qi __attribute__((__vector_size__(16)));
37
38
/* Unsigned types */
39
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
40
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
41
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
42
43
/* We need an explicitly signed variant for char. Note that this shouldn't
44
* appear in the interface though. */
45
typedef signed char __v16qs __attribute__((__vector_size__(16)));
46
47
#include <f16cintrin.h>
48
49
/* Define the default attributes for the functions in this file. */
50
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
51
52
static __inline__ __m128d __DEFAULT_FN_ATTRS
53
_mm_add_sd(__m128d __a, __m128d __b)
54
{
55
__a[0] += __b[0];
56
return __a;
57
}
58
59
static __inline__ __m128d __DEFAULT_FN_ATTRS
60
_mm_add_pd(__m128d __a, __m128d __b)
61
{
62
return (__m128d)((__v2df)__a + (__v2df)__b);
63
}
64
65
static __inline__ __m128d __DEFAULT_FN_ATTRS
66
_mm_sub_sd(__m128d __a, __m128d __b)
67
{
68
__a[0] -= __b[0];
69
return __a;
70
}
71
72
static __inline__ __m128d __DEFAULT_FN_ATTRS
73
_mm_sub_pd(__m128d __a, __m128d __b)
74
{
75
return (__m128d)((__v2df)__a - (__v2df)__b);
76
}
77
78
static __inline__ __m128d __DEFAULT_FN_ATTRS
79
_mm_mul_sd(__m128d __a, __m128d __b)
80
{
81
__a[0] *= __b[0];
82
return __a;
83
}
84
85
static __inline__ __m128d __DEFAULT_FN_ATTRS
86
_mm_mul_pd(__m128d __a, __m128d __b)
87
{
88
return (__m128d)((__v2df)__a * (__v2df)__b);
89
}
90
91
static __inline__ __m128d __DEFAULT_FN_ATTRS
92
_mm_div_sd(__m128d __a, __m128d __b)
93
{
94
__a[0] /= __b[0];
95
return __a;
96
}
97
98
static __inline__ __m128d __DEFAULT_FN_ATTRS
99
_mm_div_pd(__m128d __a, __m128d __b)
100
{
101
return (__m128d)((__v2df)__a / (__v2df)__b);
102
}
103
104
static __inline__ __m128d __DEFAULT_FN_ATTRS
105
_mm_sqrt_sd(__m128d __a, __m128d __b)
106
{
107
__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
108
return (__m128d) { __c[0], __a[1] };
109
}
110
111
static __inline__ __m128d __DEFAULT_FN_ATTRS
112
_mm_sqrt_pd(__m128d __a)
113
{
114
return __builtin_ia32_sqrtpd((__v2df)__a);
115
}
116
117
static __inline__ __m128d __DEFAULT_FN_ATTRS
118
_mm_min_sd(__m128d __a, __m128d __b)
119
{
120
return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
121
}
122
123
static __inline__ __m128d __DEFAULT_FN_ATTRS
124
_mm_min_pd(__m128d __a, __m128d __b)
125
{
126
return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
127
}
128
129
static __inline__ __m128d __DEFAULT_FN_ATTRS
130
_mm_max_sd(__m128d __a, __m128d __b)
131
{
132
return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
133
}
134
135
static __inline__ __m128d __DEFAULT_FN_ATTRS
136
_mm_max_pd(__m128d __a, __m128d __b)
137
{
138
return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
139
}
140
141
static __inline__ __m128d __DEFAULT_FN_ATTRS
142
_mm_and_pd(__m128d __a, __m128d __b)
143
{
144
return (__m128d)((__v4su)__a & (__v4su)__b);
145
}
146
147
static __inline__ __m128d __DEFAULT_FN_ATTRS
148
_mm_andnot_pd(__m128d __a, __m128d __b)
149
{
150
return (__m128d)(~(__v4su)__a & (__v4su)__b);
151
}
152
153
static __inline__ __m128d __DEFAULT_FN_ATTRS
154
_mm_or_pd(__m128d __a, __m128d __b)
155
{
156
return (__m128d)((__v4su)__a | (__v4su)__b);
157
}
158
159
static __inline__ __m128d __DEFAULT_FN_ATTRS
160
_mm_xor_pd(__m128d __a, __m128d __b)
161
{
162
return (__m128d)((__v4su)__a ^ (__v4su)__b);
163
}
164
165
static __inline__ __m128d __DEFAULT_FN_ATTRS
166
_mm_cmpeq_pd(__m128d __a, __m128d __b)
167
{
168
return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
169
}
170
171
static __inline__ __m128d __DEFAULT_FN_ATTRS
172
_mm_cmplt_pd(__m128d __a, __m128d __b)
173
{
174
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
175
}
176
177
static __inline__ __m128d __DEFAULT_FN_ATTRS
178
_mm_cmple_pd(__m128d __a, __m128d __b)
179
{
180
return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
181
}
182
183
static __inline__ __m128d __DEFAULT_FN_ATTRS
184
_mm_cmpgt_pd(__m128d __a, __m128d __b)
185
{
186
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
187
}
188
189
static __inline__ __m128d __DEFAULT_FN_ATTRS
190
_mm_cmpge_pd(__m128d __a, __m128d __b)
191
{
192
return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
193
}
194
195
static __inline__ __m128d __DEFAULT_FN_ATTRS
196
_mm_cmpord_pd(__m128d __a, __m128d __b)
197
{
198
return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
199
}
200
201
static __inline__ __m128d __DEFAULT_FN_ATTRS
202
_mm_cmpunord_pd(__m128d __a, __m128d __b)
203
{
204
return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
205
}
206
207
static __inline__ __m128d __DEFAULT_FN_ATTRS
208
_mm_cmpneq_pd(__m128d __a, __m128d __b)
209
{
210
return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
211
}
212
213
static __inline__ __m128d __DEFAULT_FN_ATTRS
214
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
215
{
216
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
217
}
218
219
static __inline__ __m128d __DEFAULT_FN_ATTRS
220
_mm_cmpnle_pd(__m128d __a, __m128d __b)
221
{
222
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
223
}
224
225
static __inline__ __m128d __DEFAULT_FN_ATTRS
226
_mm_cmpngt_pd(__m128d __a, __m128d __b)
227
{
228
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
229
}
230
231
static __inline__ __m128d __DEFAULT_FN_ATTRS
232
_mm_cmpnge_pd(__m128d __a, __m128d __b)
233
{
234
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
235
}
236
237
static __inline__ __m128d __DEFAULT_FN_ATTRS
238
_mm_cmpeq_sd(__m128d __a, __m128d __b)
239
{
240
return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
241
}
242
243
static __inline__ __m128d __DEFAULT_FN_ATTRS
244
_mm_cmplt_sd(__m128d __a, __m128d __b)
245
{
246
return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
247
}
248
249
static __inline__ __m128d __DEFAULT_FN_ATTRS
250
_mm_cmple_sd(__m128d __a, __m128d __b)
251
{
252
return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
253
}
254
255
static __inline__ __m128d __DEFAULT_FN_ATTRS
256
_mm_cmpgt_sd(__m128d __a, __m128d __b)
257
{
258
__m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
259
return (__m128d) { __c[0], __a[1] };
260
}
261
262
static __inline__ __m128d __DEFAULT_FN_ATTRS
263
_mm_cmpge_sd(__m128d __a, __m128d __b)
264
{
265
__m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
266
return (__m128d) { __c[0], __a[1] };
267
}
268
269
static __inline__ __m128d __DEFAULT_FN_ATTRS
270
_mm_cmpord_sd(__m128d __a, __m128d __b)
271
{
272
return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
273
}
274
275
static __inline__ __m128d __DEFAULT_FN_ATTRS
276
_mm_cmpunord_sd(__m128d __a, __m128d __b)
277
{
278
return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
279
}
280
281
static __inline__ __m128d __DEFAULT_FN_ATTRS
282
_mm_cmpneq_sd(__m128d __a, __m128d __b)
283
{
284
return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
285
}
286
287
static __inline__ __m128d __DEFAULT_FN_ATTRS
288
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
289
{
290
return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
291
}
292
293
static __inline__ __m128d __DEFAULT_FN_ATTRS
294
_mm_cmpnle_sd(__m128d __a, __m128d __b)
295
{
296
return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
297
}
298
299
static __inline__ __m128d __DEFAULT_FN_ATTRS
300
_mm_cmpngt_sd(__m128d __a, __m128d __b)
301
{
302
__m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
303
return (__m128d) { __c[0], __a[1] };
304
}
305
306
static __inline__ __m128d __DEFAULT_FN_ATTRS
307
_mm_cmpnge_sd(__m128d __a, __m128d __b)
308
{
309
__m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
310
return (__m128d) { __c[0], __a[1] };
311
}
312
313
static __inline__ int __DEFAULT_FN_ATTRS
314
_mm_comieq_sd(__m128d __a, __m128d __b)
315
{
316
return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
317
}
318
319
static __inline__ int __DEFAULT_FN_ATTRS
320
_mm_comilt_sd(__m128d __a, __m128d __b)
321
{
322
return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
323
}
324
325
static __inline__ int __DEFAULT_FN_ATTRS
326
_mm_comile_sd(__m128d __a, __m128d __b)
327
{
328
return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
329
}
330
331
static __inline__ int __DEFAULT_FN_ATTRS
332
_mm_comigt_sd(__m128d __a, __m128d __b)
333
{
334
return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
335
}
336
337
static __inline__ int __DEFAULT_FN_ATTRS
338
_mm_comige_sd(__m128d __a, __m128d __b)
339
{
340
return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
341
}
342
343
static __inline__ int __DEFAULT_FN_ATTRS
344
_mm_comineq_sd(__m128d __a, __m128d __b)
345
{
346
return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
347
}
348
349
static __inline__ int __DEFAULT_FN_ATTRS
350
_mm_ucomieq_sd(__m128d __a, __m128d __b)
351
{
352
return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
353
}
354
355
static __inline__ int __DEFAULT_FN_ATTRS
356
_mm_ucomilt_sd(__m128d __a, __m128d __b)
357
{
358
return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
359
}
360
361
static __inline__ int __DEFAULT_FN_ATTRS
362
_mm_ucomile_sd(__m128d __a, __m128d __b)
363
{
364
return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
365
}
366
367
static __inline__ int __DEFAULT_FN_ATTRS
368
_mm_ucomigt_sd(__m128d __a, __m128d __b)
369
{
370
return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
371
}
372
373
static __inline__ int __DEFAULT_FN_ATTRS
374
_mm_ucomige_sd(__m128d __a, __m128d __b)
375
{
376
return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
377
}
378
379
static __inline__ int __DEFAULT_FN_ATTRS
380
_mm_ucomineq_sd(__m128d __a, __m128d __b)
381
{
382
return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
383
}
384
385
static __inline__ __m128 __DEFAULT_FN_ATTRS
386
_mm_cvtpd_ps(__m128d __a)
387
{
388
return __builtin_ia32_cvtpd2ps((__v2df)__a);
389
}
390
391
static __inline__ __m128d __DEFAULT_FN_ATTRS
392
_mm_cvtps_pd(__m128 __a)
393
{
394
return (__m128d) __builtin_convertvector(
395
__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
396
}
397
398
static __inline__ __m128d __DEFAULT_FN_ATTRS
399
_mm_cvtepi32_pd(__m128i __a)
400
{
401
return (__m128d) __builtin_convertvector(
402
__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
403
}
404
405
static __inline__ __m128i __DEFAULT_FN_ATTRS
406
_mm_cvtpd_epi32(__m128d __a)
407
{
408
return __builtin_ia32_cvtpd2dq((__v2df)__a);
409
}
410
411
static __inline__ int __DEFAULT_FN_ATTRS
412
_mm_cvtsd_si32(__m128d __a)
413
{
414
return __builtin_ia32_cvtsd2si((__v2df)__a);
415
}
416
417
static __inline__ __m128 __DEFAULT_FN_ATTRS
418
_mm_cvtsd_ss(__m128 __a, __m128d __b)
419
{
420
__a[0] = __b[0];
421
return __a;
422
}
423
424
static __inline__ __m128d __DEFAULT_FN_ATTRS
425
_mm_cvtsi32_sd(__m128d __a, int __b)
426
{
427
__a[0] = __b;
428
return __a;
429
}
430
431
static __inline__ __m128d __DEFAULT_FN_ATTRS
432
_mm_cvtss_sd(__m128d __a, __m128 __b)
433
{
434
__a[0] = __b[0];
435
return __a;
436
}
437
438
static __inline__ __m128i __DEFAULT_FN_ATTRS
439
_mm_cvttpd_epi32(__m128d __a)
440
{
441
return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
442
}
443
444
static __inline__ int __DEFAULT_FN_ATTRS
445
_mm_cvttsd_si32(__m128d __a)
446
{
447
return __a[0];
448
}
449
450
static __inline__ __m64 __DEFAULT_FN_ATTRS
451
_mm_cvtpd_pi32(__m128d __a)
452
{
453
return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
454
}
455
456
static __inline__ __m64 __DEFAULT_FN_ATTRS
457
_mm_cvttpd_pi32(__m128d __a)
458
{
459
return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
460
}
461
462
static __inline__ __m128d __DEFAULT_FN_ATTRS
463
_mm_cvtpi32_pd(__m64 __a)
464
{
465
return __builtin_ia32_cvtpi2pd((__v2si)__a);
466
}
467
468
static __inline__ double __DEFAULT_FN_ATTRS
469
_mm_cvtsd_f64(__m128d __a)
470
{
471
return __a[0];
472
}
473
474
static __inline__ __m128d __DEFAULT_FN_ATTRS
475
_mm_load_pd(double const *__dp)
476
{
477
return *(__m128d*)__dp;
478
}
479
480
static __inline__ __m128d __DEFAULT_FN_ATTRS
481
_mm_load1_pd(double const *__dp)
482
{
483
struct __mm_load1_pd_struct {
484
double __u;
485
} __attribute__((__packed__, __may_alias__));
486
double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
487
return (__m128d){ __u, __u };
488
}
489
490
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
491
492
static __inline__ __m128d __DEFAULT_FN_ATTRS
493
_mm_loadr_pd(double const *__dp)
494
{
495
__m128d __u = *(__m128d*)__dp;
496
return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
497
}
498
499
static __inline__ __m128d __DEFAULT_FN_ATTRS
500
_mm_loadu_pd(double const *__dp)
501
{
502
struct __loadu_pd {
503
__m128d __v;
504
} __attribute__((__packed__, __may_alias__));
505
return ((struct __loadu_pd*)__dp)->__v;
506
}
507
508
static __inline__ __m128i __DEFAULT_FN_ATTRS
509
_mm_loadu_si64(void const *__a)
510
{
511
struct __loadu_si64 {
512
long long __v;
513
} __attribute__((__packed__, __may_alias__));
514
long long __u = ((struct __loadu_si64*)__a)->__v;
515
return (__m128i){__u, 0L};
516
}
517
518
static __inline__ __m128d __DEFAULT_FN_ATTRS
519
_mm_load_sd(double const *__dp)
520
{
521
struct __mm_load_sd_struct {
522
double __u;
523
} __attribute__((__packed__, __may_alias__));
524
double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
525
return (__m128d){ __u, 0 };
526
}
527
528
static __inline__ __m128d __DEFAULT_FN_ATTRS
529
_mm_loadh_pd(__m128d __a, double const *__dp)
530
{
531
struct __mm_loadh_pd_struct {
532
double __u;
533
} __attribute__((__packed__, __may_alias__));
534
double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
535
return (__m128d){ __a[0], __u };
536
}
537
538
static __inline__ __m128d __DEFAULT_FN_ATTRS
539
_mm_loadl_pd(__m128d __a, double const *__dp)
540
{
541
struct __mm_loadl_pd_struct {
542
double __u;
543
} __attribute__((__packed__, __may_alias__));
544
double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
545
return (__m128d){ __u, __a[1] };
546
}
547
548
static __inline__ __m128d __DEFAULT_FN_ATTRS
549
_mm_undefined_pd(void)
550
{
551
return (__m128d)__builtin_ia32_undef128();
552
}
553
554
static __inline__ __m128d __DEFAULT_FN_ATTRS
555
_mm_set_sd(double __w)
556
{
557
return (__m128d){ __w, 0 };
558
}
559
560
static __inline__ __m128d __DEFAULT_FN_ATTRS
561
_mm_set1_pd(double __w)
562
{
563
return (__m128d){ __w, __w };
564
}
565
566
static __inline__ __m128d __DEFAULT_FN_ATTRS
567
_mm_set_pd(double __w, double __x)
568
{
569
return (__m128d){ __x, __w };
570
}
571
572
static __inline__ __m128d __DEFAULT_FN_ATTRS
573
_mm_setr_pd(double __w, double __x)
574
{
575
return (__m128d){ __w, __x };
576
}
577
578
static __inline__ __m128d __DEFAULT_FN_ATTRS
579
_mm_setzero_pd(void)
580
{
581
return (__m128d){ 0, 0 };
582
}
583
584
static __inline__ __m128d __DEFAULT_FN_ATTRS
585
_mm_move_sd(__m128d __a, __m128d __b)
586
{
587
return (__m128d){ __b[0], __a[1] };
588
}
589
590
static __inline__ void __DEFAULT_FN_ATTRS
591
_mm_store_sd(double *__dp, __m128d __a)
592
{
593
struct __mm_store_sd_struct {
594
double __u;
595
} __attribute__((__packed__, __may_alias__));
596
((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
597
}
598
599
static __inline__ void __DEFAULT_FN_ATTRS
600
_mm_store_pd(double *__dp, __m128d __a)
601
{
602
*(__m128d*)__dp = __a;
603
}
604
605
static __inline__ void __DEFAULT_FN_ATTRS
606
_mm_store1_pd(double *__dp, __m128d __a)
607
{
608
__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
609
_mm_store_pd(__dp, __a);
610
}
611
612
static __inline__ void __DEFAULT_FN_ATTRS
613
_mm_store_pd1(double *__dp, __m128d __a)
614
{
615
return _mm_store1_pd(__dp, __a);
616
}
617
618
static __inline__ void __DEFAULT_FN_ATTRS
619
_mm_storeu_pd(double *__dp, __m128d __a)
620
{
621
struct __storeu_pd {
622
__m128d __v;
623
} __attribute__((__packed__, __may_alias__));
624
((struct __storeu_pd*)__dp)->__v = __a;
625
}
626
627
static __inline__ void __DEFAULT_FN_ATTRS
628
_mm_storer_pd(double *__dp, __m128d __a)
629
{
630
__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
631
*(__m128d *)__dp = __a;
632
}
633
634
static __inline__ void __DEFAULT_FN_ATTRS
635
_mm_storeh_pd(double *__dp, __m128d __a)
636
{
637
struct __mm_storeh_pd_struct {
638
double __u;
639
} __attribute__((__packed__, __may_alias__));
640
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
641
}
642
643
static __inline__ void __DEFAULT_FN_ATTRS
644
_mm_storel_pd(double *__dp, __m128d __a)
645
{
646
struct __mm_storeh_pd_struct {
647
double __u;
648
} __attribute__((__packed__, __may_alias__));
649
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
650
}
651
652
static __inline__ __m128i __DEFAULT_FN_ATTRS
653
_mm_add_epi8(__m128i __a, __m128i __b)
654
{
655
return (__m128i)((__v16qu)__a + (__v16qu)__b);
656
}
657
658
static __inline__ __m128i __DEFAULT_FN_ATTRS
659
_mm_add_epi16(__m128i __a, __m128i __b)
660
{
661
return (__m128i)((__v8hu)__a + (__v8hu)__b);
662
}
663
664
static __inline__ __m128i __DEFAULT_FN_ATTRS
665
_mm_add_epi32(__m128i __a, __m128i __b)
666
{
667
return (__m128i)((__v4su)__a + (__v4su)__b);
668
}
669
670
static __inline__ __m64 __DEFAULT_FN_ATTRS
671
_mm_add_si64(__m64 __a, __m64 __b)
672
{
673
return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
674
}
675
676
static __inline__ __m128i __DEFAULT_FN_ATTRS
677
_mm_add_epi64(__m128i __a, __m128i __b)
678
{
679
return (__m128i)((__v2du)__a + (__v2du)__b);
680
}
681
682
static __inline__ __m128i __DEFAULT_FN_ATTRS
683
_mm_adds_epi8(__m128i __a, __m128i __b)
684
{
685
return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
686
}
687
688
static __inline__ __m128i __DEFAULT_FN_ATTRS
689
_mm_adds_epi16(__m128i __a, __m128i __b)
690
{
691
return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
692
}
693
694
static __inline__ __m128i __DEFAULT_FN_ATTRS
695
_mm_adds_epu8(__m128i __a, __m128i __b)
696
{
697
return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
698
}
699
700
static __inline__ __m128i __DEFAULT_FN_ATTRS
701
_mm_adds_epu16(__m128i __a, __m128i __b)
702
{
703
return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
704
}
705
706
static __inline__ __m128i __DEFAULT_FN_ATTRS
707
_mm_avg_epu8(__m128i __a, __m128i __b)
708
{
709
return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
710
}
711
712
static __inline__ __m128i __DEFAULT_FN_ATTRS
713
_mm_avg_epu16(__m128i __a, __m128i __b)
714
{
715
return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
716
}
717
718
static __inline__ __m128i __DEFAULT_FN_ATTRS
719
_mm_madd_epi16(__m128i __a, __m128i __b)
720
{
721
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
722
}
723
724
static __inline__ __m128i __DEFAULT_FN_ATTRS
725
_mm_max_epi16(__m128i __a, __m128i __b)
726
{
727
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
728
}
729
730
static __inline__ __m128i __DEFAULT_FN_ATTRS
731
_mm_max_epu8(__m128i __a, __m128i __b)
732
{
733
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
734
}
735
736
static __inline__ __m128i __DEFAULT_FN_ATTRS
737
_mm_min_epi16(__m128i __a, __m128i __b)
738
{
739
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
740
}
741
742
static __inline__ __m128i __DEFAULT_FN_ATTRS
743
_mm_min_epu8(__m128i __a, __m128i __b)
744
{
745
return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
746
}
747
748
static __inline__ __m128i __DEFAULT_FN_ATTRS
749
_mm_mulhi_epi16(__m128i __a, __m128i __b)
750
{
751
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
752
}
753
754
static __inline__ __m128i __DEFAULT_FN_ATTRS
755
_mm_mulhi_epu16(__m128i __a, __m128i __b)
756
{
757
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
758
}
759
760
/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
761
/// returns a vector containing the low-order 16 bits of each 32-bit product
762
/// in the corresponding element.
763
///
764
/// \headerfile <x86intrin.h>
765
///
766
/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
767
///
768
/// \param __a
769
/// A 128-bit integer vector containing one of the source operands.
770
/// \param __b
771
/// A 128-bit integer vector containing one of the source operands.
772
/// \returns A 128-bit integer vector containing the products of both operands.
773
static __inline__ __m128i __DEFAULT_FN_ATTRS
774
_mm_mullo_epi16(__m128i __a, __m128i __b)
775
{
776
return (__m128i)((__v8hu)__a * (__v8hu)__b);
777
}
778
779
/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
780
/// of the two 64-bit integer vectors and returns the 64-bit unsigned
781
/// product.
782
///
783
/// \headerfile <x86intrin.h>
784
///
785
/// This intrinsic corresponds to the \c PMULUDQ instruction.
786
///
787
/// \param __a
788
/// A 64-bit integer containing one of the source operands.
789
/// \param __b
790
/// A 64-bit integer containing one of the source operands.
791
/// \returns A 64-bit integer vector containing the product of both operands.
792
static __inline__ __m64 __DEFAULT_FN_ATTRS
793
_mm_mul_su32(__m64 __a, __m64 __b)
794
{
795
return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
796
}
797
798
/// \brief Multiplies 32-bit unsigned integer values contained in the lower
799
/// bits of the corresponding elements of two [2 x i64] vectors, and returns
800
/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
801
///
802
/// \headerfile <x86intrin.h>
803
///
804
/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
805
///
806
/// \param __a
807
/// A [2 x i64] vector containing one of the source operands.
808
/// \param __b
809
/// A [2 x i64] vector containing one of the source operands.
810
/// \returns A [2 x i64] vector containing the product of both operands.
811
static __inline__ __m128i __DEFAULT_FN_ATTRS
812
_mm_mul_epu32(__m128i __a, __m128i __b)
813
{
814
return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
815
}
816
817
/// \brief Computes the absolute differences of corresponding 8-bit integer
818
/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
819
/// separately sums the second 8 absolute differences. Packss these two
820
/// unsigned 16-bit integer sums into the upper and lower elements of a
821
/// [2 x i64] vector.
822
///
823
/// \headerfile <x86intrin.h>
824
///
825
/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
826
///
827
/// \param __a
828
/// A 128-bit integer vector containing one of the source operands.
829
/// \param __b
830
/// A 128-bit integer vector containing one of the source operands.
831
/// \returns A [2 x i64] vector containing the sums of the sets of absolute
832
/// differences between both operands.
833
static __inline__ __m128i __DEFAULT_FN_ATTRS
834
_mm_sad_epu8(__m128i __a, __m128i __b)
835
{
836
return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
837
}
838
839
/// \brief Subtracts the corresponding 8-bit integer values in the operands.
840
///
841
/// \headerfile <x86intrin.h>
842
///
843
/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
844
///
845
/// \param __a
846
/// A 128-bit integer vector containing the minuends.
847
/// \param __b
848
/// A 128-bit integer vector containing the subtrahends.
849
/// \returns A 128-bit integer vector containing the differences of the values
850
/// in the operands.
851
static __inline__ __m128i __DEFAULT_FN_ATTRS
852
_mm_sub_epi8(__m128i __a, __m128i __b)
853
{
854
return (__m128i)((__v16qu)__a - (__v16qu)__b);
855
}
856
857
/// \brief Subtracts the corresponding 16-bit integer values in the operands.
858
///
859
/// \headerfile <x86intrin.h>
860
///
861
/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
862
///
863
/// \param __a
864
/// A 128-bit integer vector containing the minuends.
865
/// \param __b
866
/// A 128-bit integer vector containing the subtrahends.
867
/// \returns A 128-bit integer vector containing the differences of the values
868
/// in the operands.
869
static __inline__ __m128i __DEFAULT_FN_ATTRS
870
_mm_sub_epi16(__m128i __a, __m128i __b)
871
{
872
return (__m128i)((__v8hu)__a - (__v8hu)__b);
873
}
874
875
/// \brief Subtracts the corresponding 32-bit integer values in the operands.
876
///
877
/// \headerfile <x86intrin.h>
878
///
879
/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
880
///
881
/// \param __a
882
/// A 128-bit integer vector containing the minuends.
883
/// \param __b
884
/// A 128-bit integer vector containing the subtrahends.
885
/// \returns A 128-bit integer vector containing the differences of the values
886
/// in the operands.
887
static __inline__ __m128i __DEFAULT_FN_ATTRS
888
_mm_sub_epi32(__m128i __a, __m128i __b)
889
{
890
return (__m128i)((__v4su)__a - (__v4su)__b);
891
}
892
893
/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
894
/// difference to the corresponding bits in the destination.
895
///
896
/// \headerfile <x86intrin.h>
897
///
898
/// This intrinsic corresponds to the \c PSUBQ instruction.
899
///
900
/// \param __a
901
/// A 64-bit integer vector containing the minuend.
902
/// \param __b
903
/// A 64-bit integer vector containing the subtrahend.
904
/// \returns A 64-bit integer vector containing the difference of the values in
905
/// the operands.
906
static __inline__ __m64 __DEFAULT_FN_ATTRS
907
_mm_sub_si64(__m64 __a, __m64 __b)
908
{
909
return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
910
}
911
912
/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
913
///
914
/// \headerfile <x86intrin.h>
915
///
916
/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
917
///
918
/// \param __a
919
/// A 128-bit integer vector containing the minuends.
920
/// \param __b
921
/// A 128-bit integer vector containing the subtrahends.
922
/// \returns A 128-bit integer vector containing the differences of the values
923
/// in the operands.
924
static __inline__ __m128i __DEFAULT_FN_ATTRS
925
_mm_sub_epi64(__m128i __a, __m128i __b)
926
{
927
return (__m128i)((__v2du)__a - (__v2du)__b);
928
}
929
930
/// \brief Subtracts corresponding 8-bit signed integer values in the input and
931
/// returns the differences in the corresponding bytes in the destination.
932
/// Differences greater than 7Fh are saturated to 7Fh, and differences less
933
/// than 80h are saturated to 80h.
934
///
935
/// \headerfile <x86intrin.h>
936
///
937
/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
938
///
939
/// \param __a
940
/// A 128-bit integer vector containing the minuends.
941
/// \param __b
942
/// A 128-bit integer vector containing the subtrahends.
943
/// \returns A 128-bit integer vector containing the differences of the values
944
/// in the operands.
945
static __inline__ __m128i __DEFAULT_FN_ATTRS
946
_mm_subs_epi8(__m128i __a, __m128i __b)
947
{
948
return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
949
}
950
951
/// \brief Subtracts corresponding 16-bit signed integer values in the input and
952
/// returns the differences in the corresponding bytes in the destination.
953
/// Differences greater than 7FFFh are saturated to 7FFFh, and values less
954
/// than 8000h are saturated to 8000h.
955
///
956
/// \headerfile <x86intrin.h>
957
///
958
/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
959
///
960
/// \param __a
961
/// A 128-bit integer vector containing the minuends.
962
/// \param __b
963
/// A 128-bit integer vector containing the subtrahends.
964
/// \returns A 128-bit integer vector containing the differences of the values
965
/// in the operands.
966
static __inline__ __m128i __DEFAULT_FN_ATTRS
967
_mm_subs_epi16(__m128i __a, __m128i __b)
968
{
969
return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
970
}
971
972
/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
973
/// and returns the differences in the corresponding bytes in the
974
/// destination. Differences less than 00h are saturated to 00h.
975
///
976
/// \headerfile <x86intrin.h>
977
///
978
/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
979
///
980
/// \param __a
981
/// A 128-bit integer vector containing the minuends.
982
/// \param __b
983
/// A 128-bit integer vector containing the subtrahends.
984
/// \returns A 128-bit integer vector containing the unsigned integer
985
/// differences of the values in the operands.
986
static __inline__ __m128i __DEFAULT_FN_ATTRS
987
_mm_subs_epu8(__m128i __a, __m128i __b)
988
{
989
return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
990
}
991
992
/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
993
/// and returns the differences in the corresponding bytes in the
994
/// destination. Differences less than 0000h are saturated to 0000h.
995
///
996
/// \headerfile <x86intrin.h>
997
///
998
/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
999
///
1000
/// \param __a
1001
/// A 128-bit integer vector containing the minuends.
1002
/// \param __b
1003
/// A 128-bit integer vector containing the subtrahends.
1004
/// \returns A 128-bit integer vector containing the unsigned integer
1005
/// differences of the values in the operands.
1006
static __inline__ __m128i __DEFAULT_FN_ATTRS
1007
_mm_subs_epu16(__m128i __a, __m128i __b)
1008
{
1009
return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
1010
}
1011
1012
/// \brief Performs a bitwise AND of two 128-bit integer vectors.
1013
///
1014
/// \headerfile <x86intrin.h>
1015
///
1016
/// This intrinsic corresponds to the \c VPAND / PAND instruction.
1017
///
1018
/// \param __a
1019
/// A 128-bit integer vector containing one of the source operands.
1020
/// \param __b
1021
/// A 128-bit integer vector containing one of the source operands.
1022
/// \returns A 128-bit integer vector containing the bitwise AND of the values
1023
/// in both operands.
1024
static __inline__ __m128i __DEFAULT_FN_ATTRS
1025
_mm_and_si128(__m128i __a, __m128i __b)
1026
{
1027
return (__m128i)((__v2du)__a & (__v2du)__b);
1028
}
1029
1030
/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
1031
/// one's complement of the values contained in the first source operand.
1032
///
1033
/// \headerfile <x86intrin.h>
1034
///
1035
/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
1036
///
1037
/// \param __a
1038
/// A 128-bit vector containing the left source operand. The one's complement
1039
/// of this value is used in the bitwise AND.
1040
/// \param __b
1041
/// A 128-bit vector containing the right source operand.
1042
/// \returns A 128-bit integer vector containing the bitwise AND of the one's
1043
/// complement of the first operand and the values in the second operand.
1044
static __inline__ __m128i __DEFAULT_FN_ATTRS
1045
_mm_andnot_si128(__m128i __a, __m128i __b)
1046
{
1047
return (__m128i)(~(__v2du)__a & (__v2du)__b);
1048
}
1049
/// \brief Performs a bitwise OR of two 128-bit integer vectors.
1050
///
1051
/// \headerfile <x86intrin.h>
1052
///
1053
/// This intrinsic corresponds to the \c VPOR / POR instruction.
1054
///
1055
/// \param __a
1056
/// A 128-bit integer vector containing one of the source operands.
1057
/// \param __b
1058
/// A 128-bit integer vector containing one of the source operands.
1059
/// \returns A 128-bit integer vector containing the bitwise OR of the values
1060
/// in both operands.
1061
static __inline__ __m128i __DEFAULT_FN_ATTRS
1062
_mm_or_si128(__m128i __a, __m128i __b)
1063
{
1064
return (__m128i)((__v2du)__a | (__v2du)__b);
1065
}
1066
1067
/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
1068
///
1069
/// \headerfile <x86intrin.h>
1070
///
1071
/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
1072
///
1073
/// \param __a
1074
/// A 128-bit integer vector containing one of the source operands.
1075
/// \param __b
1076
/// A 128-bit integer vector containing one of the source operands.
1077
/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
1078
/// values in both operands.
1079
static __inline__ __m128i __DEFAULT_FN_ATTRS
1080
_mm_xor_si128(__m128i __a, __m128i __b)
1081
{
1082
return (__m128i)((__v2du)__a ^ (__v2du)__b);
1083
}
1084
1085
/// \brief Left-shifts the 128-bit integer vector operand by the specified
1086
/// number of bytes. Low-order bits are cleared.
1087
///
1088
/// \headerfile <x86intrin.h>
1089
///
1090
/// \code
1091
/// __m128i _mm_slli_si128(__m128i a, const int imm);
1092
/// \endcode
1093
///
1094
/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
1095
///
1096
/// \param a
1097
/// A 128-bit integer vector containing the source operand.
1098
/// \param imm
1099
/// An immediate value specifying the number of bytes to left-shift
1100
/// operand a.
1101
/// \returns A 128-bit integer vector containing the left-shifted value.
1102
#define _mm_slli_si128(a, imm) __extension__ ({ \
1103
(__m128i)__builtin_shufflevector( \
1104
(__v16qi)_mm_setzero_si128(), \
1105
(__v16qi)(__m128i)(a), \
1106
((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \
1107
((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \
1108
((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \
1109
((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \
1110
((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \
1111
((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \
1112
((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \
1113
((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \
1114
((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \
1115
((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \
1116
((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
1117
((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
1118
((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
1119
((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
1120
((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
1121
((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
1122
1123
#define _mm_bslli_si128(a, imm) \
1124
_mm_slli_si128((a), (imm))
1125
1126
/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1127
/// by the specified number of bits. Low-order bits are cleared.
1128
///
1129
/// \headerfile <x86intrin.h>
1130
///
1131
/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1132
///
1133
/// \param __a
1134
/// A 128-bit integer vector containing the source operand.
1135
/// \param __count
1136
/// An integer value specifying the number of bits to left-shift each value
1137
/// in operand __a.
1138
/// \returns A 128-bit integer vector containing the left-shifted values.
1139
static __inline__ __m128i __DEFAULT_FN_ATTRS
1140
_mm_slli_epi16(__m128i __a, int __count)
1141
{
1142
return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
1143
}
1144
1145
/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1146
/// by the specified number of bits. Low-order bits are cleared.
1147
///
1148
/// \headerfile <x86intrin.h>
1149
///
1150
/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1151
///
1152
/// \param __a
1153
/// A 128-bit integer vector containing the source operand.
1154
/// \param __count
1155
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1156
/// to left-shift each value in operand __a.
1157
/// \returns A 128-bit integer vector containing the left-shifted values.
1158
static __inline__ __m128i __DEFAULT_FN_ATTRS
1159
_mm_sll_epi16(__m128i __a, __m128i __count)
1160
{
1161
return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
1162
}
1163
1164
/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1165
/// by the specified number of bits. Low-order bits are cleared.
1166
///
1167
/// \headerfile <x86intrin.h>
1168
///
1169
/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1170
///
1171
/// \param __a
1172
/// A 128-bit integer vector containing the source operand.
1173
/// \param __count
1174
/// An integer value specifying the number of bits to left-shift each value
1175
/// in operand __a.
1176
/// \returns A 128-bit integer vector containing the left-shifted values.
1177
static __inline__ __m128i __DEFAULT_FN_ATTRS
1178
_mm_slli_epi32(__m128i __a, int __count)
1179
{
1180
return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
1181
}
1182
1183
/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1184
/// by the specified number of bits. Low-order bits are cleared.
1185
///
1186
/// \headerfile <x86intrin.h>
1187
///
1188
/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1189
///
1190
/// \param __a
1191
/// A 128-bit integer vector containing the source operand.
1192
/// \param __count
1193
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1194
/// to left-shift each value in operand __a.
1195
/// \returns A 128-bit integer vector containing the left-shifted values.
1196
static __inline__ __m128i __DEFAULT_FN_ATTRS
1197
_mm_sll_epi32(__m128i __a, __m128i __count)
1198
{
1199
return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
1200
}
1201
1202
/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1203
/// by the specified number of bits. Low-order bits are cleared.
1204
///
1205
/// \headerfile <x86intrin.h>
1206
///
1207
/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1208
///
1209
/// \param __a
1210
/// A 128-bit integer vector containing the source operand.
1211
/// \param __count
1212
/// An integer value specifying the number of bits to left-shift each value
1213
/// in operand __a.
1214
/// \returns A 128-bit integer vector containing the left-shifted values.
1215
static __inline__ __m128i __DEFAULT_FN_ATTRS
1216
_mm_slli_epi64(__m128i __a, int __count)
1217
{
1218
return __builtin_ia32_psllqi128((__v2di)__a, __count);
1219
}
1220
1221
/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1222
/// by the specified number of bits. Low-order bits are cleared.
1223
///
1224
/// \headerfile <x86intrin.h>
1225
///
1226
/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1227
///
1228
/// \param __a
1229
/// A 128-bit integer vector containing the source operand.
1230
/// \param __count
1231
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1232
/// to left-shift each value in operand __a.
1233
/// \returns A 128-bit integer vector containing the left-shifted values.
1234
static __inline__ __m128i __DEFAULT_FN_ATTRS
1235
_mm_sll_epi64(__m128i __a, __m128i __count)
1236
{
1237
return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
1238
}
1239
1240
/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1241
/// by the specified number of bits. High-order bits are filled with the sign
1242
/// bit of the initial value.
1243
///
1244
/// \headerfile <x86intrin.h>
1245
///
1246
/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1247
///
1248
/// \param __a
1249
/// A 128-bit integer vector containing the source operand.
1250
/// \param __count
1251
/// An integer value specifying the number of bits to right-shift each value
1252
/// in operand __a.
1253
/// \returns A 128-bit integer vector containing the right-shifted values.
1254
static __inline__ __m128i __DEFAULT_FN_ATTRS
1255
_mm_srai_epi16(__m128i __a, int __count)
1256
{
1257
return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
1258
}
1259
1260
/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1261
/// by the specified number of bits. High-order bits are filled with the sign
1262
/// bit of the initial value.
1263
///
1264
/// \headerfile <x86intrin.h>
1265
///
1266
/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1267
///
1268
/// \param __a
1269
/// A 128-bit integer vector containing the source operand.
1270
/// \param __count
1271
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1272
/// to right-shift each value in operand __a.
1273
/// \returns A 128-bit integer vector containing the right-shifted values.
1274
static __inline__ __m128i __DEFAULT_FN_ATTRS
1275
_mm_sra_epi16(__m128i __a, __m128i __count)
1276
{
1277
return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
1278
}
1279
1280
/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1281
/// by the specified number of bits. High-order bits are filled with the sign
1282
/// bit of the initial value.
1283
///
1284
/// \headerfile <x86intrin.h>
1285
///
1286
/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1287
///
1288
/// \param __a
1289
/// A 128-bit integer vector containing the source operand.
1290
/// \param __count
1291
/// An integer value specifying the number of bits to right-shift each value
1292
/// in operand __a.
1293
/// \returns A 128-bit integer vector containing the right-shifted values.
1294
static __inline__ __m128i __DEFAULT_FN_ATTRS
1295
_mm_srai_epi32(__m128i __a, int __count)
1296
{
1297
return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
1298
}
1299
1300
/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1301
/// by the specified number of bits. High-order bits are filled with the sign
1302
/// bit of the initial value.
1303
///
1304
/// \headerfile <x86intrin.h>
1305
///
1306
/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1307
///
1308
/// \param __a
1309
/// A 128-bit integer vector containing the source operand.
1310
/// \param __count
1311
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1312
/// to right-shift each value in operand __a.
1313
/// \returns A 128-bit integer vector containing the right-shifted values.
1314
static __inline__ __m128i __DEFAULT_FN_ATTRS
1315
_mm_sra_epi32(__m128i __a, __m128i __count)
1316
{
1317
return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
1318
}
1319
1320
/// \brief Right-shifts the 128-bit integer vector operand by the specified
1321
/// number of bytes. High-order bits are cleared.
1322
///
1323
/// \headerfile <x86intrin.h>
1324
///
1325
/// \code
1326
/// __m128i _mm_srli_si128(__m128i a, const int imm);
1327
/// \endcode
1328
///
1329
/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
1330
///
1331
/// \param a
1332
/// A 128-bit integer vector containing the source operand.
1333
/// \param imm
1334
/// An immediate value specifying the number of bytes to right-shift operand
1335
/// a.
1336
/// \returns A 128-bit integer vector containing the right-shifted value.
1337
#define _mm_srli_si128(a, imm) __extension__ ({ \
1338
(__m128i)__builtin_shufflevector( \
1339
(__v16qi)(__m128i)(a), \
1340
(__v16qi)_mm_setzero_si128(), \
1341
((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \
1342
((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \
1343
((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \
1344
((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \
1345
((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \
1346
((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \
1347
((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \
1348
((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \
1349
((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \
1350
((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \
1351
((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
1352
((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
1353
((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
1354
((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
1355
((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
1356
((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
1357
1358
#define _mm_bsrli_si128(a, imm) \
1359
_mm_srli_si128((a), (imm))
1360
1361
/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
1362
/// operand by the specified number of bits. High-order bits are cleared.
1363
///
1364
/// \headerfile <x86intrin.h>
1365
///
1366
/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1367
///
1368
/// \param __a
1369
/// A 128-bit integer vector containing the source operand.
1370
/// \param __count
1371
/// An integer value specifying the number of bits to right-shift each value
1372
/// in operand __a.
1373
/// \returns A 128-bit integer vector containing the right-shifted values.
1374
static __inline__ __m128i __DEFAULT_FN_ATTRS
1375
_mm_srli_epi16(__m128i __a, int __count)
1376
{
1377
return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
1378
}
1379
1380
/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
1381
/// operand by the specified number of bits. High-order bits are cleared.
1382
///
1383
/// \headerfile <x86intrin.h>
1384
///
1385
/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1386
///
1387
/// \param __a
1388
/// A 128-bit integer vector containing the source operand.
1389
/// \param __count
1390
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1391
/// to right-shift each value in operand __a.
1392
/// \returns A 128-bit integer vector containing the right-shifted values.
1393
static __inline__ __m128i __DEFAULT_FN_ATTRS
1394
_mm_srl_epi16(__m128i __a, __m128i __count)
1395
{
1396
return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
1397
}
1398
1399
/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
1400
/// operand by the specified number of bits. High-order bits are cleared.
1401
///
1402
/// \headerfile <x86intrin.h>
1403
///
1404
/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1405
///
1406
/// \param __a
1407
/// A 128-bit integer vector containing the source operand.
1408
/// \param __count
1409
/// An integer value specifying the number of bits to right-shift each value
1410
/// in operand __a.
1411
/// \returns A 128-bit integer vector containing the right-shifted values.
1412
static __inline__ __m128i __DEFAULT_FN_ATTRS
1413
_mm_srli_epi32(__m128i __a, int __count)
1414
{
1415
return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
1416
}
1417
1418
/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
1419
/// operand by the specified number of bits. High-order bits are cleared.
1420
///
1421
/// \headerfile <x86intrin.h>
1422
///
1423
/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1424
///
1425
/// \param __a
1426
/// A 128-bit integer vector containing the source operand.
1427
/// \param __count
1428
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1429
/// to right-shift each value in operand __a.
1430
/// \returns A 128-bit integer vector containing the right-shifted values.
1431
static __inline__ __m128i __DEFAULT_FN_ATTRS
1432
_mm_srl_epi32(__m128i __a, __m128i __count)
1433
{
1434
return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
1435
}
1436
1437
/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
1438
/// operand by the specified number of bits. High-order bits are cleared.
1439
///
1440
/// \headerfile <x86intrin.h>
1441
///
1442
/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1443
///
1444
/// \param __a
1445
/// A 128-bit integer vector containing the source operand.
1446
/// \param __count
1447
/// An integer value specifying the number of bits to right-shift each value
1448
/// in operand __a.
1449
/// \returns A 128-bit integer vector containing the right-shifted values.
1450
static __inline__ __m128i __DEFAULT_FN_ATTRS
1451
_mm_srli_epi64(__m128i __a, int __count)
1452
{
1453
return __builtin_ia32_psrlqi128((__v2di)__a, __count);
1454
}
1455
1456
/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
1457
/// operand by the specified number of bits. High-order bits are cleared.
1458
///
1459
/// \headerfile <x86intrin.h>
1460
///
1461
/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1462
///
1463
/// \param __a
1464
/// A 128-bit integer vector containing the source operand.
1465
/// \param __count
1466
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1467
/// to right-shift each value in operand __a.
1468
/// \returns A 128-bit integer vector containing the right-shifted values.
1469
static __inline__ __m128i __DEFAULT_FN_ATTRS
1470
_mm_srl_epi64(__m128i __a, __m128i __count)
1471
{
1472
return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
1473
}
1474
1475
/// \brief Compares each of the corresponding 8-bit values of the 128-bit
1476
/// integer vectors for equality. Each comparison yields 0h for false, FFh
1477
/// for true.
1478
///
1479
/// \headerfile <x86intrin.h>
1480
///
1481
/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
1482
///
1483
/// \param __a
1484
/// A 128-bit integer vector.
1485
/// \param __b
1486
/// A 128-bit integer vector.
1487
/// \returns A 128-bit integer vector containing the comparison results.
1488
static __inline__ __m128i __DEFAULT_FN_ATTRS
1489
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
1490
{
1491
return (__m128i)((__v16qi)__a == (__v16qi)__b);
1492
}
1493
1494
/// \brief Compares each of the corresponding 16-bit values of the 128-bit
1495
/// integer vectors for equality. Each comparison yields 0h for false, FFFFh
1496
/// for true.
1497
///
1498
/// \headerfile <x86intrin.h>
1499
///
1500
/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
1501
///
1502
/// \param __a
1503
/// A 128-bit integer vector.
1504
/// \param __b
1505
/// A 128-bit integer vector.
1506
/// \returns A 128-bit integer vector containing the comparison results.
1507
static __inline__ __m128i __DEFAULT_FN_ATTRS
1508
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
1509
{
1510
return (__m128i)((__v8hi)__a == (__v8hi)__b);
1511
}
1512
1513
/// \brief Compares each of the corresponding 32-bit values of the 128-bit
1514
/// integer vectors for equality. Each comparison yields 0h for false,
1515
/// FFFFFFFFh for true.
1516
///
1517
/// \headerfile <x86intrin.h>
1518
///
1519
/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
1520
///
1521
/// \param __a
1522
/// A 128-bit integer vector.
1523
/// \param __b
1524
/// A 128-bit integer vector.
1525
/// \returns A 128-bit integer vector containing the comparison results.
1526
static __inline__ __m128i __DEFAULT_FN_ATTRS
1527
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
1528
{
1529
return (__m128i)((__v4si)__a == (__v4si)__b);
1530
}
1531
1532
/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1533
/// integer vectors to determine if the values in the first operand are
1534
/// greater than those in the second operand. Each comparison yields 0h for
1535
/// false, FFh for true.
1536
///
1537
/// \headerfile <x86intrin.h>
1538
///
1539
/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1540
///
1541
/// \param __a
1542
/// A 128-bit integer vector.
1543
/// \param __b
1544
/// A 128-bit integer vector.
1545
/// \returns A 128-bit integer vector containing the comparison results.
1546
static __inline__ __m128i __DEFAULT_FN_ATTRS
1547
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
1548
{
1549
/* This function always performs a signed comparison, but __v16qi is a char
1550
which may be signed or unsigned, so use __v16qs. */
1551
return (__m128i)((__v16qs)__a > (__v16qs)__b);
1552
}
1553
1554
/// \brief Compares each of the corresponding signed 16-bit values of the
1555
/// 128-bit integer vectors to determine if the values in the first operand
1556
/// are greater than those in the second operand. Each comparison yields 0h
1557
/// for false, FFFFh for true.
1558
///
1559
/// \headerfile <x86intrin.h>
1560
///
1561
/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1562
///
1563
/// \param __a
1564
/// A 128-bit integer vector.
1565
/// \param __b
1566
/// A 128-bit integer vector.
1567
/// \returns A 128-bit integer vector containing the comparison results.
1568
static __inline__ __m128i __DEFAULT_FN_ATTRS
1569
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
1570
{
1571
return (__m128i)((__v8hi)__a > (__v8hi)__b);
1572
}
1573
1574
/// \brief Compares each of the corresponding signed 32-bit values of the
1575
/// 128-bit integer vectors to determine if the values in the first operand
1576
/// are greater than those in the second operand. Each comparison yields 0h
1577
/// for false, FFFFFFFFh for true.
1578
///
1579
/// \headerfile <x86intrin.h>
1580
///
1581
/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1582
///
1583
/// \param __a
1584
/// A 128-bit integer vector.
1585
/// \param __b
1586
/// A 128-bit integer vector.
1587
/// \returns A 128-bit integer vector containing the comparison results.
1588
static __inline__ __m128i __DEFAULT_FN_ATTRS
1589
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
1590
{
1591
return (__m128i)((__v4si)__a > (__v4si)__b);
1592
}
1593
1594
/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1595
/// integer vectors to determine if the values in the first operand are less
1596
/// than those in the second operand. Each comparison yields 0h for false,
1597
/// FFh for true.
1598
///
1599
/// \headerfile <x86intrin.h>
1600
///
1601
/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1602
///
1603
/// \param __a
1604
/// A 128-bit integer vector.
1605
/// \param __b
1606
/// A 128-bit integer vector.
1607
/// \returns A 128-bit integer vector containing the comparison results.
1608
static __inline__ __m128i __DEFAULT_FN_ATTRS
1609
_mm_cmplt_epi8(__m128i __a, __m128i __b)
1610
{
1611
return _mm_cmpgt_epi8(__b, __a);
1612
}
1613
1614
/// \brief Compares each of the corresponding signed 16-bit values of the
1615
/// 128-bit integer vectors to determine if the values in the first operand
1616
/// are less than those in the second operand. Each comparison yields 0h for
1617
/// false, FFFFh for true.
1618
///
1619
/// \headerfile <x86intrin.h>
1620
///
1621
/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1622
///
1623
/// \param __a
1624
/// A 128-bit integer vector.
1625
/// \param __b
1626
/// A 128-bit integer vector.
1627
/// \returns A 128-bit integer vector containing the comparison results.
1628
static __inline__ __m128i __DEFAULT_FN_ATTRS
1629
_mm_cmplt_epi16(__m128i __a, __m128i __b)
1630
{
1631
return _mm_cmpgt_epi16(__b, __a);
1632
}
1633
1634
/// \brief Compares each of the corresponding signed 32-bit values of the
1635
/// 128-bit integer vectors to determine if the values in the first operand
1636
/// are less than those in the second operand. Each comparison yields 0h for
1637
/// false, FFFFFFFFh for true.
1638
///
1639
/// \headerfile <x86intrin.h>
1640
///
1641
/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1642
///
1643
/// \param __a
1644
/// A 128-bit integer vector.
1645
/// \param __b
1646
/// A 128-bit integer vector.
1647
/// \returns A 128-bit integer vector containing the comparison results.
1648
static __inline__ __m128i __DEFAULT_FN_ATTRS
1649
_mm_cmplt_epi32(__m128i __a, __m128i __b)
1650
{
1651
return _mm_cmpgt_epi32(__b, __a);
1652
}
1653
1654
#ifdef __x86_64__
1655
/// \brief Converts a 64-bit signed integer value from the second operand into a
1656
/// double-precision value and returns it in the lower element of a [2 x
1657
/// double] vector; the upper element of the returned vector is copied from
1658
/// the upper element of the first operand.
1659
///
1660
/// \headerfile <x86intrin.h>
1661
///
1662
/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
1663
///
1664
/// \param __a
1665
/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
1666
/// copied to the upper 64 bits of the destination.
1667
/// \param __b
1668
/// A 64-bit signed integer operand containing the value to be converted.
1669
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
1670
/// converted value of the second operand. The upper 64 bits are copied from
1671
/// the upper 64 bits of the first operand.
1672
static __inline__ __m128d __DEFAULT_FN_ATTRS
1673
_mm_cvtsi64_sd(__m128d __a, long long __b)
1674
{
1675
__a[0] = __b;
1676
return __a;
1677
}
1678
1679
/// \brief Converts the first (lower) element of a vector of [2 x double] into a
1680
/// 64-bit signed integer value, according to the current rounding mode.
1681
///
1682
/// \headerfile <x86intrin.h>
1683
///
1684
/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
1685
///
1686
/// \param __a
1687
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1688
/// conversion.
1689
/// \returns A 64-bit signed integer containing the converted value.
1690
static __inline__ long long __DEFAULT_FN_ATTRS
1691
_mm_cvtsd_si64(__m128d __a)
1692
{
1693
return __builtin_ia32_cvtsd2si64((__v2df)__a);
1694
}
1695
1696
/// \brief Converts the first (lower) element of a vector of [2 x double] into a
1697
/// 64-bit signed integer value, truncating the result when it is inexact.
1698
///
1699
/// \headerfile <x86intrin.h>
1700
///
1701
/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
1702
///
1703
/// \param __a
1704
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1705
/// conversion.
1706
/// \returns A 64-bit signed integer containing the converted value.
1707
static __inline__ long long __DEFAULT_FN_ATTRS
1708
_mm_cvttsd_si64(__m128d __a)
1709
{
1710
return __a[0];
1711
}
1712
#endif
1713
1714
/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
1715
///
1716
/// \headerfile <x86intrin.h>
1717
///
1718
/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
1719
///
1720
/// \param __a
1721
/// A 128-bit integer vector.
1722
/// \returns A 128-bit vector of [4 x float] containing the converted values.
1723
static __inline__ __m128 __DEFAULT_FN_ATTRS
1724
_mm_cvtepi32_ps(__m128i __a)
1725
{
1726
return __builtin_ia32_cvtdq2ps((__v4si)__a);
1727
}
1728
1729
/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
1730
///
1731
/// \headerfile <x86intrin.h>
1732
///
1733
/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
1734
///
1735
/// \param __a
1736
/// A 128-bit vector of [4 x float].
1737
/// \returns A 128-bit integer vector of [4 x i32] containing the converted
1738
/// values.
1739
static __inline__ __m128i __DEFAULT_FN_ATTRS
1740
_mm_cvtps_epi32(__m128 __a)
1741
{
1742
return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
1743
}
1744
1745
/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
1746
/// truncating the result when it is inexact.
1747
///
1748
/// \headerfile <x86intrin.h>
1749
///
1750
/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
1751
///
1752
/// \param __a
1753
/// A 128-bit vector of [4 x float].
1754
/// \returns A 128-bit vector of [4 x i32] containing the converted values.
1755
static __inline__ __m128i __DEFAULT_FN_ATTRS
1756
_mm_cvttps_epi32(__m128 __a)
1757
{
1758
return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si);
1759
}
1760
1761
/// \brief Returns a vector of [4 x i32] where the lowest element is the input
1762
/// operand and the remaining elements are zero.
1763
///
1764
/// \headerfile <x86intrin.h>
1765
///
1766
/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1767
///
1768
/// \param __a
1769
/// A 32-bit signed integer operand.
1770
/// \returns A 128-bit vector of [4 x i32].
1771
static __inline__ __m128i __DEFAULT_FN_ATTRS
1772
_mm_cvtsi32_si128(int __a)
1773
{
1774
return (__m128i)(__v4si){ __a, 0, 0, 0 };
1775
}
1776
1777
#ifdef __x86_64__
1778
/// \brief Returns a vector of [2 x i64] where the lower element is the input
1779
/// operand and the upper element is zero.
1780
///
1781
/// \headerfile <x86intrin.h>
1782
///
1783
/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1784
///
1785
/// \param __a
1786
/// A 64-bit signed integer operand containing the value to be converted.
1787
/// \returns A 128-bit vector of [2 x i64] containing the converted value.
1788
static __inline__ __m128i __DEFAULT_FN_ATTRS
1789
_mm_cvtsi64_si128(long long __a)
1790
{
1791
return (__m128i){ __a, 0 };
1792
}
1793
#endif
1794
1795
/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
1796
/// 32-bit signed integer value.
1797
///
1798
/// \headerfile <x86intrin.h>
1799
///
1800
/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1801
///
1802
/// \param __a
1803
/// A vector of [4 x i32]. The least significant 32 bits are moved to the
1804
/// destination.
1805
/// \returns A 32-bit signed integer containing the moved value.
1806
static __inline__ int __DEFAULT_FN_ATTRS
1807
_mm_cvtsi128_si32(__m128i __a)
1808
{
1809
__v4si __b = (__v4si)__a;
1810
return __b[0];
1811
}
1812
1813
#ifdef __x86_64__
1814
/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
1815
/// 64-bit signed integer value.
1816
///
1817
/// \headerfile <x86intrin.h>
1818
///
1819
/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1820
///
1821
/// \param __a
1822
/// A vector of [2 x i64]. The least significant 64 bits are moved to the
1823
/// destination.
1824
/// \returns A 64-bit signed integer containing the moved value.
1825
static __inline__ long long __DEFAULT_FN_ATTRS
1826
_mm_cvtsi128_si64(__m128i __a)
1827
{
1828
return __a[0];
1829
}
1830
#endif
1831
1832
/// \brief Moves packed integer values from an aligned 128-bit memory location
1833
/// to elements in a 128-bit integer vector.
1834
///
1835
/// \headerfile <x86intrin.h>
1836
///
1837
/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
1838
///
1839
/// \param __p
1840
/// An aligned pointer to a memory location containing integer values.
1841
/// \returns A 128-bit integer vector containing the moved values.
1842
static __inline__ __m128i __DEFAULT_FN_ATTRS
1843
_mm_load_si128(__m128i const *__p)
1844
{
1845
return *__p;
1846
}
1847
1848
/// \brief Moves packed integer values from an unaligned 128-bit memory location
1849
/// to elements in a 128-bit integer vector.
1850
///
1851
/// \headerfile <x86intrin.h>
1852
///
1853
/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
1854
///
1855
/// \param __p
1856
/// A pointer to a memory location containing integer values.
1857
/// \returns A 128-bit integer vector containing the moved values.
1858
static __inline__ __m128i __DEFAULT_FN_ATTRS
1859
_mm_loadu_si128(__m128i const *__p)
1860
{
1861
struct __loadu_si128 {
1862
__m128i __v;
1863
} __attribute__((__packed__, __may_alias__));
1864
return ((struct __loadu_si128*)__p)->__v;
1865
}
1866
1867
/// \brief Returns a vector of [2 x i64] where the lower element is taken from
1868
/// the lower element of the operand, and the upper element is zero.
1869
///
1870
/// \headerfile <x86intrin.h>
1871
///
1872
/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1873
///
1874
/// \param __p
1875
/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
1876
/// the destination.
1877
/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
1878
/// moved value. The higher order bits are cleared.
1879
static __inline__ __m128i __DEFAULT_FN_ATTRS
1880
_mm_loadl_epi64(__m128i const *__p)
1881
{
1882
struct __mm_loadl_epi64_struct {
1883
long long __u;
1884
} __attribute__((__packed__, __may_alias__));
1885
return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1886
}
1887
1888
/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
1889
/// This could be used as an argument to another intrinsic function where the
1890
/// argument is required but the value is not actually used.
1891
///
1892
/// \headerfile <x86intrin.h>
1893
///
1894
/// This intrinsic has no corresponding instruction.
1895
///
1896
/// \returns A 128-bit vector of [4 x i32] with unspecified content.
1897
static __inline__ __m128i __DEFAULT_FN_ATTRS
1898
_mm_undefined_si128(void)
1899
{
1900
return (__m128i)__builtin_ia32_undef128();
1901
}
1902
1903
/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1904
/// the specified 64-bit integer values.
1905
///
1906
/// \headerfile <x86intrin.h>
1907
///
1908
/// This intrinsic is a utility function and does not correspond to a specific
1909
/// instruction.
1910
///
1911
/// \param __q1
1912
/// A 64-bit integer value used to initialize the upper 64 bits of the
1913
/// destination vector of [2 x i64].
1914
/// \param __q0
1915
/// A 64-bit integer value used to initialize the lower 64 bits of the
1916
/// destination vector of [2 x i64].
1917
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
1918
/// provided in the operands.
1919
static __inline__ __m128i __DEFAULT_FN_ATTRS
1920
_mm_set_epi64x(long long __q1, long long __q0)
1921
{
1922
return (__m128i){ __q0, __q1 };
1923
}
1924
1925
/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1926
/// the specified 64-bit integer values.
1927
///
1928
/// \headerfile <x86intrin.h>
1929
///
1930
/// This intrinsic is a utility function and does not correspond to a specific
1931
/// instruction.
1932
///
1933
/// \param __q1
1934
/// A 64-bit integer value used to initialize the upper 64 bits of the
1935
/// destination vector of [2 x i64].
1936
/// \param __q0
1937
/// A 64-bit integer value used to initialize the lower 64 bits of the
1938
/// destination vector of [2 x i64].
1939
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
1940
/// provided in the operands.
1941
static __inline__ __m128i __DEFAULT_FN_ATTRS
1942
_mm_set_epi64(__m64 __q1, __m64 __q0)
1943
{
1944
return (__m128i){ (long long)__q0, (long long)__q1 };
1945
}
1946
1947
/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
1948
/// the specified 32-bit integer values.
1949
///
1950
/// \headerfile <x86intrin.h>
1951
///
1952
/// This intrinsic is a utility function and does not correspond to a specific
1953
/// instruction.
1954
///
1955
/// \param __i3
1956
/// A 32-bit integer value used to initialize bits [127:96] of the
1957
/// destination vector.
1958
/// \param __i2
1959
/// A 32-bit integer value used to initialize bits [95:64] of the destination
1960
/// vector.
1961
/// \param __i1
1962
/// A 32-bit integer value used to initialize bits [63:32] of the destination
1963
/// vector.
1964
/// \param __i0
1965
/// A 32-bit integer value used to initialize bits [31:0] of the destination
1966
/// vector.
1967
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
1968
/// provided in the operands.
1969
static __inline__ __m128i __DEFAULT_FN_ATTRS
1970
_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
1971
{
1972
return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
1973
}
1974
1975
/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
1976
/// the specified 16-bit integer values.
1977
///
1978
/// \headerfile <x86intrin.h>
1979
///
1980
/// This intrinsic is a utility function and does not correspond to a specific
1981
/// instruction.
1982
///
1983
/// \param __w7
1984
/// A 16-bit integer value used to initialize bits [127:112] of the
1985
/// destination vector.
1986
/// \param __w6
1987
/// A 16-bit integer value used to initialize bits [111:96] of the
1988
/// destination vector.
1989
/// \param __w5
1990
/// A 16-bit integer value used to initialize bits [95:80] of the destination
1991
/// vector.
1992
/// \param __w4
1993
/// A 16-bit integer value used to initialize bits [79:64] of the destination
1994
/// vector.
1995
/// \param __w3
1996
/// A 16-bit integer value used to initialize bits [63:48] of the destination
1997
/// vector.
1998
/// \param __w2
1999
/// A 16-bit integer value used to initialize bits [47:32] of the destination
2000
/// vector.
2001
/// \param __w1
2002
/// A 16-bit integer value used to initialize bits [31:16] of the destination
2003
/// vector.
2004
/// \param __w0
2005
/// A 16-bit integer value used to initialize bits [15:0] of the destination
2006
/// vector.
2007
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
2008
/// provided in the operands.
2009
static __inline__ __m128i __DEFAULT_FN_ATTRS
2010
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
2011
{
2012
return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
2013
}
2014
2015
/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
2016
/// the specified 8-bit integer values.
2017
///
2018
/// \headerfile <x86intrin.h>
2019
///
2020
/// This intrinsic is a utility function and does not correspond to a specific
2021
/// instruction.
2022
///
2023
/// \param __b15
2024
/// Initializes bits [127:120] of the destination vector.
2025
/// \param __b14
2026
/// Initializes bits [119:112] of the destination vector.
2027
/// \param __b13
2028
/// Initializes bits [111:104] of the destination vector.
2029
/// \param __b12
2030
/// Initializes bits [103:96] of the destination vector.
2031
/// \param __b11
2032
/// Initializes bits [95:88] of the destination vector.
2033
/// \param __b10
2034
/// Initializes bits [87:80] of the destination vector.
2035
/// \param __b9
2036
/// Initializes bits [79:72] of the destination vector.
2037
/// \param __b8
2038
/// Initializes bits [71:64] of the destination vector.
2039
/// \param __b7
2040
/// Initializes bits [63:56] of the destination vector.
2041
/// \param __b6
2042
/// Initializes bits [55:48] of the destination vector.
2043
/// \param __b5
2044
/// Initializes bits [47:40] of the destination vector.
2045
/// \param __b4
2046
/// Initializes bits [39:32] of the destination vector.
2047
/// \param __b3
2048
/// Initializes bits [31:24] of the destination vector.
2049
/// \param __b2
2050
/// Initializes bits [23:16] of the destination vector.
2051
/// \param __b1
2052
/// Initializes bits [15:8] of the destination vector.
2053
/// \param __b0
2054
/// Initializes bits [7:0] of the destination vector.
2055
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
2056
/// provided in the operands.
2057
static __inline__ __m128i __DEFAULT_FN_ATTRS
2058
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
2059
{
2060
return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
2061
}
2062
2063
/// \brief Initializes both values in a 128-bit integer vector with the
2064
/// specified 64-bit integer value.
2065
///
2066
/// \headerfile <x86intrin.h>
2067
///
2068
/// This intrinsic is a utility function and does not correspond to a specific
2069
/// instruction.
2070
///
2071
/// \param __q
2072
/// Integer value used to initialize the elements of the destination integer
2073
/// vector.
2074
/// \returns An initialized 128-bit integer vector of [2 x i64] with both
2075
/// elements containing the value provided in the operand.
2076
static __inline__ __m128i __DEFAULT_FN_ATTRS
2077
_mm_set1_epi64x(long long __q)
2078
{
2079
return (__m128i){ __q, __q };
2080
}
2081
2082
/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
2083
/// specified 64-bit value.
2084
///
2085
/// \headerfile <x86intrin.h>
2086
///
2087
/// This intrinsic is a utility function and does not correspond to a specific
2088
/// instruction.
2089
///
2090
/// \param __q
2091
/// A 64-bit value used to initialize the elements of the destination integer
2092
/// vector.
2093
/// \returns An initialized 128-bit vector of [2 x i64] with all elements
2094
/// containing the value provided in the operand.
2095
static __inline__ __m128i __DEFAULT_FN_ATTRS
2096
_mm_set1_epi64(__m64 __q)
2097
{
2098
return (__m128i){ (long long)__q, (long long)__q };
2099
}
2100
2101
/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
2102
/// specified 32-bit value.
2103
///
2104
/// \headerfile <x86intrin.h>
2105
///
2106
/// This intrinsic is a utility function and does not correspond to a specific
2107
/// instruction.
2108
///
2109
/// \param __i
2110
/// A 32-bit value used to initialize the elements of the destination integer
2111
/// vector.
2112
/// \returns An initialized 128-bit vector of [4 x i32] with all elements
2113
/// containing the value provided in the operand.
2114
static __inline__ __m128i __DEFAULT_FN_ATTRS
2115
_mm_set1_epi32(int __i)
2116
{
2117
return (__m128i)(__v4si){ __i, __i, __i, __i };
2118
}
2119
2120
/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
2121
/// specified 16-bit value.
2122
///
2123
/// \headerfile <x86intrin.h>
2124
///
2125
/// This intrinsic is a utility function and does not correspond to a specific
2126
/// instruction.
2127
///
2128
/// \param __w
2129
/// A 16-bit value used to initialize the elements of the destination integer
2130
/// vector.
2131
/// \returns An initialized 128-bit vector of [8 x i16] with all elements
2132
/// containing the value provided in the operand.
2133
static __inline__ __m128i __DEFAULT_FN_ATTRS
2134
_mm_set1_epi16(short __w)
2135
{
2136
return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
2137
}
2138
2139
/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
2140
/// specified 8-bit value.
2141
///
2142
/// \headerfile <x86intrin.h>
2143
///
2144
/// This intrinsic is a utility function and does not correspond to a specific
2145
/// instruction.
2146
///
2147
/// \param __b
2148
/// An 8-bit value used to initialize the elements of the destination integer
2149
/// vector.
2150
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
2151
/// containing the value provided in the operand.
2152
static __inline__ __m128i __DEFAULT_FN_ATTRS
2153
_mm_set1_epi8(char __b)
2154
{
2155
return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
2156
}
2157
2158
static __inline__ __m128i __DEFAULT_FN_ATTRS
2159
_mm_setr_epi64(__m64 __q0, __m64 __q1)
2160
{
2161
return (__m128i){ (long long)__q0, (long long)__q1 };
2162
}
2163
2164
static __inline__ __m128i __DEFAULT_FN_ATTRS
2165
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
2166
{
2167
return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
2168
}
2169
2170
static __inline__ __m128i __DEFAULT_FN_ATTRS
2171
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
2172
{
2173
return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
2174
}
2175
2176
static __inline__ __m128i __DEFAULT_FN_ATTRS
2177
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
2178
{
2179
return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
2180
}
2181
2182
static __inline__ __m128i __DEFAULT_FN_ATTRS
2183
_mm_setzero_si128(void)
2184
{
2185
return (__m128i){ 0LL, 0LL };
2186
}
2187
2188
static __inline__ void __DEFAULT_FN_ATTRS
2189
_mm_store_si128(__m128i *__p, __m128i __b)
2190
{
2191
*__p = __b;
2192
}
2193
2194
static __inline__ void __DEFAULT_FN_ATTRS
2195
_mm_storeu_si128(__m128i *__p, __m128i __b)
2196
{
2197
struct __storeu_si128 {
2198
__m128i __v;
2199
} __attribute__((__packed__, __may_alias__));
2200
((struct __storeu_si128*)__p)->__v = __b;
2201
}
2202
2203
static __inline__ void __DEFAULT_FN_ATTRS
2204
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
2205
{
2206
__builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
2207
}
2208
2209
static __inline__ void __DEFAULT_FN_ATTRS
2210
_mm_storel_epi64(__m128i *__p, __m128i __a)
2211
{
2212
struct __mm_storel_epi64_struct {
2213
long long __u;
2214
} __attribute__((__packed__, __may_alias__));
2215
((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
2216
}
2217
2218
static __inline__ void __DEFAULT_FN_ATTRS
2219
_mm_stream_pd(double *__p, __m128d __a)
2220
{
2221
__builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
2222
}
2223
2224
static __inline__ void __DEFAULT_FN_ATTRS
2225
_mm_stream_si128(__m128i *__p, __m128i __a)
2226
{
2227
__builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
2228
}
2229
2230
static __inline__ void __DEFAULT_FN_ATTRS
2231
_mm_stream_si32(int *__p, int __a)
2232
{
2233
__builtin_ia32_movnti(__p, __a);
2234
}
2235
2236
#ifdef __x86_64__
2237
static __inline__ void __DEFAULT_FN_ATTRS
2238
_mm_stream_si64(long long *__p, long long __a)
2239
{
2240
__builtin_ia32_movnti64(__p, __a);
2241
}
2242
#endif
2243
2244
static __inline__ void __DEFAULT_FN_ATTRS
2245
_mm_clflush(void const *__p)
2246
{
2247
__builtin_ia32_clflush(__p);
2248
}
2249
2250
static __inline__ void __DEFAULT_FN_ATTRS
2251
_mm_lfence(void)
2252
{
2253
__builtin_ia32_lfence();
2254
}
2255
2256
static __inline__ void __DEFAULT_FN_ATTRS
2257
_mm_mfence(void)
2258
{
2259
__builtin_ia32_mfence();
2260
}
2261
2262
static __inline__ __m128i __DEFAULT_FN_ATTRS
2263
_mm_packs_epi16(__m128i __a, __m128i __b)
2264
{
2265
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
2266
}
2267
2268
static __inline__ __m128i __DEFAULT_FN_ATTRS
2269
_mm_packs_epi32(__m128i __a, __m128i __b)
2270
{
2271
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
2272
}
2273
2274
static __inline__ __m128i __DEFAULT_FN_ATTRS
2275
_mm_packus_epi16(__m128i __a, __m128i __b)
2276
{
2277
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
2278
}
2279
2280
static __inline__ int __DEFAULT_FN_ATTRS
2281
_mm_extract_epi16(__m128i __a, int __imm)
2282
{
2283
__v8hi __b = (__v8hi)__a;
2284
return (unsigned short)__b[__imm & 7];
2285
}
2286
2287
static __inline__ __m128i __DEFAULT_FN_ATTRS
2288
_mm_insert_epi16(__m128i __a, int __b, int __imm)
2289
{
2290
__v8hi __c = (__v8hi)__a;
2291
__c[__imm & 7] = __b;
2292
return (__m128i)__c;
2293
}
2294
2295
static __inline__ int __DEFAULT_FN_ATTRS
2296
_mm_movemask_epi8(__m128i __a)
2297
{
2298
return __builtin_ia32_pmovmskb128((__v16qi)__a);
2299
}
2300
2301
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
2302
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
2303
(__v4si)_mm_undefined_si128(), \
2304
((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
2305
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
2306
2307
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
2308
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
2309
(__v8hi)_mm_undefined_si128(), \
2310
((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
2311
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
2312
4, 5, 6, 7); })
2313
2314
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
2315
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
2316
(__v8hi)_mm_undefined_si128(), \
2317
0, 1, 2, 3, \
2318
4 + (((imm) >> 0) & 0x3), \
2319
4 + (((imm) >> 2) & 0x3), \
2320
4 + (((imm) >> 4) & 0x3), \
2321
4 + (((imm) >> 6) & 0x3)); })
2322
2323
static __inline__ __m128i __DEFAULT_FN_ATTRS
2324
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
2325
{
2326
return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2327
}
2328
2329
static __inline__ __m128i __DEFAULT_FN_ATTRS
2330
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
2331
{
2332
return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
2333
}
2334
2335
static __inline__ __m128i __DEFAULT_FN_ATTRS
2336
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
2337
{
2338
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
2339
}
2340
2341
static __inline__ __m128i __DEFAULT_FN_ATTRS
2342
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
2343
{
2344
return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
2345
}
2346
2347
static __inline__ __m128i __DEFAULT_FN_ATTRS
2348
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
2349
{
2350
return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
2351
}
2352
2353
static __inline__ __m128i __DEFAULT_FN_ATTRS
2354
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
2355
{
2356
return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
2357
}
2358
2359
static __inline__ __m128i __DEFAULT_FN_ATTRS
2360
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
2361
{
2362
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
2363
}
2364
2365
static __inline__ __m128i __DEFAULT_FN_ATTRS
2366
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
2367
{
2368
return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
2369
}
2370
2371
static __inline__ __m64 __DEFAULT_FN_ATTRS
2372
_mm_movepi64_pi64(__m128i __a)
2373
{
2374
return (__m64)__a[0];
2375
}
2376
2377
static __inline__ __m128i __DEFAULT_FN_ATTRS
2378
_mm_movpi64_epi64(__m64 __a)
2379
{
2380
return (__m128i){ (long long)__a, 0 };
2381
}
2382
2383
static __inline__ __m128i __DEFAULT_FN_ATTRS
2384
_mm_move_epi64(__m128i __a)
2385
{
2386
return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
2387
}
2388
2389
static __inline__ __m128d __DEFAULT_FN_ATTRS
2390
_mm_unpackhi_pd(__m128d __a, __m128d __b)
2391
{
2392
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
2393
}
2394
2395
static __inline__ __m128d __DEFAULT_FN_ATTRS
2396
_mm_unpacklo_pd(__m128d __a, __m128d __b)
2397
{
2398
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
2399
}
2400
2401
static __inline__ int __DEFAULT_FN_ATTRS
2402
_mm_movemask_pd(__m128d __a)
2403
{
2404
return __builtin_ia32_movmskpd((__v2df)__a);
2405
}
2406
2407
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
2408
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
2409
0 + (((i) >> 0) & 0x1), \
2410
2 + (((i) >> 1) & 0x1)); })
2411
2412
static __inline__ __m128 __DEFAULT_FN_ATTRS
2413
_mm_castpd_ps(__m128d __a)
2414
{
2415
return (__m128)__a;
2416
}
2417
2418
static __inline__ __m128i __DEFAULT_FN_ATTRS
2419
_mm_castpd_si128(__m128d __a)
2420
{
2421
return (__m128i)__a;
2422
}
2423
2424
static __inline__ __m128d __DEFAULT_FN_ATTRS
2425
_mm_castps_pd(__m128 __a)
2426
{
2427
return (__m128d)__a;
2428
}
2429
2430
static __inline__ __m128i __DEFAULT_FN_ATTRS
2431
_mm_castps_si128(__m128 __a)
2432
{
2433
return (__m128i)__a;
2434
}
2435
2436
static __inline__ __m128 __DEFAULT_FN_ATTRS
2437
_mm_castsi128_ps(__m128i __a)
2438
{
2439
return (__m128)__a;
2440
}
2441
2442
static __inline__ __m128d __DEFAULT_FN_ATTRS
2443
_mm_castsi128_pd(__m128i __a)
2444
{
2445
return (__m128d)__a;
2446
}
2447
2448
static __inline__ void __DEFAULT_FN_ATTRS
2449
_mm_pause(void)
2450
{
2451
__builtin_ia32_pause();
2452
}
2453
2454
#undef __DEFAULT_FN_ATTRS
2455
2456
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
2457
2458
#endif /* __EMMINTRIN_H */
2459
2460