GitHub Repository: screetsec/TheFatRat
Path: blob/master/tools/android-sdk/renderscript/clang-include/emmintrin.h
1
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2
*
3
* Permission is hereby granted, free of charge, to any person obtaining a copy
4
* of this software and associated documentation files (the "Software"), to deal
5
* in the Software without restriction, including without limitation the rights
6
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
* copies of the Software, and to permit persons to whom the Software is
8
* furnished to do so, subject to the following conditions:
9
*
10
* The above copyright notice and this permission notice shall be included in
11
* all copies or substantial portions of the Software.
12
*
13
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
* THE SOFTWARE.
20
*
21
*===-----------------------------------------------------------------------===
22
*/
23
24
#ifndef __EMMINTRIN_H
25
#define __EMMINTRIN_H
26
27
#include <xmmintrin.h>
28
29
typedef double __m128d __attribute__((__vector_size__(16)));
30
typedef long long __m128i __attribute__((__vector_size__(16)));
31
32
/* Type defines. */
33
typedef double __v2df __attribute__ ((__vector_size__ (16)));
34
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
35
typedef short __v8hi __attribute__((__vector_size__(16)));
36
typedef char __v16qi __attribute__((__vector_size__(16)));
37
38
/* Unsigned types */
39
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
40
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
41
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
42
43
/* We need an explicitly signed variant for char. Note that this shouldn't
44
* appear in the interface though. */
45
typedef signed char __v16qs __attribute__((__vector_size__(16)));
46
47
#include <f16cintrin.h>
48
49
/* Define the default attributes for the functions in this file. */
50
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
51
52
static __inline__ __m128d __DEFAULT_FN_ATTRS
53
_mm_add_sd(__m128d __a, __m128d __b)
54
{
55
__a[0] += __b[0];
56
return __a;
57
}
58
59
static __inline__ __m128d __DEFAULT_FN_ATTRS
60
_mm_add_pd(__m128d __a, __m128d __b)
61
{
62
return (__m128d)((__v2df)__a + (__v2df)__b);
63
}
64
65
static __inline__ __m128d __DEFAULT_FN_ATTRS
66
_mm_sub_sd(__m128d __a, __m128d __b)
67
{
68
__a[0] -= __b[0];
69
return __a;
70
}
71
72
static __inline__ __m128d __DEFAULT_FN_ATTRS
73
_mm_sub_pd(__m128d __a, __m128d __b)
74
{
75
return (__m128d)((__v2df)__a - (__v2df)__b);
76
}
77
78
static __inline__ __m128d __DEFAULT_FN_ATTRS
79
_mm_mul_sd(__m128d __a, __m128d __b)
80
{
81
__a[0] *= __b[0];
82
return __a;
83
}
84
85
static __inline__ __m128d __DEFAULT_FN_ATTRS
86
_mm_mul_pd(__m128d __a, __m128d __b)
87
{
88
return (__m128d)((__v2df)__a * (__v2df)__b);
89
}
90
91
static __inline__ __m128d __DEFAULT_FN_ATTRS
92
_mm_div_sd(__m128d __a, __m128d __b)
93
{
94
__a[0] /= __b[0];
95
return __a;
96
}
97
98
static __inline__ __m128d __DEFAULT_FN_ATTRS
99
_mm_div_pd(__m128d __a, __m128d __b)
100
{
101
return (__m128d)((__v2df)__a / (__v2df)__b);
102
}
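/* Illustrative sketch (not part of the original header): the _sd forms above
 * operate only on the low element and pass the high element of the first
 * operand through, while the _pd forms operate on both lanes. The helper
 * name below is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
example_scalar_vs_packed_add(void)
{
  __m128d __x = (__m128d){ 1.0, 10.0 };
  __m128d __y = (__m128d){ 2.0, 20.0 };
  __m128d __s = _mm_add_sd(__x, __y);  /* { 3.0, 10.0 }: high lane copied from __x */
  __m128d __p = _mm_add_pd(__x, __y);  /* { 3.0, 30.0 }: both lanes added */
  return _mm_sub_pd(__p, __s);         /* { 0.0, 20.0 } */
}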
103
104
static __inline__ __m128d __DEFAULT_FN_ATTRS
105
_mm_sqrt_sd(__m128d __a, __m128d __b)
106
{
107
__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
108
return (__m128d) { __c[0], __a[1] };
109
}
110
111
static __inline__ __m128d __DEFAULT_FN_ATTRS
112
_mm_sqrt_pd(__m128d __a)
113
{
114
return __builtin_ia32_sqrtpd((__v2df)__a);
115
}
116
117
static __inline__ __m128d __DEFAULT_FN_ATTRS
118
_mm_min_sd(__m128d __a, __m128d __b)
119
{
120
return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
121
}
122
123
static __inline__ __m128d __DEFAULT_FN_ATTRS
124
_mm_min_pd(__m128d __a, __m128d __b)
125
{
126
return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
127
}
128
129
static __inline__ __m128d __DEFAULT_FN_ATTRS
130
_mm_max_sd(__m128d __a, __m128d __b)
131
{
132
return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
133
}
134
135
static __inline__ __m128d __DEFAULT_FN_ATTRS
136
_mm_max_pd(__m128d __a, __m128d __b)
137
{
138
return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
139
}
140
141
static __inline__ __m128d __DEFAULT_FN_ATTRS
142
_mm_and_pd(__m128d __a, __m128d __b)
143
{
144
return (__m128d)((__v4su)__a & (__v4su)__b);
145
}
146
147
static __inline__ __m128d __DEFAULT_FN_ATTRS
148
_mm_andnot_pd(__m128d __a, __m128d __b)
149
{
150
return (__m128d)(~(__v4su)__a & (__v4su)__b);
151
}
152
153
static __inline__ __m128d __DEFAULT_FN_ATTRS
154
_mm_or_pd(__m128d __a, __m128d __b)
155
{
156
return (__m128d)((__v4su)__a | (__v4su)__b);
157
}
158
159
static __inline__ __m128d __DEFAULT_FN_ATTRS
160
_mm_xor_pd(__m128d __a, __m128d __b)
161
{
162
return (__m128d)((__v4su)__a ^ (__v4su)__b);
163
}
164
165
static __inline__ __m128d __DEFAULT_FN_ATTRS
166
_mm_cmpeq_pd(__m128d __a, __m128d __b)
167
{
168
return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
169
}
170
171
static __inline__ __m128d __DEFAULT_FN_ATTRS
172
_mm_cmplt_pd(__m128d __a, __m128d __b)
173
{
174
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
175
}
176
177
static __inline__ __m128d __DEFAULT_FN_ATTRS
178
_mm_cmple_pd(__m128d __a, __m128d __b)
179
{
180
return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
181
}
182
183
static __inline__ __m128d __DEFAULT_FN_ATTRS
184
_mm_cmpgt_pd(__m128d __a, __m128d __b)
185
{
186
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
187
}
188
189
static __inline__ __m128d __DEFAULT_FN_ATTRS
190
_mm_cmpge_pd(__m128d __a, __m128d __b)
191
{
192
return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
193
}
194
195
static __inline__ __m128d __DEFAULT_FN_ATTRS
196
_mm_cmpord_pd(__m128d __a, __m128d __b)
197
{
198
return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
199
}
200
201
static __inline__ __m128d __DEFAULT_FN_ATTRS
202
_mm_cmpunord_pd(__m128d __a, __m128d __b)
203
{
204
return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
205
}
206
207
static __inline__ __m128d __DEFAULT_FN_ATTRS
208
_mm_cmpneq_pd(__m128d __a, __m128d __b)
209
{
210
return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
211
}
212
213
static __inline__ __m128d __DEFAULT_FN_ATTRS
214
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
215
{
216
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
217
}
218
219
static __inline__ __m128d __DEFAULT_FN_ATTRS
220
_mm_cmpnle_pd(__m128d __a, __m128d __b)
221
{
222
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
223
}
224
225
static __inline__ __m128d __DEFAULT_FN_ATTRS
226
_mm_cmpngt_pd(__m128d __a, __m128d __b)
227
{
228
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
229
}
230
231
static __inline__ __m128d __DEFAULT_FN_ATTRS
232
_mm_cmpnge_pd(__m128d __a, __m128d __b)
233
{
234
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
235
}
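/* Illustrative sketch (not part of the original header): the packed compare
 * intrinsics above return an all-ones (true) or all-zeros (false) bit pattern
 * per 64-bit lane, which combines with _mm_and_pd/_mm_andnot_pd/_mm_or_pd
 * into a branchless per-lane select. The helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
example_min_pd_via_mask(__m128d __x, __m128d __y)
{
  __m128d __mask = _mm_cmplt_pd(__x, __y);        /* all-ones where x < y */
  return _mm_or_pd(_mm_and_pd(__mask, __x),       /* take x where mask is set   */
                   _mm_andnot_pd(__mask, __y));   /* take y where mask is clear */
}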
236
237
static __inline__ __m128d __DEFAULT_FN_ATTRS
238
_mm_cmpeq_sd(__m128d __a, __m128d __b)
239
{
240
return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
241
}
242
243
static __inline__ __m128d __DEFAULT_FN_ATTRS
244
_mm_cmplt_sd(__m128d __a, __m128d __b)
245
{
246
return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
247
}
248
249
static __inline__ __m128d __DEFAULT_FN_ATTRS
250
_mm_cmple_sd(__m128d __a, __m128d __b)
251
{
252
return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
253
}
254
255
static __inline__ __m128d __DEFAULT_FN_ATTRS
256
_mm_cmpgt_sd(__m128d __a, __m128d __b)
257
{
258
__m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
259
return (__m128d) { __c[0], __a[1] };
260
}
261
262
static __inline__ __m128d __DEFAULT_FN_ATTRS
263
_mm_cmpge_sd(__m128d __a, __m128d __b)
264
{
265
__m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
266
return (__m128d) { __c[0], __a[1] };
267
}
268
269
static __inline__ __m128d __DEFAULT_FN_ATTRS
270
_mm_cmpord_sd(__m128d __a, __m128d __b)
271
{
272
return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
273
}
274
275
static __inline__ __m128d __DEFAULT_FN_ATTRS
276
_mm_cmpunord_sd(__m128d __a, __m128d __b)
277
{
278
return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
279
}
280
281
static __inline__ __m128d __DEFAULT_FN_ATTRS
282
_mm_cmpneq_sd(__m128d __a, __m128d __b)
283
{
284
return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
285
}
286
287
static __inline__ __m128d __DEFAULT_FN_ATTRS
288
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
289
{
290
return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
291
}
292
293
static __inline__ __m128d __DEFAULT_FN_ATTRS
294
_mm_cmpnle_sd(__m128d __a, __m128d __b)
295
{
296
return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
297
}
298
299
static __inline__ __m128d __DEFAULT_FN_ATTRS
300
_mm_cmpngt_sd(__m128d __a, __m128d __b)
301
{
302
__m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
303
return (__m128d) { __c[0], __a[1] };
304
}
305
306
static __inline__ __m128d __DEFAULT_FN_ATTRS
307
_mm_cmpnge_sd(__m128d __a, __m128d __b)
308
{
309
__m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
310
return (__m128d) { __c[0], __a[1] };
311
}
312
313
static __inline__ int __DEFAULT_FN_ATTRS
314
_mm_comieq_sd(__m128d __a, __m128d __b)
315
{
316
return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
317
}
318
319
static __inline__ int __DEFAULT_FN_ATTRS
320
_mm_comilt_sd(__m128d __a, __m128d __b)
321
{
322
return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
323
}
324
325
static __inline__ int __DEFAULT_FN_ATTRS
326
_mm_comile_sd(__m128d __a, __m128d __b)
327
{
328
return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
329
}
330
331
static __inline__ int __DEFAULT_FN_ATTRS
332
_mm_comigt_sd(__m128d __a, __m128d __b)
333
{
334
return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
335
}
336
337
static __inline__ int __DEFAULT_FN_ATTRS
338
_mm_comige_sd(__m128d __a, __m128d __b)
339
{
340
return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
341
}
342
343
static __inline__ int __DEFAULT_FN_ATTRS
344
_mm_comineq_sd(__m128d __a, __m128d __b)
345
{
346
return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
347
}
348
349
static __inline__ int __DEFAULT_FN_ATTRS
350
_mm_ucomieq_sd(__m128d __a, __m128d __b)
351
{
352
return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
353
}
354
355
static __inline__ int __DEFAULT_FN_ATTRS
356
_mm_ucomilt_sd(__m128d __a, __m128d __b)
357
{
358
return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
359
}
360
361
static __inline__ int __DEFAULT_FN_ATTRS
362
_mm_ucomile_sd(__m128d __a, __m128d __b)
363
{
364
return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
365
}
366
367
static __inline__ int __DEFAULT_FN_ATTRS
368
_mm_ucomigt_sd(__m128d __a, __m128d __b)
369
{
370
return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
371
}
372
373
static __inline__ int __DEFAULT_FN_ATTRS
374
_mm_ucomige_sd(__m128d __a, __m128d __b)
375
{
376
return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
377
}
378
379
static __inline__ int __DEFAULT_FN_ATTRS
380
_mm_ucomineq_sd(__m128d __a, __m128d __b)
381
{
382
return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
383
}
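/* Illustrative sketch (not part of the original header): the _mm_comi*_sd and
 * _mm_ucomi*_sd families compare only the low doubles and return 0 or 1, so
 * they can drive ordinary control flow; the comi forms raise the
 * invalid-operation exception for QNaN operands, whereas the ucomi forms do
 * so only for signaling NaNs. The helper name is hypothetical. */
static __inline__ int __DEFAULT_FN_ATTRS
example_low_double_is_less(__m128d __x, __m128d __y)
{
  return _mm_ucomilt_sd(__x, __y);  /* 1 if __x[0] < __y[0], otherwise 0 */
}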
384
385
static __inline__ __m128 __DEFAULT_FN_ATTRS
386
_mm_cvtpd_ps(__m128d __a)
387
{
388
return __builtin_ia32_cvtpd2ps((__v2df)__a);
389
}
390
391
static __inline__ __m128d __DEFAULT_FN_ATTRS
392
_mm_cvtps_pd(__m128 __a)
393
{
394
return (__m128d) __builtin_convertvector(
395
__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
396
}
397
398
static __inline__ __m128d __DEFAULT_FN_ATTRS
399
_mm_cvtepi32_pd(__m128i __a)
400
{
401
return (__m128d) __builtin_convertvector(
402
__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
403
}
404
405
static __inline__ __m128i __DEFAULT_FN_ATTRS
406
_mm_cvtpd_epi32(__m128d __a)
407
{
408
return __builtin_ia32_cvtpd2dq((__v2df)__a);
409
}
410
411
static __inline__ int __DEFAULT_FN_ATTRS
412
_mm_cvtsd_si32(__m128d __a)
413
{
414
return __builtin_ia32_cvtsd2si((__v2df)__a);
415
}
416
417
static __inline__ __m128 __DEFAULT_FN_ATTRS
418
_mm_cvtsd_ss(__m128 __a, __m128d __b)
419
{
420
__a[0] = __b[0];
421
return __a;
422
}
423
424
static __inline__ __m128d __DEFAULT_FN_ATTRS
425
_mm_cvtsi32_sd(__m128d __a, int __b)
426
{
427
__a[0] = __b;
428
return __a;
429
}
430
431
static __inline__ __m128d __DEFAULT_FN_ATTRS
432
_mm_cvtss_sd(__m128d __a, __m128 __b)
433
{
434
__a[0] = __b[0];
435
return __a;
436
}
437
438
static __inline__ __m128i __DEFAULT_FN_ATTRS
439
_mm_cvttpd_epi32(__m128d __a)
440
{
441
return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
442
}
443
444
static __inline__ int __DEFAULT_FN_ATTRS
445
_mm_cvttsd_si32(__m128d __a)
446
{
447
return __a[0];
448
}
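/* Illustrative sketch (not part of the original header): _mm_cvtsd_si32
 * converts using the current rounding mode (round-to-nearest-even by
 * default), while _mm_cvttsd_si32 always truncates toward zero. The helper
 * name is hypothetical. */
static __inline__ int __DEFAULT_FN_ATTRS
example_round_vs_truncate(void)
{
  __m128d __v = (__m128d){ 2.7, 0.0 };
  int __rounded   = _mm_cvtsd_si32(__v);   /* 3 under the default rounding mode */
  int __truncated = _mm_cvttsd_si32(__v);  /* 2: the fraction is discarded */
  return __rounded - __truncated;          /* 1 */
}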
449
450
static __inline__ __m64 __DEFAULT_FN_ATTRS
451
_mm_cvtpd_pi32(__m128d __a)
452
{
453
return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
454
}
455
456
static __inline__ __m64 __DEFAULT_FN_ATTRS
457
_mm_cvttpd_pi32(__m128d __a)
458
{
459
return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
460
}
461
462
static __inline__ __m128d __DEFAULT_FN_ATTRS
463
_mm_cvtpi32_pd(__m64 __a)
464
{
465
return __builtin_ia32_cvtpi2pd((__v2si)__a);
466
}
467
468
static __inline__ double __DEFAULT_FN_ATTRS
469
_mm_cvtsd_f64(__m128d __a)
470
{
471
return __a[0];
472
}
473
474
static __inline__ __m128d __DEFAULT_FN_ATTRS
475
_mm_load_pd(double const *__dp)
476
{
477
return *(__m128d*)__dp;
478
}
479
480
static __inline__ __m128d __DEFAULT_FN_ATTRS
481
_mm_load1_pd(double const *__dp)
482
{
483
struct __mm_load1_pd_struct {
484
double __u;
485
} __attribute__((__packed__, __may_alias__));
486
double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
487
return (__m128d){ __u, __u };
488
}
489
490
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
491
492
static __inline__ __m128d __DEFAULT_FN_ATTRS
493
_mm_loadr_pd(double const *__dp)
494
{
495
__m128d __u = *(__m128d*)__dp;
496
return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
497
}
498
499
static __inline__ __m128d __DEFAULT_FN_ATTRS
500
_mm_loadu_pd(double const *__dp)
501
{
502
struct __loadu_pd {
503
__m128d __v;
504
} __attribute__((__packed__, __may_alias__));
505
return ((struct __loadu_pd*)__dp)->__v;
506
}
507
508
static __inline__ __m128i __DEFAULT_FN_ATTRS
509
_mm_loadu_si64(void const *__a)
510
{
511
struct __loadu_si64 {
512
long long __v;
513
} __attribute__((__packed__, __may_alias__));
514
long long __u = ((struct __loadu_si64*)__a)->__v;
515
return (__m128i){__u, 0L};
516
}
517
518
static __inline__ __m128d __DEFAULT_FN_ATTRS
519
_mm_load_sd(double const *__dp)
520
{
521
struct __mm_load_sd_struct {
522
double __u;
523
} __attribute__((__packed__, __may_alias__));
524
double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
525
return (__m128d){ __u, 0 };
526
}
527
528
static __inline__ __m128d __DEFAULT_FN_ATTRS
529
_mm_loadh_pd(__m128d __a, double const *__dp)
530
{
531
struct __mm_loadh_pd_struct {
532
double __u;
533
} __attribute__((__packed__, __may_alias__));
534
double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
535
return (__m128d){ __a[0], __u };
536
}
537
538
static __inline__ __m128d __DEFAULT_FN_ATTRS
539
_mm_loadl_pd(__m128d __a, double const *__dp)
540
{
541
struct __mm_loadl_pd_struct {
542
double __u;
543
} __attribute__((__packed__, __may_alias__));
544
double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
545
return (__m128d){ __u, __a[1] };
546
}
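/* Illustrative sketch (not part of the original header): _mm_load_pd requires
 * a 16-byte-aligned pointer, whereas _mm_loadu_pd accepts any alignment; both
 * read two consecutive doubles. The helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
example_load_two_doubles(double const *__p)
{
  /* _mm_load_pd(__p) would demand 16-byte alignment of __p;
     _mm_loadu_pd works for any alignment. */
  return _mm_loadu_pd(__p);
}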
547
548
static __inline__ __m128d __DEFAULT_FN_ATTRS
549
_mm_undefined_pd(void)
550
{
551
return (__m128d)__builtin_ia32_undef128();
552
}
553
554
static __inline__ __m128d __DEFAULT_FN_ATTRS
555
_mm_set_sd(double __w)
556
{
557
return (__m128d){ __w, 0 };
558
}
559
560
static __inline__ __m128d __DEFAULT_FN_ATTRS
561
_mm_set1_pd(double __w)
562
{
563
return (__m128d){ __w, __w };
564
}
565
566
static __inline__ __m128d __DEFAULT_FN_ATTRS
567
_mm_set_pd(double __w, double __x)
568
{
569
return (__m128d){ __x, __w };
570
}
571
572
static __inline__ __m128d __DEFAULT_FN_ATTRS
573
_mm_setr_pd(double __w, double __x)
574
{
575
return (__m128d){ __w, __x };
576
}
577
578
static __inline__ __m128d __DEFAULT_FN_ATTRS
579
_mm_setzero_pd(void)
580
{
581
return (__m128d){ 0, 0 };
582
}
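/* Illustrative sketch (not part of the original header): _mm_set_pd takes its
 * arguments high-element first, while _mm_setr_pd takes them in memory
 * (low-first) order, so the two calls below build the same vector. The helper
 * name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
example_set_vs_setr(void)
{
  __m128d __a = _mm_set_pd(2.0, 1.0);   /* element 0 = 1.0, element 1 = 2.0 */
  __m128d __b = _mm_setr_pd(1.0, 2.0);  /* same layout, arguments reversed */
  return _mm_sub_pd(__a, __b);          /* { 0.0, 0.0 } */
}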
583
584
static __inline__ __m128d __DEFAULT_FN_ATTRS
585
_mm_move_sd(__m128d __a, __m128d __b)
586
{
587
return (__m128d){ __b[0], __a[1] };
588
}
589
590
static __inline__ void __DEFAULT_FN_ATTRS
591
_mm_store_sd(double *__dp, __m128d __a)
592
{
593
struct __mm_store_sd_struct {
594
double __u;
595
} __attribute__((__packed__, __may_alias__));
596
((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
597
}
598
599
static __inline__ void __DEFAULT_FN_ATTRS
600
_mm_store_pd(double *__dp, __m128d __a)
601
{
602
*(__m128d*)__dp = __a;
603
}
604
605
static __inline__ void __DEFAULT_FN_ATTRS
606
_mm_store1_pd(double *__dp, __m128d __a)
607
{
608
__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
609
_mm_store_pd(__dp, __a);
610
}
611
612
static __inline__ void __DEFAULT_FN_ATTRS
613
_mm_store_pd1(double *__dp, __m128d __a)
614
{
615
return _mm_store1_pd(__dp, __a);
616
}
617
618
static __inline__ void __DEFAULT_FN_ATTRS
619
_mm_storeu_pd(double *__dp, __m128d __a)
620
{
621
struct __storeu_pd {
622
__m128d __v;
623
} __attribute__((__packed__, __may_alias__));
624
((struct __storeu_pd*)__dp)->__v = __a;
625
}
626
627
static __inline__ void __DEFAULT_FN_ATTRS
628
_mm_storer_pd(double *__dp, __m128d __a)
629
{
630
__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
631
*(__m128d *)__dp = __a;
632
}
633
634
static __inline__ void __DEFAULT_FN_ATTRS
635
_mm_storeh_pd(double *__dp, __m128d __a)
636
{
637
struct __mm_storeh_pd_struct {
638
double __u;
639
} __attribute__((__packed__, __may_alias__));
640
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
641
}
642
643
static __inline__ void __DEFAULT_FN_ATTRS
644
_mm_storel_pd(double *__dp, __m128d __a)
645
{
646
struct __mm_storeh_pd_struct {
647
double __u;
648
} __attribute__((__packed__, __may_alias__));
649
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
650
}
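/* Illustrative sketch (not part of the original header): _mm_storel_pd and
 * _mm_storeh_pd write the low and high doubles individually, while
 * _mm_storeu_pd would write both without an alignment requirement. The helper
 * name is hypothetical. */
static __inline__ void __DEFAULT_FN_ATTRS
example_split_store(double *__lo, double *__hi, __m128d __v)
{
  _mm_storel_pd(__lo, __v);  /* *__lo = __v[0] */
  _mm_storeh_pd(__hi, __v);  /* *__hi = __v[1] */
}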
651
652
static __inline__ __m128i __DEFAULT_FN_ATTRS
653
_mm_add_epi8(__m128i __a, __m128i __b)
654
{
655
return (__m128i)((__v16qu)__a + (__v16qu)__b);
656
}
657
658
static __inline__ __m128i __DEFAULT_FN_ATTRS
659
_mm_add_epi16(__m128i __a, __m128i __b)
660
{
661
return (__m128i)((__v8hu)__a + (__v8hu)__b);
662
}
663
664
static __inline__ __m128i __DEFAULT_FN_ATTRS
665
_mm_add_epi32(__m128i __a, __m128i __b)
666
{
667
return (__m128i)((__v4su)__a + (__v4su)__b);
668
}
669
670
static __inline__ __m64 __DEFAULT_FN_ATTRS
671
_mm_add_si64(__m64 __a, __m64 __b)
672
{
673
return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
674
}
675
676
static __inline__ __m128i __DEFAULT_FN_ATTRS
677
_mm_add_epi64(__m128i __a, __m128i __b)
678
{
679
return (__m128i)((__v2du)__a + (__v2du)__b);
680
}
681
682
static __inline__ __m128i __DEFAULT_FN_ATTRS
683
_mm_adds_epi8(__m128i __a, __m128i __b)
684
{
685
return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
686
}
687
688
static __inline__ __m128i __DEFAULT_FN_ATTRS
689
_mm_adds_epi16(__m128i __a, __m128i __b)
690
{
691
return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
692
}
693
694
static __inline__ __m128i __DEFAULT_FN_ATTRS
695
_mm_adds_epu8(__m128i __a, __m128i __b)
696
{
697
return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
698
}
699
700
static __inline__ __m128i __DEFAULT_FN_ATTRS
701
_mm_adds_epu16(__m128i __a, __m128i __b)
702
{
703
return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
704
}
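/* Illustrative sketch (not part of the original header): the _mm_adds_*
 * intrinsics clamp instead of wrapping. The helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_saturating_byte_add(__m128i __a, __m128i __b)
{
  /* E.g. with every byte of __a equal to 200 and every byte of __b equal to
     100, every result byte is 255 (clamped), whereas _mm_add_epi8 would wrap
     each byte around to 44. */
  return _mm_adds_epu8(__a, __b);
}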
705
706
static __inline__ __m128i __DEFAULT_FN_ATTRS
707
_mm_avg_epu8(__m128i __a, __m128i __b)
708
{
709
return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
710
}
711
712
static __inline__ __m128i __DEFAULT_FN_ATTRS
713
_mm_avg_epu16(__m128i __a, __m128i __b)
714
{
715
return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
716
}
717
718
static __inline__ __m128i __DEFAULT_FN_ATTRS
719
_mm_madd_epi16(__m128i __a, __m128i __b)
720
{
721
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
722
}
723
724
static __inline__ __m128i __DEFAULT_FN_ATTRS
725
_mm_max_epi16(__m128i __a, __m128i __b)
726
{
727
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
728
}
729
730
static __inline__ __m128i __DEFAULT_FN_ATTRS
731
_mm_max_epu8(__m128i __a, __m128i __b)
732
{
733
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
734
}
735
736
static __inline__ __m128i __DEFAULT_FN_ATTRS
737
_mm_min_epi16(__m128i __a, __m128i __b)
738
{
739
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
740
}
741
742
static __inline__ __m128i __DEFAULT_FN_ATTRS
743
_mm_min_epu8(__m128i __a, __m128i __b)
744
{
745
return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
746
}
747
748
static __inline__ __m128i __DEFAULT_FN_ATTRS
749
_mm_mulhi_epi16(__m128i __a, __m128i __b)
750
{
751
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
752
}
753
754
static __inline__ __m128i __DEFAULT_FN_ATTRS
755
_mm_mulhi_epu16(__m128i __a, __m128i __b)
756
{
757
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
758
}
759
760
/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
761
/// returns a vector containing the low-order 16 bits of each 32-bit product
762
/// in the corresponding element.
763
///
764
/// \headerfile <x86intrin.h>
765
///
766
/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
767
///
768
/// \param __a
769
/// A 128-bit integer vector containing one of the source operands.
770
/// \param __b
771
/// A 128-bit integer vector containing one of the source operands.
772
/// \returns A 128-bit integer vector containing the products of both operands.
773
static __inline__ __m128i __DEFAULT_FN_ATTRS
774
_mm_mullo_epi16(__m128i __a, __m128i __b)
775
{
776
return (__m128i)((__v8hu)__a * (__v8hu)__b);
777
}
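/* Illustrative sketch (not part of the original header): when the full 16-bit
 * product does not fit, _mm_mullo_epi16 keeps only the low half; e.g.
 * 300 * 300 = 90000 = 0x15F90, so the stored element is 0x5F90 (24464), while
 * _mm_mulhi_epi16 above would return the high half, 0x0001. The helper name
 * is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_low_half_product(__m128i __a, __m128i __b)
{
  return _mm_mullo_epi16(__a, __b);  /* per-element (a * b) mod 2^16 */
}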
778
779
/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
780
/// of the two 64-bit integer vectors and returns the 64-bit unsigned
781
/// product.
782
///
783
/// \headerfile <x86intrin.h>
784
///
785
/// This intrinsic corresponds to the \c PMULUDQ instruction.
786
///
787
/// \param __a
788
/// A 64-bit integer containing one of the source operands.
789
/// \param __b
790
/// A 64-bit integer containing one of the source operands.
791
/// \returns A 64-bit integer vector containing the product of both operands.
792
static __inline__ __m64 __DEFAULT_FN_ATTRS
793
_mm_mul_su32(__m64 __a, __m64 __b)
794
{
795
return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
796
}
797
798
/// \brief Multiplies 32-bit unsigned integer values contained in the lower
799
/// bits of the corresponding elements of two [2 x i64] vectors, and returns
800
/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
801
///
802
/// \headerfile <x86intrin.h>
803
///
804
/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
805
///
806
/// \param __a
807
/// A [2 x i64] vector containing one of the source operands.
808
/// \param __b
809
/// A [2 x i64] vector containing one of the source operands.
810
/// \returns A [2 x i64] vector containing the product of both operands.
811
static __inline__ __m128i __DEFAULT_FN_ATTRS
812
_mm_mul_epu32(__m128i __a, __m128i __b)
813
{
814
return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
815
}
816
817
/// \brief Computes the absolute differences of corresponding 8-bit integer
818
/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
819
/// separately sums the second 8 absolute differences. Packs these two
820
/// unsigned 16-bit integer sums into the upper and lower elements of a
821
/// [2 x i64] vector.
822
///
823
/// \headerfile <x86intrin.h>
824
///
825
/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
826
///
827
/// \param __a
828
/// A 128-bit integer vector containing one of the source operands.
829
/// \param __b
830
/// A 128-bit integer vector containing one of the source operands.
831
/// \returns A [2 x i64] vector containing the sums of the sets of absolute
832
/// differences between both operands.
833
static __inline__ __m128i __DEFAULT_FN_ATTRS
834
_mm_sad_epu8(__m128i __a, __m128i __b)
835
{
836
return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
837
}
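/* Illustrative sketch (not part of the original header): a common use of
 * _mm_sad_epu8 is summing 16 bytes by taking absolute differences against
 * zero; the low 64-bit lane of the result holds the sum of bytes 0-7 and the
 * high lane the sum of bytes 8-15. The helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_sum_of_bytes(__m128i __v)
{
  return _mm_sad_epu8(__v, (__m128i){ 0, 0 });  /* |b - 0| = b for unsigned bytes */
}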
838
839
/// \brief Subtracts the corresponding 8-bit integer values in the operands.
840
///
841
/// \headerfile <x86intrin.h>
842
///
843
/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
844
///
845
/// \param __a
846
/// A 128-bit integer vector containing the minuends.
847
/// \param __b
848
/// A 128-bit integer vector containing the subtrahends.
849
/// \returns A 128-bit integer vector containing the differences of the values
850
/// in the operands.
851
static __inline__ __m128i __DEFAULT_FN_ATTRS
852
_mm_sub_epi8(__m128i __a, __m128i __b)
853
{
854
return (__m128i)((__v16qu)__a - (__v16qu)__b);
855
}
856
857
/// \brief Subtracts the corresponding 16-bit integer values in the operands.
858
///
859
/// \headerfile <x86intrin.h>
860
///
861
/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
862
///
863
/// \param __a
864
/// A 128-bit integer vector containing the minuends.
865
/// \param __b
866
/// A 128-bit integer vector containing the subtrahends.
867
/// \returns A 128-bit integer vector containing the differences of the values
868
/// in the operands.
869
static __inline__ __m128i __DEFAULT_FN_ATTRS
870
_mm_sub_epi16(__m128i __a, __m128i __b)
871
{
872
return (__m128i)((__v8hu)__a - (__v8hu)__b);
873
}
874
875
/// \brief Subtracts the corresponding 32-bit integer values in the operands.
876
///
877
/// \headerfile <x86intrin.h>
878
///
879
/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
880
///
881
/// \param __a
882
/// A 128-bit integer vector containing the minuends.
883
/// \param __b
884
/// A 128-bit integer vector containing the subtrahends.
885
/// \returns A 128-bit integer vector containing the differences of the values
886
/// in the operands.
887
static __inline__ __m128i __DEFAULT_FN_ATTRS
888
_mm_sub_epi32(__m128i __a, __m128i __b)
889
{
890
return (__m128i)((__v4su)__a - (__v4su)__b);
891
}
892
893
/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
894
/// difference to the corresponding bits in the destination.
895
///
896
/// \headerfile <x86intrin.h>
897
///
898
/// This intrinsic corresponds to the \c PSUBQ instruction.
899
///
900
/// \param __a
901
/// A 64-bit integer vector containing the minuend.
902
/// \param __b
903
/// A 64-bit integer vector containing the subtrahend.
904
/// \returns A 64-bit integer vector containing the difference of the values in
905
/// the operands.
906
static __inline__ __m64 __DEFAULT_FN_ATTRS
907
_mm_sub_si64(__m64 __a, __m64 __b)
908
{
909
return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
910
}
911
912
/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
913
///
914
/// \headerfile <x86intrin.h>
915
///
916
/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
917
///
918
/// \param __a
919
/// A 128-bit integer vector containing the minuends.
920
/// \param __b
921
/// A 128-bit integer vector containing the subtrahends.
922
/// \returns A 128-bit integer vector containing the differences of the values
923
/// in the operands.
924
static __inline__ __m128i __DEFAULT_FN_ATTRS
925
_mm_sub_epi64(__m128i __a, __m128i __b)
926
{
927
return (__m128i)((__v2du)__a - (__v2du)__b);
928
}
929
930
/// \brief Subtracts corresponding 8-bit signed integer values in the input and
931
/// returns the differences in the corresponding bytes in the destination.
932
/// Differences greater than 7Fh are saturated to 7Fh, and differences less
933
/// than 80h are saturated to 80h.
934
///
935
/// \headerfile <x86intrin.h>
936
///
937
/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
938
///
939
/// \param __a
940
/// A 128-bit integer vector containing the minuends.
941
/// \param __b
942
/// A 128-bit integer vector containing the subtrahends.
943
/// \returns A 128-bit integer vector containing the differences of the values
944
/// in the operands.
945
static __inline__ __m128i __DEFAULT_FN_ATTRS
946
_mm_subs_epi8(__m128i __a, __m128i __b)
947
{
948
return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
949
}
950
951
/// \brief Subtracts corresponding 16-bit signed integer values in the input and
952
/// returns the differences in the corresponding elements of the destination.
953
/// Differences greater than 7FFFh are saturated to 7FFFh, and values less
954
/// than 8000h are saturated to 8000h.
955
///
956
/// \headerfile <x86intrin.h>
957
///
958
/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
959
///
960
/// \param __a
961
/// A 128-bit integer vector containing the minuends.
962
/// \param __b
963
/// A 128-bit integer vector containing the subtrahends.
964
/// \returns A 128-bit integer vector containing the differences of the values
965
/// in the operands.
966
static __inline__ __m128i __DEFAULT_FN_ATTRS
967
_mm_subs_epi16(__m128i __a, __m128i __b)
968
{
969
return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
970
}
971
972
/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
973
/// and returns the differences in the corresponding bytes in the
974
/// destination. Differences less than 00h are saturated to 00h.
975
///
976
/// \headerfile <x86intrin.h>
977
///
978
/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
979
///
980
/// \param __a
981
/// A 128-bit integer vector containing the minuends.
982
/// \param __b
983
/// A 128-bit integer vector containing the subtrahends.
984
/// \returns A 128-bit integer vector containing the unsigned integer
985
/// differences of the values in the operands.
986
static __inline__ __m128i __DEFAULT_FN_ATTRS
987
_mm_subs_epu8(__m128i __a, __m128i __b)
988
{
989
return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
990
}
991
992
/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
993
/// and returns the differences in the corresponding elements of the
994
/// destination. Differences less than 0000h are saturated to 0000h.
995
///
996
/// \headerfile <x86intrin.h>
997
///
998
/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
999
///
1000
/// \param __a
1001
/// A 128-bit integer vector containing the minuends.
1002
/// \param __b
1003
/// A 128-bit integer vector containing the subtrahends.
1004
/// \returns A 128-bit integer vector containing the unsigned integer
1005
/// differences of the values in the operands.
1006
static __inline__ __m128i __DEFAULT_FN_ATTRS
1007
_mm_subs_epu16(__m128i __a, __m128i __b)
1008
{
1009
return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
1010
}
1011
1012
/// \brief Performs a bitwise AND of two 128-bit integer vectors.
1013
///
1014
/// \headerfile <x86intrin.h>
1015
///
1016
/// This intrinsic corresponds to the \c VPAND / PAND instruction.
1017
///
1018
/// \param __a
1019
/// A 128-bit integer vector containing one of the source operands.
1020
/// \param __b
1021
/// A 128-bit integer vector containing one of the source operands.
1022
/// \returns A 128-bit integer vector containing the bitwise AND of the values
1023
/// in both operands.
1024
static __inline__ __m128i __DEFAULT_FN_ATTRS
1025
_mm_and_si128(__m128i __a, __m128i __b)
1026
{
1027
return (__m128i)((__v2du)__a & (__v2du)__b);
1028
}
1029
1030
/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
1031
/// one's complement of the values contained in the first source operand.
1032
///
1033
/// \headerfile <x86intrin.h>
1034
///
1035
/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
1036
///
1037
/// \param __a
1038
/// A 128-bit vector containing the left source operand. The one's complement
1039
/// of this value is used in the bitwise AND.
1040
/// \param __b
1041
/// A 128-bit vector containing the right source operand.
1042
/// \returns A 128-bit integer vector containing the bitwise AND of the one's
1043
/// complement of the first operand and the values in the second operand.
1044
static __inline__ __m128i __DEFAULT_FN_ATTRS
1045
_mm_andnot_si128(__m128i __a, __m128i __b)
1046
{
1047
return (__m128i)(~(__v2du)__a & (__v2du)__b);
1048
}
1049
/// \brief Performs a bitwise OR of two 128-bit integer vectors.
1050
///
1051
/// \headerfile <x86intrin.h>
1052
///
1053
/// This intrinsic corresponds to the \c VPOR / POR instruction.
1054
///
1055
/// \param __a
1056
/// A 128-bit integer vector containing one of the source operands.
1057
/// \param __b
1058
/// A 128-bit integer vector containing one of the source operands.
1059
/// \returns A 128-bit integer vector containing the bitwise OR of the values
1060
/// in both operands.
1061
static __inline__ __m128i __DEFAULT_FN_ATTRS
1062
_mm_or_si128(__m128i __a, __m128i __b)
1063
{
1064
return (__m128i)((__v2du)__a | (__v2du)__b);
1065
}
1066
1067
/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
1068
///
1069
/// \headerfile <x86intrin.h>
1070
///
1071
/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
1072
///
1073
/// \param __a
1074
/// A 128-bit integer vector containing one of the source operands.
1075
/// \param __b
1076
/// A 128-bit integer vector containing one of the source operands.
1077
/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
1078
/// values in both operands.
1079
static __inline__ __m128i __DEFAULT_FN_ATTRS
1080
_mm_xor_si128(__m128i __a, __m128i __b)
1081
{
1082
return (__m128i)((__v2du)__a ^ (__v2du)__b);
1083
}
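/* Illustrative sketch (not part of the original header): the four bitwise
 * intrinsics above treat the 128 bits as raw data, so they work for any
 * element width; a typical use is keeping or clearing bits under a mask. The
 * helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_apply_mask(__m128i __v, __m128i __mask)
{
  __m128i __kept    = _mm_and_si128(__v, __mask);     /* bits of __v where the mask is 1 */
  __m128i __cleared = _mm_andnot_si128(__mask, __v);  /* bits of __v where the mask is 0 */
  return _mm_or_si128(__kept, __cleared);             /* recombines to __v itself */
}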
1084
1085
/// \brief Left-shifts the 128-bit integer vector operand by the specified
1086
/// number of bytes. Low-order bits are cleared.
1087
///
1088
/// \headerfile <x86intrin.h>
1089
///
1090
/// \code
1091
/// __m128i _mm_slli_si128(__m128i a, const int imm);
1092
/// \endcode
1093
///
1094
/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
1095
///
1096
/// \param a
1097
/// A 128-bit integer vector containing the source operand.
1098
/// \param imm
1099
/// An immediate value specifying the number of bytes to left-shift
1100
/// operand a.
1101
/// \returns A 128-bit integer vector containing the left-shifted value.
1102
#define _mm_slli_si128(a, imm) __extension__ ({ \
1103
(__m128i)__builtin_shufflevector( \
1104
(__v16qi)_mm_setzero_si128(), \
1105
(__v16qi)(__m128i)(a), \
1106
((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \
1107
((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \
1108
((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \
1109
((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \
1110
((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \
1111
((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \
1112
((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \
1113
((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \
1114
((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \
1115
((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \
1116
((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
1117
((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
1118
((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
1119
((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
1120
((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
1121
((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
1122
1123
#define _mm_bslli_si128(a, imm) \
1124
_mm_slli_si128((a), (imm))
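/* Illustrative sketch (not part of the original header): _mm_slli_si128
 * shifts the whole register left by a compile-time byte count; a shift by 4
 * moves element i of a [4 x i32] vector into element i+1 and zero-fills
 * element 0, and counts of 16 or more yield an all-zero vector. The helper
 * name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_shift_in_one_int(__m128i __v)
{
  return _mm_slli_si128(__v, 4);  /* bytes move toward the high end; the low 4 bytes become 0 */
}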
1125
1126
/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1127
/// by the specified number of bits. Low-order bits are cleared.
1128
///
1129
/// \headerfile <x86intrin.h>
1130
///
1131
/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1132
///
1133
/// \param __a
1134
/// A 128-bit integer vector containing the source operand.
1135
/// \param __count
1136
/// An integer value specifying the number of bits to left-shift each value
1137
/// in operand __a.
1138
/// \returns A 128-bit integer vector containing the left-shifted values.
1139
static __inline__ __m128i __DEFAULT_FN_ATTRS
1140
_mm_slli_epi16(__m128i __a, int __count)
1141
{
1142
return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
1143
}
1144
1145
/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1146
/// by the specified number of bits. Low-order bits are cleared.
1147
///
1148
/// \headerfile <x86intrin.h>
1149
///
1150
/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1151
///
1152
/// \param __a
1153
/// A 128-bit integer vector containing the source operand.
1154
/// \param __count
1155
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1156
/// to left-shift each value in operand __a.
1157
/// \returns A 128-bit integer vector containing the left-shifted values.
1158
static __inline__ __m128i __DEFAULT_FN_ATTRS
1159
_mm_sll_epi16(__m128i __a, __m128i __count)
1160
{
1161
return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
1162
}
1163
1164
/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1165
/// by the specified number of bits. Low-order bits are cleared.
1166
///
1167
/// \headerfile <x86intrin.h>
1168
///
1169
/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1170
///
1171
/// \param __a
1172
/// A 128-bit integer vector containing the source operand.
1173
/// \param __count
1174
/// An integer value specifying the number of bits to left-shift each value
1175
/// in operand __a.
1176
/// \returns A 128-bit integer vector containing the left-shifted values.
1177
static __inline__ __m128i __DEFAULT_FN_ATTRS
1178
_mm_slli_epi32(__m128i __a, int __count)
1179
{
1180
return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
1181
}
1182
1183
/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1184
/// by the specified number of bits. Low-order bits are cleared.
1185
///
1186
/// \headerfile <x86intrin.h>
1187
///
1188
/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1189
///
1190
/// \param __a
1191
/// A 128-bit integer vector containing the source operand.
1192
/// \param __count
1193
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1194
/// to left-shift each value in operand __a.
1195
/// \returns A 128-bit integer vector containing the left-shifted values.
1196
static __inline__ __m128i __DEFAULT_FN_ATTRS
1197
_mm_sll_epi32(__m128i __a, __m128i __count)
1198
{
1199
return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
1200
}
1201
1202
/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1203
/// by the specified number of bits. Low-order bits are cleared.
1204
///
1205
/// \headerfile <x86intrin.h>
1206
///
1207
/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1208
///
1209
/// \param __a
1210
/// A 128-bit integer vector containing the source operand.
1211
/// \param __count
1212
/// An integer value specifying the number of bits to left-shift each value
1213
/// in operand __a.
1214
/// \returns A 128-bit integer vector containing the left-shifted values.
1215
static __inline__ __m128i __DEFAULT_FN_ATTRS
1216
_mm_slli_epi64(__m128i __a, int __count)
1217
{
1218
return __builtin_ia32_psllqi128((__v2di)__a, __count);
1219
}
1220
1221
/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1222
/// by the specified number of bits. Low-order bits are cleared.
1223
///
1224
/// \headerfile <x86intrin.h>
1225
///
1226
/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1227
///
1228
/// \param __a
1229
/// A 128-bit integer vector containing the source operand.
1230
/// \param __count
1231
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1232
/// to left-shift each value in operand __a.
1233
/// \returns A 128-bit integer vector containing the left-shifted values.
1234
static __inline__ __m128i __DEFAULT_FN_ATTRS
1235
_mm_sll_epi64(__m128i __a, __m128i __count)
1236
{
1237
return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
1238
}
1239
1240
/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1241
/// by the specified number of bits. High-order bits are filled with the sign
1242
/// bit of the initial value.
1243
///
1244
/// \headerfile <x86intrin.h>
1245
///
1246
/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1247
///
1248
/// \param __a
1249
/// A 128-bit integer vector containing the source operand.
1250
/// \param __count
1251
/// An integer value specifying the number of bits to right-shift each value
1252
/// in operand __a.
1253
/// \returns A 128-bit integer vector containing the right-shifted values.
1254
static __inline__ __m128i __DEFAULT_FN_ATTRS
1255
_mm_srai_epi16(__m128i __a, int __count)
1256
{
1257
return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
1258
}
1259
1260
/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1261
/// by the specified number of bits. High-order bits are filled with the sign
1262
/// bit of the initial value.
1263
///
1264
/// \headerfile <x86intrin.h>
1265
///
1266
/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1267
///
1268
/// \param __a
1269
/// A 128-bit integer vector containing the source operand.
1270
/// \param __count
1271
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1272
/// to right-shift each value in operand __a.
1273
/// \returns A 128-bit integer vector containing the right-shifted values.
1274
static __inline__ __m128i __DEFAULT_FN_ATTRS
1275
_mm_sra_epi16(__m128i __a, __m128i __count)
1276
{
1277
return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
1278
}
1279
1280
/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1281
/// by the specified number of bits. High-order bits are filled with the sign
1282
/// bit of the initial value.
1283
///
1284
/// \headerfile <x86intrin.h>
1285
///
1286
/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1287
///
1288
/// \param __a
1289
/// A 128-bit integer vector containing the source operand.
1290
/// \param __count
1291
/// An integer value specifying the number of bits to right-shift each value
1292
/// in operand __a.
1293
/// \returns A 128-bit integer vector containing the right-shifted values.
1294
static __inline__ __m128i __DEFAULT_FN_ATTRS
1295
_mm_srai_epi32(__m128i __a, int __count)
1296
{
1297
return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
1298
}
1299
1300
/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1301
/// by the specified number of bits. High-order bits are filled with the sign
1302
/// bit of the initial value.
1303
///
1304
/// \headerfile <x86intrin.h>
1305
///
1306
/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1307
///
1308
/// \param __a
1309
/// A 128-bit integer vector containing the source operand.
1310
/// \param __count
1311
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1312
/// to right-shift each value in operand __a.
1313
/// \returns A 128-bit integer vector containing the right-shifted values.
1314
static __inline__ __m128i __DEFAULT_FN_ATTRS
1315
_mm_sra_epi32(__m128i __a, __m128i __count)
1316
{
1317
return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
1318
}
1319
1320
/// \brief Right-shifts the 128-bit integer vector operand by the specified
1321
/// number of bytes. High-order bits are cleared.
1322
///
1323
/// \headerfile <x86intrin.h>
1324
///
1325
/// \code
1326
/// __m128i _mm_srli_si128(__m128i a, const int imm);
1327
/// \endcode
1328
///
1329
/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
1330
///
1331
/// \param a
1332
/// A 128-bit integer vector containing the source operand.
1333
/// \param imm
1334
/// An immediate value specifying the number of bytes to right-shift operand
1335
/// a.
1336
/// \returns A 128-bit integer vector containing the right-shifted value.
1337
#define _mm_srli_si128(a, imm) __extension__ ({ \
1338
(__m128i)__builtin_shufflevector( \
1339
(__v16qi)(__m128i)(a), \
1340
(__v16qi)_mm_setzero_si128(), \
1341
((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \
1342
((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \
1343
((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \
1344
((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \
1345
((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \
1346
((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \
1347
((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \
1348
((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \
1349
((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \
1350
((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \
1351
((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
1352
((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
1353
((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
1354
((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
1355
((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
1356
((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
1357
1358
#define _mm_bsrli_si128(a, imm) \
1359
_mm_srli_si128((a), (imm))
1360
1361
/// \brief Right-shifts each of the 16-bit values in the 128-bit integer vector
1362
/// operand by the specified number of bits. High-order bits are cleared.
1363
///
1364
/// \headerfile <x86intrin.h>
1365
///
1366
/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1367
///
1368
/// \param __a
1369
/// A 128-bit integer vector containing the source operand.
1370
/// \param __count
1371
/// An integer value specifying the number of bits to right-shift each value
1372
/// in operand __a.
1373
/// \returns A 128-bit integer vector containing the right-shifted values.
1374
static __inline__ __m128i __DEFAULT_FN_ATTRS
1375
_mm_srli_epi16(__m128i __a, int __count)
1376
{
1377
return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
1378
}
1379
1380
/// \brief Right-shifts each of the 16-bit values in the 128-bit integer vector
1381
/// operand by the specified number of bits. High-order bits are cleared.
1382
///
1383
/// \headerfile <x86intrin.h>
1384
///
1385
/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1386
///
1387
/// \param __a
1388
/// A 128-bit integer vector containing the source operand.
1389
/// \param __count
1390
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1391
/// to right-shift each value in operand __a.
1392
/// \returns A 128-bit integer vector containing the right-shifted values.
1393
static __inline__ __m128i __DEFAULT_FN_ATTRS
1394
_mm_srl_epi16(__m128i __a, __m128i __count)
1395
{
1396
return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
1397
}
1398
1399
/// \brief Right-shifts each of the 32-bit values in the 128-bit integer vector
1400
/// operand by the specified number of bits. High-order bits are cleared.
1401
///
1402
/// \headerfile <x86intrin.h>
1403
///
1404
/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1405
///
1406
/// \param __a
1407
/// A 128-bit integer vector containing the source operand.
1408
/// \param __count
1409
/// An integer value specifying the number of bits to right-shift each value
1410
/// in operand __a.
1411
/// \returns A 128-bit integer vector containing the right-shifted values.
1412
static __inline__ __m128i __DEFAULT_FN_ATTRS
1413
_mm_srli_epi32(__m128i __a, int __count)
1414
{
1415
return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
1416
}
1417
1418
/// \brief Right-shifts each of the 32-bit values in the 128-bit integer vector
1419
/// operand by the specified number of bits. High-order bits are cleared.
1420
///
1421
/// \headerfile <x86intrin.h>
1422
///
1423
/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1424
///
1425
/// \param __a
1426
/// A 128-bit integer vector containing the source operand.
1427
/// \param __count
1428
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1429
/// to right-shift each value in operand __a.
1430
/// \returns A 128-bit integer vector containing the right-shifted values.
1431
static __inline__ __m128i __DEFAULT_FN_ATTRS
1432
_mm_srl_epi32(__m128i __a, __m128i __count)
1433
{
1434
return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
1435
}
1436
1437
/// \brief Right-shifts each of the 64-bit values in the 128-bit integer vector
1438
/// operand by the specified number of bits. High-order bits are cleared.
1439
///
1440
/// \headerfile <x86intrin.h>
1441
///
1442
/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1443
///
1444
/// \param __a
1445
/// A 128-bit integer vector containing the source operand.
1446
/// \param __count
1447
/// An integer value specifying the number of bits to right-shift each value
1448
/// in operand __a.
1449
/// \returns A 128-bit integer vector containing the right-shifted values.
1450
static __inline__ __m128i __DEFAULT_FN_ATTRS
1451
_mm_srli_epi64(__m128i __a, int __count)
1452
{
1453
return __builtin_ia32_psrlqi128((__v2di)__a, __count);
1454
}
1455
1456
/// \brief Right-shifts each of the 64-bit values in the 128-bit integer vector
1457
/// operand by the specified number of bits. High-order bits are cleared.
1458
///
1459
/// \headerfile <x86intrin.h>
1460
///
1461
/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1462
///
1463
/// \param __a
1464
/// A 128-bit integer vector containing the source operand.
1465
/// \param __count
1466
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
1467
/// to right-shift each value in operand __a.
1468
/// \returns A 128-bit integer vector containing the right-shifted values.
1469
static __inline__ __m128i __DEFAULT_FN_ATTRS
1470
_mm_srl_epi64(__m128i __a, __m128i __count)
1471
{
1472
return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
1473
}
1474
1475
/// \brief Compares each of the corresponding 8-bit values of the 128-bit
1476
/// integer vectors for equality. Each comparison yields 0h for false, FFh
1477
/// for true.
1478
///
1479
/// \headerfile <x86intrin.h>
1480
///
1481
/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
1482
///
1483
/// \param __a
1484
/// A 128-bit integer vector.
1485
/// \param __b
1486
/// A 128-bit integer vector.
1487
/// \returns A 128-bit integer vector containing the comparison results.
1488
static __inline__ __m128i __DEFAULT_FN_ATTRS
1489
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
1490
{
1491
return (__m128i)((__v16qi)__a == (__v16qi)__b);
1492
}
1493
1494
/// \brief Compares each of the corresponding 16-bit values of the 128-bit
1495
/// integer vectors for equality. Each comparison yields 0h for false, FFFFh
1496
/// for true.
1497
///
1498
/// \headerfile <x86intrin.h>
1499
///
1500
/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
1501
///
1502
/// \param __a
1503
/// A 128-bit integer vector.
1504
/// \param __b
1505
/// A 128-bit integer vector.
1506
/// \returns A 128-bit integer vector containing the comparison results.
1507
static __inline__ __m128i __DEFAULT_FN_ATTRS
1508
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
1509
{
1510
return (__m128i)((__v8hi)__a == (__v8hi)__b);
1511
}
1512
1513
/// \brief Compares each of the corresponding 32-bit values of the 128-bit
1514
/// integer vectors for equality. Each comparison yields 0h for false,
1515
/// FFFFFFFFh for true.
1516
///
1517
/// \headerfile <x86intrin.h>
1518
///
1519
/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
1520
///
1521
/// \param __a
1522
/// A 128-bit integer vector.
1523
/// \param __b
1524
/// A 128-bit integer vector.
1525
/// \returns A 128-bit integer vector containing the comparison results.
1526
static __inline__ __m128i __DEFAULT_FN_ATTRS
1527
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
1528
{
1529
return (__m128i)((__v4si)__a == (__v4si)__b);
1530
}
1531
1532
/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1533
/// integer vectors to determine if the values in the first operand are
1534
/// greater than those in the second operand. Each comparison yields 0h for
1535
/// false, FFh for true.
1536
///
1537
/// \headerfile <x86intrin.h>
1538
///
1539
/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1540
///
1541
/// \param __a
1542
/// A 128-bit integer vector.
1543
/// \param __b
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  /* This function always performs a signed comparison, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
}

/// \brief Compares each of the corresponding signed 16-bit values of the
///    128-bit integer vectors to determine if the values in the first operand
///    are greater than those in the second operand. Each comparison yields 0h
///    for false, FFFFh for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \param __b
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
}

/// \brief Compares each of the corresponding signed 32-bit values of the
///    128-bit integer vectors to determine if the values in the first operand
///    are greater than those in the second operand. Each comparison yields 0h
///    for false, FFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \param __b
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a > (__v4si)__b);
}

/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
///    integer vectors to determine if the values in the first operand are less
///    than those in the second operand. Each comparison yields 0h for false,
///    FFh for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \param __b
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi8(__b, __a);
}

/// \brief Compares each of the corresponding signed 16-bit values of the
///    128-bit integer vectors to determine if the values in the first operand
///    are less than those in the second operand. Each comparison yields 0h for
///    false, FFFFh for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \param __b
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi16(__b, __a);
}

/// \brief Compares each of the corresponding signed 32-bit values of the
///    128-bit integer vectors to determine if the values in the first operand
///    are less than those in the second operand. Each comparison yields 0h for
///    false, FFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \param __b
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi32(__b, __a);
}
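
/* Example (illustrative, not part of the SSE2 API): the comparison intrinsics
 * above produce all-ones lanes where the predicate holds, so they combine
 * naturally with the bitwise operations defined earlier in this header to
 * build a branchless per-lane select. The names mask and max16 are arbitrary.
 *
 *   // max16 = per-lane maximum of two vectors of signed 16-bit integers
 *   __m128i mask  = _mm_cmpgt_epi16(a, b);              // 0xFFFF where a > b
 *   __m128i max16 = _mm_or_si128(_mm_and_si128(mask, a),
 *                                _mm_andnot_si128(mask, b));
 */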
#ifdef __x86_64__
/// \brief Converts a 64-bit signed integer value from the second operand into a
///    double-precision value and returns it in the lower element of a [2 x
///    double] vector; the upper element of the returned vector is copied from
///    the upper element of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
///    copied to the upper 64 bits of the destination.
/// \param __b
///    A 64-bit signed integer operand containing the value to be converted.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

/// \brief Converts the first (lower) element of a vector of [2 x double] into a
///    64-bit signed integer value, according to the current rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
///    conversion.
/// \returns A 64-bit signed integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a)
{
  return __builtin_ia32_cvtsd2si64((__v2df)__a);
}

/// \brief Converts the first (lower) element of a vector of [2 x double] into a
///    64-bit signed integer value, truncating the result when it is inexact.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
///    conversion.
/// \returns A 64-bit signed integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a)
{
  return __a[0];
}
#endif

/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \returns A 128-bit vector of [4 x float] containing the converted values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)__a);
}

/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit integer vector of [4 x i32] containing the converted
///    values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a)
{
  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
}

/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
///    truncating the result when it is inexact.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x i32] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a)
{
  return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si);
}
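
/* Example (illustrative): _mm_cvtps_epi32 rounds according to the current
 * MXCSR rounding mode (round-to-nearest-even by default), while
 * _mm_cvttps_epi32 always truncates toward zero. The variable names are
 * arbitrary.
 *
 *   __m128  f       = _mm_set_ps(2.5f, -1.5f, 1.5f, 0.5f); // lanes 0..3 = 0.5, 1.5, -1.5, 2.5
 *   __m128i rounded = _mm_cvtps_epi32(f);  // lanes 0..3 = 0, 2, -2, 2 under the default mode
 *   __m128i trunced = _mm_cvttps_epi32(f); // lanes 0..3 = 0, 1, -1, 2
 */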
/// \brief Returns a vector of [4 x i32] where the lowest element is the input
///    operand and the remaining elements are zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
///
/// \param __a
///    A 32-bit signed integer operand.
/// \returns A 128-bit vector of [4 x i32].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a)
{
  return (__m128i)(__v4si){ __a, 0, 0, 0 };
}

#ifdef __x86_64__
/// \brief Returns a vector of [2 x i64] where the lower element is the input
///    operand and the upper element is zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
///
/// \param __a
///    A 64-bit signed integer operand containing the value to be converted.
/// \returns A 128-bit vector of [2 x i64] containing the converted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)
{
  return (__m128i){ __a, 0 };
}
#endif

/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
///    32-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
///
/// \param __a
///    A vector of [4 x i32]. The least significant 32 bits are moved to the
///    destination.
/// \returns A 32-bit signed integer containing the moved value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a)
{
  __v4si __b = (__v4si)__a;
  return __b[0];
}

#ifdef __x86_64__
/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
///    64-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
///
/// \param __a
///    A vector of [2 x i64]. The least significant 64 bits are moved to the
///    destination.
/// \returns A 64-bit signed integer containing the moved value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a)
{
  return __a[0];
}
#endif

/// \brief Moves packed integer values from an aligned 128-bit memory location
///    to elements in a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
///
/// \param __p
///    An aligned pointer to a memory location containing integer values.
/// \returns A 128-bit integer vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const *__p)
{
  return *__p;
}

/// \brief Moves packed integer values from an unaligned 128-bit memory location
///    to elements in a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
///
/// \param __p
///    A pointer to a memory location containing integer values.
/// \returns A 128-bit integer vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i const *__p)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si128*)__p)->__v;
}
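
/* Example (illustrative): _mm_load_si128 requires a 16-byte-aligned address,
 * while _mm_loadu_si128 accepts any address, so reading from an arbitrary byte
 * offset inside a buffer needs the unaligned form. The names buf and v are
 * arbitrary.
 *
 *   unsigned char buf[64];
 *   ...
 *   __m128i v = _mm_loadu_si128((const __m128i *)(buf + 3)); // any alignment
 */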
/// \brief Returns a vector of [2 x i64] where the lower element is loaded from
///    the 64-bit memory location pointed to by the operand, and the upper
///    element is zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location. The loaded value is written to
///    bits [63:0] of the destination.
/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
///    moved value. The higher order bits are cleared.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i const *__p)
{
  struct __mm_loadl_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
}

/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
///    This could be used as an argument to another intrinsic function where the
///    argument is required but the value is not actually used.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x i32] with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_undefined_si128(void)
{
  return (__m128i)__builtin_ia32_undef128();
}
/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
///    the specified 64-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __q1
///    A 64-bit integer value used to initialize the upper 64 bits of the
///    destination vector of [2 x i64].
/// \param __q0
///    A 64-bit integer value used to initialize the lower 64 bits of the
///    destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long __q1, long long __q0)
{
  return (__m128i){ __q0, __q1 };
}

/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
///    the specified 64-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __q1
///    A 64-bit integer value used to initialize the upper 64 bits of the
///    destination vector of [2 x i64].
/// \param __q0
///    A 64-bit integer value used to initialize the lower 64 bits of the
///    destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 __q1, __m64 __q0)
{
  return (__m128i){ (long long)__q0, (long long)__q1 };
}

/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
///    the specified 32-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __i3
///    A 32-bit integer value used to initialize bits [127:96] of the
///    destination vector.
/// \param __i2
///    A 32-bit integer value used to initialize bits [95:64] of the destination
///    vector.
/// \param __i1
///    A 32-bit integer value used to initialize bits [63:32] of the destination
///    vector.
/// \param __i0
///    A 32-bit integer value used to initialize bits [31:0] of the destination
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
{
  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
}

/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
///    the specified 16-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __w7
///    A 16-bit integer value used to initialize bits [127:112] of the
///    destination vector.
/// \param __w6
///    A 16-bit integer value used to initialize bits [111:96] of the
///    destination vector.
/// \param __w5
///    A 16-bit integer value used to initialize bits [95:80] of the destination
///    vector.
/// \param __w4
///    A 16-bit integer value used to initialize bits [79:64] of the destination
///    vector.
/// \param __w3
///    A 16-bit integer value used to initialize bits [63:48] of the destination
///    vector.
/// \param __w2
///    A 16-bit integer value used to initialize bits [47:32] of the destination
///    vector.
/// \param __w1
///    A 16-bit integer value used to initialize bits [31:16] of the destination
///    vector.
/// \param __w0
///    A 16-bit integer value used to initialize bits [15:0] of the destination
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
{
  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
}

/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
///    the specified 8-bit integer values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __b15
///    Initializes bits [127:120] of the destination vector.
/// \param __b14
///    Initializes bits [119:112] of the destination vector.
/// \param __b13
///    Initializes bits [111:104] of the destination vector.
/// \param __b12
///    Initializes bits [103:96] of the destination vector.
/// \param __b11
///    Initializes bits [95:88] of the destination vector.
/// \param __b10
///    Initializes bits [87:80] of the destination vector.
/// \param __b9
///    Initializes bits [79:72] of the destination vector.
/// \param __b8
///    Initializes bits [71:64] of the destination vector.
/// \param __b7
///    Initializes bits [63:56] of the destination vector.
/// \param __b6
///    Initializes bits [55:48] of the destination vector.
/// \param __b5
///    Initializes bits [47:40] of the destination vector.
/// \param __b4
///    Initializes bits [39:32] of the destination vector.
/// \param __b3
///    Initializes bits [31:24] of the destination vector.
/// \param __b2
///    Initializes bits [23:16] of the destination vector.
/// \param __b1
///    Initializes bits [15:8] of the destination vector.
/// \param __b0
///    Initializes bits [7:0] of the destination vector.
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
{
  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
}
/// \brief Initializes both values in a 128-bit integer vector with the
///    specified 64-bit integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __q
///    Integer value used to initialize the elements of the destination integer
///    vector.
/// \returns An initialized 128-bit integer vector of [2 x i64] with both
///    elements containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q)
{
  return (__m128i){ __q, __q };
}

/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
///    specified 64-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __q
///    A 64-bit value used to initialize the elements of the destination integer
///    vector.
/// \returns An initialized 128-bit vector of [2 x i64] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q)
{
  return (__m128i){ (long long)__q, (long long)__q };
}

/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
///    specified 32-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __i
///    A 32-bit value used to initialize the elements of the destination integer
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i)
{
  return (__m128i)(__v4si){ __i, __i, __i, __i };
}

/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
///    specified 16-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __w
///    A 16-bit value used to initialize the elements of the destination integer
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w)
{
  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
}

/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
///    specified 8-bit value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __b
///    An 8-bit value used to initialize the elements of the destination integer
///    vector.
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b)
{
  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
}
/* The _mm_setr_* functions behave like the corresponding _mm_set_* functions
   above, but take their arguments in reverse order, i.e. from the lowest
   element to the highest. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0, __m64 __q1)
{
  return (__m128i){ (long long)__q0, (long long)__q1 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
{
  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
{
  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
{
  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
}
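/* Example (illustrative): _mm_set_epi32 lists elements from the highest lane
 * down to the lowest, while _mm_setr_epi32 lists them from the lowest lane up.
 * Both calls below build the same vector; the names v1 and v2 are arbitrary.
 *
 *   __m128i v1 = _mm_set_epi32(3, 2, 1, 0);   // lane 0 = 0, lane 3 = 3
 *   __m128i v2 = _mm_setr_epi32(0, 1, 2, 3);  // identical contents
 */
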
/* Returns a 128-bit integer vector with all elements set to zero. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

/* Stores a 128-bit integer vector to a 16-byte-aligned memory location. */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b)
{
  *__p = __b;
}

/* Stores a 128-bit integer vector to an unaligned memory location. */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
  struct __storeu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si128*)__p)->__v = __b;
}

/* Conditionally stores the bytes of __d whose corresponding mask byte in __n
   has its most significant bit set to the unaligned location __p. */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
}

/* Stores the lower 64 bits of a 128-bit integer vector to memory. */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  struct __mm_storel_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}
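
/* Example (illustrative): incrementing every 32-bit lane of a buffer in place
 * with an unaligned load/modify/store round trip. The names p and v are
 * arbitrary.
 *
 *   __m128i v = _mm_loadu_si128((const __m128i *)p);
 *   v = _mm_add_epi32(v, _mm_set1_epi32(1));     // increment each 32-bit lane
 *   _mm_storeu_si128((__m128i *)p, v);
 */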
/* Stores a 128-bit vector of [2 x double] to a 16-byte-aligned memory location
   with a non-temporal hint, minimizing cache pollution. */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}

/* Stores a 128-bit integer vector to a 16-byte-aligned memory location with a
   non-temporal hint, minimizing cache pollution. */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}

/* Stores a 32-bit integer value with a non-temporal hint (MOVNTI). */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}

#ifdef __x86_64__
/* Stores a 64-bit integer value with a non-temporal hint (MOVNTI). */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
#endif
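
/* Example (illustrative): non-temporal stores are typically used when filling
 * a large buffer that will not be read again soon; a store fence (_mm_sfence,
 * from xmmintrin.h) is normally issued afterwards to make the streamed data
 * globally visible before it is consumed. The names dst and n are arbitrary,
 * and dst is assumed to be a 16-byte-aligned int pointer.
 *
 *   for (size_t i = 0; i + 4 <= n; i += 4)
 *     _mm_stream_si128((__m128i *)(dst + i), _mm_set1_epi32(0));
 *   _mm_sfence();
 */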
/* Flushes the cache line containing the address __p from every level of the
   cache hierarchy (CLFLUSH). */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflush(void const *__p)
{
  __builtin_ia32_clflush(__p);
}

/* Serializes all preceding load instructions (LFENCE). */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

/* Serializes all preceding load and store instructions (MFENCE). */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
/* Converts the 16-bit signed integers in __a and __b to 8-bit signed integers
   with saturation and packs them into one 128-bit result (PACKSSWB). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}

/* Converts the 32-bit signed integers in __a and __b to 16-bit signed integers
   with saturation and packs them into one 128-bit result (PACKSSDW). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}

/* Converts the 16-bit signed integers in __a and __b to 8-bit unsigned
   integers with saturation and packs them into one 128-bit result (PACKUSWB). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
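
/* Example (illustrative): narrowing sixteen 16-bit values to unsigned bytes
 * with saturation; values below 0 clamp to 0 and values above 255 clamp to
 * 255. The names lo, hi and bytes are arbitrary.
 *
 *   __m128i lo    = _mm_set1_epi16(300);     // each lane saturates to 255
 *   __m128i hi    = _mm_set1_epi16(-5);      // each lane saturates to 0
 *   __m128i bytes = _mm_packus_epi16(lo, hi);
 */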
/* Extracts the 16-bit element of __a selected by the low three bits of __imm
   and zero-extends it to an int. */
static __inline__ int __DEFAULT_FN_ATTRS
_mm_extract_epi16(__m128i __a, int __imm)
{
  __v8hi __b = (__v8hi)__a;
  return (unsigned short)__b[__imm & 7];
}

/* Returns a copy of __a in which the 16-bit element selected by the low three
   bits of __imm is replaced by the low 16 bits of __b. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
  __v8hi __c = (__v8hi)__a;
  __c[__imm & 7] = __b;
  return (__m128i)__c;
}
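
/* Example (illustrative): reading and rewriting a single 16-bit lane. The
 * names v, x and w are arbitrary.
 *
 *   __m128i v = _mm_set1_epi16(7);
 *   int     x = _mm_extract_epi16(v, 3);      // x == 7
 *   __m128i w = _mm_insert_epi16(v, 42, 3);   // lane 3 becomes 42
 */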
/* Creates a 16-bit mask from the most significant bit of each byte of __a
   (PMOVMSKB). */
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)__a);
}
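
/* Example (illustrative): testing whether any byte of a 16-byte block equals a
 * given value, a common building block for memchr/strchr-style scans. The
 * names p, needle, eq and mask are arbitrary.
 *
 *   __m128i block = _mm_loadu_si128((const __m128i *)p);
 *   __m128i eq    = _mm_cmpeq_epi8(block, _mm_set1_epi8(needle));
 *   int     mask  = _mm_movemask_epi8(eq);    // bit i set if byte i matched
 *   if (mask) { int first = __builtin_ctz(mask); ... }
 */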
/* Selects four 32-bit elements from a, in the order encoded by imm (PSHUFD). */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })

/* Shuffles the four low 16-bit elements of a as encoded by imm; the high four
   elements are copied unchanged (PSHUFLW). */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })

/* Shuffles the four high 16-bit elements of a as encoded by imm; the low four
   elements are copied unchanged (PSHUFHW). */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
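
/* Example (illustrative): broadcasting lane 0 of a vector of [4 x i32] to all
 * four lanes with _mm_shuffle_epi32; the immediate 0x00 selects element 0 for
 * every output position. The names v and splat are arbitrary.
 *
 *   __m128i v     = _mm_setr_epi32(9, 1, 2, 3);
 *   __m128i splat = _mm_shuffle_epi32(v, 0x00);   // {9, 9, 9, 9}
 */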
/* Interleaves the high eight bytes of __a and __b (PUNPCKHBW). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

/* Interleaves the high four 16-bit elements of __a and __b (PUNPCKHWD). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

/* Interleaves the high two 32-bit elements of __a and __b (PUNPCKHDQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}

/* Interleaves the high 64-bit elements of __a and __b (PUNPCKHQDQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
}

/* Interleaves the low eight bytes of __a and __b (PUNPCKLBW). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

/* Interleaves the low four 16-bit elements of __a and __b (PUNPCKLWD). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

/* Interleaves the low two 32-bit elements of __a and __b (PUNPCKLDQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}

/* Interleaves the low 64-bit elements of __a and __b (PUNPCKLQDQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
}
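
/* Example (illustrative): zero-extending the low eight unsigned bytes of a
 * vector to 16-bit lanes by interleaving them with a zero vector. The names
 * bytes and words are arbitrary.
 *
 *   __m128i zero  = _mm_setzero_si128();
 *   __m128i words = _mm_unpacklo_epi8(bytes, zero);  // u8 -> u16, lanes 0..7
 */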
/* Returns the lower 64 bits of __a as a 64-bit MMX value. */
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)
{
  return (__m64)__a[0];
}

/* Returns a vector of [2 x i64] whose lower half is the 64-bit MMX operand and
   whose upper half is zero. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)
{
  return (__m128i){ (long long)__a, 0 };
}

/* Copies the lower 64 bits of __a to the result and clears the upper 64 bits
   (MOVQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
}

/* Interleaves the upper doubles of __a and __b (UNPCKHPD). */
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}

/* Interleaves the lower doubles of __a and __b (UNPCKLPD). */
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
}

/* Creates a 2-bit mask from the sign bits of the two doubles in __a
   (MOVMSKPD). */
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)
{
  return __builtin_ia32_movmskpd((__v2df)__a);
}

/* Selects the low double of the result from a and the high double from b, as
   encoded by the two low bits of i (SHUFPD). */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                   0 + (((i) >> 0) & 0x1), \
                                   2 + (((i) >> 1) & 0x1)); })
/* The _mm_cast* functions reinterpret the bits of one 128-bit vector type as
   another; they do not convert values and generate no instructions. */
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)
{
  return (__m128)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)
{
  return (__m128i)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)
{
  return (__m128d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)
{
  return (__m128i)__a;
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)
{
  return (__m128)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)
{
  return (__m128d)__a;
}

/* Hints to the processor that the code is in a spin-wait loop (PAUSE). */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_pause(void)
{
  __builtin_ia32_pause();
}

#undef __DEFAULT_FN_ATTRS

/* Builds the immediate for _mm_shuffle_pd: x selects the element taken from
   the second operand (placed in the high half of the result), y the element
   taken from the first operand (placed in the low half). */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
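/* Example (illustrative): swapping the two doubles of a vector with
 * _mm_shuffle_pd and _MM_SHUFFLE2. The names v and swapped are arbitrary.
 *
 *   __m128d v       = _mm_set_pd(2.0, 1.0);                     // lane 0 = 1.0, lane 1 = 2.0
 *   __m128d swapped = _mm_shuffle_pd(v, v, _MM_SHUFFLE2(0, 1)); // lane 0 = 2.0, lane 1 = 1.0
 */
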
#endif /* __EMMINTRIN_H */