#ifndef SSE2NEON_H
#define SSE2NEON_H

// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/Aarch64 NEON versions
//
// Contributors to this work are:
// John W. Ratcliff <[email protected]>
// Brandon Rowlett <[email protected]>
// Ken Fast <[email protected]>
// Eric van Beurden <[email protected]>
// Alexander Potylitsin <[email protected]>
// Hasindu Gamaarachchi <[email protected]>
// Jim Huang <[email protected]>
// Mark Cheng <[email protected]>
// Malcolm James MacLeod <[email protected]>
// Devin Hussey (easyaspi314) <[email protected]>
// Sebastian Pop <[email protected]>
// Developer Ecosystem Engineering <[email protected]>
// Danila Kutenin <[email protected]>
// François Turban (JishinMaster) <[email protected]>
// Pei-Hsuan Hung <[email protected]>
// Yang-Hao Yuan <[email protected]>
// Syoyo Fujita <[email protected]>
// Brecht Van Lommel <[email protected]>
// Jonathan Hue <[email protected]>
// Cuda Chen <[email protected]>
// Aymen Qader <[email protected]>

/*
 * sse2neon is freely redistributable under the MIT License.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Tunable configurations */

/* Enable precise implementation of math operations
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE. (e.g. it can resolve a hole or NaN pixel in a rendering result)
 */
/* _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif
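
/* Example (illustrative): because the macros above are guarded with #ifndef,
 * a consumer can opt in to the precise code paths by defining them before
 * including this header, e.g.
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #define SSE2NEON_PRECISE_SQRT 1
 *   #include "sse2neon.h"
 *
 * Leaving them at 0 keeps the faster, less strict NEON code paths.
 */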

/* compiler specific definitions */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#else /* non-GNU / non-clang compilers */
#warning "Macro name collisions may happen with unsupported compiler."
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#endif

/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <stdint.h>
#include <stdlib.h>

#if defined(_WIN32) && !defined(__MINGW32__)
/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
 * from both MinGW-w64 and MSVC.
 */
#define SSE2NEON_ALLOC_DEFINED
#endif

/* If using MSVC */
#ifdef _MSC_VER
#include <intrin.h>
#if (defined(_M_AMD64) || defined(__x86_64__)) || \
    (defined(_M_ARM) || defined(__arm__))
#define SSE2NEON_HAS_BITSCAN64
#endif
#endif

/* Compiler barrier */
#define SSE2NEON_BARRIER()                     \
    do {                                       \
        __asm__ __volatile__("" ::: "memory"); \
        (void) 0;                              \
    } while (0)

/* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
 * semantics.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif

FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
    /* FIXME: MSVC support */
#endif
}

/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification,
 * __ARM_NEON is defined to a value indicating whether the Advanced SIMD (NEON)
 * architecture is supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__)
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif

#include <arm_neon.h>
#if !defined(__aarch64__) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif

/* Apple Silicon cache lines are double the size of what is commonly used by
 * Intel, AMD and other Arm microarchitectures.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif

/* Rounding functions require either Aarch64 instructions or a libm fallback */
#if !defined(__aarch64__)
#include <math.h>
#endif

/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or even not accessible in user mode.
 * To write to or read these registers from user mode,
 * we have to perform a syscall instead.
 */
#if !defined(__aarch64__)
#include <sys/time.h>
#endif

/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1

// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif

#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif

/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of result.
 * fp0 is the same for fp0 of result.
 */
#if defined(__aarch64__)
#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*\
4)+16+3) } )
#endif

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
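
// Example: _MM_SHUFFLE packs four 2-bit lane selectors into one 8-bit
// immediate, so _MM_SHUFFLE(3, 2, 1, 0) == 0xE4. Passing it to a shuffle such
// as _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)) selects lanes 0 and 1 from
// a and lanes 2 and 3 from b, which matches the x86 SSE encoding.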

#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
    __builtin_shufflevector(a, b, __VA_ARGS__)
#elif __has_builtin(__builtin_shuffle)
#define _sse2neon_shuffle(type, a, b, ...) \
    __extension__({                        \
        type tmp = {__VA_ARGS__};          \
        __builtin_shuffle(a, b, tmp);      \
    })
#endif

#ifdef _sse2neon_shuffle
#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
#endif
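
/* Usage sketch (only when _sse2neon_shuffle is available): the variadic
 * indices follow the __builtin_shufflevector convention, where 0..N-1 pick
 * lanes of the first operand and N..2N-1 pick lanes of the second, e.g.
 *
 *   int32x4_t lo_pairs = vshuffleq_s32(a, b, 0, 1, 4, 5);
 *
 * gathers the two low lanes of a followed by the two low lanes of b
 * (a and b here are assumed to be int32x4_t values).
 */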

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000

/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const

/* A few intrinsics accept traditional data types like ints or floats, but
 * most operate on data types that are specific to SSE.
 * If a vector type ends in d, it contains doubles, and if it does not have
 * a suffix, it contains floats. An integer vector type can contain any type
 * of integer, from chars to shorts to unsigned long longs.
 */
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On 32-bit ARM, float64x2_t is not supported.
// The data type __m128d therefore has to be represented differently for the
// related intrinsic conversions.
#if defined(__aarch64__)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */

// __int64 is defined in the Intrinsics Guide and maps to a different data type
// in different data models
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
#if (defined(__x86_64__) || defined(__i386__))
#define __int64 long long
#else
#define __int64 int64_t
#endif
#endif

/* type-safe casting between types */

#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)

#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)

#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)

#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)

#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)

#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)

#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)

#if defined(__aarch64__)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)

#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_m128d_f64(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)

#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)

#define vreinterpretq_f64_m128d(x) (x)
#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128d_f32(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_f32_m128d(x) (x)
#endif

// A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
// https://docs.microsoft.com/en-us/cpp/cpp/m128
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly, so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides. This union should really only be used
// when trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since they can
// cause a performance hit. If it really is needed, however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;

// casting using SIMDVec
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
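
/* Usage sketch for the helpers above: given an __m128i value v, the n-th
 * unsigned lane can be read without declaring a SIMDVec variable, e.g.
 *
 *   uint32_t lane2 = vreinterpretq_nth_u32_m128i(v, 2);
 *
 * This aliases v through a SIMDVec pointer, so it is best kept out of
 * performance-critical paths, as the comment above explains.
 */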

/* SSE macros */
#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
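
/* Illustrative call sites (the wrapped _sse2neon_mm_* functions are provided
 * further down in this header):
 *
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   if (_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON) {
 *       // denormal results are now flushed to zero
 *   }
 */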

// Function declarations
// SSE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
FORCE_INLINE __m128 _mm_set_ps1(float);
FORCE_INLINE __m128 _mm_setzero_ps(void);
// SSE2
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_castps_si128(__m128);
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
FORCE_INLINE __m128d _mm_set_pd(double, double);
FORCE_INLINE __m128i _mm_set1_epi32(int);
FORCE_INLINE __m128i _mm_setzero_si128();
// SSE4.1
FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

/* Backwards compatibility for compilers lacking support for specific types */

// Older gcc does not define the vld1q_u8_x4 intrinsic
#if defined(__GNUC__) && !defined(__clang__) &&                        \
    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
     (__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;
    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);
    return ret;
}
#else
// Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    return vld1q_u8_x4(p);
}
#endif

#if !defined(__aarch64__)
/* emulate vaddv u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
{
    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
}
#else
// Wraps vaddv_u8
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
{
    return vaddv_u8(v8);
}
#endif

#if !defined(__aarch64__)
/* emulate vaddvq u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
{
    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
    uint8_t res = 0;
    for (int i = 0; i < 8; ++i)
        res += tmp[i];
    return res;
}
#else
// Wraps vaddvq_u8
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
{
    return vaddvq_u8(a);
}
#endif

#if !defined(__aarch64__)
/* emulate vaddvq u16 variant */
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
{
    uint32x4_t m = vpaddlq_u16(a);
    uint64x2_t n = vpaddlq_u32(m);
    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);

    return vget_lane_u32((uint32x2_t) o, 0);
}
#else
// Wraps vaddvq_u16
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
{
    return vaddvq_u16(a);
}
#endif

/* Function Naming Conventions
 * The naming convention of SSE intrinsics is straightforward. A generic SSE
 * intrinsic function is given as follows:
 *   _mm_<name>_<data_type>
 *
 * The parts of this format are given as follows:
 * 1. <name> describes the operation performed by the intrinsic
 * 2. <data_type> identifies the data type of the function's primary arguments
 *
 * This last part, <data_type>, is a little complicated. It identifies the
 * content of the input values, and can be set to any of the following values:
 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   signed integers
 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   unsigned integers
 * + si128 - unspecified 128-bit vector or 256-bit vector
 * + m128/m128i/m128d - identifies input vector types when they are different
 *   than the type of the returned vector
 *
 * For example, _mm_setzero_ps. The _mm implies that the function returns
 * a 128-bit vector. The _ps at the end implies that the argument vectors
 * contain floats.
 *
 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, per 8 bits
 *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
 *                                  4, 5, 12, 13, 6, 7, 14, 15);
 *   // Shuffle packed 8-bit integers
 *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
 *
 * Data (Number, Binary, Byte Index):
    +------+------+-------------+------+------+-------------+
    |      1      |      2      |      3      |      4      | Number
    +------+------+------+------+------+------+------+------+
    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
    +------+------+------+------+------+------+------+------+
    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
    +------+------+------+------+------+------+------+------+

    +------+------+------+------+------+------+------+------+
    |      5      |      6      |      7      |      8      | Number
    +------+------+------+------+------+------+------+------+
    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
    +------+------+------+------+------+------+------+------+
    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
    +------+------+------+------+------+------+------+------+
 * Index (Byte Index):
    +------+------+------+------+------+------+------+------+
    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
    +------+------+------+------+------+------+------+------+

    +------+------+------+------+------+------+------+------+
    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
    +------+------+------+------+------+------+------+------+
 * Result:
    +------+------+------+------+------+------+------+------+
    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
    +------+------+------+------+------+------+------+------+
    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
    +------+------+------+------+------+------+------+------+
    |     256     |      2      |      5      |      6      | Number
    +------+------+------+------+------+------+------+------+

    +------+------+------+------+------+------+------+------+
    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
    +------+------+------+------+------+------+------+------+
    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
    +------+------+------+------+------+------+------+------+
    |      3      |      7      |      4      |      8      | Number
    +------+------+------+------+------+------+-------------+
 */

/* Constants for use with _mm_prefetch. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};

// The bit field mapping to the FPCR (floating-point control register)
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t bit24 : 1;
    uint8_t res2 : 7;
#if defined(__aarch64__)
    uint32_t res3;
#endif
} fpcr_bitfield;

// Takes the upper 64 bits of a and places them in the low end of the result.
// Takes the lower 64 bits of b and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}

// Keeps the low 64 bits of a in the low end and puts the high 64 bits of b in
// the high end.
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}

// Kahan summation for accurate summation of floating-point numbers.
// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
{
    y -= *c;
    float t = *sum + y;
    *c = (t - *sum) - y;
    *sum = t;
}
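
// Usage sketch: the compensation term c starts at zero and is carried across
// successive calls, e.g.
//
//   float sum = 0.0f, c = 0.0f;
//   for (int i = 0; i < n; i++)
//       _sse2neon_kadd_f32(&sum, &c, data[i]);
//
// where n and data stand for whatever sequence the caller is accumulating.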

#if defined(__ARM_FEATURE_CRYPTO) && \
    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })
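
// Worked example: with imm = _MM_SHUFFLE(0, 1, 2, 3) (i.e. 0x1B) the selectors
// are 3, 2, 1, 0 from the lowest result lane upwards, so the macro above
// reverses the four 32-bit lanes of a: {10, 20, 30, 40} becomes {40, 30, 20, 10}.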

// Takes the upper 64 bits of a and places them in the low end of the result;
// takes the lower 64 bits of a and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from a, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// Rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down.
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// Rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up.
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}

// Gets the lower 64 bits of a and places them in both the lower and the upper
// 64 bits of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
// result in the lower 64 bits; gets the lower 64 bits of a and places them in
// the upper 64 bits.
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
// result in both the lower and the upper 64 bits.
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}

// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif

// NEON does not support a general purpose permute intrinsic.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })

// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })

// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })

/* MMX */

// _mm_empty is a no-op on ARM
FORCE_INLINE void _mm_empty(void) {}

/* SSE */

// Adds the four single-precision, floating-point values of a and b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}
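
// Illustrative scalar semantics: if a = {1, 2, 3, 4} and b = {10, 20, 30, 40},
// _mm_add_ss(a, b) yields {11, 2, 3, 4}; only lane 0 is summed and the upper
// three lanes of a pass through unchanged, mirroring x86 ADDSS.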

// Computes the bitwise AND of the four single-precision, floating-point values
// of a and b.
//
//   r0 := a0 & b0
//   r1 := a1 & b1
//   r2 := a2 & b2
//   r3 := a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise AND-NOT of the four single-precision, floating-point
// values of a and b.
//
//   r0 := ~a0 & b0
//   r1 := ~a1 & b1
//   r2 := ~a2 & b2
//   r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 3
//     i := j*16
//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//   ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(
        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 7
//     i := j*8
//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//   ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compares for equality.
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for equality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
}

// Compares for greater than or equal.
// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
}

// Compares for greater than.
//
//   r0 := (a0 > b0) ? 0xffffffff : 0x0
//   r1 := (a1 > b1) ? 0xffffffff : 0x0
//   r2 := (a2 > b2) ? 0xffffffff : 0x0
//   r3 := (a3 > b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
}

// Compares for less than or equal.
//
//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmple_ps(a, b));
}

// Compares for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
}

// Compares for inequality.
1331
// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1332
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1333
{
1334
return vreinterpretq_m128_u32(vmvnq_u32(
1335
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1336
}
1337
1338
// Compares for inequality.
1339
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1340
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1341
{
1342
return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1343
}
1344
1345
// Compares for not greater than or equal.
1346
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1347
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1348
{
1349
return vreinterpretq_m128_u32(vmvnq_u32(
1350
vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1351
}
1352
1353
// Compares for not greater than or equal.
1354
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1355
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1356
{
1357
return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1358
}
1359
1360
// Compares for not greater than.
1361
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1362
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1363
{
1364
return vreinterpretq_m128_u32(vmvnq_u32(
1365
vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1366
}
1367
1368
// Compares for not greater than.
1369
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1370
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1371
{
1372
return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1373
}
1374
1375
// Compares for not less than or equal.
1376
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1377
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1378
{
1379
return vreinterpretq_m128_u32(vmvnq_u32(
1380
vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1381
}
1382
1383
// Compares for not less than or equal.
1384
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1385
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1386
{
1387
return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1388
}
1389
1390
// Compares for not less than.
1391
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1392
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1393
{
1394
return vreinterpretq_m128_u32(vmvnq_u32(
1395
vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1396
}
1397
1398
// Compares for not less than.
1399
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1400
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1401
{
1402
return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1403
}
1404
1405
// Compares the four 32-bit floats in a and b to check if any values are NaN.
1406
// Ordered compare between each value returns true for "orderable" and false for
1407
// "not orderable" (NaN).
1408
// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1409
// also:
1410
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1411
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1412
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1413
{
1414
// Note: NEON does not have an ordered-compare builtin.
// Compare a == a and b == b to detect NaN lanes,
// then AND the two results to get the final ordered mask.
1417
uint32x4_t ceqaa =
1418
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1419
uint32x4_t ceqbb =
1420
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1421
return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1422
}
1423
1424
// Compares for ordered.
1425
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1426
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1427
{
1428
return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1429
}
1430
1431
// Compares for unordered.
1432
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1433
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1434
{
1435
uint32x4_t f32a =
1436
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1437
uint32x4_t f32b =
1438
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1439
return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1440
}
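
// Illustrative example (hypothetical values, NAN from <math.h>):
//
//   __m128 x = _mm_set_ps(1.0f, 2.0f, NAN, 4.0f);  // lanes 3..0
//   __m128 y = _mm_set_ps(1.0f, NAN, 3.0f, 4.0f);
//   __m128 ord = _mm_cmpord_ps(x, y);    // lanes 0,3: all-ones; lanes 1,2: zero
//   __m128 uno = _mm_cmpunord_ps(x, y);  // the bitwise complement of ord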
1441
1442
// Compares for unordered.
1443
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1444
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1445
{
1446
return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1447
}
1448
1449
// Compares the lower single-precision floating point scalar values of a and b
1450
// using an equality operation.
1451
// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1452
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1453
{
1454
uint32x4_t a_eq_b =
1455
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1456
return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1457
}
1458
1459
// Compares the lower single-precision floating point scalar values of a and b
1460
// using a greater than or equal operation.
1461
// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1462
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1463
{
1464
uint32x4_t a_ge_b =
1465
vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1466
return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1467
}
1468
1469
// Compares the lower single-precision floating point scalar values of a and b
1470
// using a greater than operation.
1471
// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1472
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1473
{
1474
uint32x4_t a_gt_b =
1475
vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1476
return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1477
}
1478
1479
// Compares the lower single-precision floating point scalar values of a and b
1480
// using a less than or equal operation.
1481
// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1482
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1483
{
1484
uint32x4_t a_le_b =
1485
vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1486
return vgetq_lane_u32(a_le_b, 0) & 0x1;
1487
}
1488
1489
// Compares the lower single-precision floating point scalar values of a and b
1490
// using a less than operation.
// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
// Important note: the MSDN documentation is incorrect. If either of the values
// is a NaN, the docs say the result is one, but it actually returns zero.
1494
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1495
{
1496
uint32x4_t a_lt_b =
1497
vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1498
return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1499
}
1500
1501
// Compares the lower single-precision floating point scalar values of a and b
1502
// using an inequality operation.
1503
// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1504
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1505
{
1506
return !_mm_comieq_ss(a, b);
1507
}
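
// Note: unlike the _mm_cmp*_ss family above, which returns an __m128 whose low
// lane is an all-ones/all-zeros mask, the _mm_comi*_ss family returns a plain
// 0/1 int derived from the low lane. Illustrative example (hypothetical values):
//
//   int eq = _mm_comieq_ss(_mm_set_ss(1.0f), _mm_set_ss(1.0f)); // 1
//   int lt = _mm_comilt_ss(_mm_set_ss(2.0f), _mm_set_ss(1.0f)); // 0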
1508
1509
// Convert packed signed 32-bit integers in b to packed single-precision
1510
// (32-bit) floating-point elements, store the results in the lower 2 elements
1511
// of dst, and copy the upper 2 packed elements from a to the upper elements of
1512
// dst.
1513
//
1514
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1515
// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1516
// dst[95:64] := a[95:64]
1517
// dst[127:96] := a[127:96]
1518
//
1519
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
1520
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1521
{
1522
return vreinterpretq_m128_f32(
1523
vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1524
vget_high_f32(vreinterpretq_f32_m128(a))));
1525
}
1526
1527
// Convert packed single-precision (32-bit) floating-point elements in a to
1528
// packed 32-bit integers, and store the results in dst.
1529
//
1530
// FOR j := 0 to 1
1531
// i := 32*j
1532
// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1533
// ENDFOR
1534
//
1535
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
1536
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1537
{
1538
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1539
return vreinterpret_m64_s32(
1540
vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1541
#else
1542
return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1543
vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1544
#endif
1545
}
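
// Note: _mm_cvt_ps2pi converts using the current rounding mode
// (round-to-nearest-even by default), unlike the truncating _mm_cvtt_ps2pi
// further below. Illustrative example (hypothetical values, default mode):
//
//   __m128 v = _mm_set_ps(0.0f, 0.0f, 2.5f, 1.5f);
//   __m64 r = _mm_cvt_ps2pi(v);  // low two lanes become {2, 2} (ties to even)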
1546
1547
// Convert the signed 32-bit integer b to a single-precision (32-bit)
1548
// floating-point element, store the result in the lower element of dst, and
1549
// copy the upper 3 packed elements from a to the upper elements of dst.
1550
//
1551
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1552
// dst[127:32] := a[127:32]
1553
//
1554
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
1555
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1556
{
1557
return vreinterpretq_m128_f32(
1558
vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1559
}
1560
1561
// Convert the lower single-precision (32-bit) floating-point element in a to a
1562
// 32-bit integer, and store the result in dst.
1563
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
1564
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1565
{
1566
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1567
return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1568
0);
1569
#else
1570
float32_t data = vgetq_lane_f32(
1571
vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1572
return (int32_t) data;
1573
#endif
1574
}
1575
1576
// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1577
// floating-point elements, and store the results in dst.
1578
//
1579
// FOR j := 0 to 3
1580
// i := j*16
1581
// m := j*32
1582
// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1583
// ENDFOR
1584
//
1585
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
1586
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1587
{
1588
return vreinterpretq_m128_f32(
1589
vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1590
}
1591
1592
// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1593
// floating-point elements, store the results in the lower 2 elements of dst,
1594
// and copy the upper 2 packed elements from a to the upper elements of dst.
1595
//
1596
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1597
// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1598
// dst[95:64] := a[95:64]
1599
// dst[127:96] := a[127:96]
1600
//
1601
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
1602
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1603
{
1604
return vreinterpretq_m128_f32(
1605
vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1606
vget_high_f32(vreinterpretq_f32_m128(a))));
1607
}
1608
1609
// Convert packed signed 32-bit integers in a to packed single-precision
1610
// (32-bit) floating-point elements, store the results in the lower 2 elements
1611
// of dst, then convert the packed signed 32-bit integers in b to
1612
// single-precision (32-bit) floating-point element, and store the results in
1613
// the upper 2 elements of dst.
1614
//
1615
// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1616
// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1617
// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1618
// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1619
//
1620
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
1621
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1622
{
1623
return vreinterpretq_m128_f32(vcvtq_f32_s32(
1624
vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1625
}
1626
1627
// Convert the lower packed 8-bit integers in a to packed single-precision
1628
// (32-bit) floating-point elements, and store the results in dst.
1629
//
1630
// FOR j := 0 to 3
1631
// i := j*8
1632
// m := j*32
1633
// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1634
// ENDFOR
1635
//
1636
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
1637
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1638
{
1639
return vreinterpretq_m128_f32(vcvtq_f32_s32(
1640
vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1641
}
1642
1643
// Convert packed single-precision (32-bit) floating-point elements in a to
1644
// packed 16-bit integers, and store the results in dst. Note: this intrinsic
1645
// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1646
// 0x7FFFFFFF.
1647
//
1648
// FOR j := 0 to 3
1649
// i := 16*j
1650
// k := 32*j
1651
// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
1652
// dst[i+15:i] := 0x7FFF
1653
// ELSE
1654
// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
1655
// FI
1656
// ENDFOR
1657
//
1658
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
1659
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1660
{
1661
return vreinterpret_m64_s16(
1662
vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
1663
}
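
// Note: the saturating narrow (vqmovn_s32) is what produces the 0x7FFF clamp
// described above. Illustrative example (hypothetical values):
//
//   __m128 v = _mm_set_ps(70000.0f, -70000.0f, 3.7f, -3.7f);  // lanes 3..0
//   __m64 r = _mm_cvtps_pi16(v);  // lanes 0..3: {-4, 4, -32768, 32767}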
1664
1665
// Convert packed single-precision (32-bit) floating-point elements in a to
1666
// packed 32-bit integers, and store the results in dst.
1667
//
1668
// FOR j := 0 to 1
1669
// i := 32*j
1670
// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1671
// ENDFOR
1672
//
1673
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
1674
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1675
1676
// Convert packed single-precision (32-bit) floating-point elements in a to
1677
// packed 8-bit integers, and store the results in lower 4 elements of dst.
1678
// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1679
// between 0x7F and 0x7FFFFFFF.
1680
//
1681
// FOR j := 0 to 3
1682
// i := 8*j
1683
// k := 32*j
1684
// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1685
// dst[i+7:i] := 0x7F
1686
// ELSE
1687
// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1688
// FI
1689
// ENDFOR
1690
//
1691
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
1692
FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1693
{
1694
return vreinterpret_m64_s8(vqmovn_s16(
1695
vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
1696
}
1697
1698
// Convert packed unsigned 16-bit integers in a to packed single-precision
1699
// (32-bit) floating-point elements, and store the results in dst.
1700
//
1701
// FOR j := 0 to 3
1702
// i := j*16
1703
// m := j*32
1704
// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1705
// ENDFOR
1706
//
1707
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
1708
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1709
{
1710
return vreinterpretq_m128_f32(
1711
vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1712
}
1713
1714
// Convert the lower packed unsigned 8-bit integers in a to packed
1715
// single-precision (32-bit) floating-point elements, and store the results in
1716
// dst.
1717
//
1718
// FOR j := 0 to 3
1719
// i := j*8
1720
// m := j*32
1721
// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1722
// ENDFOR
1723
//
1724
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
1725
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1726
{
1727
return vreinterpretq_m128_f32(vcvtq_f32_u32(
1728
vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1729
}
1730
1731
// Convert the signed 32-bit integer b to a single-precision (32-bit)
1732
// floating-point element, store the result in the lower element of dst, and
1733
// copy the upper 3 packed elements from a to the upper elements of dst.
1734
//
1735
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1736
// dst[127:32] := a[127:32]
1737
//
1738
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
1739
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1740
1741
// Convert the signed 64-bit integer b to a single-precision (32-bit)
1742
// floating-point element, store the result in the lower element of dst, and
1743
// copy the upper 3 packed elements from a to the upper elements of dst.
1744
//
1745
// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1746
// dst[127:32] := a[127:32]
1747
//
1748
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
1749
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1750
{
1751
return vreinterpretq_m128_f32(
1752
vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1753
}
1754
1755
// Copy the lower single-precision (32-bit) floating-point element of a to dst.
1756
//
1757
// dst[31:0] := a[31:0]
1758
//
1759
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
1760
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1761
{
1762
return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1763
}
1764
1765
// Convert the lower single-precision (32-bit) floating-point element in a to a
1766
// 32-bit integer, and store the result in dst.
1767
//
1768
// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
1769
//
1770
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
1771
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1772
1773
// Convert the lower single-precision (32-bit) floating-point element in a to a
1774
// 64-bit integer, and store the result in dst.
1775
//
1776
// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1777
//
1778
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
1779
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1780
{
1781
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1782
return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1783
#else
1784
float32_t data = vgetq_lane_f32(
1785
vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1786
return (int64_t) data;
1787
#endif
1788
}
1789
1790
// Convert packed single-precision (32-bit) floating-point elements in a to
1791
// packed 32-bit integers with truncation, and store the results in dst.
1792
//
1793
// FOR j := 0 to 1
1794
// i := 32*j
1795
// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1796
// ENDFOR
1797
//
1798
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
1799
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1800
{
1801
return vreinterpret_m64_s32(
1802
vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1803
}
1804
1805
// Convert the lower single-precision (32-bit) floating-point element in a to a
1806
// 32-bit integer with truncation, and store the result in dst.
1807
//
1808
// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1809
//
1810
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
1811
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1812
{
1813
return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1814
}
1815
1816
// Convert packed single-precision (32-bit) floating-point elements in a to
1817
// packed 32-bit integers with truncation, and store the results in dst.
1818
//
1819
// FOR j := 0 to 1
1820
// i := 32*j
1821
// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1822
// ENDFOR
1823
//
1824
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
1825
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1826
1827
// Convert the lower single-precision (32-bit) floating-point element in a to a
1828
// 32-bit integer with truncation, and store the result in dst.
1829
//
1830
// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1831
//
1832
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
1833
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1834
1835
// Convert the lower single-precision (32-bit) floating-point element in a to a
1836
// 64-bit integer with truncation, and store the result in dst.
1837
//
1838
// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1839
//
1840
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
1841
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1842
{
1843
return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1844
}
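
// Note: the _mm_cvtt* variants truncate toward zero, whereas _mm_cvtss_si64 and
// _mm_cvt_ss2si honor the current rounding mode.
// Illustrative example (hypothetical value, default rounding mode):
//
//   __m128 v = _mm_set_ss(-1.7f);
//   int64_t t = _mm_cvttss_si64(v);  // -1 (truncated)
//   int64_t r = _mm_cvtss_si64(v);   // -2 (rounded to nearest)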
1845
1846
// Divides the four single-precision, floating-point values of a and b.
1847
//
1848
// r0 := a0 / b0
1849
// r1 := a1 / b1
1850
// r2 := a2 / b2
1851
// r3 := a3 / b3
1852
//
1853
// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1854
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1855
{
1856
#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1857
return vreinterpretq_m128_f32(
1858
vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1859
#else
1860
float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1861
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1862
#if SSE2NEON_PRECISE_DIV
1863
// Additional Newton-Raphson iteration for accuracy
1864
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1865
#endif
1866
return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1867
#endif
1868
}
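
// Note: on the non-AArch64 path above, the quotient is formed as a * (1/b),
// where 1/b starts from the vrecpeq_f32 estimate and is refined by
// Newton-Raphson steps; vrecpsq_f32(x, b) returns 2 - x*b, so each
//
//   recip = recip * (2 - recip * b)
//
// step roughly doubles the number of accurate bits, which is why
// SSE2NEON_PRECISE_DIV adds one more iteration.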
1869
1870
// Divides the scalar single-precision floating point value of a by b.
1871
// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1872
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1873
{
1874
float32_t value =
1875
vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1876
return vreinterpretq_m128_f32(
1877
vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1878
}
1879
1880
// Extract a 16-bit integer from a, selected with imm8, and store the result in
1881
// the lower element of dst.
1882
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
1883
#define _mm_extract_pi16(a, imm) \
1884
(int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1885
1886
// Free aligned memory that was allocated with _mm_malloc.
1887
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
1888
#if !defined(SSE2NEON_ALLOC_DEFINED)
1889
FORCE_INLINE void _mm_free(void *addr)
1890
{
1891
#if defined(_WIN32)
1892
_aligned_free(addr);
1893
#else
1894
free(addr);
1895
#endif
1896
}
1897
#endif
1898
1899
// Macro: Get the flush zero bits from the MXCSR control and status register.
1900
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1901
// _MM_FLUSH_ZERO_OFF
1902
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
1903
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
1904
{
1905
union {
1906
fpcr_bitfield field;
1907
#if defined(__aarch64__)
1908
uint64_t value;
1909
#else
1910
uint32_t value;
1911
#endif
1912
} r;
1913
1914
#if defined(__aarch64__)
1915
__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
1916
#else
1917
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1918
#endif
1919
1920
return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1921
}
1922
1923
// Macro: Get the rounding mode bits from the MXCSR control and status register.
1924
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1925
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1926
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
1927
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
1928
{
1929
union {
1930
fpcr_bitfield field;
1931
#if defined(__aarch64__)
1932
uint64_t value;
1933
#else
1934
uint32_t value;
1935
#endif
1936
} r;
1937
1938
#if defined(__aarch64__)
1939
__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
1940
#else
1941
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1942
#endif
1943
1944
if (r.field.bit22) {
1945
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1946
} else {
1947
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1948
}
1949
}
1950
1951
// Copy a to dst, and insert the 16-bit integer i into dst at the location
1952
// specified by imm8.
1953
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
1954
#define _mm_insert_pi16(a, b, imm) \
1955
__extension__({ \
1956
vreinterpret_m64_s16( \
1957
vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1958
})
1959
1960
// Loads four single-precision, floating-point values.
1961
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
1962
FORCE_INLINE __m128 _mm_load_ps(const float *p)
1963
{
1964
return vreinterpretq_m128_f32(vld1q_f32(p));
1965
}
1966
1967
// Load a single-precision (32-bit) floating-point element from memory into all
1968
// elements of dst.
1969
//
1970
// dst[31:0] := MEM[mem_addr+31:mem_addr]
1971
// dst[63:32] := MEM[mem_addr+31:mem_addr]
1972
// dst[95:64] := MEM[mem_addr+31:mem_addr]
1973
// dst[127:96] := MEM[mem_addr+31:mem_addr]
1974
//
1975
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
1976
#define _mm_load_ps1 _mm_load1_ps
1977
1978
// Loads a single-precision, floating-point value into the low word and
// clears the upper three words.
1980
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
1981
FORCE_INLINE __m128 _mm_load_ss(const float *p)
1982
{
1983
return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1984
}
1985
1986
// Loads a single single-precision, floating-point value, copying it into all
1987
// four words
1988
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
1989
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1990
{
1991
return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1992
}
1993
1994
// Sets the upper two single-precision, floating-point values with 64
1995
// bits of data loaded from the address p; the lower two values are passed
1996
// through from a.
1997
//
1998
// r0 := a0
1999
// r1 := a1
2000
// r2 := *p0
2001
// r3 := *p1
2002
//
2003
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
2004
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
2005
{
2006
return vreinterpretq_m128_f32(
2007
vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
2008
}
2009
2010
// Sets the lower two single-precision, floating-point values with 64
2011
// bits of data loaded from the address p; the upper two values are passed
2012
// through from a.
2013
//
2014
// Return Value
2015
// r0 := *p0
2016
// r1 := *p1
2017
// r2 := a2
2018
// r3 := a3
2019
//
2020
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
2021
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
2022
{
2023
return vreinterpretq_m128_f32(
2024
vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
2025
}
2026
2027
// Load 4 single-precision (32-bit) floating-point elements from memory into dst
2028
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2029
// general-protection exception may be generated.
2030
//
2031
// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
2032
// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
2033
// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
2034
// dst[127:96] := MEM[mem_addr+31:mem_addr]
2035
//
2036
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
2037
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
2038
{
2039
float32x4_t v = vrev64q_f32(vld1q_f32(p));
2040
return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
2041
}
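
// Note: vrev64q_f32 reverses the two lanes inside each 64-bit half
// ({p0,p1,p2,p3} -> {p1,p0,p3,p2}) and vextq_f32(v, v, 2) rotates the vector by
// two lanes, so the combination yields the fully reversed {p3,p2,p1,p0}.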
2042
2043
// Loads four single-precision, floating-point values.
2044
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
2045
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
2046
{
2047
// For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
// are equivalent.
2049
return vreinterpretq_m128_f32(vld1q_f32(p));
2050
}
2051
2052
// Load unaligned 16-bit integer from memory into the first element of dst.
2053
//
2054
// dst[15:0] := MEM[mem_addr+15:mem_addr]
2055
// dst[MAX:16] := 0
2056
//
2057
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
2058
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
2059
{
2060
return vreinterpretq_m128i_s16(
2061
vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
2062
}
2063
2064
// Load unaligned 64-bit integer from memory into the first element of dst.
2065
//
2066
// dst[63:0] := MEM[mem_addr+63:mem_addr]
2067
// dst[MAX:64] := 0
2068
//
2069
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
2070
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
2071
{
2072
return vreinterpretq_m128i_s64(
2073
vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
2074
}
2075
2076
// Allocate aligned blocks of memory.
2077
// https://software.intel.com/en-us/
2078
// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
2079
#if !defined(SSE2NEON_ALLOC_DEFINED)
2080
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
2081
{
2082
void *ptr;
2083
if (align == 1)
2084
return malloc(size);
2085
if (align == 2 || (sizeof(void *) == 8 && align == 4))
2086
align = sizeof(void *);
2087
#if defined(_WIN32)
2088
ptr = _aligned_malloc(size, align);
2089
if (ptr)
2090
return ptr;
2091
#else
2092
if (!posix_memalign(&ptr, align, size))
2093
return ptr;
2094
#endif
2095
return NULL;
2096
}
2097
#endif
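
// Illustrative usage (hypothetical buffer, 16-byte alignment requested):
//
//   float *buf = (float *) _mm_malloc(4 * sizeof(float), 16);
//   if (buf) {
//       _mm_store_ps(buf, _mm_set1_ps(0.0f));
//       _mm_free(buf);  // must be paired with _mm_malloc, not free()
//   }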
2098
2099
// Conditionally store 8-bit integer elements from a into memory using mask
2100
// (elements are not stored when the highest bit is not set in the corresponding
2101
// element) and a non-temporal memory hint.
2102
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
2103
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
2104
{
2105
int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
2106
__m128 b = _mm_load_ps((const float *) mem_addr);
2107
int8x8_t masked =
2108
vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
2109
vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
2110
vst1_s8((int8_t *) mem_addr, masked);
2111
}
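
// Note: only bytes whose mask element has its most significant bit set are
// written; the remaining bytes keep the memory contents that were loaded back
// above. Illustrative example (hypothetical values):
//
//   char ALIGN_STRUCT(16) buf[16] = {0};
//   __m64 data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);           // byte 0 = 1
//   __m64 mask = _mm_set_pi8(0, 0, 0, 0, 0, 0, 0, (char) 0x80); // byte 0 only
//   _mm_maskmove_si64(data, mask, buf);  // buf[0] becomes 1; bytes 1..7 stay 0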
2112
2113
// Conditionally store 8-bit integer elements from a into memory using mask
2114
// (elements are not stored when the highest bit is not set in the corresponding
2115
// element) and a non-temporal memory hint.
2116
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
2117
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2118
2119
// Compare packed signed 16-bit integers in a and b, and store packed maximum
2120
// values in dst.
2121
//
2122
// FOR j := 0 to 3
2123
// i := j*16
2124
// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
2125
// ENDFOR
2126
//
2127
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
2128
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
2129
{
2130
return vreinterpret_m64_s16(
2131
vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2132
}
2133
2134
// Computes the maximums of the four single-precision, floating-point values of
2135
// a and b.
2136
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
2137
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
2138
{
2139
#if SSE2NEON_PRECISE_MINMAX
2140
float32x4_t _a = vreinterpretq_f32_m128(a);
2141
float32x4_t _b = vreinterpretq_f32_m128(b);
2142
return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
2143
#else
2144
return vreinterpretq_m128_f32(
2145
vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2146
#endif
2147
}
2148
2149
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2150
// values in dst.
2151
//
2152
// FOR j := 0 to 7
2153
// i := j*8
2154
// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
2155
// ENDFOR
2156
//
2157
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
2158
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2159
{
2160
return vreinterpret_m64_u8(
2161
vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2162
}
2163
2164
// Computes the maximum of the two lower scalar single-precision floating point
2165
// values of a and b.
2166
// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
2167
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2168
{
2169
float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2170
return vreinterpretq_m128_f32(
2171
vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2172
}
2173
2174
// Compare packed signed 16-bit integers in a and b, and store packed minimum
2175
// values in dst.
2176
//
2177
// FOR j := 0 to 3
2178
// i := j*16
2179
// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
2180
// ENDFOR
2181
//
2182
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
2183
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2184
{
2185
return vreinterpret_m64_s16(
2186
vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2187
}
2188
2189
// Computes the minima of the four single-precision, floating-point values of a
2190
// and b.
2191
// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2192
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2193
{
2194
#if SSE2NEON_PRECISE_MINMAX
2195
float32x4_t _a = vreinterpretq_f32_m128(a);
2196
float32x4_t _b = vreinterpretq_f32_m128(b);
2197
return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2198
#else
2199
return vreinterpretq_m128_f32(
2200
vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2201
#endif
2202
}
2203
2204
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2205
// values in dst.
2206
//
2207
// FOR j := 0 to 7
2208
// i := j*8
2209
// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
2210
// ENDFOR
2211
//
2212
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
2213
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2214
{
2215
return vreinterpret_m64_u8(
2216
vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2217
}
2218
2219
// Computes the minimum of the two lower scalar single-precision floating point
2220
// values of a and b.
2221
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2222
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2223
{
2224
float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2225
return vreinterpretq_m128_f32(
2226
vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2227
}
2228
2229
// Sets the low word to the single-precision, floating-point value of b
2230
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
2231
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2232
{
2233
return vreinterpretq_m128_f32(
2234
vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2235
vreinterpretq_f32_m128(a), 0));
2236
}
2237
2238
// Moves the upper two values of B into the lower two values of A.
2239
//
2240
// r3 := a3
2241
// r2 := a2
2242
// r1 := b3
2243
// r0 := b2
2244
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
2245
{
2246
float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2247
float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2248
return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2249
}
2250
2251
// Moves the lower two values of B into the upper two values of A.
2252
//
2253
// r3 := b1
2254
// r2 := b0
2255
// r1 := a1
2256
// r0 := a0
2257
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2258
{
2259
float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2260
float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2261
return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2262
}
2263
2264
// Create mask from the most significant bit of each 8-bit element in a, and
2265
// store the result in dst.
2266
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
2267
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2268
{
2269
uint8x8_t input = vreinterpret_u8_m64(a);
2270
#if defined(__aarch64__)
2271
static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2272
uint8x8_t tmp = vshr_n_u8(input, 7);
2273
return vaddv_u8(vshl_u8(tmp, shift));
2274
#else
2275
// Refer to the implementation of `_mm_movemask_epi8`
2276
uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2277
uint32x2_t paired16 =
2278
vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2279
uint8x8_t paired32 =
2280
vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2281
return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2282
#endif
2283
}
2284
2285
// NEON does not provide this method
2286
// Creates a 4-bit mask from the most significant bits of the four
2287
// single-precision, floating-point values.
2288
// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2289
FORCE_INLINE int _mm_movemask_ps(__m128 a)
2290
{
2291
uint32x4_t input = vreinterpretq_u32_m128(a);
2292
#if defined(__aarch64__)
2293
static const int32x4_t shift = {0, 1, 2, 3};
2294
uint32x4_t tmp = vshrq_n_u32(input, 31);
2295
return vaddvq_u32(vshlq_u32(tmp, shift));
2296
#else
2297
// Uses the exact same method as _mm_movemask_epi8, see that for details.
2298
// Shift out everything but the sign bits with a 32-bit unsigned shift
2299
// right.
2300
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2301
// Merge the two pairs together with a 64-bit unsigned shift right + add.
2302
uint8x16_t paired =
2303
vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2304
// Extract the result.
2305
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2306
#endif
2307
}
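
// Illustrative example (hypothetical values): bit i of the result is the sign
// bit of lane i.
//
//   __m128 v = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f);  // lanes 3..0
//   int m = _mm_movemask_ps(v);  // 0b1010 == 10 (lanes 1 and 3 are negative)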
2308
2309
// Multiplies the four single-precision, floating-point values of a and b.
2310
//
2311
// r0 := a0 * b0
2312
// r1 := a1 * b1
2313
// r2 := a2 * b2
2314
// r3 := a3 * b3
2315
//
2316
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2317
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2318
{
2319
return vreinterpretq_m128_f32(
2320
vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2321
}
2322
2323
// Multiply the lower single-precision (32-bit) floating-point element in a and
2324
// b, store the result in the lower element of dst, and copy the upper 3 packed
2325
// elements from a to the upper elements of dst.
2326
//
2327
// dst[31:0] := a[31:0] * b[31:0]
2328
// dst[127:32] := a[127:32]
2329
//
2330
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
2331
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2332
{
2333
return _mm_move_ss(a, _mm_mul_ps(a, b));
2334
}
2335
2336
// Multiply the packed unsigned 16-bit integers in a and b, producing
2337
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2338
// integers in dst.
2339
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
2340
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2341
{
2342
return vreinterpret_m64_u16(vshrn_n_u32(
2343
vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2344
}
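
// Note: vmull_u16 widens each 16-bit product to 32 bits and
// vshrn_n_u32(..., 16) keeps only the high halves. For example,
// 0xFFFF * 0xFFFF = 0xFFFE0001, so the corresponding result lane is 0xFFFE.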
2345
2346
// Computes the bitwise OR of the four single-precision, floating-point values
2347
// of a and b.
2348
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
2349
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2350
{
2351
return vreinterpretq_m128_s32(
2352
vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2353
}
2354
2355
// Average packed unsigned 8-bit integers in a and b, and store the results in
2356
// dst.
2357
//
2358
// FOR j := 0 to 7
2359
// i := j*8
2360
// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2361
// ENDFOR
2362
//
2363
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
2364
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2365
2366
// Average packed unsigned 16-bit integers in a and b, and store the results in
2367
// dst.
2368
//
2369
// FOR j := 0 to 3
2370
// i := j*16
2371
// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2372
// ENDFOR
2373
//
2374
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
2375
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2376
2377
// Extract a 16-bit integer from a, selected with imm8, and store the result in
2378
// the lower element of dst.
2379
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
2380
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2381
2382
// Copy a to dst, and insert the 16-bit integer i into dst at the location
2383
// specified by imm8.
2384
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
2385
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2386
2387
// Compare packed signed 16-bit integers in a and b, and store packed maximum
2388
// values in dst.
2389
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
2390
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2391
2392
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2393
// values in dst.
2394
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
2395
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2396
2397
// Compare packed signed 16-bit integers in a and b, and store packed minimum
2398
// values in dst.
2399
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
2400
#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2401
2402
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2403
// values in dst.
2404
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
2405
#define _m_pminub(a, b) _mm_min_pu8(a, b)
2406
2407
// Create mask from the most significant bit of each 8-bit element in a, and
2408
// store the result in dst.
2409
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
2410
#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2411
2412
// Multiply the packed unsigned 16-bit integers in a and b, producing
2413
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2414
// integers in dst.
2415
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
2416
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2417
2418
// Fetch the line of data from memory that contains address p to a location in
// the cache hierarchy specified by the locality hint i.
2420
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
2421
FORCE_INLINE void _mm_prefetch(char const *p, int i)
2422
{
2423
switch (i) {
2424
case _MM_HINT_NTA:
2425
__builtin_prefetch(p, 0, 0);
2426
break;
2427
case _MM_HINT_T0:
2428
__builtin_prefetch(p, 0, 3);
2429
break;
2430
case _MM_HINT_T1:
2431
__builtin_prefetch(p, 0, 2);
2432
break;
2433
case _MM_HINT_T2:
2434
__builtin_prefetch(p, 0, 1);
2435
break;
2436
}
2437
}
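
// Note: the mapping above follows GCC/Clang's
// __builtin_prefetch(addr, rw, locality), where rw = 0 requests a read prefetch
// and locality ranges from 0 (no temporal locality, _MM_HINT_NTA) to
// 3 (keep in all cache levels, _MM_HINT_T0).
// Illustrative usage (hypothetical pointer p):
//
//   _mm_prefetch((char const *) p, _MM_HINT_T0);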
2438
2439
// Compute the absolute differences of packed unsigned 8-bit integers in a and
2440
// b, then horizontally sum each consecutive 8 differences to produce four
2441
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2442
// 16 bits of dst.
2443
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
2444
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2445
2446
// Shuffle 16-bit integers in a using the control in imm8, and store the results
2447
// in dst.
2448
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
2449
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2450
2451
// Compute the approximate reciprocal of packed single-precision (32-bit)
2452
// floating-point elements in a, and store the results in dst. The maximum
2453
// relative error for this approximation is less than 1.5*2^-12.
2454
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
2455
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2456
{
2457
float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2458
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2459
#if SSE2NEON_PRECISE_DIV
2460
// Additional Newton-Raphson iteration for accuracy
2461
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2462
#endif
2463
return vreinterpretq_m128_f32(recip);
2464
}
2465
2466
// Compute the approximate reciprocal of the lower single-precision (32-bit)
2467
// floating-point element in a, store the result in the lower element of dst,
2468
// and copy the upper 3 packed elements from a to the upper elements of dst. The
2469
// maximum relative error for this approximation is less than 1.5*2^-12.
2470
//
2471
// dst[31:0] := (1.0 / a[31:0])
2472
// dst[127:32] := a[127:32]
2473
//
2474
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
2475
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2476
{
2477
return _mm_move_ss(a, _mm_rcp_ps(a));
2478
}
2479
2480
// Computes the approximations of the reciprocal square roots of the four
2481
// single-precision floating point values of in.
2482
// The current precision is 1% error.
2483
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2484
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2485
{
2486
float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2487
#if SSE2NEON_PRECISE_SQRT
2488
// Additional Newton-Raphson iteration for accuracy
2489
out = vmulq_f32(
2490
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2491
out = vmulq_f32(
2492
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2493
#endif
2494
return vreinterpretq_m128_f32(out);
2495
}
2496
2497
// Compute the approximate reciprocal square root of the lower single-precision
2498
// (32-bit) floating-point element in a, store the result in the lower element
2499
// of dst, and copy the upper 3 packed elements from a to the upper elements of
2500
// dst.
2501
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
2502
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2503
{
2504
return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2505
}
2506
2507
// Compute the absolute differences of packed unsigned 8-bit integers in a and
2508
// b, then horizontally sum each consecutive 8 differences to produce four
2509
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2510
// 16 bits of dst.
2511
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
2512
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2513
{
2514
uint64x1_t t = vpaddl_u32(vpaddl_u16(
2515
vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2516
return vreinterpret_m64_u16(
2517
vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2518
}
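
// Note: the vpaddl_u8/u16/u32 chain above pairwise-widens and sums the eight
// absolute byte differences down to a single 16-bit total in lane 0.
// Illustrative example (hypothetical values):
//
//   __m64 a = _mm_set_pi8(9, 9, 9, 9, 9, 9, 9, 9);
//   __m64 b = _mm_set_pi8(8, 8, 8, 8, 8, 8, 8, 8);
//   __m64 s = _mm_sad_pu8(a, b);  // lane 0 holds 8 * |9 - 8| = 8; other lanes 0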
2519
2520
// Macro: Set the flush zero bits of the MXCSR control and status register to
2521
// the value in unsigned 32-bit integer a. The flush zero may contain any of the
2522
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2523
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
2524
FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2525
{
2526
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2527
// regardless of the value of the FZ bit.
2528
union {
2529
fpcr_bitfield field;
2530
#if defined(__aarch64__)
2531
uint64_t value;
2532
#else
2533
uint32_t value;
2534
#endif
2535
} r;
2536
2537
#if defined(__aarch64__)
2538
__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
2539
#else
2540
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2541
#endif
2542
2543
r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2544
2545
#if defined(__aarch64__)
2546
__asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
2547
#else
2548
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2549
#endif
2550
}
2551
2552
// Sets the four single-precision, floating-point values to the four inputs.
2553
// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
2554
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2555
{
2556
float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2557
return vreinterpretq_m128_f32(vld1q_f32(data));
2558
}
2559
2560
// Sets the four single-precision, floating-point values to w.
2561
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2562
FORCE_INLINE __m128 _mm_set_ps1(float _w)
2563
{
2564
return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2565
}
2566
2567
// Macro: Set the rounding mode bits of the MXCSR control and status register to
2568
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
2569
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2570
// _MM_ROUND_TOWARD_ZERO
2571
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
2572
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2573
{
2574
union {
2575
fpcr_bitfield field;
2576
#if defined(__aarch64__)
2577
uint64_t value;
2578
#else
2579
uint32_t value;
2580
#endif
2581
} r;
2582
2583
#if defined(__aarch64__)
2584
__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
2585
#else
2586
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2587
#endif
2588
2589
switch (rounding) {
2590
case _MM_ROUND_TOWARD_ZERO:
2591
r.field.bit22 = 1;
2592
r.field.bit23 = 1;
2593
break;
2594
case _MM_ROUND_DOWN:
2595
r.field.bit22 = 0;
2596
r.field.bit23 = 1;
2597
break;
2598
case _MM_ROUND_UP:
2599
r.field.bit22 = 1;
2600
r.field.bit23 = 0;
2601
break;
2602
default: //_MM_ROUND_NEAREST
2603
r.field.bit22 = 0;
2604
r.field.bit23 = 0;
2605
}
2606
2607
#if defined(__aarch64__)
2608
__asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
2609
#else
2610
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2611
#endif
2612
}
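
// Illustrative usage (hypothetical caller that temporarily truncates):
//
//   unsigned int saved = _MM_GET_ROUNDING_MODE();
//   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//   int i = _mm_cvt_ss2si(_mm_set_ss(2.9f));  // 2 under round-toward-zero
//   _MM_SET_ROUNDING_MODE(saved);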
2613
2614
// Copy single-precision (32-bit) floating-point element a to the lower element
2615
// of dst, and zero the upper 3 elements.
2616
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
2617
FORCE_INLINE __m128 _mm_set_ss(float a)
2618
{
2619
return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
2620
}
2621
2622
// Sets the four single-precision, floating-point values to w.
2623
//
2624
// r0 := r1 := r2 := r3 := w
2625
//
2626
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2627
FORCE_INLINE __m128 _mm_set1_ps(float _w)
2628
{
2629
return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2630
}
2631
2632
// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2633
FORCE_INLINE void _mm_setcsr(unsigned int a)
2634
{
2635
_MM_SET_ROUNDING_MODE(a);
2636
}
2637
2638
// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2639
FORCE_INLINE unsigned int _mm_getcsr()
2640
{
2641
return _MM_GET_ROUNDING_MODE();
2642
}
2643
2644
// Sets the four single-precision, floating-point values to the four inputs in
2645
// reverse order.
2646
// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2647
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2648
{
2649
float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2650
return vreinterpretq_m128_f32(vld1q_f32(data));
2651
}
2652
2653
// Clears the four single-precision, floating-point values.
2654
// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2655
FORCE_INLINE __m128 _mm_setzero_ps(void)
2656
{
2657
return vreinterpretq_m128_f32(vdupq_n_f32(0));
2658
}
2659
2660
// Shuffle 16-bit integers in a using the control in imm8, and store the results
2661
// in dst.
2662
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
2663
#ifdef _sse2neon_shuffle
2664
#define _mm_shuffle_pi16(a, imm) \
2665
__extension__({ \
2666
vreinterpret_m64_s16(vshuffle_s16( \
2667
vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2668
((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2669
})
2670
#else
2671
#define _mm_shuffle_pi16(a, imm) \
2672
__extension__({ \
2673
int16x4_t ret; \
2674
ret = \
2675
vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2676
ret = vset_lane_s16( \
2677
vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2678
1); \
2679
ret = vset_lane_s16( \
2680
vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2681
2); \
2682
ret = vset_lane_s16( \
2683
vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2684
3); \
2685
vreinterpret_m64_s16(ret); \
2686
})
2687
#endif
2688
2689
// Perform a serializing operation on all store-to-memory instructions that were
2690
// issued prior to this instruction. Guarantees that every store instruction
// that precedes, in program order, the fence is globally visible before any
// store instruction which follows the fence in program order.
2693
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
2694
FORCE_INLINE void _mm_sfence(void)
2695
{
2696
_sse2neon_smp_mb();
2697
}
2698
2699
// Perform a serializing operation on all load-from-memory and store-to-memory
2700
// instructions that were issued prior to this instruction. Guarantees that
2701
// every memory access that precedes, in program order, the memory fence
2702
// instruction is globally visible before any memory instruction which follows
2703
// the fence in program order.
2704
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
2705
FORCE_INLINE void _mm_mfence(void)
2706
{
2707
_sse2neon_smp_mb();
2708
}
2709
2710
// Perform a serializing operation on all load-from-memory instructions that
2711
// were issued prior to this instruction. Guarantees that every load instruction
// that precedes, in program order, the fence is globally visible before any
// load instruction which follows the fence in program order.
2714
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
2715
FORCE_INLINE void _mm_lfence(void)
2716
{
2717
_sse2neon_smp_mb();
2718
}
2719
2720
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2721
// int imm)
2722
#ifdef _sse2neon_shuffle
2723
#define _mm_shuffle_ps(a, b, imm) \
2724
__extension__({ \
2725
float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2726
float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2727
float32x4_t _shuf = \
2728
vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2729
(((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2730
vreinterpretq_m128_f32(_shuf); \
2731
})
2732
#else // generic
2733
#define _mm_shuffle_ps(a, b, imm) \
2734
__extension__({ \
2735
__m128 ret; \
2736
switch (imm) { \
2737
case _MM_SHUFFLE(1, 0, 3, 2): \
2738
ret = _mm_shuffle_ps_1032((a), (b)); \
2739
break; \
2740
case _MM_SHUFFLE(2, 3, 0, 1): \
2741
ret = _mm_shuffle_ps_2301((a), (b)); \
2742
break; \
2743
case _MM_SHUFFLE(0, 3, 2, 1): \
2744
ret = _mm_shuffle_ps_0321((a), (b)); \
2745
break; \
2746
case _MM_SHUFFLE(2, 1, 0, 3): \
2747
ret = _mm_shuffle_ps_2103((a), (b)); \
2748
break; \
2749
case _MM_SHUFFLE(1, 0, 1, 0): \
2750
ret = _mm_movelh_ps((a), (b)); \
2751
break; \
2752
case _MM_SHUFFLE(1, 0, 0, 1): \
2753
ret = _mm_shuffle_ps_1001((a), (b)); \
2754
break; \
2755
case _MM_SHUFFLE(0, 1, 0, 1): \
2756
ret = _mm_shuffle_ps_0101((a), (b)); \
2757
break; \
2758
case _MM_SHUFFLE(3, 2, 1, 0): \
2759
ret = _mm_shuffle_ps_3210((a), (b)); \
2760
break; \
2761
case _MM_SHUFFLE(0, 0, 1, 1): \
2762
ret = _mm_shuffle_ps_0011((a), (b)); \
2763
break; \
2764
case _MM_SHUFFLE(0, 0, 2, 2): \
2765
ret = _mm_shuffle_ps_0022((a), (b)); \
2766
break; \
2767
case _MM_SHUFFLE(2, 2, 0, 0): \
2768
ret = _mm_shuffle_ps_2200((a), (b)); \
2769
break; \
2770
case _MM_SHUFFLE(3, 2, 0, 2): \
2771
ret = _mm_shuffle_ps_3202((a), (b)); \
2772
break; \
2773
case _MM_SHUFFLE(3, 2, 3, 2): \
2774
ret = _mm_movehl_ps((b), (a)); \
2775
break; \
2776
case _MM_SHUFFLE(1, 1, 3, 3): \
2777
ret = _mm_shuffle_ps_1133((a), (b)); \
2778
break; \
2779
case _MM_SHUFFLE(2, 0, 1, 0): \
2780
ret = _mm_shuffle_ps_2010((a), (b)); \
2781
break; \
2782
case _MM_SHUFFLE(2, 0, 0, 1): \
2783
ret = _mm_shuffle_ps_2001((a), (b)); \
2784
break; \
2785
case _MM_SHUFFLE(2, 0, 3, 2): \
2786
ret = _mm_shuffle_ps_2032((a), (b)); \
2787
break; \
2788
default: \
2789
ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2790
break; \
2791
} \
2792
ret; \
2793
})
2794
#endif
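
// Note: the immediate is normally built with _MM_SHUFFLE(d, c, b, a), whose
// selectors correspond to result lanes 3..0; the two low selectors index into
// the first operand and the two high selectors index into the second.
// Illustrative example (hypothetical values):
//
//   __m128 x = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);  // lanes 3..0
//   __m128 y = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
//   __m128 r = _mm_shuffle_ps(x, y, _MM_SHUFFLE(3, 2, 1, 0));
//   // r lanes 0..3 = {x0, x1, y2, y3} = {0, 1, 6, 7}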
2795
2796
// Computes the approximations of square roots of the four single-precision,
2797
// floating-point values of a. First computes reciprocal square roots and then
2798
// reciprocals of the four values.
2799
//
2800
// r0 := sqrt(a0)
2801
// r1 := sqrt(a1)
2802
// r2 := sqrt(a2)
2803
// r3 := sqrt(a3)
2804
//
2805
// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2806
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2807
{
2808
#if SSE2NEON_PRECISE_SQRT
2809
float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2810
2811
// Test for vrsqrteq_f32(0) -> positive infinity case.
2812
// Change to zero, so that s * 1/sqrt(s) result is zero too.
2813
const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2814
const uint32x4_t div_by_zero =
2815
vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2816
recip = vreinterpretq_f32_u32(
2817
vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2818
2819
// Additional Newton-Raphson iteration for accuracy
2820
recip = vmulq_f32(
2821
vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2822
recip);
2823
recip = vmulq_f32(
2824
vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2825
recip);
2826
2827
// sqrt(s) = s * 1/sqrt(s)
2828
return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2829
#elif defined(__aarch64__)
2830
return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2831
#else
2832
float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2833
float32x4_t sq = vrecpeq_f32(recipsq);
2834
return vreinterpretq_m128_f32(sq);
2835
#endif
2836
}
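
// Note: with SSE2NEON_PRECISE_SQRT the result is computed as in * (1/sqrt(in));
// vrsqrtsq_f32(x, d) returns (3 - x*d) / 2, the Newton-Raphson correction for
// the reciprocal square root, and the explicit fix-up above replaces the +inf
// estimate for zero inputs so that 0 * (1/sqrt(0)) yields 0 instead of NaN.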
2837
2838
// Computes the approximation of the square root of the scalar single-precision
2839
// floating point value of in.
2840
// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2841
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2842
{
2843
float32_t value =
2844
vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2845
return vreinterpretq_m128_f32(
2846
vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2847
}
2848
2849
// Stores four single-precision, floating-point values.
2850
// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2851
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2852
{
2853
vst1q_f32(p, vreinterpretq_f32_m128(a));
2854
}
2855
2856
// Store the lower single-precision (32-bit) floating-point element from a into
2857
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2858
// boundary or a general-protection exception may be generated.
2859
//
2860
// MEM[mem_addr+31:mem_addr] := a[31:0]
2861
// MEM[mem_addr+63:mem_addr+32] := a[31:0]
2862
// MEM[mem_addr+95:mem_addr+64] := a[31:0]
2863
// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2864
//
2865
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
2866
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2867
{
2868
float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2869
vst1q_f32(p, vdupq_n_f32(a0));
2870
}
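
// Usage sketch (illustrative only): broadcasts one value into a 16-byte
// aligned buffer of four floats via _mm_store_ps1. The helper name
// sse2neon_example_fill4 is hypothetical.
FORCE_INLINE void sse2neon_example_fill4(float *p /* 16-byte aligned */, float v)
{
    _mm_store_ps1(p, _mm_set_ps1(v));
}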
2871
2872
// Stores the lower single-precision, floating-point value.
2873
// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2874
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2875
{
2876
vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2877
}
2878
2879
// Store the lower single-precision (32-bit) floating-point element from a into
2880
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2881
// boundary or a general-protection exception may be generated.
2882
//
2883
// MEM[mem_addr+31:mem_addr] := a[31:0]
2884
// MEM[mem_addr+63:mem_addr+32] := a[31:0]
2885
// MEM[mem_addr+95:mem_addr+64] := a[31:0]
2886
// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2887
//
2888
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
2889
#define _mm_store1_ps _mm_store_ps1
2890
2891
// Stores the upper two single-precision, floating-point values of a to the
2892
// address p.
2893
//
2894
// *p0 := a2
2895
// *p1 := a3
2896
//
2897
// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2898
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2899
{
2900
*p = vreinterpret_m64_f32(vget_high_f32(a));
2901
}
2902
2903
// Stores the lower two single-precision floating point values of a to the
2904
// address p.
2905
//
2906
// *p0 := a0
2907
// *p1 := a1
2908
//
2909
// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2910
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2911
{
2912
*p = vreinterpret_m64_f32(vget_low_f32(a));
2913
}
2914
2915
// Store 4 single-precision (32-bit) floating-point elements from a into memory
2916
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2917
// general-protection exception may be generated.
2918
//
2919
// MEM[mem_addr+31:mem_addr] := a[127:96]
2920
// MEM[mem_addr+63:mem_addr+32] := a[95:64]
2921
// MEM[mem_addr+95:mem_addr+64] := a[63:32]
2922
// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2923
//
2924
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
2925
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2926
{
2927
float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2928
float32x4_t rev = vextq_f32(tmp, tmp, 2);
2929
vst1q_f32(p, rev);
2930
}
2931
2932
// Stores four single-precision, floating-point values.
2933
// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2934
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2935
{
2936
vst1q_f32(p, vreinterpretq_f32_m128(a));
2937
}
2938
2939
// Stores 16-bits of integer data a at the address p.
2940
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
2941
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2942
{
2943
vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2944
}
2945
2946
// Stores 64-bits of integer data a at the address p.
2947
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
2948
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2949
{
2950
vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2951
}
2952
2953
// Store 64-bits of integer data from a into memory using a non-temporal memory
2954
// hint.
2955
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
2956
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2957
{
2958
vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2959
}
2960
2961
// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2962
// point elements) from a into memory using a non-temporal memory hint.
2963
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
2964
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2965
{
2966
#if __has_builtin(__builtin_nontemporal_store)
2967
__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);
2968
#else
2969
vst1q_f32(p, vreinterpretq_f32_m128(a));
2970
#endif
2971
}
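
// Usage sketch (illustrative only): fills a 16-byte aligned buffer with a
// constant using the non-temporal store wrapper above; as the implementation
// shows, the hint may degrade to a plain store on NEON. The helper name is
// hypothetical and count must be a multiple of 4.
FORCE_INLINE void sse2neon_example_stream_fill(float *dst, float value, int count)
{
    __m128 v = _mm_set_ps1(value);
    for (int i = 0; i < count; i += 4)
        _mm_stream_ps(dst + i, v);
}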
2972
2973
// Subtracts the four single-precision, floating-point values of a and b.
2974
//
2975
// r0 := a0 - b0
2976
// r1 := a1 - b1
2977
// r2 := a2 - b2
2978
// r3 := a3 - b3
2979
//
2980
// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2981
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2982
{
2983
return vreinterpretq_m128_f32(
2984
vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2985
}
2986
2987
// Subtract the lower single-precision (32-bit) floating-point element in b from
2988
// the lower single-precision (32-bit) floating-point element in a, store the
2989
// result in the lower element of dst, and copy the upper 3 packed elements from
2990
// a to the upper elements of dst.
2991
//
2992
// dst[31:0] := a[31:0] - b[31:0]
2993
// dst[127:32] := a[127:32]
2994
//
2995
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
2996
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2997
{
2998
return _mm_move_ss(a, _mm_sub_ps(a, b));
2999
}
3000
3001
// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
3002
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
3003
// transposed matrix in these vectors (row0 now contains column 0, etc.).
3004
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
3005
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
3006
do { \
3007
float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
3008
float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
3009
row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
3010
vget_low_f32(ROW23.val[0])); \
3011
row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
3012
vget_low_f32(ROW23.val[1])); \
3013
row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
3014
vget_high_f32(ROW23.val[0])); \
3015
row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
3016
vget_high_f32(ROW23.val[1])); \
3017
} while (0)
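
// Usage sketch (illustrative only): transposes a row-major 4x4 float matrix in
// place with the macro above, using the load/store wrappers already defined in
// this file. The helper name sse2neon_example_transpose4x4 is hypothetical.
FORCE_INLINE void sse2neon_example_transpose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m + 0, r0); // row 0 now holds column 0, and so on
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}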
3018
3019
// According to the documentation, these intrinsics behave the same as the
3020
// non-'u' versions. We'll just alias them here.
3021
#define _mm_ucomieq_ss _mm_comieq_ss
3022
#define _mm_ucomige_ss _mm_comige_ss
3023
#define _mm_ucomigt_ss _mm_comigt_ss
3024
#define _mm_ucomile_ss _mm_comile_ss
3025
#define _mm_ucomilt_ss _mm_comilt_ss
3026
#define _mm_ucomineq_ss _mm_comineq_ss
3027
3028
// Return vector of type __m128i with undefined elements.
3029
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
3030
FORCE_INLINE __m128i _mm_undefined_si128(void)
3031
{
3032
#if defined(__GNUC__) || defined(__clang__)
3033
#pragma GCC diagnostic push
3034
#pragma GCC diagnostic ignored "-Wuninitialized"
3035
#endif
3036
__m128i a;
3037
return a;
3038
#if defined(__GNUC__) || defined(__clang__)
3039
#pragma GCC diagnostic pop
3040
#endif
3041
}
3042
3043
// Return vector of type __m128 with undefined elements.
3044
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
3045
FORCE_INLINE __m128 _mm_undefined_ps(void)
3046
{
3047
#if defined(__GNUC__) || defined(__clang__)
3048
#pragma GCC diagnostic push
3049
#pragma GCC diagnostic ignored "-Wuninitialized"
3050
#endif
3051
__m128 a;
3052
return a;
3053
#if defined(__GNUC__) || defined(__clang__)
3054
#pragma GCC diagnostic pop
3055
#endif
3056
}
3057
3058
// Selects and interleaves the upper two single-precision, floating-point values
3059
// from a and b.
3060
//
3061
// r0 := a2
3062
// r1 := b2
3063
// r2 := a3
3064
// r3 := b3
3065
//
3066
// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
3067
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
3068
{
3069
#if defined(__aarch64__)
3070
return vreinterpretq_m128_f32(
3071
vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3072
#else
3073
float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
3074
float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
3075
float32x2x2_t result = vzip_f32(a1, b1);
3076
return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
3077
#endif
3078
}
3079
3080
// Selects and interleaves the lower two single-precision, floating-point values
3081
// from a and b.
3082
//
3083
// r0 := a0
3084
// r1 := b0
3085
// r2 := a1
3086
// r3 := b1
3087
//
3088
// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
3089
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
3090
{
3091
#if defined(__aarch64__)
3092
return vreinterpretq_m128_f32(
3093
vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3094
#else
3095
float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
3096
float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
3097
float32x2x2_t result = vzip_f32(a1, b1);
3098
return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
3099
#endif
3100
}
3101
3102
// Computes bitwise EXOR (exclusive-or) of the four single-precision,
3103
// floating-point values of a and b.
3104
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
3105
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
3106
{
3107
return vreinterpretq_m128_s32(
3108
veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
3109
}
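
// Usage sketch (illustrative only): a common use of _mm_xor_ps is flipping the
// sign of every lane by XOR-ing with the bit pattern of -0.0f (sign bit only).
// The helper name sse2neon_example_negate4 is hypothetical.
FORCE_INLINE __m128 sse2neon_example_negate4(__m128 x)
{
    return _mm_xor_ps(x, _mm_set_ps1(-0.0f));
}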
3110
3111
/* SSE2 */
3112
3113
// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
3114
// unsigned 16-bit integers in b.
3115
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
3116
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
3117
{
3118
return vreinterpretq_m128i_s16(
3119
vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3120
}
3121
3122
// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
3123
// unsigned 32-bit integers in b.
3124
//
3125
// r0 := a0 + b0
3126
// r1 := a1 + b1
3127
// r2 := a2 + b2
3128
// r3 := a3 + b3
3129
//
3130
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
3131
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
3132
{
3133
return vreinterpretq_m128i_s32(
3134
vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3135
}
3136
3137
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
3138
// unsigned 64-bit integers in b.
3139
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
3140
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
3141
{
3142
return vreinterpretq_m128i_s64(
3143
vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
3144
}
3145
3146
// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3147
// unsigned 8-bit integers in b.
3148
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
3149
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
3150
{
3151
return vreinterpretq_m128i_s8(
3152
vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3153
}
3154
3155
// Add packed double-precision (64-bit) floating-point elements in a and b, and
3156
// store the results in dst.
3157
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
3158
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
3159
{
3160
#if defined(__aarch64__)
3161
return vreinterpretq_m128d_f64(
3162
vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3163
#else
3164
double *da = (double *) &a;
3165
double *db = (double *) &b;
3166
double c[2];
3167
c[0] = da[0] + db[0];
3168
c[1] = da[1] + db[1];
3169
return vld1q_f32((float32_t *) c);
3170
#endif
3171
}
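
// Usage sketch (illustrative only): adds two pairs of doubles and reduces the
// result, reading the lanes back the same way the fallback path above does.
// The helper name is hypothetical; it assumes _mm_set_pd is already declared
// at this point, as this file forward-declares it near the top.
FORCE_INLINE double sse2neon_example_sum_pairs(const double *x, const double *y)
{
    __m128d s = _mm_add_pd(_mm_set_pd(x[1], x[0]), _mm_set_pd(y[1], y[0]));
    return ((double *) &s)[0] + ((double *) &s)[1];
}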
3172
3173
// Add the lower double-precision (64-bit) floating-point element in a and b,
3174
// store the result in the lower element of dst, and copy the upper element from
3175
// a to the upper element of dst.
3176
//
3177
// dst[63:0] := a[63:0] + b[63:0]
3178
// dst[127:64] := a[127:64]
3179
//
3180
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
3181
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
3182
{
3183
#if defined(__aarch64__)
3184
return _mm_move_sd(a, _mm_add_pd(a, b));
3185
#else
3186
double *da = (double *) &a;
3187
double *db = (double *) &b;
3188
double c[2];
3189
c[0] = da[0] + db[0];
3190
c[1] = da[1];
3191
return vld1q_f32((float32_t *) c);
3192
#endif
3193
}
3194
3195
// Add 64-bit integers a and b, and store the result in dst.
3196
//
3197
// dst[63:0] := a[63:0] + b[63:0]
3198
//
3199
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
3200
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3201
{
3202
return vreinterpret_m64_s64(
3203
vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3204
}
3205
3206
// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3207
// and saturates.
3208
//
3209
// r0 := SignedSaturate(a0 + b0)
3210
// r1 := SignedSaturate(a1 + b1)
3211
// ...
3212
// r7 := SignedSaturate(a7 + b7)
3213
//
3214
// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3215
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3216
{
3217
return vreinterpretq_m128i_s16(
3218
vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3219
}
3220
3221
// Add packed signed 8-bit integers in a and b using saturation, and store the
3222
// results in dst.
3223
//
3224
// FOR j := 0 to 15
3225
// i := j*8
3226
// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3227
// ENDFOR
3228
//
3229
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
3230
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3231
{
3232
return vreinterpretq_m128i_s8(
3233
vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3234
}
3235
3236
// Add packed unsigned 16-bit integers in a and b using saturation, and store
3237
// the results in dst.
3238
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
3239
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3240
{
3241
return vreinterpretq_m128i_u16(
3242
vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3243
}
3244
3245
// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3246
// b and saturates.
3247
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3248
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3249
{
3250
return vreinterpretq_m128i_u8(
3251
vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3252
}
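
// Usage sketch (illustrative only): brightens 16 8-bit pixels without
// wrap-around by adding a constant with unsigned saturation. The helper name
// is hypothetical; vld1q_u8/vst1q_u8/vdupq_n_u8 come from <arm_neon.h>, which
// this header already relies on.
FORCE_INLINE void sse2neon_example_brighten16(uint8_t *px, uint8_t amount)
{
    __m128i p = vreinterpretq_m128i_u8(vld1q_u8(px));
    __m128i d = vreinterpretq_m128i_u8(vdupq_n_u8(amount));
    vst1q_u8(px, vreinterpretq_u8_m128i(_mm_adds_epu8(p, d)));
}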
3253
3254
// Compute the bitwise AND of packed double-precision (64-bit) floating-point
3255
// elements in a and b, and store the results in dst.
3256
//
3257
// FOR j := 0 to 1
3258
// i := j*64
3259
// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3260
// ENDFOR
3261
//
3262
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
3263
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3264
{
3265
return vreinterpretq_m128d_s64(
3266
vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3267
}
3268
3269
// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
3270
// b.
3271
//
3272
// r := a & b
3273
//
3274
// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3275
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3276
{
3277
return vreinterpretq_m128i_s32(
3278
vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3279
}
3280
3281
// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3282
// elements in a and then AND with b, and store the results in dst.
3283
//
3284
// FOR j := 0 to 1
3285
// i := j*64
3286
// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3287
// ENDFOR
3288
//
3289
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
3290
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3291
{
3292
// *NOTE* argument swap
3293
return vreinterpretq_m128d_s64(
3294
vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3295
}
3296
3297
// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3298
// 128-bit value in a.
3299
//
3300
// r := (~a) & b
3301
//
3302
// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3303
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3304
{
3305
return vreinterpretq_m128i_s32(
3306
vbicq_s32(vreinterpretq_s32_m128i(b),
3307
vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3308
}
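
// Usage sketch (illustrative only): bitwise select dst = (mask & a) | (~mask & b)
// built from _mm_and_si128 and _mm_andnot_si128. The OR is done with NEON's
// vorrq_s32 only because _mm_or_si128 appears later in this file. The helper
// name sse2neon_example_select is hypothetical.
FORCE_INLINE __m128i sse2neon_example_select(__m128i mask, __m128i a, __m128i b)
{
    __m128i a_part = _mm_and_si128(mask, a);
    __m128i b_part = _mm_andnot_si128(mask, b);
    return vreinterpretq_m128i_s32(vorrq_s32(vreinterpretq_s32_m128i(a_part),
                                             vreinterpretq_s32_m128i(b_part)));
}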
3309
3310
// Computes the average of the 8 unsigned 16-bit integers in a and the 8
3311
// unsigned 16-bit integers in b and rounds.
3312
//
3313
// r0 := (a0 + b0) / 2
3314
// r1 := (a1 + b1) / 2
3315
// ...
3316
// r7 := (a7 + b7) / 2
3317
//
3318
// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3319
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3320
{
3321
return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3322
vreinterpretq_u16_m128i(b));
3323
}
3324
3325
// Computes the average of the 16 unsigned 8-bit integers in a and the 16
3326
// unsigned 8-bit integers in b and rounds.
3327
//
3328
// r0 := (a0 + b0) / 2
3329
// r1 := (a1 + b1) / 2
3330
// ...
3331
// r15 := (a15 + b15) / 2
3332
//
3333
// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3334
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3335
{
3336
return vreinterpretq_m128i_u8(
3337
vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3338
}
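
// Usage sketch (illustrative only): averages two rows of 16 8-bit pixels,
// e.g. for a cheap 2x vertical downscale. Rounding is (a + b + 1) >> 1, as
// provided by _mm_avg_epu8. The helper name is hypothetical.
FORCE_INLINE void sse2neon_example_blend_rows(uint8_t *dst,
                                              const uint8_t *row0,
                                              const uint8_t *row1)
{
    __m128i r0 = vreinterpretq_m128i_u8(vld1q_u8(row0));
    __m128i r1 = vreinterpretq_m128i_u8(vld1q_u8(row1));
    vst1q_u8(dst, vreinterpretq_u8_m128i(_mm_avg_epu8(r0, r1)));
}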
3339
3340
// Shift a left by imm8 bytes while shifting in zeros, and store the results in
3341
// dst.
3342
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
3343
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3344
3345
// Shift a right by imm8 bytes while shifting in zeros, and store the results in
3346
// dst.
3347
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
3348
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3349
3350
// Cast vector of type __m128d to type __m128. This intrinsic is only used for
3351
// compilation and does not generate any instructions, thus it has zero latency.
3352
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
3353
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3354
{
3355
return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3356
}
3357
3358
// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3359
// compilation and does not generate any instructions, thus it has zero latency.
3360
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
3361
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3362
{
3363
return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3364
}
3365
3366
// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3367
// compilation and does not generate any instructions, thus it has zero latency.
3368
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
3369
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3370
{
3371
return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3372
}
3373
3374
// Applies a type cast to reinterpret four 32-bit floating point values passed
3375
// in as a 128-bit parameter as packed 32-bit integers.
3376
// https://msdn.microsoft.com/en-us/library/bb514099.aspx
3377
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3378
{
3379
return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3380
}
3381
3382
// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3383
// compilation and does not generate any instructions, thus it has zero latency.
3384
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
3385
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3386
{
3387
#if defined(__aarch64__)
3388
return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3389
#else
3390
return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3391
#endif
3392
}
3393
3394
// Applies a type cast to reinterpret four 32-bit integers passed in as a
3395
// 128-bit parameter as packed 32-bit floating point values.
3396
// https://msdn.microsoft.com/en-us/library/bb514029.aspx
3397
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3398
{
3399
return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3400
}
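
// Usage sketch (illustrative only): the cast intrinsics above only reinterpret
// bits, which enables integer tricks on float data. This hypothetical helper
// clears the sign bit of all four lanes (absolute value) by masking in the
// integer domain; the mask is built with NEON's vdupq_n_s32.
FORCE_INLINE __m128 sse2neon_example_fabs4(__m128 x)
{
    __m128i bits = _mm_castps_si128(x);
    __m128i mask = vreinterpretq_m128i_s32(vdupq_n_s32(0x7FFFFFFF));
    return _mm_castsi128_ps(_mm_and_si128(bits, mask));
}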
3401
3402
// Invalidate and flush the cache line that contains p from all levels of the
3403
// cache hierarchy.
3404
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
3405
#if defined(__APPLE__)
3406
#include <libkern/OSCacheControl.h>
3407
#endif
3408
FORCE_INLINE void _mm_clflush(void const *p)
3409
{
3410
(void) p;
3411
3412
/* sys_icache_invalidate is supported since macOS 10.5.
3413
* However, it does not work on non-jailbroken iOS devices, although the
3414
* compilation is successful.
3415
*/
3416
#if defined(__APPLE__)
3417
sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
3418
#elif defined(__GNUC__) || defined(__clang__)
3419
uintptr_t ptr = (uintptr_t) p;
3420
__builtin___clear_cache((char *) ptr,
3421
(char *) ptr + SSE2NEON_CACHELINE_SIZE);
3422
#else
3423
/* FIXME: MSVC support */
3424
#endif
3425
}
3426
3427
// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3428
// unsigned 16-bit integers in b for equality.
3429
// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3430
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3431
{
3432
return vreinterpretq_m128i_u16(
3433
vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3434
}
3435
3436
// Compare packed 32-bit integers in a and b for equality, and store the results
3437
// in dst.
3438
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3439
{
3440
return vreinterpretq_m128i_u32(
3441
vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3442
}
3443
3444
// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3445
// unsigned 8-bit integers in b for equality.
3446
// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3447
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3448
{
3449
return vreinterpretq_m128i_u8(
3450
vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3451
}
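
// Usage sketch (illustrative only): counts matching bytes between two 16-byte
// blocks. Each equal lane yields 0xFF, so shifting to 0x01 per lane and
// summing across lanes gives the count. The helper name is hypothetical.
FORCE_INLINE int sse2neon_example_count_equal_bytes(const uint8_t *p,
                                                    const uint8_t *q)
{
    __m128i a = vreinterpretq_m128i_u8(vld1q_u8(p));
    __m128i b = vreinterpretq_m128i_u8(vld1q_u8(q));
    uint8x16_t ones =
        vshrq_n_u8(vreinterpretq_u8_m128i(_mm_cmpeq_epi8(a, b)), 7);
#if defined(__aarch64__)
    return (int) vaddvq_u8(ones);
#else
    uint64x2_t sums = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(ones)));
    return (int) (vgetq_lane_u64(sums, 0) + vgetq_lane_u64(sums, 1));
#endif
}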
3452
3453
// Compare packed double-precision (64-bit) floating-point elements in a and b
3454
// for equality, and store the results in dst.
3455
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
3456
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3457
{
3458
#if defined(__aarch64__)
3459
return vreinterpretq_m128d_u64(
3460
vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3461
#else
3462
// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3463
uint32x4_t cmp =
3464
vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3465
uint32x4_t swapped = vrev64q_u32(cmp);
3466
return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3467
#endif
3468
}
3469
3470
// Compare the lower double-precision (64-bit) floating-point elements in a and
3471
// b for equality, store the result in the lower element of dst, and copy the
3472
// upper element from a to the upper element of dst.
3473
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
3474
FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3475
{
3476
return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3477
}
3478
3479
// Compare packed double-precision (64-bit) floating-point elements in a and b
3480
// for greater-than-or-equal, and store the results in dst.
3481
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
3482
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3483
{
3484
#if defined(__aarch64__)
3485
return vreinterpretq_m128d_u64(
3486
vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3487
#else
3488
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3489
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3490
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3491
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3492
uint64_t d[2];
3493
d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3494
d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3495
3496
return vreinterpretq_m128d_u64(vld1q_u64(d));
3497
#endif
3498
}
3499
3500
// Compare the lower double-precision (64-bit) floating-point elements in a and
3501
// b for greater-than-or-equal, store the result in the lower element of dst,
3502
// and copy the upper element from a to the upper element of dst.
3503
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
3504
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3505
{
3506
#if defined(__aarch64__)
3507
return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3508
#else
3509
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
3510
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3511
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3512
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3513
uint64_t d[2];
3514
d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3515
d[1] = a1;
3516
3517
return vreinterpretq_m128d_u64(vld1q_u64(d));
3518
#endif
3519
}
3520
3521
// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3522
// in b for greater than.
3523
//
3524
// r0 := (a0 > b0) ? 0xffff : 0x0
3525
// r1 := (a1 > b1) ? 0xffff : 0x0
3526
// ...
3527
// r7 := (a7 > b7) ? 0xffff : 0x0
3528
//
3529
// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3530
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3531
{
3532
return vreinterpretq_m128i_u16(
3533
vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3534
}
3535
3536
// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3537
// in b for greater than.
3538
// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3539
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3540
{
3541
return vreinterpretq_m128i_u32(
3542
vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3543
}
3544
3545
// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3546
// in b for greater than.
3547
//
3548
// r0 := (a0 > b0) ? 0xff : 0x0
3549
// r1 := (a1 > b1) ? 0xff : 0x0
3550
// ...
3551
// r15 := (a15 > b15) ? 0xff : 0x0
3552
//
3553
// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3554
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3555
{
3556
return vreinterpretq_m128i_u8(
3557
vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3558
}
3559
3560
// Compare packed double-precision (64-bit) floating-point elements in a and b
3561
// for greater-than, and store the results in dst.
3562
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
3563
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3564
{
3565
#if defined(__aarch64__)
3566
return vreinterpretq_m128d_u64(
3567
vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3568
#else
3569
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3570
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3571
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3572
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3573
uint64_t d[2];
3574
d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3575
d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3576
3577
return vreinterpretq_m128d_u64(vld1q_u64(d));
3578
#endif
3579
}
3580
3581
// Compare the lower double-precision (64-bit) floating-point elements in a and
3582
// b for greater-than, store the result in the lower element of dst, and copy
3583
// the upper element from a to the upper element of dst.
3584
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
3585
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3586
{
3587
#if defined(__aarch64__)
3588
return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3589
#else
3590
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
3591
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3592
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3593
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3594
uint64_t d[2];
3595
d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3596
d[1] = a1;
3597
3598
return vreinterpretq_m128d_u64(vld1q_u64(d));
3599
#endif
3600
}
3601
3602
// Compare packed double-precision (64-bit) floating-point elements in a and b
3603
// for less-than-or-equal, and store the results in dst.
3604
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
3605
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3606
{
3607
#if defined(__aarch64__)
3608
return vreinterpretq_m128d_u64(
3609
vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3610
#else
3611
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3612
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3613
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3614
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3615
uint64_t d[2];
3616
d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3617
d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3618
3619
return vreinterpretq_m128d_u64(vld1q_u64(d));
3620
#endif
3621
}
3622
3623
// Compare the lower double-precision (64-bit) floating-point elements in a and
3624
// b for less-than-or-equal, store the result in the lower element of dst, and
3625
// copy the upper element from a to the upper element of dst.
3626
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
3627
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3628
{
3629
#if defined(__aarch64__)
3630
return _mm_move_sd(a, _mm_cmple_pd(a, b));
3631
#else
3632
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
3633
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3634
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3635
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3636
uint64_t d[2];
3637
d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3638
d[1] = a1;
3639
3640
return vreinterpretq_m128d_u64(vld1q_u64(d));
3641
#endif
3642
}
3643
3644
// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3645
// in b for less than.
3646
//
3647
// r0 := (a0 < b0) ? 0xffff : 0x0
3648
// r1 := (a1 < b1) ? 0xffff : 0x0
3649
// ...
3650
// r7 := (a7 < b7) ? 0xffff : 0x0
3651
//
3652
// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3653
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3654
{
3655
return vreinterpretq_m128i_u16(
3656
vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3657
}
3658
3659
3660
// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3661
// in b for less than.
3662
// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3663
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3664
{
3665
return vreinterpretq_m128i_u32(
3666
vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3667
}
3668
3669
// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3670
// in b for less than.
3671
// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3672
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3673
{
3674
return vreinterpretq_m128i_u8(
3675
vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3676
}
3677
3678
// Compare packed double-precision (64-bit) floating-point elements in a and b
3679
// for less-than, and store the results in dst.
3680
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
3681
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3682
{
3683
#if defined(__aarch64__)
3684
return vreinterpretq_m128d_u64(
3685
vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3686
#else
3687
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3688
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3689
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3690
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3691
uint64_t d[2];
3692
d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3693
d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3694
3695
return vreinterpretq_m128d_u64(vld1q_u64(d));
3696
#endif
3697
}
3698
3699
// Compare the lower double-precision (64-bit) floating-point elements in a and
3700
// b for less-than, store the result in the lower element of dst, and copy the
3701
// upper element from a to the upper element of dst.
3702
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
3703
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3704
{
3705
#if defined(__aarch64__)
3706
return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3707
#else
3708
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3709
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3710
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3711
uint64_t d[2];
3712
d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3713
d[1] = a1;
3714
3715
return vreinterpretq_m128d_u64(vld1q_u64(d));
3716
#endif
3717
}
3718
3719
// Compare packed double-precision (64-bit) floating-point elements in a and b
3720
// for not-equal, and store the results in dst.
3721
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
3722
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3723
{
3724
#if defined(__aarch64__)
3725
return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3726
vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3727
#else
3728
// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3729
uint32x4_t cmp =
3730
vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3731
uint32x4_t swapped = vrev64q_u32(cmp);
3732
return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3733
#endif
3734
}
3735
3736
// Compare the lower double-precision (64-bit) floating-point elements in a and
3737
// b for not-equal, store the result in the lower element of dst, and copy the
3738
// upper element from a to the upper element of dst.
3739
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
3740
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3741
{
3742
return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3743
}
3744
3745
// Compare packed double-precision (64-bit) floating-point elements in a and b
3746
// for not-greater-than-or-equal, and store the results in dst.
3747
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
3748
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3749
{
3750
#if defined(__aarch64__)
3751
return vreinterpretq_m128d_u64(veorq_u64(
3752
vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3753
vdupq_n_u64(UINT64_MAX)));
3754
#else
3755
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3756
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3757
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3758
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3759
uint64_t d[2];
3760
d[0] =
3761
!((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3762
d[1] =
3763
!((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3764
3765
return vreinterpretq_m128d_u64(vld1q_u64(d));
3766
#endif
3767
}
3768
3769
// Compare the lower double-precision (64-bit) floating-point elements in a and
3770
// b for not-greater-than-or-equal, store the result in the lower element of
3771
// dst, and copy the upper element from a to the upper element of dst.
3772
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
3773
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3774
{
3775
return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3776
}
3777
3778
// Compare packed double-precision (64-bit) floating-point elements in a and b
3779
// for not-greater-than, and store the results in dst.
3780
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
3781
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3782
{
3783
#if defined(__aarch64__)
3784
return vreinterpretq_m128d_u64(veorq_u64(
3785
vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3786
vdupq_n_u64(UINT64_MAX)));
3787
#else
3788
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3789
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3790
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3791
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3792
uint64_t d[2];
3793
d[0] =
3794
!((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3795
d[1] =
3796
!((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3797
3798
return vreinterpretq_m128d_u64(vld1q_u64(d));
3799
#endif
3800
}
3801
3802
// Compare the lower double-precision (64-bit) floating-point elements in a and
3803
// b for not-greater-than, store the result in the lower element of dst, and
3804
// copy the upper element from a to the upper element of dst.
3805
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
3806
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3807
{
3808
return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3809
}
3810
3811
// Compare packed double-precision (64-bit) floating-point elements in a and b
3812
// for not-less-than-or-equal, and store the results in dst.
3813
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
3814
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3815
{
3816
#if defined(__aarch64__)
3817
return vreinterpretq_m128d_u64(veorq_u64(
3818
vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3819
vdupq_n_u64(UINT64_MAX)));
3820
#else
3821
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3822
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3823
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3824
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3825
uint64_t d[2];
3826
d[0] =
3827
!((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3828
d[1] =
3829
!((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3830
3831
return vreinterpretq_m128d_u64(vld1q_u64(d));
3832
#endif
3833
}
3834
3835
// Compare the lower double-precision (64-bit) floating-point elements in a and
3836
// b for not-less-than-or-equal, store the result in the lower element of dst,
3837
// and copy the upper element from a to the upper element of dst.
3838
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
3839
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3840
{
3841
return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3842
}
3843
3844
// Compare packed double-precision (64-bit) floating-point elements in a and b
3845
// for not-less-than, and store the results in dst.
3846
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
3847
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3848
{
3849
#if defined(__aarch64__)
3850
return vreinterpretq_m128d_u64(veorq_u64(
3851
vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3852
vdupq_n_u64(UINT64_MAX)));
3853
#else
3854
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3855
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3856
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3857
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3858
uint64_t d[2];
3859
d[0] =
3860
!((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3861
d[1] =
3862
!((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3863
3864
return vreinterpretq_m128d_u64(vld1q_u64(d));
3865
#endif
3866
}
3867
3868
// Compare the lower double-precision (64-bit) floating-point elements in a and
3869
// b for not-less-than, store the result in the lower element of dst, and copy
3870
// the upper element from a to the upper element of dst.
3871
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
3872
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3873
{
3874
return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3875
}
3876
3877
// Compare packed double-precision (64-bit) floating-point elements in a and b
3878
// to see if neither is NaN, and store the results in dst.
3879
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
3880
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3881
{
3882
#if defined(__aarch64__)
3883
// Excluding NaNs, any two floating point numbers can be compared.
3884
uint64x2_t not_nan_a =
3885
vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3886
uint64x2_t not_nan_b =
3887
vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3888
return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3889
#else
3890
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3891
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3892
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3893
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3894
uint64_t d[2];
3895
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3896
(*(double *) &b0) == (*(double *) &b0))
3897
? ~UINT64_C(0)
3898
: UINT64_C(0);
3899
d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3900
(*(double *) &b1) == (*(double *) &b1))
3901
? ~UINT64_C(0)
3902
: UINT64_C(0);
3903
3904
return vreinterpretq_m128d_u64(vld1q_u64(d));
3905
#endif
3906
}
3907
3908
// Compare the lower double-precision (64-bit) floating-point elements in a and
3909
// b to see if neither is NaN, store the result in the lower element of dst, and
3910
// copy the upper element from a to the upper element of dst.
3911
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
3912
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3913
{
3914
#if defined(__aarch64__)
3915
return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3916
#else
3917
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3918
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3919
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3920
uint64_t d[2];
3921
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3922
(*(double *) &b0) == (*(double *) &b0))
3923
? ~UINT64_C(0)
3924
: UINT64_C(0);
3925
d[1] = a1;
3926
3927
return vreinterpretq_m128d_u64(vld1q_u64(d));
3928
#endif
3929
}
3930
3931
// Compare packed double-precision (64-bit) floating-point elements in a and b
3932
// to see if either is NaN, and store the results in dst.
3933
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
3934
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3935
{
3936
#if defined(__aarch64__)
3937
// A NaN never compares equal to itself, so self-comparison detects NaN lanes.
3938
uint64x2_t not_nan_a =
3939
vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3940
uint64x2_t not_nan_b =
3941
vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3942
return vreinterpretq_m128d_s32(
3943
vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3944
#else
3945
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3946
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3947
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3948
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3949
uint64_t d[2];
3950
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3951
(*(double *) &b0) == (*(double *) &b0))
3952
? UINT64_C(0)
3953
: ~UINT64_C(0);
3954
d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3955
(*(double *) &b1) == (*(double *) &b1))
3956
? UINT64_C(0)
3957
: ~UINT64_C(0);
3958
3959
return vreinterpretq_m128d_u64(vld1q_u64(d));
3960
#endif
3961
}
3962
3963
// Compare the lower double-precision (64-bit) floating-point elements in a and
3964
// b to see if either is NaN, store the result in the lower element of dst, and
3965
// copy the upper element from a to the upper element of dst.
3966
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
3967
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3968
{
3969
#if defined(__aarch64__)
3970
return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3971
#else
3972
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3973
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3974
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3975
uint64_t d[2];
3976
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3977
(*(double *) &b0) == (*(double *) &b0))
3978
? UINT64_C(0)
3979
: ~UINT64_C(0);
3980
d[1] = a1;
3981
3982
return vreinterpretq_m128d_u64(vld1q_u64(d));
3983
#endif
3984
}
3985
3986
// Compare the lower double-precision (64-bit) floating-point element in a and b
3987
// for greater-than-or-equal, and return the boolean result (0 or 1).
3988
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
3989
FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3990
{
3991
#if defined(__aarch64__)
3992
return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3993
#else
3994
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3995
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3996
3997
return (*(double *) &a0 >= *(double *) &b0);
3998
#endif
3999
}
4000
4001
// Compare the lower double-precision (64-bit) floating-point element in a and b
4002
// for greater-than, and return the boolean result (0 or 1).
4003
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
4004
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
4005
{
4006
#if defined(__aarch64__)
4007
return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
4008
#else
4009
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4010
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4011
4012
return (*(double *) &a0 > *(double *) &b0);
4013
#endif
4014
}
4015
4016
// Compare the lower double-precision (64-bit) floating-point element in a and b
4017
// for less-than-or-equal, and return the boolean result (0 or 1).
4018
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
4019
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
4020
{
4021
#if defined(__aarch64__)
4022
return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
4023
#else
4024
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4025
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4026
4027
return (*(double *) &a0 <= *(double *) &b0);
4028
#endif
4029
}
4030
4031
// Compare the lower double-precision (64-bit) floating-point element in a and b
4032
// for less-than, and return the boolean result (0 or 1).
4033
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
4034
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
4035
{
4036
#if defined(__aarch64__)
4037
return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
4038
#else
4039
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4040
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4041
4042
return (*(double *) &a0 < *(double *) &b0);
4043
#endif
4044
}
4045
4046
// Compare the lower double-precision (64-bit) floating-point element in a and b
4047
// for equality, and return the boolean result (0 or 1).
4048
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
4049
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
4050
{
4051
#if defined(__aarch64__)
4052
return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
4053
#else
4054
uint32x4_t a_not_nan =
4055
vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
4056
uint32x4_t b_not_nan =
4057
vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
4058
uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4059
uint32x4_t a_eq_b =
4060
vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
4061
uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
4062
vreinterpretq_u64_u32(a_eq_b));
4063
return vgetq_lane_u64(and_results, 0) & 0x1;
4064
#endif
4065
}
4066
4067
// Compare the lower double-precision (64-bit) floating-point element in a and b
4068
// for not-equal, and return the boolean result (0 or 1).
4069
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
4070
FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
4071
{
4072
return !_mm_comieq_sd(a, b);
4073
}
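
// Usage sketch (illustrative only): the _mm_comi*_sd helpers above return
// plain ints, so they slot directly into scalar control flow. This
// hypothetical helper clamps the low lane of v to [lo, hi] while leaving the
// upper lane of v untouched (NaNs compare false and fall through unchanged).
FORCE_INLINE __m128d sse2neon_example_clamp_low(__m128d v, __m128d lo, __m128d hi)
{
    if (_mm_comilt_sd(v, lo))
        return _mm_move_sd(v, lo); // replace the low lane of v with lo[0]
    if (_mm_comigt_sd(v, hi))
        return _mm_move_sd(v, hi);
    return v;
}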
4074
4075
// Convert packed signed 32-bit integers in a to packed double-precision
4076
// (64-bit) floating-point elements, and store the results in dst.
4077
//
4078
// FOR j := 0 to 1
4079
// i := j*32
4080
// m := j*64
4081
// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
4082
// ENDFOR
4083
//
4084
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
4085
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
4086
{
4087
#if defined(__aarch64__)
4088
return vreinterpretq_m128d_f64(
4089
vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
4090
#else
4091
double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4092
double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
4093
return _mm_set_pd(a1, a0);
4094
#endif
4095
}
4096
4097
// Converts the four signed 32-bit integer values of a to single-precision,
4098
// floating-point values
4099
// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
4100
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4101
{
4102
return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4103
}
4104
4105
// Convert packed double-precision (64-bit) floating-point elements in a to
4106
// packed 32-bit integers, and store the results in dst.
4107
//
4108
// FOR j := 0 to 1
4109
// i := 32*j
4110
// k := 64*j
4111
// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
4112
// ENDFOR
4113
//
4114
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
4115
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
4116
{
4117
// vrnd32xq_f64 not supported on clang
4118
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
4119
float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
4120
int64x2_t integers = vcvtq_s64_f64(rounded);
4121
return vreinterpretq_m128i_s32(
4122
vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
4123
#else
4124
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4125
double d0 = ((double *) &rnd)[0];
4126
double d1 = ((double *) &rnd)[1];
4127
return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
4128
#endif
4129
}
4130
4131
// Convert packed double-precision (64-bit) floating-point elements in a to
4132
// packed 32-bit integers, and store the results in dst.
4133
//
4134
// FOR j := 0 to 1
4135
// i := 32*j
4136
// k := 64*j
4137
// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
4138
// ENDFOR
4139
//
4140
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
4141
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
4142
{
4143
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4144
double d0 = ((double *) &rnd)[0];
4145
double d1 = ((double *) &rnd)[1];
4146
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
4147
return vreinterpret_m64_s32(vld1_s32(data));
4148
}
4149
4150
// Convert packed double-precision (64-bit) floating-point elements in a to
4151
// packed single-precision (32-bit) floating-point elements, and store the
4152
// results in dst.
4153
//
4154
// FOR j := 0 to 1
4155
// i := 32*j
4156
// k := 64*j
4157
// dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
4158
// ENDFOR
4159
// dst[127:64] := 0
4160
//
4161
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
4162
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4163
{
4164
#if defined(__aarch64__)
4165
float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4166
return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4167
#else
4168
float a0 = (float) ((double *) &a)[0];
4169
float a1 = (float) ((double *) &a)[1];
4170
return _mm_set_ps(0, 0, a1, a0);
4171
#endif
4172
}
4173
4174
// Convert packed signed 32-bit integers in a to packed double-precision
4175
// (64-bit) floating-point elements, and store the results in dst.
4176
//
4177
// FOR j := 0 to 1
4178
// i := j*32
4179
// m := j*64
4180
// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
4181
// ENDFOR
4182
//
4183
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
4184
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
4185
{
4186
#if defined(__aarch64__)
4187
return vreinterpretq_m128d_f64(
4188
vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
4189
#else
4190
double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
4191
double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
4192
return _mm_set_pd(a1, a0);
4193
#endif
4194
}
4195
4196
// Converts the four single-precision, floating-point values of a to signed
4197
// 32-bit integer values.
4198
//
4199
// r0 := (int) a0
4200
// r1 := (int) a1
4201
// r2 := (int) a2
4202
// r3 := (int) a3
4203
//
4204
// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4205
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4206
// does not support! It is supported on ARMv8-A however.
4207
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4208
{
4209
#if defined(__ARM_FEATURE_FRINT)
4210
return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
4211
#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
4212
switch (_MM_GET_ROUNDING_MODE()) {
4213
case _MM_ROUND_NEAREST:
4214
return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4215
case _MM_ROUND_DOWN:
4216
return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
4217
case _MM_ROUND_UP:
4218
return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
4219
default: // _MM_ROUND_TOWARD_ZERO
4220
return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
4221
}
4222
#else
4223
float *f = (float *) &a;
4224
switch (_MM_GET_ROUNDING_MODE()) {
4225
case _MM_ROUND_NEAREST: {
4226
uint32x4_t signmask = vdupq_n_u32(0x80000000);
4227
float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4228
vdupq_n_f32(0.5f)); /* +/- 0.5 */
4229
int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4230
vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4231
int32x4_t r_trunc = vcvtq_s32_f32(
4232
vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4233
int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4234
vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4235
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4236
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4237
float32x4_t delta = vsubq_f32(
4238
vreinterpretq_f32_m128(a),
4239
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4240
uint32x4_t is_delta_half =
4241
vceqq_f32(delta, half); /* delta == +/- 0.5 */
4242
return vreinterpretq_m128i_s32(
4243
vbslq_s32(is_delta_half, r_even, r_normal));
4244
}
4245
case _MM_ROUND_DOWN:
4246
return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4247
floorf(f[0]));
4248
case _MM_ROUND_UP:
4249
return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
4250
ceilf(f[0]));
4251
default: // _MM_ROUND_TOWARD_ZERO
4252
return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4253
(int32_t) f[0]);
4254
}
4255
#endif
4256
}
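/* Illustrative sketch, not part of the original header: how the emulated
 * rounding mode changes the result of _mm_cvtps_epi32. Assumes the
 * _MM_SET_ROUNDING_MODE helper and _mm_set1_ps, as provided for SSE
 * elsewhere in this file.
 */
#if 0
static void sse2neon_example_cvtps_rounding(void)
{
    __m128 v = _mm_set1_ps(2.5f);
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i nearest = _mm_cvtps_epi32(v); /* 2 in every lane (round to even) */
    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    __m128i up = _mm_cvtps_epi32(v);      /* 3 in every lane */
    (void) nearest;
    (void) up;
}
#endif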
4257
4258
// Convert packed single-precision (32-bit) floating-point elements in a to
4259
// packed double-precision (64-bit) floating-point elements, and store the
4260
// results in dst.
4261
//
4262
// FOR j := 0 to 1
4263
// i := 64*j
4264
// k := 32*j
4265
// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4266
// ENDFOR
4267
//
4268
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
4269
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4270
{
4271
#if defined(__aarch64__)
4272
return vreinterpretq_m128d_f64(
4273
vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4274
#else
4275
double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4276
double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4277
return _mm_set_pd(a1, a0);
4278
#endif
4279
}
4280
4281
// Copy the lower double-precision (64-bit) floating-point element of a to dst.
4282
//
4283
// dst[63:0] := a[63:0]
4284
//
4285
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
4286
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4287
{
4288
#if defined(__aarch64__)
4289
return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4290
#else
4291
return ((double *) &a)[0];
4292
#endif
4293
}
4294
4295
// Convert the lower double-precision (64-bit) floating-point element in a to a
4296
// 32-bit integer, and store the result in dst.
4297
//
4298
// dst[31:0] := Convert_FP64_To_Int32(a[63:0])
4299
//
4300
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
4301
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4302
{
4303
#if defined(__aarch64__)
4304
return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4305
#else
4306
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4307
double ret = ((double *) &rnd)[0];
4308
return (int32_t) ret;
4309
#endif
4310
}
4311
4312
// Convert the lower double-precision (64-bit) floating-point element in a to a
4313
// 64-bit integer, and store the result in dst.
4314
//
4315
// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4316
//
4317
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
4318
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4319
{
4320
#if defined(__aarch64__)
4321
return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4322
#else
4323
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4324
double ret = ((double *) &rnd)[0];
4325
return (int64_t) ret;
4326
#endif
4327
}
4328
4329
// Convert the lower double-precision (64-bit) floating-point element in a to a
4330
// 64-bit integer, and store the result in dst.
4331
//
4332
// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4333
//
4334
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
4335
#define _mm_cvtsd_si64x _mm_cvtsd_si64
4336
4337
// Convert the lower double-precision (64-bit) floating-point element in b to a
4338
// single-precision (32-bit) floating-point element, store the result in the
4339
// lower element of dst, and copy the upper 3 packed elements from a to the
4340
// upper elements of dst.
4341
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4342
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4343
{
4344
#if defined(__aarch64__)
4345
return vreinterpretq_m128_f32(vsetq_lane_f32(
4346
vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4347
vreinterpretq_f32_m128(a), 0));
4348
#else
4349
return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4350
vreinterpretq_f32_m128(a), 0));
4351
#endif
4352
}
4353
4354
// Copy the lower 32-bit integer in a to dst.
4355
//
4356
// dst[31:0] := a[31:0]
4357
//
4358
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4359
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4360
{
4361
return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4362
}
4363
4364
// Copy the lower 64-bit integer in a to dst.
4365
//
4366
// dst[63:0] := a[63:0]
4367
//
4368
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4369
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4370
{
4371
return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4372
}
4373
4374
// Copy the lower 64-bit integer in a to dst.
4375
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4376
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4377
4378
// Convert the signed 32-bit integer b to a double-precision (64-bit)
4379
// floating-point element, store the result in the lower element of dst, and
4380
// copy the upper element from a to the upper element of dst.
4381
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4382
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4383
{
4384
#if defined(__aarch64__)
4385
return vreinterpretq_m128d_f64(
4386
vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4387
#else
4388
double bf = (double) b;
4389
return vreinterpretq_m128d_s64(
4390
vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4391
#endif
4392
}
4393
4394
// Copy the lower 64-bit integer in a to dst.
4395
//
4396
// dst[63:0] := a[63:0]
4397
//
4398
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4399
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4400
4401
// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4402
// zero extending the upper bits.
4403
//
4404
// r0 := a
4405
// r1 := 0x0
4406
// r2 := 0x0
4407
// r3 := 0x0
4408
//
4409
// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4410
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4411
{
4412
return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4413
}
4414
4415
// Convert the signed 64-bit integer b to a double-precision (64-bit)
4416
// floating-point element, store the result in the lower element of dst, and
4417
// copy the upper element from a to the upper element of dst.
4418
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4419
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4420
{
4421
#if defined(__aarch64__)
4422
return vreinterpretq_m128d_f64(
4423
vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4424
#else
4425
double bf = (double) b;
4426
return vreinterpretq_m128d_s64(
4427
vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4428
#endif
4429
}
4430
4431
// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4432
// zero extending the upper bits.
4433
//
4434
// r0 := a
4435
// r1 := 0x0
4436
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4437
{
4438
return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4439
}
4440
4441
// Copy 64-bit integer a to the lower element of dst, and zero the upper
4442
// element.
4443
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4444
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4445
4446
// Convert the signed 64-bit integer b to a double-precision (64-bit)
4447
// floating-point element, store the result in the lower element of dst, and
4448
// copy the upper element from a to the upper element of dst.
4449
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4450
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4451
4452
// Convert the lower single-precision (32-bit) floating-point element in b to a
4453
// double-precision (64-bit) floating-point element, store the result in the
4454
// lower element of dst, and copy the upper element from a to the upper element
4455
// of dst.
4456
//
4457
// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4458
// dst[127:64] := a[127:64]
4459
//
4460
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4461
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4462
{
4463
double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4464
#if defined(__aarch64__)
4465
return vreinterpretq_m128d_f64(
4466
vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4467
#else
4468
return vreinterpretq_m128d_s64(
4469
vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4470
#endif
4471
}
4472
4473
// Convert packed double-precision (64-bit) floating-point elements in a to
4474
// packed 32-bit integers with truncation, and store the results in dst.
4475
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4476
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4477
{
4478
double a0 = ((double *) &a)[0];
4479
double a1 = ((double *) &a)[1];
4480
return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4481
}
4482
4483
// Convert packed double-precision (64-bit) floating-point elements in a to
4484
// packed 32-bit integers with truncation, and store the results in dst.
4485
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4486
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4487
{
4488
double a0 = ((double *) &a)[0];
4489
double a1 = ((double *) &a)[1];
4490
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4491
return vreinterpret_m64_s32(vld1_s32(data));
4492
}
4493
4494
// Converts the four single-precision, floating-point values of a to signed
4495
// 32-bit integer values using truncate.
4496
// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4497
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4498
{
4499
return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4500
}
4501
4502
// Convert the lower double-precision (64-bit) floating-point element in a to a
4503
// 32-bit integer with truncation, and store the result in dst.
4504
//
4505
// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4506
//
4507
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4508
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4509
{
4510
double ret = *((double *) &a);
4511
return (int32_t) ret;
4512
}
4513
4514
// Convert the lower double-precision (64-bit) floating-point element in a to a
4515
// 64-bit integer with truncation, and store the result in dst.
4516
//
4517
// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4518
//
4519
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4520
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4521
{
4522
#if defined(__aarch64__)
4523
return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4524
#else
4525
double ret = *((double *) &a);
4526
return (int64_t) ret;
4527
#endif
4528
}
4529
4530
// Convert the lower double-precision (64-bit) floating-point element in a to a
4531
// 64-bit integer with truncation, and store the result in dst.
4532
//
4533
// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4534
//
4535
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4536
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4537
4538
// Divide packed double-precision (64-bit) floating-point elements in a by
4539
// packed elements in b, and store the results in dst.
4540
//
4541
// FOR j := 0 to 1
4542
// i := 64*j
4543
// dst[i+63:i] := a[i+63:i] / b[i+63:i]
4544
// ENDFOR
4545
//
4546
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4547
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4548
{
4549
#if defined(__aarch64__)
4550
return vreinterpretq_m128d_f64(
4551
vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4552
#else
4553
double *da = (double *) &a;
4554
double *db = (double *) &b;
4555
double c[2];
4556
c[0] = da[0] / db[0];
4557
c[1] = da[1] / db[1];
4558
return vld1q_f32((float32_t *) c);
4559
#endif
4560
}
4561
4562
// Divide the lower double-precision (64-bit) floating-point element in a by the
4563
// lower double-precision (64-bit) floating-point element in b, store the result
4564
// in the lower element of dst, and copy the upper element from a to the upper
4565
// element of dst.
4566
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4567
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4568
{
4569
#if defined(__aarch64__)
4570
float64x2_t tmp =
4571
vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4572
return vreinterpretq_m128d_f64(
4573
vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4574
#else
4575
return _mm_move_sd(a, _mm_div_pd(a, b));
4576
#endif
4577
}
4578
4579
// Extracts the selected signed or unsigned 16-bit integer from a and zero
4580
// extends.
4581
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
4582
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4583
#define _mm_extract_epi16(a, imm) \
4584
vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4585
4586
// Inserts the least significant 16 bits of b into the selected 16-bit integer
4587
// of a.
4588
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
4589
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4590
// __constrange(0,8) int imm)
4591
#define _mm_insert_epi16(a, b, imm) \
4592
__extension__({ \
4593
vreinterpretq_m128i_s16( \
4594
vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4595
})
4596
4597
// Loads two double-precision floating-point values from 16-byte aligned
4598
// memory.
4599
//
4600
// dst[127:0] := MEM[mem_addr+127:mem_addr]
4601
//
4602
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4603
FORCE_INLINE __m128d _mm_load_pd(const double *p)
4604
{
4605
#if defined(__aarch64__)
4606
return vreinterpretq_m128d_f64(vld1q_f64(p));
4607
#else
4608
const float *fp = (const float *) p;
4609
float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4610
return vreinterpretq_m128d_f32(vld1q_f32(data));
4611
#endif
4612
}
4613
4614
// Load a double-precision (64-bit) floating-point element from memory into both
4615
// elements of dst.
4616
//
4617
// dst[63:0] := MEM[mem_addr+63:mem_addr]
4618
// dst[127:64] := MEM[mem_addr+63:mem_addr]
4619
//
4620
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4621
#define _mm_load_pd1 _mm_load1_pd
4622
4623
// Load a double-precision (64-bit) floating-point element from memory into the
4624
// lower of dst, and zero the upper element. mem_addr does not need to be
4625
// aligned on any particular boundary.
4626
//
4627
// dst[63:0] := MEM[mem_addr+63:mem_addr]
4628
// dst[127:64] := 0
4629
//
4630
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4631
FORCE_INLINE __m128d _mm_load_sd(const double *p)
4632
{
4633
#if defined(__aarch64__)
4634
return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4635
#else
4636
const float *fp = (const float *) p;
4637
float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4638
return vreinterpretq_m128d_f32(vld1q_f32(data));
4639
#endif
4640
}
4641
4642
// Loads a 128-bit value.
4643
// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4644
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4645
{
4646
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4647
}
4648
4649
// Load a double-precision (64-bit) floating-point element from memory into both
4650
// elements of dst.
4651
//
4652
// dst[63:0] := MEM[mem_addr+63:mem_addr]
4653
// dst[127:64] := MEM[mem_addr+63:mem_addr]
4654
//
4655
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4656
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4657
{
4658
#if defined(__aarch64__)
4659
return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4660
#else
4661
return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4662
#endif
4663
}
4664
4665
// Load a double-precision (64-bit) floating-point element from memory into the
4666
// upper element of dst, and copy the lower element from a to dst. mem_addr does
4667
// not need to be aligned on any particular boundary.
4668
//
4669
// dst[63:0] := a[63:0]
4670
// dst[127:64] := MEM[mem_addr+63:mem_addr]
4671
//
4672
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4673
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4674
{
4675
#if defined(__aarch64__)
4676
return vreinterpretq_m128d_f64(
4677
vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4678
#else
4679
return vreinterpretq_m128d_f32(vcombine_f32(
4680
vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4681
#endif
4682
}
4683
4684
// Load 64-bit integer from memory into the first element of dst.
4685
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4686
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4687
{
4688
/* Load the lower 64 bits of the value pointed to by p into the
4689
* lower 64 bits of the result, zeroing the upper 64 bits of the result.
4690
*/
4691
return vreinterpretq_m128i_s32(
4692
vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4693
}
4694
4695
// Load a double-precision (64-bit) floating-point element from memory into the
4696
// lower element of dst, and copy the upper element from a to dst. mem_addr does
4697
// not need to be aligned on any particular boundary.
4698
//
4699
// dst[63:0] := MEM[mem_addr+63:mem_addr]
4700
// dst[127:64] := a[127:64]
4701
//
4702
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4703
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4704
{
4705
#if defined(__aarch64__)
4706
return vreinterpretq_m128d_f64(
4707
vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4708
#else
4709
return vreinterpretq_m128d_f32(
4710
vcombine_f32(vld1_f32((const float *) p),
4711
vget_high_f32(vreinterpretq_f32_m128d(a))));
4712
#endif
4713
}
4714
4715
// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4716
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4717
// general-protection exception may be generated.
4718
//
4719
// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4720
// dst[127:64] := MEM[mem_addr+63:mem_addr]
4721
//
4722
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4723
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4724
{
4725
#if defined(__aarch64__)
4726
float64x2_t v = vld1q_f64(p);
4727
return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4728
#else
4729
int64x2_t v = vld1q_s64((const int64_t *) p);
4730
return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4731
#endif
4732
}
4733
4734
// Loads two double-precision floating-point values from unaligned memory.
4735
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4736
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4737
{
4738
return _mm_load_pd(p);
4739
}
4740
4741
// Loads a 128-bit value.
4742
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4743
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4744
{
4745
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4746
}
4747
4748
// Load unaligned 32-bit integer from memory into the first element of dst.
4749
//
4750
// dst[31:0] := MEM[mem_addr+31:mem_addr]
4751
// dst[MAX:32] := 0
4752
//
4753
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4754
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4755
{
4756
return vreinterpretq_m128i_s32(
4757
vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4758
}
4759
4760
// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4761
// integers from b.
4762
//
4763
// r0 := (a0 * b0) + (a1 * b1)
4764
// r1 := (a2 * b2) + (a3 * b3)
4765
// r2 := (a4 * b4) + (a5 * b5)
4766
// r3 := (a6 * b6) + (a7 * b7)
4767
// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4768
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4769
{
4770
int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4771
vget_low_s16(vreinterpretq_s16_m128i(b)));
4772
#if defined(__aarch64__)
4773
int32x4_t high =
4774
vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4775
4776
return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4777
#else
4778
int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4779
vget_high_s16(vreinterpretq_s16_m128i(b)));
4780
4781
int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4782
int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4783
4784
return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4785
#endif
4786
}
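/* Illustrative sketch, not part of the original header: the pairwise
 * multiply-accumulate performed by _mm_madd_epi16, spelled out for the
 * lowest 32-bit result lane. Assumes _mm_set_epi16 defined elsewhere in
 * this file.
 */
#if 0
static void sse2neon_example_madd_epi16(void)
{
    __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
    __m128i b = _mm_set_epi16(1, 1, 1, 1, 10, 10, 10, 10);
    /* Lowest 32-bit lane of the result: a0*b0 + a1*b1 = 1*10 + 2*10 = 30. */
    __m128i sums = _mm_madd_epi16(a, b);
    (void) sums;
}
#endif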
4787
4788
// Conditionally store 8-bit integer elements from a into memory using mask
4789
// (elements are not stored when the highest bit is not set in the corresponding
4790
// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4791
// on any particular boundary.
4792
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4793
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4794
{
4795
int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4796
__m128 b = _mm_load_ps((const float *) mem_addr);
4797
int8x16_t masked =
4798
vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4799
vreinterpretq_s8_m128(b));
4800
vst1q_s8((int8_t *) mem_addr, masked);
4801
}
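/* Illustrative sketch, not part of the original header: only bytes whose
 * mask element has its most significant bit set are overwritten in memory.
 * Assumes _mm_set1_epi8 and _mm_setzero_si128 defined elsewhere in this
 * file.
 */
#if 0
static void sse2neon_example_maskmoveu(void)
{
    char buf[16] = {0};
    __m128i data = _mm_set1_epi8(0x7f);
    __m128i none = _mm_setzero_si128();              /* no MSB set anywhere */
    _mm_maskmoveu_si128(data, none, buf);            /* values in buf stay 0 */
    __m128i all = _mm_set1_epi8((signed char) 0x80); /* MSB set in every byte */
    _mm_maskmoveu_si128(data, all, buf);             /* every byte becomes 0x7f */
}
#endif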
4802
4803
// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4804
// signed 16-bit integers from b.
4805
// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4806
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4807
{
4808
return vreinterpretq_m128i_s16(
4809
vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4810
}
4811
4812
// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4813
// 16 unsigned 8-bit integers from b.
4814
// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4815
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4816
{
4817
return vreinterpretq_m128i_u8(
4818
vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4819
}
4820
4821
// Compare packed double-precision (64-bit) floating-point elements in a and b,
4822
// and store packed maximum values in dst.
4823
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4824
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4825
{
4826
#if defined(__aarch64__)
4827
#if SSE2NEON_PRECISE_MINMAX
4828
float64x2_t _a = vreinterpretq_f64_m128d(a);
4829
float64x2_t _b = vreinterpretq_f64_m128d(b);
4830
return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4831
#else
4832
return vreinterpretq_m128d_f64(
4833
vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4834
#endif
4835
#else
4836
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4837
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4838
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4839
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4840
uint64_t d[2];
4841
d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4842
d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4843
4844
return vreinterpretq_m128d_u64(vld1q_u64(d));
4845
#endif
4846
}
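/* Illustrative sketch, not part of the original header: why
 * SSE2NEON_PRECISE_MINMAX exists. When the first operand is NaN, x86 MAXPD
 * returns the second operand, whereas Arm's FMAX propagates the NaN.
 * Assumes _mm_set1_pd and _mm_cvtsd_f64 defined elsewhere in this file.
 */
#if 0
static double sse2neon_example_max_pd_nan(void)
{
    __m128d a = _mm_set1_pd(0.0 / 0.0); /* NaN */
    __m128d b = _mm_set1_pd(1.0);
    /* 1.0 when built with SSE2NEON_PRECISE_MINMAX (matches SSE);
     * may be NaN with the default, faster vmaxq_f64 path. */
    return _mm_cvtsd_f64(_mm_max_pd(a, b));
}
#endif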
4847
4848
// Compare the lower double-precision (64-bit) floating-point elements in a and
4849
// b, store the maximum value in the lower element of dst, and copy the upper
4850
// element from a to the upper element of dst.
4851
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4852
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4853
{
4854
#if defined(__aarch64__)
4855
return _mm_move_sd(a, _mm_max_pd(a, b));
4856
#else
4857
double *da = (double *) &a;
4858
double *db = (double *) &b;
4859
double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4860
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4861
#endif
4862
}
4863
4864
// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4865
// signed 16-bit integers from b.
4866
// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4867
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4868
{
4869
return vreinterpretq_m128i_s16(
4870
vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4871
}
4872
4873
// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4874
// 16 unsigned 8-bit integers from b.
4875
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4876
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4877
{
4878
return vreinterpretq_m128i_u8(
4879
vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4880
}
4881
4882
// Compare packed double-precision (64-bit) floating-point elements in a and b,
4883
// and store packed minimum values in dst.
4884
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4885
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4886
{
4887
#if defined(__aarch64__)
4888
#if SSE2NEON_PRECISE_MINMAX
4889
float64x2_t _a = vreinterpretq_f64_m128d(a);
4890
float64x2_t _b = vreinterpretq_f64_m128d(b);
4891
return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4892
#else
4893
return vreinterpretq_m128d_f64(
4894
vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4895
#endif
4896
#else
4897
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4898
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4899
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4900
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4901
uint64_t d[2];
4902
d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4903
d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4904
return vreinterpretq_m128d_u64(vld1q_u64(d));
4905
#endif
4906
}
4907
4908
// Compare the lower double-precision (64-bit) floating-point elements in a and
4909
// b, store the minimum value in the lower element of dst, and copy the upper
4910
// element from a to the upper element of dst.
4911
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4912
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4913
{
4914
#if defined(__aarch64__)
4915
return _mm_move_sd(a, _mm_min_pd(a, b));
4916
#else
4917
double *da = (double *) &a;
4918
double *db = (double *) &b;
4919
double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4920
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4921
#endif
4922
}
4923
4924
// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4925
// upper element.
4926
//
4927
// dst[63:0] := a[63:0]
4928
// dst[127:64] := 0
4929
//
4930
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4931
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4932
{
4933
return vreinterpretq_m128i_s64(
4934
vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4935
}
4936
4937
// Move the lower double-precision (64-bit) floating-point element from b to the
4938
// lower element of dst, and copy the upper element from a to the upper element
4939
// of dst.
4940
//
4941
// dst[63:0] := b[63:0]
4942
// dst[127:64] := a[127:64]
4943
//
4944
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4945
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4946
{
4947
return vreinterpretq_m128d_f32(
4948
vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4949
vget_high_f32(vreinterpretq_f32_m128d(a))));
4950
}
4951
4952
// NEON does not provide a version of this function.
4953
// Creates a 16-bit mask from the most significant bits of the 16 signed or
4954
// unsigned 8-bit integers in a and zero extends the upper bits.
4955
// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4956
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4957
{
4958
// Use increasingly wide shifts+adds to collect the sign bits
4959
// together.
4960
// Since the widening shifts would be rather confusing to follow in little
4961
// endian, everything will be illustrated in big endian order instead. This
4962
// has a different result - the bits would actually be reversed on a big
4963
// endian machine.
4964
4965
// Starting input (only half the elements are shown):
4966
// 89 ff 1d c0 00 10 99 33
4967
uint8x16_t input = vreinterpretq_u8_m128i(a);
4968
4969
// Shift out everything but the sign bits with an unsigned shift right.
4970
//
4971
// Bytes of the vector:
4972
// 89 ff 1d c0 00 10 99 33
4973
// \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4974
// | | | | | | | |
4975
// 01 01 00 01 00 00 01 00
4976
//
4977
// Bits of first important lane(s):
4978
// 10001001 (89)
4979
// \______
4980
// |
4981
// 00000001 (01)
4982
uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4983
4984
// Merge the even lanes together with a 16-bit unsigned shift right + add.
4985
// 'xx' represents garbage data which will be ignored in the final result.
4986
// In the important bytes, the add functions like a binary OR.
4987
//
4988
// 01 01 00 01 00 00 01 00
4989
// \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4990
// \| \| \| \|
4991
// xx 03 xx 01 xx 00 xx 02
4992
//
4993
// 00000001 00000001 (01 01)
4994
// \_______ |
4995
// \|
4996
// xxxxxxxx xxxxxx11 (xx 03)
4997
uint32x4_t paired16 =
4998
vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4999
5000
// Repeat with a wider 32-bit shift + add.
5001
// xx 03 xx 01 xx 00 xx 02
5002
// \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
5003
// 14))
5004
// \| \|
5005
// xx xx xx 0d xx xx xx 02
5006
//
5007
// 00000011 00000001 (03 01)
5008
// \\_____ ||
5009
// '----.\||
5010
// xxxxxxxx xxxx1101 (xx 0d)
5011
uint64x2_t paired32 =
5012
vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
5013
5014
// Last, an even wider 64-bit shift + add to get our result in the low 8 bit
5015
// lanes. xx xx xx 0d xx xx xx 02
5016
// \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
5017
// 28))
5018
// \|
5019
// xx xx xx xx xx xx xx d2
5020
//
5021
// 00001101 00000010 (0d 02)
5022
// \ \___ | |
5023
// '---. \| |
5024
// xxxxxxxx 11010010 (xx d2)
5025
uint8x16_t paired64 =
5026
vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
5027
5028
// Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
5029
// xx xx xx xx xx xx xx d2
5030
// || return paired64[0]
5031
// d2
5032
// Note: Little endian would return the correct value 4b (01001011) instead.
5033
return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
5034
}
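/* Illustrative sketch, not part of the original header: the scalar
 * definition that the shift+add sequence above reproduces, i.e. bit i of
 * the result is the sign bit of byte i.
 */
#if 0
static int sse2neon_example_movemask_epi8_scalar(const uint8_t bytes[16])
{
    int mask = 0;
    for (int i = 0; i < 16; i++)
        mask |= (bytes[i] >> 7) << i;
    return mask;
}
#endif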
5035
5036
// Set each bit of mask dst based on the most significant bit of the
5037
// corresponding packed double-precision (64-bit) floating-point element in a.
5038
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
5039
FORCE_INLINE int _mm_movemask_pd(__m128d a)
5040
{
5041
uint64x2_t input = vreinterpretq_u64_m128d(a);
5042
uint64x2_t high_bits = vshrq_n_u64(input, 63);
5043
return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
5044
}
5045
5046
// Copy the lower 64-bit integer in a to dst.
5047
//
5048
// dst[63:0] := a[63:0]
5049
//
5050
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
5051
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
5052
{
5053
return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
5054
}
5055
5056
// Copy the 64-bit integer a to the lower element of dst, and zero the upper
5057
// element.
5058
//
5059
// dst[63:0] := a[63:0]
5060
// dst[127:64] := 0
5061
//
5062
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
5063
FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
5064
{
5065
return vreinterpretq_m128i_s64(
5066
vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
5067
}
5068
5069
// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
5070
// a and b, and store the unsigned 64-bit results in dst.
5071
//
5072
// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
5073
// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
5074
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
5075
{
5076
// vmull_u32 upcasts instead of masking, so we downcast.
5077
uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
5078
uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
5079
return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
5080
}
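/* Illustrative sketch, not part of the original header: _mm_mul_epu32 only
 * multiplies the even (low) 32-bit lane of each 64-bit element. Assumes
 * _mm_set_epi32 defined elsewhere in this file.
 */
#if 0
static void sse2neon_example_mul_epu32(void)
{
    __m128i a = _mm_set_epi32(9, 3, 9, 2); /* lanes 3..0 */
    __m128i b = _mm_set_epi32(9, 5, 9, 7);
    /* Result: low 64-bit element = 2 * 7 = 14, high = 3 * 5 = 15;
     * the odd lanes (the 9s) are ignored. */
    __m128i prod = _mm_mul_epu32(a, b);
    (void) prod;
}
#endif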
5081
5082
// Multiply packed double-precision (64-bit) floating-point elements in a and b,
5083
// and store the results in dst.
5084
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
5085
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
5086
{
5087
#if defined(__aarch64__)
5088
return vreinterpretq_m128d_f64(
5089
vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5090
#else
5091
double *da = (double *) &a;
5092
double *db = (double *) &b;
5093
double c[2];
5094
c[0] = da[0] * db[0];
5095
c[1] = da[1] * db[1];
5096
return vld1q_f32((float32_t *) c);
5097
#endif
5098
}
5099
5100
// Multiply the lower double-precision (64-bit) floating-point element in a and
5101
// b, store the result in the lower element of dst, and copy the upper element
5102
// from a to the upper element of dst.
5103
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
5104
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
5105
{
5106
return _mm_move_sd(a, _mm_mul_pd(a, b));
5107
}
5108
5109
// Multiply the low unsigned 32-bit integers from a and b, and store the
5110
// unsigned 64-bit result in dst.
5111
//
5112
// dst[63:0] := a[31:0] * b[31:0]
5113
//
5114
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
5115
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
5116
{
5117
return vreinterpret_m64_u64(vget_low_u64(
5118
vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
5119
}
5120
5121
// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
5122
// integers from b.
5123
//
5124
// r0 := (a0 * b0)[31:16]
5125
// r1 := (a1 * b1)[31:16]
5126
// ...
5127
// r7 := (a7 * b7)[31:16]
5128
//
5129
// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
5130
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
5131
{
5132
/* FIXME: issue with large values because of result saturation */
5133
// int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
5134
// vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
5135
// vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
5136
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
5137
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
5138
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
5139
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
5140
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
5141
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
5142
uint16x8x2_t r =
5143
vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
5144
return vreinterpretq_m128i_u16(r.val[1]);
5145
}
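/* Illustrative sketch, not part of the original header: _mm_mulhi_epi16
 * keeps the upper halves of the 32-bit products. Assumes _mm_set1_epi16
 * defined elsewhere in this file.
 */
#if 0
static void sse2neon_example_mulhi_epi16(void)
{
    __m128i a = _mm_set1_epi16(20000);
    __m128i b = _mm_set1_epi16(20000);
    /* 20000 * 20000 = 400000000 = 0x17D78400, so every lane of the result
     * holds the high half 0x17D7 (6103). */
    __m128i hi = _mm_mulhi_epi16(a, b);
    (void) hi;
}
#endif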
5146
5147
// Multiply the packed unsigned 16-bit integers in a and b, producing
5148
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
5149
// integers in dst.
5150
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
5151
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
5152
{
5153
uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
5154
uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
5155
uint32x4_t ab3210 = vmull_u16(a3210, b3210);
5156
#if defined(__aarch64__)
5157
uint32x4_t ab7654 =
5158
vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
5159
uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
5160
vreinterpretq_u16_u32(ab7654));
5161
return vreinterpretq_m128i_u16(r);
5162
#else
5163
uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
5164
uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
5165
uint32x4_t ab7654 = vmull_u16(a7654, b7654);
5166
uint16x8x2_t r =
5167
vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
5168
return vreinterpretq_m128i_u16(r.val[1]);
5169
#endif
5170
}
5171
5172
// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
5173
// unsigned 16-bit integers from b.
5174
//
5175
// r0 := (a0 * b0)[15:0]
5176
// r1 := (a1 * b1)[15:0]
5177
// ...
5178
// r7 := (a7 * b7)[15:0]
5179
//
5180
// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
5181
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
5182
{
5183
return vreinterpretq_m128i_s16(
5184
vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5185
}
5186
5187
// Compute the bitwise OR of packed double-precision (64-bit) floating-point
5188
// elements in a and b, and store the results in dst.
5189
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
5190
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
5191
{
5192
return vreinterpretq_m128d_s64(
5193
vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5194
}
5195
5196
// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
5197
//
5198
// r := a | b
5199
//
5200
// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
5201
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
5202
{
5203
return vreinterpretq_m128i_s32(
5204
vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5205
}
5206
5207
// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5208
// saturates.
5209
// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
5210
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5211
{
5212
return vreinterpretq_m128i_s8(
5213
vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5214
vqmovn_s16(vreinterpretq_s16_m128i(b))));
5215
}
5216
5217
// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5218
// and saturates.
5219
//
5220
// r0 := SignedSaturate(a0)
5221
// r1 := SignedSaturate(a1)
5222
// r2 := SignedSaturate(a2)
5223
// r3 := SignedSaturate(a3)
5224
// r4 := SignedSaturate(b0)
5225
// r5 := SignedSaturate(b1)
5226
// r6 := SignedSaturate(b2)
5227
// r7 := SignedSaturate(b3)
5228
//
5229
// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5230
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5231
{
5232
return vreinterpretq_m128i_s16(
5233
vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5234
vqmovn_s32(vreinterpretq_s32_m128i(b))));
5235
}
5236
5237
// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5238
// integers and saturates.
5239
//
5240
// r0 := UnsignedSaturate(a0)
5241
// r1 := UnsignedSaturate(a1)
5242
// ...
5243
// r7 := UnsignedSaturate(a7)
5244
// r8 := UnsignedSaturate(b0)
5245
// r9 := UnsignedSaturate(b1)
5246
// ...
5247
// r15 := UnsignedSaturate(b7)
5248
//
5249
// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5250
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5251
{
5252
return vreinterpretq_m128i_u8(
5253
vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5254
vqmovun_s16(vreinterpretq_s16_m128i(b))));
5255
}
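/* Illustrative sketch, not part of the original header: packing with
 * unsigned saturation clamps out-of-range 16-bit inputs to 0 or 255.
 * Assumes _mm_set1_epi16 defined elsewhere in this file.
 */
#if 0
static void sse2neon_example_packus_epi16(void)
{
    __m128i big = _mm_set1_epi16(300); /* > 255, saturates to 255 */
    __m128i neg = _mm_set1_epi16(-5);  /* < 0,   saturates to 0   */
    __m128i packed = _mm_packus_epi16(big, neg);
    /* Low 8 bytes of packed are 0xFF, high 8 bytes are 0x00. */
    (void) packed;
}
#endif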
5256
5257
// Pause the processor. This is typically used in spin-wait loops and depending
5258
// on the x86 processor, typical values are in the 40-100 cycle range. The
5259
// 'yield' instruction isn't a good fit because it's effectively a nop on most
5260
// Arm cores. Experience with several databases has shown an 'isb' is
5261
// a reasonable approximation.
5262
FORCE_INLINE void _mm_pause()
5263
{
5264
__asm__ __volatile__("isb\n");
5265
}
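/* Illustrative sketch, not part of the original header: the typical
 * spin-wait pattern _mm_pause is intended for. 'flag' is a hypothetical
 * variable set by another thread.
 */
#if 0
static void sse2neon_example_spin_wait(volatile int *flag)
{
    while (!*flag)
        _mm_pause(); /* back off briefly instead of hammering the load */
}
#endif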
5266
5267
// Compute the absolute differences of packed unsigned 8-bit integers in a and
5268
// b, then horizontally sum each consecutive 8 differences to produce two
5269
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
5270
// 16 bits of 64-bit elements in dst.
5271
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
5272
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
5273
{
5274
uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5275
return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
5276
}
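/* Illustrative sketch, not part of the original header: the scalar
 * computation _mm_sad_epu8 performs for each group of 8 bytes.
 */
#if 0
static uint16_t sse2neon_example_sad_u8_half(const uint8_t *a, const uint8_t *b)
{
    uint16_t sum = 0;
    for (int i = 0; i < 8; i++)
        sum += (uint16_t) (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
    return sum; /* lands in the low 16 bits of one 64-bit result element */
}
#endif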
5277
5278
// Sets the 8 signed 16-bit integer values.
5279
// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
5280
FORCE_INLINE __m128i _mm_set_epi16(short i7,
5281
short i6,
5282
short i5,
5283
short i4,
5284
short i3,
5285
short i2,
5286
short i1,
5287
short i0)
5288
{
5289
int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
5290
return vreinterpretq_m128i_s16(vld1q_s16(data));
5291
}
5292
5293
// Sets the 4 signed 32-bit integer values.
5294
// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
5295
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
5296
{
5297
int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
5298
return vreinterpretq_m128i_s32(vld1q_s32(data));
5299
}
5300
5301
// Returns the __m128i structure with its two 64-bit integer values
5302
// initialized to the values of the two 64-bit integers passed in.
5303
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5304
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
5305
{
5306
return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
5307
}
5308
5309
// Returns the __m128i structure with its two 64-bit integer values
5310
// initialized to the values of the two 64-bit integers passed in.
5311
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5312
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
5313
{
5314
return vreinterpretq_m128i_s64(
5315
vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5316
}
5317
5318
// Sets the 16 signed 8-bit integer values.
5319
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
5320
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
5321
signed char b14,
5322
signed char b13,
5323
signed char b12,
5324
signed char b11,
5325
signed char b10,
5326
signed char b9,
5327
signed char b8,
5328
signed char b7,
5329
signed char b6,
5330
signed char b5,
5331
signed char b4,
5332
signed char b3,
5333
signed char b2,
5334
signed char b1,
5335
signed char b0)
5336
{
5337
int8_t ALIGN_STRUCT(16)
5338
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5339
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5340
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5341
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5342
return (__m128i) vld1q_s8(data);
5343
}
5344
5345
// Set packed double-precision (64-bit) floating-point elements in dst with the
5346
// supplied values.
5347
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
5348
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
5349
{
5350
double ALIGN_STRUCT(16) data[2] = {e0, e1};
5351
#if defined(__aarch64__)
5352
return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5353
#else
5354
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
5355
#endif
5356
}
5357
5358
// Broadcast double-precision (64-bit) floating-point value a to all elements of
5359
// dst.
5360
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
5361
#define _mm_set_pd1 _mm_set1_pd
5362
5363
// Copy double-precision (64-bit) floating-point element a to the lower element
5364
// of dst, and zero the upper element.
5365
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
5366
FORCE_INLINE __m128d _mm_set_sd(double a)
5367
{
5368
#if defined(__aarch64__)
5369
return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
5370
#else
5371
return _mm_set_pd(0, a);
5372
#endif
5373
}
5374
5375
// Sets the 8 signed 16-bit integer values to w.
5376
//
5377
// r0 := w
5378
// r1 := w
5379
// ...
5380
// r7 := w
5381
//
5382
// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
5383
FORCE_INLINE __m128i _mm_set1_epi16(short w)
5384
{
5385
return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5386
}
5387
5388
// Sets the 4 signed 32-bit integer values to i.
5389
//
5390
// r0 := i
5391
// r1 := i
5392
// r2 := i
5393
// r3 := i
5394
//
5395
// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
5396
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5397
{
5398
return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5399
}
5400
5401
// Sets the 2 signed 64-bit integer values to i.
5402
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
5403
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5404
{
5405
return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5406
}
5407
5408
// Sets the 2 signed 64-bit integer values to i.
5409
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
5410
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5411
{
5412
return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5413
}
5414
5415
// Sets the 16 signed 8-bit integer values to b.
5416
//
5417
// r0 := b
5418
// r1 := b
5419
// ...
5420
// r15 := b
5421
//
5422
// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
5423
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5424
{
5425
return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5426
}
5427
5428
// Broadcast double-precision (64-bit) floating-point value a to all elements of
5429
// dst.
5430
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
5431
FORCE_INLINE __m128d _mm_set1_pd(double d)
5432
{
5433
#if defined(__aarch64__)
5434
return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5435
#else
5436
return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
5437
#endif
5438
}
5439
5440
// Sets the 8 signed 16-bit integer values in reverse order.
5441
//
5442
// Return Value
5443
// r0 := w0
5444
// r1 := w1
5445
// ...
5446
// r7 := w7
5447
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
5448
short w1,
5449
short w2,
5450
short w3,
5451
short w4,
5452
short w5,
5453
short w6,
5454
short w7)
5455
{
5456
int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5457
return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5458
}
5459
5460
// Sets the 4 signed 32-bit integer values in reverse order
5461
// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5462
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5463
{
5464
int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5465
return vreinterpretq_m128i_s32(vld1q_s32(data));
5466
}
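/* Illustrative sketch, not part of the original header: _mm_set_epi32 and
 * _mm_setr_epi32 take the same arguments in opposite lane order.
 */
#if 0
static void sse2neon_example_set_vs_setr(void)
{
    __m128i x = _mm_set_epi32(3, 2, 1, 0);  /* lane 0 == 0, lane 3 == 3 */
    __m128i y = _mm_setr_epi32(0, 1, 2, 3); /* same vector as x */
    (void) x;
    (void) y;
}
#endif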
5467
5468
// Set packed 64-bit integers in dst with the supplied values in reverse order.
5469
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
5470
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5471
{
5472
return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5473
}
5474
5475
// Sets the 16 signed 8-bit integer values in reverse order.
5476
// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5477
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5478
signed char b1,
5479
signed char b2,
5480
signed char b3,
5481
signed char b4,
5482
signed char b5,
5483
signed char b6,
5484
signed char b7,
5485
signed char b8,
5486
signed char b9,
5487
signed char b10,
5488
signed char b11,
5489
signed char b12,
5490
signed char b13,
5491
signed char b14,
5492
signed char b15)
5493
{
5494
int8_t ALIGN_STRUCT(16)
5495
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5496
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5497
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5498
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5499
return (__m128i) vld1q_s8(data);
5500
}
5501
5502
// Set packed double-precision (64-bit) floating-point elements in dst with the
5503
// supplied values in reverse order.
5504
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5505
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5506
{
5507
return _mm_set_pd(e0, e1);
5508
}
5509
5510
// Return vector of type __m128d with all elements set to zero.
5511
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5512
FORCE_INLINE __m128d _mm_setzero_pd(void)
5513
{
5514
#if defined(__aarch64__)
5515
return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5516
#else
5517
return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5518
#endif
5519
}
5520
5521
// Sets the 128-bit value to zero
5522
// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5523
FORCE_INLINE __m128i _mm_setzero_si128(void)
5524
{
5525
return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5526
}
5527
5528
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
5529
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
5530
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5531
// __constrange(0,255) int imm)
5532
#ifdef _sse2neon_shuffle
5533
#define _mm_shuffle_epi32(a, imm) \
5534
__extension__({ \
5535
int32x4_t _input = vreinterpretq_s32_m128i(a); \
5536
int32x4_t _shuf = \
5537
vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5538
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5539
vreinterpretq_m128i_s32(_shuf); \
5540
})
5541
#else // generic
5542
#define _mm_shuffle_epi32(a, imm) \
5543
__extension__({ \
5544
__m128i ret; \
5545
switch (imm) { \
5546
case _MM_SHUFFLE(1, 0, 3, 2): \
5547
ret = _mm_shuffle_epi_1032((a)); \
5548
break; \
5549
case _MM_SHUFFLE(2, 3, 0, 1): \
5550
ret = _mm_shuffle_epi_2301((a)); \
5551
break; \
5552
case _MM_SHUFFLE(0, 3, 2, 1): \
5553
ret = _mm_shuffle_epi_0321((a)); \
5554
break; \
5555
case _MM_SHUFFLE(2, 1, 0, 3): \
5556
ret = _mm_shuffle_epi_2103((a)); \
5557
break; \
5558
case _MM_SHUFFLE(1, 0, 1, 0): \
5559
ret = _mm_shuffle_epi_1010((a)); \
5560
break; \
5561
case _MM_SHUFFLE(1, 0, 0, 1): \
5562
ret = _mm_shuffle_epi_1001((a)); \
5563
break; \
5564
case _MM_SHUFFLE(0, 1, 0, 1): \
5565
ret = _mm_shuffle_epi_0101((a)); \
5566
break; \
5567
case _MM_SHUFFLE(2, 2, 1, 1): \
5568
ret = _mm_shuffle_epi_2211((a)); \
5569
break; \
5570
case _MM_SHUFFLE(0, 1, 2, 2): \
5571
ret = _mm_shuffle_epi_0122((a)); \
5572
break; \
5573
case _MM_SHUFFLE(3, 3, 3, 2): \
5574
ret = _mm_shuffle_epi_3332((a)); \
5575
break; \
5576
case _MM_SHUFFLE(0, 0, 0, 0): \
5577
ret = _mm_shuffle_epi32_splat((a), 0); \
5578
break; \
5579
case _MM_SHUFFLE(1, 1, 1, 1): \
5580
ret = _mm_shuffle_epi32_splat((a), 1); \
5581
break; \
5582
case _MM_SHUFFLE(2, 2, 2, 2): \
5583
ret = _mm_shuffle_epi32_splat((a), 2); \
5584
break; \
5585
case _MM_SHUFFLE(3, 3, 3, 3): \
5586
ret = _mm_shuffle_epi32_splat((a), 3); \
5587
break; \
5588
default: \
5589
ret = _mm_shuffle_epi32_default((a), (imm)); \
5590
break; \
5591
} \
5592
ret; \
5593
})
5594
#endif
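/* Illustrative sketch, not part of the original header: selecting lanes
 * with _MM_SHUFFLE, which packs four 2-bit source-lane indices with the
 * highest destination lane first. Assumes _mm_set_epi32 defined elsewhere
 * in this file.
 */
#if 0
static void sse2neon_example_shuffle_epi32(void)
{
    __m128i v = _mm_set_epi32(33, 22, 11, 0); /* lanes 3..0 */
    /* Reverse the lanes: result lanes 3..0 become 0, 11, 22, 33. */
    __m128i rev = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
    /* Broadcast lane 2 into every lane (22, 22, 22, 22). */
    __m128i splat = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 2, 2, 2));
    (void) rev;
    (void) splat;
}
#endif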
5595
5596
// Shuffle double-precision (64-bit) floating-point elements using the control
5597
// in imm8, and store the results in dst.
5598
//
5599
// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
5600
// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
5601
//
5602
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
5603
#ifdef _sse2neon_shuffle
5604
#define _mm_shuffle_pd(a, b, imm8) \
5605
vreinterpretq_m128d_s64( \
5606
vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5607
imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
5608
#else
5609
#define _mm_shuffle_pd(a, b, imm8) \
5610
_mm_castsi128_pd(_mm_set_epi64x( \
5611
vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5612
vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5613
#endif
5614
5615
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5616
// __constrange(0,255) int imm)
5617
#ifdef _sse2neon_shuffle
5618
#define _mm_shufflehi_epi16(a, imm) \
5619
__extension__({ \
5620
int16x8_t _input = vreinterpretq_s16_m128i(a); \
5621
int16x8_t _shuf = \
5622
vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5623
(((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5624
(((imm) >> 6) & 0x3) + 4); \
5625
vreinterpretq_m128i_s16(_shuf); \
5626
})
5627
#else // generic
5628
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5629
#endif
5630
5631
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5632
// __constrange(0,255) int imm)
5633
#ifdef _sse2neon_shuffle
5634
#define _mm_shufflelo_epi16(a, imm) \
5635
__extension__({ \
5636
int16x8_t _input = vreinterpretq_s16_m128i(a); \
5637
int16x8_t _shuf = vshuffleq_s16( \
5638
_input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5639
(((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5640
vreinterpretq_m128i_s16(_shuf); \
5641
})
5642
#else // generic
5643
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5644
#endif
5645
5646
// Shift packed 16-bit integers in a left by count while shifting in zeros, and
5647
// store the results in dst.
5648
//
5649
// FOR j := 0 to 7
5650
// i := j*16
5651
// IF count[63:0] > 15
5652
// dst[i+15:i] := 0
5653
// ELSE
5654
// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
5655
// FI
5656
// ENDFOR
5657
//
5658
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
5659
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5660
{
5661
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5662
if (_sse2neon_unlikely(c & ~15))
5663
return _mm_setzero_si128();
5664
5665
int16x8_t vc = vdupq_n_s16((int16_t) c);
5666
return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5667
}
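/* Illustrative sketch, not part of the original header: unlike
 * _mm_slli_epi16, the shift count for _mm_sll_epi16 comes from the low
 * 64 bits of a vector, and counts above 15 zero the result. Assumes
 * _mm_set1_epi16 and _mm_cvtsi32_si128 defined elsewhere in this file.
 */
#if 0
static void sse2neon_example_sll_epi16(void)
{
    __m128i v = _mm_set1_epi16(1);
    __m128i by3 = _mm_sll_epi16(v, _mm_cvtsi32_si128(3));   /* every lane: 8 */
    __m128i by99 = _mm_sll_epi16(v, _mm_cvtsi32_si128(99)); /* every lane: 0 */
    (void) by3;
    (void) by99;
}
#endif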
5668
5669
// Shift packed 32-bit integers in a left by count while shifting in zeros, and
5670
// store the results in dst.
5671
//
5672
// FOR j := 0 to 3
5673
// i := j*32
5674
// IF count[63:0] > 31
5675
// dst[i+31:i] := 0
5676
// ELSE
5677
// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
5678
// FI
5679
// ENDFOR
5680
//
5681
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
5682
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5683
{
5684
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5685
if (_sse2neon_unlikely(c & ~31))
5686
return _mm_setzero_si128();
5687
5688
int32x4_t vc = vdupq_n_s32((int32_t) c);
5689
return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5690
}
5691
5692
// Shift packed 64-bit integers in a left by count while shifting in zeros, and
5693
// store the results in dst.
5694
//
5695
// FOR j := 0 to 1
5696
// i := j*64
5697
// IF count[63:0] > 63
5698
// dst[i+63:i] := 0
5699
// ELSE
5700
// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
5701
// FI
5702
// ENDFOR
5703
//
5704
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
5705
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5706
{
5707
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5708
if (_sse2neon_unlikely(c & ~63))
5709
return _mm_setzero_si128();
5710
5711
int64x2_t vc = vdupq_n_s64((int64_t) c);
5712
return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5713
}
5714
5715
// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5716
// store the results in dst.
5717
//
5718
// FOR j := 0 to 7
5719
// i := j*16
5720
// IF imm8[7:0] > 15
5721
// dst[i+15:i] := 0
5722
// ELSE
5723
// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
5724
// FI
5725
// ENDFOR
5726
//
5727
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
5728
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5729
{
5730
if (_sse2neon_unlikely(imm & ~15))
5731
return _mm_setzero_si128();
5732
return vreinterpretq_m128i_s16(
5733
vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5734
}
5735
5736
// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5737
// store the results in dst.
5738
//
5739
// FOR j := 0 to 3
5740
// i := j*32
5741
// IF imm8[7:0] > 31
5742
// dst[i+31:i] := 0
5743
// ELSE
5744
// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
5745
// FI
5746
// ENDFOR
5747
//
5748
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
5749
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5750
{
5751
if (_sse2neon_unlikely(imm & ~31))
5752
return _mm_setzero_si128();
5753
return vreinterpretq_m128i_s32(
5754
vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5755
}
5756
5757
// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5758
// store the results in dst.
5759
//
5760
// FOR j := 0 to 1
5761
// i := j*64
5762
// IF imm8[7:0] > 63
5763
// dst[i+63:i] := 0
5764
// ELSE
5765
// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
5766
// FI
5767
// ENDFOR
5768
//
5769
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
5770
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5771
{
5772
if (_sse2neon_unlikely(imm & ~63))
5773
return _mm_setzero_si128();
5774
return vreinterpretq_m128i_s64(
5775
vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5776
}
5777
5778
// Shift a left by imm8 bytes while shifting in zeros, and store the results in
5779
// dst.
5780
//
5781
// tmp := imm8[7:0]
5782
// IF tmp > 15
5783
// tmp := 16
5784
// FI
5785
// dst[127:0] := a[127:0] << (tmp*8)
5786
//
5787
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
5788
#define _mm_slli_si128(a, imm) \
5789
__extension__({ \
5790
int8x16_t ret; \
5791
if (_sse2neon_unlikely(imm == 0)) \
5792
ret = vreinterpretq_s8_m128i(a); \
5793
else if (_sse2neon_unlikely((imm) & ~15)) \
5794
ret = vdupq_n_s8(0); \
5795
else \
5796
ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), \
5797
((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
5798
vreinterpretq_m128i_s8(ret); \
5799
})
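// Example (illustrative sketch): a whole-register byte shift toward the more
// significant lanes.
//
//   __m128i x = _mm_set_epi32(3, 2, 1, 0);   // lanes {0, 1, 2, 3}
//   __m128i r = _mm_slli_si128(x, 4);        // lanes {0, 0, 1, 2}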
5800
5801
// Compute the square root of packed double-precision (64-bit) floating-point
5802
// elements in a, and store the results in dst.
5803
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
5804
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5805
{
5806
#if defined(__aarch64__)
5807
return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5808
#else
5809
double a0 = sqrt(((double *) &a)[0]);
5810
double a1 = sqrt(((double *) &a)[1]);
5811
return _mm_set_pd(a1, a0);
5812
#endif
5813
}
5814
5815
// Compute the square root of the lower double-precision (64-bit) floating-point
5816
// element in b, store the result in the lower element of dst, and copy the
5817
// upper element from a to the upper element of dst.
5818
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
5819
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5820
{
5821
#if defined(__aarch64__)
5822
return _mm_move_sd(a, _mm_sqrt_pd(b));
5823
#else
5824
return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5825
#endif
5826
}
5827
5828
// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5829
// and store the results in dst.
5830
//
5831
// FOR j := 0 to 7
5832
// i := j*16
5833
// IF count[63:0] > 15
5834
// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5835
// ELSE
5836
// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
5837
// FI
5838
// ENDFOR
5839
//
5840
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
5841
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5842
{
5843
int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5844
if (_sse2neon_unlikely(c & ~15))
5845
return _mm_cmplt_epi16(a, _mm_setzero_si128());
5846
return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5847
}
5848
5849
// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5850
// and store the results in dst.
5851
//
5852
// FOR j := 0 to 3
5853
// i := j*32
5854
// IF count[63:0] > 31
5855
// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5856
// ELSE
5857
// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
5858
// FI
5859
// ENDFOR
5860
//
5861
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
5862
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5863
{
5864
int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5865
if (_sse2neon_unlikely(c & ~31))
5866
return _mm_cmplt_epi32(a, _mm_setzero_si128());
5867
return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5868
}
5869
5870
// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5871
// bits, and store the results in dst.
5872
//
5873
// FOR j := 0 to 7
5874
// i := j*16
5875
// IF imm8[7:0] > 15
5876
// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5877
// ELSE
5878
// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
5879
// FI
5880
// ENDFOR
5881
//
5882
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
5883
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5884
{
5885
const int count = (imm & ~15) ? 15 : imm;
5886
return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5887
}
5888
5889
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5890
// and store the results in dst.
5891
//
5892
// FOR j := 0 to 3
5893
// i := j*32
5894
// IF imm8[7:0] > 31
5895
// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5896
// ELSE
5897
// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
5898
// FI
5899
// ENDFOR
5900
//
5901
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
5902
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5903
#define _mm_srai_epi32(a, imm) \
5904
__extension__({ \
5905
__m128i ret; \
5906
if (_sse2neon_unlikely((imm) == 0)) { \
5907
ret = a; \
5908
} else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5909
ret = vreinterpretq_m128i_s32( \
5910
vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
5911
} else { \
5912
ret = vreinterpretq_m128i_s32( \
5913
vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5914
} \
5915
ret; \
5916
})
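// Example (illustrative sketch): the arithmetic shift keeps the sign bit.
//
//   __m128i v = _mm_set1_epi32(-8);
//   __m128i r = _mm_srai_epi32(v, 2);        // every 32-bit lane = -8 >> 2 = -2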
5917
5918
// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5919
// store the results in dst.
5920
//
5921
// FOR j := 0 to 7
5922
// i := j*16
5923
// IF count[63:0] > 15
5924
// dst[i+15:i] := 0
5925
// ELSE
5926
// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
5927
// FI
5928
// ENDFOR
5929
//
5930
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
5931
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5932
{
5933
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5934
if (_sse2neon_unlikely(c & ~15))
5935
return _mm_setzero_si128();
5936
5937
int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5938
return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5939
}
5940
5941
// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5942
// store the results in dst.
5943
//
5944
// FOR j := 0 to 3
5945
// i := j*32
5946
// IF count[63:0] > 31
5947
// dst[i+31:i] := 0
5948
// ELSE
5949
// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
5950
// FI
5951
// ENDFOR
5952
//
5953
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
5954
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5955
{
5956
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5957
if (_sse2neon_unlikely(c & ~31))
5958
return _mm_setzero_si128();
5959
5960
int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5961
return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5962
}
5963
5964
// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5965
// store the results in dst.
5966
//
5967
// FOR j := 0 to 1
5968
// i := j*64
5969
// IF count[63:0] > 63
5970
// dst[i+63:i] := 0
5971
// ELSE
5972
// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
5973
// FI
5974
// ENDFOR
5975
//
5976
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
5977
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5978
{
5979
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5980
if (_sse2neon_unlikely(c & ~63))
5981
return _mm_setzero_si128();
5982
5983
int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5984
return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5985
}
5986
5987
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5988
// store the results in dst.
5989
//
5990
// FOR j := 0 to 7
5991
// i := j*16
5992
// IF imm8[7:0] > 15
5993
// dst[i+15:i] := 0
5994
// ELSE
5995
// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
5996
// FI
5997
// ENDFOR
5998
//
5999
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
6000
#define _mm_srli_epi16(a, imm) \
6001
__extension__({ \
6002
__m128i ret; \
6003
if (_sse2neon_unlikely((imm) & ~15)) { \
6004
ret = _mm_setzero_si128(); \
6005
} else { \
6006
ret = vreinterpretq_m128i_u16( \
6007
vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
6008
} \
6009
ret; \
6010
})
6011
6012
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
6013
// store the results in dst.
6014
//
6015
// FOR j := 0 to 3
6016
// i := j*32
6017
// IF imm8[7:0] > 31
6018
// dst[i+31:i] := 0
6019
// ELSE
6020
// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
6021
// FI
6022
// ENDFOR
6023
//
6024
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
6025
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
6026
#define _mm_srli_epi32(a, imm) \
6027
__extension__({ \
6028
__m128i ret; \
6029
if (_sse2neon_unlikely((imm) & ~31)) { \
6030
ret = _mm_setzero_si128(); \
6031
} else { \
6032
ret = vreinterpretq_m128i_u32( \
6033
vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
6034
} \
6035
ret; \
6036
})
6037
6038
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
6039
// store the results in dst.
6040
//
6041
// FOR j := 0 to 1
6042
// i := j*64
6043
// IF imm8[7:0] > 63
6044
// dst[i+63:i] := 0
6045
// ELSE
6046
// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
6047
// FI
6048
// ENDFOR
6049
//
6050
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
6051
#define _mm_srli_epi64(a, imm) \
6052
__extension__({ \
6053
__m128i ret; \
6054
if (_sse2neon_unlikely((imm) & ~63)) { \
6055
ret = _mm_setzero_si128(); \
6056
} else { \
6057
ret = vreinterpretq_m128i_u64( \
6058
vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
6059
} \
6060
ret; \
6061
})
6062
6063
// Shift a right by imm8 bytes while shifting in zeros, and store the results in
6064
// dst.
6065
//
6066
// tmp := imm8[7:0]
6067
// IF tmp > 15
6068
// tmp := 16
6069
// FI
6070
// dst[127:0] := a[127:0] >> (tmp*8)
6071
//
6072
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
6073
#define _mm_srli_si128(a, imm) \
6074
__extension__({ \
6075
int8x16_t ret; \
6076
if (_sse2neon_unlikely((imm) & ~15)) \
6077
ret = vdupq_n_s8(0); \
6078
else \
6079
ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
6080
(imm > 15 ? 0 : imm)); \
6081
vreinterpretq_m128i_s8(ret); \
6082
})
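// Example (illustrative sketch): a whole-register byte shift toward the less
// significant lanes, shifting in zeros.
//
//   __m128i x = _mm_set_epi32(3, 2, 1, 0);   // lanes {0, 1, 2, 3}
//   __m128i r = _mm_srli_si128(x, 4);        // lanes {1, 2, 3, 0}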
6083
6084
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6085
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
6086
// or a general-protection exception may be generated.
6087
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
6088
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
6089
{
6090
#if defined(__aarch64__)
6091
vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
6092
#else
6093
vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
6094
#endif
6095
}
6096
6097
// Store the lower double-precision (64-bit) floating-point element from a into
6098
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
6099
// boundary or a general-protection exception may be generated.
6100
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
6101
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
6102
{
6103
#if defined(__aarch64__)
6104
float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
6105
vst1q_f64((float64_t *) mem_addr,
6106
vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
6107
#else
6108
float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
6109
vst1q_f32((float32_t *) mem_addr,
6110
vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
6111
#endif
6112
}
6113
6114
// Store the lower double-precision (64-bit) floating-point element from a into
6115
// memory. mem_addr does not need to be aligned on any particular boundary.
6116
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
6117
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
6118
{
6119
#if defined(__aarch64__)
6120
vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
6121
#else
6122
vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
6123
#endif
6124
}
6125
6126
// Stores four 32-bit integer values (as a __m128i value) at the address p.
6127
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
6128
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
6129
{
6130
vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
6131
}
6132
6133
// Store the lower double-precision (64-bit) floating-point element from a into
6134
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
6135
// boundary or a general-protection exception may be generated.
6136
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
6137
#define _mm_store1_pd _mm_store_pd1
6138
6139
// Store the upper double-precision (64-bit) floating-point element from a into
6140
// memory.
6141
//
6142
// MEM[mem_addr+63:mem_addr] := a[127:64]
6143
//
6144
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
6145
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
6146
{
6147
#if defined(__aarch64__)
6148
vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
6149
#else
6150
vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
6151
#endif
6152
}
6153
6154
// Stores the lower 64 bits of b to the memory location pointed to by a; the
// upper 64 bits at that location are left unchanged.
6155
// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
6156
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
6157
{
6158
vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
6159
}
6160
6161
// Store the lower double-precision (64-bit) floating-point element from a into
6162
// memory.
6163
//
6164
// MEM[mem_addr+63:mem_addr] := a[63:0]
6165
//
6166
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
6167
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
6168
{
6169
#if defined(__aarch64__)
6170
vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
6171
#else
6172
vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
6173
#endif
6174
}
6175
6176
// Store 2 double-precision (64-bit) floating-point elements from a into memory
6177
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
6178
// general-protection exception may be generated.
6179
//
6180
// MEM[mem_addr+63:mem_addr] := a[127:64]
6181
// MEM[mem_addr+127:mem_addr+64] := a[63:0]
6182
//
6183
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
6184
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
6185
{
6186
float32x4_t f = vreinterpretq_f32_m128d(a);
6187
_mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
6188
}
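// Example (illustrative sketch, assuming a 16-byte aligned destination):
//
//   double ALIGN_STRUCT(16) buf[2];
//   _mm_storer_pd(buf, _mm_set_pd(2.0, 1.0));   // buf[0] = 2.0, buf[1] = 1.0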
6189
6190
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6191
// elements) from a into memory. mem_addr does not need to be aligned on any
6192
// particular boundary.
6193
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
6194
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
6195
{
6196
_mm_store_pd(mem_addr, a);
6197
}
6198
6199
// Stores 128-bits of integer data a at the address p.
6200
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
6201
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
6202
{
6203
vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
6204
}
6205
6206
// Stores 32-bits of integer data a at the address p.
6207
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
6208
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
6209
{
6210
vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
6211
}
6212
6213
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6214
// elements) from a into memory using a non-temporal memory hint. mem_addr must
6215
// be aligned on a 16-byte boundary or a general-protection exception may be
6216
// generated.
6217
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
6218
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
6219
{
6220
#if __has_builtin(__builtin_nontemporal_store)
6221
__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);
6222
#elif defined(__aarch64__)
6223
vst1q_f64(p, vreinterpretq_f64_m128d(a));
6224
#else
6225
vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
6226
#endif
6227
}
6228
6229
// Stores the data in a to the address p without polluting the caches. If the
6230
// cache line containing address p is already in the cache, the cache will be
6231
// updated.
6232
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
6233
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6234
{
6235
#if __has_builtin(__builtin_nontemporal_store)
6236
__builtin_nontemporal_store(a, p);
6237
#else
6238
vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6239
#endif
6240
}
6241
6242
// Store 32-bit integer a into memory using a non-temporal hint to minimize
6243
// cache pollution. If the cache line containing address mem_addr is already in
6244
// the cache, the cache will be updated.
6245
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
6246
FORCE_INLINE void _mm_stream_si32(int *p, int a)
6247
{
6248
vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
6249
}
6250
6251
// Store 64-bit integer a into memory using a non-temporal hint to minimize
6252
// cache pollution. If the cache line containing address mem_addr is already in
6253
// the cache, the cache will be updated.
6254
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
6255
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
6256
{
6257
vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6258
}
6259
6260
// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
6261
// store the results in dst.
6262
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
6263
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
6264
{
6265
return vreinterpretq_m128i_s16(
6266
vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6267
}
6268
6269
// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
6270
// unsigned 32-bit integers of a.
6271
//
6272
// r0 := a0 - b0
6273
// r1 := a1 - b1
6274
// r2 := a2 - b2
6275
// r3 := a3 - b3
6276
//
6277
// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
6278
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
6279
{
6280
return vreinterpretq_m128i_s32(
6281
vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6282
}
6283
6284
// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
6285
// and store the results in dst.
6286
// r0 := a0 - b0
6287
// r1 := a1 - b1
6288
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
6289
{
6290
return vreinterpretq_m128i_s64(
6291
vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
6292
}
6293
6294
// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
6295
// store the results in dst.
6296
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
6297
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
6298
{
6299
return vreinterpretq_m128i_s8(
6300
vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6301
}
6302
6303
// Subtract packed double-precision (64-bit) floating-point elements in b from
6304
// packed double-precision (64-bit) floating-point elements in a, and store the
6305
// results in dst.
6306
//
6307
// FOR j := 0 to 1
6308
// i := j*64
6309
// dst[i+63:i] := a[i+63:i] - b[i+63:i]
6310
// ENDFOR
6311
//
6312
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
6313
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
6314
{
6315
#if defined(__aarch64__)
6316
return vreinterpretq_m128d_f64(
6317
vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6318
#else
6319
double *da = (double *) &a;
6320
double *db = (double *) &b;
6321
double c[2];
6322
c[0] = da[0] - db[0];
6323
c[1] = da[1] - db[1];
6324
return vld1q_f32((float32_t *) c);
6325
#endif
6326
}
6327
6328
// Subtract the lower double-precision (64-bit) floating-point element in b from
6329
// the lower double-precision (64-bit) floating-point element in a, store the
6330
// result in the lower element of dst, and copy the upper element from a to the
6331
// upper element of dst.
6332
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
6333
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
6334
{
6335
return _mm_move_sd(a, _mm_sub_pd(a, b));
6336
}
6337
6338
// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
6339
//
6340
// dst[63:0] := a[63:0] - b[63:0]
6341
//
6342
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
6343
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
6344
{
6345
return vreinterpret_m64_s64(
6346
vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
6347
}
6348
6349
// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
6350
// of a and saturates.
6351
//
6352
// r0 := SignedSaturate(a0 - b0)
6353
// r1 := SignedSaturate(a1 - b1)
6354
// ...
6355
// r7 := SignedSaturate(a7 - b7)
6356
//
6357
// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
6358
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
6359
{
6360
return vreinterpretq_m128i_s16(
6361
vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6362
}
6363
6364
// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
6365
// of a and saturates.
6366
//
6367
// r0 := SignedSaturate(a0 - b0)
6368
// r1 := SignedSaturate(a1 - b1)
6369
// ...
6370
// r15 := SignedSaturate(a15 - b15)
6371
//
6372
// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
6373
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
6374
{
6375
return vreinterpretq_m128i_s8(
6376
vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6377
}
6378
6379
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
6381
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
6382
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
6383
{
6384
return vreinterpretq_m128i_u16(
6385
vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
6386
}
6387
6388
// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
6389
// integers of a and saturates.
6390
//
6391
// r0 := UnsignedSaturate(a0 - b0)
6392
// r1 := UnsignedSaturate(a1 - b1)
6393
// ...
6394
// r15 := UnsignedSaturate(a15 - b15)
6395
//
6396
// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
6397
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
6398
{
6399
return vreinterpretq_m128i_u8(
6400
vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
6401
}
6402
6403
#define _mm_ucomieq_sd _mm_comieq_sd
6404
#define _mm_ucomige_sd _mm_comige_sd
6405
#define _mm_ucomigt_sd _mm_comigt_sd
6406
#define _mm_ucomile_sd _mm_comile_sd
6407
#define _mm_ucomilt_sd _mm_comilt_sd
6408
#define _mm_ucomineq_sd _mm_comineq_sd
6409
6410
// Return vector of type __m128d with undefined elements.
6411
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
6412
FORCE_INLINE __m128d _mm_undefined_pd(void)
6413
{
6414
#if defined(__GNUC__) || defined(__clang__)
6415
#pragma GCC diagnostic push
6416
#pragma GCC diagnostic ignored "-Wuninitialized"
6417
#endif
6418
__m128d a;
6419
return a;
6420
#if defined(__GNUC__) || defined(__clang__)
6421
#pragma GCC diagnostic pop
6422
#endif
6423
}
6424
6425
// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6426
// upper 4 signed or unsigned 16-bit integers in b.
6427
//
6428
// r0 := a4
6429
// r1 := b4
6430
// r2 := a5
6431
// r3 := b5
6432
// r4 := a6
6433
// r5 := b6
6434
// r6 := a7
6435
// r7 := b7
6436
//
6437
// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
6438
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6439
{
6440
#if defined(__aarch64__)
6441
return vreinterpretq_m128i_s16(
6442
vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6443
#else
6444
int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6445
int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6446
int16x4x2_t result = vzip_s16(a1, b1);
6447
return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6448
#endif
6449
}
6450
6451
// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6452
// upper 2 signed or unsigned 32-bit integers in b.
6453
// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6454
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6455
{
6456
#if defined(__aarch64__)
6457
return vreinterpretq_m128i_s32(
6458
vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6459
#else
6460
int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6461
int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6462
int32x2x2_t result = vzip_s32(a1, b1);
6463
return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6464
#endif
6465
}
6466
6467
// Interleaves the upper signed or unsigned 64-bit integer in a with the
6468
// upper signed or unsigned 64-bit integer in b.
6469
//
6470
// r0 := a1
6471
// r1 := b1
6472
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6473
{
6474
int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6475
int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6476
return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6477
}
6478
6479
// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6480
// 8 signed or unsigned 8-bit integers in b.
6481
//
6482
// r0 := a8
6483
// r1 := b8
6484
// r2 := a9
6485
// r3 := b9
6486
// ...
6487
// r14 := a15
6488
// r15 := b15
6489
//
6490
// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6491
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6492
{
6493
#if defined(__aarch64__)
6494
return vreinterpretq_m128i_s8(
6495
vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6496
#else
6497
int8x8_t a1 =
6498
vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6499
int8x8_t b1 =
6500
vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6501
int8x8x2_t result = vzip_s8(a1, b1);
6502
return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6503
#endif
6504
}
6505
6506
// Unpack and interleave double-precision (64-bit) floating-point elements from
6507
// the high half of a and b, and store the results in dst.
6508
//
6509
// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6510
// dst[63:0] := src1[127:64]
6511
// dst[127:64] := src2[127:64]
6512
// RETURN dst[127:0]
6513
// }
6514
// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6515
//
6516
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
6517
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6518
{
6519
#if defined(__aarch64__)
6520
return vreinterpretq_m128d_f64(
6521
vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6522
#else
6523
return vreinterpretq_m128d_s64(
6524
vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6525
vget_high_s64(vreinterpretq_s64_m128d(b))));
6526
#endif
6527
}
6528
6529
// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6530
// lower 4 signed or unsigned 16-bit integers in b.
6531
//
6532
// r0 := a0
6533
// r1 := b0
6534
// r2 := a1
6535
// r3 := b1
6536
// r4 := a2
6537
// r5 := b2
6538
// r6 := a3
6539
// r7 := b3
6540
//
6541
// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6542
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6543
{
6544
#if defined(__aarch64__)
6545
return vreinterpretq_m128i_s16(
6546
vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6547
#else
6548
int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6549
int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6550
int16x4x2_t result = vzip_s16(a1, b1);
6551
return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6552
#endif
6553
}
6554
6555
// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
6557
//
6558
// r0 := a0
6559
// r1 := b0
6560
// r2 := a1
6561
// r3 := b1
6562
//
6563
// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6564
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6565
{
6566
#if defined(__aarch64__)
6567
return vreinterpretq_m128i_s32(
6568
vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6569
#else
6570
int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6571
int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6572
int32x2x2_t result = vzip_s32(a1, b1);
6573
return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6574
#endif
6575
}
6576
6577
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6578
{
6579
int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6580
int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6581
return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6582
}
6583
6584
// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6585
// 8 signed or unsigned 8-bit integers in b.
6586
//
6587
// r0 := a0
6588
// r1 := b0
6589
// r2 := a1
6590
// r3 := b1
6591
// ...
6592
// r14 := a7
6593
// r15 := b7
6594
//
6595
// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6596
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6597
{
6598
#if defined(__aarch64__)
6599
return vreinterpretq_m128i_s8(
6600
vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6601
#else
6602
int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6603
int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6604
int8x8x2_t result = vzip_s8(a1, b1);
6605
return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6606
#endif
6607
}
6608
6609
// Unpack and interleave double-precision (64-bit) floating-point elements from
6610
// the low half of a and b, and store the results in dst.
6611
//
6612
// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6613
// dst[63:0] := src1[63:0]
6614
// dst[127:64] := src2[63:0]
6615
// RETURN dst[127:0]
6616
// }
6617
// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6618
//
6619
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
6620
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6621
{
6622
#if defined(__aarch64__)
6623
return vreinterpretq_m128d_f64(
6624
vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6625
#else
6626
return vreinterpretq_m128d_s64(
6627
vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6628
vget_low_s64(vreinterpretq_s64_m128d(b))));
6629
#endif
6630
}
6631
6632
// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6633
// elements in a and b, and store the results in dst.
6634
//
6635
// FOR j := 0 to 1
6636
// i := j*64
6637
// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6638
// ENDFOR
6639
//
6640
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
6641
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6642
{
6643
return vreinterpretq_m128d_s64(
6644
veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6645
}
// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
6654
6655
/* SSE3 */
6656
6657
// Alternatively add and subtract packed double-precision (64-bit)
6658
// floating-point elements in a to/from packed elements in b, and store the
6659
// results in dst.
6660
//
6661
// FOR j := 0 to 1
6662
// i := j*64
6663
// IF ((j & 1) == 0)
6664
// dst[i+63:i] := a[i+63:i] - b[i+63:i]
6665
// ELSE
6666
// dst[i+63:i] := a[i+63:i] + b[i+63:i]
6667
// FI
6668
// ENDFOR
6669
//
6670
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
6671
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6672
{
6673
_sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
6674
#if defined(__aarch64__)
6675
return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6676
vreinterpretq_f64_m128d(b),
6677
vreinterpretq_f64_m128d(mask)));
6678
#else
6679
return _mm_add_pd(_mm_mul_pd(b, mask), a);
6680
#endif
6681
}
6682
6683
// Alternatively add and subtract packed single-precision (32-bit)
6684
// floating-point elements in a to/from packed elements in b, and store the
6685
// results in dst.
6686
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
6687
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6688
{
6689
_sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
6690
#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6691
return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6692
vreinterpretq_f32_m128(mask),
6693
vreinterpretq_f32_m128(b)));
6694
#else
6695
return _mm_add_ps(_mm_mul_ps(b, mask), a);
6696
#endif
6697
}
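// Example (illustrative sketch): even lanes are subtracted, odd lanes added.
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_set1_ps(10.0f);
//   __m128 r = _mm_addsub_ps(a, b);          // r = {-9.0f, 12.0f, -7.0f, 14.0f}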
6698
6699
// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6700
// elements in a and b, and pack the results in dst.
6701
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
6702
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6703
{
6704
#if defined(__aarch64__)
6705
return vreinterpretq_m128d_f64(
6706
vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6707
#else
6708
double *da = (double *) &a;
6709
double *db = (double *) &b;
6710
double c[] = {da[0] + da[1], db[0] + db[1]};
6711
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6712
#endif
6713
}
6714
6715
// Computes the pairwise add of each argument as single-precision floating-point
// values a and b.
6717
// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6718
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6719
{
6720
#if defined(__aarch64__)
6721
return vreinterpretq_m128_f32(
6722
vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6723
#else
6724
float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6725
float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6726
float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6727
float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6728
return vreinterpretq_m128_f32(
6729
vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6730
#endif
6731
}
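// Example (illustrative sketch): adjacent pairs of a land in the low half of
// the result, adjacent pairs of b in the high half.
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
//   __m128 r = _mm_hadd_ps(a, b);            // r = {3.0f, 7.0f, 30.0f, 70.0f}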
6732
6733
// Horizontally subtract adjacent pairs of double-precision (64-bit)
6734
// floating-point elements in a and b, and pack the results in dst.
6735
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
6736
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6737
{
6738
#if defined(__aarch64__)
6739
float64x2_t a = vreinterpretq_f64_m128d(_a);
6740
float64x2_t b = vreinterpretq_f64_m128d(_b);
6741
return vreinterpretq_m128d_f64(
6742
vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6743
#else
6744
double *da = (double *) &_a;
6745
double *db = (double *) &_b;
6746
double c[] = {da[0] - da[1], db[0] - db[1]};
6747
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6748
#endif
6749
}
6750
6751
// Horizontally subtract adjacent pairs of single-precision (32-bit)
6752
// floating-point elements in a and b, and pack the results in dst.
6753
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
6754
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6755
{
6756
float32x4_t a = vreinterpretq_f32_m128(_a);
6757
float32x4_t b = vreinterpretq_f32_m128(_b);
6758
#if defined(__aarch64__)
6759
return vreinterpretq_m128_f32(
6760
vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6761
#else
6762
float32x4x2_t c = vuzpq_f32(a, b);
6763
return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6764
#endif
6765
}
6766
6767
// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6768
// may perform better than _mm_loadu_si128 when the data crosses a cache line
6769
// boundary.
6770
//
6771
// dst[127:0] := MEM[mem_addr+127:mem_addr]
6772
//
6773
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
6774
#define _mm_lddqu_si128 _mm_loadu_si128
6775
6776
// Load a double-precision (64-bit) floating-point element from memory into both
6777
// elements of dst.
6778
//
6779
// dst[63:0] := MEM[mem_addr+63:mem_addr]
6780
// dst[127:64] := MEM[mem_addr+63:mem_addr]
6781
//
6782
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
6783
#define _mm_loaddup_pd _mm_load1_pd
6784
6785
// Duplicate the low double-precision (64-bit) floating-point element from a,
6786
// and store the results in dst.
6787
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
6788
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6789
{
6790
#if defined(__aarch64__)
6791
return vreinterpretq_m128d_f64(
6792
vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6793
#else
6794
return vreinterpretq_m128d_u64(
6795
vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6796
#endif
6797
}
6798
6799
// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6800
// from a, and store the results in dst.
6801
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
6802
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6803
{
6804
#if defined(__aarch64__)
6805
return vreinterpretq_m128_f32(
6806
vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6807
#elif defined(_sse2neon_shuffle)
6808
return vreinterpretq_m128_f32(vshuffleq_s32(
6809
vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6810
#else
6811
float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6812
float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6813
float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6814
return vreinterpretq_m128_f32(vld1q_f32(data));
6815
#endif
6816
}
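// Example (illustrative sketch):
//
//   __m128 v = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 r = _mm_movehdup_ps(v);           // r = {1.0f, 1.0f, 3.0f, 3.0f}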
6817
6818
// Duplicate even-indexed single-precision (32-bit) floating-point elements
6819
// from a, and store the results in dst.
6820
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
6821
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6822
{
6823
#if defined(__aarch64__)
6824
return vreinterpretq_m128_f32(
6825
vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6826
#elif defined(_sse2neon_shuffle)
6827
return vreinterpretq_m128_f32(vshuffleq_s32(
6828
vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6829
#else
6830
float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6831
float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6832
float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6833
return vreinterpretq_m128_f32(vld1q_f32(data));
6834
#endif
6835
}
/* SSSE3 */
6838
6839
// Compute the absolute value of packed signed 16-bit integers in a, and store
6840
// the unsigned results in dst.
6841
//
6842
// FOR j := 0 to 7
6843
// i := j*16
6844
// dst[i+15:i] := ABS(a[i+15:i])
6845
// ENDFOR
6846
//
6847
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
6848
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6849
{
6850
return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6851
}
6852
6853
// Compute the absolute value of packed signed 32-bit integers in a, and store
6854
// the unsigned results in dst.
6855
//
6856
// FOR j := 0 to 3
6857
// i := j*32
6858
// dst[i+31:i] := ABS(a[i+31:i])
6859
// ENDFOR
6860
//
6861
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
6862
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6863
{
6864
return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6865
}
6866
6867
// Compute the absolute value of packed signed 8-bit integers in a, and store
6868
// the unsigned results in dst.
6869
//
6870
// FOR j := 0 to 15
6871
// i := j*8
6872
// dst[i+7:i] := ABS(a[i+7:i])
6873
// ENDFOR
6874
//
6875
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
6876
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6877
{
6878
return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6879
}
6880
6881
// Compute the absolute value of packed signed 16-bit integers in a, and store
6882
// the unsigned results in dst.
6883
//
6884
// FOR j := 0 to 3
6885
// i := j*16
6886
// dst[i+15:i] := ABS(a[i+15:i])
6887
// ENDFOR
6888
//
6889
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
6890
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6891
{
6892
return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6893
}
6894
6895
// Compute the absolute value of packed signed 32-bit integers in a, and store
6896
// the unsigned results in dst.
6897
//
6898
// FOR j := 0 to 1
6899
// i := j*32
6900
// dst[i+31:i] := ABS(a[i+31:i])
6901
// ENDFOR
6902
//
6903
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
6904
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6905
{
6906
return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6907
}
6908
6909
// Compute the absolute value of packed signed 8-bit integers in a, and store
6910
// the unsigned results in dst.
6911
//
6912
// FOR j := 0 to 7
6913
// i := j*8
6914
// dst[i+7:i] := ABS(a[i+7:i])
6915
// ENDFOR
6916
//
6917
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
6918
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6919
{
6920
return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6921
}
6922
6923
// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6924
// the result right by imm8 bytes, and store the low 16 bytes in dst.
6925
//
6926
// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6927
// dst[127:0] := tmp[127:0]
6928
//
6929
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
6930
#define _mm_alignr_epi8(a, b, imm) \
6931
__extension__({ \
6932
uint8x16_t _a = vreinterpretq_u8_m128i(a); \
6933
uint8x16_t _b = vreinterpretq_u8_m128i(b); \
6934
__m128i ret; \
6935
if (_sse2neon_unlikely((imm) & ~31)) \
6936
ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6937
else if (imm >= 16) \
6938
ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
6939
else \
6940
ret = \
6941
vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
6942
ret; \
6943
})
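// Example (illustrative sketch): r picks up the top 12 bytes of b followed by
// the low 4 bytes of a.
//
//   __m128i a = _mm_set_epi32(7, 6, 5, 4);   // lanes {4, 5, 6, 7}
//   __m128i b = _mm_set_epi32(3, 2, 1, 0);   // lanes {0, 1, 2, 3}
//   __m128i r = _mm_alignr_epi8(a, b, 4);    // lanes {1, 2, 3, 4}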
6944
6945
// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6946
// the result right by imm8 bytes, and store the low 8 bytes in dst.
6947
//
6948
// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
6949
// dst[63:0] := tmp[63:0]
6950
//
6951
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
6952
#define _mm_alignr_pi8(a, b, imm) \
6953
__extension__({ \
6954
__m64 ret; \
6955
if (_sse2neon_unlikely((imm) >= 16)) { \
6956
ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6957
} else { \
6958
uint8x8_t tmp_low, tmp_high; \
6959
if ((imm) >= 8) { \
6960
const int idx = (imm) -8; \
6961
tmp_low = vreinterpret_u8_m64(a); \
6962
tmp_high = vdup_n_u8(0); \
6963
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6964
} else { \
6965
const int idx = (imm); \
6966
tmp_low = vreinterpret_u8_m64(b); \
6967
tmp_high = vreinterpret_u8_m64(a); \
6968
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6969
} \
6970
} \
6971
ret; \
6972
})
6973
6974
// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
6975
// values a and b.
6976
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6977
{
6978
int16x8_t a = vreinterpretq_s16_m128i(_a);
6979
int16x8_t b = vreinterpretq_s16_m128i(_b);
6980
#if defined(__aarch64__)
6981
return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6982
#else
6983
return vreinterpretq_m128i_s16(
6984
vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6985
vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6986
#endif
6987
}
6988
6989
// Computes the pairwise add of each argument as 32-bit signed or unsigned
// integer values a and b.
6991
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6992
{
6993
int32x4_t a = vreinterpretq_s32_m128i(_a);
6994
int32x4_t b = vreinterpretq_s32_m128i(_b);
6995
#if defined(__aarch64__)
6996
return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
6997
#else
6998
return vreinterpretq_m128i_s32(
6999
vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
7000
vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
7001
#endif
7002
}
7003
7004
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
7005
// signed 16-bit results in dst.
7006
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
7007
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
7008
{
7009
return vreinterpret_m64_s16(
7010
vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
7011
}
7012
7013
// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
7014
// signed 32-bit results in dst.
7015
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
7016
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
7017
{
7018
return vreinterpret_m64_s32(
7019
vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
7020
}
7021
7022
// Computes saturated pairwise sub of each argument as a 16-bit signed
7023
// integer values a and b.
7024
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
7025
{
7026
#if defined(__aarch64__)
7027
int16x8_t a = vreinterpretq_s16_m128i(_a);
7028
int16x8_t b = vreinterpretq_s16_m128i(_b);
7029
return vreinterpretq_s64_s16(
7030
vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
7031
#else
7032
int32x4_t a = vreinterpretq_s32_m128i(_a);
7033
int32x4_t b = vreinterpretq_s32_m128i(_b);
7034
// Interleave using vshrn/vmovn
7035
// [a0|a2|a4|a6|b0|b2|b4|b6]
7036
// [a1|a3|a5|a7|b1|b3|b5|b7]
7037
int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
7038
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
7039
// Saturated add
7040
return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
7041
#endif
7042
}
7043
7044
// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
7045
// saturation, and pack the signed 16-bit results in dst.
7046
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
7047
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
7048
{
7049
int16x4_t a = vreinterpret_s16_m64(_a);
7050
int16x4_t b = vreinterpret_s16_m64(_b);
7051
#if defined(__aarch64__)
7052
return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
7053
#else
7054
int16x4x2_t res = vuzp_s16(a, b);
7055
return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
7056
#endif
7057
}
7058
7059
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
7060
// the signed 16-bit results in dst.
7061
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
7062
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
7063
{
7064
int16x8_t a = vreinterpretq_s16_m128i(_a);
7065
int16x8_t b = vreinterpretq_s16_m128i(_b);
7066
#if defined(__aarch64__)
7067
return vreinterpretq_m128i_s16(
7068
vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
7069
#else
7070
int16x8x2_t c = vuzpq_s16(a, b);
7071
return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
7072
#endif
7073
}
7074
7075
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
7076
// the signed 32-bit results in dst.
7077
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
7078
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
7079
{
7080
int32x4_t a = vreinterpretq_s32_m128i(_a);
7081
int32x4_t b = vreinterpretq_s32_m128i(_b);
7082
#if defined(__aarch64__)
7083
return vreinterpretq_m128i_s32(
7084
vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
7085
#else
7086
int32x4x2_t c = vuzpq_s32(a, b);
7087
return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
7088
#endif
7089
}
7090
7091
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
7092
// the signed 16-bit results in dst.
7093
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
7094
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
7095
{
7096
int16x4_t a = vreinterpret_s16_m64(_a);
7097
int16x4_t b = vreinterpret_s16_m64(_b);
7098
#if defined(__aarch64__)
7099
return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
7100
#else
7101
int16x4x2_t c = vuzp_s16(a, b);
7102
return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
7103
#endif
7104
}
7105
7106
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
7107
// the signed 32-bit results in dst.
7108
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
7109
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
7110
{
7111
int32x2_t a = vreinterpret_s32_m64(_a);
7112
int32x2_t b = vreinterpret_s32_m64(_b);
7113
#if defined(__aarch64__)
7114
return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
7115
#else
7116
int32x2x2_t c = vuzp_s32(a, b);
7117
return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
7118
#endif
7119
}
7120
7121
// Computes saturated pairwise difference of each argument as a 16-bit signed
7122
// integer values a and b.
7123
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
7124
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
7125
{
7126
int16x8_t a = vreinterpretq_s16_m128i(_a);
7127
int16x8_t b = vreinterpretq_s16_m128i(_b);
7128
#if defined(__aarch64__)
7129
return vreinterpretq_m128i_s16(
7130
vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
7131
#else
7132
int16x8x2_t c = vuzpq_s16(a, b);
7133
return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
7134
#endif
7135
}
7136
7137
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
7138
// using saturation, and pack the signed 16-bit results in dst.
7139
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
7140
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
7141
{
7142
int16x4_t a = vreinterpret_s16_m64(_a);
7143
int16x4_t b = vreinterpret_s16_m64(_b);
7144
#if defined(__aarch64__)
7145
return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
7146
#else
7147
int16x4x2_t c = vuzp_s16(a, b);
7148
return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
7149
#endif
7150
}
7151
7152
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
7153
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
7154
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
7155
// and pack the saturated results in dst.
7156
//
7157
// FOR j := 0 to 7
7158
// i := j*16
7159
// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
7160
// a[i+7:i]*b[i+7:i] )
7161
// ENDFOR
7162
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
7163
{
7164
#if defined(__aarch64__)
7165
uint8x16_t a = vreinterpretq_u8_m128i(_a);
7166
int8x16_t b = vreinterpretq_s8_m128i(_b);
7167
int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
7168
vmovl_s8(vget_low_s8(b)));
7169
int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
7170
vmovl_s8(vget_high_s8(b)));
7171
return vreinterpretq_m128i_s16(
7172
vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
7173
#else
7174
// This would be much simpler if x86 would choose to zero extend OR sign
7175
// extend, not both. This could probably be optimized better.
7176
uint16x8_t a = vreinterpretq_u16_m128i(_a);
7177
int16x8_t b = vreinterpretq_s16_m128i(_b);
7178
7179
// Zero extend a
7180
int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
7181
int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
7182
7183
// Sign extend by shifting left then shifting right.
7184
int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
7185
int16x8_t b_odd = vshrq_n_s16(b, 8);
7186
7187
// multiply
7188
int16x8_t prod1 = vmulq_s16(a_even, b_even);
7189
int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
7190
7191
// saturated add
7192
return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
7193
#endif
7194
}
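// Illustrative usage sketch (hypothetical values): a is treated as unsigned
// bytes, b as signed bytes, and each output lane is the saturated sum of two
// adjacent products.
//
//   __m128i a = _mm_setr_epi8((char) 0xFF, (char) 0xFF, 10, 20,
//                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
//   __m128i b = _mm_setr_epi8(127, 127, 3, -2,
//                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
//   __m128i r = _mm_maddubs_epi16(a, b);
//   // lane 0: 255*127 + 255*127 = 64770 -> saturates to 32767
//   // lane 1: 10*3 + 20*(-2)    = -10
//   // r = { 32767, -10, 0, 0, 0, 0, 0, 0 }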
7195
7196
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
7197
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
7198
// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
7199
// pack the saturated results in dst.
7200
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
7201
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
7202
{
7203
uint16x4_t a = vreinterpret_u16_m64(_a);
7204
int16x4_t b = vreinterpret_s16_m64(_b);
7205
7206
// Zero extend a
7207
int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
7208
int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
7209
7210
// Sign extend by shifting left then shifting right.
7211
int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
7212
int16x4_t b_odd = vshr_n_s16(b, 8);
7213
7214
// multiply
7215
int16x4_t prod1 = vmul_s16(a_even, b_even);
7216
int16x4_t prod2 = vmul_s16(a_odd, b_odd);
7217
7218
// saturated add
7219
return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
7220
}
7221
7222
// Multiply packed signed 16-bit integers in a and b, producing intermediate
7223
// signed 32-bit integers. Shift right by 15 bits with rounding, and store
7224
// the packed 16-bit integers in dst.
7225
//
7226
// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
7227
// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
7228
// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
7229
// ...
7230
// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
7231
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
7232
{
7233
// Has issues due to saturation
7234
// return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
7235
7236
// Multiply
7237
int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
7238
vget_low_s16(vreinterpretq_s16_m128i(b)));
7239
int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
7240
vget_high_s16(vreinterpretq_s16_m128i(b)));
7241
7242
// Rounding narrowing shift right
7243
// narrow = (int16_t)((mul + 16384) >> 15);
7244
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7245
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7246
7247
// Join together
7248
return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
7249
}
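// Illustrative usage sketch (hypothetical Q15 fixed-point values): each lane
// is computed as (int16_t)((a*b + 0x4000) >> 15), i.e. a Q15 multiply with
// round-to-nearest.
//
//   __m128i a = _mm_set1_epi16(0x4000);          // 0.5 in Q15
//   __m128i b = _mm_set1_epi16(0x2000);          // 0.25 in Q15
//   __m128i r = _mm_mulhrs_epi16(a, b);
//   // each lane: (0x4000*0x2000 + 0x4000) >> 15 = 0x1000 (0.125 in Q15)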
7250
7251
// Multiply packed signed 16-bit integers in a and b, producing intermediate
7252
// signed 32-bit integers. Truncate each intermediate integer to the 18 most
7253
// significant bits, round by adding 1, and store bits [16:1] to dst.
7254
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
7255
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
7256
{
7257
int32x4_t mul_extend =
7258
vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
7259
7260
// Rounding narrowing shift right
7261
return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
7262
}
7263
7264
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
7265
// corresponding 8-bit element of b, and store the results in dst.
7266
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
7267
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
7268
{
7269
int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
7270
uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
7271
uint8x16_t idx_masked =
7272
vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
7273
#if defined(__aarch64__)
7274
return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
7275
#elif defined(__GNUC__)
7276
int8x16_t ret;
7277
// %e and %f represent the even and odd D registers
7278
// respectively.
7279
__asm__ __volatile__(
7280
"vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7281
"vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7282
: [ret] "=&w"(ret)
7283
: [tbl] "w"(tbl), [idx] "w"(idx_masked));
7284
return vreinterpretq_m128i_s8(ret);
7285
#else
7286
// Generic ARMv7-A fallback: split the table into two D registers and use a
// two-register table lookup (vtbl2).
7287
int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7288
return vreinterpretq_m128i_s8(
7289
vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7290
vtbl2_s8(a_split, vget_high_u8(idx_masked))));
7291
#endif
7292
}
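// Illustrative usage sketch (hypothetical values): reversing the byte order
// of a vector, and zeroing a lane by setting bit 7 of its control byte.
//
//   __m128i a   = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                               8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                               7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i r   = _mm_shuffle_epi8(a, rev);
//   // r = { 15, 14, 13, ..., 1, 0 }
//   // A control byte with its high bit set (e.g. (char) 0x80) would zero the
//   // corresponding destination byte instead of selecting from a.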
7293
7294
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
7295
// corresponding 8-bit element of b, and store the results in dst.
7296
//
7297
// FOR j := 0 to 7
7298
// i := j*8
7299
// IF b[i+7] == 1
7300
// dst[i+7:i] := 0
7301
// ELSE
7302
// index[2:0] := b[i+2:i]
7303
// dst[i+7:i] := a[index*8+7:index*8]
7304
// FI
7305
// ENDFOR
7306
//
7307
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
7308
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
7309
{
7310
const int8x8_t controlMask =
7311
vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
7312
int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
7313
return vreinterpret_m64_s8(res);
7314
}
7315
7316
// Negate packed 16-bit integers in a when the corresponding signed
7317
// 16-bit integer in b is negative, and store the results in dst.
7318
// Elements in dst are zeroed out when the corresponding element
7319
// in b is zero.
7320
//
7321
// for i in 0..7
7322
// if b[i] < 0
7323
// r[i] := -a[i]
7324
// else if b[i] == 0
7325
// r[i] := 0
7326
// else
7327
// r[i] := a[i]
7328
// fi
7329
// done
7330
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
7331
{
7332
int16x8_t a = vreinterpretq_s16_m128i(_a);
7333
int16x8_t b = vreinterpretq_s16_m128i(_b);
7334
7335
// signed shift right: faster than vclt
7336
// (b < 0) ? 0xFFFF : 0
7337
uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7338
// (b == 0) ? 0xFFFF : 0
7339
#if defined(__aarch64__)
7340
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7341
#else
7342
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7343
#endif
7344
7345
// bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
7346
// 'a') based on ltMask
7347
int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7348
// res = masked & (~zeroMask)
7349
int16x8_t res = vbicq_s16(masked, zeroMask);
7350
return vreinterpretq_m128i_s16(res);
7351
}
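// Illustrative usage sketch (hypothetical values) of the negate/zero/copy
// behaviour shared by the whole _mm_sign_* family:
//
//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
//   __m128i b = _mm_setr_epi16(-1, 0, 5, -7, 0, 1, -2, 3);
//   __m128i r = _mm_sign_epi16(a, b);
//   // r = { -1, 0, 3, -4, 0, 6, -7, 8 }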
7352
7353
// Negate packed 32-bit integers in a when the corresponding signed
7354
// 32-bit integer in b is negative, and store the results in dst.
7355
// Elements in dst are zeroed out when the corresponding element
7356
// in b is zero.
7357
//
7358
// for i in 0..3
7359
// if b[i] < 0
7360
// r[i] := -a[i]
7361
// else if b[i] == 0
7362
// r[i] := 0
7363
// else
7364
// r[i] := a[i]
7365
// fi
7366
// done
7367
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
7368
{
7369
int32x4_t a = vreinterpretq_s32_m128i(_a);
7370
int32x4_t b = vreinterpretq_s32_m128i(_b);
7371
7372
// signed shift right: faster than vclt
7373
// (b < 0) ? 0xFFFFFFFF : 0
7374
uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7375
7376
// (b == 0) ? 0xFFFFFFFF : 0
7377
#if defined(__aarch64__)
7378
int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7379
#else
7380
int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7381
#endif
7382
7383
// bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
7384
// 'a') based on ltMask
7385
int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7386
// res = masked & (~zeroMask)
7387
int32x4_t res = vbicq_s32(masked, zeroMask);
7388
return vreinterpretq_m128i_s32(res);
7389
}
7390
7391
// Negate packed 8-bit integers in a when the corresponding signed
7392
// 8-bit integer in b is negative, and store the results in dst.
7393
// Elements in dst are zeroed out when the corresponding element
7394
// in b is zero.
7395
//
7396
// for i in 0..15
7397
// if b[i] < 0
7398
// r[i] := -a[i]
7399
// else if b[i] == 0
7400
// r[i] := 0
7401
// else
7402
// r[i] := a[i]
7403
// fi
7404
// done
7405
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
7406
{
7407
int8x16_t a = vreinterpretq_s8_m128i(_a);
7408
int8x16_t b = vreinterpretq_s8_m128i(_b);
7409
7410
// signed shift right: faster than vclt
7411
// (b < 0) ? 0xFF : 0
7412
uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7413
7414
// (b == 0) ? 0xFF : 0
7415
#if defined(__aarch64__)
7416
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7417
#else
7418
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7419
#endif
7420
7421
// bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
7422
// based on ltMask
7423
int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7424
// res = masked & (~zeroMask)
7425
int8x16_t res = vbicq_s8(masked, zeroMask);
7426
7427
return vreinterpretq_m128i_s8(res);
7428
}
7429
7430
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
7431
// integer in b is negative, and store the results in dst. Elements in dst are
7432
// zeroed out when the corresponding element in b is zero.
7433
//
7434
// FOR j := 0 to 3
7435
// i := j*16
7436
// IF b[i+15:i] < 0
7437
// dst[i+15:i] := -(a[i+15:i])
7438
// ELSE IF b[i+15:i] == 0
7439
// dst[i+15:i] := 0
7440
// ELSE
7441
// dst[i+15:i] := a[i+15:i]
7442
// FI
7443
// ENDFOR
7444
//
7445
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
7446
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
7447
{
7448
int16x4_t a = vreinterpret_s16_m64(_a);
7449
int16x4_t b = vreinterpret_s16_m64(_b);
7450
7451
// signed shift right: faster than vclt
7452
// (b < 0) ? 0xFFFF : 0
7453
uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7454
7455
// (b == 0) ? 0xFFFF : 0
7456
#if defined(__aarch64__)
7457
int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7458
#else
7459
int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7460
#endif
7461
7462
// bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
7463
// based on ltMask
7464
int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7465
// res = masked & (~zeroMask)
7466
int16x4_t res = vbic_s16(masked, zeroMask);
7467
7468
return vreinterpret_m64_s16(res);
7469
}
7470
7471
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
7472
// integer in b is negative, and store the results in dst. Elements in dst are
7473
// zeroed out when the corresponding element in b is zero.
7474
//
7475
// FOR j := 0 to 1
7476
// i := j*32
7477
// IF b[i+31:i] < 0
7478
// dst[i+31:i] := -(a[i+31:i])
7479
// ELSE IF b[i+31:i] == 0
7480
// dst[i+31:i] := 0
7481
// ELSE
7482
// dst[i+31:i] := a[i+31:i]
7483
// FI
7484
// ENDFOR
7485
//
7486
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
7487
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
7488
{
7489
int32x2_t a = vreinterpret_s32_m64(_a);
7490
int32x2_t b = vreinterpret_s32_m64(_b);
7491
7492
// signed shift right: faster than vclt
7493
// (b < 0) ? 0xFFFFFFFF : 0
7494
uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7495
7496
// (b == 0) ? 0xFFFFFFFF : 0
7497
#if defined(__aarch64__)
7498
int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7499
#else
7500
int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7501
#endif
7502
7503
// bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
7504
// based on ltMask
7505
int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7506
// res = masked & (~zeroMask)
7507
int32x2_t res = vbic_s32(masked, zeroMask);
7508
7509
return vreinterpret_m64_s32(res);
7510
}
7511
7512
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
7513
// in b is negative, and store the results in dst. Elements in dst are zeroed out
7514
// when the corresponding element in b is zero.
7515
//
7516
// FOR j := 0 to 7
7517
// i := j*8
7518
// IF b[i+7:i] < 0
7519
// dst[i+7:i] := -(a[i+7:i])
7520
// ELSE IF b[i+7:i] == 0
7521
// dst[i+7:i] := 0
7522
// ELSE
7523
// dst[i+7:i] := a[i+7:i]
7524
// FI
7525
// ENDFOR
7526
//
7527
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
7528
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
7529
{
7530
int8x8_t a = vreinterpret_s8_m64(_a);
7531
int8x8_t b = vreinterpret_s8_m64(_b);
7532
7533
// signed shift right: faster than vclt
7534
// (b < 0) ? 0xFF : 0
7535
uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7536
7537
// (b == 0) ? 0xFF : 0
7538
#if defined(__aarch64__)
7539
int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7540
#else
7541
int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7542
#endif
7543
7544
// bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
7545
// based on ltMask
7546
int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7547
// res = masked & (~zeroMask)
7548
int8x8_t res = vbic_s8(masked, zeroMask);
7549
7550
return vreinterpret_m64_s8(res);
7551
}
7552
7553
/* SSE4.1 */
7554
7555
// Blend packed 16-bit integers from a and b using control mask imm8, and store
7556
// the results in dst.
7557
//
7558
// FOR j := 0 to 7
7559
// i := j*16
7560
// IF imm8[j]
7561
// dst[i+15:i] := b[i+15:i]
7562
// ELSE
7563
// dst[i+15:i] := a[i+15:i]
7564
// FI
7565
// ENDFOR
7566
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
7567
// __constrange(0,255) int imm)
7568
#define _mm_blend_epi16(a, b, imm) \
7569
__extension__({ \
7570
const uint16_t ones = 0xffff; \
7571
const uint16_t zeros = 0x0000; \
7572
const uint16_t _mask[8] = {((imm) & (1 << 0)) ? ones : zeros, \
7573
((imm) & (1 << 1)) ? ones : zeros, \
7574
((imm) & (1 << 2)) ? ones : zeros, \
7575
((imm) & (1 << 3)) ? ones : zeros, \
7576
((imm) & (1 << 4)) ? ones : zeros, \
7577
((imm) & (1 << 5)) ? ones : zeros, \
7578
((imm) & (1 << 6)) ? ones : zeros, \
7579
((imm) & (1 << 7)) ? ones : zeros}; \
7580
uint16x8_t _mask_vec = vld1q_u16(_mask); \
7581
uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7582
uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7583
vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7584
})
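// Illustrative usage sketch (hypothetical values): bit j of the immediate
// selects lane j from b, otherwise lane j comes from a. The immediate must be
// a compile-time constant because this is implemented as a macro.
//
//   __m128i a = _mm_set1_epi16(1);
//   __m128i b = _mm_set1_epi16(2);
//   __m128i r = _mm_blend_epi16(a, b, 0xF0);
//   // r = { 1, 1, 1, 1, 2, 2, 2, 2 }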
7585
7586
// Blend packed double-precision (64-bit) floating-point elements from a and b
7587
// using control mask imm8, and store the results in dst.
7588
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
7589
#define _mm_blend_pd(a, b, imm) \
7590
__extension__({ \
7591
const uint64_t _mask[2] = { \
7592
((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7593
((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7594
uint64x2_t _mask_vec = vld1q_u64(_mask); \
7595
uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7596
uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7597
vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7598
})
7599
7600
// Blend packed single-precision (32-bit) floating-point elements from a and b
7601
// using mask, and store the results in dst.
7602
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
7603
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7604
{
7605
const uint32_t ALIGN_STRUCT(16)
7606
data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7607
((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7608
((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7609
((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7610
uint32x4_t mask = vld1q_u32(data);
7611
float32x4_t a = vreinterpretq_f32_m128(_a);
7612
float32x4_t b = vreinterpretq_f32_m128(_b);
7613
return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7614
}
7615
7616
// Blend packed 8-bit integers from a and b using mask, and store the results in
7617
// dst.
7618
//
7619
// FOR j := 0 to 15
7620
// i := j*8
7621
// IF mask[i+7]
7622
// dst[i+7:i] := b[i+7:i]
7623
// ELSE
7624
// dst[i+7:i] := a[i+7:i]
7625
// FI
7626
// ENDFOR
7627
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7628
{
7629
// Use a signed shift right to create a mask with the sign bit
7630
uint8x16_t mask =
7631
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7632
uint8x16_t a = vreinterpretq_u8_m128i(_a);
7633
uint8x16_t b = vreinterpretq_u8_m128i(_b);
7634
return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7635
}
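// Illustrative usage sketch (hypothetical values): only the most significant
// bit of each mask byte matters, which is why the implementation broadcasts
// the sign bit with a signed shift before the bitwise select.
//
//   __m128i a    = _mm_set1_epi8(1);
//   __m128i b    = _mm_set1_epi8(2);
//   __m128i mask = _mm_setr_epi8((char) 0x80, 0x00, (char) 0xFF, 0x7F,
//                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
//   __m128i r = _mm_blendv_epi8(a, b, mask);
//   // r = { 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }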
7636
7637
// Blend packed double-precision (64-bit) floating-point elements from a and b
7638
// using mask, and store the results in dst.
7639
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
7640
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7641
{
7642
uint64x2_t mask =
7643
vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7644
#if defined(__aarch64__)
7645
float64x2_t a = vreinterpretq_f64_m128d(_a);
7646
float64x2_t b = vreinterpretq_f64_m128d(_b);
7647
return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7648
#else
7649
uint64x2_t a = vreinterpretq_u64_m128d(_a);
7650
uint64x2_t b = vreinterpretq_u64_m128d(_b);
7651
return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7652
#endif
7653
}
7654
7655
// Blend packed single-precision (32-bit) floating-point elements from a and b
7656
// using mask, and store the results in dst.
7657
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
7658
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7659
{
7660
// Use a signed shift right to create a mask with the sign bit
7661
uint32x4_t mask =
7662
vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7663
float32x4_t a = vreinterpretq_f32_m128(_a);
7664
float32x4_t b = vreinterpretq_f32_m128(_b);
7665
return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7666
}
7667
7668
// Round the packed double-precision (64-bit) floating-point elements in a up
7669
// to an integer value, and store the results as packed double-precision
7670
// floating-point elements in dst.
7671
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
7672
FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7673
{
7674
#if defined(__aarch64__)
7675
return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7676
#else
7677
double *f = (double *) &a;
7678
return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7679
#endif
7680
}
7681
7682
// Round the packed single-precision (32-bit) floating-point elements in a up to
7683
// an integer value, and store the results as packed single-precision
7684
// floating-point elements in dst.
7685
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
7686
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7687
{
7688
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7689
return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7690
#else
7691
float *f = (float *) &a;
7692
return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7693
#endif
7694
}
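// Illustrative usage sketch (hypothetical values):
//
//   __m128 a = _mm_setr_ps(1.2f, -1.2f, 2.5f, 3.0f);
//   __m128 r = _mm_ceil_ps(a);
//   // r = { 2.0f, -1.0f, 3.0f, 3.0f }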
7695
7696
// Round the lower double-precision (64-bit) floating-point element in b up to
7697
// an integer value, store the result as a double-precision floating-point
7698
// element in the lower element of dst, and copy the upper element from a to the
7699
// upper element of dst.
7700
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
7701
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
7702
{
7703
return _mm_move_sd(a, _mm_ceil_pd(b));
7704
}
7705
7706
// Round the lower single-precision (32-bit) floating-point element in b up to
7707
// an integer value, store the result as a single-precision floating-point
7708
// element in the lower element of dst, and copy the upper 3 packed elements
7709
// from a to the upper elements of dst.
7710
//
7711
// dst[31:0] := CEIL(b[31:0])
7712
// dst[127:32] := a[127:32]
7713
//
7714
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
7715
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
7716
{
7717
return _mm_move_ss(a, _mm_ceil_ps(b));
7718
}
7719
7720
// Compare packed 64-bit integers in a and b for equality, and store the results
7721
// in dst
7722
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7723
{
7724
#if defined(__aarch64__)
7725
return vreinterpretq_m128i_u64(
7726
vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7727
#else
7728
// ARMv7 lacks vceqq_u64
7729
// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7730
uint32x4_t cmp =
7731
vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7732
uint32x4_t swapped = vrev64q_u32(cmp);
7733
return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7734
#endif
7735
}
7736
7737
// Converts the four signed 16-bit integers in the lower 64 bits to four signed
7738
// 32-bit integers.
7739
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7740
{
7741
return vreinterpretq_m128i_s32(
7742
vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7743
}
7744
7745
// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
7747
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7748
{
7749
int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7750
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7751
int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7752
return vreinterpretq_m128i_s64(s64x2);
7753
}
7754
7755
// Converts the two signed 32-bit integers in the lower 64 bits to two signed
7756
// 64-bit integers.
7757
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7758
{
7759
return vreinterpretq_m128i_s64(
7760
vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7761
}
7762
7763
// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
7765
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7766
{
7767
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7768
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7769
return vreinterpretq_m128i_s16(s16x8);
7770
}
7771
7772
// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
7774
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7775
{
7776
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7777
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7778
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7779
return vreinterpretq_m128i_s32(s32x4);
7780
}
7781
7782
// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
7784
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7785
{
7786
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
7787
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7788
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7789
int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7790
return vreinterpretq_m128i_s64(s64x2);
7791
}
7792
7793
// Converts the four unsigned 16-bit integers in the lower 64 bits to four
7794
// unsigned 32-bit integers.
7795
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7796
{
7797
return vreinterpretq_m128i_u32(
7798
vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7799
}
7800
7801
// Converts the two unsigned 16-bit integers in the lower 32 bits to two
7802
// unsigned 64-bit integers.
7803
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7804
{
7805
uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7806
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7807
uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7808
return vreinterpretq_m128i_u64(u64x2);
7809
}
7810
7811
// Converts the two unsigned 32-bit integers in the lower 64 bits to two
7812
// unsigned 64-bit integers.
7813
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7814
{
7815
return vreinterpretq_m128i_u64(
7816
vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7817
}
7818
7819
// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7820
// and store the results in dst.
7821
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
7822
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7823
{
7824
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7825
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7826
return vreinterpretq_m128i_u16(u16x8);
7827
}
7828
7829
// Converts the four unsigned 8-bit integers in the lower 32 bits to four
7830
// unsigned 32-bit integers.
7831
// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
7832
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7833
{
7834
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
7835
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7836
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7837
return vreinterpretq_m128i_u32(u32x4);
7838
}
7839
7840
// Converts the two unsigned 8-bit integers in the lower 16 bits to two
7841
// unsigned 64-bit integers.
7842
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7843
{
7844
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
7845
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7846
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7847
uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7848
return vreinterpretq_m128i_u64(u64x2);
7849
}
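// Illustrative usage sketch (hypothetical values) contrasting the
// sign-extending _mm_cvtepi8_* conversions with the zero-extending
// _mm_cvtepu8_* ones:
//
//   __m128i x = _mm_setr_epi8((char) 0xFF, 0x7F, 0, 0, 0, 0, 0, 0,
//                             0, 0, 0, 0, 0, 0, 0, 0);
//   // _mm_cvtepi8_epi16(x) -> {  -1, 127, 0, 0, 0, 0, 0, 0 }
//   // _mm_cvtepu8_epi16(x) -> { 255, 127, 0, 0, 0, 0, 0, 0 }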
7850
7851
// Conditionally multiply the packed double-precision (64-bit) floating-point
7852
// elements in a and b using the high 4 bits in imm8, sum the two products, and
7853
// conditionally store the sum in dst using the low 4 bits of imm8.
7854
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
7855
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
7856
{
7857
// Generate mask value from constant immediate bit value
7858
const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7859
const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7860
#if !SSE2NEON_PRECISE_DP
7861
const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7862
const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7863
#endif
7864
// Conditional multiplication
7865
#if !SSE2NEON_PRECISE_DP
7866
__m128d mul = _mm_mul_pd(a, b);
7867
const __m128d mulMask =
7868
_mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
7869
__m128d tmp = _mm_and_pd(mul, mulMask);
7870
#else
7871
#if defined(__aarch64__)
7872
double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7873
vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7874
: 0;
7875
double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7876
vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7877
: 0;
7878
#else
7879
double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7880
double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7881
#endif
7882
__m128d tmp = _mm_set_pd(d1, d0);
7883
#endif
7884
// Sum the products
7885
#if defined(__aarch64__)
7886
double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7887
#else
7888
double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
7889
#endif
7890
// Conditionally store the sum
7891
const __m128d sumMask =
7892
_mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7893
__m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7894
return res;
7895
}
7896
7897
// Conditionally multiply the packed single-precision (32-bit) floating-point
7898
// elements in a and b using the high 4 bits in imm8, sum the four products,
7899
// and conditionally store the sum in dst using the low 4 bits of imm.
7900
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
7901
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7902
{
7903
#if defined(__aarch64__)
7904
/* shortcuts */
7905
if (imm == 0xFF) {
7906
return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7907
}
7908
if (imm == 0x7F) {
7909
float32x4_t m = _mm_mul_ps(a, b);
7910
m[3] = 0;
7911
return _mm_set1_ps(vaddvq_f32(m));
7912
}
7913
#endif
7914
7915
float s = 0, c = 0;
7916
float32x4_t f32a = vreinterpretq_f32_m128(a);
7917
float32x4_t f32b = vreinterpretq_f32_m128(b);
7918
7919
/* To improve the accuracy of floating-point summation, Kahan algorithm
7920
* is used for each operation.
7921
*/
7922
if (imm & (1 << 4))
7923
_sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7924
if (imm & (1 << 5))
7925
_sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7926
if (imm & (1 << 6))
7927
_sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7928
if (imm & (1 << 7))
7929
_sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7930
s += c;
7931
7932
float32x4_t res = {
7933
(imm & 0x1) ? s : 0,
7934
(imm & 0x2) ? s : 0,
7935
(imm & 0x4) ? s : 0,
7936
(imm & 0x8) ? s : 0,
7937
};
7938
return vreinterpretq_m128_f32(res);
7939
}
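// Illustrative usage sketch (hypothetical values): the high nibble of imm
// selects which products enter the sum, the low nibble selects which output
// lanes receive it.
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
//   __m128 r = _mm_dp_ps(a, b, 0xF1);
//   // sum = 1*10 + 2*20 + 3*30 + 4*40 = 300
//   // r   = { 300.0f, 0.0f, 0.0f, 0.0f }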
7940
7941
// Extracts the selected signed or unsigned 32-bit integer from a and zero
7942
// extends.
7943
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7944
#define _mm_extract_epi32(a, imm) \
7945
vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7946
7947
// Extracts the selected signed or unsigned 64-bit integer from a and zero
7948
// extends.
7949
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7950
#define _mm_extract_epi64(a, imm) \
7951
vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7952
7953
// Extracts the selected signed or unsigned 8-bit integer from a and zero
7954
// extends.
7955
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7956
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7957
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7958
7959
// Extracts the selected single-precision (32-bit) floating-point from a.
7960
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7961
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7962
7963
// Round the packed double-precision (64-bit) floating-point elements in a down
7964
// to an integer value, and store the results as packed double-precision
7965
// floating-point elements in dst.
7966
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7967
FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7968
{
7969
#if defined(__aarch64__)
7970
return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7971
#else
7972
double *f = (double *) &a;
7973
return _mm_set_pd(floor(f[1]), floor(f[0]));
7974
#endif
7975
}
7976
7977
// Round the packed single-precision (32-bit) floating-point elements in a down
7978
// to an integer value, and store the results as packed single-precision
7979
// floating-point elements in dst.
7980
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7981
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7982
{
7983
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7984
return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7985
#else
7986
float *f = (float *) &a;
7987
return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7988
#endif
7989
}
7990
7991
// Round the lower double-precision (64-bit) floating-point element in b down to
7992
// an integer value, store the result as a double-precision floating-point
7993
// element in the lower element of dst, and copy the upper element from a to the
7994
// upper element of dst.
7995
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7996
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7997
{
7998
return _mm_move_sd(a, _mm_floor_pd(b));
7999
}
8000
8001
// Round the lower single-precision (32-bit) floating-point element in b down to
8002
// an integer value, store the result as a single-precision floating-point
8003
// element in the lower element of dst, and copy the upper 3 packed elements
8004
// from a to the upper elements of dst.
8005
//
8006
// dst[31:0] := FLOOR(b[31:0])
8007
// dst[127:32] := a[127:32]
8008
//
8009
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
8010
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
8011
{
8012
return _mm_move_ss(a, _mm_floor_ps(b));
8013
}
8014
8015
// Inserts the least significant 32 bits of b into the selected 32-bit integer
8016
// of a.
8017
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
8018
// __constrange(0,4) int imm)
8019
#define _mm_insert_epi32(a, b, imm) \
8020
__extension__({ \
8021
vreinterpretq_m128i_s32( \
8022
vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
8023
})
8024
8025
// Inserts the least significant 64 bits of b into the selected 64-bit integer
8026
// of a.
8027
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
8028
// __constrange(0,2) int imm)
8029
#define _mm_insert_epi64(a, b, imm) \
8030
__extension__({ \
8031
vreinterpretq_m128i_s64( \
8032
vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
8033
})
8034
8035
// Inserts the least significant 8 bits of b into the selected 8-bit integer
8036
// of a.
8037
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
8038
// __constrange(0,16) int imm)
8039
#define _mm_insert_epi8(a, b, imm) \
8040
__extension__({ \
8041
vreinterpretq_m128i_s8( \
8042
vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
8043
})
8044
8045
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
8046
// element from b into tmp using the control in imm8. Store tmp to dst using
8047
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
8048
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
8049
#define _mm_insert_ps(a, b, imm8) \
8050
__extension__({ \
8051
float32x4_t tmp1 = \
8052
vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
8053
vreinterpretq_f32_m128(a), 0); \
8054
float32x4_t tmp2 = \
8055
vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
8056
((imm8 >> 4) & 0x3)); \
8057
const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
8058
((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
8059
((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
8060
((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
8061
uint32x4_t mask = vld1q_u32(data); \
8062
float32x4_t all_zeros = vdupq_n_f32(0); \
8063
\
8064
vreinterpretq_m128_f32( \
8065
vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
8066
})
8067
8068
// epi versions of min/max
8069
// Computes the pairwise maximums of the four signed 32-bit integer values of a
8070
// and b.
8071
//
8072
// A 128-bit parameter that can be defined with the following equations:
8073
// r0 := (a0 > b0) ? a0 : b0
8074
// r1 := (a1 > b1) ? a1 : b1
8075
// r2 := (a2 > b2) ? a2 : b2
8076
// r3 := (a3 > b3) ? a3 : b3
8077
//
8078
// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
8079
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
8080
{
8081
return vreinterpretq_m128i_s32(
8082
vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
8083
}
8084
8085
// Compare packed signed 8-bit integers in a and b, and store packed maximum
8086
// values in dst.
8087
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
8088
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
8089
{
8090
return vreinterpretq_m128i_s8(
8091
vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
8092
}
8093
8094
// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
8095
// values in dst.
8096
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
8097
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
8098
{
8099
return vreinterpretq_m128i_u16(
8100
vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
8101
}
8102
8103
// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
8104
// values in dst.
8105
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
8106
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
8107
{
8108
return vreinterpretq_m128i_u32(
8109
vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
8110
}
8111
8112
// Computes the pairwise minima of the four signed 32-bit integer values of a
8113
// and b.
8114
//
8115
// A 128-bit parameter that can be defined with the following equations:
8116
// r0 := (a0 < b0) ? a0 : b0
8117
// r1 := (a1 < b1) ? a1 : b1
8118
// r2 := (a2 < b2) ? a2 : b2
8119
// r3 := (a3 < b3) ? a3 : b3
8120
//
8121
// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
8122
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
8123
{
8124
return vreinterpretq_m128i_s32(
8125
vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
8126
}
8127
8128
// Compare packed signed 8-bit integers in a and b, and store packed minimum
8129
// values in dst.
8130
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
8131
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
8132
{
8133
return vreinterpretq_m128i_s8(
8134
vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
8135
}
8136
8137
// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
8138
// values in dst.
8139
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
8140
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
8141
{
8142
return vreinterpretq_m128i_u16(
8143
vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
8144
}
8145
8146
// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
8147
// values in dst.
8148
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
8149
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
8150
{
8151
return vreinterpretq_m128i_u32(
8152
vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
8153
}
8154
8155
// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
8156
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
8157
//
8158
// index[2:0] := 0
8159
// min[15:0] := a[15:0]
8160
// FOR j := 0 to 7
8161
// i := j*16
8162
// IF a[i+15:i] < min[15:0]
8163
// index[2:0] := j
8164
// min[15:0] := a[i+15:i]
8165
// FI
8166
// ENDFOR
8167
// dst[15:0] := min[15:0]
8168
// dst[18:16] := index[2:0]
8169
// dst[127:19] := 0
8170
//
8171
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
8172
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
8173
{
8174
__m128i dst;
8175
uint16_t min, idx = 0;
8176
#if defined(__aarch64__)
8177
// Find the minimum value
8178
min = vminvq_u16(vreinterpretq_u16_m128i(a));
8179
8180
// Get the index of the minimum value
8181
static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
8182
uint16x8_t minv = vdupq_n_u16(min);
8183
uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
8184
idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
8185
#else
8186
// Find the minimum value
8187
__m64 tmp;
8188
tmp = vreinterpret_m64_u16(
8189
vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
8190
vget_high_u16(vreinterpretq_u16_m128i(a))));
8191
tmp = vreinterpret_m64_u16(
8192
vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
8193
tmp = vreinterpret_m64_u16(
8194
vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
8195
min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
8196
// Get the index of the minimum value
8197
int i;
8198
for (i = 0; i < 8; i++) {
8199
if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
8200
idx = (uint16_t) i;
8201
break;
8202
}
8203
a = _mm_srli_si128(a, 2);
8204
}
8205
#endif
8206
// Generate result
8207
dst = _mm_setzero_si128();
8208
dst = vreinterpretq_m128i_u16(
8209
vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
8210
dst = vreinterpretq_m128i_u16(
8211
vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
8212
return dst;
8213
}
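// Illustrative usage sketch (hypothetical values): the minimum lands in lane
// 0, its index (lowest index wins on ties) in lane 1, and the rest of dst is
// zeroed.
//
//   __m128i a = _mm_setr_epi16(9, 4, 12, 4, 7, 8, 5, 6);
//   __m128i r = _mm_minpos_epu16(a);
//   // r = { 4, 1, 0, 0, 0, 0, 0, 0 }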
8214
8215
// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
8216
// 8-bit integers in a compared to those in b, and store the 16-bit results in
8217
// dst. Eight SADs are performed using one quadruplet from b and eight
8218
// quadruplets from a. One quadruplet is selected from b starting at the
8219
// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
8220
// integers selected from a starting at the offset specified in imm8.
8221
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
8222
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
8223
{
8224
uint8x16_t _a, _b;
8225
8226
switch (imm & 0x4) {
8227
case 0:
8228
// do nothing
8229
_a = vreinterpretq_u8_m128i(a);
8230
break;
8231
case 4:
8232
_a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
8233
vreinterpretq_u32_m128i(a), 1));
8234
break;
8235
default:
8236
#if defined(__GNUC__) || defined(__clang__)
8237
__builtin_unreachable();
8238
#endif
8239
break;
8240
}
8241
8242
switch (imm & 0x3) {
8243
case 0:
8244
_b = vreinterpretq_u8_u32(
8245
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
8246
break;
8247
case 1:
8248
_b = vreinterpretq_u8_u32(
8249
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
8250
break;
8251
case 2:
8252
_b = vreinterpretq_u8_u32(
8253
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
8254
break;
8255
case 3:
8256
_b = vreinterpretq_u8_u32(
8257
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
8258
break;
8259
default:
8260
#if defined(__GNUC__) || defined(__clang__)
8261
__builtin_unreachable();
8262
#endif
8263
break;
8264
}
8265
8266
int16x8_t c04, c15, c26, c37;
8267
uint8x8_t low_b = vget_low_u8(_b);
8268
c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
8269
uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
8270
c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
8271
uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
8272
c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
8273
uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
8274
c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
8275
#if defined(__aarch64__)
8276
// |0|4|2|6|
8277
c04 = vpaddq_s16(c04, c26);
8278
// |1|5|3|7|
8279
c15 = vpaddq_s16(c15, c37);
8280
8281
int32x4_t trn1_c =
8282
vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8283
int32x4_t trn2_c =
8284
vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8285
return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
8286
vreinterpretq_s16_s32(trn2_c)));
8287
#else
8288
int16x4_t c01, c23, c45, c67;
8289
c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8290
c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8291
c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8292
c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8293
8294
return vreinterpretq_m128i_s16(
8295
vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
8296
#endif
8297
}
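// Illustrative usage sketch (hypothetical values) with imm = 0, so the
// quadruplet b[0..3] is compared against the eight overlapping quadruplets
// a[j..j+3]:
//
//   __m128i a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                             8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i b = _mm_setzero_si128();
//   __m128i r = _mm_mpsadbw_epu8(a, b, 0);
//   // dst[j] = |a[j]-0| + |a[j+1]-0| + |a[j+2]-0| + |a[j+3]-0|
//   // r = { 6, 10, 14, 18, 22, 26, 30, 34 }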
8298
8299
// Multiply the low signed 32-bit integers from each packed 64-bit element in
8300
// a and b, and store the signed 64-bit results in dst.
8301
//
8302
// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
8303
// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
8304
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
8305
{
8306
// vmull_s32 upcasts instead of masking, so we downcast.
8307
int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
8308
int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
8309
return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
8310
}
8311
8312
// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
8313
// unsigned 32-bit integers from b.
8314
// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
8315
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
8316
{
8317
return vreinterpretq_m128i_s32(
8318
vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
8319
}
8320
8321
// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
8322
// integers and saturates.
8323
//
8324
// r0 := UnsignedSaturate(a0)
8325
// r1 := UnsignedSaturate(a1)
8326
// r2 := UnsignedSaturate(a2)
8327
// r3 := UnsignedSaturate(a3)
8328
// r4 := UnsignedSaturate(b0)
8329
// r5 := UnsignedSaturate(b1)
8330
// r6 := UnsignedSaturate(b2)
8331
// r7 := UnsignedSaturate(b3)
8332
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
8333
{
8334
return vreinterpretq_m128i_u16(
8335
vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
8336
vqmovun_s32(vreinterpretq_s32_m128i(b))));
8337
}
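// Illustrative usage sketch (hypothetical values) showing the unsigned
// saturation applied to the signed 32-bit inputs:
//
//   __m128i a = _mm_setr_epi32(-1, 70000, 5, 65535);
//   __m128i b = _mm_setr_epi32(0, 1, 2, 3);
//   __m128i r = _mm_packus_epi32(a, b);
//   // r = { 0, 65535, 5, 65535, 0, 1, 2, 3 }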
8338
8339
// Round the packed double-precision (64-bit) floating-point elements in a using
8340
// the rounding parameter, and store the results as packed double-precision
8341
// floating-point elements in dst.
8342
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
8343
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
8344
{
8345
#if defined(__aarch64__)
8346
switch (rounding) {
8347
case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8348
return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
8349
case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8350
return _mm_floor_pd(a);
8351
case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8352
return _mm_ceil_pd(a);
8353
case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8354
return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
8355
default: //_MM_FROUND_CUR_DIRECTION
8356
return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
8357
}
8358
#else
8359
double *v_double = (double *) &a;
8360
8361
if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8362
(rounding == _MM_FROUND_CUR_DIRECTION &&
8363
_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
8364
double res[2], tmp;
8365
for (int i = 0; i < 2; i++) {
8366
tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
8367
double roundDown = floor(tmp); // Round down value
8368
double roundUp = ceil(tmp); // Round up value
8369
double diffDown = tmp - roundDown;
8370
double diffUp = roundUp - tmp;
8371
if (diffDown < diffUp) {
8372
/* If it's closer to the round down value, then use it */
8373
res[i] = roundDown;
8374
} else if (diffDown > diffUp) {
8375
/* If it's closer to the round up value, then use it */
8376
res[i] = roundUp;
8377
} else {
8378
/* If it's equidistant between round up and round down value,
8379
* pick the one which is an even number */
8380
double half = roundDown / 2;
8381
if (half != floor(half)) {
8382
/* If the round down value is odd, return the round up value
8383
*/
8384
res[i] = roundUp;
8385
} else {
8386
/* If the round up value is odd, return the round down value
8387
*/
8388
res[i] = roundDown;
8389
}
8390
}
8391
res[i] = (v_double[i] < 0) ? -res[i] : res[i];
8392
}
8393
return _mm_set_pd(res[1], res[0]);
8394
} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8395
(rounding == _MM_FROUND_CUR_DIRECTION &&
8396
_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
8397
return _mm_floor_pd(a);
8398
} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8399
(rounding == _MM_FROUND_CUR_DIRECTION &&
8400
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
8401
return _mm_ceil_pd(a);
8402
}
8403
return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
8404
v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
8405
#endif
8406
}
8407
8408
// Round the packed single-precision (32-bit) floating-point elements in a using
8409
// the rounding parameter, and store the results as packed single-precision
8410
// floating-point elements in dst.
8411
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
8412
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
8413
{
8414
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
8415
switch (rounding) {
8416
case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8417
return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
8418
case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8419
return _mm_floor_ps(a);
8420
case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8421
return _mm_ceil_ps(a);
8422
case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8423
return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
8424
default: //_MM_FROUND_CUR_DIRECTION
8425
return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
8426
}
8427
#else
8428
float *v_float = (float *) &a;
8429
8430
if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8431
(rounding == _MM_FROUND_CUR_DIRECTION &&
8432
_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
8433
uint32x4_t signmask = vdupq_n_u32(0x80000000);
8434
float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
8435
vdupq_n_f32(0.5f)); /* +/- 0.5 */
8436
int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
8437
vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
8438
int32x4_t r_trunc = vcvtq_s32_f32(
8439
vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
8440
int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
8441
vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
8442
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
8443
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
8444
float32x4_t delta = vsubq_f32(
8445
vreinterpretq_f32_m128(a),
8446
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
8447
uint32x4_t is_delta_half =
8448
vceqq_f32(delta, half); /* delta == +/- 0.5 */
8449
return vreinterpretq_m128_f32(
8450
vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
8451
} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8452
(rounding == _MM_FROUND_CUR_DIRECTION &&
8453
_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
8454
return _mm_floor_ps(a);
8455
} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8456
(rounding == _MM_FROUND_CUR_DIRECTION &&
8457
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
8458
return _mm_ceil_ps(a);
8459
}
8460
return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
8461
v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
8462
v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
8463
v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
8464
#endif
8465
}
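// Illustrative usage sketch (hypothetical values) of two of the supported
// rounding modes; round-to-nearest breaks ties to even, matching x86:
//
//   __m128 a = _mm_setr_ps(2.5f, -2.5f, 1.3f, -1.7f);
//   // _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
//   //   -> { 2.0f, -2.0f, 1.0f, -2.0f }
//   // _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
//   //   -> { 2.0f, -2.0f, 1.0f, -1.0f }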
8466
8467
// Round the lower double-precision (64-bit) floating-point element in b using
8468
// the rounding parameter, store the result as a double-precision floating-point
8469
// element in the lower element of dst, and copy the upper element from a to the
8470
// upper element of dst.
8471
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
8472
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
8473
{
8474
return _mm_move_sd(a, _mm_round_pd(b, rounding));
8475
}
8476
8477
// Round the lower single-precision (32-bit) floating-point element in b using
8478
// the rounding parameter, store the result as a single-precision floating-point
8479
// element in the lower element of dst, and copy the upper 3 packed elements
8480
// from a to the upper elements of dst. Rounding is done according to the
8481
// rounding[3:0] parameter, which can be one of:
8482
// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
8483
// suppress exceptions
8484
// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
8485
// suppress exceptions
8486
// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
8487
// exceptions
8488
// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
8489
// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
8490
// _MM_SET_ROUNDING_MODE
8491
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
8492
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
8493
{
8494
return _mm_move_ss(a, _mm_round_ps(b, rounding));
8495
}
8496
8497
// Load 128-bits of integer data from memory into dst using a non-temporal
8498
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
8499
// general-protection exception may be generated.
8500
//
8501
// dst[127:0] := MEM[mem_addr+127:mem_addr]
8502
//
8503
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
8504
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
8505
{
8506
#if __has_builtin(__builtin_nontemporal_load)
8507
return __builtin_nontemporal_load(p);
8508
#else
8509
return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
8510
#endif
8511
}
8512
8513
// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
// all 1's, and return 1 if the result is zero, otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
FORCE_INLINE int _mm_test_all_ones(__m128i a)
{
    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
           ~(uint64_t) 0;
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and return 1 if the result is zero, otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
{
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
{
    uint64x2_t zf =
        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
    uint64x2_t cf =
        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
    uint64x2_t result = vandq_u64(zf, cf);
    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the CF value.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the ZF value.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}
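
// Usage sketch (illustrative only, not part of the translation layer): the
// ZF/CF-style predicates above test a whole register at once. `v` and `mask`
// below are hypothetical caller values.
//
//     __m128i v    = _mm_set_epi32(0, 0, 0, 0);
//     __m128i mask = _mm_set1_epi8((char) 0xff);
//     int all_zero = _mm_testz_si128(v, v);    /* 1: v AND v == 0        */
//     int all_ones = _mm_test_all_ones(mask);  /* 1: every bit set       */
//     int covered  = _mm_testc_si128(mask, v); /* 1: (~mask AND v) == 0  */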

/* SSE4.2 */

const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};

/* specify the source data format */
#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */

/* specify the comparison operation */
#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */

/* specify the polarity */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
#define _SIDD_MASKED_NEGATIVE_POLARITY \
    0x30 /* negate results only before end of string */

/* specify the output selection in _mm_cmpXstri */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* specify the output selection in _mm_cmpXstrm */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
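
// Illustration of how the _SIDD_* flags compose into the imm8 control word
// consumed by the _mm_cmp{e,i}str* intrinsics below (an informal example, not
// part of the API): bits 0-1 pick the character type, bits 2-3 the comparison,
// bits 4-5 the polarity, and bit 6 the output selection.
//
//     enum {
//         EXAMPLE_ANY = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
//                       _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT,
//         EXAMPLE_NOT_IN_SET = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
//                              _SIDD_MASKED_NEGATIVE_POLARITY
//     };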

/* Pattern Matching for C macros.
 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
 */

/* catenate */
#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)

#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
/* run the 2nd parameter */
#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
/* run the 1st parameter */
#define SSE2NEON_IIF_1(t, ...) t

#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
#define SSE2NEON_COMPL_0 1
#define SSE2NEON_COMPL_1 0

#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
#define SSE2NEON_DEC_1 0
#define SSE2NEON_DEC_2 1
#define SSE2NEON_DEC_3 2
#define SSE2NEON_DEC_4 3
#define SSE2NEON_DEC_5 4
#define SSE2NEON_DEC_6 5
#define SSE2NEON_DEC_7 6
#define SSE2NEON_DEC_8 7
#define SSE2NEON_DEC_9 8
#define SSE2NEON_DEC_10 9
#define SSE2NEON_DEC_11 10
#define SSE2NEON_DEC_12 11
#define SSE2NEON_DEC_13 12
#define SSE2NEON_DEC_14 13
#define SSE2NEON_DEC_15 14
#define SSE2NEON_DEC_16 15

/* detection */
#define SSE2NEON_CHECK_N(x, n, ...) n
#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
#define SSE2NEON_PROBE(x) x, 1,

#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)

#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))

#define SSE2NEON_EAT(...)
#define SSE2NEON_EXPAND(...) __VA_ARGS__
#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)

/* recursion */
/* deferred expression */
#define SSE2NEON_EMPTY()
#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
#define SSE2NEON_EXPAND(...) __VA_ARGS__

#define SSE2NEON_EVAL(...) \
    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
#define SSE2NEON_EVAL1(...) \
    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
#define SSE2NEON_EVAL2(...) \
    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
#define SSE2NEON_EVAL3(...) __VA_ARGS__

#define SSE2NEON_REPEAT(count, macro, ...)                         \
    SSE2NEON_WHEN(count)                                           \
    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
        SSE2NEON_DEC(count), macro,                                \
        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
                                              __VA_ARGS__))
#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
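
// Informal illustration of the Cloak-style recursion above: once driven
// through SSE2NEON_EVAL, SSE2NEON_REPEAT(count, macro, ...) expands to
// macro(0, ...) macro(1, ...) ... macro(count - 1, ...). With a hypothetical
// macro M:
//
//     #define M(i, t) t##_##i;
//     /* SSE2NEON_EVAL(SSE2NEON_REPEAT(3, M, x)) expands to: x_0; x_1; x_2; */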

#define SSE2NEON_SIZE_OF_byte 8
#define SSE2NEON_NUMBER_OF_LANES_byte 16
#define SSE2NEON_SIZE_OF_word 16
#define SSE2NEON_NUMBER_OF_LANES_word 8

#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                         \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
        vreinterpretq_##type##_m128i(a)));

#define SSE2NEON_FILL_LANE(i, type) \
    vec_b[i] =                      \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));

#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
                       number_of_lanes, byte_or_word)                         \
    do {                                                                      \
        SSE2NEON_CAT(                                                         \
            data_type_prefix,                                                 \
            SSE2NEON_CAT(size,                                                \
                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
        vec_b[number_of_lanes];                                               \
        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
                                      SSE2NEON_CAT(type_prefix, size)))       \
        for (int i = 0; i < number_of_lanes; i++) {                           \
            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
                SSE2NEON_CAT(vreinterpretq_u,                                 \
                             SSE2NEON_CAT(size, _m128i))(mask),               \
                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
                    vec_b[i],                                                 \
                    SSE2NEON_CAT(                                             \
                        vreinterpretq_,                                       \
                        SSE2NEON_CAT(type_prefix,                             \
                                     SSE2NEON_CAT(size, _m128i(a))))),        \
                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
                    vec_b[i],                                                 \
                    SSE2NEON_CAT(                                             \
                        vreinterpretq_,                                       \
                        SSE2NEON_CAT(type_prefix,                             \
                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
        }                                                                     \
    } while (0)

#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                          \
    do {                                                                      \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                        \
                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE,  \
                                      SSE2NEON_CAT(u, size)))                 \
    } while (0)

#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
                                                int lb)                       \
    {                                                                         \
        __m128i mtx[16];                                                      \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
        return SSE2NEON_CAT(                                                  \
            _sse2neon_aggregate_equal_any_,                                   \
            SSE2NEON_CAT(                                                     \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
                                             type))))(la, lb, mtx);           \
    }

#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)           \
    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la,           \
                                                 __m128i b, int lb)           \
    {                                                                         \
        __m128i mtx[16];                                                      \
        PCMPSTR_RANGES(                                                       \
            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),  \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);     \
        return SSE2NEON_CAT(                                                  \
            _sse2neon_aggregate_ranges_,                                      \
            SSE2NEON_CAT(                                                     \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
                                             type))))(la, lb, mtx);           \
    }

#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                 \
    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,        \
                                                    __m128i b, int lb)        \
    {                                                                         \
        __m128i mtx[16];                                                      \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
        return SSE2NEON_CAT(                                                  \
            _sse2neon_aggregate_equal_ordered_,                               \
            SSE2NEON_CAT(                                                     \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
                SSE2NEON_CAT(x,                                               \
                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);      \
    }

static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)

static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        __m128i tmp = vreinterpretq_m128i_u32(
            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                       vreinterpretq_u32_m128i(tmp));
#if defined(__aarch64__)
        int t = vaddvq_u32(vec_res) ? 1 : 0;
#else
        uint64x2_t sumh = vpaddlq_u32(vec_res);
        /* reduce to a single 0/1 flag, matching the AArch64 branch above */
        int t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0;
#endif
        res |= (t << j);
    }
    return res;
}

static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        __m128i tmp = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                       vreinterpretq_u16_m128i(tmp));
        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
        res |= (t << j);
    }
    return res;
}

#define SSE2NEON_CMP_RANGES_IS_BYTE 1
#define SSE2NEON_CMP_RANGES_IS_WORD 0

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
    prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
    prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
    prefix##IMPL(word, uint, u, prefix##IS_WORD) \
    prefix##IMPL(word, int, s, prefix##IS_WORD)
/* clang-format on */

SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)

#undef SSE2NEON_CMP_RANGES_IS_BYTE
#undef SSE2NEON_CMP_RANGES_IS_WORD

static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint8x16_t mtx =
        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x10000 - (1 << la);
    int tb = 0x10000 - (1 << lb);
    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);

    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
    res_lo = vand_u8(res_lo, vec_mask);
    res_hi = vand_u8(res_hi, vec_mask);

    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
    return res;
}

static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint16x8_t mtx =
        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x100 - (1 << la);
    int tb = 0x100 - (1 << lb);
    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
    mtx = vbslq_u16(vec1, tmp, mtx);
    mtx = vandq_u16(mtx, vec_mask);
    return _sse2neon_vaddvq_u16(mtx);
}

#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0

#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
        int bound, int la, int lb, __m128i mtx[16])                            \
    {                                                                          \
        int res = 0;                                                           \
        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
        for (int j = 0; j < lb; j++) {                                         \
            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
        }                                                                      \
        for (int j = lb; j < bound; j++) {                                     \
            mtx[j] = vreinterpretq_m128i_u##size(                              \
                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
        }                                                                      \
        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
        for (int i = 0; i < bound; i++) {                                      \
            int val = 1;                                                       \
            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
                val &= ptr[k * bound + j];                                     \
            res += val << i;                                                   \
        }                                                                      \
        return res;                                                            \
    }

/* clang-format off */
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
    prefix##IMPL(8, 16, prefix##IS_UBYTE) \
    prefix##IMPL(16, 8, prefix##IS_UWORD)
/* clang-format on */

SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)

#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)

#define SSE2NEON_CMPESTR_LIST                          \
    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)

enum {
#define _(name, func_suffix) name,
    SSE2NEON_CMPESTR_LIST
#undef _
};
typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
#define _(name, func_suffix) _sse2neon_##func_suffix,
    SSE2NEON_CMPESTR_LIST
#undef _
};

FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
{
    switch (imm8 & 0x30) {
    case _SIDD_NEGATIVE_POLARITY:
        res ^= 0xffffffff;
        break;
    case _SIDD_MASKED_NEGATIVE_POLARITY:
        res ^= (1 << lb) - 1;
        break;
    default:
        break;
    }

    return res & ((bound == 8) ? 0xFF : 0xFFFF);
}

FORCE_INLINE int _sse2neon_clz(unsigned int x)
{
#if _MSC_VER
    /* _BitScanReverse reports the index of the highest set bit, so the
     * number of leading zeros is 31 minus that index. */
    DWORD cnt = 0;
    if (_BitScanReverse(&cnt, x))
        return 31 - cnt;
    return 32;
#else
    return x != 0 ? __builtin_clz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctz(unsigned int x)
{
#if _MSC_VER
    /* _BitScanForward reports the index of the lowest set bit, which is
     * exactly the number of trailing zeros. */
    DWORD cnt = 0;
    if (_BitScanForward(&cnt, x))
        return cnt;
    return 32;
#else
    return x != 0 ? __builtin_ctz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
{
#if _MSC_VER
    unsigned long cnt;
#if defined(SSE2NEON_HAS_BITSCAN64) && \
    (defined(_M_AMD64) || defined(__x86_64__))
    if (_BitScanForward64(&cnt, x))
        return (int) (cnt);
#else
    if (_BitScanForward(&cnt, (unsigned long) (x)))
        return (int) cnt;
    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
        return (int) (cnt + 32);
#endif
    return 64;
#else
    return x != 0 ? __builtin_ctzll(x) : 64;
#endif
}

#define SSE2NEON_MIN(x, y) ((x) < (y) ? (x) : (y))

#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
    const int var = (imm & 0x01) ? 8 : 16

#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
    int tmp1 = la ^ (la >> 31);                  \
    la = tmp1 - (la >> 31);                      \
    int tmp2 = lb ^ (lb >> 31);                  \
    lb = tmp2 - (lb >> 31);                      \
    la = SSE2NEON_MIN(la, bound);                \
    lb = SSE2NEON_MIN(lb, bound)
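
// The XOR/subtract pairs above are a branchless absolute value: for a 32-bit
// int, `x >> 31` is 0 or -1 (the code relies on arithmetic right shift), so
// `x ^ (x >> 31)` flips the bits when x is negative and the final subtraction
// completes the two's-complement negation. Worked example (informal):
//
//     la = -3:  la >> 31 == -1,  -3 ^ -1 == 2,  2 - (-1) == 3
//     la =  5:  la >> 31 ==  0,   5 ^  0 == 5,  5 -   0  == 5
//
// This matches PCMPESTR*, which uses |la| and |lb| clamped to the lane count.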

// Compare all pairs of characters in strings a and b,
// then aggregate the result.
// As the only difference between PCMPESTR* and PCMPISTR* is the way the
// length of each string is determined, we use SSE2NEON_CMP{I,E}STRX_LEN_PAIR
// to get the lengths of strings a and b.
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)

#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)            \
    return (r2 == 0) ? bound                                       \
                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2))   \
                                      : _sse2neon_ctz(r2))

#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
    if (imm8 & 0x40) {                                                         \
        if (bound == 8) {                                                      \
            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
        } else {                                                               \
            uint8x16_t vec_r2 =                                                \
                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
            uint8x16_t tmp =                                                   \
                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
            dst = vreinterpretq_m128i_u8(                                      \
                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
        }                                                                      \
    } else {                                                                   \
        if (bound == 16) {                                                     \
            dst = vreinterpretq_m128i_u16(                                     \
                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
        } else {                                                               \
            dst = vreinterpretq_m128i_u8(                                      \
                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
        }                                                                      \
    }                                                                          \
    return dst

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and returns 1 if b did not contain a null character and the
// resulting mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
FORCE_INLINE int _mm_cmpestra(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    int lb_cpy = lb;
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return !r2 & (lb_cpy > bound);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
FORCE_INLINE int _mm_cmpestrc(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 != 0;
}

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
FORCE_INLINE int _mm_cmpestri(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
FORCE_INLINE __m128i
_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
FORCE_INLINE int _mm_cmpestro(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 & 1;
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
FORCE_INLINE int _mm_cmpestrs(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    return la <= (bound - 1);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
FORCE_INLINE int _mm_cmpestrz(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    return lb <= (bound - 1);
}

#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
    do {                                                                 \
        if (imm8 & 0x01) {                                               \
            uint16x8_t equal_mask_##str =                                \
                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
            uint64_t matches_##str =                                     \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
        } else {                                                         \
            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
            uint64_t matches_##str =                                     \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
        }                                                                \
    } while (0)

#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
    int la, lb;                                  \
    do {                                         \
        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
    } while (0)

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if b did not contain a null character and the resulting
// mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return !r2 & (lb >= bound);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 != 0;
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 & 1;
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int la;
    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
    return la <= (bound - 1);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int lb;
    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
    return lb <= (bound - 1);
}
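
// Usage sketch (illustrative only; names are hypothetical and both pointers
// are assumed to reference at least 16 readable bytes): a strchr/strpbrk-like
// scan with _mm_cmpistri. It returns the index of the first byte of `chunk16`
// that occurs in `set16`, or 16 if none is found before the terminating null.
//
//     static int example_find_any(const char *chunk16, const char *set16)
//     {
//         __m128i chunk = _mm_loadu_si128((const __m128i *) chunk16);
//         __m128i set = _mm_loadu_si128((const __m128i *) set16);
//         return _mm_cmpistri(set, chunk,
//                             _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
//                                 _SIDD_LEAST_SIGNIFICANT);
//     }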

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}
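
// Note on the ARMv7-A path above (informal): vqsubq_s64 computes the
// saturating difference b - a, which is negative exactly when a > b, and
// saturation keeps a genuine overflow from flipping the sign. The arithmetic
// right shift by 63 then smears the sign bit across the lane, producing the
// all-ones/all-zeros mask that SSE's PCMPGTQ returns. For example, with a = 5
// and b = 3 in a lane: 3 - 5 = -2, and -2 >> 63 = -1 (all bits set).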

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
    crc = __crc32ch(crc, v);
#else
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
    crc = __crc32cw(crc, v);
#else
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
    crc = __crc32cb(crc, v);
#else
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}
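
// Usage sketch (illustrative only; the function name and parameters are
// hypothetical): computing a CRC-32C checksum over a byte buffer with the
// accumulator intrinsics above, using the customary ~0 pre- and
// post-conditioning.
//
//     static uint32_t example_crc32c(const uint8_t *buf, size_t len)
//     {
//         uint32_t crc = 0xffffffff;          /* standard CRC-32C seed */
//         for (size_t i = 0; i < len; i++)
//             crc = _mm_crc32_u8(crc, buf[i]);
//         return ~crc;                        /* final inversion */
//     }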

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_SBOX(w) \
{ \
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
w(0xb0), w(0x54), w(0xbb), w(0x16) \
}
#define SSE2NEON_AES_RSBOX(w) \
{ \
w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
w(0x55), w(0x21), w(0x0c), w(0x7d) \
}
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

/* x_time function and matrix multiply function */
#if !defined(__aarch64__)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define SSE2NEON_MULTIPLY(x, y)                                  \
    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif

// In the absence of crypto extensions, implement aesenc using regular neon
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // Here, we split the whole 256-byte S-box into four 64-byte tables and
    // look them up one after another. After each lookup we load the table
    // located at the next 64-byte offset; because the indices into that table
    // are correspondingly smaller, the index parameter of `vqtbx4q_u8()` has
    // the same constant subtracted as the table's offset.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // this generates a table containing every possible permutation of
    // shift_rows() and sub_bytes() with mix_columns().
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // finish the modulo addition step in mix_columns()
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // inverse mix columns
    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimize for NEON */
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    // inverse mix columns
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A implementation */
    uint8_t v[16] = {
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
    };

    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimize for NEON */
    uint8_t v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (int i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
#if defined(__aarch64__)
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
    uint8x16_t v = vreinterpretq_u8_m128i(a);
    uint8x16_t w;

    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    return vreinterpretq_m128i_u8(w);

#else /* ARMv7-A NEON implementation */
    uint8_t i, e, f, g, h, v[4][4];
    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
#endif
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
#if defined(__aarch64__)
    uint8x16_t _a = vreinterpretq_u8_m128i(a);
    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

    uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
    uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
    uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
                             vreinterpretq_u32_u8(v));
    uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
    uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

    return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));

#else /* ARMv7-A NEON implementation */
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if !defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
#endif

#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
// unfortunately means an additional xor op; the compiler should be able to
// optimize this away for repeated calls however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(
               vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
           vreinterpretq_u8_m128i(RoundKey);
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
#endif
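
// Usage sketch (illustrative only; `round_keys` is a hypothetical, already
// expanded AES-128 key schedule of 11 round keys): both the crypto-extension
// and the fallback implementations above plug into the usual aesenc loop
// unchanged.
//
//     static __m128i example_aes128_encrypt_block(__m128i block,
//                                                 const __m128i round_keys[11])
//     {
//         block = _mm_xor_si128(block, round_keys[0]); /* initial whitening */
//         for (int i = 1; i < 10; i++)
//             block = _mm_aesenc_si128(block, round_keys[i]);
//         return _mm_aesenclast_si128(block, round_keys[10]); /* final round */
//     }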

/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
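
// Note (informal): as with PCLMULQDQ, bit 0 of imm selects the half of a
// (0 = low 64 bits, 1 = high) and bit 4 selects the half of b. A common
// GHASH/CRC-folding idiom multiplies "low x low" and "high x high"; `x` and
// `k` below are hypothetical caller operands.
//
//     __m128i lo = _mm_clmulepi64_si128(x, k, 0x00); /* x[63:0]   * k[63:0]   */
//     __m128i hi = _mm_clmulepi64_si128(x, k, 0x11); /* x[127:64] * k[127:64] */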

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}
9990
9991
// Count the number of bits set to 1 in unsigned 32-bit integer a, and
9992
// return that count in dst.
9993
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
9994
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
9995
{
9996
#if defined(__aarch64__)
9997
#if __has_builtin(__builtin_popcount)
9998
return __builtin_popcount(a);
9999
#else
10000
return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
10001
#endif
10002
#else
10003
uint32_t count = 0;
10004
uint8x8_t input_val, count8x8_val;
10005
uint16x4_t count16x4_val;
10006
uint32x2_t count32x2_val;
10007
10008
input_val = vld1_u8((uint8_t *) &a);
10009
count8x8_val = vcnt_u8(input_val);
10010
count16x4_val = vpaddl_u8(count8x8_val);
10011
count32x2_val = vpaddl_u16(count16x4_val);
10012
10013
vst1_u32(&count, count32x2_val);
10014
return count;
10015
#endif
10016
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}
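
/* Illustrative usage sketch (not part of the sse2neon API): population count
 * via the shims above. The wrapper name and sample values are just examples.
 */
#if 0
static void sse2neon_example_popcnt(void)
{
    int bits32 = _mm_popcnt_u32(0xF0F0F0F0u);                /* -> 16 */
    int64_t bits64 = _mm_popcnt_u64(0xFFFFFFFFFFFFFFFFull);  /* -> 64 */
    (void) bits32;
    (void) bits64;
}
#endif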

FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__)
    __asm__ __volatile__("msr FPCR, %0" ::"r"(r.value)); /* write */
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r.value)); /* write */
#endif
}
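
/* Illustrative usage sketch (not part of the sse2neon API): enabling
 * denormals-are-zero behaviour around a numerically sensitive kernel and
 * restoring the previous mode afterwards. The wrapper name is hypothetical.
 */
#if 0
static void sse2neon_example_denormals_zero(void)
{
    unsigned int saved = _sse2neon_mm_get_denormals_zero_mode();
    _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
    /* ... run code whose denormal operands should be flushed to zero ... */
    _sse2neon_mm_set_denormals_zero_mode(saved);
}
#endif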

// Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc

FORCE_INLINE uint64_t _rdtsc(void)
{
#if defined(__aarch64__)
    uint64_t val;

    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
     * system counter is at least 56 bits wide; from Armv8.6, the counter
     * must be 64 bits wide. So the system counter could be less than 64
     * bits wide, which is indicated by the 'cap_user_time_short' flag
     * being set.
     */
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));

    return val;
#else
    uint32_t pmccntr, pmuseren, pmcntenset;
    // Read the user mode Performance Monitoring Unit (PMU)
    // User Enable Register (PMUSERENR) access permissions.
    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
        if (pmcntenset & 0x80000000UL) {  // Is it counting?
            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
            // The counter is set up to count every 64th cycle
            return (uint64_t) (pmccntr) << 6;
        }
    }

    // Fall back to gettimeofday() since we cannot enable PMUSERENR from
    // user mode.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}
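
/* Illustrative usage sketch (not part of the sse2neon API): measuring a rough
 * tick delta. Note the tick source differs between AArch64 (CNTVCT_EL0),
 * AArch32 with PMU access (PMCCNTR scaled by 64) and the gettimeofday()
 * fallback, so only compare deltas taken on the same platform. The wrapper
 * name is hypothetical.
 */
#if 0
static uint64_t sse2neon_example_measure_ticks(void (*fn)(void))
{
    uint64_t start = _rdtsc();
    fn();
    return _rdtsc() - start;
}
#endif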

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif