Path: blob/master/thirdparty/embree/common/simd/arm/sse2neon.h
#ifndef SSE2NEON_H
#define SSE2NEON_H

// This header file provides a simple API translation layer
// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
//
// Contributors to this work are:
//   John W. Ratcliff <[email protected]>
//   Brandon Rowlett <[email protected]>
//   Ken Fast <[email protected]>
//   Eric van Beurden <[email protected]>
//   Alexander Potylitsin <[email protected]>
//   Hasindu Gamaarachchi <[email protected]>
//   Jim Huang <[email protected]>
//   Mark Cheng <[email protected]>
//   Malcolm James MacLeod <[email protected]>
//   Devin Hussey (easyaspi314) <[email protected]>
//   Sebastian Pop <[email protected]>
//   Developer Ecosystem Engineering <[email protected]>
//   Danila Kutenin <[email protected]>
//   François Turban (JishinMaster) <[email protected]>
//   Pei-Hsuan Hung <[email protected]>
//   Yang-Hao Yuan <[email protected]>
//   Syoyo Fujita <[email protected]>
//   Brecht Van Lommel <[email protected]>
//   Jonathan Hue <[email protected]>
//   Cuda Chen <[email protected]>
//   Aymen Qader <[email protected]>

/*
 * sse2neon is freely redistributable under the MIT License.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Tunable configurations */

/* Enable precise implementation of math operations
 * This would slow down the computation a bit, but gives consistent result with
 * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
 */
/* _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif

/* compiler specific definitions */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#else /* non-GNU / non-clang compilers */
#warning "Macro name collisions may happen with unsupported compiler."
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#endif

/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <stdint.h>
#include <stdlib.h>

#if defined(_WIN32) && !defined(__MINGW32__)
/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
 * from both MinGW-w64 and MSVC.
 */
#define SSE2NEON_ALLOC_DEFINED
#endif

/* If using MSVC */
#ifdef _MSC_VER
#include <intrin.h>
#if (defined(_M_AMD64) || defined(__x86_64__)) || \
    (defined(_M_ARM) || defined(__arm__))
#define SSE2NEON_HAS_BITSCAN64
#endif
#endif

/* Compiler barrier */
#define SSE2NEON_BARRIER()                     \
    do {                                       \
        __asm__ __volatile__("" ::: "memory"); \
        (void) 0;                              \
    } while (0)

/* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
 * semantics.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif

FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
    /* FIXME: MSVC support */
#endif
}

/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__)
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif

#include <arm_neon.h>
#if !defined(__aarch64__) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif

/* Apple Silicon cache lines are twice the size commonly used by Intel, AMD
 * and other Arm microarchitectures.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif

/* Rounding functions require either Aarch64 instructions or a libm fallback */
#if !defined(__aarch64__)
#include <math.h>
#endif

/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or even not accessible in user mode.
 * To write to or read these registers in user mode,
 * we have to perform a syscall instead.
 */
#if !defined(__aarch64__)
#include <sys/time.h>
#endif

/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1

// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif

#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif

/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 in the result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
#if defined(__aarch64__)
#define _MN_SHUFFLE(fp3, fp2, fp1, fp0)                                       \
    ((uint8x16_t){(((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), \
                  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), \
                  (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), \
                  (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3)})
#define _MF_SHUFFLE(fp3, fp2, fp1, fp0)                                       \
    ((uint8x16_t){(((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), \
                  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), \
                  (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2),       \
                  (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1),       \
                  (((fp0)*4)+16+2), (((fp0)*4)+16+3)})
#endif

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
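
/* As a usage sketch (illustrative values only): _MM_SHUFFLE packs four 2-bit
 * lane selectors into a single immediate, e.g.
 *   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4   // identity selection
 *   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B   // reverse the four lanes
 * so a call such as
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 * keeps a's two low lanes and b's two high lanes in their original positions.
 */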
#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
    __builtin_shufflevector(a, b, __VA_ARGS__)
#elif __has_builtin(__builtin_shuffle)
#define _sse2neon_shuffle(type, a, b, ...) \
    __extension__({                        \
        type tmp = {__VA_ARGS__};          \
        __builtin_shuffle(a, b, tmp);      \
    })
#endif

#ifdef _sse2neon_shuffle
#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
#endif

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000
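
/* Usage sketch for the control constants above (illustrative only): the
 * _MM_ROUND_* and _MM_FLUSH_ZERO_* values are passed to the mode accessors,
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 * while the _MM_FROUND_* values select per-call rounding, e.g.
 *   __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */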
An integer vector type can contain any type311* of integer, from chars to shorts to unsigned long longs.312*/313typedef int64x1_t __m64;314typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */315// On ARM 32-bit architecture, the float64x2_t is not supported.316// The data type __m128d should be represented in a different way for related317// intrinsic conversion.318#if defined(__aarch64__)319typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */320#else321typedef float32x4_t __m128d;322#endif323typedef int64x2_t __m128i; /* 128-bit vector containing integers */324325// __int64 is defined in the Intrinsics Guide which maps to different datatype326// in different data model327#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))328#if (defined(__x86_64__) || defined(__i386__))329#define __int64 long long330#else331#define __int64 int64_t332#endif333#endif334335/* type-safe casting between types */336337#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)338#define vreinterpretq_m128_f32(x) (x)339#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)340341#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)342#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)343#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)344#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)345346#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)347#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)348#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)349#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)350351#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)352#define vreinterpretq_f32_m128(x) (x)353#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)354355#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)356#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)357#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)358#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)359360#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)361#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)362#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)363#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)364365#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)366#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)367#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)368#define vreinterpretq_m128i_s64(x) (x)369370#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)371#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)372#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)373#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)374375#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)376#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)377378#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)379#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)380#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)381#define vreinterpretq_s64_m128i(x) (x)382383#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)384#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)385#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)386#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)387388#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)389#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)390#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)391#define vreinterpret_m64_s64(x) 
(x)392393#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)394#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)395#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)396#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)397398#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)399#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)400#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)401402#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)403#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)404#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)405#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)406407#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)408#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)409#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)410#define vreinterpret_s64_m64(x) (x)411412#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)413414#if defined(__aarch64__)415#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)416#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)417418#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)419420#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)421#define vreinterpretq_m128d_f64(x) (x)422423#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)424425#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)426#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)427428#define vreinterpretq_f64_m128d(x) (x)429#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)430#else431#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)432#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)433434#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)435#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)436437#define vreinterpretq_m128d_f32(x) (x)438439#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)440441#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)442#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)443444#define vreinterpretq_f32_m128d(x) (x)445#endif446447// A struct is defined in this header file called 'SIMDVec' which can be used448// by applications which attempt to access the contents of an __m128 struct449// directly. It is important to note that accessing the __m128 struct directly450// is bad coding practice by Microsoft: @see:451// https://docs.microsoft.com/en-us/cpp/cpp/m128452//453// However, some legacy source code may try to access the contents of an __m128454// struct directly so the developer can use the SIMDVec as an alias for it. Any455// casting must be done manually by the developer, as you cannot cast or456// otherwise alias the base NEON data type for intrinsic operations.457//458// union intended to allow direct access to an __m128 variable using the names459// that the MSVC compiler provides. This union should really only be used when460// trying to access the members of the vector as integer values. GCC/clang461// allow native access to the float members through a simple array access462// operator (in C since 4.6, in C++ since 4.8).463//464// Ideally direct accesses to SIMD vectors should not be used since it can cause465// a performance hit. If it really is needed however, the original __m128466// variable can be aliased with a pointer to this union and used to access467// individual components. 
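
/* Usage sketch (illustrative only): with the casting helpers above, a single
 * lane of an __m128i can be read without first storing the vector to memory:
 *   __m128i v = _mm_set1_epi32(42);
 *   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); // 42
 * Keeping such accesses behind these macros is preferable to aliasing the
 * underlying NEON types by hand.
 */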
The use of this union should be hidden behind a macro468// that is used throughout the codebase to access the members instead of always469// declaring this type of variable.470typedef union ALIGN_STRUCT(16) SIMDVec {471float m128_f32[4]; // as floats - DON'T USE. Added for convenience.472int8_t m128_i8[16]; // as signed 8-bit integers.473int16_t m128_i16[8]; // as signed 16-bit integers.474int32_t m128_i32[4]; // as signed 32-bit integers.475int64_t m128_i64[2]; // as signed 64-bit integers.476uint8_t m128_u8[16]; // as unsigned 8-bit integers.477uint16_t m128_u16[8]; // as unsigned 16-bit integers.478uint32_t m128_u32[4]; // as unsigned 32-bit integers.479uint64_t m128_u64[2]; // as unsigned 64-bit integers.480} SIMDVec;481482// casting using SIMDVec483#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])484#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])485#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])486487/* SSE macros */488#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode489#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode490#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode491#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode492493// Function declaration494// SSE495FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();496FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);497FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);498FORCE_INLINE __m128 _mm_set_ps1(float);499FORCE_INLINE __m128 _mm_setzero_ps(void);500// SSE2501FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);502FORCE_INLINE __m128i _mm_castps_si128(__m128);503FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);504FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);505FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);506FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);507FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);508FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);509FORCE_INLINE __m128d _mm_set_pd(double, double);510FORCE_INLINE __m128i _mm_set1_epi32(int);511FORCE_INLINE __m128i _mm_setzero_si128();512// SSE4.1513FORCE_INLINE __m128d _mm_ceil_pd(__m128d);514FORCE_INLINE __m128 _mm_ceil_ps(__m128);515FORCE_INLINE __m128d _mm_floor_pd(__m128d);516FORCE_INLINE __m128 _mm_floor_ps(__m128);517FORCE_INLINE __m128d _mm_round_pd(__m128d, int);518FORCE_INLINE __m128 _mm_round_ps(__m128, int);519// SSE4.2520FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);521522/* Backwards compatibility for compilers with lack of specific type support */523524// Older gcc does not define vld1q_u8_x4 type525#if defined(__GNUC__) && !defined(__clang__) && \526((__GNUC__ <= 12 && defined(__arm__)) || \527(__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \528(__GNUC__ <= 9 && defined(__aarch64__)))529FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)530{531uint8x16x4_t ret;532ret.val[0] = vld1q_u8(p + 0);533ret.val[1] = vld1q_u8(p + 16);534ret.val[2] = vld1q_u8(p + 32);535ret.val[3] = vld1q_u8(p + 48);536return ret;537}538#else539// Wraps vld1q_u8_x4540FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)541{542return vld1q_u8_x4(p);543}544#endif545546#if !defined(__aarch64__)547/* emulate vaddv u8 variant */548FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)549{550const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));551return vget_lane_u8(vreinterpret_u8_u64(v1), 0);552}553#else554// Wraps vaddv_u8555FORCE_INLINE uint8_t 
_sse2neon_vaddv_u8(uint8x8_t v8)556{557return vaddv_u8(v8);558}559#endif560561#if !defined(__aarch64__)562/* emulate vaddvq u8 variant */563FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)564{565uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));566uint8_t res = 0;567for (int i = 0; i < 8; ++i)568res += tmp[i];569return res;570}571#else572// Wraps vaddvq_u8573FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)574{575return vaddvq_u8(a);576}577#endif578579#if !defined(__aarch64__)580/* emulate vaddvq u16 variant */581FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)582{583uint32x4_t m = vpaddlq_u16(a);584uint64x2_t n = vpaddlq_u32(m);585uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);586587return vget_lane_u32((uint32x2_t) o, 0);588}589#else590// Wraps vaddvq_u16591FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)592{593return vaddvq_u16(a);594}595#endif596597/* Function Naming Conventions598* The naming convention of SSE intrinsics is straightforward. A generic SSE599* intrinsic function is given as follows:600* _mm_<name>_<data_type>601*602* The parts of this format are given as follows:603* 1. <name> describes the operation performed by the intrinsic604* 2. <data_type> identifies the data type of the function's primary arguments605*606* This last part, <data_type>, is a little complicated. It identifies the607* content of the input values, and can be set to any of the following values:608* + ps - vectors contain floats (ps stands for packed single-precision)609* + pd - vectors cantain doubles (pd stands for packed double-precision)610* + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit611* signed integers612* + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit613* unsigned integers614* + si128 - unspecified 128-bit vector or 256-bit vector615* + m128/m128i/m128d - identifies input vector types when they are different616* than the type of the returned vector617*618* For example, _mm_setzero_ps. The _mm implies that the function returns619* a 128-bit vector. The _ps at the end implies that the argument vectors620* contain floats.621*622* A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)623* // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits624* __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);625* // Set packed 8-bit integers626* // 128 bits, 16 chars, per 8 bits627* __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,628* 4, 5, 12, 13, 6, 7, 14, 15);629* // Shuffle packed 8-bit integers630* __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb631*632* Data (Number, Binary, Byte Index):633+------+------+-------------+------+------+-------------+634| 1 | 2 | 3 | 4 | Number635+------+------+------+------+------+------+------+------+636| 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary637+------+------+------+------+------+------+------+------+638| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index639+------+------+------+------+------+------+------+------+640641+------+------+------+------+------+------+------+------+642| 5 | 6 | 7 | 8 | Number643+------+------+------+------+------+------+------+------+644| 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary645+------+------+------+------+------+------+------+------+646| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index647+------+------+------+------+------+------+------+------+648* Index (Byte Index):649+------+------+------+------+------+------+------+------+650| 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |651+------+------+------+------+------+------+------+------+652653+------+------+------+------+------+------+------+------+654| 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |655+------+------+------+------+------+------+------+------+656* Result:657+------+------+------+------+------+------+------+------+658| 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index659+------+------+------+------+------+------+------+------+660| 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary661+------+------+------+------+------+------+------+------+662| 256 | 2 | 5 | 6 | Number663+------+------+------+------+------+------+------+------+664665+------+------+------+------+------+------+------+------+666| 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index667+------+------+------+------+------+------+------+------+668| 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary669+------+------+------+------+------+------+------+------+670| 3 | 7 | 4 | 8 | Number671+------+------+------+------+------+------+-------------+672*/673674/* Constants for use with _mm_prefetch. 
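
/* Applying the convention to functions defined further below: in _mm_add_ps,
 * "add" names the operation and "ps" marks packed single-precision floats,
 * while in _mm_avg_pu16 the "pu16" suffix marks packed unsigned 16-bit
 * integers held in an __m64.
 */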
/* Constants for use with _mm_prefetch. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};

// The bit field mapping to the FPCR (floating-point control register)
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t bit24 : 1;
    uint8_t res2 : 7;
#if defined(__aarch64__)
    uint32_t res3;
#endif
} fpcr_bitfield;

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// takes the lower two 32-bit values from a and swaps them and places in high
// end of result takes the higher two 32 bit values from b and swaps them and
// places in low end of result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}

// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
// high
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}

// Kahan summation for accurate summation of floating-point numbers.
// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
{
    y -= *c;
    float t = *sum + y;
    *c = (t - *sum) - y;
    *sum = t;
}
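
/* Usage sketch (illustrative only): the helper above keeps small contributions
 * that plain float addition would discard, e.g.
 *   float sum = 1.0f, comp = 0.0f;
 *   for (int i = 0; i < 1000; i++)
 *       _sse2neon_kadd_f32(&sum, &comp, 1e-8f);
 * leaves sum (plus the compensation in comp) close to 1.00001f, whereas a
 * plain "sum += 1e-8f" loop would leave sum at exactly 1.0f.
 */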
#if defined(__ARM_FEATURE_CRYPTO) && \
    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// takes the lower two 32-bit values from a and swaps them and places in low
// end of result takes the higher two 32 bit values from a and swaps them and
// places in high end of result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}

// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}

// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif

// NEON does not support a general purpose permute intrinsic.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })

// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })

// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })

/* MMX */

// _mm_empty is a no-op on arm
FORCE_INLINE void _mm_empty(void) {}

/* SSE */

// Adds the four single-precision, floating-point values of a and b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}

// Computes the bitwise AND of the four single-precision, floating-point values
// of a and b.
//
//   r0 := a0 & b0
//   r1 := a1 & b1
//   r2 := a2 & b2
//   r3 := a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise AND-NOT of the four single-precision, floating-point
// values of a and b.
//
//   r0 := ~a0 & b0
//   r1 := ~a1 & b1
//   r2 := ~a2 & b2
//   r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 3
//     i := j*16
//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//   ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(
        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 7
//     i := j*8
//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//   ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compares for equality.
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for equality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
}

// Compares for greater than or equal.
// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
}

// Compares for greater than.
//
//   r0 := (a0 > b0) ? 0xffffffff : 0x0
//   r1 := (a1 > b1) ? 0xffffffff : 0x0
//   r2 := (a2 > b2) ? 0xffffffff : 0x0
//   r3 := (a3 > b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
}

// Compares for less than or equal.
//
//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmple_ps(a, b));
}

// Compares for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
}

// Compares for inequality.
// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for inequality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
}

// Compares for not greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
}

// Compares for not greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
}

// Compares for not less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
}

// Compares for not less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compares for not less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
}

// Compares the four 32-bit floats in a and b to check if any values are NaN.
// Ordered compare between each value returns true for "orderable" and false
// for "not orderable" (NaN).
// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
// also:
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    // Note: NEON does not have an ordered compare builtin.
    // Need to compare a eq a and b eq b to check for NaN,
    // then AND the results to get the final mask.
    uint32x4_t ceqaa =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t ceqbb =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
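
/* Worked example (illustrative values): with a = {1.0f, NAN, 2.0f, NAN} and
 * b = {1.0f, 1.0f, NAN, NAN}, the two self-comparisons above are all-ones only
 * in lanes whose input is not NaN, so _mm_cmpord_ps(a, b) yields
 * {0xFFFFFFFF, 0, 0, 0}: only lane 0 has two ordered (non-NaN) operands.
 */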
:1470// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx1471FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)1472{1473uint32x4_t a_gt_b =1474vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));1475return vgetq_lane_u32(a_gt_b, 0) & 0x1;1476}14771478// Compares the lower single-precision floating point scalar values of a and b1479// using a less than or equal operation. :1480// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx1481FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)1482{1483uint32x4_t a_le_b =1484vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));1485return vgetq_lane_u32(a_le_b, 0) & 0x1;1486}14871488// Compares the lower single-precision floating point scalar values of a and b1489// using a less than operation. :1490// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important1491// note!! The documentation on MSDN is incorrect! If either of the values is a1492// NAN the docs say you will get a one, but in fact, it will return a zero!!1493FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)1494{1495uint32x4_t a_lt_b =1496vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));1497return vgetq_lane_u32(a_lt_b, 0) & 0x1;1498}14991500// Compares the lower single-precision floating point scalar values of a and b1501// using an inequality operation. :1502// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx1503FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)1504{1505return !_mm_comieq_ss(a, b);1506}15071508// Convert packed signed 32-bit integers in b to packed single-precision1509// (32-bit) floating-point elements, store the results in the lower 2 elements1510// of dst, and copy the upper 2 packed elements from a to the upper elements of1511// dst.1512//1513// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1514// dst[63:32] := Convert_Int32_To_FP32(b[63:32])1515// dst[95:64] := a[95:64]1516// dst[127:96] := a[127:96]1517//1518// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps1519FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)1520{1521return vreinterpretq_m128_f32(1522vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),1523vget_high_f32(vreinterpretq_f32_m128(a))));1524}15251526// Convert packed single-precision (32-bit) floating-point elements in a to1527// packed 32-bit integers, and store the results in dst.1528//1529// FOR j := 0 to 11530// i := 32*j1531// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])1532// ENDFOR1533//1534// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi1535FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)1536{1537#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)1538return vreinterpret_m64_s32(1539vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));1540#else1541return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(1542vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));1543#endif1544}15451546// Convert the signed 32-bit integer b to a single-precision (32-bit)1547// floating-point element, store the result in the lower element of dst, and1548// copy the upper 3 packed elements from a to the upper elements of dst.1549//1550// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1551// dst[127:32] := a[127:32]1552//1553// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss1554FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)1555{1556return vreinterpretq_m128_f32(1557vsetq_lane_f32((float) b, 
vreinterpretq_f32_m128(a), 0));1558}15591560// Convert the lower single-precision (32-bit) floating-point element in a to a1561// 32-bit integer, and store the result in dst.1562// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si1563FORCE_INLINE int _mm_cvt_ss2si(__m128 a)1564{1565#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)1566return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),15670);1568#else1569float32_t data = vgetq_lane_f32(1570vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);1571return (int32_t) data;1572#endif1573}15741575// Convert packed 16-bit integers in a to packed single-precision (32-bit)1576// floating-point elements, and store the results in dst.1577//1578// FOR j := 0 to 31579// i := j*161580// m := j*321581// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])1582// ENDFOR1583//1584// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps1585FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)1586{1587return vreinterpretq_m128_f32(1588vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));1589}15901591// Convert packed 32-bit integers in b to packed single-precision (32-bit)1592// floating-point elements, store the results in the lower 2 elements of dst,1593// and copy the upper 2 packed elements from a to the upper elements of dst.1594//1595// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1596// dst[63:32] := Convert_Int32_To_FP32(b[63:32])1597// dst[95:64] := a[95:64]1598// dst[127:96] := a[127:96]1599//1600// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps1601FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)1602{1603return vreinterpretq_m128_f32(1604vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),1605vget_high_f32(vreinterpretq_f32_m128(a))));1606}16071608// Convert packed signed 32-bit integers in a to packed single-precision1609// (32-bit) floating-point elements, store the results in the lower 2 elements1610// of dst, then convert the packed signed 32-bit integers in b to1611// single-precision (32-bit) floating-point element, and store the results in1612// the upper 2 elements of dst.1613//1614// dst[31:0] := Convert_Int32_To_FP32(a[31:0])1615// dst[63:32] := Convert_Int32_To_FP32(a[63:32])1616// dst[95:64] := Convert_Int32_To_FP32(b[31:0])1617// dst[127:96] := Convert_Int32_To_FP32(b[63:32])1618//1619// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps1620FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)1621{1622return vreinterpretq_m128_f32(vcvtq_f32_s32(1623vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));1624}16251626// Convert the lower packed 8-bit integers in a to packed single-precision1627// (32-bit) floating-point elements, and store the results in dst.1628//1629// FOR j := 0 to 31630// i := j*81631// m := j*321632// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])1633// ENDFOR1634//1635// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps1636FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)1637{1638return vreinterpretq_m128_f32(vcvtq_f32_s32(1639vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));1640}16411642// Convert packed single-precision (32-bit) floating-point elements in a to1643// packed 16-bit integers, and store the results in dst. 
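// Usage note for the scalar conversions above: _mm_cvt_ss2si honours the
// current rounding mode, so with the default round-to-nearest-even setting
// 2.5f converts to 2 and 3.5f converts to 4. A minimal sketch (illustrative
// addition, not part of the original sse2neon API; the helper name is
// hypothetical):
static inline int _sse2neon_example_float_to_int(float x)
{
    // Broadcast x with plain NEON, then convert the low lane.
    __m128 v = vreinterpretq_m128_f32(vdupq_n_f32(x));
    return _mm_cvt_ss2si(v);
}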
Note: this intrinsic1644// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and1645// 0x7FFFFFFF.1646//1647// FOR j := 0 to 31648// i := 16*j1649// k := 32*j1650// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)1651// dst[i+15:i] := 0x7FFF1652// ELSE1653// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])1654// FI1655// ENDFOR1656//1657// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi161658FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)1659{1660return vreinterpret_m64_s16(1661vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));1662}16631664// Convert packed single-precision (32-bit) floating-point elements in a to1665// packed 32-bit integers, and store the results in dst.1666//1667// FOR j := 0 to 11668// i := 32*j1669// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])1670// ENDFOR1671//1672// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi321673#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)16741675// Convert packed single-precision (32-bit) floating-point elements in a to1676// packed 8-bit integers, and store the results in lower 4 elements of dst.1677// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values1678// between 0x7F and 0x7FFFFFFF.1679//1680// FOR j := 0 to 31681// i := 8*j1682// k := 32*j1683// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)1684// dst[i+7:i] := 0x7F1685// ELSE1686// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])1687// FI1688// ENDFOR1689//1690// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi81691FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)1692{1693return vreinterpret_m64_s8(vqmovn_s16(1694vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));1695}16961697// Convert packed unsigned 16-bit integers in a to packed single-precision1698// (32-bit) floating-point elements, and store the results in dst.1699//1700// FOR j := 0 to 31701// i := j*161702// m := j*321703// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])1704// ENDFOR1705//1706// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps1707FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)1708{1709return vreinterpretq_m128_f32(1710vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));1711}17121713// Convert the lower packed unsigned 8-bit integers in a to packed1714// single-precision (32-bit) floating-point elements, and store the results in1715// dst.1716//1717// FOR j := 0 to 31718// i := j*81719// m := j*321720// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])1721// ENDFOR1722//1723// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps1724FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)1725{1726return vreinterpretq_m128_f32(vcvtq_f32_u32(1727vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));1728}17291730// Convert the signed 32-bit integer b to a single-precision (32-bit)1731// floating-point element, store the result in the lower element of dst, and1732// copy the upper 3 packed elements from a to the upper elements of dst.1733//1734// dst[31:0] := Convert_Int32_To_FP32(b[31:0])1735// dst[127:32] := a[127:32]1736//1737// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss1738#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)17391740// Convert the signed 64-bit integer b to a single-precision (32-bit)1741// floating-point element, store the result in the lower element of dst, and1742// 
copy the upper 3 packed elements from a to the upper elements of dst.1743//1744// dst[31:0] := Convert_Int64_To_FP32(b[63:0])1745// dst[127:32] := a[127:32]1746//1747// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss1748FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)1749{1750return vreinterpretq_m128_f32(1751vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));1752}17531754// Copy the lower single-precision (32-bit) floating-point element of a to dst.1755//1756// dst[31:0] := a[31:0]1757//1758// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f321759FORCE_INLINE float _mm_cvtss_f32(__m128 a)1760{1761return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);1762}17631764// Convert the lower single-precision (32-bit) floating-point element in a to a1765// 32-bit integer, and store the result in dst.1766//1767// dst[31:0] := Convert_FP32_To_Int32(a[31:0])1768//1769// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si321770#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)17711772// Convert the lower single-precision (32-bit) floating-point element in a to a1773// 64-bit integer, and store the result in dst.1774//1775// dst[63:0] := Convert_FP32_To_Int64(a[31:0])1776//1777// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si641778FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)1779{1780#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)1781return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);1782#else1783float32_t data = vgetq_lane_f32(1784vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);1785return (int64_t) data;1786#endif1787}17881789// Convert packed single-precision (32-bit) floating-point elements in a to1790// packed 32-bit integers with truncation, and store the results in dst.1791//1792// FOR j := 0 to 11793// i := 32*j1794// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])1795// ENDFOR1796//1797// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi1798FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)1799{1800return vreinterpret_m64_s32(1801vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));1802}18031804// Convert the lower single-precision (32-bit) floating-point element in a to a1805// 32-bit integer with truncation, and store the result in dst.1806//1807// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])1808//1809// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si1810FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)1811{1812return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);1813}18141815// Convert packed single-precision (32-bit) floating-point elements in a to1816// packed 32-bit integers with truncation, and store the results in dst.1817//1818// FOR j := 0 to 11819// i := 32*j1820// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])1821// ENDFOR1822//1823// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi321824#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)18251826// Convert the lower single-precision (32-bit) floating-point element in a to a1827// 32-bit integer with truncation, and store the result in dst.1828//1829// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])1830//1831// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si321832#define _mm_cvttss_si32(a) 
_mm_cvtt_ss2si(a)18331834// Convert the lower single-precision (32-bit) floating-point element in a to a1835// 64-bit integer with truncation, and store the result in dst.1836//1837// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])1838//1839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si641840FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)1841{1842return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);1843}18441845// Divides the four single-precision, floating-point values of a and b.1846//1847// r0 := a0 / b01848// r1 := a1 / b11849// r2 := a2 / b21850// r3 := a3 / b31851//1852// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx1853FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)1854{1855#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV1856return vreinterpretq_m128_f32(1857vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));1858#else1859float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));1860recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));1861#if SSE2NEON_PRECISE_DIV1862// Additional Netwon-Raphson iteration for accuracy1863recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));1864#endif1865return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));1866#endif1867}18681869// Divides the scalar single-precision floating point value of a by b.1870// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx1871FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)1872{1873float32_t value =1874vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);1875return vreinterpretq_m128_f32(1876vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));1877}18781879// Extract a 16-bit integer from a, selected with imm8, and store the result in1880// the lower element of dst.1881// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi161882#define _mm_extract_pi16(a, imm) \1883(int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))18841885// Free aligned memory that was allocated with _mm_malloc.1886// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free1887#if !defined(SSE2NEON_ALLOC_DEFINED)1888FORCE_INLINE void _mm_free(void *addr)1889{1890#if defined(_WIN32)1891_aligned_free(addr);1892#else1893free(addr);1894#endif1895}1896#endif18971898// Macro: Get the flush zero bits from the MXCSR control and status register.1899// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or1900// _MM_FLUSH_ZERO_OFF1901// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE1902FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()1903{1904union {1905fpcr_bitfield field;1906#if defined(__aarch64__)1907uint64_t value;1908#else1909uint32_t value;1910#endif1911} r;19121913#if defined(__aarch64__)1914__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */1915#else1916__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */1917#endif19181919return r.field.bit24 ? 
_MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;1920}19211922// Macro: Get the rounding mode bits from the MXCSR control and status register.1923// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,1924// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO1925// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE1926FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()1927{1928union {1929fpcr_bitfield field;1930#if defined(__aarch64__)1931uint64_t value;1932#else1933uint32_t value;1934#endif1935} r;19361937#if defined(__aarch64__)1938__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */1939#else1940__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */1941#endif19421943if (r.field.bit22) {1944return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;1945} else {1946return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;1947}1948}19491950// Copy a to dst, and insert the 16-bit integer i into dst at the location1951// specified by imm8.1952// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi161953#define _mm_insert_pi16(a, b, imm) \1954__extension__({ \1955vreinterpret_m64_s16( \1956vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \1957})19581959// Loads four single-precision, floating-point values.1960// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx1961FORCE_INLINE __m128 _mm_load_ps(const float *p)1962{1963return vreinterpretq_m128_f32(vld1q_f32(p));1964}19651966// Load a single-precision (32-bit) floating-point element from memory into all1967// elements of dst.1968//1969// dst[31:0] := MEM[mem_addr+31:mem_addr]1970// dst[63:32] := MEM[mem_addr+31:mem_addr]1971// dst[95:64] := MEM[mem_addr+31:mem_addr]1972// dst[127:96] := MEM[mem_addr+31:mem_addr]1973//1974// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps11975#define _mm_load_ps1 _mm_load1_ps19761977// Loads an single - precision, floating - point value into the low word and1978// clears the upper three words.1979// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx1980FORCE_INLINE __m128 _mm_load_ss(const float *p)1981{1982return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));1983}19841985// Loads a single single-precision, floating-point value, copying it into all1986// four words1987// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx1988FORCE_INLINE __m128 _mm_load1_ps(const float *p)1989{1990return vreinterpretq_m128_f32(vld1q_dup_f32(p));1991}19921993// Sets the upper two single-precision, floating-point values with 641994// bits of data loaded from the address p; the lower two values are passed1995// through from a.1996//1997// r0 := a01998// r1 := a11999// r2 := *p02000// r3 := *p12001//2002// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx2003FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)2004{2005return vreinterpretq_m128_f32(2006vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));2007}20082009// Sets the lower two single-precision, floating-point values with 642010// bits of data loaded from the address p; the upper two values are passed2011// through from a.2012//2013// Return Value2014// r0 := *p02015// r1 := *p12016// r2 := a22017// r3 := a32018//2019// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx2020FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)2021{2022return 
vreinterpretq_m128_f32(2023vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));2024}20252026// Load 4 single-precision (32-bit) floating-point elements from memory into dst2027// in reverse order. mem_addr must be aligned on a 16-byte boundary or a2028// general-protection exception may be generated.2029//2030// dst[31:0] := MEM[mem_addr+127:mem_addr+96]2031// dst[63:32] := MEM[mem_addr+95:mem_addr+64]2032// dst[95:64] := MEM[mem_addr+63:mem_addr+32]2033// dst[127:96] := MEM[mem_addr+31:mem_addr]2034//2035// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps2036FORCE_INLINE __m128 _mm_loadr_ps(const float *p)2037{2038float32x4_t v = vrev64q_f32(vld1q_f32(p));2039return vreinterpretq_m128_f32(vextq_f32(v, v, 2));2040}20412042// Loads four single-precision, floating-point values.2043// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx2044FORCE_INLINE __m128 _mm_loadu_ps(const float *p)2045{2046// for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are2047// equivalent for neon2048return vreinterpretq_m128_f32(vld1q_f32(p));2049}20502051// Load unaligned 16-bit integer from memory into the first element of dst.2052//2053// dst[15:0] := MEM[mem_addr+15:mem_addr]2054// dst[MAX:16] := 02055//2056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si162057FORCE_INLINE __m128i _mm_loadu_si16(const void *p)2058{2059return vreinterpretq_m128i_s16(2060vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));2061}20622063// Load unaligned 64-bit integer from memory into the first element of dst.2064//2065// dst[63:0] := MEM[mem_addr+63:mem_addr]2066// dst[MAX:64] := 02067//2068// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si642069FORCE_INLINE __m128i _mm_loadu_si64(const void *p)2070{2071return vreinterpretq_m128i_s64(2072vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));2073}20742075// Allocate aligned blocks of memory.2076// https://software.intel.com/en-us/2077// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks2078#if !defined(SSE2NEON_ALLOC_DEFINED)2079FORCE_INLINE void *_mm_malloc(size_t size, size_t align)2080{2081void *ptr;2082if (align == 1)2083return malloc(size);2084if (align == 2 || (sizeof(void *) == 8 && align == 4))2085align = sizeof(void *);2086#if defined(_WIN32)2087ptr = _aligned_malloc(size, align);2088if (ptr)2089return ptr;2090#else2091if (!posix_memalign(&ptr, align, size))2092return ptr;2093#endif2094return NULL;2095}2096#endif20972098// Conditionally store 8-bit integer elements from a into memory using mask2099// (elements are not stored when the highest bit is not set in the corresponding2100// element) and a non-temporal memory hint.2101// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si642102FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)2103{2104int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);2105__m128 b = _mm_load_ps((const float *) mem_addr);2106int8x8_t masked =2107vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),2108vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));2109vst1_s8((int8_t *) mem_addr, masked);2110}21112112// Conditionally store 8-bit integer elements from a into memory using mask2113// (elements are not stored when the highest bit is not set in the corresponding2114// element) and a non-temporal memory hint.2115// 
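// Typical pairing of _mm_malloc/_mm_free defined above for 16-byte-aligned
// vector data, as an illustrative sketch (not part of the original sse2neon
// API; the helper name is hypothetical):
static inline float *_sse2neon_example_alloc_vec4(void)
{
    float *p = (float *) _mm_malloc(4 * sizeof(float), 16); // 16-byte aligned
    if (p)
        p[0] = p[1] = p[2] = p[3] = 0.0f;
    return p; // release with _mm_free(p) when done
}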
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq2116#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)21172118// Compare packed signed 16-bit integers in a and b, and store packed maximum2119// values in dst.2120//2121// FOR j := 0 to 32122// i := j*162123// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])2124// ENDFOR2125//2126// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi162127FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)2128{2129return vreinterpret_m64_s16(2130vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));2131}21322133// Computes the maximums of the four single-precision, floating-point values of2134// a and b.2135// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx2136FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)2137{2138#if SSE2NEON_PRECISE_MINMAX2139float32x4_t _a = vreinterpretq_f32_m128(a);2140float32x4_t _b = vreinterpretq_f32_m128(b);2141return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));2142#else2143return vreinterpretq_m128_f32(2144vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2145#endif2146}21472148// Compare packed unsigned 8-bit integers in a and b, and store packed maximum2149// values in dst.2150//2151// FOR j := 0 to 72152// i := j*82153// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])2154// ENDFOR2155//2156// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu82157FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)2158{2159return vreinterpret_m64_u8(2160vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));2161}21622163// Computes the maximum of the two lower scalar single-precision floating point2164// values of a and b.2165// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx2166FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)2167{2168float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);2169return vreinterpretq_m128_f32(2170vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));2171}21722173// Compare packed signed 16-bit integers in a and b, and store packed minimum2174// values in dst.2175//2176// FOR j := 0 to 32177// i := j*162178// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])2179// ENDFOR2180//2181// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi162182FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)2183{2184return vreinterpret_m64_s16(2185vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));2186}21872188// Computes the minima of the four single-precision, floating-point values of a2189// and b.2190// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx2191FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)2192{2193#if SSE2NEON_PRECISE_MINMAX2194float32x4_t _a = vreinterpretq_f32_m128(a);2195float32x4_t _b = vreinterpretq_f32_m128(b);2196return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));2197#else2198return vreinterpretq_m128_f32(2199vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2200#endif2201}22022203// Compare packed unsigned 8-bit integers in a and b, and store packed minimum2204// values in dst.2205//2206// FOR j := 0 to 72207// i := j*82208// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])2209// ENDFOR2210//2211// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu82212FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)2213{2214return vreinterpret_m64_u8(2215vmin_u8(vreinterpret_u8_m64(a), 
vreinterpret_u8_m64(b)));2216}22172218// Computes the minimum of the two lower scalar single-precision floating point2219// values of a and b.2220// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx2221FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)2222{2223float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);2224return vreinterpretq_m128_f32(2225vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));2226}22272228// Sets the low word to the single-precision, floating-point value of b2229// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)2230FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)2231{2232return vreinterpretq_m128_f32(2233vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),2234vreinterpretq_f32_m128(a), 0));2235}22362237// Moves the upper two values of B into the lower two values of A.2238//2239// r3 := a32240// r2 := a22241// r1 := b32242// r0 := b22243FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)2244{2245float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));2246float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));2247return vreinterpretq_m128_f32(vcombine_f32(b32, a32));2248}22492250// Moves the lower two values of B into the upper two values of A.2251//2252// r3 := b12253// r2 := b02254// r1 := a12255// r0 := a02256FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)2257{2258float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));2259float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));2260return vreinterpretq_m128_f32(vcombine_f32(a10, b10));2261}22622263// Create mask from the most significant bit of each 8-bit element in a, and2264// store the result in dst.2265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi82266FORCE_INLINE int _mm_movemask_pi8(__m64 a)2267{2268uint8x8_t input = vreinterpret_u8_m64(a);2269#if defined(__aarch64__)2270static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};2271uint8x8_t tmp = vshr_n_u8(input, 7);2272return vaddv_u8(vshl_u8(tmp, shift));2273#else2274// Refer the implementation of `_mm_movemask_epi8`2275uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));2276uint32x2_t paired16 =2277vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));2278uint8x8_t paired32 =2279vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));2280return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);2281#endif2282}22832284// NEON does not provide this method2285// Creates a 4-bit mask from the most significant bits of the four2286// single-precision, floating-point values.2287// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx2288FORCE_INLINE int _mm_movemask_ps(__m128 a)2289{2290uint32x4_t input = vreinterpretq_u32_m128(a);2291#if defined(__aarch64__)2292static const int32x4_t shift = {0, 1, 2, 3};2293uint32x4_t tmp = vshrq_n_u32(input, 31);2294return vaddvq_u32(vshlq_u32(tmp, shift));2295#else2296// Uses the exact same method as _mm_movemask_epi8, see that for details.2297// Shift out everything but the sign bits with a 32-bit unsigned shift2298// right.2299uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));2300// Merge the two pairs together with a 64-bit unsigned shift right + add.2301uint8x16_t paired =2302vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));2303// Extract the result.2304return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);2305#endif2306}23072308// Multiplies the four 
single-precision, floating-point values of a and b.2309//2310// r0 := a0 * b02311// r1 := a1 * b12312// r2 := a2 * b22313// r3 := a3 * b32314//2315// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx2316FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)2317{2318return vreinterpretq_m128_f32(2319vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2320}23212322// Multiply the lower single-precision (32-bit) floating-point element in a and2323// b, store the result in the lower element of dst, and copy the upper 3 packed2324// elements from a to the upper elements of dst.2325//2326// dst[31:0] := a[31:0] * b[31:0]2327// dst[127:32] := a[127:32]2328//2329// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss2330FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)2331{2332return _mm_move_ss(a, _mm_mul_ps(a, b));2333}23342335// Multiply the packed unsigned 16-bit integers in a and b, producing2336// intermediate 32-bit integers, and store the high 16 bits of the intermediate2337// integers in dst.2338// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu162339FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)2340{2341return vreinterpret_m64_u16(vshrn_n_u32(2342vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));2343}23442345// Computes the bitwise OR of the four single-precision, floating-point values2346// of a and b.2347// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx2348FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)2349{2350return vreinterpretq_m128_s32(2351vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));2352}23532354// Average packed unsigned 8-bit integers in a and b, and store the results in2355// dst.2356//2357// FOR j := 0 to 72358// i := j*82359// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 12360// ENDFOR2361//2362// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb2363#define _m_pavgb(a, b) _mm_avg_pu8(a, b)23642365// Average packed unsigned 16-bit integers in a and b, and store the results in2366// dst.2367//2368// FOR j := 0 to 32369// i := j*162370// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 12371// ENDFOR2372//2373// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw2374#define _m_pavgw(a, b) _mm_avg_pu16(a, b)23752376// Extract a 16-bit integer from a, selected with imm8, and store the result in2377// the lower element of dst.2378// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw2379#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)23802381// Copy a to dst, and insert the 16-bit integer i into dst at the location2382// specified by imm8.2383// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw2384#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)23852386// Compare packed signed 16-bit integers in a and b, and store packed maximum2387// values in dst.2388// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw2389#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)23902391// Compare packed unsigned 8-bit integers in a and b, and store packed maximum2392// values in dst.2393// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub2394#define _m_pmaxub(a, b) _mm_max_pu8(a, b)23952396// Compare packed signed 16-bit integers in a and b, and store packed minimum2397// values in dst.2398// 
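// The 4-bit result of _mm_movemask_ps above gathers the IEEE-754 sign bits,
// lane 0 into bit 0 through lane 3 into bit 3. A scalar equivalent as an
// illustrative sketch (not part of the original sse2neon API; the helper name
// is hypothetical):
static inline int _sse2neon_example_movemask_ps_scalar(const float f[4])
{
    int mask = 0;
    for (int i = 0; i < 4; i++) {
        union { float f32; uint32_t u32; } v;
        v.f32 = f[i];
        mask |= (int) (v.u32 >> 31) << i; // sign bit of lane i -> bit i
    }
    return mask;
}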
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Fetch the line of data from memory that contains address p to a location in
// the cache hierarchy specified by the locality hint i.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
FORCE_INLINE void _mm_prefetch(char const *p, int i)
{
    switch (i) {
    case _MM_HINT_NTA:
        __builtin_prefetch(p, 0, 0);
        break;
    case _MM_HINT_T0:
        __builtin_prefetch(p, 0, 3);
        break;
    case _MM_HINT_T1:
        __builtin_prefetch(p, 0, 2);
        break;
    case _MM_HINT_T2:
        __builtin_prefetch(p, 0, 1);
        break;
    }
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)

// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#endif
    return vreinterpretq_m128_f32(recip);
}

// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
//
// dst[31:0] := (1.0 / a[31:0])
// dst[127:32] := a[127:32]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
    return _mm_move_ss(a, _mm_rcp_ps(a));
}

// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// The current precision is 1% error.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
#if SSE2NEON_PRECISE_SQRT
    // Additional Newton-Raphson iteration for accuracy
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif
    return vreinterpretq_m128_f32(out);
}

// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
    uint64x1_t t = vpaddl_u32(vpaddl_u16(
        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
    return vreinterpret_m64_u16(
        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}

// Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. 
The flush zero may contain any of the2521// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF2522// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE2523FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)2524{2525// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,2526// regardless of the value of the FZ bit.2527union {2528fpcr_bitfield field;2529#if defined(__aarch64__)2530uint64_t value;2531#else2532uint32_t value;2533#endif2534} r;25352536#if defined(__aarch64__)2537__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */2538#else2539__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */2540#endif25412542r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;25432544#if defined(__aarch64__)2545__asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */2546#else2547__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */2548#endif2549}25502551// Sets the four single-precision, floating-point values to the four inputs.2552// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx2553FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)2554{2555float ALIGN_STRUCT(16) data[4] = {x, y, z, w};2556return vreinterpretq_m128_f32(vld1q_f32(data));2557}25582559// Sets the four single-precision, floating-point values to w.2560// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx2561FORCE_INLINE __m128 _mm_set_ps1(float _w)2562{2563return vreinterpretq_m128_f32(vdupq_n_f32(_w));2564}25652566// Macro: Set the rounding mode bits of the MXCSR control and status register to2567// the value in unsigned 32-bit integer a. The rounding mode may contain any of2568// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,2569// _MM_ROUND_TOWARD_ZERO2570// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE2571FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)2572{2573union {2574fpcr_bitfield field;2575#if defined(__aarch64__)2576uint64_t value;2577#else2578uint32_t value;2579#endif2580} r;25812582#if defined(__aarch64__)2583__asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */2584#else2585__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */2586#endif25872588switch (rounding) {2589case _MM_ROUND_TOWARD_ZERO:2590r.field.bit22 = 1;2591r.field.bit23 = 1;2592break;2593case _MM_ROUND_DOWN:2594r.field.bit22 = 0;2595r.field.bit23 = 1;2596break;2597case _MM_ROUND_UP:2598r.field.bit22 = 1;2599r.field.bit23 = 0;2600break;2601default: //_MM_ROUND_NEAREST2602r.field.bit22 = 0;2603r.field.bit23 = 0;2604}26052606#if defined(__aarch64__)2607__asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */2608#else2609__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */2610#endif2611}26122613// Copy single-precision (32-bit) floating-point element a to the lower element2614// of dst, and zero the upper 3 elements.2615// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss2616FORCE_INLINE __m128 _mm_set_ss(float a)2617{2618return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));2619}26202621// Sets the four single-precision, floating-point values to w.2622//2623// r0 := r1 := r2 := r3 := w2624//2625// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx2626FORCE_INLINE __m128 _mm_set1_ps(float _w)2627{2628return vreinterpretq_m128_f32(vdupq_n_f32(_w));2629}26302631// FIXME: _mm_setcsr() 
implementation supports changing the rounding mode only.2632FORCE_INLINE void _mm_setcsr(unsigned int a)2633{2634_MM_SET_ROUNDING_MODE(a);2635}26362637// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.2638FORCE_INLINE unsigned int _mm_getcsr()2639{2640return _MM_GET_ROUNDING_MODE();2641}26422643// Sets the four single-precision, floating-point values to the four inputs in2644// reverse order.2645// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx2646FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)2647{2648float ALIGN_STRUCT(16) data[4] = {w, z, y, x};2649return vreinterpretq_m128_f32(vld1q_f32(data));2650}26512652// Clears the four single-precision, floating-point values.2653// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx2654FORCE_INLINE __m128 _mm_setzero_ps(void)2655{2656return vreinterpretq_m128_f32(vdupq_n_f32(0));2657}26582659// Shuffle 16-bit integers in a using the control in imm8, and store the results2660// in dst.2661// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi162662#ifdef _sse2neon_shuffle2663#define _mm_shuffle_pi16(a, imm) \2664__extension__({ \2665vreinterpret_m64_s16(vshuffle_s16( \2666vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \2667((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \2668})2669#else2670#define _mm_shuffle_pi16(a, imm) \2671__extension__({ \2672int16x4_t ret; \2673ret = \2674vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \2675ret = vset_lane_s16( \2676vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \26771); \2678ret = vset_lane_s16( \2679vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \26802); \2681ret = vset_lane_s16( \2682vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \26833); \2684vreinterpret_m64_s16(ret); \2685})2686#endif26872688// Perform a serializing operation on all store-to-memory instructions that were2689// issued prior to this instruction. Guarantees that every store instruction2690// that precedes, in program order, is globally visible before any store2691// instruction which follows the fence in program order.2692// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence2693FORCE_INLINE void _mm_sfence(void)2694{2695_sse2neon_smp_mb();2696}26972698// Perform a serializing operation on all load-from-memory and store-to-memory2699// instructions that were issued prior to this instruction. Guarantees that2700// every memory access that precedes, in program order, the memory fence2701// instruction is globally visible before any memory instruction which follows2702// the fence in program order.2703// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence2704FORCE_INLINE void _mm_mfence(void)2705{2706_sse2neon_smp_mb();2707}27082709// Perform a serializing operation on all load-from-memory instructions that2710// were issued prior to this instruction. 
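// Typical pattern for the emulated MXCSR helpers above: temporarily override
// the rounding mode, use it, then restore the caller's mode. An illustrative
// sketch (not part of the original sse2neon API; the helper name is
// hypothetical):
static inline int _sse2neon_example_cvt_toward_zero(__m128 v)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    int r = _mm_cvt_ss2si(v); // now truncates, like _mm_cvtt_ss2si
    _MM_SET_ROUNDING_MODE((int) saved); // restore the caller's mode
    return r;
}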
Guarantees that every load instruction2711// that precedes, in program order, is globally visible before any load2712// instruction which follows the fence in program order.2713// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence2714FORCE_INLINE void _mm_lfence(void)2715{2716_sse2neon_smp_mb();2717}27182719// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)2720// int imm)2721#ifdef _sse2neon_shuffle2722#define _mm_shuffle_ps(a, b, imm) \2723__extension__({ \2724float32x4_t _input1 = vreinterpretq_f32_m128(a); \2725float32x4_t _input2 = vreinterpretq_f32_m128(b); \2726float32x4_t _shuf = \2727vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \2728(((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \2729vreinterpretq_m128_f32(_shuf); \2730})2731#else // generic2732#define _mm_shuffle_ps(a, b, imm) \2733__extension__({ \2734__m128 ret; \2735switch (imm) { \2736case _MM_SHUFFLE(1, 0, 3, 2): \2737ret = _mm_shuffle_ps_1032((a), (b)); \2738break; \2739case _MM_SHUFFLE(2, 3, 0, 1): \2740ret = _mm_shuffle_ps_2301((a), (b)); \2741break; \2742case _MM_SHUFFLE(0, 3, 2, 1): \2743ret = _mm_shuffle_ps_0321((a), (b)); \2744break; \2745case _MM_SHUFFLE(2, 1, 0, 3): \2746ret = _mm_shuffle_ps_2103((a), (b)); \2747break; \2748case _MM_SHUFFLE(1, 0, 1, 0): \2749ret = _mm_movelh_ps((a), (b)); \2750break; \2751case _MM_SHUFFLE(1, 0, 0, 1): \2752ret = _mm_shuffle_ps_1001((a), (b)); \2753break; \2754case _MM_SHUFFLE(0, 1, 0, 1): \2755ret = _mm_shuffle_ps_0101((a), (b)); \2756break; \2757case _MM_SHUFFLE(3, 2, 1, 0): \2758ret = _mm_shuffle_ps_3210((a), (b)); \2759break; \2760case _MM_SHUFFLE(0, 0, 1, 1): \2761ret = _mm_shuffle_ps_0011((a), (b)); \2762break; \2763case _MM_SHUFFLE(0, 0, 2, 2): \2764ret = _mm_shuffle_ps_0022((a), (b)); \2765break; \2766case _MM_SHUFFLE(2, 2, 0, 0): \2767ret = _mm_shuffle_ps_2200((a), (b)); \2768break; \2769case _MM_SHUFFLE(3, 2, 0, 2): \2770ret = _mm_shuffle_ps_3202((a), (b)); \2771break; \2772case _MM_SHUFFLE(3, 2, 3, 2): \2773ret = _mm_movehl_ps((b), (a)); \2774break; \2775case _MM_SHUFFLE(1, 1, 3, 3): \2776ret = _mm_shuffle_ps_1133((a), (b)); \2777break; \2778case _MM_SHUFFLE(2, 0, 1, 0): \2779ret = _mm_shuffle_ps_2010((a), (b)); \2780break; \2781case _MM_SHUFFLE(2, 0, 0, 1): \2782ret = _mm_shuffle_ps_2001((a), (b)); \2783break; \2784case _MM_SHUFFLE(2, 0, 3, 2): \2785ret = _mm_shuffle_ps_2032((a), (b)); \2786break; \2787default: \2788ret = _mm_shuffle_ps_default((a), (b), (imm)); \2789break; \2790} \2791ret; \2792})2793#endif27942795// Computes the approximations of square roots of the four single-precision,2796// floating-point values of a. 
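// The _MM_SHUFFLE(fp3, fp2, fp1, fp0) immediate used with _mm_shuffle_ps above
// works as follows: fp0 and fp1 select result lanes 0-1 from a, while fp2 and
// fp3 select result lanes 2-3 from b. A minimal sketch (illustrative addition,
// not part of the original sse2neon API; the helper name is hypothetical):
static inline __m128 _sse2neon_example_broadcast_lane0(__m128 a)
{
    // Every selector is 0, so all four result lanes hold a[0].
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
}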
First computes reciprocal square roots and then2797// reciprocals of the four values.2798//2799// r0 := sqrt(a0)2800// r1 := sqrt(a1)2801// r2 := sqrt(a2)2802// r3 := sqrt(a3)2803//2804// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx2805FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)2806{2807#if SSE2NEON_PRECISE_SQRT2808float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));28092810// Test for vrsqrteq_f32(0) -> positive infinity case.2811// Change to zero, so that s * 1/sqrt(s) result is zero too.2812const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);2813const uint32x4_t div_by_zero =2814vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));2815recip = vreinterpretq_f32_u32(2816vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));28172818// Additional Netwon-Raphson iteration for accuracy2819recip = vmulq_f32(2820vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),2821recip);2822recip = vmulq_f32(2823vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),2824recip);28252826// sqrt(s) = s * 1/sqrt(s)2827return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));2828#elif defined(__aarch64__)2829return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));2830#else2831float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));2832float32x4_t sq = vrecpeq_f32(recipsq);2833return vreinterpretq_m128_f32(sq);2834#endif2835}28362837// Computes the approximation of the square root of the scalar single-precision2838// floating point value of in.2839// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx2840FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)2841{2842float32_t value =2843vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);2844return vreinterpretq_m128_f32(2845vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));2846}28472848// Stores four single-precision, floating-point values.2849// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx2850FORCE_INLINE void _mm_store_ps(float *p, __m128 a)2851{2852vst1q_f32(p, vreinterpretq_f32_m128(a));2853}28542855// Store the lower single-precision (32-bit) floating-point element from a into2856// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte2857// boundary or a general-protection exception may be generated.2858//2859// MEM[mem_addr+31:mem_addr] := a[31:0]2860// MEM[mem_addr+63:mem_addr+32] := a[31:0]2861// MEM[mem_addr+95:mem_addr+64] := a[31:0]2862// MEM[mem_addr+127:mem_addr+96] := a[31:0]2863//2864// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps12865FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)2866{2867float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);2868vst1q_f32(p, vdupq_n_f32(a0));2869}28702871// Stores the lower single - precision, floating - point value.2872// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx2873FORCE_INLINE void _mm_store_ss(float *p, __m128 a)2874{2875vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);2876}28772878// Store the lower single-precision (32-bit) floating-point element from a into2879// 4 contiguous elements in memory. 
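// Example round trip through the aligned store above: spill a vector to a
// 16-byte-aligned scratch array and reduce it in scalar code. On NEON the
// alignment requirement is relaxed, but honouring the x86 contract keeps the
// code portable back to SSE. An illustrative sketch (not part of the original
// sse2neon API; the helper name is hypothetical):
static inline float _sse2neon_example_hsum_ps(__m128 v)
{
    float ALIGN_STRUCT(16) tmp[4];
    _mm_store_ps(tmp, v);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}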
mem_addr must be aligned on a 16-byte2880// boundary or a general-protection exception may be generated.2881//2882// MEM[mem_addr+31:mem_addr] := a[31:0]2883// MEM[mem_addr+63:mem_addr+32] := a[31:0]2884// MEM[mem_addr+95:mem_addr+64] := a[31:0]2885// MEM[mem_addr+127:mem_addr+96] := a[31:0]2886//2887// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps2888#define _mm_store1_ps _mm_store_ps128892890// Stores the upper two single-precision, floating-point values of a to the2891// address p.2892//2893// *p0 := a22894// *p1 := a32895//2896// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx2897FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)2898{2899*p = vreinterpret_m64_f32(vget_high_f32(a));2900}29012902// Stores the lower two single-precision floating point values of a to the2903// address p.2904//2905// *p0 := a02906// *p1 := a12907//2908// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx2909FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)2910{2911*p = vreinterpret_m64_f32(vget_low_f32(a));2912}29132914// Store 4 single-precision (32-bit) floating-point elements from a into memory2915// in reverse order. mem_addr must be aligned on a 16-byte boundary or a2916// general-protection exception may be generated.2917//2918// MEM[mem_addr+31:mem_addr] := a[127:96]2919// MEM[mem_addr+63:mem_addr+32] := a[95:64]2920// MEM[mem_addr+95:mem_addr+64] := a[63:32]2921// MEM[mem_addr+127:mem_addr+96] := a[31:0]2922//2923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps2924FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)2925{2926float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));2927float32x4_t rev = vextq_f32(tmp, tmp, 2);2928vst1q_f32(p, rev);2929}29302931// Stores four single-precision, floating-point values.2932// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx2933FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)2934{2935vst1q_f32(p, vreinterpretq_f32_m128(a));2936}29372938// Stores 16-bits of integer data a at the address p.2939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si162940FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)2941{2942vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);2943}29442945// Stores 64-bits of integer data a at the address p.2946// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si642947FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)2948{2949vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);2950}29512952// Store 64-bits of integer data from a into memory using a non-temporal memory2953// hint.2954// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi2955FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)2956{2957vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));2958}29592960// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-2961// point elements) from a into memory using a non-temporal memory hint.2962// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps2963FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)2964{2965#if __has_builtin(__builtin_nontemporal_store)2966__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);2967#else2968vst1q_f32(p, vreinterpretq_f32_m128(a));2969#endif2970}29712972// Subtracts the four single-precision, floating-point values of a and b.2973//2974// r0 
:= a0 - b02975// r1 := a1 - b12976// r2 := a2 - b22977// r3 := a3 - b32978//2979// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx2980FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)2981{2982return vreinterpretq_m128_f32(2983vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));2984}29852986// Subtract the lower single-precision (32-bit) floating-point element in b from2987// the lower single-precision (32-bit) floating-point element in a, store the2988// result in the lower element of dst, and copy the upper 3 packed elements from2989// a to the upper elements of dst.2990//2991// dst[31:0] := a[31:0] - b[31:0]2992// dst[127:32] := a[127:32]2993//2994// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss2995FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)2996{2997return _mm_move_ss(a, _mm_sub_ps(a, b));2998}29993000// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision3001// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the3002// transposed matrix in these vectors (row0 now contains column 0, etc.).3003// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS3004#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \3005do { \3006float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \3007float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \3008row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \3009vget_low_f32(ROW23.val[0])); \3010row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \3011vget_low_f32(ROW23.val[1])); \3012row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \3013vget_high_f32(ROW23.val[0])); \3014row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \3015vget_high_f32(ROW23.val[1])); \3016} while (0)30173018// according to the documentation, these intrinsics behave the same as the3019// non-'u' versions. 
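// Typical use of _MM_TRANSPOSE4_PS above: transpose a row-major 4x4 matrix in
// place by loading its rows, transposing the registers, and storing them back.
// An illustrative sketch (not part of the original sse2neon API; the helper
// name is hypothetical):
static inline void _sse2neon_example_transpose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // r0..r3 now hold columns 0..3
    _mm_storeu_ps(m + 0, r0);
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}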
We'll just alias them here.3020#define _mm_ucomieq_ss _mm_comieq_ss3021#define _mm_ucomige_ss _mm_comige_ss3022#define _mm_ucomigt_ss _mm_comigt_ss3023#define _mm_ucomile_ss _mm_comile_ss3024#define _mm_ucomilt_ss _mm_comilt_ss3025#define _mm_ucomineq_ss _mm_comineq_ss30263027// Return vector of type __m128i with undefined elements.3028// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si1283029FORCE_INLINE __m128i _mm_undefined_si128(void)3030{3031#if defined(__GNUC__) || defined(__clang__)3032#pragma GCC diagnostic push3033#pragma GCC diagnostic ignored "-Wuninitialized"3034#endif3035__m128i a;3036return a;3037#if defined(__GNUC__) || defined(__clang__)3038#pragma GCC diagnostic pop3039#endif3040}30413042// Return vector of type __m128 with undefined elements.3043// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps3044FORCE_INLINE __m128 _mm_undefined_ps(void)3045{3046#if defined(__GNUC__) || defined(__clang__)3047#pragma GCC diagnostic push3048#pragma GCC diagnostic ignored "-Wuninitialized"3049#endif3050__m128 a;3051return a;3052#if defined(__GNUC__) || defined(__clang__)3053#pragma GCC diagnostic pop3054#endif3055}30563057// Selects and interleaves the upper two single-precision, floating-point values3058// from a and b.3059//3060// r0 := a23061// r1 := b23062// r2 := a33063// r3 := b33064//3065// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx3066FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)3067{3068#if defined(__aarch64__)3069return vreinterpretq_m128_f32(3070vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));3071#else3072float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));3073float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));3074float32x2x2_t result = vzip_f32(a1, b1);3075return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));3076#endif3077}30783079// Selects and interleaves the lower two single-precision, floating-point values3080// from a and b.3081//3082// r0 := a03083// r1 := b03084// r2 := a13085// r3 := b13086//3087// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx3088FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)3089{3090#if defined(__aarch64__)3091return vreinterpretq_m128_f32(3092vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));3093#else3094float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));3095float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));3096float32x2x2_t result = vzip_f32(a1, b1);3097return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));3098#endif3099}31003101// Computes bitwise EXOR (exclusive-or) of the four single-precision,3102// floating-point values of a and b.3103// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx3104FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)3105{3106return vreinterpretq_m128_s32(3107veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));3108}31093110/* SSE2 */31113112// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or3113// unsigned 16-bit integers in b.3114// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx3115FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)3116{3117return vreinterpretq_m128i_s16(3118vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3119}31203121// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or3122// unsigned 32-bit integers in b.3123//3124// r0 := a0 + b03125// r1 

/* SSE2 */

// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
//
// dst[63:0] := a[63:0] + b[63:0]
// dst[127:64] := a[127:64]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b, and store the result in dst.
//
// dst[63:0] := a[63:0] + b[63:0]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
// and saturates.
//
// r0 := SignedSaturate(a0 + b0)
// r1 := SignedSaturate(a1 + b1)
// ...
// r7 := SignedSaturate(a7 + b7)
//
// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}
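
// Illustrative sketch (not part of sse2neon itself; the values and variable
// names are hypothetical): the difference between the wrapping 16-bit add
// (_mm_add_epi16) and the saturating 16-bit add (_mm_adds_epi16) above.
//
//   __m128i x = _mm_set1_epi16(30000);
//   __m128i y = _mm_set1_epi16(10000);
//   __m128i wrap = _mm_add_epi16(x, y);   // each lane wraps around to -25536
//   __m128i sat = _mm_adds_epi16(x, y);   // each lane saturates to 32767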

// Add packed signed 8-bit integers in a and b using saturation, and store the
// results in dst.
//
// FOR j := 0 to 15
//   i := j*8
//   dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
// ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed unsigned 16-bit integers in a and b using saturation, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Compute the bitwise AND of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := a[i+63:i] AND b[i+63:i]
// ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
// b.
//
// r := a & b
//
// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
// ENDFOR
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    // *NOTE* argument swap
    return vreinterpretq_m128d_s64(
        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
}

// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
// 128-bit value in a.
//
// r := (~a) & b
//
// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
}

// Computes the average of the 8 unsigned 16-bit integers in a and the 8
// unsigned 16-bit integers in b and rounds.
//
// r0 := (a0 + b0) / 2
// r1 := (a1 + b1) / 2
// ...
// r7 := (a7 + b7) / 2
//
// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                 vreinterpretq_u16_m128i(b));
}

// Computes the average of the 16 unsigned 8-bit integers in a and the 16
// unsigned 8-bit integers in b and rounds.
//
// r0 := (a0 + b0) / 2
// r1 := (a1 + b1) / 2
// ...
// r15 := (a15 + b15) / 2
//
//
https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx3333FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)3334{3335return vreinterpretq_m128i_u8(3336vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));3337}33383339// Shift a left by imm8 bytes while shifting in zeros, and store the results in3340// dst.3341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si1283342#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)33433344// Shift a right by imm8 bytes while shifting in zeros, and store the results in3345// dst.3346// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si1283347#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)33483349// Cast vector of type __m128d to type __m128. This intrinsic is only used for3350// compilation and does not generate any instructions, thus it has zero latency.3351// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps3352FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)3353{3354return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));3355}33563357// Cast vector of type __m128d to type __m128i. This intrinsic is only used for3358// compilation and does not generate any instructions, thus it has zero latency.3359// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si1283360FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)3361{3362return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));3363}33643365// Cast vector of type __m128 to type __m128d. This intrinsic is only used for3366// compilation and does not generate any instructions, thus it has zero latency.3367// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd3368FORCE_INLINE __m128d _mm_castps_pd(__m128 a)3369{3370return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));3371}33723373// Applies a type cast to reinterpret four 32-bit floating point values passed3374// in as a 128-bit parameter as packed 32-bit integers.3375// https://msdn.microsoft.com/en-us/library/bb514099.aspx3376FORCE_INLINE __m128i _mm_castps_si128(__m128 a)3377{3378return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));3379}33803381// Cast vector of type __m128i to type __m128d. 
This intrinsic is only used for3382// compilation and does not generate any instructions, thus it has zero latency.3383// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd3384FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)3385{3386#if defined(__aarch64__)3387return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));3388#else3389return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));3390#endif3391}33923393// Applies a type cast to reinterpret four 32-bit integers passed in as a3394// 128-bit parameter as packed 32-bit floating point values.3395// https://msdn.microsoft.com/en-us/library/bb514029.aspx3396FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)3397{3398return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));3399}34003401// Invalidate and flush the cache line that contains p from all levels of the3402// cache hierarchy.3403// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush3404#if defined(__APPLE__)3405#include <libkern/OSCacheControl.h>3406#endif3407FORCE_INLINE void _mm_clflush(void const *p)3408{3409(void) p;34103411/* sys_icache_invalidate is supported since macOS 10.5.3412* However, it does not work on non-jailbroken iOS devices, although the3413* compilation is successful.3414*/3415#if defined(__APPLE__)3416sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);3417#elif defined(__GNUC__) || defined(__clang__)3418uintptr_t ptr = (uintptr_t) p;3419__builtin___clear_cache((char *) ptr,3420(char *) ptr + SSE2NEON_CACHELINE_SIZE);3421#else3422/* FIXME: MSVC support */3423#endif3424}34253426// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or3427// unsigned 16-bit integers in b for equality.3428// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx3429FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)3430{3431return vreinterpretq_m128i_u16(3432vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3433}34343435// Compare packed 32-bit integers in a and b for equality, and store the results3436// in dst3437FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)3438{3439return vreinterpretq_m128i_u32(3440vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));3441}34423443// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or3444// unsigned 8-bit integers in b for equality.3445// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx3446FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)3447{3448return vreinterpretq_m128i_u8(3449vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));3450}34513452// Compare packed double-precision (64-bit) floating-point elements in a and b3453// for equality, and store the results in dst.3454// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd3455FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)3456{3457#if defined(__aarch64__)3458return vreinterpretq_m128d_u64(3459vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3460#else3461// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)3462uint32x4_t cmp =3463vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));3464uint32x4_t swapped = vrev64q_u32(cmp);3465return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));3466#endif3467}34683469// Compare the lower double-precision (64-bit) floating-point elements in a and3470// b for equality, store the result in the lower element of dst, and copy the3471// 
upper element from a to the upper element of dst.3472// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd3473FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)3474{3475return _mm_move_sd(a, _mm_cmpeq_pd(a, b));3476}34773478// Compare packed double-precision (64-bit) floating-point elements in a and b3479// for greater-than-or-equal, and store the results in dst.3480// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd3481FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)3482{3483#if defined(__aarch64__)3484return vreinterpretq_m128d_u64(3485vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3486#else3487uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3488uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3489uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3490uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3491uint64_t d[2];3492d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3493d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);34943495return vreinterpretq_m128d_u64(vld1q_u64(d));3496#endif3497}34983499// Compare the lower double-precision (64-bit) floating-point elements in a and3500// b for greater-than-or-equal, store the result in the lower element of dst,3501// and copy the upper element from a to the upper element of dst.3502// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd3503FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)3504{3505#if defined(__aarch64__)3506return _mm_move_sd(a, _mm_cmpge_pd(a, b));3507#else3508// expand "_mm_cmpge_pd()" to reduce unnecessary operations3509uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3510uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3511uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3512uint64_t d[2];3513d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3514d[1] = a1;35153516return vreinterpretq_m128d_u64(vld1q_u64(d));3517#endif3518}35193520// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers3521// in b for greater than.3522//3523// r0 := (a0 > b0) ? 0xffff : 0x03524// r1 := (a1 > b1) ? 0xffff : 0x03525// ...3526// r7 := (a7 > b7) ? 0xffff : 0x03527//3528// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx3529FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)3530{3531return vreinterpretq_m128i_u16(3532vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3533}35343535// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers3536// in b for greater than.3537// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx3538FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)3539{3540return vreinterpretq_m128i_u32(3541vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));3542}35433544// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers3545// in b for greater than.3546//3547// r0 := (a0 > b0) ? 0xff : 0x03548// r1 := (a1 > b1) ? 0xff : 0x03549// ...3550// r15 := (a15 > b15) ? 
0xff : 0x03551//3552// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx3553FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)3554{3555return vreinterpretq_m128i_u8(3556vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));3557}35583559// Compare packed double-precision (64-bit) floating-point elements in a and b3560// for greater-than, and store the results in dst.3561// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd3562FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)3563{3564#if defined(__aarch64__)3565return vreinterpretq_m128d_u64(3566vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3567#else3568uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3569uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3570uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3571uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3572uint64_t d[2];3573d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3574d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);35753576return vreinterpretq_m128d_u64(vld1q_u64(d));3577#endif3578}35793580// Compare the lower double-precision (64-bit) floating-point elements in a and3581// b for greater-than, store the result in the lower element of dst, and copy3582// the upper element from a to the upper element of dst.3583// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd3584FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)3585{3586#if defined(__aarch64__)3587return _mm_move_sd(a, _mm_cmpgt_pd(a, b));3588#else3589// expand "_mm_cmpge_pd()" to reduce unnecessary operations3590uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3591uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3592uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3593uint64_t d[2];3594d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3595d[1] = a1;35963597return vreinterpretq_m128d_u64(vld1q_u64(d));3598#endif3599}36003601// Compare packed double-precision (64-bit) floating-point elements in a and b3602// for less-than-or-equal, and store the results in dst.3603// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd3604FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)3605{3606#if defined(__aarch64__)3607return vreinterpretq_m128d_u64(3608vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3609#else3610uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3611uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3612uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3613uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3614uint64_t d[2];3615d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3616d[1] = (*(double *) &a1) <= (*(double *) &b1) ? 
~UINT64_C(0) : UINT64_C(0);36173618return vreinterpretq_m128d_u64(vld1q_u64(d));3619#endif3620}36213622// Compare the lower double-precision (64-bit) floating-point elements in a and3623// b for less-than-or-equal, store the result in the lower element of dst, and3624// copy the upper element from a to the upper element of dst.3625// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd3626FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)3627{3628#if defined(__aarch64__)3629return _mm_move_sd(a, _mm_cmple_pd(a, b));3630#else3631// expand "_mm_cmpge_pd()" to reduce unnecessary operations3632uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3633uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3634uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3635uint64_t d[2];3636d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3637d[1] = a1;36383639return vreinterpretq_m128d_u64(vld1q_u64(d));3640#endif3641}36423643// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers3644// in b for less than.3645//3646// r0 := (a0 < b0) ? 0xffff : 0x03647// r1 := (a1 < b1) ? 0xffff : 0x03648// ...3649// r7 := (a7 < b7) ? 0xffff : 0x03650//3651// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx3652FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)3653{3654return vreinterpretq_m128i_u16(3655vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));3656}365736583659// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers3660// in b for less than.3661// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx3662FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)3663{3664return vreinterpretq_m128i_u32(3665vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));3666}36673668// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers3669// in b for lesser than.3670// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx3671FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)3672{3673return vreinterpretq_m128i_u8(3674vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));3675}36763677// Compare packed double-precision (64-bit) floating-point elements in a and b3678// for less-than, and store the results in dst.3679// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd3680FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)3681{3682#if defined(__aarch64__)3683return vreinterpretq_m128d_u64(3684vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));3685#else3686uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3687uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3688uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3689uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3690uint64_t d[2];3691d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3692d[1] = (*(double *) &a1) < (*(double *) &b1) ? 
~UINT64_C(0) : UINT64_C(0);36933694return vreinterpretq_m128d_u64(vld1q_u64(d));3695#endif3696}36973698// Compare the lower double-precision (64-bit) floating-point elements in a and3699// b for less-than, store the result in the lower element of dst, and copy the3700// upper element from a to the upper element of dst.3701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd3702FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)3703{3704#if defined(__aarch64__)3705return _mm_move_sd(a, _mm_cmplt_pd(a, b));3706#else3707uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3708uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3709uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3710uint64_t d[2];3711d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);3712d[1] = a1;37133714return vreinterpretq_m128d_u64(vld1q_u64(d));3715#endif3716}37173718// Compare packed double-precision (64-bit) floating-point elements in a and b3719// for not-equal, and store the results in dst.3720// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd3721FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)3722{3723#if defined(__aarch64__)3724return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(3725vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));3726#else3727// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)3728uint32x4_t cmp =3729vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));3730uint32x4_t swapped = vrev64q_u32(cmp);3731return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));3732#endif3733}37343735// Compare the lower double-precision (64-bit) floating-point elements in a and3736// b for not-equal, store the result in the lower element of dst, and copy the3737// upper element from a to the upper element of dst.3738// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd3739FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)3740{3741return _mm_move_sd(a, _mm_cmpneq_pd(a, b));3742}37433744// Compare packed double-precision (64-bit) floating-point elements in a and b3745// for not-greater-than-or-equal, and store the results in dst.3746// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd3747FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)3748{3749#if defined(__aarch64__)3750return vreinterpretq_m128d_u64(veorq_u64(3751vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3752vdupq_n_u64(UINT64_MAX)));3753#else3754uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3755uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3756uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3757uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3758uint64_t d[2];3759d[0] =3760!((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3761d[1] =3762!((*(double *) &a1) >= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0);37633764return vreinterpretq_m128d_u64(vld1q_u64(d));3765#endif3766}37673768// Compare the lower double-precision (64-bit) floating-point elements in a and3769// b for not-greater-than-or-equal, store the result in the lower element of3770// dst, and copy the upper element from a to the upper element of dst.3771// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd3772FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)3773{3774return _mm_move_sd(a, _mm_cmpnge_pd(a, b));3775}37763777// Compare packed double-precision (64-bit) floating-point elements in a and b3778// for not-greater-than, and store the results in dst.3779// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd3780FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)3781{3782#if defined(__aarch64__)3783return vreinterpretq_m128d_u64(veorq_u64(3784vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3785vdupq_n_u64(UINT64_MAX)));3786#else3787uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3788uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3789uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3790uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3791uint64_t d[2];3792d[0] =3793!((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3794d[1] =3795!((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);37963797return vreinterpretq_m128d_u64(vld1q_u64(d));3798#endif3799}38003801// Compare the lower double-precision (64-bit) floating-point elements in a and3802// b for not-greater-than, store the result in the lower element of dst, and3803// copy the upper element from a to the upper element of dst.3804// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd3805FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)3806{3807return _mm_move_sd(a, _mm_cmpngt_pd(a, b));3808}38093810// Compare packed double-precision (64-bit) floating-point elements in a and b3811// for not-less-than-or-equal, and store the results in dst.3812// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd3813FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)3814{3815#if defined(__aarch64__)3816return vreinterpretq_m128d_u64(veorq_u64(3817vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3818vdupq_n_u64(UINT64_MAX)));3819#else3820uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3821uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3822uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3823uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3824uint64_t d[2];3825d[0] =3826!((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3827d[1] =3828!((*(double *) &a1) <= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0);38293830return vreinterpretq_m128d_u64(vld1q_u64(d));3831#endif3832}38333834// Compare the lower double-precision (64-bit) floating-point elements in a and3835// b for not-less-than-or-equal, store the result in the lower element of dst,3836// and copy the upper element from a to the upper element of dst.3837// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd3838FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)3839{3840return _mm_move_sd(a, _mm_cmpnle_pd(a, b));3841}38423843// Compare packed double-precision (64-bit) floating-point elements in a and b3844// for not-less-than, and store the results in dst.3845// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd3846FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)3847{3848#if defined(__aarch64__)3849return vreinterpretq_m128d_u64(veorq_u64(3850vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),3851vdupq_n_u64(UINT64_MAX)));3852#else3853uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3854uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3855uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3856uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3857uint64_t d[2];3858d[0] =3859!((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);3860d[1] =3861!((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);38623863return vreinterpretq_m128d_u64(vld1q_u64(d));3864#endif3865}38663867// Compare the lower double-precision (64-bit) floating-point elements in a and3868// b for not-less-than, store the result in the lower element of dst, and copy3869// the upper element from a to the upper element of dst.3870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd3871FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)3872{3873return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));3874}38753876// Compare packed double-precision (64-bit) floating-point elements in a and b3877// to see if neither is NaN, and store the results in dst.3878// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd3879FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)3880{3881#if defined(__aarch64__)3882// Excluding NaNs, any two floating point numbers can be compared.3883uint64x2_t not_nan_a =3884vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));3885uint64x2_t not_nan_b =3886vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));3887return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));3888#else3889uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3890uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3891uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3892uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3893uint64_t d[2];3894d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3895(*(double *) &b0) == (*(double *) &b0))3896? ~UINT64_C(0)3897: UINT64_C(0);3898d[1] = ((*(double *) &a1) == (*(double *) &a1) &&3899(*(double *) &b1) == (*(double *) &b1))3900? 
~UINT64_C(0)3901: UINT64_C(0);39023903return vreinterpretq_m128d_u64(vld1q_u64(d));3904#endif3905}39063907// Compare the lower double-precision (64-bit) floating-point elements in a and3908// b to see if neither is NaN, store the result in the lower element of dst, and3909// copy the upper element from a to the upper element of dst.3910// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd3911FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)3912{3913#if defined(__aarch64__)3914return _mm_move_sd(a, _mm_cmpord_pd(a, b));3915#else3916uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3917uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3918uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3919uint64_t d[2];3920d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3921(*(double *) &b0) == (*(double *) &b0))3922? ~UINT64_C(0)3923: UINT64_C(0);3924d[1] = a1;39253926return vreinterpretq_m128d_u64(vld1q_u64(d));3927#endif3928}39293930// Compare packed double-precision (64-bit) floating-point elements in a and b3931// to see if either is NaN, and store the results in dst.3932// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd3933FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)3934{3935#if defined(__aarch64__)3936// Two NaNs are not equal in comparison operation.3937uint64x2_t not_nan_a =3938vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));3939uint64x2_t not_nan_b =3940vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));3941return vreinterpretq_m128d_s32(3942vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));3943#else3944uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3945uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3946uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3947uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));3948uint64_t d[2];3949d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3950(*(double *) &b0) == (*(double *) &b0))3951? UINT64_C(0)3952: ~UINT64_C(0);3953d[1] = ((*(double *) &a1) == (*(double *) &a1) &&3954(*(double *) &b1) == (*(double *) &b1))3955? UINT64_C(0)3956: ~UINT64_C(0);39573958return vreinterpretq_m128d_u64(vld1q_u64(d));3959#endif3960}39613962// Compare the lower double-precision (64-bit) floating-point elements in a and3963// b to see if either is NaN, store the result in the lower element of dst, and3964// copy the upper element from a to the upper element of dst.3965// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd3966FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)3967{3968#if defined(__aarch64__)3969return _mm_move_sd(a, _mm_cmpunord_pd(a, b));3970#else3971uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3972uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));3973uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));3974uint64_t d[2];3975d[0] = ((*(double *) &a0) == (*(double *) &a0) &&3976(*(double *) &b0) == (*(double *) &b0))3977? 
UINT64_C(0)3978: ~UINT64_C(0);3979d[1] = a1;39803981return vreinterpretq_m128d_u64(vld1q_u64(d));3982#endif3983}39843985// Compare the lower double-precision (64-bit) floating-point element in a and b3986// for greater-than-or-equal, and return the boolean result (0 or 1).3987// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd3988FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)3989{3990#if defined(__aarch64__)3991return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;3992#else3993uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));3994uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));39953996return (*(double *) &a0 >= *(double *) &b0);3997#endif3998}39994000// Compare the lower double-precision (64-bit) floating-point element in a and b4001// for greater-than, and return the boolean result (0 or 1).4002// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd4003FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)4004{4005#if defined(__aarch64__)4006return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;4007#else4008uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4009uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));40104011return (*(double *) &a0 > *(double *) &b0);4012#endif4013}40144015// Compare the lower double-precision (64-bit) floating-point element in a and b4016// for less-than-or-equal, and return the boolean result (0 or 1).4017// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd4018FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)4019{4020#if defined(__aarch64__)4021return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;4022#else4023uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4024uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));40254026return (*(double *) &a0 <= *(double *) &b0);4027#endif4028}40294030// Compare the lower double-precision (64-bit) floating-point element in a and b4031// for less-than, and return the boolean result (0 or 1).4032// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd4033FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)4034{4035#if defined(__aarch64__)4036return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;4037#else4038uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4039uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));40404041return (*(double *) &a0 < *(double *) &b0);4042#endif4043}40444045// Compare the lower double-precision (64-bit) floating-point element in a and b4046// for equality, and return the boolean result (0 or 1).4047// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd4048FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)4049{4050#if defined(__aarch64__)4051return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;4052#else4053uint32x4_t a_not_nan =4054vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));4055uint32x4_t b_not_nan =4056vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));4057uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);4058uint32x4_t a_eq_b =4059vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));4060uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),4061vreinterpretq_u64_u32(a_eq_b));4062return vgetq_lane_u64(and_results, 0) & 0x1;4063#endif4064}40654066// Compare the lower double-precision (64-bit) floating-point element in a 
and b4067// for not-equal, and return the boolean result (0 or 1).4068// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd4069FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)4070{4071return !_mm_comieq_sd(a, b);4072}40734074// Convert packed signed 32-bit integers in a to packed double-precision4075// (64-bit) floating-point elements, and store the results in dst.4076//4077// FOR j := 0 to 14078// i := j*324079// m := j*644080// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])4081// ENDFOR4082//4083// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd4084FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)4085{4086#if defined(__aarch64__)4087return vreinterpretq_m128d_f64(4088vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));4089#else4090double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);4091double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);4092return _mm_set_pd(a1, a0);4093#endif4094}40954096// Converts the four signed 32-bit integer values of a to single-precision,4097// floating-point values4098// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx4099FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)4100{4101return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));4102}41034104// Convert packed double-precision (64-bit) floating-point elements in a to4105// packed 32-bit integers, and store the results in dst.4106//4107// FOR j := 0 to 14108// i := 32*j4109// k := 64*j4110// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])4111// ENDFOR4112//4113// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi324114FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)4115{4116// vrnd32xq_f64 not supported on clang4117#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)4118float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));4119int64x2_t integers = vcvtq_s64_f64(rounded);4120return vreinterpretq_m128i_s32(4121vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));4122#else4123__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4124double d0 = ((double *) &rnd)[0];4125double d1 = ((double *) &rnd)[1];4126return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);4127#endif4128}41294130// Convert packed double-precision (64-bit) floating-point elements in a to4131// packed 32-bit integers, and store the results in dst.4132//4133// FOR j := 0 to 14134// i := 32*j4135// k := 64*j4136// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])4137// ENDFOR4138//4139// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi324140FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)4141{4142__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4143double d0 = ((double *) &rnd)[0];4144double d1 = ((double *) &rnd)[1];4145int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};4146return vreinterpret_m64_s32(vld1_s32(data));4147}41484149// Convert packed double-precision (64-bit) floating-point elements in a to4150// packed single-precision (32-bit) floating-point elements, and store the4151// results in dst.4152//4153// FOR j := 0 to 14154// i := 32*j4155// k := 64*j4156// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])4157// ENDFOR4158// dst[127:64] := 04159//4160// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps4161FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)4162{4163#if defined(__aarch64__)4164float32x2_t tmp = 
vcvt_f32_f64(vreinterpretq_f64_m128d(a));4165return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));4166#else4167float a0 = (float) ((double *) &a)[0];4168float a1 = (float) ((double *) &a)[1];4169return _mm_set_ps(0, 0, a1, a0);4170#endif4171}41724173// Convert packed signed 32-bit integers in a to packed double-precision4174// (64-bit) floating-point elements, and store the results in dst.4175//4176// FOR j := 0 to 14177// i := j*324178// m := j*644179// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])4180// ENDFOR4181//4182// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd4183FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)4184{4185#if defined(__aarch64__)4186return vreinterpretq_m128d_f64(4187vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));4188#else4189double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);4190double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);4191return _mm_set_pd(a1, a0);4192#endif4193}41944195// Converts the four single-precision, floating-point values of a to signed4196// 32-bit integer values.4197//4198// r0 := (int) a04199// r1 := (int) a14200// r2 := (int) a24201// r3 := (int) a34202//4203// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx4204// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A4205// does not support! It is supported on ARMv8-A however.4206FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)4207{4208#if defined(__ARM_FEATURE_FRINT)4209return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));4210#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)4211switch (_MM_GET_ROUNDING_MODE()) {4212case _MM_ROUND_NEAREST:4213return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));4214case _MM_ROUND_DOWN:4215return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));4216case _MM_ROUND_UP:4217return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));4218default: // _MM_ROUND_TOWARD_ZERO4219return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));4220}4221#else4222float *f = (float *) &a;4223switch (_MM_GET_ROUNDING_MODE()) {4224case _MM_ROUND_NEAREST: {4225uint32x4_t signmask = vdupq_n_u32(0x80000000);4226float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),4227vdupq_n_f32(0.5f)); /* +/- 0.5 */4228int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(4229vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/4230int32x4_t r_trunc = vcvtq_s32_f32(4231vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */4232int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(4233vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */4234int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),4235vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */4236float32x4_t delta = vsubq_f32(4237vreinterpretq_f32_m128(a),4238vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */4239uint32x4_t is_delta_half =4240vceqq_f32(delta, half); /* delta == +/- 0.5 */4241return vreinterpretq_m128i_s32(4242vbslq_s32(is_delta_half, r_even, r_normal));4243}4244case _MM_ROUND_DOWN:4245return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),4246floorf(f[0]));4247case _MM_ROUND_UP:4248return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),4249ceilf(f[0]));4250default: // _MM_ROUND_TOWARD_ZERO4251return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],4252(int32_t) f[0]);4253}4254#endif4255}42564257// Convert packed single-precision (32-bit) floating-point elements in a to4258// packed double-precision (64-bit) floating-point elements, and store 
the4259// results in dst.4260//4261// FOR j := 0 to 14262// i := 64*j4263// k := 32*j4264// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])4265// ENDFOR4266//4267// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd4268FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)4269{4270#if defined(__aarch64__)4271return vreinterpretq_m128d_f64(4272vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));4273#else4274double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);4275double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);4276return _mm_set_pd(a1, a0);4277#endif4278}42794280// Copy the lower double-precision (64-bit) floating-point element of a to dst.4281//4282// dst[63:0] := a[63:0]4283//4284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f644285FORCE_INLINE double _mm_cvtsd_f64(__m128d a)4286{4287#if defined(__aarch64__)4288return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);4289#else4290return ((double *) &a)[0];4291#endif4292}42934294// Convert the lower double-precision (64-bit) floating-point element in a to a4295// 32-bit integer, and store the result in dst.4296//4297// dst[31:0] := Convert_FP64_To_Int32(a[63:0])4298//4299// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si324300FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)4301{4302#if defined(__aarch64__)4303return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);4304#else4305__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4306double ret = ((double *) &rnd)[0];4307return (int32_t) ret;4308#endif4309}43104311// Convert the lower double-precision (64-bit) floating-point element in a to a4312// 64-bit integer, and store the result in dst.4313//4314// dst[63:0] := Convert_FP64_To_Int64(a[63:0])4315//4316// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si644317FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)4318{4319#if defined(__aarch64__)4320return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);4321#else4322__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);4323double ret = ((double *) &rnd)[0];4324return (int64_t) ret;4325#endif4326}43274328// Convert the lower double-precision (64-bit) floating-point element in a to a4329// 64-bit integer, and store the result in dst.4330//4331// dst[63:0] := Convert_FP64_To_Int64(a[63:0])4332//4333// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x4334#define _mm_cvtsd_si64x _mm_cvtsd_si6443354336// Convert the lower double-precision (64-bit) floating-point element in b to a4337// single-precision (32-bit) floating-point element, store the result in the4338// lower element of dst, and copy the upper 3 packed elements from a to the4339// upper elements of dst.4340// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss4341FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)4342{4343#if defined(__aarch64__)4344return vreinterpretq_m128_f32(vsetq_lane_f32(4345vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),4346vreinterpretq_f32_m128(a), 0));4347#else4348return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],4349vreinterpretq_f32_m128(a), 0));4350#endif4351}43524353// Copy the lower 32-bit integer in a to dst.4354//4355// dst[31:0] := a[31:0]4356//4357// 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si324358FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)4359{4360return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);4361}43624363// Copy the lower 64-bit integer in a to dst.4364//4365// dst[63:0] := a[63:0]4366//4367// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si644368FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)4369{4370return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);4371}43724373// Copy the lower 64-bit integer in a to dst.4374// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x4375#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)43764377// Convert the signed 32-bit integer b to a double-precision (64-bit)4378// floating-point element, store the result in the lower element of dst, and4379// copy the upper element from a to the upper element of dst.4380// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd4381FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)4382{4383#if defined(__aarch64__)4384return vreinterpretq_m128d_f64(4385vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));4386#else4387double bf = (double) b;4388return vreinterpretq_m128d_s64(4389vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));4390#endif4391}43924393// Copy the lower 64-bit integer in a to dst.4394//4395// dst[63:0] := a[63:0]4396//4397// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x4398#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)43994400// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,4401// zero extending the upper bits.4402//4403// r0 := a4404// r1 := 0x04405// r2 := 0x04406// r3 := 0x04407//4408// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx4409FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)4410{4411return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));4412}44134414// Convert the signed 64-bit integer b to a double-precision (64-bit)4415// floating-point element, store the result in the lower element of dst, and4416// copy the upper element from a to the upper element of dst.4417// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd4418FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)4419{4420#if defined(__aarch64__)4421return vreinterpretq_m128d_f64(4422vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));4423#else4424double bf = (double) b;4425return vreinterpretq_m128d_s64(4426vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));4427#endif4428}44294430// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,4431// zero extending the upper bits.4432//4433// r0 := a4434// r1 := 0x04435FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)4436{4437return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));4438}44394440// Copy 64-bit integer a to the lower element of dst, and zero the upper4441// element.4442// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si1284443#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)44444445// Convert the signed 64-bit integer b to a double-precision (64-bit)4446// floating-point element, store the result in the lower element of dst, and4447// copy the upper element from a to the upper element of dst.4448// 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd4449#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)44504451// Convert the lower single-precision (32-bit) floating-point element in b to a4452// double-precision (64-bit) floating-point element, store the result in the4453// lower element of dst, and copy the upper element from a to the upper element4454// of dst.4455//4456// dst[63:0] := Convert_FP32_To_FP64(b[31:0])4457// dst[127:64] := a[127:64]4458//4459// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd4460FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)4461{4462double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);4463#if defined(__aarch64__)4464return vreinterpretq_m128d_f64(4465vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));4466#else4467return vreinterpretq_m128d_s64(4468vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));4469#endif4470}44714472// Convert packed double-precision (64-bit) floating-point elements in a to4473// packed 32-bit integers with truncation, and store the results in dst.4474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi324475FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)4476{4477double a0 = ((double *) &a)[0];4478double a1 = ((double *) &a)[1];4479return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);4480}44814482// Convert packed double-precision (64-bit) floating-point elements in a to4483// packed 32-bit integers with truncation, and store the results in dst.4484// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi324485FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)4486{4487double a0 = ((double *) &a)[0];4488double a1 = ((double *) &a)[1];4489int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};4490return vreinterpret_m64_s32(vld1_s32(data));4491}44924493// Converts the four single-precision, floating-point values of a to signed4494// 32-bit integer values using truncate.4495// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx4496FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)4497{4498return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));4499}45004501// Convert the lower double-precision (64-bit) floating-point element in a to a4502// 32-bit integer with truncation, and store the result in dst.4503//4504// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])4505//4506// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si324507FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)4508{4509double ret = *((double *) &a);4510return (int32_t) ret;4511}45124513// Convert the lower double-precision (64-bit) floating-point element in a to a4514// 64-bit integer with truncation, and store the result in dst.4515//4516// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])4517//4518// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si644519FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)4520{4521#if defined(__aarch64__)4522return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);4523#else4524double ret = *((double *) &a);4525return (int64_t) ret;4526#endif4527}45284529// Convert the lower double-precision (64-bit) floating-point element in a to a4530// 64-bit integer with truncation, and store the result in dst.4531//4532// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])4533//4534// 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x4535#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)45364537// Divide packed double-precision (64-bit) floating-point elements in a by4538// packed elements in b, and store the results in dst.4539//4540// FOR j := 0 to 14541// i := 64*j4542// dst[i+63:i] := a[i+63:i] / b[i+63:i]4543// ENDFOR4544//4545// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd4546FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)4547{4548#if defined(__aarch64__)4549return vreinterpretq_m128d_f64(4550vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));4551#else4552double *da = (double *) &a;4553double *db = (double *) &b;4554double c[2];4555c[0] = da[0] / db[0];4556c[1] = da[1] / db[1];4557return vld1q_f32((float32_t *) c);4558#endif4559}45604561// Divide the lower double-precision (64-bit) floating-point element in a by the4562// lower double-precision (64-bit) floating-point element in b, store the result4563// in the lower element of dst, and copy the upper element from a to the upper4564// element of dst.4565// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd4566FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)4567{4568#if defined(__aarch64__)4569float64x2_t tmp =4570vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));4571return vreinterpretq_m128d_f64(4572vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));4573#else4574return _mm_move_sd(a, _mm_div_pd(a, b));4575#endif4576}45774578// Extracts the selected signed or unsigned 16-bit integer from a and zero4579// extends.4580// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx4581// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)4582#define _mm_extract_epi16(a, imm) \4583vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))45844585// Inserts the least significant 16 bits of b into the selected 16-bit integer4586// of a.4587// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx4588// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,4589// __constrange(0,8) int imm)4590#define _mm_insert_epi16(a, b, imm) \4591__extension__({ \4592vreinterpretq_m128i_s16( \4593vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \4594})45954596// Loads two double-precision from 16-byte aligned memory, floating-point4597// values.4598//4599// dst[127:0] := MEM[mem_addr+127:mem_addr]4600//4601// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd4602FORCE_INLINE __m128d _mm_load_pd(const double *p)4603{4604#if defined(__aarch64__)4605return vreinterpretq_m128d_f64(vld1q_f64(p));4606#else4607const float *fp = (const float *) p;4608float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};4609return vreinterpretq_m128d_f32(vld1q_f32(data));4610#endif4611}46124613// Load a double-precision (64-bit) floating-point element from memory into both4614// elements of dst.4615//4616// dst[63:0] := MEM[mem_addr+63:mem_addr]4617// dst[127:64] := MEM[mem_addr+63:mem_addr]4618//4619// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd14620#define _mm_load_pd1 _mm_load1_pd46214622// Load a double-precision (64-bit) floating-point element from memory into the4623// lower of dst, and zero the upper element. 
mem_addr does not need to be4624// aligned on any particular boundary.4625//4626// dst[63:0] := MEM[mem_addr+63:mem_addr]4627// dst[127:64] := 04628//4629// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd4630FORCE_INLINE __m128d _mm_load_sd(const double *p)4631{4632#if defined(__aarch64__)4633return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));4634#else4635const float *fp = (const float *) p;4636float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};4637return vreinterpretq_m128d_f32(vld1q_f32(data));4638#endif4639}46404641// Loads 128-bit value. :4642// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx4643FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)4644{4645return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));4646}46474648// Load a double-precision (64-bit) floating-point element from memory into both4649// elements of dst.4650//4651// dst[63:0] := MEM[mem_addr+63:mem_addr]4652// dst[127:64] := MEM[mem_addr+63:mem_addr]4653//4654// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd4655FORCE_INLINE __m128d _mm_load1_pd(const double *p)4656{4657#if defined(__aarch64__)4658return vreinterpretq_m128d_f64(vld1q_dup_f64(p));4659#else4660return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));4661#endif4662}46634664// Load a double-precision (64-bit) floating-point element from memory into the4665// upper element of dst, and copy the lower element from a to dst. mem_addr does4666// not need to be aligned on any particular boundary.4667//4668// dst[63:0] := a[63:0]4669// dst[127:64] := MEM[mem_addr+63:mem_addr]4670//4671// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd4672FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)4673{4674#if defined(__aarch64__)4675return vreinterpretq_m128d_f64(4676vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));4677#else4678return vreinterpretq_m128d_f32(vcombine_f32(4679vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));4680#endif4681}46824683// Load 64-bit integer from memory into the first element of dst.4684// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi644685FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)4686{4687/* Load the lower 64 bits of the value pointed to by p into the4688* lower 64 bits of the result, zeroing the upper 64 bits of the result.4689*/4690return vreinterpretq_m128i_s32(4691vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));4692}46934694// Load a double-precision (64-bit) floating-point element from memory into the4695// lower element of dst, and copy the upper element from a to dst. mem_addr does4696// not need to be aligned on any particular boundary.4697//4698// dst[63:0] := MEM[mem_addr+63:mem_addr]4699// dst[127:64] := a[127:64]4700//4701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd4702FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)4703{4704#if defined(__aarch64__)4705return vreinterpretq_m128d_f64(4706vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));4707#else4708return vreinterpretq_m128d_f32(4709vcombine_f32(vld1_f32((const float *) p),4710vget_high_f32(vreinterpretq_f32_m128d(a))));4711#endif4712}47134714// Load 2 double-precision (64-bit) floating-point elements from memory into dst4715// in reverse order. 
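// --- Illustrative sketch (not from the original header) ---
// _mm_load_sd and _mm_loadh_pd above can be combined to assemble a __m128d
// from two separate scalars; on AArch64 this maps to vsetq_lane_f64 followed
// by vcombine_f64. The helper name is hypothetical.
FORCE_INLINE __m128d _sse2neon_demo_load_pair_pd(const double *lo,
                                                 const double *hi)
{
    __m128d v = _mm_load_sd(lo); // v = { *lo, 0.0 }
    return _mm_loadh_pd(v, hi);  // v = { *lo, *hi }
}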
mem_addr must be aligned on a 16-byte boundary or a4716// general-protection exception may be generated.4717//4718// dst[63:0] := MEM[mem_addr+127:mem_addr+64]4719// dst[127:64] := MEM[mem_addr+63:mem_addr]4720//4721// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd4722FORCE_INLINE __m128d _mm_loadr_pd(const double *p)4723{4724#if defined(__aarch64__)4725float64x2_t v = vld1q_f64(p);4726return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));4727#else4728int64x2_t v = vld1q_s64((const int64_t *) p);4729return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));4730#endif4731}47324733// Loads two double-precision from unaligned memory, floating-point values.4734// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd4735FORCE_INLINE __m128d _mm_loadu_pd(const double *p)4736{4737return _mm_load_pd(p);4738}47394740// Loads 128-bit value. :4741// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx4742FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)4743{4744return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));4745}47464747// Load unaligned 32-bit integer from memory into the first element of dst.4748//4749// dst[31:0] := MEM[mem_addr+31:mem_addr]4750// dst[MAX:32] := 04751//4752// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si324753FORCE_INLINE __m128i _mm_loadu_si32(const void *p)4754{4755return vreinterpretq_m128i_s32(4756vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));4757}47584759// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit4760// integers from b.4761//4762// r0 := (a0 * b0) + (a1 * b1)4763// r1 := (a2 * b2) + (a3 * b3)4764// r2 := (a4 * b4) + (a5 * b5)4765// r3 := (a6 * b6) + (a7 * b7)4766// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx4767FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)4768{4769int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),4770vget_low_s16(vreinterpretq_s16_m128i(b)));4771#if defined(__aarch64__)4772int32x4_t high =4773vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));47744775return vreinterpretq_m128i_s32(vpaddq_s32(low, high));4776#else4777int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),4778vget_high_s16(vreinterpretq_s16_m128i(b)));47794780int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));4781int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));47824783return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));4784#endif4785}47864787// Conditionally store 8-bit integer elements from a into memory using mask4788// (elements are not stored when the highest bit is not set in the corresponding4789// element) and a non-temporal memory hint. 
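// --- Illustrative sketch (not from the original header) ---
// _mm_madd_epi16 above leaves four 32-bit pairwise sums
// (a[2j]*b[2j] + a[2j+1]*b[2j+1]); reducing them gives the dot product of the
// eight 16-bit lanes. The helper name is hypothetical.
FORCE_INLINE int32_t _sse2neon_demo_dot_epi16(__m128i a, __m128i b)
{
    int32x4_t sums = vreinterpretq_s32_m128i(_mm_madd_epi16(a, b));
#if defined(__aarch64__)
    return vaddvq_s32(sums); // horizontal add of the four lanes
#else
    int32x2_t s = vadd_s32(vget_low_s32(sums), vget_high_s32(sums));
    return vget_lane_s32(vpadd_s32(s, s), 0);
#endif
}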
mem_addr does not need to be aligned4790// on any particular boundary.4791// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si1284792FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)4793{4794int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);4795__m128 b = _mm_load_ps((const float *) mem_addr);4796int8x16_t masked =4797vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),4798vreinterpretq_s8_m128(b));4799vst1q_s8((int8_t *) mem_addr, masked);4800}48014802// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 84803// signed 16-bit integers from b.4804// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx4805FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)4806{4807return vreinterpretq_m128i_s16(4808vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));4809}48104811// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the4812// 16 unsigned 8-bit integers from b.4813// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx4814FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)4815{4816return vreinterpretq_m128i_u8(4817vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));4818}48194820// Compare packed double-precision (64-bit) floating-point elements in a and b,4821// and store packed maximum values in dst.4822// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd4823FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)4824{4825#if defined(__aarch64__)4826#if SSE2NEON_PRECISE_MINMAX4827float64x2_t _a = vreinterpretq_f64_m128d(a);4828float64x2_t _b = vreinterpretq_f64_m128d(b);4829return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));4830#else4831return vreinterpretq_m128d_f64(4832vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));4833#endif4834#else4835uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4836uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));4837uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));4838uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));4839uint64_t d[2];4840d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;4841d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;48424843return vreinterpretq_m128d_u64(vld1q_u64(d));4844#endif4845}48464847// Compare the lower double-precision (64-bit) floating-point elements in a and4848// b, store the maximum value in the lower element of dst, and copy the upper4849// element from a to the upper element of dst.4850// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd4851FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)4852{4853#if defined(__aarch64__)4854return _mm_move_sd(a, _mm_max_pd(a, b));4855#else4856double *da = (double *) &a;4857double *db = (double *) &b;4858double c[2] = {da[0] > db[0] ? 
da[0] : db[0], da[1]};4859return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));4860#endif4861}48624863// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 84864// signed 16-bit integers from b.4865// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx4866FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)4867{4868return vreinterpretq_m128i_s16(4869vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));4870}48714872// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the4873// 16 unsigned 8-bit integers from b.4874// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx4875FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)4876{4877return vreinterpretq_m128i_u8(4878vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));4879}48804881// Compare packed double-precision (64-bit) floating-point elements in a and b,4882// and store packed minimum values in dst.4883// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd4884FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)4885{4886#if defined(__aarch64__)4887#if SSE2NEON_PRECISE_MINMAX4888float64x2_t _a = vreinterpretq_f64_m128d(a);4889float64x2_t _b = vreinterpretq_f64_m128d(b);4890return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));4891#else4892return vreinterpretq_m128d_f64(4893vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));4894#endif4895#else4896uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));4897uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));4898uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));4899uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));4900uint64_t d[2];4901d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;4902d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;4903return vreinterpretq_m128d_u64(vld1q_u64(d));4904#endif4905}49064907// Compare the lower double-precision (64-bit) floating-point elements in a and4908// b, store the minimum value in the lower element of dst, and copy the upper4909// element from a to the upper element of dst.4910// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd4911FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)4912{4913#if defined(__aarch64__)4914return _mm_move_sd(a, _mm_min_pd(a, b));4915#else4916double *da = (double *) &a;4917double *db = (double *) &b;4918double c[2] = {da[0] < db[0] ? 
da[0] : db[0], da[1]};4919return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));4920#endif4921}49224923// Copy the lower 64-bit integer in a to the lower element of dst, and zero the4924// upper element.4925//4926// dst[63:0] := a[63:0]4927// dst[127:64] := 04928//4929// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi644930FORCE_INLINE __m128i _mm_move_epi64(__m128i a)4931{4932return vreinterpretq_m128i_s64(4933vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));4934}49354936// Move the lower double-precision (64-bit) floating-point element from b to the4937// lower element of dst, and copy the upper element from a to the upper element4938// of dst.4939//4940// dst[63:0] := b[63:0]4941// dst[127:64] := a[127:64]4942//4943// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd4944FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)4945{4946return vreinterpretq_m128d_f32(4947vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),4948vget_high_f32(vreinterpretq_f32_m128d(a))));4949}49504951// NEON does not provide a version of this function.4952// Creates a 16-bit mask from the most significant bits of the 16 signed or4953// unsigned 8-bit integers in a and zero extends the upper bits.4954// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx4955FORCE_INLINE int _mm_movemask_epi8(__m128i a)4956{4957// Use increasingly wide shifts+adds to collect the sign bits4958// together.4959// Since the widening shifts would be rather confusing to follow in little4960// endian, everything will be illustrated in big endian order instead. This4961// has a different result - the bits would actually be reversed on a big4962// endian machine.49634964// Starting input (only half the elements are shown):4965// 89 ff 1d c0 00 10 99 334966uint8x16_t input = vreinterpretq_u8_m128i(a);49674968// Shift out everything but the sign bits with an unsigned shift right.4969//4970// Bytes of the vector::4971// 89 ff 1d c0 00 10 99 334972// \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)4973// | | | | | | | |4974// 01 01 00 01 00 00 01 004975//4976// Bits of first important lane(s):4977// 10001001 (89)4978// \______4979// |4980// 00000001 (01)4981uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));49824983// Merge the even lanes together with a 16-bit unsigned shift right + add.4984// 'xx' represents garbage data which will be ignored in the final result.4985// In the important bytes, the add functions like a binary OR.4986//4987// 01 01 00 01 00 00 01 004988// \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))4989// \| \| \| \|4990// xx 03 xx 01 xx 00 xx 024991//4992// 00000001 00000001 (01 01)4993// \_______ |4994// \|4995// xxxxxxxx xxxxxx11 (xx 03)4996uint32x4_t paired16 =4997vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));49984999// Repeat with a wider 32-bit shift + add.5000// xx 03 xx 01 xx 00 xx 025001// \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>5002// 14))5003// \| \|5004// xx xx xx 0d xx xx xx 025005//5006// 00000011 00000001 (03 01)5007// \\_____ ||5008// '----.\||5009// xxxxxxxx xxxx1101 (xx 0d)5010uint64x2_t paired32 =5011vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));50125013// Last, an even wider 64-bit shift + add to get our result in the low 8 bit5014// lanes. 
xx xx xx 0d xx xx xx 025015// \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>5016// 28))5017// \|5018// xx xx xx xx xx xx xx d25019//5020// 00001101 00000010 (0d 02)5021// \ \___ | |5022// '---. \| |5023// xxxxxxxx 11010010 (xx d2)5024uint8x16_t paired64 =5025vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));50265027// Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.5028// xx xx xx xx xx xx xx d25029// || return paired64[0]5030// d25031// Note: Little endian would return the correct value 4b (01001011) instead.5032return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);5033}50345035// Set each bit of mask dst based on the most significant bit of the5036// corresponding packed double-precision (64-bit) floating-point element in a.5037// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd5038FORCE_INLINE int _mm_movemask_pd(__m128d a)5039{5040uint64x2_t input = vreinterpretq_u64_m128d(a);5041uint64x2_t high_bits = vshrq_n_u64(input, 63);5042return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);5043}50445045// Copy the lower 64-bit integer in a to dst.5046//5047// dst[63:0] := a[63:0]5048//5049// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi645050FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)5051{5052return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));5053}50545055// Copy the 64-bit integer a to the lower element of dst, and zero the upper5056// element.5057//5058// dst[63:0] := a[63:0]5059// dst[127:64] := 05060//5061// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi645062FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)5063{5064return vreinterpretq_m128i_s64(5065vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));5066}50675068// Multiply the low unsigned 32-bit integers from each packed 64-bit element in5069// a and b, and store the unsigned 64-bit results in dst.5070//5071// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)5072// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)5073FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)5074{5075// vmull_u32 upcasts instead of masking, so we downcast.5076uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));5077uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));5078return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));5079}50805081// Multiply packed double-precision (64-bit) floating-point elements in a and b,5082// and store the results in dst.5083// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd5084FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)5085{5086#if defined(__aarch64__)5087return vreinterpretq_m128d_f64(5088vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));5089#else5090double *da = (double *) &a;5091double *db = (double *) &b;5092double c[2];5093c[0] = da[0] * db[0];5094c[1] = da[1] * db[1];5095return vld1q_f32((float32_t *) c);5096#endif5097}50985099// Multiply the lower double-precision (64-bit) floating-point element in a and5100// b, store the result in the lower element of dst, and copy the upper element5101// from a to the upper element of dst.5102// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd5103FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)5104{5105return _mm_move_sd(a, _mm_mul_pd(a, b));5106}51075108// Multiply the low unsigned 32-bit integers from a and b, and store the5109// 
unsigned 64-bit result in dst.5110//5111// dst[63:0] := a[31:0] * b[31:0]5112//5113// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su325114FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)5115{5116return vreinterpret_m64_u64(vget_low_u64(5117vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));5118}51195120// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit5121// integers from b.5122//5123// r0 := (a0 * b0)[31:16]5124// r1 := (a1 * b1)[31:16]5125// ...5126// r7 := (a7 * b7)[31:16]5127//5128// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx5129FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)5130{5131/* FIXME: issue with large values because of result saturation */5132// int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),5133// vreinterpretq_s16_m128i(b)); /* =2*a*b */ return5134// vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));5135int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));5136int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));5137int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */5138int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));5139int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));5140int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */5141uint16x8x2_t r =5142vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));5143return vreinterpretq_m128i_u16(r.val[1]);5144}51455146// Multiply the packed unsigned 16-bit integers in a and b, producing5147// intermediate 32-bit integers, and store the high 16 bits of the intermediate5148// integers in dst.5149// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu165150FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)5151{5152uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));5153uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));5154uint32x4_t ab3210 = vmull_u16(a3210, b3210);5155#if defined(__aarch64__)5156uint32x4_t ab7654 =5157vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));5158uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),5159vreinterpretq_u16_u32(ab7654));5160return vreinterpretq_m128i_u16(r);5161#else5162uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));5163uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));5164uint32x4_t ab7654 = vmull_u16(a7654, b7654);5165uint16x8x2_t r =5166vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));5167return vreinterpretq_m128i_u16(r.val[1]);5168#endif5169}51705171// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or5172// unsigned 16-bit integers from b.5173//5174// r0 := (a0 * b0)[15:0]5175// r1 := (a1 * b1)[15:0]5176// ...5177// r7 := (a7 * b7)[15:0]5178//5179// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx5180FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)5181{5182return vreinterpretq_m128i_s16(5183vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));5184}51855186// Compute the bitwise OR of packed double-precision (64-bit) floating-point5187// elements in a and b, and store the results in dst.5188// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd5189FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)5190{5191return vreinterpretq_m128d_s64(5192vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));5193}51945195// Computes the bitwise OR of the 
128-bit value in a and the 128-bit value in b.5196//5197// r := a | b5198//5199// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx5200FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)5201{5202return vreinterpretq_m128i_s32(5203vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));5204}52055206// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and5207// saturates.5208// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx5209FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)5210{5211return vreinterpretq_m128i_s8(5212vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),5213vqmovn_s16(vreinterpretq_s16_m128i(b))));5214}52155216// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers5217// and saturates.5218//5219// r0 := SignedSaturate(a0)5220// r1 := SignedSaturate(a1)5221// r2 := SignedSaturate(a2)5222// r3 := SignedSaturate(a3)5223// r4 := SignedSaturate(b0)5224// r5 := SignedSaturate(b1)5225// r6 := SignedSaturate(b2)5226// r7 := SignedSaturate(b3)5227//5228// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx5229FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)5230{5231return vreinterpretq_m128i_s16(5232vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),5233vqmovn_s32(vreinterpretq_s32_m128i(b))));5234}52355236// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned5237// integers and saturates.5238//5239// r0 := UnsignedSaturate(a0)5240// r1 := UnsignedSaturate(a1)5241// ...5242// r7 := UnsignedSaturate(a7)5243// r8 := UnsignedSaturate(b0)5244// r9 := UnsignedSaturate(b1)5245// ...5246// r15 := UnsignedSaturate(b7)5247//5248// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx5249FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)5250{5251return vreinterpretq_m128i_u8(5252vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),5253vqmovun_s16(vreinterpretq_s16_m128i(b))));5254}52555256// Pause the processor. This is typically used in spin-wait loops and depending5257// on the x86 processor typical values are in the 40-100 cycle range. The5258// 'yield' instruction isn't a good fit because it's effectively a nop on most5259// Arm cores. 
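// --- Illustrative sketch (not from the original header) ---
// The saturating packs above clamp before narrowing: vqmovn_s16 limits each
// 16-bit lane to [-128, 127], which matches the SSE2 signed-saturation
// behaviour. The helper name is hypothetical.
FORCE_INLINE __m128i _sse2neon_demo_packs_saturation(void)
{
    __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(300));  // clamps to 127
    __m128i b = vreinterpretq_m128i_s16(vdupq_n_s16(-300)); // clamps to -128
    return _mm_packs_epi16(a, b); // low 8 bytes = 127, high 8 bytes = -128
}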
// Experience with several databases has shown an 'isb' is
// a reasonable approximation.
FORCE_INLINE void _mm_pause()
{
    __asm__ __volatile__("isb\n");
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce two
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of 64-bit elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
}

// Sets the 8 signed 16-bit integer values.
// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
                                   short i6,
                                   short i5,
                                   short i4,
                                   short i3,
                                   short i2,
                                   short i1,
                                   short i0)
{
    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
    return vreinterpretq_m128i_s16(vld1q_s16(data));
}

// Sets the 4 signed 32-bit integer values.
// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}

// Returns the __m128i structure with its two 64-bit integer values
// initialized to the values of the two 64-bit integers passed in.
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
{
    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
}

// Returns the __m128i structure with its two 64-bit integer values
// initialized to the values of the two 64-bit integers passed in.
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
}

// Sets the 16 signed 8-bit integer values.
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                  signed char b14,
                                  signed char b13,
                                  signed char b12,
                                  signed char b11,
                                  signed char b10,
                                  signed char b9,
                                  signed char b8,
                                  signed char b7,
                                  signed char b6,
                                  signed char b5,
                                  signed char b4,
                                  signed char b3,
                                  signed char b2,
                                  signed char b1,
                                  signed char b0)
{
    int8_t ALIGN_STRUCT(16)
        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}

// Set packed double-precision (64-bit) floating-point elements in dst with the
// supplied values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
{
    double ALIGN_STRUCT(16) data[2] = {e0, e1};
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
#else
    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
#endif
}

// Broadcast double-precision (64-bit) floating-point value a to all elements
of5358// dst.5359// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd15360#define _mm_set_pd1 _mm_set1_pd53615362// Copy double-precision (64-bit) floating-point element a to the lower element5363// of dst, and zero the upper element.5364// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd5365FORCE_INLINE __m128d _mm_set_sd(double a)5366{5367#if defined(__aarch64__)5368return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));5369#else5370return _mm_set_pd(0, a);5371#endif5372}53735374// Sets the 8 signed 16-bit integer values to w.5375//5376// r0 := w5377// r1 := w5378// ...5379// r7 := w5380//5381// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx5382FORCE_INLINE __m128i _mm_set1_epi16(short w)5383{5384return vreinterpretq_m128i_s16(vdupq_n_s16(w));5385}53865387// Sets the 4 signed 32-bit integer values to i.5388//5389// r0 := i5390// r1 := i5391// r2 := i5392// r3 := I5393//5394// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx5395FORCE_INLINE __m128i _mm_set1_epi32(int _i)5396{5397return vreinterpretq_m128i_s32(vdupq_n_s32(_i));5398}53995400// Sets the 2 signed 64-bit integer values to i.5401// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)5402FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)5403{5404return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));5405}54065407// Sets the 2 signed 64-bit integer values to i.5408// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x5409FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)5410{5411return vreinterpretq_m128i_s64(vdupq_n_s64(_i));5412}54135414// Sets the 16 signed 8-bit integer values to b.5415//5416// r0 := b5417// r1 := b5418// ...5419// r15 := b5420//5421// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx5422FORCE_INLINE __m128i _mm_set1_epi8(signed char w)5423{5424return vreinterpretq_m128i_s8(vdupq_n_s8(w));5425}54265427// Broadcast double-precision (64-bit) floating-point value a to all elements of5428// dst.5429// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd5430FORCE_INLINE __m128d _mm_set1_pd(double d)5431{5432#if defined(__aarch64__)5433return vreinterpretq_m128d_f64(vdupq_n_f64(d));5434#else5435return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));5436#endif5437}54385439// Sets the 8 signed 16-bit integer values in reverse order.5440//5441// Return Value5442// r0 := w05443// r1 := w15444// ...5445// r7 := w75446FORCE_INLINE __m128i _mm_setr_epi16(short w0,5447short w1,5448short w2,5449short w3,5450short w4,5451short w5,5452short w6,5453short w7)5454{5455int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};5456return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));5457}54585459// Sets the 4 signed 32-bit integer values in reverse order5460// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx5461FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)5462{5463int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};5464return vreinterpretq_m128i_s32(vld1q_s32(data));5465}54665467// Set packed 64-bit integers in dst with the supplied values in reverse order.5468// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi645469FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)5470{5471return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));5472}54735474// Sets 
the 16 signed 8-bit integer values in reverse order.5475// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx5476FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,5477signed char b1,5478signed char b2,5479signed char b3,5480signed char b4,5481signed char b5,5482signed char b6,5483signed char b7,5484signed char b8,5485signed char b9,5486signed char b10,5487signed char b11,5488signed char b12,5489signed char b13,5490signed char b14,5491signed char b15)5492{5493int8_t ALIGN_STRUCT(16)5494data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,5495(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,5496(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,5497(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};5498return (__m128i) vld1q_s8(data);5499}55005501// Set packed double-precision (64-bit) floating-point elements in dst with the5502// supplied values in reverse order.5503// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd5504FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)5505{5506return _mm_set_pd(e0, e1);5507}55085509// Return vector of type __m128d with all elements set to zero.5510// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd5511FORCE_INLINE __m128d _mm_setzero_pd(void)5512{5513#if defined(__aarch64__)5514return vreinterpretq_m128d_f64(vdupq_n_f64(0));5515#else5516return vreinterpretq_m128d_f32(vdupq_n_f32(0));5517#endif5518}55195520// Sets the 128-bit value to zero5521// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx5522FORCE_INLINE __m128i _mm_setzero_si128(void)5523{5524return vreinterpretq_m128i_s32(vdupq_n_s32(0));5525}55265527// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.5528// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx5529// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,5530// __constrange(0,255) int imm)5531#ifdef _sse2neon_shuffle5532#define _mm_shuffle_epi32(a, imm) \5533__extension__({ \5534int32x4_t _input = vreinterpretq_s32_m128i(a); \5535int32x4_t _shuf = \5536vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \5537((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \5538vreinterpretq_m128i_s32(_shuf); \5539})5540#else // generic5541#define _mm_shuffle_epi32(a, imm) \5542__extension__({ \5543__m128i ret; \5544switch (imm) { \5545case _MM_SHUFFLE(1, 0, 3, 2): \5546ret = _mm_shuffle_epi_1032((a)); \5547break; \5548case _MM_SHUFFLE(2, 3, 0, 1): \5549ret = _mm_shuffle_epi_2301((a)); \5550break; \5551case _MM_SHUFFLE(0, 3, 2, 1): \5552ret = _mm_shuffle_epi_0321((a)); \5553break; \5554case _MM_SHUFFLE(2, 1, 0, 3): \5555ret = _mm_shuffle_epi_2103((a)); \5556break; \5557case _MM_SHUFFLE(1, 0, 1, 0): \5558ret = _mm_shuffle_epi_1010((a)); \5559break; \5560case _MM_SHUFFLE(1, 0, 0, 1): \5561ret = _mm_shuffle_epi_1001((a)); \5562break; \5563case _MM_SHUFFLE(0, 1, 0, 1): \5564ret = _mm_shuffle_epi_0101((a)); \5565break; \5566case _MM_SHUFFLE(2, 2, 1, 1): \5567ret = _mm_shuffle_epi_2211((a)); \5568break; \5569case _MM_SHUFFLE(0, 1, 2, 2): \5570ret = _mm_shuffle_epi_0122((a)); \5571break; \5572case _MM_SHUFFLE(3, 3, 3, 2): \5573ret = _mm_shuffle_epi_3332((a)); \5574break; \5575case _MM_SHUFFLE(0, 0, 0, 0): \5576ret = _mm_shuffle_epi32_splat((a), 0); \5577break; \5578case _MM_SHUFFLE(1, 1, 1, 1): \5579ret = _mm_shuffle_epi32_splat((a), 1); \5580break; \5581case _MM_SHUFFLE(2, 2, 2, 2): \5582ret = _mm_shuffle_epi32_splat((a), 2); \5583break; \5584case _MM_SHUFFLE(3, 3, 3, 3): 
\5585ret = _mm_shuffle_epi32_splat((a), 3); \5586break; \5587default: \5588ret = _mm_shuffle_epi32_default((a), (imm)); \5589break; \5590} \5591ret; \5592})5593#endif55945595// Shuffle double-precision (64-bit) floating-point elements using the control5596// in imm8, and store the results in dst.5597//5598// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]5599// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]5600//5601// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd5602#ifdef _sse2neon_shuffle5603#define _mm_shuffle_pd(a, b, imm8) \5604vreinterpretq_m128d_s64( \5605vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \5606imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))5607#else5608#define _mm_shuffle_pd(a, b, imm8) \5609_mm_castsi128_pd(_mm_set_epi64x( \5610vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \5611vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))5612#endif56135614// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,5615// __constrange(0,255) int imm)5616#ifdef _sse2neon_shuffle5617#define _mm_shufflehi_epi16(a, imm) \5618__extension__({ \5619int16x8_t _input = vreinterpretq_s16_m128i(a); \5620int16x8_t _shuf = \5621vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \5622(((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \5623(((imm) >> 6) & 0x3) + 4); \5624vreinterpretq_m128i_s16(_shuf); \5625})5626#else // generic5627#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))5628#endif56295630// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,5631// __constrange(0,255) int imm)5632#ifdef _sse2neon_shuffle5633#define _mm_shufflelo_epi16(a, imm) \5634__extension__({ \5635int16x8_t _input = vreinterpretq_s16_m128i(a); \5636int16x8_t _shuf = vshuffleq_s16( \5637_input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \5638(((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \5639vreinterpretq_m128i_s16(_shuf); \5640})5641#else // generic5642#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))5643#endif56445645// Shift packed 16-bit integers in a left by count while shifting in zeros, and5646// store the results in dst.5647//5648// FOR j := 0 to 75649// i := j*165650// IF count[63:0] > 155651// dst[i+15:i] := 05652// ELSE5653// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])5654// FI5655// ENDFOR5656//5657// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi165658FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)5659{5660uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5661if (_sse2neon_unlikely(c & ~15))5662return _mm_setzero_si128();56635664int16x8_t vc = vdupq_n_s16((int16_t) c);5665return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));5666}56675668// Shift packed 32-bit integers in a left by count while shifting in zeros, and5669// store the results in dst.5670//5671// FOR j := 0 to 35672// i := j*325673// IF count[63:0] > 315674// dst[i+31:i] := 05675// ELSE5676// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])5677// FI5678// ENDFOR5679//5680// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi325681FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)5682{5683uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5684if (_sse2neon_unlikely(c & ~31))5685return _mm_setzero_si128();56865687int32x4_t vc = vdupq_n_s32((int32_t) c);5688return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), 
vc));5689}56905691// Shift packed 64-bit integers in a left by count while shifting in zeros, and5692// store the results in dst.5693//5694// FOR j := 0 to 15695// i := j*645696// IF count[63:0] > 635697// dst[i+63:i] := 05698// ELSE5699// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])5700// FI5701// ENDFOR5702//5703// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi645704FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)5705{5706uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5707if (_sse2neon_unlikely(c & ~63))5708return _mm_setzero_si128();57095710int64x2_t vc = vdupq_n_s64((int64_t) c);5711return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));5712}57135714// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and5715// store the results in dst.5716//5717// FOR j := 0 to 75718// i := j*165719// IF imm8[7:0] > 155720// dst[i+15:i] := 05721// ELSE5722// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])5723// FI5724// ENDFOR5725//5726// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi165727FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)5728{5729if (_sse2neon_unlikely(imm & ~15))5730return _mm_setzero_si128();5731return vreinterpretq_m128i_s16(5732vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));5733}57345735// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and5736// store the results in dst.5737//5738// FOR j := 0 to 35739// i := j*325740// IF imm8[7:0] > 315741// dst[i+31:i] := 05742// ELSE5743// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])5744// FI5745// ENDFOR5746//5747// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi325748FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)5749{5750if (_sse2neon_unlikely(imm & ~31))5751return _mm_setzero_si128();5752return vreinterpretq_m128i_s32(5753vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));5754}57555756// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and5757// store the results in dst.5758//5759// FOR j := 0 to 15760// i := j*645761// IF imm8[7:0] > 635762// dst[i+63:i] := 05763// ELSE5764// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])5765// FI5766// ENDFOR5767//5768// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi645769FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)5770{5771if (_sse2neon_unlikely(imm & ~63))5772return _mm_setzero_si128();5773return vreinterpretq_m128i_s64(5774vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));5775}57765777// Shift a left by imm8 bytes while shifting in zeros, and store the results in5778// dst.5779//5780// tmp := imm8[7:0]5781// IF tmp > 155782// tmp := 165783// FI5784// dst[127:0] := a[127:0] << (tmp*8)5785//5786// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si1285787#define _mm_slli_si128(a, imm) \5788__extension__({ \5789int8x16_t ret; \5790if (_sse2neon_unlikely(imm == 0)) \5791ret = vreinterpretq_s8_m128i(a); \5792else if (_sse2neon_unlikely((imm) & ~15)) \5793ret = vdupq_n_s8(0); \5794else \5795ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), \5796((imm <= 0 || imm > 15) ? 
0 : (16 - imm))); \5797vreinterpretq_m128i_s8(ret); \5798})57995800// Compute the square root of packed double-precision (64-bit) floating-point5801// elements in a, and store the results in dst.5802// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd5803FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)5804{5805#if defined(__aarch64__)5806return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));5807#else5808double a0 = sqrt(((double *) &a)[0]);5809double a1 = sqrt(((double *) &a)[1]);5810return _mm_set_pd(a1, a0);5811#endif5812}58135814// Compute the square root of the lower double-precision (64-bit) floating-point5815// element in b, store the result in the lower element of dst, and copy the5816// upper element from a to the upper element of dst.5817// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd5818FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)5819{5820#if defined(__aarch64__)5821return _mm_move_sd(a, _mm_sqrt_pd(b));5822#else5823return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));5824#endif5825}58265827// Shift packed 16-bit integers in a right by count while shifting in sign bits,5828// and store the results in dst.5829//5830// FOR j := 0 to 75831// i := j*165832// IF count[63:0] > 155833// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)5834// ELSE5835// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])5836// FI5837// ENDFOR5838//5839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi165840FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)5841{5842int64_t c = (int64_t) vget_low_s64((int64x2_t) count);5843if (_sse2neon_unlikely(c & ~15))5844return _mm_cmplt_epi16(a, _mm_setzero_si128());5845return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));5846}58475848// Shift packed 32-bit integers in a right by count while shifting in sign bits,5849// and store the results in dst.5850//5851// FOR j := 0 to 35852// i := j*325853// IF count[63:0] > 315854// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)5855// ELSE5856// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])5857// FI5858// ENDFOR5859//5860// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi325861FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)5862{5863int64_t c = (int64_t) vget_low_s64((int64x2_t) count);5864if (_sse2neon_unlikely(c & ~31))5865return _mm_cmplt_epi32(a, _mm_setzero_si128());5866return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));5867}58685869// Shift packed 16-bit integers in a right by imm8 while shifting in sign5870// bits, and store the results in dst.5871//5872// FOR j := 0 to 75873// i := j*165874// IF imm8[7:0] > 155875// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)5876// ELSE5877// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])5878// FI5879// ENDFOR5880//5881// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi165882FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)5883{5884const int count = (imm & ~15) ? 15 : imm;5885return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));5886}58875888// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,5889// and store the results in dst.5890//5891// FOR j := 0 to 35892// i := j*325893// IF imm8[7:0] > 315894// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0)5895// ELSE5896// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])5897// FI5898// ENDFOR5899//5900// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi325901// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)5902#define _mm_srai_epi32(a, imm) \5903__extension__({ \5904__m128i ret; \5905if (_sse2neon_unlikely((imm) == 0)) { \5906ret = a; \5907} else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \5908ret = vreinterpretq_m128i_s32( \5909vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \5910} else { \5911ret = vreinterpretq_m128i_s32( \5912vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \5913} \5914ret; \5915})59165917// Shift packed 16-bit integers in a right by count while shifting in zeros, and5918// store the results in dst.5919//5920// FOR j := 0 to 75921// i := j*165922// IF count[63:0] > 155923// dst[i+15:i] := 05924// ELSE5925// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])5926// FI5927// ENDFOR5928//5929// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi165930FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)5931{5932uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5933if (_sse2neon_unlikely(c & ~15))5934return _mm_setzero_si128();59355936int16x8_t vc = vdupq_n_s16(-(int16_t) c);5937return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));5938}59395940// Shift packed 32-bit integers in a right by count while shifting in zeros, and5941// store the results in dst.5942//5943// FOR j := 0 to 35944// i := j*325945// IF count[63:0] > 315946// dst[i+31:i] := 05947// ELSE5948// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])5949// FI5950// ENDFOR5951//5952// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi325953FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)5954{5955uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5956if (_sse2neon_unlikely(c & ~31))5957return _mm_setzero_si128();59585959int32x4_t vc = vdupq_n_s32(-(int32_t) c);5960return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));5961}59625963// Shift packed 64-bit integers in a right by count while shifting in zeros, and5964// store the results in dst.5965//5966// FOR j := 0 to 15967// i := j*645968// IF count[63:0] > 635969// dst[i+63:i] := 05970// ELSE5971// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])5972// FI5973// ENDFOR5974//5975// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi645976FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)5977{5978uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);5979if (_sse2neon_unlikely(c & ~63))5980return _mm_setzero_si128();59815982int64x2_t vc = vdupq_n_s64(-(int64_t) c);5983return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));5984}59855986// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and5987// store the results in dst.5988//5989// FOR j := 0 to 75990// i := j*165991// IF imm8[7:0] > 155992// dst[i+15:i] := 05993// ELSE5994// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])5995// FI5996// ENDFOR5997//5998// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi165999#define _mm_srli_epi16(a, imm) \6000__extension__({ \6001__m128i ret; \6002if (_sse2neon_unlikely((imm) & ~15)) { \6003ret = _mm_setzero_si128(); \6004} else { \6005ret = vreinterpretq_m128i_u16( 
\6006vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \6007} \6008ret; \6009})60106011// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and6012// store the results in dst.6013//6014// FOR j := 0 to 36015// i := j*326016// IF imm8[7:0] > 316017// dst[i+31:i] := 06018// ELSE6019// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])6020// FI6021// ENDFOR6022//6023// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi326024// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)6025#define _mm_srli_epi32(a, imm) \6026__extension__({ \6027__m128i ret; \6028if (_sse2neon_unlikely((imm) & ~31)) { \6029ret = _mm_setzero_si128(); \6030} else { \6031ret = vreinterpretq_m128i_u32( \6032vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \6033} \6034ret; \6035})60366037// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and6038// store the results in dst.6039//6040// FOR j := 0 to 16041// i := j*646042// IF imm8[7:0] > 636043// dst[i+63:i] := 06044// ELSE6045// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])6046// FI6047// ENDFOR6048//6049// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi646050#define _mm_srli_epi64(a, imm) \6051__extension__({ \6052__m128i ret; \6053if (_sse2neon_unlikely((imm) & ~63)) { \6054ret = _mm_setzero_si128(); \6055} else { \6056ret = vreinterpretq_m128i_u64( \6057vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \6058} \6059ret; \6060})60616062// Shift a right by imm8 bytes while shifting in zeros, and store the results in6063// dst.6064//6065// tmp := imm8[7:0]6066// IF tmp > 156067// tmp := 166068// FI6069// dst[127:0] := a[127:0] >> (tmp*8)6070//6071// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si1286072#define _mm_srli_si128(a, imm) \6073__extension__({ \6074int8x16_t ret; \6075if (_sse2neon_unlikely((imm) & ~15)) \6076ret = vdupq_n_s8(0); \6077else \6078ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \6079(imm > 15 ? 0 : imm)); \6080vreinterpretq_m128i_s8(ret); \6081})60826083// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point6084// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary6085// or a general-protection exception may be generated.6086// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd6087FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)6088{6089#if defined(__aarch64__)6090vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));6091#else6092vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));6093#endif6094}60956096// Store the lower double-precision (64-bit) floating-point element from a into6097// 2 contiguous elements in memory. 
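// --- Illustrative sketch (not from the original header) ---
// Unlike NEON's vshlq, the SSE shift-by-register intrinsics produce zero as
// soon as the 64-bit count exceeds the element width, which is why the
// helpers above test `c & ~15`, `c & ~31`, and so on. The helper name is
// hypothetical.
FORCE_INLINE __m128i _sse2neon_demo_srl_oversized(__m128i a)
{
    __m128i count = _mm_set_epi64x(0, 17); // shift count 17 for 16-bit lanes
    return _mm_srl_epi16(a, count);        // every lane becomes zero
}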
mem_addr must be aligned on a 16-byte6098// boundary or a general-protection exception may be generated.6099// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd16100FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)6101{6102#if defined(__aarch64__)6103float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));6104vst1q_f64((float64_t *) mem_addr,6105vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));6106#else6107float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));6108vst1q_f32((float32_t *) mem_addr,6109vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));6110#endif6111}61126113// Store the lower double-precision (64-bit) floating-point element from a into6114// memory. mem_addr does not need to be aligned on any particular boundary.6115// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd6116FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)6117{6118#if defined(__aarch64__)6119vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));6120#else6121vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));6122#endif6123}61246125// Stores four 32-bit integer values as (as a __m128i value) at the address p.6126// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx6127FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)6128{6129vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));6130}61316132// Store the lower double-precision (64-bit) floating-point element from a into6133// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte6134// boundary or a general-protection exception may be generated.6135// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd6136#define _mm_store1_pd _mm_store_pd161376138// Store the upper double-precision (64-bit) floating-point element from a into6139// memory.6140//6141// MEM[mem_addr+63:mem_addr] := a[127:64]6142//6143// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd6144FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)6145{6146#if defined(__aarch64__)6147vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));6148#else6149vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));6150#endif6151}61526153// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.6154// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx6155FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)6156{6157vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));6158}61596160// Store the lower double-precision (64-bit) floating-point element from a into6161// memory.6162//6163// MEM[mem_addr+63:mem_addr] := a[63:0]6164//6165// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd6166FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)6167{6168#if defined(__aarch64__)6169vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));6170#else6171vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));6172#endif6173}61746175// Store 2 double-precision (64-bit) floating-point elements from a into memory6176// in reverse order. 
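// --- Illustrative sketch (not from the original header) ---
// _mm_storel_pd/_mm_storeh_pd above split a __m128d back into two scalars,
// mirroring the _mm_load_sd/_mm_loadh_pd pair earlier in this file. The
// helper name and parameters are hypothetical.
FORCE_INLINE void _sse2neon_demo_split_pd(__m128d v, double *lo, double *hi)
{
    _mm_storel_pd(lo, v); // *lo = v[63:0]
    _mm_storeh_pd(hi, v); // *hi = v[127:64]
}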
mem_addr must be aligned on a 16-byte boundary or a6177// general-protection exception may be generated.6178//6179// MEM[mem_addr+63:mem_addr] := a[127:64]6180// MEM[mem_addr+127:mem_addr+64] := a[63:0]6181//6182// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd6183FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)6184{6185float32x4_t f = vreinterpretq_f32_m128d(a);6186_mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));6187}61886189// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point6190// elements) from a into memory. mem_addr does not need to be aligned on any6191// particular boundary.6192// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd6193FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)6194{6195_mm_store_pd(mem_addr, a);6196}61976198// Stores 128-bits of integer data a at the address p.6199// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si1286200FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)6201{6202vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));6203}62046205// Stores 32-bits of integer data a at the address p.6206// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si326207FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)6208{6209vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);6210}62116212// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point6213// elements) from a into memory using a non-temporal memory hint. mem_addr must6214// be aligned on a 16-byte boundary or a general-protection exception may be6215// generated.6216// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd6217FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)6218{6219#if __has_builtin(__builtin_nontemporal_store)6220__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);6221#elif defined(__aarch64__)6222vst1q_f64(p, vreinterpretq_f64_m128d(a));6223#else6224vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));6225#endif6226}62276228// Stores the data in a to the address p without polluting the caches. If the6229// cache line containing address p is already in the cache, the cache will be6230// updated.6231// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx6232FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)6233{6234#if __has_builtin(__builtin_nontemporal_store)6235__builtin_nontemporal_store(a, p);6236#else6237vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));6238#endif6239}62406241// Store 32-bit integer a into memory using a non-temporal hint to minimize6242// cache pollution. If the cache line containing address mem_addr is already in6243// the cache, the cache will be updated.6244// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si326245FORCE_INLINE void _mm_stream_si32(int *p, int a)6246{6247vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);6248}62496250// Store 64-bit integer a into memory using a non-temporal hint to minimize6251// cache pollution. 
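// --- Illustrative sketch (not from the original header) ---
// Filling a buffer with the non-temporal stores above. On Arm the hint may
// degrade to a plain vst1q store, so correctness must not depend on it; a
// barrier afterwards preserves the ordering callers typically pair with
// streaming stores on x86. The helper name is hypothetical.
FORCE_INLINE void _sse2neon_demo_stream_fill(__m128i *dst,
                                             __m128i value,
                                             size_t n)
{
    for (size_t i = 0; i < n; i++)
        _mm_stream_si128(dst + i, value);
    _sse2neon_smp_mb(); // make the stores visible before continuing
}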
If the cache line containing address mem_addr is already in6252// the cache, the cache will be updated.6253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si646254FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)6255{6256vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));6257}62586259// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and6260// store the results in dst.6261// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi166262FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)6263{6264return vreinterpretq_m128i_s16(6265vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));6266}62676268// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or6269// unsigned 32-bit integers of a.6270//6271// r0 := a0 - b06272// r1 := a1 - b16273// r2 := a2 - b26274// r3 := a3 - b36275//6276// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx6277FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)6278{6279return vreinterpretq_m128i_s32(6280vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6281}62826283// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,6284// and store the results in dst.6285// r0 := a0 - b06286// r1 := a1 - b16287FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)6288{6289return vreinterpretq_m128i_s64(6290vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));6291}62926293// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and6294// store the results in dst.6295// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi86296FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)6297{6298return vreinterpretq_m128i_s8(6299vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));6300}63016302// Subtract packed double-precision (64-bit) floating-point elements in b from6303// packed double-precision (64-bit) floating-point elements in a, and store the6304// results in dst.6305//6306// FOR j := 0 to 16307// i := j*646308// dst[i+63:i] := a[i+63:i] - b[i+63:i]6309// ENDFOR6310//6311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd6312FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)6313{6314#if defined(__aarch64__)6315return vreinterpretq_m128d_f64(6316vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6317#else6318double *da = (double *) &a;6319double *db = (double *) &b;6320double c[2];6321c[0] = da[0] - db[0];6322c[1] = da[1] - db[1];6323return vld1q_f32((float32_t *) c);6324#endif6325}63266327// Subtract the lower double-precision (64-bit) floating-point element in b from6328// the lower double-precision (64-bit) floating-point element in a, store the6329// result in the lower element of dst, and copy the upper element from a to the6330// upper element of dst.6331// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd6332FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)6333{6334return _mm_move_sd(a, _mm_sub_pd(a, b));6335}63366337// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.6338//6339// dst[63:0] := a[63:0] - b[63:0]6340//6341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si646342FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)6343{6344return vreinterpret_m64_s64(6345vsub_s64(vreinterpret_s64_m64(a), 
        vreinterpret_s64_m64(b)));
}

// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
// of a and saturates.
//
//   r0 := SignedSaturate(a0 - b0)
//   r1 := SignedSaturate(a1 - b1)
//   ...
//   r7 := SignedSaturate(a7 - b7)
//
// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
// of a and saturates.
//
//   r0 := SignedSaturate(a0 - b0)
//   r1 := SignedSaturate(a1 - b1)
//   ...
//   r15 := SignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
// integers of a and saturates.
//
//   r0 := UnsignedSaturate(a0 - b0)
//   r1 := UnsignedSaturate(a1 - b1)
//   ...
//   r15 := UnsignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Return vector of type __m128d with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128d a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
// upper 4 signed or unsigned 16-bit integers in b.
//
//   r0 := a4
//   r1 := b4
//   r2 := a5
//   r3 := b5
//   r4 := a6
//   r5 := b6
//   r6 := a7
//   r7 := b7
//
// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
// upper 2 signed or
unsigned 32-bit integers in b.6452// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx6453FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)6454{6455#if defined(__aarch64__)6456return vreinterpretq_m128i_s32(6457vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6458#else6459int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));6460int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));6461int32x2x2_t result = vzip_s32(a1, b1);6462return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));6463#endif6464}64656466// Interleaves the upper signed or unsigned 64-bit integer in a with the6467// upper signed or unsigned 64-bit integer in b.6468//6469// r0 := a16470// r1 := b16471FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)6472{6473int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));6474int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));6475return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));6476}64776478// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper6479// 8 signed or unsigned 8-bit integers in b.6480//6481// r0 := a86482// r1 := b86483// r2 := a96484// r3 := b96485// ...6486// r14 := a156487// r15 := b156488//6489// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx6490FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)6491{6492#if defined(__aarch64__)6493return vreinterpretq_m128i_s8(6494vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));6495#else6496int8x8_t a1 =6497vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));6498int8x8_t b1 =6499vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));6500int8x8x2_t result = vzip_s8(a1, b1);6501return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));6502#endif6503}65046505// Unpack and interleave double-precision (64-bit) floating-point elements from6506// the high half of a and b, and store the results in dst.6507//6508// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {6509// dst[63:0] := src1[127:64]6510// dst[127:64] := src2[127:64]6511// RETURN dst[127:0]6512// }6513// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])6514//6515// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd6516FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)6517{6518#if defined(__aarch64__)6519return vreinterpretq_m128d_f64(6520vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6521#else6522return vreinterpretq_m128d_s64(6523vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),6524vget_high_s64(vreinterpretq_s64_m128d(b))));6525#endif6526}65276528// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the6529// lower 4 signed or unsigned 16-bit integers in b.6530//6531// r0 := a06532// r1 := b06533// r2 := a16534// r3 := b16535// r4 := a26536// r5 := b26537// r6 := a36538// r7 := b36539//6540// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx6541FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)6542{6543#if defined(__aarch64__)6544return vreinterpretq_m128i_s16(6545vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));6546#else6547int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));6548int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));6549int16x4x2_t result = vzip_s16(a1, b1);6550return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));6551#endif6552}65536554// Interleaves the lower 2 signed or unsigned 
32 - bit integers in a with the6555// lower 2 signed or unsigned 32 - bit integers in b.6556//6557// r0 := a06558// r1 := b06559// r2 := a16560// r3 := b16561//6562// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx6563FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)6564{6565#if defined(__aarch64__)6566return vreinterpretq_m128i_s32(6567vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6568#else6569int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));6570int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));6571int32x2x2_t result = vzip_s32(a1, b1);6572return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));6573#endif6574}65756576FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)6577{6578int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));6579int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));6580return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));6581}65826583// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower6584// 8 signed or unsigned 8-bit integers in b.6585//6586// r0 := a06587// r1 := b06588// r2 := a16589// r3 := b16590// ...6591// r14 := a76592// r15 := b76593//6594// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx6595FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)6596{6597#if defined(__aarch64__)6598return vreinterpretq_m128i_s8(6599vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));6600#else6601int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));6602int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));6603int8x8x2_t result = vzip_s8(a1, b1);6604return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));6605#endif6606}66076608// Unpack and interleave double-precision (64-bit) floating-point elements from6609// the low half of a and b, and store the results in dst.6610//6611// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {6612// dst[63:0] := src1[63:0]6613// dst[127:64] := src2[63:0]6614// RETURN dst[127:0]6615// }6616// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])6617//6618// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd6619FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)6620{6621#if defined(__aarch64__)6622return vreinterpretq_m128d_f64(6623vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6624#else6625return vreinterpretq_m128d_s64(6626vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),6627vget_low_s64(vreinterpretq_s64_m128d(b))));6628#endif6629}66306631// Compute the bitwise XOR of packed double-precision (64-bit) floating-point6632// elements in a and b, and store the results in dst.6633//6634// FOR j := 0 to 16635// i := j*646636// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]6637// ENDFOR6638//6639// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd6640FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)6641{6642return vreinterpretq_m128d_s64(6643veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));6644}66456646// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in6647// b. 
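// A common use of the bitwise XOR on floating-point data is flipping sign
// bits, for example (a sketch, identifiers arbitrary):
//
//   __m128d v = _mm_set_pd(3.0, -2.0);
//   __m128d negated = _mm_xor_pd(v, _mm_set1_pd(-0.0)); // low 2.0, high -3.0
//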
https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx6648FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)6649{6650return vreinterpretq_m128i_s32(6651veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));6652}66536654/* SSE3 */66556656// Alternatively add and subtract packed double-precision (64-bit)6657// floating-point elements in a to/from packed elements in b, and store the6658// results in dst.6659//6660// FOR j := 0 to 16661// i := j*646662// IF ((j & 1) == 0)6663// dst[i+63:i] := a[i+63:i] - b[i+63:i]6664// ELSE6665// dst[i+63:i] := a[i+63:i] + b[i+63:i]6666// FI6667// ENDFOR6668//6669// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd6670FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)6671{6672_sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);6673#if defined(__aarch64__)6674return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),6675vreinterpretq_f64_m128d(b),6676vreinterpretq_f64_m128d(mask)));6677#else6678return _mm_add_pd(_mm_mul_pd(b, mask), a);6679#endif6680}66816682// Alternatively add and subtract packed single-precision (32-bit)6683// floating-point elements in a to/from packed elements in b, and store the6684// results in dst.6685// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps6686FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)6687{6688_sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);6689#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */6690return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),6691vreinterpretq_f32_m128(mask),6692vreinterpretq_f32_m128(b)));6693#else6694return _mm_add_ps(_mm_mul_ps(b, mask), a);6695#endif6696}66976698// Horizontally add adjacent pairs of double-precision (64-bit) floating-point6699// elements in a and b, and pack the results in dst.6700// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd6701FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)6702{6703#if defined(__aarch64__)6704return vreinterpretq_m128d_f64(6705vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));6706#else6707double *da = (double *) &a;6708double *db = (double *) &b;6709double c[] = {da[0] + da[1], db[0] + db[1]};6710return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));6711#endif6712}67136714// Computes pairwise add of each argument as single-precision, floating-point6715// values a and b.6716// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx6717FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)6718{6719#if defined(__aarch64__)6720return vreinterpretq_m128_f32(6721vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));6722#else6723float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));6724float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));6725float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));6726float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));6727return vreinterpretq_m128_f32(6728vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));6729#endif6730}67316732// Horizontally subtract adjacent pairs of double-precision (64-bit)6733// floating-point elements in a and b, and pack the results in dst.6734// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd6735FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)6736{6737#if defined(__aarch64__)6738float64x2_t a = vreinterpretq_f64_m128d(_a);6739float64x2_t b = 
vreinterpretq_f64_m128d(_b);6740return vreinterpretq_m128d_f64(6741vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));6742#else6743double *da = (double *) &_a;6744double *db = (double *) &_b;6745double c[] = {da[0] - da[1], db[0] - db[1]};6746return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));6747#endif6748}67496750// Horizontally subtract adjacent pairs of single-precision (32-bit)6751// floating-point elements in a and b, and pack the results in dst.6752// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps6753FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)6754{6755float32x4_t a = vreinterpretq_f32_m128(_a);6756float32x4_t b = vreinterpretq_f32_m128(_b);6757#if defined(__aarch64__)6758return vreinterpretq_m128_f32(6759vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));6760#else6761float32x4x2_t c = vuzpq_f32(a, b);6762return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));6763#endif6764}67656766// Load 128-bits of integer data from unaligned memory into dst. This intrinsic6767// may perform better than _mm_loadu_si128 when the data crosses a cache line6768// boundary.6769//6770// dst[127:0] := MEM[mem_addr+127:mem_addr]6771//6772// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si1286773#define _mm_lddqu_si128 _mm_loadu_si12867746775// Load a double-precision (64-bit) floating-point element from memory into both6776// elements of dst.6777//6778// dst[63:0] := MEM[mem_addr+63:mem_addr]6779// dst[127:64] := MEM[mem_addr+63:mem_addr]6780//6781// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd6782#define _mm_loaddup_pd _mm_load1_pd67836784// Duplicate the low double-precision (64-bit) floating-point element from a,6785// and store the results in dst.6786// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd6787FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)6788{6789#if defined(__aarch64__)6790return vreinterpretq_m128d_f64(6791vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));6792#else6793return vreinterpretq_m128d_u64(6794vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));6795#endif6796}67976798// Duplicate odd-indexed single-precision (32-bit) floating-point elements6799// from a, and store the results in dst.6800// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps6801FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)6802{6803#if defined(__aarch64__)6804return vreinterpretq_m128_f32(6805vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));6806#elif defined(_sse2neon_shuffle)6807return vreinterpretq_m128_f32(vshuffleq_s32(6808vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));6809#else6810float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);6811float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);6812float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};6813return vreinterpretq_m128_f32(vld1q_f32(data));6814#endif6815}68166817// Duplicate even-indexed single-precision (32-bit) floating-point elements6818// from a, and store the results in dst.6819// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps6820FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)6821{6822#if defined(__aarch64__)6823return vreinterpretq_m128_f32(6824vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));6825#elif defined(_sse2neon_shuffle)6826return vreinterpretq_m128_f32(vshuffleq_s32(6827vreinterpretq_f32_m128(a), 
vreinterpretq_f32_m128(a), 0, 0, 2, 2));6828#else6829float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);6830float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);6831float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};6832return vreinterpretq_m128_f32(vld1q_f32(data));6833#endif6834}68356836/* SSSE3 */68376838// Compute the absolute value of packed signed 16-bit integers in a, and store6839// the unsigned results in dst.6840//6841// FOR j := 0 to 76842// i := j*166843// dst[i+15:i] := ABS(a[i+15:i])6844// ENDFOR6845//6846// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi166847FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)6848{6849return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));6850}68516852// Compute the absolute value of packed signed 32-bit integers in a, and store6853// the unsigned results in dst.6854//6855// FOR j := 0 to 36856// i := j*326857// dst[i+31:i] := ABS(a[i+31:i])6858// ENDFOR6859//6860// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi326861FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)6862{6863return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));6864}68656866// Compute the absolute value of packed signed 8-bit integers in a, and store6867// the unsigned results in dst.6868//6869// FOR j := 0 to 156870// i := j*86871// dst[i+7:i] := ABS(a[i+7:i])6872// ENDFOR6873//6874// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi86875FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)6876{6877return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));6878}68796880// Compute the absolute value of packed signed 16-bit integers in a, and store6881// the unsigned results in dst.6882//6883// FOR j := 0 to 36884// i := j*166885// dst[i+15:i] := ABS(a[i+15:i])6886// ENDFOR6887//6888// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi166889FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)6890{6891return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));6892}68936894// Compute the absolute value of packed signed 32-bit integers in a, and store6895// the unsigned results in dst.6896//6897// FOR j := 0 to 16898// i := j*326899// dst[i+31:i] := ABS(a[i+31:i])6900// ENDFOR6901//6902// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi326903FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)6904{6905return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));6906}69076908// Compute the absolute value of packed signed 8-bit integers in a, and store6909// the unsigned results in dst.6910//6911// FOR j := 0 to 76912// i := j*86913// dst[i+7:i] := ABS(a[i+7:i])6914// ENDFOR6915//6916// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi86917FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)6918{6919return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));6920}69216922// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift6923// the result right by imm8 bytes, and store the low 16 bytes in dst.6924//6925// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)6926// dst[127:0] := tmp[127:0]6927//6928// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi86929#define _mm_alignr_epi8(a, b, imm) \6930__extension__({ \6931uint8x16_t _a = vreinterpretq_u8_m128i(a); \6932uint8x16_t _b = vreinterpretq_u8_m128i(b); \6933__m128i ret; \6934if (_sse2neon_unlikely((imm) & 
~31)) \6935ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \6936else if (imm >= 16) \6937ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \6938else \6939ret = \6940vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \6941ret; \6942})69436944// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift6945// the result right by imm8 bytes, and store the low 8 bytes in dst.6946//6947// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)6948// dst[63:0] := tmp[63:0]6949//6950// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi86951#define _mm_alignr_pi8(a, b, imm) \6952__extension__({ \6953__m64 ret; \6954if (_sse2neon_unlikely((imm) >= 16)) { \6955ret = vreinterpret_m64_s8(vdup_n_s8(0)); \6956} else { \6957uint8x8_t tmp_low, tmp_high; \6958if ((imm) >= 8) { \6959const int idx = (imm) -8; \6960tmp_low = vreinterpret_u8_m64(a); \6961tmp_high = vdup_n_u8(0); \6962ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \6963} else { \6964const int idx = (imm); \6965tmp_low = vreinterpret_u8_m64(b); \6966tmp_high = vreinterpret_u8_m64(a); \6967ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \6968} \6969} \6970ret; \6971})69726973// Computes pairwise add of each argument as a 16-bit signed or unsigned integer6974// values a and b.6975FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)6976{6977int16x8_t a = vreinterpretq_s16_m128i(_a);6978int16x8_t b = vreinterpretq_s16_m128i(_b);6979#if defined(__aarch64__)6980return vreinterpretq_m128i_s16(vpaddq_s16(a, b));6981#else6982return vreinterpretq_m128i_s16(6983vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),6984vpadd_s16(vget_low_s16(b), vget_high_s16(b))));6985#endif6986}69876988// Computes pairwise add of each argument as a 32-bit signed or unsigned integer6989// values a and b.6990FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)6991{6992int32x4_t a = vreinterpretq_s32_m128i(_a);6993int32x4_t b = vreinterpretq_s32_m128i(_b);6994#if defined(__aarch64__)6995return vreinterpretq_m128i_s32(vpaddq_s32(a, b));6996#else6997return vreinterpretq_m128i_s32(6998vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),6999vpadd_s32(vget_low_s32(b), vget_high_s32(b))));7000#endif7001}70027003// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the7004// signed 16-bit results in dst.7005// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi167006FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)7007{7008return vreinterpret_m64_s16(7009vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));7010}70117012// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the7013// signed 32-bit results in dst.7014// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi327015FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)7016{7017return vreinterpret_m64_s32(7018vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));7019}70207021// Computes saturated pairwise sub of each argument as a 16-bit signed7022// integer values a and b.7023FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)7024{7025#if defined(__aarch64__)7026int16x8_t a = vreinterpretq_s16_m128i(_a);7027int16x8_t b = vreinterpretq_s16_m128i(_b);7028return vreinterpretq_s64_s16(7029vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));7030#else7031int32x4_t a = vreinterpretq_s32_m128i(_a);7032int32x4_t b = vreinterpretq_s32_m128i(_b);7033// Interleave using 
vshrn/vmovn7034// [a0|a2|a4|a6|b0|b2|b4|b6]7035// [a1|a3|a5|a7|b1|b3|b5|b7]7036int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));7037int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));7038// Saturated add7039return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));7040#endif7041}70427043// Horizontally add adjacent pairs of signed 16-bit integers in a and b using7044// saturation, and pack the signed 16-bit results in dst.7045// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi167046FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)7047{7048int16x4_t a = vreinterpret_s16_m64(_a);7049int16x4_t b = vreinterpret_s16_m64(_b);7050#if defined(__aarch64__)7051return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));7052#else7053int16x4x2_t res = vuzp_s16(a, b);7054return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));7055#endif7056}70577058// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack7059// the signed 16-bit results in dst.7060// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi167061FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)7062{7063int16x8_t a = vreinterpretq_s16_m128i(_a);7064int16x8_t b = vreinterpretq_s16_m128i(_b);7065#if defined(__aarch64__)7066return vreinterpretq_m128i_s16(7067vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));7068#else7069int16x8x2_t c = vuzpq_s16(a, b);7070return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));7071#endif7072}70737074// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack7075// the signed 32-bit results in dst.7076// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi327077FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)7078{7079int32x4_t a = vreinterpretq_s32_m128i(_a);7080int32x4_t b = vreinterpretq_s32_m128i(_b);7081#if defined(__aarch64__)7082return vreinterpretq_m128i_s32(7083vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));7084#else7085int32x4x2_t c = vuzpq_s32(a, b);7086return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));7087#endif7088}70897090// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack7091// the signed 16-bit results in dst.7092// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi167093FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)7094{7095int16x4_t a = vreinterpret_s16_m64(_a);7096int16x4_t b = vreinterpret_s16_m64(_b);7097#if defined(__aarch64__)7098return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));7099#else7100int16x4x2_t c = vuzp_s16(a, b);7101return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));7102#endif7103}71047105// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack7106// the signed 32-bit results in dst.7107// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi327108FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)7109{7110int32x2_t a = vreinterpret_s32_m64(_a);7111int32x2_t b = vreinterpret_s32_m64(_b);7112#if defined(__aarch64__)7113return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));7114#else7115int32x2x2_t c = vuzp_s32(a, b);7116return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));7117#endif7118}71197120// Computes saturated pairwise difference of each argument as a 16-bit signed7121// integer values a and b.7122// 
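// The pairing matches the other horizontal operations above: with
// a = [a0..a7] and b = [b0..b7] (16-bit lanes listed low to high), dst is
//
//   [sat(a0-a1), sat(a2-a3), sat(a4-a5), sat(a6-a7),
//    sat(b0-b1), sat(b2-b3), sat(b4-b5), sat(b6-b7)]
//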
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi167123FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)7124{7125int16x8_t a = vreinterpretq_s16_m128i(_a);7126int16x8_t b = vreinterpretq_s16_m128i(_b);7127#if defined(__aarch64__)7128return vreinterpretq_m128i_s16(7129vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));7130#else7131int16x8x2_t c = vuzpq_s16(a, b);7132return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));7133#endif7134}71357136// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b7137// using saturation, and pack the signed 16-bit results in dst.7138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi167139FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)7140{7141int16x4_t a = vreinterpret_s16_m64(_a);7142int16x4_t b = vreinterpret_s16_m64(_b);7143#if defined(__aarch64__)7144return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));7145#else7146int16x4x2_t c = vuzp_s16(a, b);7147return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));7148#endif7149}71507151// Vertically multiply each unsigned 8-bit integer from a with the corresponding7152// signed 8-bit integer from b, producing intermediate signed 16-bit integers.7153// Horizontally add adjacent pairs of intermediate signed 16-bit integers,7154// and pack the saturated results in dst.7155//7156// FOR j := 0 to 77157// i := j*167158// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +7159// a[i+7:i]*b[i+7:i] )7160// ENDFOR7161FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)7162{7163#if defined(__aarch64__)7164uint8x16_t a = vreinterpretq_u8_m128i(_a);7165int8x16_t b = vreinterpretq_s8_m128i(_b);7166int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),7167vmovl_s8(vget_low_s8(b)));7168int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),7169vmovl_s8(vget_high_s8(b)));7170return vreinterpretq_m128i_s16(7171vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));7172#else7173// This would be much simpler if x86 would choose to zero extend OR sign7174// extend, not both. 
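// Concretely, each byte of a is treated as unsigned and each byte of b as
// signed; a byte pair with a = 0xFF (255) and b = 0xFF (-1) contributes
// 255 * -1 = -255 to its 16-bit sum.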
This could probably be optimized better.7175uint16x8_t a = vreinterpretq_u16_m128i(_a);7176int16x8_t b = vreinterpretq_s16_m128i(_b);71777178// Zero extend a7179int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));7180int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));71817182// Sign extend by shifting left then shifting right.7183int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);7184int16x8_t b_odd = vshrq_n_s16(b, 8);71857186// multiply7187int16x8_t prod1 = vmulq_s16(a_even, b_even);7188int16x8_t prod2 = vmulq_s16(a_odd, b_odd);71897190// saturated add7191return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));7192#endif7193}71947195// Vertically multiply each unsigned 8-bit integer from a with the corresponding7196// signed 8-bit integer from b, producing intermediate signed 16-bit integers.7197// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and7198// pack the saturated results in dst.7199// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi167200FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)7201{7202uint16x4_t a = vreinterpret_u16_m64(_a);7203int16x4_t b = vreinterpret_s16_m64(_b);72047205// Zero extend a7206int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));7207int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));72087209// Sign extend by shifting left then shifting right.7210int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);7211int16x4_t b_odd = vshr_n_s16(b, 8);72127213// multiply7214int16x4_t prod1 = vmul_s16(a_even, b_even);7215int16x4_t prod2 = vmul_s16(a_odd, b_odd);72167217// saturated add7218return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));7219}72207221// Multiply packed signed 16-bit integers in a and b, producing intermediate7222// signed 32-bit integers. Shift right by 15 bits while rounding up, and store7223// the packed 16-bit integers in dst.7224//7225// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)7226// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)7227// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)7228// ...7229// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)7230FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)7231{7232// Has issues due to saturation7233// return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));72347235// Multiply7236int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),7237vget_low_s16(vreinterpretq_s16_m128i(b)));7238int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),7239vget_high_s16(vreinterpretq_s16_m128i(b)));72407241// Rounding narrowing shift right7242// narrow = (int16_t)((mul + 16384) >> 15);7243int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);7244int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);72457246// Join together7247return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));7248}72497250// Multiply packed signed 16-bit integers in a and b, producing intermediate7251// signed 32-bit integers. 
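// For instance (the rounding here matches _mm_mulhrs_epi16 above):
// a0 = 0x4000 (0.5 in Q15) times b0 = 0x2000 (0.25 in Q15) gives the 32-bit
// product 0x08000000; adding the 0x4000 rounding bias and shifting right by
// 15 yields 0x1000, i.e. 0.125 in Q15.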
Truncate each intermediate integer to the 18 most7252// significant bits, round by adding 1, and store bits [16:1] to dst.7253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi167254FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)7255{7256int32x4_t mul_extend =7257vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));72587259// Rounding narrowing shift right7260return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));7261}72627263// Shuffle packed 8-bit integers in a according to shuffle control mask in the7264// corresponding 8-bit element of b, and store the results in dst.7265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi87266FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)7267{7268int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a7269uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b7270uint8x16_t idx_masked =7271vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits7272#if defined(__aarch64__)7273return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));7274#elif defined(__GNUC__)7275int8x16_t ret;7276// %e and %f represent the even and odd D registers7277// respectively.7278__asm__ __volatile__(7279"vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"7280"vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"7281: [ret] "=&w"(ret)7282: [tbl] "w"(tbl), [idx] "w"(idx_masked));7283return vreinterpretq_m128i_s8(ret);7284#else7285// use this line if testing on aarch647286int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};7287return vreinterpretq_m128i_s8(7288vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),7289vtbl2_s8(a_split, vget_high_u8(idx_masked))));7290#endif7291}72927293// Shuffle packed 8-bit integers in a according to shuffle control mask in the7294// corresponding 8-bit element of b, and store the results in dst.7295//7296// FOR j := 0 to 77297// i := j*87298// IF b[i+7] == 17299// dst[i+7:i] := 07300// ELSE7301// index[2:0] := b[i+2:i]7302// dst[i+7:i] := a[index*8+7:index*8]7303// FI7304// ENDFOR7305//7306// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi87307FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)7308{7309const int8x8_t controlMask =7310vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));7311int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);7312return vreinterpret_m64_s8(res);7313}73147315// Negate packed 16-bit integers in a when the corresponding signed7316// 16-bit integer in b is negative, and store the results in dst.7317// Element in dst are zeroed out when the corresponding element7318// in b is zero.7319//7320// for i in 0..77321// if b[i] < 07322// r[i] := -a[i]7323// else if b[i] == 07324// r[i] := 07325// else7326// r[i] := a[i]7327// fi7328// done7329FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)7330{7331int16x8_t a = vreinterpretq_s16_m128i(_a);7332int16x8_t b = vreinterpretq_s16_m128i(_b);73337334// signed shift right: faster than vclt7335// (b < 0) ? 0xFFFF : 07336uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));7337// (b == 0) ? 
0xFFFF : 07338#if defined(__aarch64__)7339int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));7340#else7341int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));7342#endif73437344// bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative7345// 'a') based on ltMask7346int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);7347// res = masked & (~zeroMask)7348int16x8_t res = vbicq_s16(masked, zeroMask);7349return vreinterpretq_m128i_s16(res);7350}73517352// Negate packed 32-bit integers in a when the corresponding signed7353// 32-bit integer in b is negative, and store the results in dst.7354// Element in dst are zeroed out when the corresponding element7355// in b is zero.7356//7357// for i in 0..37358// if b[i] < 07359// r[i] := -a[i]7360// else if b[i] == 07361// r[i] := 07362// else7363// r[i] := a[i]7364// fi7365// done7366FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)7367{7368int32x4_t a = vreinterpretq_s32_m128i(_a);7369int32x4_t b = vreinterpretq_s32_m128i(_b);73707371// signed shift right: faster than vclt7372// (b < 0) ? 0xFFFFFFFF : 07373uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));73747375// (b == 0) ? 0xFFFFFFFF : 07376#if defined(__aarch64__)7377int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));7378#else7379int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));7380#endif73817382// bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative7383// 'a') based on ltMask7384int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);7385// res = masked & (~zeroMask)7386int32x4_t res = vbicq_s32(masked, zeroMask);7387return vreinterpretq_m128i_s32(res);7388}73897390// Negate packed 8-bit integers in a when the corresponding signed7391// 8-bit integer in b is negative, and store the results in dst.7392// Element in dst are zeroed out when the corresponding element7393// in b is zero.7394//7395// for i in 0..157396// if b[i] < 07397// r[i] := -a[i]7398// else if b[i] == 07399// r[i] := 07400// else7401// r[i] := a[i]7402// fi7403// done7404FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)7405{7406int8x16_t a = vreinterpretq_s8_m128i(_a);7407int8x16_t b = vreinterpretq_s8_m128i(_b);74087409// signed shift right: faster than vclt7410// (b < 0) ? 0xFF : 07411uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));74127413// (b == 0) ? 0xFF : 07414#if defined(__aarch64__)7415int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));7416#else7417int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));7418#endif74197420// bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')7421// based on ltMask7422int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);7423// res = masked & (~zeroMask)7424int8x16_t res = vbicq_s8(masked, zeroMask);74257426return vreinterpretq_m128i_s8(res);7427}74287429// Negate packed 16-bit integers in a when the corresponding signed 16-bit7430// integer in b is negative, and store the results in dst. 
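// For example (lane values chosen arbitrarily): with a = [1, -2, 3, 4] and
// b = [-1, 5, 0, -7], dst is [-1, -2, 0, -4].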
Element in dst are7431// zeroed out when the corresponding element in b is zero.7432//7433// FOR j := 0 to 37434// i := j*167435// IF b[i+15:i] < 07436// dst[i+15:i] := -(a[i+15:i])7437// ELSE IF b[i+15:i] == 07438// dst[i+15:i] := 07439// ELSE7440// dst[i+15:i] := a[i+15:i]7441// FI7442// ENDFOR7443//7444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi167445FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)7446{7447int16x4_t a = vreinterpret_s16_m64(_a);7448int16x4_t b = vreinterpret_s16_m64(_b);74497450// signed shift right: faster than vclt7451// (b < 0) ? 0xFFFF : 07452uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));74537454// (b == 0) ? 0xFFFF : 07455#if defined(__aarch64__)7456int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));7457#else7458int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));7459#endif74607461// bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')7462// based on ltMask7463int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);7464// res = masked & (~zeroMask)7465int16x4_t res = vbic_s16(masked, zeroMask);74667467return vreinterpret_m64_s16(res);7468}74697470// Negate packed 32-bit integers in a when the corresponding signed 32-bit7471// integer in b is negative, and store the results in dst. Element in dst are7472// zeroed out when the corresponding element in b is zero.7473//7474// FOR j := 0 to 17475// i := j*327476// IF b[i+31:i] < 07477// dst[i+31:i] := -(a[i+31:i])7478// ELSE IF b[i+31:i] == 07479// dst[i+31:i] := 07480// ELSE7481// dst[i+31:i] := a[i+31:i]7482// FI7483// ENDFOR7484//7485// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi327486FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)7487{7488int32x2_t a = vreinterpret_s32_m64(_a);7489int32x2_t b = vreinterpret_s32_m64(_b);74907491// signed shift right: faster than vclt7492// (b < 0) ? 0xFFFFFFFF : 07493uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));74947495// (b == 0) ? 0xFFFFFFFF : 07496#if defined(__aarch64__)7497int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));7498#else7499int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));7500#endif75017502// bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')7503// based on ltMask7504int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);7505// res = masked & (~zeroMask)7506int32x2_t res = vbic_s32(masked, zeroMask);75077508return vreinterpret_m64_s32(res);7509}75107511// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer7512// in b is negative, and store the results in dst. Element in dst are zeroed out7513// when the corresponding element in b is zero.7514//7515// FOR j := 0 to 77516// i := j*87517// IF b[i+7:i] < 07518// dst[i+7:i] := -(a[i+7:i])7519// ELSE IF b[i+7:i] == 07520// dst[i+7:i] := 07521// ELSE7522// dst[i+7:i] := a[i+7:i]7523// FI7524// ENDFOR7525//7526// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi87527FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)7528{7529int8x8_t a = vreinterpret_s8_m64(_a);7530int8x8_t b = vreinterpret_s8_m64(_b);75317532// signed shift right: faster than vclt7533// (b < 0) ? 0xFF : 07534uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));75357536// (b == 0) ? 
0xFF : 07537#if defined(__aarch64__)7538int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));7539#else7540int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));7541#endif75427543// bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')7544// based on ltMask7545int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);7546// res = masked & (~zeroMask)7547int8x8_t res = vbic_s8(masked, zeroMask);75487549return vreinterpret_m64_s8(res);7550}75517552/* SSE4.1 */75537554// Blend packed 16-bit integers from a and b using control mask imm8, and store7555// the results in dst.7556//7557// FOR j := 0 to 77558// i := j*167559// IF imm8[j]7560// dst[i+15:i] := b[i+15:i]7561// ELSE7562// dst[i+15:i] := a[i+15:i]7563// FI7564// ENDFOR7565// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,7566// __constrange(0,255) int imm)7567#define _mm_blend_epi16(a, b, imm) \7568__extension__({ \7569const uint16_t ones = 0xffff; \7570const uint16_t zeros = 0x0000; \7571const uint16_t _mask[8] = {((imm) & (1 << 0)) ? ones : zeros, \7572((imm) & (1 << 1)) ? ones : zeros, \7573((imm) & (1 << 2)) ? ones : zeros, \7574((imm) & (1 << 3)) ? ones : zeros, \7575((imm) & (1 << 4)) ? ones : zeros, \7576((imm) & (1 << 5)) ? ones : zeros, \7577((imm) & (1 << 6)) ? ones : zeros, \7578((imm) & (1 << 7)) ? ones : zeros}; \7579uint16x8_t _mask_vec = vld1q_u16(_mask); \7580uint16x8_t _a = vreinterpretq_u16_m128i(a); \7581uint16x8_t _b = vreinterpretq_u16_m128i(b); \7582vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \7583})75847585// Blend packed double-precision (64-bit) floating-point elements from a and b7586// using control mask imm8, and store the results in dst.7587// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd7588#define _mm_blend_pd(a, b, imm) \7589__extension__({ \7590const uint64_t _mask[2] = { \7591((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \7592((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \7593uint64x2_t _mask_vec = vld1q_u64(_mask); \7594uint64x2_t _a = vreinterpretq_u64_m128d(a); \7595uint64x2_t _b = vreinterpretq_u64_m128d(b); \7596vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \7597})75987599// Blend packed single-precision (32-bit) floating-point elements from a and b7600// using mask, and store the results in dst.7601// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps7602FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)7603{7604const uint32_t ALIGN_STRUCT(16)7605data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,7606((imm8) & (1 << 1)) ? UINT32_MAX : 0,7607((imm8) & (1 << 2)) ? UINT32_MAX : 0,7608((imm8) & (1 << 3)) ? 
UINT32_MAX : 0};7609uint32x4_t mask = vld1q_u32(data);7610float32x4_t a = vreinterpretq_f32_m128(_a);7611float32x4_t b = vreinterpretq_f32_m128(_b);7612return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));7613}76147615// Blend packed 8-bit integers from a and b using mask, and store the results in7616// dst.7617//7618// FOR j := 0 to 157619// i := j*87620// IF mask[i+7]7621// dst[i+7:i] := b[i+7:i]7622// ELSE7623// dst[i+7:i] := a[i+7:i]7624// FI7625// ENDFOR7626FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)7627{7628// Use a signed shift right to create a mask with the sign bit7629uint8x16_t mask =7630vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));7631uint8x16_t a = vreinterpretq_u8_m128i(_a);7632uint8x16_t b = vreinterpretq_u8_m128i(_b);7633return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));7634}76357636// Blend packed double-precision (64-bit) floating-point elements from a and b7637// using mask, and store the results in dst.7638// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd7639FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)7640{7641uint64x2_t mask =7642vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));7643#if defined(__aarch64__)7644float64x2_t a = vreinterpretq_f64_m128d(_a);7645float64x2_t b = vreinterpretq_f64_m128d(_b);7646return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));7647#else7648uint64x2_t a = vreinterpretq_u64_m128d(_a);7649uint64x2_t b = vreinterpretq_u64_m128d(_b);7650return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));7651#endif7652}76537654// Blend packed single-precision (32-bit) floating-point elements from a and b7655// using mask, and store the results in dst.7656// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps7657FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)7658{7659// Use a signed shift right to create a mask with the sign bit7660uint32x4_t mask =7661vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));7662float32x4_t a = vreinterpretq_f32_m128(_a);7663float32x4_t b = vreinterpretq_f32_m128(_b);7664return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));7665}76667667// Round the packed double-precision (64-bit) floating-point elements in a up7668// to an integer value, and store the results as packed double-precision7669// floating-point elements in dst.7670// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd7671FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)7672{7673#if defined(__aarch64__)7674return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));7675#else7676double *f = (double *) &a;7677return _mm_set_pd(ceil(f[1]), ceil(f[0]));7678#endif7679}76807681// Round the packed single-precision (32-bit) floating-point elements in a up to7682// an integer value, and store the results as packed single-precision7683// floating-point elements in dst.7684// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps7685FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)7686{7687#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)7688return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));7689#else7690float *f = (float *) &a;7691return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));7692#endif7693}76947695// Round the lower double-precision (64-bit) floating-point element in b up to7696// an integer value, store the result as a 
double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_ceil_pd(b));
}

// Round the lower single-precision (32-bit) floating-point element in b up to
// an integer value, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst.
//
//   dst[31:0] := CEIL(b[31:0])
//   dst[127:32] := a[127:32]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_ceil_ps(b));
}

// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
#endif
}

// Converts the four signed 16-bit integers in the lower 64 bits to four signed
// 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
}

// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
{
    int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}

// Converts the two signed 32-bit integers in the lower 64 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
{
    return vreinterpretq_m128i_s64(
        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
}

// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    return vreinterpretq_m128i_s16(s16x8);
}

// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_s32(s32x4);
}

// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA
*/7786int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */7787int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */7788int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */7789return vreinterpretq_m128i_s64(s64x2);7790}77917792// Converts the four unsigned 16-bit integers in the lower 64 bits to four7793// unsigned 32-bit integers.7794FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)7795{7796return vreinterpretq_m128i_u32(7797vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));7798}77997800// Converts the two unsigned 16-bit integers in the lower 32 bits to two7801// unsigned 64-bit integers.7802FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)7803{7804uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */7805uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */7806uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */7807return vreinterpretq_m128i_u64(u64x2);7808}78097810// Converts the two unsigned 32-bit integers in the lower 64 bits to two7811// unsigned 64-bit integers.7812FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)7813{7814return vreinterpretq_m128i_u64(7815vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));7816}78177818// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,7819// and store the results in dst.7820// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi167821FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)7822{7823uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */7824uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */7825return vreinterpretq_m128i_u16(u16x8);7826}78277828// Converts the four unsigned 8-bit integers in the lower 32 bits to four7829// unsigned 32-bit integers.7830// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx7831FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)7832{7833uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */7834uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */7835uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */7836return vreinterpretq_m128i_u32(u32x4);7837}78387839// Converts the two unsigned 8-bit integers in the lower 16 bits to two7840// unsigned 64-bit integers.7841FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)7842{7843uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */7844uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */7845uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */7846uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */7847return vreinterpretq_m128i_u64(u64x2);7848}78497850// Conditionally multiply the packed double-precision (64-bit) floating-point7851// elements in a and b using the high 4 bits in imm8, sum the four products, and7852// conditionally store the sum in dst using the low 4 bits of imm8.7853// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd7854FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)7855{7856// Generate mask value from constant immediate bit value7857const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;7858const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;7859#if !SSE2NEON_PRECISE_DP7860const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;7861const int64_t bit5Mask = imm & 0x20 ? 
UINT64_MAX : 0;7862#endif7863// Conditional multiplication7864#if !SSE2NEON_PRECISE_DP7865__m128d mul = _mm_mul_pd(a, b);7866const __m128d mulMask =7867_mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));7868__m128d tmp = _mm_and_pd(mul, mulMask);7869#else7870#if defined(__aarch64__)7871double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *7872vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)7873: 0;7874double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *7875vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)7876: 0;7877#else7878double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;7879double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;7880#endif7881__m128d tmp = _mm_set_pd(d1, d0);7882#endif7883// Sum the products7884#if defined(__aarch64__)7885double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));7886#else7887double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);7888#endif7889// Conditionally store the sum7890const __m128d sumMask =7891_mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));7892__m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);7893return res;7894}78957896// Conditionally multiply the packed single-precision (32-bit) floating-point7897// elements in a and b using the high 4 bits in imm8, sum the four products,7898// and conditionally store the sum in dst using the low 4 bits of imm.7899// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps7900FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)7901{7902#if defined(__aarch64__)7903/* shortcuts */7904if (imm == 0xFF) {7905return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));7906}7907if (imm == 0x7F) {7908float32x4_t m = _mm_mul_ps(a, b);7909m[3] = 0;7910return _mm_set1_ps(vaddvq_f32(m));7911}7912#endif79137914float s = 0, c = 0;7915float32x4_t f32a = vreinterpretq_f32_m128(a);7916float32x4_t f32b = vreinterpretq_f32_m128(b);79177918/* To improve the accuracy of floating-point summation, Kahan algorithm7919* is used for each operation.7920*/7921if (imm & (1 << 4))7922_sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);7923if (imm & (1 << 5))7924_sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);7925if (imm & (1 << 6))7926_sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);7927if (imm & (1 << 7))7928_sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);7929s += c;79307931float32x4_t res = {7932(imm & 0x1) ? s : 0,7933(imm & 0x2) ? s : 0,7934(imm & 0x4) ? s : 0,7935(imm & 0x8) ? 
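// The _sse2neon_kadd_f32 calls above apply Kahan (compensated) summation: a
// running compensation term captures the low-order bits an ordinary add would
// discard, reducing the rounding error of the four-term sum. A scalar sketch
// of the standard algorithm (illustrative only; the helper defined earlier in
// this file follows the same idea):
//
//   void kahan_add(float *sum, float *comp, float y)
//   {
//       float t = *sum + (y - *comp);
//       *comp = (t - *sum) - (y - *comp);
//       *sum = t;
//   }
//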
s : 0,7936};7937return vreinterpretq_m128_f32(res);7938}79397940// Extracts the selected signed or unsigned 32-bit integer from a and zero7941// extends.7942// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)7943#define _mm_extract_epi32(a, imm) \7944vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))79457946// Extracts the selected signed or unsigned 64-bit integer from a and zero7947// extends.7948// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)7949#define _mm_extract_epi64(a, imm) \7950vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))79517952// Extracts the selected signed or unsigned 8-bit integer from a and zero7953// extends.7954// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)7955// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi87956#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))79577958// Extracts the selected single-precision (32-bit) floating-point from a.7959// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)7960#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))79617962// Round the packed double-precision (64-bit) floating-point elements in a down7963// to an integer value, and store the results as packed double-precision7964// floating-point elements in dst.7965// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd7966FORCE_INLINE __m128d _mm_floor_pd(__m128d a)7967{7968#if defined(__aarch64__)7969return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));7970#else7971double *f = (double *) &a;7972return _mm_set_pd(floor(f[1]), floor(f[0]));7973#endif7974}79757976// Round the packed single-precision (32-bit) floating-point elements in a down7977// to an integer value, and store the results as packed single-precision7978// floating-point elements in dst.7979// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps7980FORCE_INLINE __m128 _mm_floor_ps(__m128 a)7981{7982#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)7983return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));7984#else7985float *f = (float *) &a;7986return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));7987#endif7988}79897990// Round the lower double-precision (64-bit) floating-point element in b down to7991// an integer value, store the result as a double-precision floating-point7992// element in the lower element of dst, and copy the upper element from a to the7993// upper element of dst.7994// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd7995FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)7996{7997return _mm_move_sd(a, _mm_floor_pd(b));7998}79998000// Round the lower single-precision (32-bit) floating-point element in b down to8001// an integer value, store the result as a single-precision floating-point8002// element in the lower element of dst, and copy the upper 3 packed elements8003// from a to the upper elements of dst.8004//8005// dst[31:0] := FLOOR(b[31:0])8006// dst[127:32] := a[127:32]8007//8008// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss8009FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)8010{8011return _mm_move_ss(a, _mm_floor_ps(b));8012}80138014// Inserts the least significant 32 bits of b into the selected 32-bit integer8015// of a.8016// FORCE_INLINE 
__m128i _mm_insert_epi32(__m128i a, int b,8017// __constrange(0,4) int imm)8018#define _mm_insert_epi32(a, b, imm) \8019__extension__({ \8020vreinterpretq_m128i_s32( \8021vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \8022})80238024// Inserts the least significant 64 bits of b into the selected 64-bit integer8025// of a.8026// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,8027// __constrange(0,2) int imm)8028#define _mm_insert_epi64(a, b, imm) \8029__extension__({ \8030vreinterpretq_m128i_s64( \8031vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \8032})80338034// Inserts the least significant 8 bits of b into the selected 8-bit integer8035// of a.8036// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,8037// __constrange(0,16) int imm)8038#define _mm_insert_epi8(a, b, imm) \8039__extension__({ \8040vreinterpretq_m128i_s8( \8041vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \8042})80438044// Copy a to tmp, then insert a single-precision (32-bit) floating-point8045// element from b into tmp using the control in imm8. Store tmp to dst using8046// the mask in imm8 (elements are zeroed out when the corresponding bit is set).8047// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps8048#define _mm_insert_ps(a, b, imm8) \8049__extension__({ \8050float32x4_t tmp1 = \8051vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \8052vreinterpretq_f32_m128(a), 0); \8053float32x4_t tmp2 = \8054vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \8055((imm8 >> 4) & 0x3)); \8056const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \8057((imm8) & (1 << 1)) ? UINT32_MAX : 0, \8058((imm8) & (1 << 2)) ? UINT32_MAX : 0, \8059((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \8060uint32x4_t mask = vld1q_u32(data); \8061float32x4_t all_zeros = vdupq_n_f32(0); \8062\8063vreinterpretq_m128_f32( \8064vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \8065})80668067// epi versions of min/max8068// Computes the pariwise maximums of the four signed 32-bit integer values of a8069// and b.8070//8071// A 128-bit parameter that can be defined with the following equations:8072// r0 := (a0 > b0) ? a0 : b08073// r1 := (a1 > b1) ? a1 : b18074// r2 := (a2 > b2) ? a2 : b28075// r3 := (a3 > b3) ? 
a3 : b38076//8077// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx8078FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)8079{8080return vreinterpretq_m128i_s32(8081vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));8082}80838084// Compare packed signed 8-bit integers in a and b, and store packed maximum8085// values in dst.8086// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi88087FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)8088{8089return vreinterpretq_m128i_s8(8090vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));8091}80928093// Compare packed unsigned 16-bit integers in a and b, and store packed maximum8094// values in dst.8095// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu168096FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)8097{8098return vreinterpretq_m128i_u16(8099vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));8100}81018102// Compare packed unsigned 32-bit integers in a and b, and store packed maximum8103// values in dst.8104// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu328105FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)8106{8107return vreinterpretq_m128i_u32(8108vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));8109}81108111// Computes the pariwise minima of the four signed 32-bit integer values of a8112// and b.8113//8114// A 128-bit parameter that can be defined with the following equations:8115// r0 := (a0 < b0) ? a0 : b08116// r1 := (a1 < b1) ? a1 : b18117// r2 := (a2 < b2) ? a2 : b28118// r3 := (a3 < b3) ? a3 : b38119//8120// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx8121FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)8122{8123return vreinterpretq_m128i_s32(8124vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));8125}81268127// Compare packed signed 8-bit integers in a and b, and store packed minimum8128// values in dst.8129// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi88130FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)8131{8132return vreinterpretq_m128i_s8(8133vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));8134}81358136// Compare packed unsigned 16-bit integers in a and b, and store packed minimum8137// values in dst.8138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu168139FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)8140{8141return vreinterpretq_m128i_u16(8142vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));8143}81448145// Compare packed unsigned 32-bit integers in a and b, and store packed minimum8146// values in dst.8147// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu328148FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)8149{8150return vreinterpretq_m128i_u32(8151vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));8152}81538154// Horizontally compute the minimum amongst the packed unsigned 16-bit integers8155// in a, store the minimum and index in dst, and zero the remaining bits in dst.8156//8157// index[2:0] := 08158// min[15:0] := a[15:0]8159// FOR j := 0 to 78160// i := j*168161// IF a[i+15:i] < min[15:0]8162// index[2:0] := j8163// min[15:0] := a[i+15:i]8164// FI8165// ENDFOR8166// dst[15:0] := min[15:0]8167// dst[18:16] := index[2:0]8168// 
dst[127:19] := 08169//8170// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu168171FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)8172{8173__m128i dst;8174uint16_t min, idx = 0;8175#if defined(__aarch64__)8176// Find the minimum value8177min = vminvq_u16(vreinterpretq_u16_m128i(a));81788179// Get the index of the minimum value8180static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};8181uint16x8_t minv = vdupq_n_u16(min);8182uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));8183idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));8184#else8185// Find the minimum value8186__m64 tmp;8187tmp = vreinterpret_m64_u16(8188vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),8189vget_high_u16(vreinterpretq_u16_m128i(a))));8190tmp = vreinterpret_m64_u16(8191vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));8192tmp = vreinterpret_m64_u16(8193vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));8194min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);8195// Get the index of the minimum value8196int i;8197for (i = 0; i < 8; i++) {8198if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {8199idx = (uint16_t) i;8200break;8201}8202a = _mm_srli_si128(a, 2);8203}8204#endif8205// Generate result8206dst = _mm_setzero_si128();8207dst = vreinterpretq_m128i_u16(8208vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));8209dst = vreinterpretq_m128i_u16(8210vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));8211return dst;8212}82138214// Compute the sum of absolute differences (SADs) of quadruplets of unsigned8215// 8-bit integers in a compared to those in b, and store the 16-bit results in8216// dst. Eight SADs are performed using one quadruplet from b and eight8217// quadruplets from a. One quadruplet is selected from b starting at on the8218// offset specified in imm8. 
Eight quadruplets are formed from sequential 8-bit8219// integers selected from a starting at the offset specified in imm8.8220// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu88221FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)8222{8223uint8x16_t _a, _b;82248225switch (imm & 0x4) {8226case 0:8227// do nothing8228_a = vreinterpretq_u8_m128i(a);8229break;8230case 4:8231_a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),8232vreinterpretq_u32_m128i(a), 1));8233break;8234default:8235#if defined(__GNUC__) || defined(__clang__)8236__builtin_unreachable();8237#endif8238break;8239}82408241switch (imm & 0x3) {8242case 0:8243_b = vreinterpretq_u8_u32(8244vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));8245break;8246case 1:8247_b = vreinterpretq_u8_u32(8248vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));8249break;8250case 2:8251_b = vreinterpretq_u8_u32(8252vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));8253break;8254case 3:8255_b = vreinterpretq_u8_u32(8256vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));8257break;8258default:8259#if defined(__GNUC__) || defined(__clang__)8260__builtin_unreachable();8261#endif8262break;8263}82648265int16x8_t c04, c15, c26, c37;8266uint8x8_t low_b = vget_low_u8(_b);8267c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));8268uint8x16_t _a_1 = vextq_u8(_a, _a, 1);8269c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));8270uint8x16_t _a_2 = vextq_u8(_a, _a, 2);8271c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));8272uint8x16_t _a_3 = vextq_u8(_a, _a, 3);8273c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));8274#if defined(__aarch64__)8275// |0|4|2|6|8276c04 = vpaddq_s16(c04, c26);8277// |1|5|3|7|8278c15 = vpaddq_s16(c15, c37);82798280int32x4_t trn1_c =8281vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));8282int32x4_t trn2_c =8283vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));8284return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),8285vreinterpretq_s16_s32(trn2_c)));8286#else8287int16x4_t c01, c23, c45, c67;8288c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));8289c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));8290c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));8291c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));82928293return vreinterpretq_m128i_s16(8294vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));8295#endif8296}82978298// Multiply the low signed 32-bit integers from each packed 64-bit element in8299// a and b, and store the signed 64-bit results in dst.8300//8301// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b08302// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b28303FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)8304{8305// vmull_s32 upcasts instead of masking, so we downcast.8306int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));8307int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));8308return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));8309}83108311// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or8312// unsigned 32-bit integers from b.8313// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx8314FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)8315{8316return vreinterpretq_m128i_s32(8317vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));8318}83198320// Packs the 8 unsigned 32-bit integers 
from a and b into unsigned 16-bit8321// integers and saturates.8322//8323// r0 := UnsignedSaturate(a0)8324// r1 := UnsignedSaturate(a1)8325// r2 := UnsignedSaturate(a2)8326// r3 := UnsignedSaturate(a3)8327// r4 := UnsignedSaturate(b0)8328// r5 := UnsignedSaturate(b1)8329// r6 := UnsignedSaturate(b2)8330// r7 := UnsignedSaturate(b3)8331FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)8332{8333return vreinterpretq_m128i_u16(8334vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),8335vqmovun_s32(vreinterpretq_s32_m128i(b))));8336}83378338// Round the packed double-precision (64-bit) floating-point elements in a using8339// the rounding parameter, and store the results as packed double-precision8340// floating-point elements in dst.8341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd8342FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)8343{8344#if defined(__aarch64__)8345switch (rounding) {8346case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):8347return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));8348case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):8349return _mm_floor_pd(a);8350case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):8351return _mm_ceil_pd(a);8352case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):8353return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));8354default: //_MM_FROUND_CUR_DIRECTION8355return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));8356}8357#else8358double *v_double = (double *) &a;83598360if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||8361(rounding == _MM_FROUND_CUR_DIRECTION &&8362_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {8363double res[2], tmp;8364for (int i = 0; i < 2; i++) {8365tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];8366double roundDown = floor(tmp); // Round down value8367double roundUp = ceil(tmp); // Round up value8368double diffDown = tmp - roundDown;8369double diffUp = roundUp - tmp;8370if (diffDown < diffUp) {8371/* If it's closer to the round down value, then use it */8372res[i] = roundDown;8373} else if (diffDown > diffUp) {8374/* If it's closer to the round up value, then use it */8375res[i] = roundUp;8376} else {8377/* If it's equidistant between round up and round down value,8378* pick the one which is an even number */8379double half = roundDown / 2;8380if (half != floor(half)) {8381/* If the round down value is odd, return the round up value8382*/8383res[i] = roundUp;8384} else {8385/* If the round up value is odd, return the round down value8386*/8387res[i] = roundDown;8388}8389}8390res[i] = (v_double[i] < 0) ? -res[i] : res[i];8391}8392return _mm_set_pd(res[1], res[0]);8393} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||8394(rounding == _MM_FROUND_CUR_DIRECTION &&8395_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {8396return _mm_floor_pd(a);8397} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||8398(rounding == _MM_FROUND_CUR_DIRECTION &&8399_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {8400return _mm_ceil_pd(a);8401}8402return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),8403v_double[0] > 0 ? 
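// Note: the nearest-int branch above emulates SSE's default "round to
// nearest, ties to even" mode without relying on vrndnq_f64. The magnitude is
// rounded and, when the value sits exactly halfway between two integers, the
// even neighbour is chosen: 2.5 rounds to 2.0, 3.5 rounds to 4.0, and -2.5
// rounds to -2.0 because the sign is stripped first and re-applied at the end.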
floor(v_double[0]) : ceil(v_double[0]));8404#endif8405}84068407// Round the packed single-precision (32-bit) floating-point elements in a using8408// the rounding parameter, and store the results as packed single-precision8409// floating-point elements in dst.8410// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps8411FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)8412{8413#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)8414switch (rounding) {8415case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):8416return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));8417case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):8418return _mm_floor_ps(a);8419case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):8420return _mm_ceil_ps(a);8421case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):8422return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));8423default: //_MM_FROUND_CUR_DIRECTION8424return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));8425}8426#else8427float *v_float = (float *) &a;84288429if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||8430(rounding == _MM_FROUND_CUR_DIRECTION &&8431_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {8432uint32x4_t signmask = vdupq_n_u32(0x80000000);8433float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),8434vdupq_n_f32(0.5f)); /* +/- 0.5 */8435int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(8436vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/8437int32x4_t r_trunc = vcvtq_s32_f32(8438vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */8439int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(8440vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */8441int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),8442vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */8443float32x4_t delta = vsubq_f32(8444vreinterpretq_f32_m128(a),8445vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */8446uint32x4_t is_delta_half =8447vceqq_f32(delta, half); /* delta == +/- 0.5 */8448return vreinterpretq_m128_f32(8449vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));8450} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||8451(rounding == _MM_FROUND_CUR_DIRECTION &&8452_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {8453return _mm_floor_ps(a);8454} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||8455(rounding == _MM_FROUND_CUR_DIRECTION &&8456_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {8457return _mm_ceil_ps(a);8458}8459return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),8460v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),8461v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),8462v_float[0] > 0 ? 
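// Walking the nearest-even path above with a = 2.5f: r_trunc = 2, plusone = 1
// (taken from the sign bit of -r_trunc), so r_even = (2 + 1) & ~1 = 2, while
// r_normal = (int) (2.5f + 0.5f) = 3. Since delta equals +0.5f, the final
// select picks r_even and the lane rounds to 2.0f, matching SSE's
// ties-to-even behaviour; non-halfway inputs simply fall through to r_normal.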
floorf(v_float[0]) : ceilf(v_float[0]));8463#endif8464}84658466// Round the lower double-precision (64-bit) floating-point element in b using8467// the rounding parameter, store the result as a double-precision floating-point8468// element in the lower element of dst, and copy the upper element from a to the8469// upper element of dst.8470// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd8471FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)8472{8473return _mm_move_sd(a, _mm_round_pd(b, rounding));8474}84758476// Round the lower single-precision (32-bit) floating-point element in b using8477// the rounding parameter, store the result as a single-precision floating-point8478// element in the lower element of dst, and copy the upper 3 packed elements8479// from a to the upper elements of dst. Rounding is done according to the8480// rounding[3:0] parameter, which can be one of:8481// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and8482// suppress exceptions8483// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and8484// suppress exceptions8485// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress8486// exceptions8487// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress8488// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see8489// _MM_SET_ROUNDING_MODE8490// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss8491FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)8492{8493return _mm_move_ss(a, _mm_round_ps(b, rounding));8494}84958496// Load 128-bits of integer data from memory into dst using a non-temporal8497// memory hint. mem_addr must be aligned on a 16-byte boundary or a8498// general-protection exception may be generated.8499//8500// dst[127:0] := MEM[mem_addr+127:mem_addr]8501//8502// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si1288503FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)8504{8505#if __has_builtin(__builtin_nontemporal_store)8506return __builtin_nontemporal_load(p);8507#else8508return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));8509#endif8510}85118512// Compute the bitwise NOT of a and then AND with a 128-bit vector containing8513// all 1's, and return 1 if the result is zero, otherwise return 0.8514// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones8515FORCE_INLINE int _mm_test_all_ones(__m128i a)8516{8517return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==8518~(uint64_t) 0;8519}85208521// Compute the bitwise AND of 128 bits (representing integer data) in a and8522// mask, and return 1 if the result is zero, otherwise return 0.8523// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros8524FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)8525{8526int64x2_t a_and_mask =8527vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));8528return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));8529}85308531// Compute the bitwise AND of 128 bits (representing integer data) in a and8532// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute8533// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is8534// zero, otherwise set CF to 0. 
Return 1 if both the ZF and CF values are zero,8535// otherwise return 0.8536// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero8537FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)8538{8539uint64x2_t zf =8540vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));8541uint64x2_t cf =8542vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));8543uint64x2_t result = vandq_u64(zf, cf);8544return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));8545}85468547// Compute the bitwise AND of 128 bits (representing integer data) in a and b,8548// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the8549// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,8550// otherwise set CF to 0. Return the CF value.8551// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si1288552FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)8553{8554int64x2_t s64 =8555vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));8556return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));8557}85588559// Compute the bitwise AND of 128 bits (representing integer data) in a and b,8560// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the8561// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,8562// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,8563// otherwise return 0.8564// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si1288565#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)85668567// Compute the bitwise AND of 128 bits (representing integer data) in a and b,8568// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the8569// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,8570// otherwise set CF to 0. 
Return the ZF value.8571// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si1288572FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)8573{8574int64x2_t s64 =8575vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));8576return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));8577}85788579/* SSE4.2 */85808581const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {85820x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,8583};8584const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {85850x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,85860x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,8587};85888589/* specify the source data format */8590#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */8591#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */8592#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */8593#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */85948595/* specify the comparison operation */8596#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */8597#define _SIDD_CMP_RANGES 0x04 /* compare ranges */8598#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */8599#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */86008601/* specify the polarity */8602#define _SIDD_POSITIVE_POLARITY 0x008603#define _SIDD_MASKED_POSITIVE_POLARITY 0x208604#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */8605#define _SIDD_MASKED_NEGATIVE_POLARITY \86060x30 /* negate results only before end of string */86078608/* specify the output selection in _mm_cmpXstri */8609#define _SIDD_LEAST_SIGNIFICANT 0x008610#define _SIDD_MOST_SIGNIFICANT 0x4086118612/* specify the output selection in _mm_cmpXstrm */8613#define _SIDD_BIT_MASK 0x008614#define _SIDD_UNIT_MASK 0x4086158616/* Pattern Matching for C macros.8617* https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms8618*/86198620/* catenate */8621#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__8622#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)86238624#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)8625/* run the 2nd parameter */8626#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__8627/* run the 1st parameter */8628#define SSE2NEON_IIF_1(t, ...) t86298630#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)8631#define SSE2NEON_COMPL_0 18632#define SSE2NEON_COMPL_1 086338634#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)8635#define SSE2NEON_DEC_1 08636#define SSE2NEON_DEC_2 18637#define SSE2NEON_DEC_3 28638#define SSE2NEON_DEC_4 38639#define SSE2NEON_DEC_5 48640#define SSE2NEON_DEC_6 58641#define SSE2NEON_DEC_7 68642#define SSE2NEON_DEC_8 78643#define SSE2NEON_DEC_9 88644#define SSE2NEON_DEC_10 98645#define SSE2NEON_DEC_11 108646#define SSE2NEON_DEC_12 118647#define SSE2NEON_DEC_13 128648#define SSE2NEON_DEC_14 138649#define SSE2NEON_DEC_15 148650#define SSE2NEON_DEC_16 1586518652/* detection */8653#define SSE2NEON_CHECK_N(x, n, ...) n8654#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )8655#define SSE2NEON_PROBE(x) x, 1,86568657#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))8658#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)86598660#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))8661#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))86628663#define SSE2NEON_EAT(...)8664#define SSE2NEON_EXPAND(...) 
__VA_ARGS__8665#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)86668667/* recursion */8668/* deferred expression */8669#define SSE2NEON_EMPTY()8670#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()8671#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()8672#define SSE2NEON_EXPAND(...) __VA_ARGS__86738674#define SSE2NEON_EVAL(...) \8675SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))8676#define SSE2NEON_EVAL1(...) \8677SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))8678#define SSE2NEON_EVAL2(...) \8679SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))8680#define SSE2NEON_EVAL3(...) __VA_ARGS__86818682#define SSE2NEON_REPEAT(count, macro, ...) \8683SSE2NEON_WHEN(count) \8684(SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \8685SSE2NEON_DEC(count), macro, \8686__VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \8687__VA_ARGS__))8688#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT86898690#define SSE2NEON_SIZE_OF_byte 88691#define SSE2NEON_NUMBER_OF_LANES_byte 168692#define SSE2NEON_SIZE_OF_word 168693#define SSE2NEON_NUMBER_OF_LANES_word 886948695#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \8696mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \8697vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \8698vreinterpretq_##type##_m128i(a)));86998700#define SSE2NEON_FILL_LANE(i, type) \8701vec_b[i] = \8702vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));87038704#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \8705number_of_lanes, byte_or_word) \8706do { \8707SSE2NEON_CAT( \8708data_type_prefix, \8709SSE2NEON_CAT(size, \8710SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \8711vec_b[number_of_lanes]; \8712__m128i mask = SSE2NEON_IIF(byte_or_word)( \8713vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \8714vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \8715SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \8716SSE2NEON_CAT(type_prefix, size))) \8717for (int i = 0; i < number_of_lanes; i++) { \8718mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \8719size)(SSE2NEON_CAT(vbslq_u, size)( \8720SSE2NEON_CAT(vreinterpretq_u, \8721SSE2NEON_CAT(size, _m128i))(mask), \8722SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \8723vec_b[i], \8724SSE2NEON_CAT( \8725vreinterpretq_, \8726SSE2NEON_CAT(type_prefix, \8727SSE2NEON_CAT(size, _m128i(a))))), \8728SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \8729vec_b[i], \8730SSE2NEON_CAT( \8731vreinterpretq_, \8732SSE2NEON_CAT(type_prefix, \8733SSE2NEON_CAT(size, _m128i(a))))))); \8734} \8735} while (0)87368737#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \8738do { \8739SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \8740SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \8741SSE2NEON_CAT(u, size))) \8742} while (0)87438744#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \8745static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \8746int lb) \8747{ \8748__m128i mtx[16]; \8749PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8750SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \8751return SSE2NEON_CAT( \8752_sse2neon_aggregate_equal_any_, \8753SSE2NEON_CAT( \8754SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8755SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \8756type))))(la, lb, mtx); \8757}87588759#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \8760static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \8761int lb) \8762{ 
\8763__m128i mtx[16]; \8764PCMPSTR_RANGES( \8765a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8766SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \8767return SSE2NEON_CAT( \8768_sse2neon_aggregate_ranges_, \8769SSE2NEON_CAT( \8770SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8771SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \8772type))))(la, lb, mtx); \8773}87748775#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \8776static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \8777__m128i b, int lb) \8778{ \8779__m128i mtx[16]; \8780PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8781SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \8782return SSE2NEON_CAT( \8783_sse2neon_aggregate_equal_ordered_, \8784SSE2NEON_CAT( \8785SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \8786SSE2NEON_CAT(x, \8787SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \8788SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \8789}87908791static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])8792{8793int res = 0;8794int m = (1 << la) - 1;8795uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);8796uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);8797uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);8798uint8x16_t vec = vcombine_u8(t_lo, t_hi);8799for (int j = 0; j < lb; j++) {8800mtx[j] = vreinterpretq_m128i_u8(8801vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));8802mtx[j] = vreinterpretq_m128i_u8(8803vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));8804int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;8805res |= (tmp << j);8806}8807return res;8808}88098810static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])8811{8812int res = 0;8813int m = (1 << la) - 1;8814uint16x8_t vec =8815vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));8816for (int j = 0; j < lb; j++) {8817mtx[j] = vreinterpretq_m128i_u16(8818vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));8819mtx[j] = vreinterpretq_m128i_u16(8820vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));8821int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;8822res |= (tmp << j);8823}8824return res;8825}88268827/* clang-format off */8828#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \8829prefix##IMPL(byte) \8830prefix##IMPL(word)8831/* clang-format on */88328833SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)88348835static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])8836{8837int res = 0;8838int m = (1 << la) - 1;8839uint16x8_t vec =8840vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));8841for (int j = 0; j < lb; j++) {8842mtx[j] = vreinterpretq_m128i_u16(8843vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));8844mtx[j] = vreinterpretq_m128i_u16(8845vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));8846__m128i tmp = vreinterpretq_m128i_u32(8847vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));8848uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),8849vreinterpretq_u32_m128i(tmp));8850#if defined(__aarch64__)8851int t = vaddvq_u32(vec_res) ? 
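// In the ranges aggregation here, operand a supplies (low, high) pairs in
// adjacent lanes and every character of b is tested against each pair: after
// PCMPSTR_RANGES the "b >= low" and "b <= high" results land in neighbouring
// lanes, so shifting each 32-bit group right by 16 (by 8 in the byte variant
// below) and AND-ing folds them into a single in-range bit per pair. For
// instance, with _SIDD_CMP_RANGES and a = {'a', 'z', 0, ...} this reduces to
// a lowercase-ASCII [a-z] test on each character of b.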
1 : 0;8852#else8853uint64x2_t sumh = vpaddlq_u32(vec_res);8854int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);8855#endif8856res |= (t << j);8857}8858return res;8859}88608861static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])8862{8863int res = 0;8864int m = (1 << la) - 1;8865uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);8866uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);8867uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);8868uint8x16_t vec = vcombine_u8(t_lo, t_hi);8869for (int j = 0; j < lb; j++) {8870mtx[j] = vreinterpretq_m128i_u8(8871vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));8872mtx[j] = vreinterpretq_m128i_u8(8873vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));8874__m128i tmp = vreinterpretq_m128i_u16(8875vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));8876uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),8877vreinterpretq_u16_m128i(tmp));8878int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;8879res |= (t << j);8880}8881return res;8882}88838884#define SSE2NEON_CMP_RANGES_IS_BYTE 18885#define SSE2NEON_CMP_RANGES_IS_WORD 088868887/* clang-format off */8888#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \8889prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \8890prefix##IMPL(byte, int, s, prefix##IS_BYTE) \8891prefix##IMPL(word, uint, u, prefix##IS_WORD) \8892prefix##IMPL(word, int, s, prefix##IS_WORD)8893/* clang-format on */88948895SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)88968897#undef SSE2NEON_CMP_RANGES_IS_BYTE8898#undef SSE2NEON_CMP_RANGES_IS_WORD88998900static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)8901{8902uint8x16_t mtx =8903vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));8904int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));8905int m1 = 0x10000 - (1 << la);8906int tb = 0x10000 - (1 << lb);8907uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;8908uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;8909vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);8910vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);8911vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);8912vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);8913vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);8914tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);8915tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);89168917res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));8918res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));8919res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);8920res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);8921res_lo = vand_u8(res_lo, vec_mask);8922res_hi = vand_u8(res_hi, vec_mask);89238924int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);8925return res;8926}89278928static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)8929{8930uint16x8_t mtx =8931vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));8932int m0 = (la < lb) ? 
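// As in the byte variant above, the m0/m1/tb masks implement the PCMPESTR
// "equal each" (strcmp-style) rule for lanes that lie past the end of a
// string: lanes beyond both la and lb are forced to compare equal, lanes
// beyond only one of the two lengths are forced to compare unequal, and only
// lanes below min(la, lb) keep the real vceqq result.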
0 : ((1 << la) - (1 << lb));8933int m1 = 0x100 - (1 << la);8934int tb = 0x100 - (1 << lb);8935uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);8936uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);8937uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);8938uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);8939mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);8940mtx = vbslq_u16(vec1, tmp, mtx);8941mtx = vandq_u16(mtx, vec_mask);8942return _sse2neon_vaddvq_u16(mtx);8943}89448945#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 18946#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 089478948#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \8949static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \8950int bound, int la, int lb, __m128i mtx[16]) \8951{ \8952int res = 0; \8953int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \8954uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \8955vld1_u##size(_sse2neon_cmpestr_mask##size##b), \8956vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \8957uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \8958vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \8959vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \8960vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \8961uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \8962uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \8963for (int j = 0; j < lb; j++) { \8964mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \8965vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \8966} \8967for (int j = lb; j < bound; j++) { \8968mtx[j] = vreinterpretq_m128i_u##size( \8969vbslq_u##size(vec1, vec_minusone, vec_zero)); \8970} \8971unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \8972(unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \8973for (int i = 0; i < bound; i++) { \8974int val = 1; \8975for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \8976val &= ptr[k * bound + j]; \8977res += val << i; \8978} \8979return res; \8980}89818982/* clang-format off */8983#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \8984prefix##IMPL(8, 16, prefix##IS_UBYTE) \8985prefix##IMPL(16, 8, prefix##IS_UWORD)8986/* clang-format on */89878988SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)89898990#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE8991#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD89928993/* clang-format off */8994#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \8995prefix##IMPL(byte) \8996prefix##IMPL(word)8997/* clang-format on */89988999SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)90009001#define SSE2NEON_CMPESTR_LIST \9002_(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \9003_(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \9004_(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \9005_(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \9006_(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \9007_(CMP_UWORD_RANGES, cmp_uword_ranges) \9008_(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \9009_(CMP_SWORD_RANGES, cmp_sword_ranges) \9010_(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \9011_(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \9012_(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \9013_(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \9014_(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \9015_(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \9016_(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \9017_(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)90189019enum 
{
#define _(name, func_suffix) name,
    SSE2NEON_CMPESTR_LIST
#undef _
};
typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
#define _(name, func_suffix) _sse2neon_##func_suffix,
    SSE2NEON_CMPESTR_LIST
#undef _
};

FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
{
    switch (imm8 & 0x30) {
    case _SIDD_NEGATIVE_POLARITY:
        res ^= 0xffffffff;
        break;
    case _SIDD_MASKED_NEGATIVE_POLARITY:
        res ^= (1 << lb) - 1;
        break;
    default:
        break;
    }

    return res & ((bound == 8) ? 0xFF : 0xFFFF);
}

FORCE_INLINE int _sse2neon_clz(unsigned int x)
{
#if _MSC_VER
    /* _BitScanReverse returns the index of the highest set bit, so the
     * leading-zero count is 31 minus that index. */
    DWORD cnt = 0;
    if (_BitScanReverse(&cnt, x))
        return 31 - cnt;
    return 32;
#else
    return x != 0 ? __builtin_clz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctz(unsigned int x)
{
#if _MSC_VER
    /* _BitScanForward returns the index of the lowest set bit, which is
     * exactly the trailing-zero count. */
    DWORD cnt = 0;
    if (_BitScanForward(&cnt, x))
        return cnt;
    return 32;
#else
    return x != 0 ? __builtin_ctz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
{
#if _MSC_VER
    unsigned long cnt;
#if defined(SSE2NEON_HAS_BITSCAN64) && \
    (defined(_M_AMD64) || defined(__x86_64__))
    if (_BitScanForward64(&cnt, x))
        return (int) (cnt);
#else
    if (_BitScanForward(&cnt, (unsigned long) (x)))
        return (int) cnt;
    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
        return (int) (cnt + 32);
#endif
    return 64;
#else
    return x != 0 ? __builtin_ctzll(x) : 64;
#endif
}

#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)

#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
    const int var = (imm & 0x01) ? 8 : 16

#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
    int tmp1 = la ^ (la >> 31);                  \
    la = tmp1 - (la >> 31);                      \
    int tmp2 = lb ^ (lb >> 31);                  \
    lb = tmp2 - (lb >> 31);                      \
    la = SSE2NEON_MIN(la, bound);                \
    lb = SSE2NEON_MIN(lb, bound)

// Compare all pairs of characters in strings a and b, then aggregate the
// result. As the only difference between PCMPESTR* and PCMPISTR* is the way
// the string lengths are obtained, we use SSE2NEON_CMP{I,E}STRX_LEN_PAIR to
// get the lengths of a and b.
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)

#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
    return (r2 == 0) ? bound                            \
                     : ((imm8 & 0x40) ? 
(31 - _sse2neon_clz(r2)) \9118: _sse2neon_ctz(r2))91199120#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \9121__m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \9122if (imm8 & 0x40) { \9123if (bound == 8) { \9124uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \9125vld1q_u16(_sse2neon_cmpestr_mask16b)); \9126dst = vreinterpretq_m128i_u16(vbslq_u16( \9127tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \9128} else { \9129uint8x16_t vec_r2 = \9130vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \9131uint8x16_t tmp = \9132vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \9133dst = vreinterpretq_m128i_u8( \9134vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \9135} \9136} else { \9137if (bound == 16) { \9138dst = vreinterpretq_m128i_u16( \9139vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \9140} else { \9141dst = vreinterpretq_m128i_u8( \9142vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \9143} \9144} \9145return dst91469147// Compare packed strings in a and b with lengths la and lb using the control9148// in imm8, and returns 1 if b did not contain a null character and the9149// resulting mask was zero, and 0 otherwise.9150// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra9151FORCE_INLINE int _mm_cmpestra(__m128i a,9152int la,9153__m128i b,9154int lb,9155const int imm8)9156{9157int lb_cpy = lb;9158SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9159return !r2 & (lb_cpy > bound);9160}91619162// Compare packed strings in a and b with lengths la and lb using the control in9163// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.9164// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc9165FORCE_INLINE int _mm_cmpestrc(__m128i a,9166int la,9167__m128i b,9168int lb,9169const int imm8)9170{9171SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9172return r2 != 0;9173}91749175// Compare packed strings in a and b with lengths la and lb using the control9176// in imm8, and store the generated index in dst.9177// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri9178FORCE_INLINE int _mm_cmpestri(__m128i a,9179int la,9180__m128i b,9181int lb,9182const int imm8)9183{9184SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9185SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);9186}91879188// Compare packed strings in a and b with lengths la and lb using the control9189// in imm8, and store the generated mask in dst.9190// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm9191FORCE_INLINE __m128i9192_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)9193{9194SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9195SSE2NEON_CMPSTR_GENERATE_MASK(dst);9196}91979198// Compare packed strings in a and b with lengths la and lb using the control in9199// imm8, and returns bit 0 of the resulting bit mask.9200// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro9201FORCE_INLINE int _mm_cmpestro(__m128i a,9202int la,9203__m128i b,9204int lb,9205const int imm8)9206{9207SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);9208return r2 & 1;9209}92109211// Compare packed strings in a and b with lengths la and lb using the control in9212// imm8, and returns 1 if any character in a was null, and 0 otherwise.9213// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs9214FORCE_INLINE int _mm_cmpestrs(__m128i a,9215int 
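// A typical use of _mm_cmpestri above is a bounded "find any of" scan. A
// hypothetical sketch (the names `vowels` and `text` are illustrative, and
// `text` is assumed to point at 16 readable bytes):
//
//   const char vowels[16] = "aeiou";
//   __m128i set = _mm_loadu_si128((const __m128i *) vowels);
//   __m128i chunk = _mm_loadu_si128((const __m128i *) text);
//   int idx = _mm_cmpestri(set, 5, chunk, 16,
//                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
//                          _SIDD_LEAST_SIGNIFICANT);
//   // idx is the offset of the first vowel in the chunk, or 16 if none
//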
la,9216__m128i b,9217int lb,9218const int imm8)9219{9220SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9221return la <= (bound - 1);9222}92239224// Compare packed strings in a and b with lengths la and lb using the control in9225// imm8, and returns 1 if any character in b was null, and 0 otherwise.9226// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz9227FORCE_INLINE int _mm_cmpestrz(__m128i a,9228int la,9229__m128i b,9230int lb,9231const int imm8)9232{9233SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9234return lb <= (bound - 1);9235}92369237#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \9238do { \9239if (imm8 & 0x01) { \9240uint16x8_t equal_mask_##str = \9241vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \9242uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \9243uint64_t matches_##str = \9244vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \9245len = _sse2neon_ctzll(matches_##str) >> 3; \9246} else { \9247uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \9248vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \9249uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \9250uint64_t matches_##str = \9251vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \9252len = _sse2neon_ctzll(matches_##str) >> 2; \9253} \9254} while (0)92559256#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \9257int la, lb; \9258do { \9259SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \9260SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \9261} while (0)92629263// Compare packed strings with implicit lengths in a and b using the control in9264// imm8, and returns 1 if b did not contain a null character and the resulting9265// mask was zero, and 0 otherwise.9266// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra9267FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)9268{9269SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9270return !r2 & (lb >= bound);9271}92729273// Compare packed strings with implicit lengths in a and b using the control in9274// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.9275// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc9276FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)9277{9278SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9279return r2 != 0;9280}92819282// Compare packed strings with implicit lengths in a and b using the control in9283// imm8, and store the generated index in dst.9284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri9285FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)9286{9287SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9288SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);9289}92909291// Compare packed strings with implicit lengths in a and b using the control in9292// imm8, and store the generated mask in dst.9293// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm9294FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)9295{9296SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);9297SSE2NEON_CMPSTR_GENERATE_MASK(dst);9298}92999300// Compare packed strings with implicit lengths in a and b using the control in9301// imm8, and returns bit 0 of the resulting bit mask.9302// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro9303FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)9304{9305SSE2NEON_COMP_AGG(a, b, la, lb, 
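// SSE2NEON_CMPISTRX_LENGTH (expanded here through SSE2NEON_COMP_AGG) finds
// the implicit string length with the NEON narrowing-shift movemask trick:
// vceqq sets a whole lane per NUL character, vshrn_n_u16(..., 4) packs that
// into 4 bits per byte (8 bits per 16-bit character) of a single 64-bit
// value, and the trailing-zero count divided by 4 (or 8) is the index of the
// first terminator. E.g. for the bytes "ab\0..." the packed mask has its
// lowest set bit at position 8, so the computed length is 8 >> 2 == 2.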
imm8, CMPISTRX);9306return r2 & 1;9307}93089309// Compare packed strings with implicit lengths in a and b using the control in9310// imm8, and returns 1 if any character in a was null, and 0 otherwise.9311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs9312FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)9313{9314SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9315int la;9316SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);9317return la <= (bound - 1);9318}93199320// Compare packed strings with implicit lengths in a and b using the control in9321// imm8, and returns 1 if any character in b was null, and 0 otherwise.9322// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz9323FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)9324{9325SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);9326int lb;9327SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);9328return lb <= (bound - 1);9329}93309331// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers9332// in b for greater than.9333FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)9334{9335#if defined(__aarch64__)9336return vreinterpretq_m128i_u64(9337vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));9338#else9339return vreinterpretq_m128i_s64(vshrq_n_s64(9340vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),934163));9342#endif9343}93449345// Starting with the initial value in crc, accumulates a CRC32 value for9346// unsigned 16-bit integer v.9347// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)9348FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)9349{9350#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9351__asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"9352: [c] "+r"(crc)9353: [v] "r"(v));9354#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)9355crc = __crc32ch(crc, v);9356#else9357crc = _mm_crc32_u8(crc, v & 0xff);9358crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);9359#endif9360return crc;9361}93629363// Starting with the initial value in crc, accumulates a CRC32 value for9364// unsigned 32-bit integer v.9365// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)9366FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)9367{9368#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9369__asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"9370: [c] "+r"(crc)9371: [v] "r"(v));9372#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)9373crc = __crc32cw(crc, v);9374#else9375crc = _mm_crc32_u16(crc, v & 0xffff);9376crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);9377#endif9378return crc;9379}93809381// Starting with the initial value in crc, accumulates a CRC32 value for9382// unsigned 64-bit integer v.9383// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)9384FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)9385{9386#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9387__asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"9388: [c] "+r"(crc)9389: [v] "r"(v));9390#else9391crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);9392crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);9393#endif9394return crc;9395}93969397// Starting with the initial value in crc, accumulates a CRC32 value for9398// unsigned 8-bit integer v.9399// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)9400FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)9401{9402#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)9403__asm__ 
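// The constant 0x82f63b78 in the bit-by-bit fallback below is the reflected
// form of the CRC-32C (Castagnoli) polynomial 0x1EDC6F41, i.e. the polynomial
// the SSE4.2 crc32 instruction implements. A sketch of typical usage, with
// the ~0 seed and final inversion used by iSCSI-style CRC-32C (other
// protocols may differ):
//
//   uint32_t crc = ~0u;
//   for (size_t i = 0; i < len; i++)
//       crc = _mm_crc32_u8(crc, buf[i]);
//   crc = ~crc;
//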

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_SBOX(w) \
    { \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16) \
    }
#define SSE2NEON_AES_RSBOX(w) \
    { \
        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
        w(0x55), w(0x21), w(0x0c), w(0x7d) \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

/* x_time function and matrix multiply function */
#if !defined(__aarch64__)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define SSE2NEON_MULTIPLY(x, y)                                  \
    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif

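// Worked example (informative only): in GF(2^8) with the AES reduction
// polynomial, {57} * {13} = {fe} (FIPS-197, section 4.2.1). With the helpers
// above, SSE2NEON_XT(0x57) evaluates to 0xae, and
// SSE2NEON_MULTIPLY(0x57, 0x13) evaluates to 0x57 ^ 0xae ^ 0x07 = 0xfe in its
// low eight bits. Note that the macros do not mask intermediate results to
// eight bits; the callers below store the results into uint8_t, which
// discards the upper bits.
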
// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // The 256-byte S-box is split into four 64-byte tables. The first lookup
    // uses vqtbl4q_u8(); the remaining ones use vqtbx4q_u8(), which leaves
    // lanes whose index is out of range untouched. Because each successive
    // table starts 64 bytes further into the S-box, the index vector is
    // biased down by the same offset (e.g. 'w - 0x40') so that only the bytes
    // belonging to that table are in range.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // this generates a table containing every possible permutation of
    // shift_rows() and sub_bytes() with mix_columns().
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // finish the modulo addition step in mix_columns()
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // inverse mix columns
    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimize for NEON */
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    // inverse mix columns
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

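// Usage sketch (illustrative, not part of sse2neon): encrypting one 16-byte
// block with an already expanded AES-128 key schedule (11 round keys), exactly
// as with the x86 AES-NI intrinsics. The helper name and the 'rk' array are
// hypothetical; _mm_aesenclast_si128(), used for the final round, is defined
// just below.
#if 0
static __m128i sse2neon_example_aes128_encrypt_block(__m128i block,
                                                     const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]); /* initial AddRoundKey */
    for (int round = 1; round < 10; round++)
        block = _mm_aesenc_si128(block, rk[round]); /* rounds 1..9 */
    return _mm_aesenclast_si128(block, rk[10]); /* final round, no MixColumns */
}
#endif
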
// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A implementation */
    uint8_t v[16] = {
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
    };

    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimize for NEON */
    uint8_t v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (int i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

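// Usage sketch (illustrative, not part of sse2neon): decrypting one 16-byte
// block with the AES "Equivalent Inverse Cipher", mirroring x86 AES-NI usage.
// The hypothetical 'drk' array holds the encryption round keys in reverse
// order, with keys 1..9 first passed through _mm_aesimc_si128() (defined just
// below): drk[0] = rk[10], drk[i] = _mm_aesimc_si128(rk[10 - i]) for
// i = 1..9, and drk[10] = rk[0].
#if 0
static __m128i sse2neon_example_aes128_decrypt_block(__m128i block,
                                                     const __m128i drk[11])
{
    block = _mm_xor_si128(block, drk[0]); /* initial AddRoundKey */
    for (int round = 1; round < 10; round++)
        block = _mm_aesdec_si128(block, drk[round]);
    return _mm_aesdeclast_si128(block, drk[10]);
}
#endif
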
// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
#if defined(__aarch64__)
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
    uint8x16_t v = vreinterpretq_u8_m128i(a);
    uint8x16_t w;

    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    return vreinterpretq_m128i_u8(w);

#else /* ARMv7-A NEON implementation */
    uint8_t i, e, f, g, h, v[4][4];
    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
#endif
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
#if defined(__aarch64__)
    uint8x16_t _a = vreinterpretq_u8_m128i(a);
    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

    uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
    uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
    uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
                             vreinterpretq_u32_u8(v));
    uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
    uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

    return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));

#else /* ARMv7-A NEON implementation */
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if !defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
#endif

#else /* __ARM_FEATURE_CRYPTO */
// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
// and AESMC, then manually applying the real round key as an XOR operation.
// This unfortunately costs one extra XOR; the compiler should be able to
// optimize it away for repeated calls, however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(
               vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
           vreinterpretq_u8_m128i(RoundKey);
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
#endif

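// Usage sketch (illustrative, not part of sse2neon): expanding a 128-bit AES
// key into the 11 round keys consumed by the block-cipher sketches above,
// following the usual x86 AES-NI key-expansion pattern. The names are
// hypothetical; the round constant must be a compile-time immediate, hence
// the macro wrapper.
#if 0
static __m128i sse2neon_example_key_step(__m128i key, __m128i keygened)
{
    /* broadcast the word that aeskeygenassist produced for X3 ... */
    keygened = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3, 3, 3, 3));
    /* ... and fold it into the previous round key */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, keygened);
}
#define SSE2NEON_EXAMPLE_EXPAND(k, rcon) \
    sse2neon_example_key_step((k), _mm_aeskeygenassist_si128((k), (rcon)))
/* rk[0] = user key, rk[1] = SSE2NEON_EXAMPLE_EXPAND(rk[0], 0x01),
 * rk[2] = SSE2NEON_EXAMPLE_EXPAND(rk[1], 0x02), ..., rk[10] uses rcon 0x36. */
#endif
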
/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}

FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__)
    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
#else
__volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */10070#endif10071}1007210073// Return the current 64-bit value of the processor's time-stamp counter.10074// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc1007510076FORCE_INLINE uint64_t _rdtsc(void)10077{10078#if defined(__aarch64__)10079uint64_t val;1008010081/* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the10082* system counter is at least 56 bits wide; from Armv8.6, the counter10083* must be 64 bits wide. So the system counter could be less than 6410084* bits wide and it is attributed with the flag 'cap_user_time_short'10085* is true.10086*/10087__asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));1008810089return val;10090#else10091uint32_t pmccntr, pmuseren, pmcntenset;10092// Read the user mode Performance Monitoring Unit (PMU)10093// User Enable Register (PMUSERENR) access permissions.10094__asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));10095if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.10096__asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));10097if (pmcntenset & 0x80000000UL) { // Is it counting?10098__asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));10099// The counter is set up to count every 64th cycle10100return (uint64_t) (pmccntr) << 6;10101}10102}1010310104// Fallback to syscall as we can't enable PMUSERENR in user mode.10105struct timeval tv;10106gettimeofday(&tv, NULL);10107return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;10108#endif10109}1011010111#if defined(__GNUC__) || defined(__clang__)10112#pragma pop_macro("ALIGN_STRUCT")10113#pragma pop_macro("FORCE_INLINE")10114#endif1011510116#if defined(__GNUC__) && !defined(__clang__)10117#pragma GCC pop_options10118#endif1011910120#endif101211012210123