CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Math/CrossSIMD.h
Views: 1401
1
// CrossSIMD
2
//
3
// Compatibility wrappers for SIMD dialects.
4
//
5
// In the long run, might do a more general single-source-SIMD wrapper here consisting
6
// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
7
// our various color conversion functions and so on in a pretty generic manner.
8
9
#include "ppsspp_config.h"
10
11
#include "stdint.h"
12
13
// DO_NOT_VECTORIZE_LOOP: place directly before a loop to ask clang not to
// auto-vectorize it. A #pragma directive cannot appear inside a macro
// definition, so the _Pragma operator is used instead. On compilers other
// than clang the macro expands to nothing.
#if defined(__clang__)
#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)")
#else
#define DO_NOT_VECTORIZE_LOOP
#endif
19
20
#if PPSSPP_ARCH(SSE2)
21
#include <emmintrin.h>
22
#endif
23
24
#if PPSSPP_ARCH(ARM_NEON)
25
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
26
#include <arm64_neon.h>
27
#else
28
#include <arm_neon.h>
29
#endif
30
#endif
31
32
// Basic types
33
34
#if PPSSPP_ARCH(ARM64_NEON)
35
36
// No special ones here.
37
38
#elif PPSSPP_ARCH(ARM_NEON)
39
40
// Compatibility wrappers making ARM64 NEON code run on ARM32
41
// With optimization on, these should compile down to the optimal code.
42
43
// ARM32 stand-in for the ARM64 intrinsic of the same name.
// Forwards to vmulq_lane_f32, which takes a two-element vector, so the
// requested lane of b is mapped onto either its low half (lanes 0-1) or its
// high half (lanes 2-3). The lane index is masked to the range 0..3.
inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
	const float32x2_t lowHalf = vget_low_f32(b);
	const float32x2_t highHalf = vget_high_f32(b);
	// The lane argument of vmulq_lane_f32 is written as a literal in every
	// branch, hence the explicit switch rather than passing a computed index.
	switch (lane & 3) {
	case 0:
		return vmulq_lane_f32(a, lowHalf, 0);
	case 1:
		return vmulq_lane_f32(a, lowHalf, 1);
	case 2:
		return vmulq_lane_f32(a, highHalf, 0);
	default:
		return vmulq_lane_f32(a, highHalf, 1);
	}
}
51
52
// ARM32 stand-in for the ARM64 intrinsic of the same name.
// Forwards to vmlaq_lane_f32 with accumulator a and multiplicand b, picking
// the scalar from c: lanes 0-1 come from the low half of c, lanes 2-3 from
// the high half. The lane index is masked to the range 0..3.
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
	const int idx = lane & 3;
	// Each branch passes a literal lane number to vmlaq_lane_f32.
	if (idx == 0) {
		return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
	}
	if (idx == 1) {
		return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
	}
	if (idx == 2) {
		return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
	}
	return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
}
60
61
// ARM32 stand-in for the ARM64 "compare greater-than-or-equal to zero"
// intrinsic: compares v against a vector of zeros using vcgeq_f32 and
// returns the resulting per-lane mask.
inline uint32x4_t vcgezq_f32(float32x4_t v) {
	const float32x4_t zeros = vdupq_n_f32(0.0f);
	return vcgeq_f32(v, zeros);
}
64
65
#endif
66
67