GitHub Repository: stenzek/duckstation
Path: blob/master/src/common/intrin.h
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

// Includes appropriate intrinsic header based on platform.

#pragma once

#include "align.h"
#include "types.h"

#include <type_traits>

#if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
#define CPU_ARCH_SIMD 1
#define CPU_ARCH_SSE 1
#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>

#if defined(__AVX2__)
#define CPU_ARCH_AVX 1
#define CPU_ARCH_AVX2 1
#define CPU_ARCH_SSE41 1
#elif defined(__AVX__)
#define CPU_ARCH_AVX 1
#define CPU_ARCH_SSE41 1
#elif defined(__SSE4_1__)
#define CPU_ARCH_SSE41 1
#endif
#elif defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64)
#define CPU_ARCH_SIMD 1
#define CPU_ARCH_NEON 1
#if defined(_MSC_VER) && !defined(__clang__)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef __APPLE__
#include <stdlib.h> // alloca
#else
#include <malloc.h> // alloca
#endif

/// Helper to disable loop vectorization.
#if defined(__clang__)
#define DONT_VECTORIZE_THIS_LOOP _Pragma("clang loop vectorize(disable)")
#elif defined(_MSC_VER)
#define DONT_VECTORIZE_THIS_LOOP __pragma(loop(no_vector))
#elif defined(__GNUC__)
#define DONT_VECTORIZE_THIS_LOOP _Pragma("GCC novector")
#endif
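
// Usage sketch (illustrative, not part of this file): the macro goes directly
// before the loop it should affect, e.g. a short fixup loop that the compiler
// would otherwise vectorize for little or no gain:
//
//   DONT_VECTORIZE_THIS_LOOP
//   for (u32 i = 0; i < count; i++)
//     dst[i] = src[i];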

/// Currently using at most 128-bit vectors.
inline constexpr u32 VECTOR_ALIGNMENT = 16;

/// Aligns allocation/pitch size to the preferred host size.
template<typename T>
ALWAYS_INLINE static T VectorAlign(T value)
{
  return Common::AlignUpPow2(value, VECTOR_ALIGNMENT);
}
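
// Usage sketch (illustrative, not part of this file): rounding a row pitch up
// to the vector alignment so each row can be written with aligned 128-bit
// stores:
//
//   const u32 pitch = VectorAlign(width * 4u); // e.g. 100 -> 112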

/// Fills a buffer of pointers with the given pointer value, using 128-bit
/// vector stores where available. On x86 the destination should be aligned to
/// VECTOR_ALIGNMENT, since aligned stores are used.
template<typename T>
ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
{
  static_assert(std::is_pointer_v<T>, "T must be a pointer type");
  static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer");
  T* dest = ptr;

#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
  // Broadcast the pointer value across a 128-bit register and fill two
  // (64-bit) or four (32-bit) pointers per store.
  static constexpr u32 PTRS_PER_VECTOR = (16 / sizeof(T));
  const u32 aligned_count = count / PTRS_PER_VECTOR;
  const u32 remaining_count = count % PTRS_PER_VECTOR;

#if defined(CPU_ARCH_SSE) && defined(CPU_ARCH_X64)
  const __m128i svalue = _mm_set1_epi64x(reinterpret_cast<intptr_t>(value));
#elif defined(CPU_ARCH_SSE)
  // On 32-bit x86, broadcast as 32-bit lanes; _mm_set1_epi64x() would
  // sign-extend the pointer and corrupt every other lane.
  const __m128i svalue = _mm_set1_epi32(static_cast<int>(reinterpret_cast<intptr_t>(value)));
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
  const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast<uintptr_t>(value));
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
  const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast<uintptr_t>(value));
#endif

  // Clang gets way too eager and tries to unroll these, emitting thousands of instructions.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
  for (u32 i = 0; i < aligned_count; i++)
  {
#if defined(CPU_ARCH_SSE)
    _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue);
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
    vst1q_u64(reinterpret_cast<u64*>(dest), svalue);
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
    vst1q_u32(reinterpret_cast<u32*>(dest), svalue);
#endif
    dest += PTRS_PER_VECTOR;
  }
#else
  // No SIMD available: the scalar loop below handles every element.
  const u32 remaining_count = count;
#endif

  // Scalar tail for the elements that didn't fill a whole vector.
  for (u32 i = 0; i < remaining_count; i++)
    *(dest++) = value;
}
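
// Usage sketch (illustrative; Handler and s_fallback are hypothetical names):
// resetting a dispatch table so every entry points at a single fallback. The
// table is explicitly aligned because the SSE path uses aligned stores.
//
//   static Handler s_fallback;
//   alignas(VECTOR_ALIGNMENT) static Handler* s_table[1024];
//   MemsetPtrs(s_table, &s_fallback, 1024);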

/// Spin-wait hint: issues several pause/barrier instructions back-to-back so a
/// busy-waiting thread backs off for longer than a single hint would allow.
ALWAYS_INLINE static void MultiPause()
{
#if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
  _mm_pause();
  _mm_pause();
  _mm_pause();
  _mm_pause();
  _mm_pause();
  _mm_pause();
  _mm_pause();
  _mm_pause();
#elif defined(CPU_ARCH_ARM64) && defined(_MSC_VER) && !defined(__clang__)
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
  __isb(_ARM64_BARRIER_SY);
#elif defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_ARM32)
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
  __asm__ __volatile__("isb");
#elif defined(CPU_ARCH_RISCV64)
  // Probably wrong... pause is optional :/
  asm volatile("fence" ::: "memory");
#else
#pragma warning("Missing implementation")
#endif
}
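
// Usage sketch (illustrative, not part of this file): a simple spin-wait that
// backs off between polls of an atomic flag set by another thread:
//
//   std::atomic<bool> flag{false};
//   while (!flag.load(std::memory_order_acquire))
//     MultiPause();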