GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Data/Convert/SmallDataConvert.h
#pragma once

#include <cstdint>
#include <cstring>
#include <cmath>

#include "Common/Common.h"
#include "ppsspp_config.h"

#ifdef _M_SSE
#include <emmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

extern const float one_over_255_x4[4];
extern const float exactly_255_x4[4];

// Utilities useful for filling in std140-layout uniform buffers, and similar.
// NEON intrinsics: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics?lang=en

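// Background: std140 aligns vec3 (and vec4) members to 16 bytes, which is why
// many of the helpers below write out a full 4-float slot.
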
// LSBs in f[0], etc.
inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128i value = _mm_set1_epi32(u);
	__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);
	__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));
	_mm_storeu_ps(f, fvalues);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint8x8_t value = (uint8x8_t)vdup_n_u32(u);
	const uint16x8_t value16 = vmovl_u8(value);
	const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16));
	const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f));
	vst1q_f32(f, valueFloat);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);
#endif
}
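
// Usage sketch (illustrative only): unpack a packed little-endian RGBA8888
// color into a float4, e.g. when filling a uniform buffer.
//
//   float color[4];
//   Uint8x4ToFloat4(color, 0xFF804020);
//   // color == { 0x20 / 255.0f, 0x40 / 255.0f, 0x80 / 255.0f, 0xFF / 255.0f }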

// Could be SSE optimized.
inline uint32_t Float4ToUint8x4(const float f[4]) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	int i4[4];
	for (int i = 0; i < 4; i++) {
		if (f[i] > 1.0f) {
			i4[i] = 255;
		} else if (f[i] < 0.0f) {
			i4[i] = 0;
		} else {
			i4[i] = (int)(f[i] * 255.0f);
		}
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}

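// Usage sketch (illustrative only): pack a float4 color back into RGBA8888.
// Out-of-range values are clamped; rounding of in-between values can differ
// slightly between the SSE, NEON and scalar paths.
//
//   const float color[4] = { 0.0f, 1.0f, 1.0f, 2.0f };
//   uint32_t packed = Float4ToUint8x4(color);  // 0xFFFFFF00 (2.0f clamps to 255)
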
inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
#ifdef _M_SSE
	// Does actually clamp, no way to avoid it with the pack ops!
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	uint32_t i4[4];
	for (int i = 0; i < 4; i++) {
		i4[i] = (int)(f[i] * 255.0f);
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}

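// Illustrative note: despite the name, the SSE and NEON paths above still
// saturate (the pack/narrow ops clamp), while the scalar fallback does not
// mask its values, so out-of-range inputs can bleed into neighboring bytes.
//
//   const float f[4] = { 2.0f, 0.0f, 0.0f, 0.0f };
//   // SSE/NEON: 0x000000FF; scalar fallback: 510 == 0x1FE, giving 0x000001FE.
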
inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, (u & 0xFFFFFF) | (alpha << 24));
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, u & 0xFFFFFF);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = 0.0f;  // Match the SIMD path, which masks out the top byte.
#endif
}

inline void Uint8x3ToFloat3(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	float temp[4];
	Uint8x4ToFloat4(temp, u & 0xFFFFFF);
	f[0] = temp[0];
	f[1] = temp[1];
	f[2] = temp[2];
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToInt4(int i[4], uint32_t u) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = 0;
}

inline void Uint8x3ToInt4_Alpha(int i[4], uint32_t u, uint8_t alpha) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = alpha;
}

inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha;
}

inline void Uint8x1ToFloat4(float f[4], uint32_t u) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = 0.0f;
	f[2] = 0.0f;
	f[3] = 0.0f;
}

// These are just for readability.

inline void CopyFloat2(float dest[2], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
}

inline void CopyFloat3(float dest[3], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
}

inline void CopyFloat4(float dest[4], const float src[4]) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_loadu_ps(src));
#else
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = src[3];
#endif
}

inline void CopyFloat1To4(float dest[4], const float src) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_set_ss(src));
#else
	dest[0] = src;
	dest[1] = 0.0f;
	dest[2] = 0.0f;
	dest[3] = 0.0f;
#endif
}

inline void CopyFloat2To4(float dest[4], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = 0.0f;
	dest[3] = 0.0f;
}

inline void CopyFloat3To4(float dest[4], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = 0.0f;
}

inline void CopyMatrix4x4(float dest[16], const float src[16]) {
	memcpy(dest, src, sizeof(float) * 16);
}

inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) {
#ifdef _M_SSE
	__m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	_mm_storeu_si128((__m128i *)dest, values);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8);
	vst1q_u32((uint32_t *)dest, values);
#else
	uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
	memcpy(dest, temp, sizeof(float) * 4);
#endif
}

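// Worked example (illustrative only): each 24-bit source value holds the top
// 24 bits of an IEEE-754 float, so shifting it left by 8 restores the full
// bit pattern. For instance, 0x3F8000 << 8 == 0x3F800000, which is 1.0f.
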
// Note: If the length is 0.0, it's left as 0.0 instead of trying to normalize. This is important.
inline void ExpandFloat24x3ToFloat4AndNormalize(float dest[4], const uint32_t src[3]) {
	float temp[4];
	ExpandFloat24x3ToFloat4(temp, src);
	// TODO: Reuse code from NormalizedOr001 and optimize
	float x = temp[0];
	float y = temp[1];
	float z = temp[2];
	float len = sqrtf(x * x + y * y + z * z);
	if (len != 0.0f)
		len = 1.0f / len;
	dest[0] = x * len;
	dest[1] = y * len;
	dest[2] = z * len;
	dest[3] = 0.0f;
}

inline uint32_t BytesToUint32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
	// Cast to unsigned before shifting so the top byte can't overflow a signed int.
	return (uint32_t)a | ((uint32_t)b << 8) | ((uint32_t)c << 16) | ((uint32_t)d << 24);
}

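// Example (illustrative only): BytesToUint32(0x20, 0x40, 0x80, 0xFF) == 0xFF804020,
// i.e. the first argument lands in the least significant byte.
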
constexpr int32_t SignExtend8ToS32(uint32_t value) {
	// This extends the sign at the 8th bit to the other 24 bits.
	return (int8_t)(value & 0xFF);
}

constexpr uint32_t SignExtend8ToU32(uint32_t value) {
	// Just treat the bits as unsigned.
	return (uint32_t)SignExtend8ToS32(value);
}

constexpr int32_t SignExtend16ToS32(uint32_t value) {
	// Same as SignExtend8ToS32, but from the 16th bit.
	return (int16_t)(value & 0xFFFF);
}

constexpr uint32_t SignExtend16ToU32(uint32_t value) {
	return (uint32_t)SignExtend16ToS32(value);
}
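
// Compile-time sanity checks (illustrative; they hold because the functions
// above are constexpr):
static_assert(SignExtend8ToS32(0xFF) == -1, "SignExtend8ToS32");
static_assert(SignExtend8ToU32(0x80) == 0xFFFFFF80, "SignExtend8ToU32");
static_assert(SignExtend16ToS32(0x8000) == -32768, "SignExtend16ToS32");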