Path: blob/master/Common/Data/Convert/SmallDataConvert.h
#pragma once

#include <cstdint>
#include <cstring>
#include <cmath>

#include "Common/Common.h"
#include "ppsspp_config.h"
#include "Common/Math/SIMDHeaders.h"

extern const float one_over_255_x4[4];
extern const float exactly_255_x4[4];

// Utilities useful for filling in std140-layout uniform buffers, and similar.
// NEON intrinsics: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics?lang=en

// LSBs in f[0], etc.
inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128i value = _mm_set1_epi32(u);
	__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);
	__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));
	_mm_storeu_ps(f, fvalues);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint8x8_t value = vreinterpret_u8_u32(vdup_n_u32(u));
	const uint16x8_t value16 = vmovl_u8(value);
	const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16));
	const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f));
	vst1q_f32(f, valueFloat);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline uint32_t Float4ToUint8x4(const float f[4]) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	// The saturating narrows below clamp to [0, 255], like the scalar path.
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	int i4[4];
	for (int i = 0; i < 4; i++) {
		if (f[i] > 1.0f) {
			i4[i] = 255;
		} else if (f[i] < 0.0f) {
			i4[i] = 0;
		} else {
			i4[i] = (int)(f[i] * 255.0f);
		}
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}

inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
#ifdef _M_SSE
	// Does actually clamp, no way to avoid it with the pack ops!
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	// Also clamps, since the narrows are saturating.
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	uint32_t i4[4];
	for (int i = 0; i < 4; i++) {
		i4[i] = (int)(f[i] * 255.0f);
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}
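// Example (a minimal illustrative sketch, not part of the original API): the
// round trip between a packed little-endian 8-bit color and floats. The
// function name and the constant are hypothetical.
inline uint32_t ExampleColorRoundTrip() {
	float f[4];
	Uint8x4ToFloat4(f, 0x80FF4020);  // f = { 0x20/255.0f, 0x40/255.0f, 0xFF/255.0f, 0x80/255.0f }
	return Float4ToUint8x4(f);       // in-range values survive the trip: returns 0x80FF4020
}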
inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, (u & 0xFFFFFF) | ((uint32_t)alpha << 24));
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, u & 0xFFFFFF);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = 0.0f;  // Match the SIMD path, which masks the top byte away.
#endif
}

inline void Uint8x3ToFloat3(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	float temp[4];
	Uint8x4ToFloat4(temp, u & 0xFFFFFF);
	f[0] = temp[0];
	f[1] = temp[1];
	f[2] = temp[2];
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToInt4(int i[4], uint32_t u) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = 0;
}

inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha;
}

// These are just for readability.

inline void CopyFloat2(float dest[2], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
}

inline void CopyFloat3(float dest[3], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
}

inline void CopyFloat4(float dest[4], const float src[4]) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_loadu_ps(src));
#else
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = src[3];
#endif
}

inline void CopyFloat1To4(float dest[4], const float src) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_set_ss(src));
#else
	dest[0] = src;
	dest[1] = 0.0f;
	dest[2] = 0.0f;
	dest[3] = 0.0f;
#endif
}

inline void CopyFloat2To4(float dest[4], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = 0.0f;
	dest[3] = 0.0f;
}

inline void CopyFloat3To4(float dest[4], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = 0.0f;
}

inline void CopyMatrix4x4(float dest[16], const float src[16]) {
	memcpy(dest, src, sizeof(float) * 16);
}

// WARNING: The SIMD paths quietly over-read src by 4 bytes (a 16-byte load from
// a 12-byte array), and leave garbage in dest[3].
inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) {
#ifdef _M_SSE
	__m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	_mm_storeu_si128((__m128i *)dest, values);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8);
	vst1q_u32((uint32_t *)dest, values);
#else
	uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
	memcpy(dest, temp, sizeof(float) * 4);
#endif
}
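// Example (a minimal sketch of what the << 8 above does): the low 24 bits of
// each src word hold the top 24 bits of an IEEE-754 float32 (sign, exponent,
// and the high mantissa bits), so shifting left by 8 reconstructs the float
// with its low mantissa bits zeroed. The function name is hypothetical.
inline float ExampleDecodeFloat24(uint32_t enc) {
	uint32_t bits = enc << 8;  // e.g. 0x3F8000 -> 0x3F800000, which is 1.0f
	float result;
	memcpy(&result, &bits, sizeof(result));  // bit-cast without aliasing UB
	return result;
}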
// Note: If the length is 0.0, the vector is left as 0.0 instead of being
// normalized. This is important.
inline void ExpandFloat24x3ToFloat4AndNormalize(float dest[4], const uint32_t src[3]) {
	float temp[4];
	ExpandFloat24x3ToFloat4(temp, src);
	// TODO: Reuse code from NormalizedOr001 and optimize
	float x = temp[0];
	float y = temp[1];
	float z = temp[2];
	float len = sqrtf(x * x + y * y + z * z);
	if (len != 0.0f)
		len = 1.0f / len;
	dest[0] = x * len;
	dest[1] = y * len;
	dest[2] = z * len;
	dest[3] = 0.0f;
}

inline uint32_t BytesToUint32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
	return (uint32_t)a | ((uint32_t)b << 8) | ((uint32_t)c << 16) | ((uint32_t)d << 24);
}

constexpr int32_t SignExtend8ToS32(uint32_t value) {
	// Replicates the sign bit (bit 7) into the upper 24 bits.
	return (int8_t)(value & 0xFF);
}

constexpr uint32_t SignExtend8ToU32(uint32_t value) {
	// Just treat the bits as unsigned.
	return (uint32_t)SignExtend8ToS32(value);
}

constexpr int32_t SignExtend16ToS32(uint32_t value) {
	// Same as SignExtend8ToS32, but the sign bit is bit 15.
	return (int16_t)(value & 0xFFFF);
}

constexpr uint32_t SignExtend16ToU32(uint32_t value) {
	return (uint32_t)SignExtend16ToS32(value);
}
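// Example (a minimal illustrative sketch of the intended use, per the comment at
// the top of this file): filling a std140-layout uniform block. The struct and
// all names here are hypothetical. std140 aligns vec3 members to 16 bytes, which
// is why the helpers take float[4] destinations and why the *To4 variants exist.
struct ExampleUBO {
	float tint[4];      // vec4 tint;
	float lightDir[4];  // vec3 lightDir; (padded to a 16-byte slot)
};

inline void ExampleFillUBO(ExampleUBO *ub, uint32_t tintColor, const float dir[3]) {
	Uint8x4ToFloat4(ub->tint, tintColor);
	CopyFloat3To4(ub->lightDir, dir);
}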