CoCalc -- MIPSVFPUFallbacks.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/MIPSVFPUFallbacks.cpp
Views: ¹⁴⁰¹
1
#include <cmath>
2

3
#include "Common/BitScan.h"
4

5
#include "Core/MIPS/MIPSVFPUFallbacks.h"
6
#include "Core/MIPS/MIPSVFPUUtils.h"
7

8
// MIPSVFPUUtils now has the high precision instructions implemented by fp64
9
// in https://github.com/hrydgard/ppsspp/pull/16984 .
10
//
11
// These are our older approximations that are quite good but has flaws,
12
// but we need them to fall back to if the table files are missing.
13
//
14
// Note that currently, some of the new functions are not integrated in the JIT
15
// and are thus not normally used anyway.
16

17
// First the "trivial" fallbacks where we haven't done any accuracy work previously.
18

19
float vfpu_asin_fallback(float angle) {
20
	return (float)(asinf(angle) / M_PI_2);
21
}
22

23
float vfpu_rcp_fallback(float x) {
24
	return 1.0f / x;
25
}
26

27
float vfpu_log2_fallback(float x) {
28
	return log2f(x);
29
}
30

31
float vfpu_exp2_fallback(float x) {
32
	return exp2f(x);
33
}
34

35
// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
36
// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
37
// to the shape of the sine curve which seem to be very slightly different.
38
//
39
// So setting a lower value.
40
#define PRECISION_EXP_THRESHOLD 0x65
41

42
union float2int {
43
	uint32_t i;
44
	float f;
45
};
46

47
float vfpu_sqrt_fallback(float a) {
48
	float2int val;
49
	val.f = a;
50

51
	if ((val.i & 0xff800000) == 0x7f800000) {
52
		if ((val.i & 0x007fffff) != 0) {
53
			val.i = 0x7f800001;
54
		}
55
		return val.f;
56
	}
57
	if ((val.i & 0x7f800000) == 0) {
58
		// Kill any sign.
59
		val.i = 0;
60
		return val.f;
61
	}
62
	if (val.i & 0x80000000) {
63
		val.i = 0x7f800001;
64
		return val.f;
65
	}
66

67
	int k = get_exp(val.i);
68
	uint32_t sp = get_mant(val.i);
69
	int less_bits = k & 1;
70
	k >>= 1;
71

72
	uint32_t z = 0x00C00000 >> less_bits;
73
	int64_t halfsp = sp >> 1;
74
	halfsp <<= 23 - less_bits;
75
	for (int i = 0; i < 6; ++i) {
76
		z = (z >> 1) + (uint32_t)(halfsp / z);
77
	}
78

79
	val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
80
	// The lower two bits never end up set on the PSP, it seems like.
81
	val.i &= 0xFFFFFFFC;
82

83
	return val.f;
84
}
85

86
static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
87
	uint64_t m = (uint64_t)a * (uint64_t)b;
88
	if (m & 0x007FFFFF) {
89
		m += 0x01437000;
90
	}
91
	return (uint32_t)(m >> 23);
92
}
93

94
float vfpu_rsqrt_fallback(float a) {
95
	float2int val;
96
	val.f = a;
97

98
	if (val.i == 0x7f800000) {
99
		return 0.0f;
100
	}
101
	if ((val.i & 0x7fffffff) > 0x7f800000) {
102
		val.i = (val.i & 0x80000000) | 0x7f800001;
103
		return val.f;
104
	}
105
	if ((val.i & 0x7f800000) == 0) {
106
		val.i = (val.i & 0x80000000) | 0x7f800000;
107
		return val.f;
108
	}
109
	if (val.i & 0x80000000) {
110
		val.i = 0xff800001;
111
		return val.f;
112
	}
113

114
	int k = get_exp(val.i);
115
	uint32_t sp = get_mant(val.i);
116
	int less_bits = k & 1;
117
	k = -(k >> 1);
118

119
	uint32_t z = 0x00800000 >> less_bits;
120
	uint32_t halfsp = sp >> (1 + less_bits);
121
	for (int i = 0; i < 6; ++i) {
122
		uint32_t zsq = mant_mul(z, z);
123
		uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
124
		z = mant_mul(z, correction);
125
	}
126

127
	int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
128
	if (shift < 1) {
129
		z >>= -shift;
130
		k += -shift;
131
	} else if (shift > 0) {
132
		z <<= shift;
133
		k -= shift;
134
	}
135

136
	z >>= less_bits;
137

138
	val.i = ((k + 127) << 23) | (z & 0x007FFFFF);
139
	val.i &= 0xFFFFFFFC;
140

141
	return val.f;
142
}
143

144
float vfpu_sin_fallback(float a) {
145
	float2int val;
146
	val.f = a;
147

148
	int32_t k = get_uexp(val.i);
149
	if (k == 255) {
150
		val.i = (val.i & 0xFF800001) | 1;
151
		return val.f;
152
	}
153

154
	if (k < PRECISION_EXP_THRESHOLD) {
155
		val.i &= 0x80000000;
156
		return val.f;
157
	}
158

159
	// Okay, now modulus by 4 to begin with (identical wave every 4.)
160
	int32_t mantissa = get_mant(val.i);
161
	if (k > 0x80) {
162
		const uint8_t over = k & 0x1F;
163
		mantissa = (mantissa << over) & 0x00FFFFFF;
164
		k = 0x80;
165
	}
166
	// This subtracts off the 2.  If we do, flip sign to inverse the wave.
167
	if (k == 0x80 && mantissa >= (1 << 23)) {
168
		val.i ^= 0x80000000;
169
		mantissa -= 1 << 23;
170
	}
171

172
	int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
173
	mantissa <<= norm_shift;
174
	k -= norm_shift;
175

176
	if (k <= 0 || mantissa == 0) {
177
		val.i &= 0x80000000;
178
		return val.f;
179
	}
180

181
	// This is the value with modulus applied.
182
	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
183
	val.f = (float)sin((double)val.f * M_PI_2);
184
	val.i &= 0xFFFFFFFC;
185
	return val.f;
186
}
187

188
float vfpu_cos_fallback(float a) {
189
	float2int val;
190
	val.f = a;
191
	bool negate = false;
192

193
	int32_t k = get_uexp(val.i);
194
	if (k == 255) {
195
		// Note: unlike sin, cos always returns +NAN.
196
		val.i = (val.i & 0x7F800001) | 1;
197
		return val.f;
198
	}
199

200
	if (k < PRECISION_EXP_THRESHOLD)
201
		return 1.0f;
202

203
	// Okay, now modulus by 4 to begin with (identical wave every 4.)
204
	int32_t mantissa = get_mant(val.i);
205
	if (k > 0x80) {
206
		const uint8_t over = k & 0x1F;
207
		mantissa = (mantissa << over) & 0x00FFFFFF;
208
		k = 0x80;
209
	}
210
	// This subtracts off the 2.  If we do, negate the result value.
211
	if (k == 0x80 && mantissa >= (1 << 23)) {
212
		mantissa -= 1 << 23;
213
		negate = true;
214
	}
215

216
	int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
217
	mantissa <<= norm_shift;
218
	k -= norm_shift;
219

220
	if (k <= 0 || mantissa == 0)
221
		return negate ? -1.0f : 1.0f;
222

223
	// This is the value with modulus applied.
224
	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
225
	if (val.f == 1.0f || val.f == -1.0f) {
226
		return negate ? 0.0f : -0.0f;
227
	}
228
	val.f = (float)cos((double)val.f * M_PI_2);
229
	val.i &= 0xFFFFFFFC;
230
	return negate ? -val.f : val.f;
231
}
232

233
void vfpu_sincos_fallback(float a, float &s, float &c) {
234
	float2int val;
235
	val.f = a;
236
	// For sin, negate the input, for cos negate the output.
237
	bool negate = false;
238

239
	int32_t k = get_uexp(val.i);
240
	if (k == 255) {
241
		val.i = (val.i & 0xFF800001) | 1;
242
		s = val.f;
243
		val.i &= 0x7F800001;
244
		c = val.f;
245
		return;
246
	}
247

248
	if (k < PRECISION_EXP_THRESHOLD) {
249
		val.i &= 0x80000000;
250
		s = val.f;
251
		c = 1.0f;
252
		return;
253
	}
254

255
	// Okay, now modulus by 4 to begin with (identical wave every 4.)
256
	int32_t mantissa = get_mant(val.i);
257
	if (k > 0x80) {
258
		const uint8_t over = k & 0x1F;
259
		mantissa = (mantissa << over) & 0x00FFFFFF;
260
		k = 0x80;
261
	}
262
	// This subtracts off the 2.  If we do, flip signs.
263
	if (k == 0x80 && mantissa >= (1 << 23)) {
264
		mantissa -= 1 << 23;
265
		negate = true;
266
	}
267

268
	int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
269
	mantissa <<= norm_shift;
270
	k -= norm_shift;
271

272
	if (k <= 0 || mantissa == 0) {
273
		val.i &= 0x80000000;
274
		if (negate)
275
			val.i ^= 0x80000000;
276
		s = val.f;
277
		c = negate ? -1.0f : 1.0f;
278
		return;
279
	}
280

281
	// This is the value with modulus applied.
282
	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
283
	float2int i_sine, i_cosine;
284
	if (val.f == 1.0f) {
285
		i_sine.f = negate ? -1.0f : 1.0f;
286
		i_cosine.f = negate ? 0.0f : -0.0f;
287
	} else if (val.f == -1.0f) {
288
		i_sine.f = negate ? 1.0f : -1.0f;
289
		i_cosine.f = negate ? 0.0f : -0.0f;
290
	} else if (negate) {
291
		i_sine.f = (float)sin((double)-val.f * M_PI_2);
292
		i_cosine.f = -(float)cos((double)val.f * M_PI_2);
293
	} else {
294
		double angle = (double)val.f * M_PI_2;
295
#if defined(__linux__)
296
		double d_sine;
297
		double d_cosine;
298
		sincos(angle, &d_sine, &d_cosine);
299
		i_sine.f = (float)d_sine;
300
		i_cosine.f = (float)d_cosine;
301
#else
302
		i_sine.f = (float)sin(angle);
303
		i_cosine.f = (float)cos(angle);
304
#endif
305
	}
306

307
	i_sine.i &= 0xFFFFFFFC;
308
	i_cosine.i &= 0xFFFFFFFC;
309
	s = i_sine.f;
310
	c = i_cosine.f;
311
	return;
312
}
313

314
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company