CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/MIPSVFPUFallbacks.cpp
Views: 1401
1
#include <cmath>
2
3
#include "Common/BitScan.h"
4
5
#include "Core/MIPS/MIPSVFPUFallbacks.h"
6
#include "Core/MIPS/MIPSVFPUUtils.h"
7
8
// MIPSVFPUUtils now has the high precision instructions implemented by fp64
9
// in https://github.com/hrydgard/ppsspp/pull/16984 .
10
//
11
// These are our older approximations that are quite good but has flaws,
12
// but we need them to fall back to if the table files are missing.
13
//
14
// Note that currently, some of the new functions are not integrated in the JIT
15
// and are thus not normally used anyway.
16
17
// First the "trivial" fallbacks where we haven't done any accuracy work previously.
18
19
float vfpu_asin_fallback(float angle) {
20
return (float)(asinf(angle) / M_PI_2);
21
}
22
23
float vfpu_rcp_fallback(float x) {
24
return 1.0f / x;
25
}
26
27
float vfpu_log2_fallback(float x) {
28
return log2f(x);
29
}
30
31
float vfpu_exp2_fallback(float x) {
32
return exp2f(x);
33
}
34
35
// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
36
// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
37
// to the shape of the sine curve which seem to be very slightly different.
38
//
39
// So setting a lower value.
40
#define PRECISION_EXP_THRESHOLD 0x65
41
42
union float2int {
43
uint32_t i;
44
float f;
45
};
46
47
float vfpu_sqrt_fallback(float a) {
48
float2int val;
49
val.f = a;
50
51
if ((val.i & 0xff800000) == 0x7f800000) {
52
if ((val.i & 0x007fffff) != 0) {
53
val.i = 0x7f800001;
54
}
55
return val.f;
56
}
57
if ((val.i & 0x7f800000) == 0) {
58
// Kill any sign.
59
val.i = 0;
60
return val.f;
61
}
62
if (val.i & 0x80000000) {
63
val.i = 0x7f800001;
64
return val.f;
65
}
66
67
int k = get_exp(val.i);
68
uint32_t sp = get_mant(val.i);
69
int less_bits = k & 1;
70
k >>= 1;
71
72
uint32_t z = 0x00C00000 >> less_bits;
73
int64_t halfsp = sp >> 1;
74
halfsp <<= 23 - less_bits;
75
for (int i = 0; i < 6; ++i) {
76
z = (z >> 1) + (uint32_t)(halfsp / z);
77
}
78
79
val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
80
// The lower two bits never end up set on the PSP, it seems like.
81
val.i &= 0xFFFFFFFC;
82
83
return val.f;
84
}
85
86
static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
87
uint64_t m = (uint64_t)a * (uint64_t)b;
88
if (m & 0x007FFFFF) {
89
m += 0x01437000;
90
}
91
return (uint32_t)(m >> 23);
92
}
93
94
float vfpu_rsqrt_fallback(float a) {
95
float2int val;
96
val.f = a;
97
98
if (val.i == 0x7f800000) {
99
return 0.0f;
100
}
101
if ((val.i & 0x7fffffff) > 0x7f800000) {
102
val.i = (val.i & 0x80000000) | 0x7f800001;
103
return val.f;
104
}
105
if ((val.i & 0x7f800000) == 0) {
106
val.i = (val.i & 0x80000000) | 0x7f800000;
107
return val.f;
108
}
109
if (val.i & 0x80000000) {
110
val.i = 0xff800001;
111
return val.f;
112
}
113
114
int k = get_exp(val.i);
115
uint32_t sp = get_mant(val.i);
116
int less_bits = k & 1;
117
k = -(k >> 1);
118
119
uint32_t z = 0x00800000 >> less_bits;
120
uint32_t halfsp = sp >> (1 + less_bits);
121
for (int i = 0; i < 6; ++i) {
122
uint32_t zsq = mant_mul(z, z);
123
uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
124
z = mant_mul(z, correction);
125
}
126
127
int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
128
if (shift < 1) {
129
z >>= -shift;
130
k += -shift;
131
} else if (shift > 0) {
132
z <<= shift;
133
k -= shift;
134
}
135
136
z >>= less_bits;
137
138
val.i = ((k + 127) << 23) | (z & 0x007FFFFF);
139
val.i &= 0xFFFFFFFC;
140
141
return val.f;
142
}
143
144
float vfpu_sin_fallback(float a) {
145
float2int val;
146
val.f = a;
147
148
int32_t k = get_uexp(val.i);
149
if (k == 255) {
150
val.i = (val.i & 0xFF800001) | 1;
151
return val.f;
152
}
153
154
if (k < PRECISION_EXP_THRESHOLD) {
155
val.i &= 0x80000000;
156
return val.f;
157
}
158
159
// Okay, now modulus by 4 to begin with (identical wave every 4.)
160
int32_t mantissa = get_mant(val.i);
161
if (k > 0x80) {
162
const uint8_t over = k & 0x1F;
163
mantissa = (mantissa << over) & 0x00FFFFFF;
164
k = 0x80;
165
}
166
// This subtracts off the 2. If we do, flip sign to inverse the wave.
167
if (k == 0x80 && mantissa >= (1 << 23)) {
168
val.i ^= 0x80000000;
169
mantissa -= 1 << 23;
170
}
171
172
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
173
mantissa <<= norm_shift;
174
k -= norm_shift;
175
176
if (k <= 0 || mantissa == 0) {
177
val.i &= 0x80000000;
178
return val.f;
179
}
180
181
// This is the value with modulus applied.
182
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
183
val.f = (float)sin((double)val.f * M_PI_2);
184
val.i &= 0xFFFFFFFC;
185
return val.f;
186
}
187
188
float vfpu_cos_fallback(float a) {
189
float2int val;
190
val.f = a;
191
bool negate = false;
192
193
int32_t k = get_uexp(val.i);
194
if (k == 255) {
195
// Note: unlike sin, cos always returns +NAN.
196
val.i = (val.i & 0x7F800001) | 1;
197
return val.f;
198
}
199
200
if (k < PRECISION_EXP_THRESHOLD)
201
return 1.0f;
202
203
// Okay, now modulus by 4 to begin with (identical wave every 4.)
204
int32_t mantissa = get_mant(val.i);
205
if (k > 0x80) {
206
const uint8_t over = k & 0x1F;
207
mantissa = (mantissa << over) & 0x00FFFFFF;
208
k = 0x80;
209
}
210
// This subtracts off the 2. If we do, negate the result value.
211
if (k == 0x80 && mantissa >= (1 << 23)) {
212
mantissa -= 1 << 23;
213
negate = true;
214
}
215
216
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
217
mantissa <<= norm_shift;
218
k -= norm_shift;
219
220
if (k <= 0 || mantissa == 0)
221
return negate ? -1.0f : 1.0f;
222
223
// This is the value with modulus applied.
224
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
225
if (val.f == 1.0f || val.f == -1.0f) {
226
return negate ? 0.0f : -0.0f;
227
}
228
val.f = (float)cos((double)val.f * M_PI_2);
229
val.i &= 0xFFFFFFFC;
230
return negate ? -val.f : val.f;
231
}
232
233
void vfpu_sincos_fallback(float a, float &s, float &c) {
234
float2int val;
235
val.f = a;
236
// For sin, negate the input, for cos negate the output.
237
bool negate = false;
238
239
int32_t k = get_uexp(val.i);
240
if (k == 255) {
241
val.i = (val.i & 0xFF800001) | 1;
242
s = val.f;
243
val.i &= 0x7F800001;
244
c = val.f;
245
return;
246
}
247
248
if (k < PRECISION_EXP_THRESHOLD) {
249
val.i &= 0x80000000;
250
s = val.f;
251
c = 1.0f;
252
return;
253
}
254
255
// Okay, now modulus by 4 to begin with (identical wave every 4.)
256
int32_t mantissa = get_mant(val.i);
257
if (k > 0x80) {
258
const uint8_t over = k & 0x1F;
259
mantissa = (mantissa << over) & 0x00FFFFFF;
260
k = 0x80;
261
}
262
// This subtracts off the 2. If we do, flip signs.
263
if (k == 0x80 && mantissa >= (1 << 23)) {
264
mantissa -= 1 << 23;
265
negate = true;
266
}
267
268
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
269
mantissa <<= norm_shift;
270
k -= norm_shift;
271
272
if (k <= 0 || mantissa == 0) {
273
val.i &= 0x80000000;
274
if (negate)
275
val.i ^= 0x80000000;
276
s = val.f;
277
c = negate ? -1.0f : 1.0f;
278
return;
279
}
280
281
// This is the value with modulus applied.
282
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
283
float2int i_sine, i_cosine;
284
if (val.f == 1.0f) {
285
i_sine.f = negate ? -1.0f : 1.0f;
286
i_cosine.f = negate ? 0.0f : -0.0f;
287
} else if (val.f == -1.0f) {
288
i_sine.f = negate ? 1.0f : -1.0f;
289
i_cosine.f = negate ? 0.0f : -0.0f;
290
} else if (negate) {
291
i_sine.f = (float)sin((double)-val.f * M_PI_2);
292
i_cosine.f = -(float)cos((double)val.f * M_PI_2);
293
} else {
294
double angle = (double)val.f * M_PI_2;
295
#if defined(__linux__)
296
double d_sine;
297
double d_cosine;
298
sincos(angle, &d_sine, &d_cosine);
299
i_sine.f = (float)d_sine;
300
i_cosine.f = (float)d_cosine;
301
#else
302
i_sine.f = (float)sin(angle);
303
i_cosine.f = (float)cos(angle);
304
#endif
305
}
306
307
i_sine.i &= 0xFFFFFFFC;
308
i_cosine.i &= 0xFFFFFFFC;
309
s = i_sine.f;
310
c = i_cosine.f;
311
return;
312
}
313
314