GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/MIPSVFPUUtils.cpp
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include <cstdint>
19
#include <limits>
20
#include <cstdio>
21
#include <cstring>
22
23
#include "Common/BitScan.h"
24
#include "Common/CommonFuncs.h"
25
#include "Common/File/VFS/VFS.h"
26
#include "Common/StringUtils.h"
27
#include "Core/Reporting.h"
28
#include "Core/MIPS/MIPS.h"
29
#include "Core/MIPS/MIPSVFPUUtils.h"
30
#include "Core/MIPS/MIPSVFPUFallbacks.h"
31
32
#ifdef _MSC_VER
33
#pragma warning(disable: 4146)
34
#endif
35
36
union float2int {
37
uint32_t i;
38
float f;
39
};
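// Note: float2int is used throughout this file for bit-level access to float
// representations. Type punning through a union is only formally guaranteed in C,
// but the major C++ compilers support it in practice.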
40
41
void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
42
int mtx = (vectorReg >> 2) & 7;
43
int col = vectorReg & 3;
44
int row = 0;
45
int length = 0;
46
int transpose = (vectorReg>>5) & 1;
47
48
switch (N) {
49
case V_Single: transpose = 0; row=(vectorReg>>5)&3; length = 1; break;
50
case V_Pair: row=(vectorReg>>5)&2; length = 2; break;
51
case V_Triple: row=(vectorReg>>6)&1; length = 3; break;
52
case V_Quad: row=(vectorReg>>5)&2; length = 4; break;
53
default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);
54
}
55
56
for (int i = 0; i < length; i++) {
57
int index = mtx * 4;
58
if (transpose)
59
index += ((row+i)&3) + col*32;
60
else
61
index += col + ((row+i)&3)*32;
62
regs[i] = index;
63
}
64
}
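// Worked example (illustrative): vectorReg = 0x23 with N = V_Quad decodes to
// mtx = 0, col = 3, transpose = 1, row = 0, so the loop above yields
// regs = {96, 97, 98, 99} (consecutive entries in the flat VFPU register file).
#if 0
// Minimal sanity-check sketch of the example above (hypothetical helper,
// not part of the original file).
static void GetVectorRegsExample() {
	u8 regs[4];
	GetVectorRegs(regs, V_Quad, 0x23);
	_dbg_assert_msg_(regs[0] == 96 && regs[3] == 99, "Unexpected register decoding");
}
#endif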
65
66
void GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {
67
int mtx = (matrixReg >> 2) & 7;
68
int col = matrixReg & 3;
69
70
int row = 0;
71
int side = 0;
72
int transpose = (matrixReg >> 5) & 1;
73
74
switch (N) {
75
case M_1x1: transpose = 0; row = (matrixReg >> 5) & 3; side = 1; break;
76
case M_2x2: row = (matrixReg >> 5) & 2; side = 2; break;
77
case M_3x3: row = (matrixReg >> 6) & 1; side = 3; break;
78
case M_4x4: row = (matrixReg >> 5) & 2; side = 4; break;
79
default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
80
}
81
82
for (int i = 0; i < side; i++) {
83
for (int j = 0; j < side; j++) {
84
int index = mtx * 4;
85
if (transpose)
86
index += ((row+i)&3) + ((col+j)&3)*32;
87
else
88
index += ((col+j)&3) + ((row+i)&3)*32;
89
regs[j*4 + i] = index;
90
}
91
}
92
}
93
94
int GetMatrixName(int matrix, MatrixSize msize, int column, int row, bool transposed) {
95
// TODO: Fix (?)
96
int name = (matrix * 4) | (transposed << 5);
97
switch (msize) {
98
case M_4x4:
99
if (row || column) {
100
ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i or column %i for size %i", row, column, msize);
101
}
102
break;
103
104
case M_3x3:
105
if (row & ~2) {
106
ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);
107
}
108
if (column & ~2) {
109
ERROR_LOG(Log::JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);
110
}
111
name |= (row << 6) | column;
112
break;
113
114
case M_2x2:
115
if (row & ~2) {
116
ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);
117
}
118
if (column & ~2) {
119
ERROR_LOG(Log::JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);
120
}
121
name |= (row << 5) | column;
122
break;
123
124
default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
125
}
126
127
return name;
128
}
129
130
int GetColumnName(int matrix, MatrixSize msize, int column, int offset) {
131
return matrix * 4 + column + offset * 32;
132
}
133
134
int GetRowName(int matrix, MatrixSize msize, int column, int offset) {
135
return 0x20 | (matrix * 4 + column + offset * 32);
136
}
137
138
void GetMatrixColumns(int matrixReg, MatrixSize msize, u8 vecs[4]) {
139
int n = GetMatrixSide(msize);
140
141
int col = matrixReg & 3;
142
int row = (matrixReg >> 5) & 2;
143
int transpose = (matrixReg >> 5) & 1;
144
145
for (int i = 0; i < n; i++) {
146
vecs[i] = (transpose << 5) | (row << 5) | (matrixReg & 0x1C) | (i + col);
147
}
148
}
149
150
void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {
151
int n = GetMatrixSide(msize);
152
int col = matrixReg & 3;
153
int row = (matrixReg >> 5) & 2;
154
155
int swappedCol = row ? (msize == M_3x3 ? 1 : 2) : 0;
156
int swappedRow = col ? 2 : 0;
157
int transpose = ((matrixReg >> 5) & 1) ^ 1;
158
159
for (int i = 0; i < n; i++) {
160
vecs[i] = (transpose << 5) | (swappedRow << 5) | (matrixReg & 0x1C) | (i + swappedCol);
161
}
162
}
163
164
void ReadVector(float *rd, VectorSize size, int reg) {
165
int row;
166
int length;
167
switch (size) {
168
case V_Single: rd[0] = currentMIPS->v[voffset[reg]]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
169
case V_Pair: row=(reg>>5)&2; length = 2; break;
170
case V_Triple: row=(reg>>6)&1; length = 3; break;
171
case V_Quad: row=(reg>>5)&2; length = 4; break;
172
default: length = 0; break;
173
}
174
int transpose = (reg >> 5) & 1;
175
const int mtx = ((reg << 2) & 0x70);
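// Note: ((reg << 2) & 0x70) == ((reg >> 2) & 7) * 16, i.e. the base index of the
// selected matrix (16 registers each) within currentMIPS->v.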
176
const int col = reg & 3;
177
// NOTE: We now skip the voffset lookups.
178
if (transpose) {
179
const int base = mtx + col;
180
for (int i = 0; i < length; i++)
181
rd[i] = currentMIPS->v[base + ((row + i) & 3) * 4];
182
} else {
183
const int base = mtx + col * 4;
184
for (int i = 0; i < length; i++)
185
rd[i] = currentMIPS->v[base + ((row + i) & 3)];
186
}
187
}
188
189
void WriteVector(const float *rd, VectorSize size, int reg) {
190
int row;
191
int length;
192
193
switch (size) {
194
case V_Single: if (!currentMIPS->VfpuWriteMask(0)) currentMIPS->v[voffset[reg]] = rd[0]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
195
case V_Pair: row=(reg>>5)&2; length = 2; break;
196
case V_Triple: row=(reg>>6)&1; length = 3; break;
197
case V_Quad: row=(reg>>5)&2; length = 4; break;
198
default: length = 0; break;
199
}
200
201
const int mtx = ((reg << 2) & 0x70);
202
const int col = reg & 3;
203
bool transpose = (reg >> 5) & 1;
204
// NOTE: We now skip the voffset lookups.
205
if (transpose) {
206
const int base = mtx + col;
207
if (currentMIPS->VfpuWriteMask() == 0) {
208
for (int i = 0; i < length; i++)
209
currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];
210
} else {
211
for (int i = 0; i < length; i++) {
212
if (!currentMIPS->VfpuWriteMask(i)) {
213
currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];
214
}
215
}
216
}
217
} else {
218
const int base = mtx + col * 4;
219
if (currentMIPS->VfpuWriteMask() == 0) {
220
for (int i = 0; i < length; i++)
221
currentMIPS->v[base + ((row + i) & 3)] = rd[i];
222
} else {
223
for (int i = 0; i < length; i++) {
224
if (!currentMIPS->VfpuWriteMask(i)) {
225
currentMIPS->v[base + ((row + i) & 3)] = rd[i];
226
}
227
}
228
}
229
}
230
}
231
232
u32 VFPURewritePrefix(int ctrl, u32 remove, u32 add) {
233
u32 prefix = currentMIPS->vfpuCtrl[ctrl];
234
return (prefix & ~remove) | add;
235
}
236
237
void ReadMatrix(float *rd, MatrixSize size, int reg) {
238
int row = 0;
239
int side = 0;
240
int transpose = (reg >> 5) & 1;
241
242
switch (size) {
243
case M_1x1: transpose = 0; row = (reg >> 5) & 3; side = 1; break;
244
case M_2x2: row = (reg >> 5) & 2; side = 2; break;
245
case M_3x3: row = (reg >> 6) & 1; side = 3; break;
246
case M_4x4: row = (reg >> 5) & 2; side = 4; break;
247
default: side = 0; break;
248
}
249
250
int mtx = (reg >> 2) & 7;
251
int col = reg & 3;
252
253
// The voffset ordering is now integrated in these formulas,
254
// eliminating a table lookup.
255
const float *v = currentMIPS->v + (size_t)mtx * 16;
256
if (transpose) {
257
if (side == 4 && col == 0 && row == 0) {
258
// Fast path: Simple 4x4 transpose. TODO: Optimize.
259
for (int j = 0; j < 4; j++) {
260
for (int i = 0; i < 4; i++) {
261
rd[j * 4 + i] = v[i * 4 + j];
262
}
263
}
264
} else {
265
for (int j = 0; j < side; j++) {
266
for (int i = 0; i < side; i++) {
267
int index = ((row + i) & 3) * 4 + ((col + j) & 3);
268
rd[j * 4 + i] = v[index];
269
}
270
}
271
}
272
} else {
273
if (side == 4 && col == 0 && row == 0) {
274
// Fast path
275
memcpy(rd, v, sizeof(float) * 16); // rd[j * 4 + i] = v[j * 4 + i];
276
} else {
277
for (int j = 0; j < side; j++) {
278
for (int i = 0; i < side; i++) {
279
int index = ((col + j) & 3) * 4 + ((row + i) & 3);
280
rd[j * 4 + i] = v[index];
281
}
282
}
283
}
284
}
285
}
286
287
void WriteMatrix(const float *rd, MatrixSize size, int reg) {
288
int mtx = (reg>>2)&7;
289
int col = reg&3;
290
291
int row;
292
int side;
293
int transpose = (reg >> 5) & 1;
294
295
switch (size) {
296
case M_1x1: transpose = 0; row = (reg >> 5) & 3; side = 1; break;
297
case M_2x2: row = (reg >> 5) & 2; side = 2; break;
298
case M_3x3: row = (reg >> 6) & 1; side = 3; break;
299
case M_4x4: row = (reg >> 5) & 2; side = 4; break;
300
default: side = 0;
301
}
302
303
if (currentMIPS->VfpuWriteMask() != 0) {
304
ERROR_LOG_REPORT(Log::CPU, "Write mask used with vfpu matrix instruction.");
305
}
306
307
// The voffset ordering is now integrated in these formulas,
308
// eliminating a table lookup.
309
float *v = currentMIPS->v + (size_t)mtx * 16;
310
if (transpose) {
311
if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
312
// Fast path: Simple 4x4 transpose. TODO: Optimize.
313
for (int j = 0; j < side; j++) {
314
for (int i = 0; i < side; i++) {
315
v[i * 4 + j] = rd[j * 4 + i];
316
}
317
}
318
} else {
319
for (int j = 0; j < side; j++) {
320
for (int i = 0; i < side; i++) {
321
if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
322
int index = ((row + i) & 3) * 4 + ((col + j) & 3);
323
v[index] = rd[j * 4 + i];
324
}
325
}
326
}
327
}
328
} else {
329
if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
330
memcpy(v, rd, sizeof(float) * 16); // v[j * 4 + i] = rd[j * 4 + i];
331
} else {
332
for (int j = 0; j < side; j++) {
333
for (int i = 0; i < side; i++) {
334
if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
335
int index = ((col + j) & 3) * 4 + ((row + i) & 3);
336
v[index] = rd[j * 4 + i];
337
}
338
}
339
}
340
}
341
}
342
}
343
344
int GetVectorOverlap(int vec1, VectorSize size1, int vec2, VectorSize size2) {
345
// Different matrices? Can't overlap, return early.
346
if (((vec1 >> 2) & 7) != ((vec2 >> 2) & 7))
347
return 0;
348
349
int n1 = GetNumVectorElements(size1);
350
int n2 = GetNumVectorElements(size2);
351
u8 regs1[4];
352
u8 regs2[4];
353
GetVectorRegs(regs1, size1, vec1);
354
GetVectorRegs(regs2, size2, vec2);
355
int count = 0;
356
for (int i = 0; i < n1; i++) {
357
for (int j = 0; j < n2; j++) {
358
if (regs1[i] == regs2[j])
359
count++;
360
}
361
}
362
return count;
363
}
364
365
VectorSize GetHalfVectorSizeSafe(VectorSize sz) {
366
switch (sz) {
367
case V_Pair: return V_Single;
368
case V_Quad: return V_Pair;
369
default: return V_Invalid;
370
}
371
}
372
373
VectorSize GetHalfVectorSize(VectorSize sz) {
374
VectorSize res = GetHalfVectorSizeSafe(sz);
375
_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);
376
return res;
377
}
378
379
VectorSize GetDoubleVectorSizeSafe(VectorSize sz) {
380
switch (sz) {
381
case V_Single: return V_Pair;
382
case V_Pair: return V_Quad;
383
default: return V_Invalid;
384
}
385
}
386
387
VectorSize GetDoubleVectorSize(VectorSize sz) {
388
VectorSize res = GetDoubleVectorSizeSafe(sz);
389
_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);
390
return res;
391
}
392
393
VectorSize GetVectorSizeSafe(MatrixSize sz) {
394
switch (sz) {
395
case M_1x1: return V_Single;
396
case M_2x2: return V_Pair;
397
case M_3x3: return V_Triple;
398
case M_4x4: return V_Quad;
399
default: return V_Invalid;
400
}
401
}
402
403
VectorSize GetVectorSize(MatrixSize sz) {
404
VectorSize res = GetVectorSizeSafe(sz);
405
_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);
406
return res;
407
}
408
409
MatrixSize GetMatrixSizeSafe(VectorSize sz) {
410
switch (sz) {
411
case V_Single: return M_1x1;
412
case V_Pair: return M_2x2;
413
case V_Triple: return M_3x3;
414
case V_Quad: return M_4x4;
415
default: return M_Invalid;
416
}
417
}
418
419
MatrixSize GetMatrixSize(VectorSize sz) {
420
MatrixSize res = GetMatrixSizeSafe(sz);
421
_assert_msg_(res != M_Invalid, "%s: Bad vector size", __FUNCTION__);
422
return res;
423
}
424
425
VectorSize MatrixVectorSizeSafe(MatrixSize sz) {
426
switch (sz) {
427
case M_1x1: return V_Single;
428
case M_2x2: return V_Pair;
429
case M_3x3: return V_Triple;
430
case M_4x4: return V_Quad;
431
default: return V_Invalid;
432
}
433
}
434
435
VectorSize MatrixVectorSize(MatrixSize sz) {
436
VectorSize res = MatrixVectorSizeSafe(sz);
437
_assert_msg_(res != V_Invalid, "%s: Bad matrix size", __FUNCTION__);
438
return res;
439
}
440
441
int GetMatrixSideSafe(MatrixSize sz) {
442
switch (sz) {
443
case M_1x1: return 1;
444
case M_2x2: return 2;
445
case M_3x3: return 3;
446
case M_4x4: return 4;
447
default: return 0;
448
}
449
}
450
451
int GetMatrixSide(MatrixSize sz) {
452
int res = GetMatrixSideSafe(sz);
453
_assert_msg_(res != 0, "%s: Bad matrix size", __FUNCTION__);
454
return res;
455
}
456
457
// TODO: Optimize
458
MatrixOverlapType GetMatrixOverlap(int mtx1, int mtx2, MatrixSize msize) {
459
int n = GetMatrixSide(msize);
460
461
if (mtx1 == mtx2)
462
return OVERLAP_EQUAL;
463
464
u8 m1[16];
465
u8 m2[16];
466
GetMatrixRegs(m1, msize, mtx1);
467
GetMatrixRegs(m2, msize, mtx2);
468
469
// Simply do an exhaustive search.
470
for (int x = 0; x < n; x++) {
471
for (int y = 0; y < n; y++) {
472
int val = m1[y * 4 + x];
473
for (int a = 0; a < n; a++) {
474
for (int b = 0; b < n; b++) {
475
if (m2[a * 4 + b] == val) {
476
return OVERLAP_PARTIAL;
477
}
478
}
479
}
480
}
481
}
482
483
return OVERLAP_NONE;
484
}
485
486
std::string GetVectorNotation(int reg, VectorSize size) {
487
int mtx = (reg>>2)&7;
488
int col = reg&3;
489
int row = 0;
490
int transpose = (reg>>5)&1;
491
char c;
492
switch (size) {
493
case V_Single: transpose=0; c='S'; row=(reg>>5)&3; break;
494
case V_Pair: c='C'; row=(reg>>5)&2; break;
495
case V_Triple: c='C'; row=(reg>>6)&1; break;
496
case V_Quad: c='C'; row=(reg>>5)&2; break;
497
default: c='?'; break;
498
}
499
if (transpose && c == 'C') c='R';
500
if (transpose)
501
return StringFromFormat("%c%i%i%i", c, mtx, row, col);
502
return StringFromFormat("%c%i%i%i", c, mtx, col, row);
503
}
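// For example (illustrative): reg = 0x23 with V_Quad gives mtx = 0, col = 3,
// row = 0, transpose = 1, so the notation is "R003" (a transposed column, i.e. a row).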
504
505
std::string GetMatrixNotation(int reg, MatrixSize size) {
506
int mtx = (reg>>2)&7;
507
int col = reg&3;
508
int row = 0;
509
int transpose = (reg>>5)&1;
510
char c;
511
switch (size)
512
{
513
case M_2x2: c='M'; row=(reg>>5)&2; break;
514
case M_3x3: c='M'; row=(reg>>6)&1; break;
515
case M_4x4: c='M'; row=(reg>>5)&2; break;
516
default: c='?'; break;
517
}
518
if (transpose && c=='M') c='E';
519
if (transpose)
520
return StringFromFormat("%c%i%i%i", c, mtx, row, col);
521
return StringFromFormat("%c%i%i%i", c, mtx, col, row);
522
}
523
524
bool GetVFPUCtrlMask(int reg, u32 *mask) {
525
switch (reg) {
526
case VFPU_CTRL_SPREFIX:
527
case VFPU_CTRL_TPREFIX:
528
*mask = 0x000FFFFF;
529
return true;
530
case VFPU_CTRL_DPREFIX:
531
*mask = 0x00000FFF;
532
return true;
533
case VFPU_CTRL_CC:
534
*mask = 0x0000003F;
535
return true;
536
case VFPU_CTRL_INF4:
537
*mask = 0xFFFFFFFF;
538
return true;
539
case VFPU_CTRL_RSV5:
540
case VFPU_CTRL_RSV6:
541
case VFPU_CTRL_REV:
542
// Don't change anything; these regs are read-only.
543
return false;
544
case VFPU_CTRL_RCX0:
545
case VFPU_CTRL_RCX1:
546
case VFPU_CTRL_RCX2:
547
case VFPU_CTRL_RCX3:
548
case VFPU_CTRL_RCX4:
549
case VFPU_CTRL_RCX5:
550
case VFPU_CTRL_RCX6:
551
case VFPU_CTRL_RCX7:
552
*mask = 0x3FFFFFFF;
553
return true;
554
default:
555
return false;
556
}
557
}
558
559
float Float16ToFloat32(unsigned short l)
560
{
561
float2int f2i;
562
563
unsigned short float16 = l;
564
unsigned int sign = (float16 >> VFPU_SH_FLOAT16_SIGN) & VFPU_MASK_FLOAT16_SIGN;
565
int exponent = (float16 >> VFPU_SH_FLOAT16_EXP) & VFPU_MASK_FLOAT16_EXP;
566
unsigned int fraction = float16 & VFPU_MASK_FLOAT16_FRAC;
567
568
float f;
569
if (exponent == VFPU_FLOAT16_EXP_MAX)
570
{
571
f2i.i = sign << 31;
572
f2i.i |= 255 << 23;
573
f2i.i |= fraction;
574
f = f2i.f;
575
}
576
else if (exponent == 0 && fraction == 0)
577
{
578
f = sign == 1 ? -0.0f : 0.0f;
579
}
580
else
581
{
582
if (exponent == 0)
583
{
584
do
585
{
586
fraction <<= 1;
587
exponent--;
588
}
589
while (!(fraction & (VFPU_MASK_FLOAT16_FRAC + 1)));
590
591
fraction &= VFPU_MASK_FLOAT16_FRAC;
592
}
593
594
/* Convert to 32-bit single-precision IEEE754. */
595
f2i.i = sign << 31;
596
f2i.i |= (exponent + 112) << 23;
597
f2i.i |= fraction << 13;
598
f=f2i.f;
599
}
600
return f;
601
}
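// Worked example (illustrative, assuming the half-float field layout from
// MIPSVFPUUtils.h): the bit pattern 0x3C00 has sign 0, exponent 15 and fraction 0,
// so the non-special path produces f2i.i = (15 + 112) << 23 = 0x3F800000, i.e. 1.0f.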
602
603
// Reference C++ version.
604
static float vfpu_dot_cpp(const float a[4], const float b[4]) {
605
static const int EXTRA_BITS = 2;
606
float2int result;
607
float2int src[2];
608
609
int32_t exps[4];
610
int32_t mants[4];
611
int32_t signs[4];
612
int32_t max_exp = 0;
613
int32_t last_inf = -1;
614
615
for (int i = 0; i < 4; i++) {
616
src[0].f = a[i];
617
src[1].f = b[i];
618
619
int32_t aexp = get_uexp(src[0].i);
620
int32_t bexp = get_uexp(src[1].i);
621
int32_t amant = get_mant(src[0].i) << EXTRA_BITS;
622
int32_t bmant = get_mant(src[1].i) << EXTRA_BITS;
623
624
exps[i] = aexp + bexp - 127;
625
if (aexp == 255) {
626
// INF * 0 = NAN
627
if ((src[0].i & 0x007FFFFF) != 0 || bexp == 0) {
628
result.i = 0x7F800001;
629
return result.f;
630
}
631
mants[i] = get_mant(0) << EXTRA_BITS;
632
exps[i] = 255;
633
} else if (bexp == 255) {
634
if ((src[1].i & 0x007FFFFF) != 0 || aexp == 0) {
635
result.i = 0x7F800001;
636
return result.f;
637
}
638
mants[i] = get_mant(0) << EXTRA_BITS;
639
exps[i] = 255;
640
} else {
641
// TODO: Adjust precision?
642
uint64_t adjust = (uint64_t)amant * (uint64_t)bmant;
643
mants[i] = (adjust >> (23 + EXTRA_BITS)) & 0x7FFFFFFF;
644
}
645
signs[i] = get_sign(src[0].i) ^ get_sign(src[1].i);
646
647
if (exps[i] > max_exp) {
648
max_exp = exps[i];
649
}
650
if (exps[i] >= 255) {
651
// Infinity minus infinity is not a real number.
652
if (last_inf != -1 && signs[i] != last_inf) {
653
result.i = 0x7F800001;
654
return result.f;
655
}
656
last_inf = signs[i];
657
}
658
}
659
660
int32_t mant_sum = 0;
661
for (int i = 0; i < 4; i++) {
662
int exp = max_exp - exps[i];
663
if (exp >= 32) {
664
mants[i] = 0;
665
} else {
666
mants[i] >>= exp;
667
}
668
if (signs[i]) {
669
mants[i] = -mants[i];
670
}
671
mant_sum += mants[i];
672
}
673
674
uint32_t sign_sum = 0;
675
if (mant_sum < 0) {
676
sign_sum = 0x80000000;
677
mant_sum = -mant_sum;
678
}
679
680
// Truncate off the extra bits now. We want to zero them for rounding purposes.
681
mant_sum >>= EXTRA_BITS;
682
683
if (mant_sum == 0 || max_exp <= 0) {
684
return 0.0f;
685
}
686
687
int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
688
if (shift < 0) {
689
// Round to even if we'd shift away a 0.5.
690
const uint32_t round_bit = 1 << (-shift - 1);
691
if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
692
mant_sum += round_bit;
693
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
694
} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
695
mant_sum += round_bit;
696
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
697
}
698
mant_sum >>= -shift;
699
max_exp += -shift;
700
} else {
701
mant_sum <<= shift;
702
max_exp -= shift;
703
}
704
_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);
705
706
if (max_exp >= 255) {
707
max_exp = 255;
708
mant_sum = 0;
709
} else if (max_exp <= 0) {
710
return 0.0f;
711
}
712
713
result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
714
return result.f;
715
}
716
717
#if defined(__SSE2__)
718
719
#include <emmintrin.h>
720
721
static inline __m128i mulhi32x4(__m128i a, __m128i b) {
722
__m128i m02 = _mm_mul_epu32(a, b);
723
__m128i m13 = _mm_mul_epu32(
724
_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
725
_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
726
__m128i m = _mm_unpacklo_epi32(
727
_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 3, 1)),
728
_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 3, 1)));
729
return m;
730
}
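// mulhi32x4 returns, per 32-bit lane, the high half of the unsigned 64-bit product
// of the corresponding lanes of a and b. For example (illustrative), a lane with
// a = 0x80000000 and b = 4 produces (0x200000000 >> 32) = 2.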
731
732
// Values of rounding_mode:
733
// -1 - detect at runtime
734
// 0 - assume round-to-nearest-ties-to-even
735
// 1 - round yourself in integer math
736
template<int rounding_mode=-1>
737
static float vfpu_dot_sse2(const float a[4], const float b[4])
738
{
739
static const int EXTRA_BITS = 2;
740
741
bool is_default_rounding_mode = (rounding_mode == 0);
742
if(rounding_mode == -1)
743
{
744
volatile float test05 = 5.9604644775390625e-08f; // 0.5*2^-23
745
volatile float test15 = 1.78813934326171875e-07f; // 1.5*2^-23
746
const float res15 = 1.0000002384185791015625f; // 1+2^-22
747
test05 += 1.0f;
748
test15 += 1.0f;
749
is_default_rounding_mode = (test05 == 1.0f && test15 == res15);
750
}
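// The detection above relies on ties-to-even behaviour: 1.0f + 0.5 ulp is exactly
// halfway between 1.0f and 1.0f + 2^-23 and rounds back to 1.0f (even mantissa),
// while 1.0f + 1.5 ulp is halfway between 1.0f + 2^-23 and 1.0f + 2^-22 and rounds
// up to 1.0f + 2^-22. Any other rounding mode fails at least one of these checks.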
751
__m128 A = _mm_loadu_ps(a);
752
__m128 B = _mm_loadu_ps(b);
753
// Extract exponents.
754
__m128 exp_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
755
__m128 eA = _mm_and_ps(A, exp_mask);
756
__m128 eB = _mm_and_ps(B, exp_mask);
757
__m128i exps = _mm_srli_epi32(_mm_add_epi32(
758
_mm_castps_si128(eA),
759
_mm_castps_si128(eB)),23);
760
// Find maximum exponent, stored as float32 in [1;2),
761
// so we can use _mm_max_ps() with normal arguments.
762
__m128 t = _mm_or_ps(_mm_castsi128_ps(exps), _mm_set1_ps(1.0f));
763
t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(2, 3, 0, 1))));
764
t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(1, 0, 3, 2))));
765
t = _mm_max_ps(t, _mm_castsi128_ps(_mm_set1_epi32(0x3F80007F)));
766
int32_t mexp = _mm_cvtsi128_si32(_mm_castps_si128(t)) & 511;
767
// NOTE: mexp is doubly-biased, same for exps.
768
int32_t max_exp = mexp - 127;
769
// Fall back on anything weird.
770
__m128 finiteA = _mm_sub_ps(A, A);
771
__m128 finiteB = _mm_sub_ps(B, B);
772
finiteA = _mm_cmpeq_ps(finiteA, finiteA);
773
finiteB = _mm_cmpeq_ps(finiteB, finiteB);
774
if(max_exp >= 255 || _mm_movemask_ps(_mm_and_ps(finiteA, finiteB)) != 15) return vfpu_dot_cpp(a, b);
775
// Extract significands.
776
__m128i mA = _mm_or_si128(_mm_and_si128(_mm_castps_si128(A),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));
777
__m128i mB = _mm_or_si128(_mm_and_si128(_mm_castps_si128(B),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));
778
// Multiply.
779
// NOTE: vfpu_dot does multiplication as
780
// ((x<<EXTRA_BITS)*(y<<EXTRA_BITS))>>(23+EXTRA_BITS),
781
// here we do (x*y)>>(23-EXTRA_BITS-1),
782
// which produces twice the result (neither expression
783
// overflows in our case). We need that because our
784
// variable-shift scheme (below) must shift by at least 1 bit.
785
static const int s = 32 - (23 - EXTRA_BITS - 1), s0 = s / 2, s1 = s - s0;
786
// We compute ((x*y)>>shift) as
787
// (((x*y)<<(32-shift))>>32), which we express as
788
// (((x<<s0)*(y<<s1))>>32) (neither shift overflows).
789
__m128i m = mulhi32x4(_mm_slli_epi32(mA, s0), _mm_slli_epi32(mB, s1));
790
// Shift according to max_exp. Since SSE2 doesn't have
791
// variable per-lane shifts, we multiply *again*,
792
// specifically, x>>y turns into (x<<(1<<(32-y)))>>32.
793
// We compute 1<<(32-y) using floating-point casts.
794
// NOTE: the cast for 1<<31 produces the correct value,
795
// since the _mm_cvttps_epi32 error code just happens
796
// to be 0x80000000.
797
// So (since we pre-multiplied m by 2), we need
798
// (m>>1)>>(mexp-exps),
799
// i.e. m>>(mexp+1-exps),
800
// i.e. (m<<(32-(mexp+1-exps)))>>32,
801
// i.e. (m<<(exps-(mexp-31)))>>32.
802
__m128i amounts = _mm_sub_epi32(exps, _mm_set1_epi32(mexp - 31));
803
// Clamp by 0. Both zero and negative amounts produce zero,
804
// since they correspond to right-shifting by 32 or more bits.
805
amounts = _mm_and_si128(amounts, _mm_cmpgt_epi32(amounts, _mm_set1_epi32(0)));
806
// Set up multipliers.
807
__m128i bits = _mm_add_epi32(_mm_set1_epi32(0x3F800000), _mm_slli_epi32(amounts, 23));
808
__m128i muls = _mm_cvttps_epi32(_mm_castsi128_ps(bits));
809
m = mulhi32x4(m, muls);
810
// Extract signs.
811
__m128i signs = _mm_cmpgt_epi32(
812
_mm_set1_epi32(0),
813
_mm_xor_si128(_mm_castps_si128(A), _mm_castps_si128(B)));
814
// Apply signs to m.
815
m = _mm_sub_epi32(_mm_xor_si128(m, signs), signs);
816
// Horizontal sum.
817
// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
818
__m128i h64 = _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2));
819
__m128i s64 = _mm_add_epi32(h64, m);
820
__m128i h32 = _mm_shufflelo_epi16(s64, _MM_SHUFFLE(1, 0, 3, 2));
821
__m128i s32 = _mm_add_epi32(s64, h32);
822
int32_t mant_sum = _mm_cvtsi128_si32(s32);
823
824
// The rest is scalar.
825
uint32_t sign_sum = 0;
826
if (mant_sum < 0) {
827
sign_sum = 0x80000000;
828
mant_sum = -mant_sum;
829
}
830
831
// Truncate off the extra bits now. We want to zero them for rounding purposes.
832
mant_sum >>= EXTRA_BITS;
833
834
if (mant_sum == 0 || max_exp <= 0) {
835
return 0.0f;
836
}
837
838
if(is_default_rounding_mode)
839
{
840
float2int r;
841
r.f = (float)mant_sum;
842
mant_sum = (r.i & 0x007FFFFF) | 0x00800000;
843
max_exp += (r.i >> 23) - 0x96;
844
}
845
else
846
{
847
int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
848
if (shift < 0) {
849
// Round to even if we'd shift away a 0.5.
850
const uint32_t round_bit = 1 << (-shift - 1);
851
if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
852
mant_sum += round_bit;
853
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
854
} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
855
mant_sum += round_bit;
856
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
857
}
858
mant_sum >>= -shift;
859
max_exp += -shift;
860
} else {
861
mant_sum <<= shift;
862
max_exp -= shift;
863
}
864
_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);
865
}
866
867
if (max_exp >= 255) {
868
max_exp = 255;
869
mant_sum = 0;
870
} else if (max_exp <= 0) {
871
return 0.0f;
872
}
873
874
float2int result;
875
result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
876
return result.f;
877
}
878
879
#endif // defined(__SSE2__)
880
881
float vfpu_dot(const float a[4], const float b[4]) {
882
#if defined(__SSE2__)
883
return vfpu_dot_sse2(a, b);
884
#else
885
return vfpu_dot_cpp(a, b);
886
#endif
887
}
888
889
//==============================================================================
890
// The code below attempts to exactly match behaviour of
891
// PSP's vrnd instructions. See investigation starting around
892
// https://github.com/hrydgard/ppsspp/issues/16946#issuecomment-1467261209
893
// for details.
894
895
// Redundant currently, since MIPSState::Init() already
896
// does this on its own, but left as-is to be self-contained.
897
void vrnd_init_default(uint32_t *rcx) {
898
rcx[0] = 0x00000001;
899
rcx[1] = 0x00000002;
900
rcx[2] = 0x00000004;
901
rcx[3] = 0x00000008;
902
rcx[4] = 0x00000000;
903
rcx[5] = 0x00000000;
904
rcx[6] = 0x00000000;
905
rcx[7] = 0x00000000;
906
}
907
908
void vrnd_init(uint32_t seed, uint32_t *rcx) {
909
for(int i = 0; i < 8; ++i) rcx[i] =
910
0x3F800000u | // 1.0f mask.
911
((seed >> ((i / 4) * 16)) & 0xFFFFu) | // lower or upper half of the seed.
912
(((seed >> (4 * i)) & 0xF) << 16); // remaining nibble.
913
914
}
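// Worked example (illustrative): with seed = 0x12345678, i = 0 picks the low half
// 0x5678 and nibble 0x8, giving rcx[0] = 0x3F800000 | 0x5678 | (0x8 << 16) = 0x3F885678.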
915
916
uint32_t vrnd_generate(uint32_t *rcx) {
917
// The actual RNG state appears to be 5 parts
918
// (32-bit each) stored into the registers as follows:
919
uint32_t A = (rcx[0] & 0xFFFFu) | (rcx[4] << 16);
920
uint32_t B = (rcx[1] & 0xFFFFu) | (rcx[5] << 16);
921
uint32_t C = (rcx[2] & 0xFFFFu) | (rcx[6] << 16);
922
uint32_t D = (rcx[3] & 0xFFFFu) | (rcx[7] << 16);
923
uint32_t E = (((rcx[0] >> 16) & 0xF) << 0) |
924
(((rcx[1] >> 16) & 0xF) << 4) |
925
(((rcx[2] >> 16) & 0xF) << 8) |
926
(((rcx[3] >> 16) & 0xF) << 12) |
927
(((rcx[4] >> 16) & 0xF) << 16) |
928
(((rcx[5] >> 16) & 0xF) << 20) |
929
(((rcx[6] >> 16) & 0xF) << 24) |
930
(((rcx[7] >> 16) & 0xF) << 28);
931
// Update.
932
// LCG with classic parameters.
933
A = 69069u * A + 1u; // NOTE: decimal constants.
934
// Xorshift, with classic parameters. Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs".
935
B ^= B << 13;
936
B ^= B >> 17;
937
B ^= B << 5;
938
// Sequence similar to Pell numbers ( https://en.wikipedia.org/wiki/Pell_number ),
939
// except with different starting values, and an occasional increment (E).
940
uint32_t t= 2u * D + C + E;
941
// NOTE: the details of how the E part is set are somewhat of a guess
942
// at the moment. The expression below looks weird, but does match
943
// the available test data.
944
E = uint32_t((uint64_t(C) + uint64_t(D >> 1) + uint64_t(E)) >> 32);
945
C = D;
946
D = t;
947
// Store.
948
rcx[0] = 0x3F800000u | (((E >> 0) & 0xF) << 16) | (A & 0xFFFFu);
949
rcx[1] = 0x3F800000u | (((E >> 4) & 0xF) << 16) | (B & 0xFFFFu);
950
rcx[2] = 0x3F800000u | (((E >> 8) & 0xF) << 16) | (C & 0xFFFFu);
951
rcx[3] = 0x3F800000u | (((E >> 12) & 0xF) << 16) | (D & 0xFFFFu);
952
rcx[4] = 0x3F800000u | (((E >> 16) & 0xF) << 16) | (A >> 16);
953
rcx[5] = 0x3F800000u | (((E >> 20) & 0xF) << 16) | (B >> 16);
954
rcx[6] = 0x3F800000u | (((E >> 24) & 0xF) << 16) | (C >> 16);
955
rcx[7] = 0x3F800000u | (((E >> 28) & 0xF) << 16) | (D >> 16);
956
// Return value.
957
return A + B + D;
958
}
959
960
//==============================================================================
961
// The code below attempts to exactly match the output of
962
// several of the PSP's VFPU functions. For the sake of
963
// making lookup tables smaller the code is
964
// somewhat gnarly.
965
// Lookup tables sometimes store deltas from (explicitly computable)
966
// estimates, which allows storing them in smaller types.
967
// See https://github.com/hrydgard/ppsspp/issues/16946 for details.
968
969
// Lookup tables.
970
// Note: these are never unloaded; they stay until program termination.
971
static uint32_t (*vfpu_sin_lut8192)=nullptr;
972
static int8_t (*vfpu_sin_lut_delta)[2]=nullptr;
973
static int16_t (*vfpu_sin_lut_interval_delta)=nullptr;
974
static uint8_t (*vfpu_sin_lut_exceptions)=nullptr;
975
976
static int8_t (*vfpu_sqrt_lut)[2]=nullptr;
977
978
static int8_t (*vfpu_rsqrt_lut)[2]=nullptr;
979
980
static uint32_t (*vfpu_exp2_lut65536)=nullptr;
981
static uint8_t (*vfpu_exp2_lut)[2]=nullptr;
982
983
static uint32_t (*vfpu_log2_lut65536)=nullptr;
984
static uint32_t (*vfpu_log2_lut65536_quadratic)=nullptr;
985
static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr;
986
987
static int32_t (*vfpu_asin_lut65536)[3]=nullptr;
988
static uint64_t (*vfpu_asin_lut_deltas)=nullptr;
989
static uint16_t (*vfpu_asin_lut_indices)=nullptr;
990
991
static int8_t (*vfpu_rcp_lut)[2]=nullptr;
992
993
template<typename T>
994
static inline bool load_vfpu_table(T *&ptr, const char *filename, size_t expected_size) {
995
#if COMMON_BIG_ENDIAN
996
// Tables are little-endian.
997
#error Byteswap for VFPU tables not implemented
998
#endif
999
if (ptr) return true; // Already loaded.
1000
size_t size = 0u;
1001
INFO_LOG(Log::CPU, "Loading '%s'...", filename);
1002
ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
1003
if (!ptr || size != expected_size) {
1004
ERROR_LOG(Log::CPU, "Error loading '%s' (size=%u, expected: %u)", filename, (unsigned)size, (unsigned)expected_size);
1005
delete[] ptr;
1006
ptr = nullptr;
1007
return false;
1008
}
1009
INFO_LOG(Log::CPU, "Successfully loaded '%s'", filename);
1010
return true;
1011
}
1012
1013
#define LOAD_TABLE(name, expected_size)\
1014
load_vfpu_table(name,"vfpu/" #name ".dat",expected_size)
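// For example, LOAD_TABLE(vfpu_sin_lut8192, 4100) expands to
// load_vfpu_table(vfpu_sin_lut8192, "vfpu/vfpu_sin_lut8192.dat", 4100).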
1015
1016
// Note: PSP sin/cos output only has 22 significant
1017
// binary digits.
1018
static inline uint32_t vfpu_sin_quantum(uint32_t x) {
1019
return x < 1u << 22?
1020
1u:
1021
1u << (32 - 22 - clz32_nonzero(x));
1022
}
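// In other words, the quantum is 1 until x reaches 2^22, then grows so that x always
// keeps 22 significant bits. For example (illustrative), x = 0x10000000 (bit 28 set)
// gives 1u << (32 - 22 - 3) = 128.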
1023
1024
static inline uint32_t vfpu_sin_truncate_bits(u32 x) {
1025
return x & -vfpu_sin_quantum(x);
1026
}
1027
1028
static inline uint32_t vfpu_sin_fixed(uint32_t arg) {
1029
// Handle endpoints.
1030
if(arg == 0u) return 0u;
1031
if(arg == 0x00800000) return 0x10000000;
1032
// Get endpoints for 8192-wide interval.
1033
uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0];
1034
uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1];
1035
// Approximate endpoints for 64-wide interval via lerp.
1036
uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7);
1037
uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7);
1038
// Adjust endpoints from deltas, and increase working precision.
1039
uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A);
1040
uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B);
1041
// Compute approximation via lerp. Is off by at most 1 quantum.
1042
uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5);
1043
v=vfpu_sin_truncate_bits(v);
1044
// Look up exceptions via binary search.
1045
// Note: vfpu_sin_lut_interval_delta stores
1046
// deltas from interval estimation.
1047
uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u;
1048
uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u;
1049
while(lo < hi) {
1050
uint32_t m = (lo + hi) / 2;
1051
// Note: vfpu_sin_lut_exceptions stores
1052
// index&127 (for each initial interval the
1053
// upper bits of index are the same, namely
1054
// arg&-128), plus direction (0 for +1, and
1055
// 128 for -1).
1056
uint32_t b = vfpu_sin_lut_exceptions[m];
1057
uint32_t e = (arg & -128u)+(b & 127u);
1058
if(e == arg) {
1059
v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u);
1060
break;
1061
}
1062
else if(e < arg) lo = m + 1;
1063
else hi = m;
1064
}
1065
return v;
1066
}
1067
1068
float vfpu_sin(float x) {
1069
static bool loaded =
1070
LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
1071
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
1072
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
1073
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
1074
if (!loaded)
1075
return vfpu_sin_fallback(x);
1076
uint32_t bits;
1077
memcpy(&bits, &x, sizeof(x));
1078
uint32_t sign = bits & 0x80000000u;
1079
uint32_t exponent = (bits >> 23) & 0xFFu;
1080
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
1081
if(exponent == 0xFFu) {
1082
// NOTE: this bitpattern is a signaling
1083
// NaN on x86, so maybe just return
1084
// a normal qNaN?
1085
float y;
1086
bits=sign ^ 0x7F800001u;
1087
memcpy(&y, &bits, sizeof(y));
1088
return y;
1089
}
1090
if(exponent < 0x7Fu) {
1091
if(exponent < 0x7Fu-23u) significand = 0u;
1092
else significand >>= (0x7F - exponent);
1093
}
1094
else if(exponent > 0x7Fu) {
1095
// There is weirdness for large exponents.
1096
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
1097
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
1098
else significand <<= (exponent - 0x7Fu);
1099
}
1100
sign ^= ((significand << 7) & 0x80000000u);
1101
significand &= 0x00FFFFFFu;
1102
if(significand > 0x00800000u) significand = 0x01000000u - significand;
1103
uint32_t ret = vfpu_sin_fixed(significand);
1104
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
1105
}
1106
1107
float vfpu_cos(float x) {
1108
static bool loaded =
1109
LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
1110
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
1111
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
1112
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
1113
if (!loaded)
1114
return vfpu_cos_fallback(x);
1115
uint32_t bits;
1116
memcpy(&bits, &x, sizeof(x));
1117
bits &= 0x7FFFFFFFu;
1118
uint32_t sign = 0u;
1119
uint32_t exponent = (bits >> 23) & 0xFFu;
1120
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
1121
if(exponent == 0xFFu) {
1122
// NOTE: this bitpattern is a signaling
1123
// NaN on x86, so maybe just return
1124
// a normal qNaN?
1125
float y;
1126
bits = sign ^ 0x7F800001u;
1127
memcpy(&y, &bits, sizeof(y));
1128
return y;
1129
}
1130
if(exponent < 0x7Fu) {
1131
if(exponent < 0x7Fu - 23u) significand = 0u;
1132
else significand >>= (0x7F - exponent);
1133
}
1134
else if(exponent > 0x7Fu) {
1135
// There is weirdness for large exponents.
1136
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
1137
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
1138
else significand <<= (exponent - 0x7Fu);
1139
}
1140
sign ^= ((significand << 7) & 0x80000000u);
1141
significand &= 0x00FFFFFFu;
1142
if(significand >= 0x00800000u) {
1143
significand = 0x01000000u - significand;
1144
sign ^= 0x80000000u;
1145
}
1146
uint32_t ret = vfpu_sin_fixed(0x00800000u - significand);
1147
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
1148
}
1149
1150
void vfpu_sincos(float a, float &s, float &c) {
1151
// Just invoke both sin and cos.
1152
// Suboptimal but whatever.
1153
s = vfpu_sin(a);
1154
c = vfpu_cos(a);
1155
}
1156
1157
// Integer square root of 2^23*x (rounded to zero).
1158
// Input is in 2^23 <= x < 2^25, and representable in float.
1159
static inline uint32_t isqrt23(uint32_t x) {
1160
#if 0
1161
// Reference version.
1162
int dir=fesetround(FE_TOWARDZERO);
1163
uint32_t ret=uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f)));
1164
fesetround(dir);
1165
return ret;
1166
#elif 1
1167
// Double version.
1168
// Verified to produce correct result for all valid inputs,
1169
// in all rounding modes, both in double and double-extended (x87)
1170
// precision.
1171
// Requires correctly-rounded sqrt (which on any IEEE-754 system
1172
// it should be).
1173
return uint32_t(int32_t(sqrt(double(x) * 8388608.0)));
1174
#else
1175
// Pure integer version, if you don't like floating point.
1176
// Based on code from Hacker's Delight. See isqrt4 in
1177
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
1178
// Relatively slow.
1179
uint64_t t=uint64_t(x) << 23, m, y, b;
1180
m=0x4000000000000000ull;
1181
y=0;
1182
while(m != 0) // Do 32 times.
1183
{
1184
b=y | m;
1185
y=y >> 1;
1186
if(t >= b)
1187
{
1188
t = t - b;
1189
y = y | m;
1190
}
1191
m = m >> 2;
1192
}
1193
return y;
1194
#endif
1195
}
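// For example (illustrative): isqrt23(1 << 23) computes
// floor(sqrt(2^23 * 2^23)) = 2^23 = 8388608.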
1196
1197
// Returns floating-point bitpattern.
1198
static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
1199
// Endpoints of input.
1200
uint32_t lo =(x + 0u) & -64u;
1201
uint32_t hi = (x + 64u) & -64u;
1202
// Convert input to 9.23 fixed-point.
1203
lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo);
1204
hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi);
1205
// Estimate endpoints of output.
1206
uint32_t A = 0x3F000000u + isqrt23(lo);
1207
uint32_t B = 0x3F000000u + isqrt23(hi);
1208
// Apply deltas, and increase the working precision.
1209
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]);
1210
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]);
1211
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
1212
// Truncate lower 2 bits.
1213
ret &= -4u;
1214
return ret;
1215
}
1216
1217
float vfpu_sqrt(float x) {
1218
static bool loaded =
1219
LOAD_TABLE(vfpu_sqrt_lut, 262144);
1220
if (!loaded)
1221
return vfpu_sqrt_fallback(x);
1222
uint32_t bits;
1223
memcpy(&bits, &x, sizeof(bits));
1224
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
1225
// Denormals (and zeroes) get +0, regardless
1226
// of sign.
1227
return +0.0f;
1228
}
1229
if(bits >> 31) {
1230
// Other negatives get NaN.
1231
bits = 0x7F800001u;
1232
memcpy(&x, &bits, sizeof(x));
1233
return x;
1234
}
1235
if((bits >> 23) == 255u) {
1236
// Inf/NaN gets Inf/NaN.
1237
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u);
1238
memcpy(&x, &bits, sizeof(bits));
1239
return x;
1240
}
1241
int32_t exponent = int32_t(bits >> 23) - 127;
1242
// Bottom bit of exponent (inverted) + significand (except bottom bit).
1243
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
1244
bits = vfpu_sqrt_fixed(index);
1245
bits += uint32_t(exponent >> 1) << 23;
1246
memcpy(&x, &bits, sizeof(bits));
1247
return x;
1248
}
1249
1250
// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
1251
static inline uint32_t rsqrt_floor22(uint32_t x) {
1252
#if 1
1253
// Verified correct in all rounding directions,
1254
// by exhaustive search.
1255
return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33
1256
#else
1257
// Pure integer version, if you don't like floating point.
1258
// Based on code from Hacker's Delight. See isqrt4 in
1259
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
1260
// Relatively slow.
1261
uint64_t t=uint64_t(x) << 22, m, y, b;
1262
m = 0x4000000000000000ull;
1263
y = 0;
1264
while(m != 0) // Do 32 times.
1265
{
1266
b = y | m;
1267
y = y >> 1;
1268
if(t >= b)
1269
{
1270
t = t - b;
1271
y = y | m;
1272
}
1273
m = m >> 2;
1274
}
1275
y = (1ull << 44) / y;
1276
// Decrement if y > floor(2^33 / sqrt(x)).
1277
// This hack works because exhaustive
1278
// search (on [2^22;2^24]) says it does.
1279
if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y;
1280
return uint32_t(y);
1281
#endif
1282
}
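// For example (illustrative): rsqrt_floor22(1 << 22) computes
// floor(2^33 / sqrt(2^22)) = 2^33 / 2^11 = 2^22 = 4194304.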
1283
1284
// Returns floating-point bitpattern.
1285
static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
1286
// Endpoints of input.
1287
uint32_t lo = (x + 0u) & -64u;
1288
uint32_t hi = (x + 64u) & -64u;
1289
// Convert input to 10.22 fixed-point.
1290
lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo);
1291
hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi);
1292
// Estimate endpoints of output.
1293
uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo);
1294
uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi);
1295
// Apply deltas, and increase the working precision.
1296
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]);
1297
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]);
1298
// Evaluate via lerp.
1299
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
1300
// Truncate lower 2 bits.
1301
ret &= -4u;
1302
return ret;
1303
}
1304
1305
float vfpu_rsqrt(float x) {
1306
static bool loaded =
1307
LOAD_TABLE(vfpu_rsqrt_lut, 262144);
1308
if (!loaded)
1309
return vfpu_rsqrt_fallback(x);
1310
uint32_t bits;
1311
memcpy(&bits, &x, sizeof(bits));
1312
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
1313
// Denormals (and zeroes) get inf of the same sign.
1314
bits = 0x7F800000u | (bits & 0x80000000u);
1315
memcpy(&x, &bits, sizeof(x));
1316
return x;
1317
}
1318
if(bits >> 31) {
1319
// Other negatives get negative NaN.
1320
bits = 0xFF800001u;
1321
memcpy(&x, &bits, sizeof(x));
1322
return x;
1323
}
1324
if((bits >> 23) == 255u) {
1325
// inf gets 0, NaN gets NaN.
1326
bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u);
1327
memcpy(&x, &bits, sizeof(bits));
1328
return x;
1329
}
1330
int32_t exponent = int32_t(bits >> 23) - 127;
1331
// Bottom bit of exponent (inverted) + significand (except bottom bit).
1332
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
1333
bits = vfpu_rsqrt_fixed(index);
1334
bits -= uint32_t(exponent >> 1) << 23;
1335
memcpy(&x, &bits, sizeof(bits));
1336
return x;
1337
}
1338
1339
static inline uint32_t vfpu_asin_quantum(uint32_t x) {
1340
return x<1u<<23?
1341
1u:
1342
1u<<(32-23-clz32_nonzero(x));
1343
}
1344
1345
static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) {
1346
return x & -vfpu_asin_quantum(x);
1347
}
1348
1349
// Input is fixed 9.23, output is fixed 2.30.
1350
static inline uint32_t vfpu_asin_approx(uint32_t x) {
1351
const int32_t *C = vfpu_asin_lut65536[x >> 16];
1352
x &= 0xFFFFu;
1353
return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0]));
1354
}
1355
1356
// Input is fixed 9.23, output is fixed 2.30.
1357
static uint32_t vfpu_asin_fixed(uint32_t x) {
1358
if(x == 0u) return 0u;
1359
if(x == 1u << 23) return 1u << 30;
1360
uint32_t ret = vfpu_asin_approx(x);
1361
uint32_t index = vfpu_asin_lut_indices[x / 21u];
1362
uint64_t deltas = vfpu_asin_lut_deltas[index];
1363
return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret);
1364
}
1365
1366
float vfpu_asin(float x) {
1367
static bool loaded =
1368
LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
1369
LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
1370
LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
1371
if (!loaded)
1372
return vfpu_asin_fallback(x);
1373
1374
uint32_t bits;
1375
memcpy(&bits, &x, sizeof(x));
1376
uint32_t sign = bits & 0x80000000u;
1377
bits = bits & 0x7FFFFFFFu;
1378
if(bits > 0x3F800000u) {
1379
bits = 0x7F800001u ^ sign;
1380
memcpy(&x, &bits, sizeof(x));
1381
return x;
1382
}
1383
1384
bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23
1385
x=float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30
1386
if(sign) x = -x;
1387
return x;
1388
}
1389
1390
static inline uint32_t vfpu_exp2_approx(uint32_t x) {
1391
if(x == 0x00800000u) return 0x00800000u;
1392
uint32_t a=vfpu_exp2_lut65536[x >> 16];
1393
x &= 0x0000FFFFu;
1394
uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46));
1395
return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u;
1396
}
1397
1398
static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
1399
if(x == 0u) return 0u;
1400
if(x == 0x00800000u) return 0x00800000u;
1401
uint32_t A = vfpu_exp2_approx((x ) & -64u);
1402
uint32_t B = vfpu_exp2_approx((x + 64) & -64u);
1403
uint64_t a = (A<<4)+vfpu_exp2_lut[x >> 6][0]-64u;
1404
uint64_t b = (B<<4)+vfpu_exp2_lut[x >> 6][1]-64u;
1405
uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
1406
y &= -4u;
1407
return y;
1408
}
1409
1410
float vfpu_exp2(float x) {
1411
static bool loaded =
1412
LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
1413
LOAD_TABLE(vfpu_exp2_lut, 262144);
1414
if (!loaded)
1415
return vfpu_exp2_fallback(x);
1416
int32_t bits;
1417
memcpy(&bits, &x, sizeof(bits));
1418
if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
1419
// Denormals are treated as 0.
1420
return 1.0f;
1421
}
1422
if(x != x) {
1423
// NaN gets NaN.
1424
bits = 0x7F800001u;
1425
memcpy(&x, &bits, sizeof(x));
1426
return x;
1427
}
1428
if(x <= -126.0f) {
1429
// Small numbers get 0 (exp2(-126) is smallest positive non-denormal).
1430
// But yes, -126.0f produces +0.0f.
1431
return 0.0f;
1432
}
1433
if(x >= +128.0f) {
1434
// Large numbers get infinity.
1435
bits = 0x7F800000u;
1436
memcpy(&x, &bits, sizeof(x));
1437
return x;
1438
}
1439
bits = int32_t(x * 0x1p23f);
1440
if(x < 0.0f) --bits; // Yes, really.
1441
bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF)));
1442
memcpy(&x, &bits, sizeof(bits));
1443
return x;
1444
}
1445
1446
float vfpu_rexp2(float x) {
1447
return vfpu_exp2(-x);
1448
}
1449
1450
// Input fixed 9.23, output fixed 10.22.
1451
// Returns log2(1+x).
1452
static inline uint32_t vfpu_log2_approx(uint32_t x) {
1453
uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0];
1454
uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1];
1455
uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16];
1456
x &= 0xFFFFu;
1457
uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
1458
uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
1459
ret += d;
1460
return uint32_t(ret >> 16);
1461
}
1462
1463
// Matches PSP output on all known values.
1464
float vfpu_log2(float x) {
1465
static bool loaded =
1466
LOAD_TABLE(vfpu_log2_lut65536, 516)&&
1467
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
1468
LOAD_TABLE(vfpu_log2_lut, 2097152);
1469
if (!loaded)
1470
return vfpu_log2_fallback(x);
1471
uint32_t bits;
1472
memcpy(&bits, &x, sizeof(bits));
1473
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
1474
// Denormals (and zeroes) get -inf.
1475
bits = 0xFF800000u;
1476
memcpy(&x, &bits, sizeof(x));
1477
return x;
1478
}
1479
if(bits & 0x80000000u) {
1480
// Other negatives get NaN.
1481
bits = 0x7F800001u;
1482
memcpy(&x, &bits, sizeof(x));
1483
return x;
1484
}
1485
if((bits >> 23) == 255u) {
1486
// NaN gets NaN, +inf gets +inf.
1487
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0);
1488
memcpy(&x, &bits, sizeof(x));
1489
return x;
1490
}
1491
uint32_t e = (bits & 0x7F800000u) - 0x3F800000u;
1492
uint32_t i = bits & 0x007FFFFFu;
1493
if(e >> 31 && i >= 0x007FFE00u) {
1494
// Process 1-2^{-14}<=x*2^n<1 (for n>0) separately,
1495
// since the table doesn't give the right answer.
1496
float c = float(int32_t(~e) >> 23);
1497
// Note: if c is 0 the sign of -0 output is correct.
1498
return i < 0x007FFEF7u ? // 1-265*2^{-24}
1499
-3.05175781e-05f - c:
1500
-0.0f - c;
1501
}
1502
int d = (e < 0x01000000u ? 0 : 8 - clz32_nonzero(e) - int(e >> 31));
1503
//assert(d >= 0 && d < 8);
1504
uint32_t q = 1u << d;
1505
uint32_t A = vfpu_log2_approx((i ) & -64u) & -q;
1506
uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q;
1507
uint64_t a = (A << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q;
1508
uint64_t b = (B << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q;
1509
uint32_t v = uint32_t((a +(((b - a) * (i & 63)) >> 6)) >> 6);
1510
v &= -q;
1511
bits = e ^ (2u * v);
1512
x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f
1513
return x;
1514
}
1515
1516
static inline uint32_t vfpu_rcp_approx(uint32_t i) {
1517
return 0x3E800000u + (uint32_t((1ull << 47) / ((1ull << 23) + i)) & -4u);
1518
}
1519
1520
float vfpu_rcp(float x) {
1521
static bool loaded =
1522
LOAD_TABLE(vfpu_rcp_lut, 262144);
1523
if (!loaded)
1524
return vfpu_rcp_fallback(x);
1525
uint32_t bits;
1526
memcpy(&bits, &x, sizeof(bits));
1527
uint32_t s = bits & 0x80000000u;
1528
uint32_t e = bits & 0x7F800000u;
1529
uint32_t i = bits & 0x007FFFFFu;
1530
if((bits & 0x7FFFFFFFu) > 0x7E800000u) {
1531
bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s);
1532
memcpy(&x, &bits, sizeof(x));
1533
return x;
1534
}
1535
if(e==0u) {
1536
bits = s^0x7F800000u;
1537
memcpy(&x, &bits, sizeof(x));
1538
return x;
1539
}
1540
uint32_t A = vfpu_rcp_approx((i ) & -64u);
1541
uint32_t B = vfpu_rcp_approx((i + 64) & -64u);
1542
uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u;
1543
uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u;
1544
uint32_t v = uint32_t((a+(((b-a)*(i&63))>>6))>>6);
1545
v &= -4u;
1546
bits = s + (0x3F800000u - e) + v;
1547
memcpy(&x, &bits, sizeof(x));
1548
return x;
1549
}
1550
1551
//==============================================================================
1552
1553
void InitVFPU() {
1554
#if 0
1555
// Load all in advance.
1556
LOAD_TABLE(vfpu_asin_lut65536 , 1536);
1557
LOAD_TABLE(vfpu_asin_lut_deltas , 517448);
1558
LOAD_TABLE(vfpu_asin_lut_indices , 798916);
1559
LOAD_TABLE(vfpu_exp2_lut65536 , 512);
1560
LOAD_TABLE(vfpu_exp2_lut , 262144);
1561
LOAD_TABLE(vfpu_log2_lut65536 , 516);
1562
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512);
1563
LOAD_TABLE(vfpu_log2_lut , 2097152);
1564
LOAD_TABLE(vfpu_rcp_lut , 262144);
1565
LOAD_TABLE(vfpu_rsqrt_lut , 262144);
1566
LOAD_TABLE(vfpu_sin_lut8192 , 4100);
1567
LOAD_TABLE(vfpu_sin_lut_delta , 262144);
1568
LOAD_TABLE(vfpu_sin_lut_exceptions , 86938);
1569
LOAD_TABLE(vfpu_sin_lut_interval_delta , 131074);
1570
LOAD_TABLE(vfpu_sqrt_lut , 262144);
1571
#endif
1572
}
1573
1574