GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/ARM64/Arm64CompVFPU.cpp
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
#if PPSSPP_ARCH(ARM64)
20
21
#include <cmath>
22
#include "Common/Arm64Emitter.h"
23
#include "Common/CPUDetect.h"
24
#include "Common/Data/Convert/SmallDataConvert.h"
25
#include "Common/Math/math_util.h"
26
27
#include "Core/Compatibility.h"
28
#include "Core/Config.h"
29
#include "Core/MemMap.h"
30
#include "Core/Reporting.h"
31
#include "Core/System.h"
32
#include "Core/MIPS/MIPS.h"
33
#include "Core/MIPS/MIPSTables.h"
34
#include "Core/MIPS/MIPSAnalyst.h"
35
#include "Core/MIPS/MIPSCodeUtils.h"
36
#include "Core/MIPS/ARM64/Arm64Jit.h"
37
#include "Core/MIPS/ARM64/Arm64RegCache.h"
38
39
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
40
// Currently known non-working ones should have DISABLE.
41
42
// #define CONDITIONAL_DISABLE(flag) { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }
43
#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }
44
#define DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }
45
46
#define _RS MIPS_GET_RS(op)
47
#define _RT MIPS_GET_RT(op)
48
#define _RD MIPS_GET_RD(op)
49
#define _FS MIPS_GET_FS(op)
50
#define _FT MIPS_GET_FT(op)
51
#define _FD MIPS_GET_FD(op)
52
#define _SA MIPS_GET_SA(op)
53
#define _POS ((op>> 6) & 0x1F)
54
#define _SIZE ((op>>11) & 0x1F)
55
#define _IMM16 (signed short)(op & 0xFFFF)
56
#define _IMM26 (op & 0x03FFFFFF)
57
58
namespace MIPSComp {
59
using namespace Arm64Gen;
60
using namespace Arm64JitConstants;
61
62
// Vector regs can overlap in all sorts of swizzled ways.
63
// This does allow a single overlap in sregs[i].
64
static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)
65
{
66
for (int i = 0; i < sn; ++i) {
67
if (sregs[i] == dreg && i != di)
68
return false;
69
}
70
for (int i = 0; i < tn; ++i) {
71
if (tregs[i] == dreg)
72
return false;
73
}
74
75
// Hurray, no overlap, we can write directly.
76
return true;
77
}
78
79
static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)
80
{
81
return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;
82
}
83
84
void Arm64Jit::Comp_VPFX(MIPSOpcode op) {
85
CONDITIONAL_DISABLE(VFPU_XFER);
86
int data = op & 0xFFFFF;
87
int regnum = (op >> 24) & 3;
88
switch (regnum) {
89
case 0: // S
90
js.prefixS = data;
91
js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
92
break;
93
case 1: // T
94
js.prefixT = data;
95
js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
96
break;
97
case 2: // D
98
js.prefixD = data & 0x00000FFF;
99
js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
100
break;
101
default:
102
ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);
103
break;
104
}
105
}
106
107
void Arm64Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
108
if (prefix == 0xE4)
109
return;
110
111
int n = GetNumVectorElements(sz);
112
u8 origV[4];
113
static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };
114
115
for (int i = 0; i < n; i++)
116
origV[i] = vregs[i];
117
118
for (int i = 0; i < n; i++) {
119
int regnum = (prefix >> (i * 2)) & 3;
120
int abs = (prefix >> (8 + i)) & 1;
121
int negate = (prefix >> (16 + i)) & 1;
122
int constants = (prefix >> (12 + i)) & 1;
123
124
// Unchanged, hurray.
125
if (!constants && regnum == i && !abs && !negate)
126
continue;
127
128
// This puts the value into a temp reg, so we won't write the modified value back.
129
vregs[i] = fpr.GetTempV();
130
if (!constants) {
131
fpr.MapDirtyInV(vregs[i], origV[regnum]);
132
fpr.SpillLockV(vregs[i]);
133
134
// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
135
// TODO: But some ops seem to use const 0 instead?
136
if (regnum >= n) {
137
WARN_LOG(Log::CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, GetCompilerPC(), MIPSDisasmAt(GetCompilerPC()).c_str());
138
regnum = 0;
139
}
140
141
if (abs) {
142
fp.FABS(fpr.V(vregs[i]), fpr.V(origV[regnum]));
143
if (negate)
144
fp.FNEG(fpr.V(vregs[i]), fpr.V(vregs[i]));
145
} else {
146
if (negate)
147
fp.FNEG(fpr.V(vregs[i]), fpr.V(origV[regnum]));
148
else
149
fp.FMOV(fpr.V(vregs[i]), fpr.V(origV[regnum]));
150
}
151
} else {
152
fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT);
153
fpr.SpillLockV(vregs[i]);
154
fp.MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs << 2)], SCRATCH1, (bool)negate);
155
}
156
}
157
}
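// A minimal scalar sketch of how the 20-bit S/T prefix word is decoded per lane by the loop
// above (illustrative only; the struct and helper names are hypothetical and not used by the JIT):
struct PrefixLaneSketch {
	int regnum;      // source lane to read, or constant index when constants is set
	bool abs;        // take the absolute value (or select the second constant bank)
	bool negate;     // negate the result
	bool constants;  // read from constantArray instead of a source lane
};

static PrefixLaneSketch DecodePrefixLaneSketch(u32 prefix, int i) {
	PrefixLaneSketch lane;
	lane.regnum = (prefix >> (i * 2)) & 3;
	lane.abs = ((prefix >> (8 + i)) & 1) != 0;
	lane.constants = ((prefix >> (12 + i)) & 1) != 0;
	lane.negate = ((prefix >> (16 + i)) & 1) != 0;
	return lane;
}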
158
159
void Arm64Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
160
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
161
162
GetVectorRegs(regs, sz, vectorReg);
163
if (js.prefixD == 0)
164
return;
165
166
int n = GetNumVectorElements(sz);
167
for (int i = 0; i < n; i++) {
168
// Hopefully this is rare, we'll just write it into a reg we drop.
169
if (js.VfpuWriteMask(i))
170
regs[i] = fpr.GetTempV();
171
}
172
}
173
174
void Arm64Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
175
_assert_msg_(js.prefixDFlag & JitState::PREFIX_KNOWN, "Unexpected unknown prefix!");
176
if (!js.prefixD)
177
return;
178
179
int n = GetNumVectorElements(sz);
180
for (int i = 0; i < n; i++) {
181
if (js.VfpuWriteMask(i))
182
continue;
183
184
int sat = (js.prefixD >> (i * 2)) & 3;
185
if (sat == 1) {
186
// clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1]
187
fpr.MapRegV(vregs[i], MAP_DIRTY);
188
189
fp.MOVI2F(S0, 0.0f, SCRATCH1);
190
fp.MOVI2F(S1, 1.0f, SCRATCH1);
191
fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1);
192
fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
193
} else if (sat == 3) {
194
// clamped = x < -1 ? (x > 1 ? 1 : x) : x [-1, 1]
195
fpr.MapRegV(vregs[i], MAP_DIRTY);
196
197
fp.MOVI2F(S0, -1.0f, SCRATCH1);
198
fp.MOVI2F(S1, 1.0f, SCRATCH1);
199
fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1);
200
fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
201
}
202
}
203
}
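// A scalar sketch of the two saturation modes handled above, assuming sat == 1 clamps to
// [0, 1] and sat == 3 clamps to [-1, 1], matching the FMIN/FMAX pairs emitted (illustrative
// helper only, NaN behavior not modeled):
static float ApplyDPrefixSatSketch(float x, int sat) {
	if (sat == 1)
		return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
	if (sat == 3)
		return x < -1.0f ? -1.0f : (x > 1.0f ? 1.0f : x);
	return x;
}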
204
205
void Arm64Jit::Comp_SV(MIPSOpcode op) {
206
CONDITIONAL_DISABLE(LSU_VFPU);
207
CheckMemoryBreakpoint();
208
209
s32 offset = (signed short)(op & 0xFFFC);
210
int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
211
MIPSGPReg rs = _RS;
212
213
std::vector<FixupBranch> skips;
214
switch (op >> 26) {
215
case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
216
{
217
if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset >= 0 && offset < 16384) {
218
gpr.MapRegAsPointer(rs);
219
fpr.MapRegV(vt, MAP_NOINIT | MAP_DIRTY);
220
fp.LDR(32, INDEX_UNSIGNED, fpr.V(vt), gpr.RPtr(rs), offset);
221
break;
222
}
223
224
// CC might be set by slow path below, so load regs first.
225
fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
226
if (gpr.IsImm(rs)) {
227
#ifdef MASKED_PSP_MEMORY
228
u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
229
#else
230
u32 addr = offset + gpr.GetImm(rs);
231
#endif
232
gpr.SetRegImm(SCRATCH1, addr);
233
} else {
234
gpr.MapReg(rs);
235
if (g_Config.bFastMemory) {
236
SetScratch1ToEffectiveAddress(rs, offset);
237
} else {
238
skips = SetScratch1ForSafeAddress(rs, offset, SCRATCH2);
239
}
240
}
241
fp.LDR(32, fpr.V(vt), SCRATCH1_64, ArithOption(MEMBASEREG));
242
for (auto skip : skips) {
243
SetJumpTarget(skip);
244
}
245
}
246
break;
247
248
case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
249
{
250
if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset >= 0 && offset < 16384) {
251
gpr.MapRegAsPointer(rs);
252
fpr.MapRegV(vt, 0);
253
fp.STR(32, INDEX_UNSIGNED, fpr.V(vt), gpr.RPtr(rs), offset);
254
break;
255
}
256
257
// CC might be set by slow path below, so load regs first.
258
fpr.MapRegV(vt);
259
if (gpr.IsImm(rs)) {
260
#ifdef MASKED_PSP_MEMORY
261
u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
262
#else
263
u32 addr = offset + gpr.GetImm(rs);
264
#endif
265
gpr.SetRegImm(SCRATCH1, addr);
266
} else {
267
gpr.MapReg(rs);
268
if (g_Config.bFastMemory) {
269
SetScratch1ToEffectiveAddress(rs, offset);
270
} else {
271
skips = SetScratch1ForSafeAddress(rs, offset, SCRATCH2);
272
}
273
}
274
fp.STR(32, fpr.V(vt), SCRATCH1_64, ArithOption(MEMBASEREG));
275
for (auto skip : skips) {
276
SetJumpTarget(skip);
277
}
278
}
279
break;
280
281
282
default:
283
DISABLE;
284
}
285
}
286
287
void Arm64Jit::Comp_SVQ(MIPSOpcode op) {
288
CONDITIONAL_DISABLE(LSU_VFPU);
289
CheckMemoryBreakpoint();
290
291
int imm = (signed short)(op&0xFFFC);
292
int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
293
MIPSGPReg rs = _RS;
294
295
std::vector<FixupBranch> skips;
296
switch (op >> 26)
297
{
298
case 54: //lv.q
299
{
300
// CC might be set by slow path below, so load regs first.
301
u8 vregs[4];
302
GetVectorRegs(vregs, V_Quad, vt);
303
fpr.MapRegsAndSpillLockV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
304
305
if (gpr.IsImm(rs)) {
306
#ifdef MASKED_PSP_MEMORY
307
u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;
308
#else
309
u32 addr = imm + gpr.GetImm(rs);
310
#endif
311
gpr.SetRegImm(SCRATCH1_64, addr + (uintptr_t)Memory::base);
312
} else {
313
gpr.MapReg(rs);
314
if (g_Config.bFastMemory) {
315
SetScratch1ToEffectiveAddress(rs, imm);
316
} else {
317
skips = SetScratch1ForSafeAddress(rs, imm, SCRATCH2);
318
}
319
if (jo.enablePointerify) {
320
MOVK(SCRATCH1_64, ((uint64_t)Memory::base) >> 32, SHIFT_32);
321
} else {
322
ADD(SCRATCH1_64, SCRATCH1_64, MEMBASEREG);
323
}
324
}
325
326
fp.LDP(32, INDEX_SIGNED, fpr.V(vregs[0]), fpr.V(vregs[1]), SCRATCH1_64, 0);
327
fp.LDP(32, INDEX_SIGNED, fpr.V(vregs[2]), fpr.V(vregs[3]), SCRATCH1_64, 8);
328
329
for (auto skip : skips) {
330
SetJumpTarget(skip);
331
}
332
}
333
break;
334
335
case 62: //sv.q
336
{
337
// CC might be set by slow path below, so load regs first.
338
u8 vregs[4];
339
GetVectorRegs(vregs, V_Quad, vt);
340
fpr.MapRegsAndSpillLockV(vregs, V_Quad, 0);
341
342
if (gpr.IsImm(rs)) {
343
#ifdef MASKED_PSP_MEMORY
344
u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;
345
#else
346
u32 addr = imm + gpr.GetImm(rs);
347
#endif
348
gpr.SetRegImm(SCRATCH1_64, addr + (uintptr_t)Memory::base);
349
} else {
350
gpr.MapReg(rs);
351
if (g_Config.bFastMemory) {
352
SetScratch1ToEffectiveAddress(rs, imm);
353
} else {
354
skips = SetScratch1ForSafeAddress(rs, imm, SCRATCH2);
355
}
356
if (jo.enablePointerify) {
357
MOVK(SCRATCH1_64, ((uint64_t)Memory::base) >> 32, SHIFT_32);
358
} else {
359
ADD(SCRATCH1_64, SCRATCH1_64, MEMBASEREG);
360
}
361
}
362
fp.STP(32, INDEX_SIGNED, fpr.V(vregs[0]), fpr.V(vregs[1]), SCRATCH1_64, 0);
363
fp.STP(32, INDEX_SIGNED, fpr.V(vregs[2]), fpr.V(vregs[3]), SCRATCH1_64, 8);
364
365
for (auto skip : skips) {
366
SetJumpTarget(skip);
367
}
368
}
369
break;
370
371
default:
372
DISABLE;
373
break;
374
}
375
fpr.ReleaseSpillLocksAndDiscardTemps();
376
}
377
378
void Arm64Jit::Comp_VVectorInit(MIPSOpcode op) {
379
CONDITIONAL_DISABLE(VFPU_XFER);
380
// WARNING: No prefix support!
381
if (js.HasUnknownPrefix()) {
382
DISABLE;
383
}
384
385
switch ((op >> 16) & 0xF) {
386
case 6: // v=zeros; break; //vzero
387
fp.MOVI2F(S0, 0.0f, SCRATCH1);
388
break;
389
case 7: // v=ones; break; //vone
390
fp.MOVI2F(S0, 1.0f, SCRATCH1);
391
break;
392
default:
393
DISABLE;
394
break;
395
}
396
397
VectorSize sz = GetVecSize(op);
398
int n = GetNumVectorElements(sz);
399
400
u8 dregs[4];
401
GetVectorRegsPrefixD(dregs, sz, _VD);
402
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
403
404
for (int i = 0; i < n; ++i)
405
fp.FMOV(fpr.V(dregs[i]), S0);
406
407
ApplyPrefixD(dregs, sz);
408
409
fpr.ReleaseSpillLocksAndDiscardTemps();
410
}
411
412
void Arm64Jit::Comp_VIdt(MIPSOpcode op) {
413
CONDITIONAL_DISABLE(VFPU_XFER);
414
if (js.HasUnknownPrefix()) {
415
DISABLE;
416
}
417
418
int vd = _VD;
419
VectorSize sz = GetVecSize(op);
420
int n = GetNumVectorElements(sz);
421
fp.MOVI2F(S0, 0.0f, SCRATCH1);
422
fp.MOVI2F(S1, 1.0f, SCRATCH1);
423
u8 dregs[4];
424
GetVectorRegsPrefixD(dregs, sz, _VD);
425
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
426
switch (sz) {
427
case V_Pair:
428
fp.FMOV(fpr.V(dregs[0]), (vd & 1) == 0 ? S1 : S0);
429
fp.FMOV(fpr.V(dregs[1]), (vd & 1) == 1 ? S1 : S0);
430
break;
431
case V_Quad:
432
fp.FMOV(fpr.V(dregs[0]), (vd & 3) == 0 ? S1 : S0);
433
fp.FMOV(fpr.V(dregs[1]), (vd & 3) == 1 ? S1 : S0);
434
fp.FMOV(fpr.V(dregs[2]), (vd & 3) == 2 ? S1 : S0);
435
fp.FMOV(fpr.V(dregs[3]), (vd & 3) == 3 ? S1 : S0);
436
break;
437
default:
438
_dbg_assert_msg_( 0, "Trying to interpret instruction that can't be interpreted");
439
break;
440
}
441
442
ApplyPrefixD(dregs, sz);
443
444
fpr.ReleaseSpillLocksAndDiscardTemps();
445
}
446
447
void Arm64Jit::Comp_VMatrixInit(MIPSOpcode op) {
448
CONDITIONAL_DISABLE(VFPU_XFER);
449
if (js.HasUnknownPrefix()) {
450
// Don't think matrix init ops care about prefixes.
451
// DISABLE;
452
}
453
454
MatrixSize sz = GetMtxSize(op);
455
int n = GetMatrixSide(sz);
456
457
u8 dregs[16];
458
GetMatrixRegs(dregs, sz, _VD);
459
460
switch ((op >> 16) & 0xF) {
461
case 3: // vmidt
462
fp.MOVI2F(S0, 0.0f, SCRATCH1);
463
fp.MOVI2F(S1, 1.0f, SCRATCH1);
464
for (int a = 0; a < n; a++) {
465
for (int b = 0; b < n; b++) {
466
fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
467
fp.FMOV(fpr.V(dregs[a * 4 + b]), a == b ? S1 : S0);
468
}
469
}
470
break;
471
case 6: // vmzero
472
fp.MOVI2F(S0, 0.0f, SCRATCH1);
473
for (int a = 0; a < n; a++) {
474
for (int b = 0; b < n; b++) {
475
fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
476
fp.FMOV(fpr.V(dregs[a * 4 + b]), S0);
477
}
478
}
479
break;
480
case 7: // vmone
481
fp.MOVI2F(S1, 1.0f, SCRATCH1);
482
for (int a = 0; a < n; a++) {
483
for (int b = 0; b < n; b++) {
484
fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
485
fp.FMOV(fpr.V(dregs[a * 4 + b]), S1);
486
}
487
}
488
break;
489
}
490
491
fpr.ReleaseSpillLocksAndDiscardTemps();
492
}
493
494
void Arm64Jit::Comp_VHdp(MIPSOpcode op) {
495
CONDITIONAL_DISABLE(VFPU_VEC);
496
if (js.HasUnknownPrefix()) {
497
DISABLE;
498
}
499
500
int vd = _VD;
501
int vs = _VS;
502
int vt = _VT;
503
VectorSize sz = GetVecSize(op);
504
505
// TODO: Force read one of them into regs? probably not.
506
u8 sregs[4], tregs[4], dregs[1];
507
GetVectorRegsPrefixS(sregs, sz, vs);
508
GetVectorRegsPrefixT(tregs, sz, vt);
509
GetVectorRegsPrefixD(dregs, V_Single, vd);
510
511
// TODO: applyprefixST here somehow (shuffle, etc...)
512
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
513
fpr.MapRegsAndSpillLockV(tregs, sz, 0);
514
fp.FMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));
515
516
int n = GetNumVectorElements(sz);
517
for (int i = 1; i < n; i++) {
518
// sum += s[i]*t[i];
519
if (i == n - 1) {
520
fp.FADD(S0, S0, fpr.V(tregs[i]));
521
} else {
522
fp.FMADD(S0, fpr.V(sregs[i]), fpr.V(tregs[i]), S0);
523
}
524
}
525
fpr.ReleaseSpillLocksAndDiscardTemps();
526
527
fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);
528
529
fp.FMOV(fpr.V(dregs[0]), S0);
530
ApplyPrefixD(dregs, V_Single);
531
fpr.ReleaseSpillLocksAndDiscardTemps();
532
}
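// Scalar equivalent of the accumulation emitted above: a homogeneous dot product where the
// last source lane is treated as 1.0, so only t[n-1] is added in the final step (hypothetical
// helper, illustrative only):
static float VhdpSketch(const float *s, const float *t, int n) {
	float sum = s[0] * t[0];
	for (int i = 1; i < n; i++)
		sum = (i == n - 1) ? sum + t[i] : sum + s[i] * t[i];
	return sum;
}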
533
534
alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
535
536
void Arm64Jit::Comp_Vhoriz(MIPSOpcode op) {
537
CONDITIONAL_DISABLE(VFPU_VEC);
538
if (js.HasUnknownPrefix()) {
539
DISABLE;
540
}
541
542
int vd = _VD;
543
int vs = _VS;
544
int vt = _VT;
545
VectorSize sz = GetVecSize(op);
546
547
// TODO: Force read one of them into regs? probably not.
548
u8 sregs[4], dregs[1];
549
GetVectorRegsPrefixS(sregs, sz, vs);
550
GetVectorRegsPrefixD(dregs, V_Single, vd);
551
552
// TODO: applyprefixST here somehow (shuffle, etc...)
553
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
554
555
int n = GetNumVectorElements(sz);
556
557
bool is_vavg = ((op >> 16) & 0x1f) == 7;
558
if (is_vavg) {
559
fp.MOVI2F(S1, vavg_table[n - 1], SCRATCH1);
560
}
561
// Have to start at +0.000 for the correct sign.
562
fp.MOVI2F(S0, 0.0f, SCRATCH1);
563
for (int i = 0; i < n; i++) {
564
// sum += s[i];
565
fp.FADD(S0, S0, fpr.V(sregs[i]));
566
}
567
568
fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);
569
if (is_vavg) {
570
fp.FMUL(fpr.V(dregs[0]), S0, S1);
571
} else {
572
fp.FMOV(fpr.V(dregs[0]), S0);
573
}
574
ApplyPrefixD(dregs, V_Single);
575
fpr.ReleaseSpillLocksAndDiscardTemps();
576
}
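// Scalar sketch of vfad/vavg as emitted above: sum the lanes starting from +0.0f (for the
// correct sign of a zero result) and, for vavg, scale by 1/n via vavg_table (illustrative
// helper only, not used by the JIT):
static float VhorizSketch(const float *s, int n, bool isVavg) {
	float sum = 0.0f;
	for (int i = 0; i < n; i++)
		sum += s[i];
	return isVavg ? sum * vavg_table[n - 1] : sum;
}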
577
578
void Arm64Jit::Comp_VDot(MIPSOpcode op) {
579
CONDITIONAL_DISABLE(VFPU_VEC);
580
if (js.HasUnknownPrefix()) {
581
DISABLE;
582
}
583
584
int vd = _VD;
585
int vs = _VS;
586
int vt = _VT;
587
VectorSize sz = GetVecSize(op);
588
589
// TODO: Force read one of them into regs? probably not.
590
u8 sregs[4], tregs[4], dregs[1];
591
GetVectorRegsPrefixS(sregs, sz, vs);
592
GetVectorRegsPrefixT(tregs, sz, vt);
593
GetVectorRegsPrefixD(dregs, V_Single, vd);
594
595
// TODO: applyprefixST here somehow (shuffle, etc...)
596
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
597
fpr.MapRegsAndSpillLockV(tregs, sz, 0);
598
fp.FMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));
599
600
int n = GetNumVectorElements(sz);
601
for (int i = 1; i < n; i++) {
602
// sum += s[i]*t[i];
603
fp.FMADD(S0, fpr.V(sregs[i]), fpr.V(tregs[i]), S0);
604
}
605
fpr.ReleaseSpillLocksAndDiscardTemps();
606
607
fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);
608
609
fp.FMOV(fpr.V(dregs[0]), S0);
610
ApplyPrefixD(dregs, V_Single);
611
fpr.ReleaseSpillLocksAndDiscardTemps();
612
}
613
614
void Arm64Jit::Comp_VecDo3(MIPSOpcode op) {
615
CONDITIONAL_DISABLE(VFPU_VEC);
616
if (js.HasUnknownPrefix()) {
617
DISABLE;
618
}
619
620
int vd = _VD;
621
int vs = _VS;
622
int vt = _VT;
623
624
VectorSize sz = GetVecSize(op);
625
int n = GetNumVectorElements(sz);
626
627
u8 sregs[4], tregs[4], dregs[4];
628
GetVectorRegsPrefixS(sregs, sz, _VS);
629
GetVectorRegsPrefixT(tregs, sz, _VT);
630
GetVectorRegsPrefixD(dregs, sz, _VD);
631
632
MIPSReg tempregs[4];
633
for (int i = 0; i < n; i++) {
634
if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) {
635
tempregs[i] = fpr.GetTempV();
636
} else {
637
tempregs[i] = dregs[i];
638
}
639
}
640
641
// Map first, then work. This will allow us to use VLDMIA more often
642
// (when we add the appropriate map function) and the instruction ordering
643
// will improve.
644
// Note that mapping like this (instead of first all sregs, first all tregs etc)
645
// reduces the number of contiguous registers a lot :(
646
for (int i = 0; i < n; i++) {
647
fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
648
fpr.SpillLockV(tempregs[i]);
649
fpr.SpillLockV(sregs[i]);
650
fpr.SpillLockV(tregs[i]);
651
}
652
653
for (int i = 0; i < n; i++) {
654
switch (op >> 26) {
655
case 24: //VFPU0
656
switch ((op >> 23) & 7) {
657
case 0: // d[i] = s[i] + t[i]; break; //vadd
658
fp.FADD(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
659
break;
660
case 1: // d[i] = s[i] - t[i]; break; //vsub
661
fp.FSUB(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
662
break;
663
case 7: // d[i] = s[i] / t[i]; break; //vdiv
664
fp.FDIV(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
665
break;
666
default:
667
DISABLE;
668
}
669
break;
670
case 25: //VFPU1
671
switch ((op >> 23) & 7) {
672
case 0: // d[i] = s[i] * t[i]; break; //vmul
673
fp.FMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
674
break;
675
default:
676
DISABLE;
677
}
678
break;
679
// Fortunately there is FMIN/FMAX on ARM64!
680
case 27: //VFPU3
681
switch ((op >> 23) & 7) {
682
case 2: // vmin
683
{
684
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
685
FixupBranch unordered = B(CC_VS);
686
fp.FMIN(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
687
FixupBranch skip = B();
688
689
SetJumpTarget(unordered);
690
// Move to integer registers, it'll be easier. Or maybe there's a simd way?
691
fp.FMOV(SCRATCH1, fpr.V(sregs[i]));
692
fp.FMOV(SCRATCH2, fpr.V(tregs[i]));
693
// And together to find if both have negative set.
694
TST(SCRATCH1, SCRATCH2);
695
FixupBranch cmpPositive = B(CC_PL);
696
// If both are negative, "min" is the greater of the two, since it has the largest mantissa.
697
CMP(SCRATCH1, SCRATCH2);
698
CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_GE);
699
FixupBranch skipPositive = B();
700
// If either one is positive, we just want the lowest one.
701
SetJumpTarget(cmpPositive);
702
CMP(SCRATCH1, SCRATCH2);
703
CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_LE);
704
SetJumpTarget(skipPositive);
705
// Now, whether negative or positive, move to the result.
706
fp.FMOV(fpr.V(tempregs[i]), SCRATCH1);
707
SetJumpTarget(skip);
708
break;
709
}
710
case 3: // vmax
711
{
712
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
713
FixupBranch unordered = B(CC_VS);
714
fp.FMAX(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
715
FixupBranch skip = B();
716
717
SetJumpTarget(unordered);
718
// Move to integer registers, it'll be easier. Or maybe there's a simd way?
719
fp.FMOV(SCRATCH1, fpr.V(sregs[i]));
720
fp.FMOV(SCRATCH2, fpr.V(tregs[i]));
721
// And together to find if both have negative set.
722
TST(SCRATCH1, SCRATCH2);
723
FixupBranch cmpPositive = B(CC_PL);
724
// If both are negative, "max" is the least of the two, since it has the lowest mantissa.
725
CMP(SCRATCH1, SCRATCH2);
726
CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_LE);
727
FixupBranch skipPositive = B();
728
// If either one is positive, we just want the highest one.
729
SetJumpTarget(cmpPositive);
730
CMP(SCRATCH1, SCRATCH2);
731
CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_GE);
732
SetJumpTarget(skipPositive);
733
// Now, whether negative or positive, move to the result.
734
fp.FMOV(fpr.V(tempregs[i]), SCRATCH1);
735
SetJumpTarget(skip);
736
break;
737
}
738
case 6: // vsge
739
DISABLE; // pending testing
740
break;
741
case 7: // vslt
742
DISABLE; // pending testing
743
break;
744
}
745
break;
746
747
default:
748
DISABLE;
749
}
750
}
751
752
for (int i = 0; i < n; i++) {
753
if (dregs[i] != tempregs[i]) {
754
fpr.MapDirtyInV(dregs[i], tempregs[i]);
755
fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
756
}
757
}
758
ApplyPrefixD(dregs, sz);
759
760
fpr.ReleaseSpillLocksAndDiscardTemps();
761
}
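// When FCMP reports an unordered compare (a NaN is involved), the vmin path above falls back
// to an integer compare of the raw bit patterns. A scalar sketch of that fallback, assuming
// <algorithm>, <cstdint> and <cstring> are available (hypothetical helper, illustrative only):
static float VminUnorderedFallbackSketch(float a, float b) {
	int32_t ia, ib;
	memcpy(&ia, &a, sizeof(ia));
	memcpy(&ib, &b, sizeof(ib));
	// Mirrors the TST / CC_PL split above: if both sign bits are set, take the greater
	// two's-complement value, otherwise the lesser one. vmax uses the opposite selections.
	int32_t r = (ia & ib) < 0 ? std::max(ia, ib) : std::min(ia, ib);
	float out;
	memcpy(&out, &r, sizeof(out));
	return out;
}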
762
763
void Arm64Jit::Comp_VV2Op(MIPSOpcode op) {
764
CONDITIONAL_DISABLE(VFPU_VEC);
765
if (js.HasUnknownPrefix()) {
766
DISABLE;
767
}
768
769
// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
770
if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
771
return;
772
}
773
774
// Catch the disabled operations immediately so we don't map registers unnecessarily later.
775
// Move these down to the big switch below as they are implemented.
776
switch ((op >> 16) & 0x1f) {
777
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
778
DISABLE;
779
break;
780
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
781
DISABLE;
782
break;
783
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
784
DISABLE;
785
break;
786
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
787
DISABLE;
788
break;
789
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
790
DISABLE;
791
break;
792
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
793
DISABLE;
794
break;
795
default:
796
;
797
}
798
799
VectorSize sz = GetVecSize(op);
800
int n = GetNumVectorElements(sz);
801
802
u8 sregs[4], dregs[4];
803
GetVectorRegsPrefixS(sregs, sz, _VS);
804
GetVectorRegsPrefixD(dregs, sz, _VD);
805
806
MIPSReg tempregs[4];
807
for (int i = 0; i < n; ++i) {
808
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
809
tempregs[i] = fpr.GetTempV();
810
} else {
811
tempregs[i] = dregs[i];
812
}
813
}
814
815
// Pre map the registers to get better instruction ordering.
816
// Note that mapping like this (instead of first all sregs, first all tempregs etc)
817
// reduces the number of contiguous registers a lot :(
818
for (int i = 0; i < n; i++) {
819
fpr.MapDirtyInV(tempregs[i], sregs[i]);
820
fpr.SpillLockV(tempregs[i]);
821
fpr.SpillLockV(sregs[i]);
822
}
823
824
// Warning: sregs[i] and tempregs[i] may be the same reg.
825
// Helps for vmov, hurts for vrcp, etc.
826
for (int i = 0; i < n; i++) {
827
switch ((op >> 16) & 0x1f) {
828
case 0: // d[i] = s[i]; break; //vmov
829
// Probably for swizzle.
830
fp.FMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
831
break;
832
case 1: // d[i] = fabsf(s[i]); break; //vabs
833
fp.FABS(fpr.V(tempregs[i]), fpr.V(sregs[i]));
834
break;
835
case 2: // d[i] = -s[i]; break; //vneg
836
fp.FNEG(fpr.V(tempregs[i]), fpr.V(sregs[i]));
837
break;
838
case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
839
if (i == 0) {
840
fp.MOVI2F(S0, 0.0f, SCRATCH1);
841
fp.MOVI2F(S1, 1.0f, SCRATCH1);
842
}
843
fp.FCMP(fpr.V(sregs[i]), S0);
844
fp.FMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
845
fp.FMAX(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
846
fp.FMIN(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
847
break;
848
case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
849
if (i == 0) {
850
fp.MOVI2F(S0, -1.0f, SCRATCH1);
851
fp.MOVI2F(S1, 1.0f, SCRATCH1);
852
}
853
fp.FCMP(fpr.V(sregs[i]), S0);
854
fp.FMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
855
fp.FMAX(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
856
fp.FMIN(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
857
break;
858
case 16: // d[i] = 1.0f / s[i]; break; //vrcp
859
if (i == 0) {
860
fp.MOVI2F(S0, 1.0f, SCRATCH1);
861
}
862
fp.FDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
863
break;
864
case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
865
if (i == 0) {
866
fp.MOVI2F(S0, 1.0f, SCRATCH1);
867
}
868
fp.FSQRT(S1, fpr.V(sregs[i]));
869
fp.FDIV(fpr.V(tempregs[i]), S0, S1);
870
break;
871
case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
872
fp.FSQRT(fpr.V(tempregs[i]), fpr.V(sregs[i]));
873
fp.FABS(fpr.V(tempregs[i]), fpr.V(tempregs[i]));
874
break;
875
case 23: // d[i] = asinf(s[i] * (float)M_2_PI); break; //vasin
876
DISABLE;
877
break;
878
case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
879
if (i == 0) {
880
fp.MOVI2F(S0, -1.0f, SCRATCH1);
881
}
882
fp.FDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
883
break;
884
default:
885
ERROR_LOG(Log::JIT, "case missing in vfpu vv2op");
886
DISABLE;
887
break;
888
}
889
}
890
891
for (int i = 0; i < n; ++i) {
892
if (dregs[i] != tempregs[i]) {
893
fpr.MapDirtyInV(dregs[i], tempregs[i]);
894
fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
895
}
896
}
897
898
ApplyPrefixD(dregs, sz);
899
900
fpr.ReleaseSpillLocksAndDiscardTemps();
901
}
902
903
void Arm64Jit::Comp_Vi2f(MIPSOpcode op) {
904
CONDITIONAL_DISABLE(VFPU_VEC);
905
if (js.HasUnknownPrefix()) {
906
DISABLE;
907
}
908
909
VectorSize sz = GetVecSize(op);
910
int n = GetNumVectorElements(sz);
911
912
int imm = (op >> 16) & 0x1f;
913
const float mult = 1.0f / (float)(1UL << imm);
914
915
u8 sregs[4], dregs[4];
916
GetVectorRegsPrefixS(sregs, sz, _VS);
917
GetVectorRegsPrefixD(dregs, sz, _VD);
918
919
MIPSReg tempregs[4];
920
for (int i = 0; i < n; ++i) {
921
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
922
tempregs[i] = fpr.GetTempV();
923
} else {
924
tempregs[i] = dregs[i];
925
}
926
}
927
928
if (mult != 1.0f)
929
fp.MOVI2F(S0, mult, SCRATCH1);
930
931
// TODO: Use the SCVTF with builtin scaling where possible.
932
for (int i = 0; i < n; i++) {
933
fpr.MapDirtyInV(tempregs[i], sregs[i]);
934
fp.SCVTF(fpr.V(tempregs[i]), fpr.V(sregs[i]));
935
if (mult != 1.0f)
936
fp.FMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
937
}
938
939
for (int i = 0; i < n; ++i) {
940
if (dregs[i] != tempregs[i]) {
941
fpr.MapDirtyInV(dregs[i], tempregs[i]);
942
fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
943
}
944
}
945
946
ApplyPrefixD(dregs, sz);
947
fpr.ReleaseSpillLocksAndDiscardTemps();
948
}
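// Scalar sketch of vi2f as emitted above (SCVTF followed by an optional multiply): convert
// the raw integer lane to float and scale by 1 / 2^imm (hypothetical helper, illustrative only):
static float Vi2fSketch(int32_t s, int imm) {
	return (float)s * (1.0f / (float)(1UL << imm));
}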
949
950
void Arm64Jit::Comp_Vh2f(MIPSOpcode op) {
951
// TODO: Fix by porting the general SSE solution to NEON
952
// FCVTL doesn't provide identical results to the PSP hardware, according to the unit test:
953
// O vh2f: 00000000,400c0000,00000000,7ff00000
954
// E vh2f: 00000000,400c0000,00000000,7f800380
955
DISABLE;
956
957
CONDITIONAL_DISABLE(VFPU_VEC);
958
if (js.HasUnknownPrefix()) {
959
DISABLE;
960
}
961
962
u8 sregs[4], dregs[4];
963
VectorSize sz = GetVecSize(op);
964
VectorSize outSz;
965
966
switch (sz) {
967
case V_Single:
968
outSz = V_Pair;
969
break;
970
case V_Pair:
971
outSz = V_Quad;
972
break;
973
default:
974
DISABLE;
975
}
976
977
int n = GetNumVectorElements(sz);
978
int nOut = n * 2;
979
GetVectorRegsPrefixS(sregs, sz, _VS);
980
GetVectorRegsPrefixD(dregs, outSz, _VD);
981
982
// Take the single registers and combine them to a D register.
983
for (int i = 0; i < n; i++) {
984
fpr.MapRegV(sregs[i], sz);
985
fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);
986
}
987
// Convert four 16-bit floats in D0 to four 32-bit floats in Q0 (even if we only have two...)
988
fp.FCVTL(32, Q0, D0);
989
// Split apart again.
990
for (int i = 0; i < nOut; i++) {
991
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
992
fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);
993
}
994
995
ApplyPrefixD(dregs, sz);
996
fpr.ReleaseSpillLocksAndDiscardTemps();
997
}
998
999
void Arm64Jit::Comp_Vf2i(MIPSOpcode op) {
1000
DISABLE;
1001
}
1002
1003
void Arm64Jit::Comp_Mftv(MIPSOpcode op) {
1004
CONDITIONAL_DISABLE(VFPU_XFER);
1005
int imm = op & 0xFF;
1006
MIPSGPReg rt = _RT;
1007
switch ((op >> 21) & 0x1f) {
1008
case 3: //mfv / mfvc
1009
// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
1010
if (rt != 0) {
1011
if (imm < 128) { //R(rt) = VI(imm);
1012
if (!fpr.IsInRAMV(imm)) {
1013
fpr.MapRegV(imm, 0);
1014
gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
1015
fp.FMOV(gpr.R(rt), fpr.V(imm));
1016
} else {
1017
gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
1018
LDR(INDEX_UNSIGNED, gpr.R(rt), CTXREG, fpr.GetMipsRegOffsetV(imm));
1019
}
1020
} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc
1021
if (imm - 128 == VFPU_CTRL_CC) {
1022
if (gpr.IsImm(MIPS_REG_VFPUCC)) {
1023
gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
1024
} else {
1025
gpr.MapDirtyIn(rt, MIPS_REG_VFPUCC);
1026
MOV(gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
1027
}
1028
} else {
1029
// In case we have a saved prefix.
1030
FlushPrefixV();
1031
gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
1032
LDR(INDEX_UNSIGNED, gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
1033
}
1034
} else {
1035
//ERROR - maybe need to make this value too an "interlock" value?
1036
ERROR_LOG(Log::CPU, "mfv - invalid register %i", imm);
1037
}
1038
}
1039
break;
1040
1041
case 7: // mtv
1042
if (imm < 128) {
1043
if (rt == MIPS_REG_ZERO) {
1044
fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
1045
fp.MOVI2F(fpr.V(imm), 0.0f, SCRATCH1);
1046
} else if (!gpr.IsInRAM(rt)) {
1047
gpr.MapReg(rt);
1048
fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
1049
fp.FMOV(fpr.V(imm), gpr.R(rt));
1050
} else {
1051
fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
1052
fp.LDR(32, INDEX_UNSIGNED, fpr.V(imm), CTXREG, gpr.GetMipsRegOffset(rt));
1053
}
1054
} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
1055
if (imm - 128 == VFPU_CTRL_CC) {
1056
if (gpr.IsImm(rt)) {
1057
gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
1058
} else {
1059
gpr.MapDirtyIn(MIPS_REG_VFPUCC, rt);
1060
MOV(gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
1061
}
1062
} else {
1063
gpr.MapReg(rt);
1064
STR(INDEX_UNSIGNED, gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
1065
}
1066
1067
// TODO: Optimization if rt is Imm?
1068
// Set these BEFORE disable!
1069
if (imm - 128 == VFPU_CTRL_SPREFIX) {
1070
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
1071
js.blockWrotePrefixes = true;
1072
} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
1073
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
1074
js.blockWrotePrefixes = true;
1075
} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
1076
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
1077
js.blockWrotePrefixes = true;
1078
}
1079
} else {
1080
//ERROR
1081
_dbg_assert_msg_( 0, "mtv - invalid register");
1082
}
1083
break;
1084
1085
default:
1086
DISABLE;
1087
}
1088
1089
fpr.ReleaseSpillLocksAndDiscardTemps();
1090
}
1091
1092
void Arm64Jit::Comp_Vmfvc(MIPSOpcode op) {
1093
CONDITIONAL_DISABLE(VFPU_XFER);
1094
1095
int vd = _VD;
1096
int imm = (op >> 8) & 0x7F;
1097
if (imm < VFPU_CTRL_MAX) {
1098
fpr.MapRegV(vd);
1099
if (imm == VFPU_CTRL_CC) {
1100
gpr.MapReg(MIPS_REG_VFPUCC, 0);
1101
fp.FMOV(fpr.V(vd), gpr.R(MIPS_REG_VFPUCC));
1102
} else {
1103
ADDI2R(SCRATCH1_64, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCH2);
1104
fp.LDR(32, INDEX_UNSIGNED, fpr.V(vd), SCRATCH1_64, 0);
1105
}
1106
fpr.ReleaseSpillLocksAndDiscardTemps();
1107
} else {
1108
fpr.MapRegV(vd);
1109
fp.MOVI2F(fpr.V(vd), 0.0f, SCRATCH1);
1110
}
1111
}
1112
1113
void Arm64Jit::Comp_Vmtvc(MIPSOpcode op) {
1114
CONDITIONAL_DISABLE(VFPU_XFER);
1115
1116
int vs = _VS;
1117
int imm = op & 0x7F;
1118
if (imm < VFPU_CTRL_MAX) {
1119
fpr.MapRegV(vs);
1120
if (imm == VFPU_CTRL_CC) {
1121
gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY | MAP_NOINIT);
1122
fp.FMOV(gpr.R(MIPS_REG_VFPUCC), fpr.V(vs));
1123
} else {
1124
ADDI2R(SCRATCH1_64, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCH2);
1125
fp.STR(32, INDEX_UNSIGNED, fpr.V(vs), SCRATCH1_64, 0);
1126
}
1127
fpr.ReleaseSpillLocksAndDiscardTemps();
1128
1129
if (imm == VFPU_CTRL_SPREFIX) {
1130
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
1131
js.blockWrotePrefixes = true;
1132
} else if (imm == VFPU_CTRL_TPREFIX) {
1133
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
1134
js.blockWrotePrefixes = true;
1135
} else if (imm == VFPU_CTRL_DPREFIX) {
1136
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
1137
js.blockWrotePrefixes = true;
1138
}
1139
}
1140
}
1141
1142
void Arm64Jit::Comp_Vmmov(MIPSOpcode op) {
1143
CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
1144
if (!js.HasNoPrefix()) {
1145
DISABLE;
1146
}
1147
1148
if (_VS == _VD) {
1149
// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
1150
return;
1151
}
1152
1153
MatrixSize sz = GetMtxSize(op);
1154
int n = GetMatrixSide(sz);
1155
1156
u8 sregs[16], dregs[16];
1157
GetMatrixRegs(sregs, sz, _VS);
1158
GetMatrixRegs(dregs, sz, _VD);
1159
1160
switch (GetMatrixOverlap(_VS, _VD, sz)) {
1161
case OVERLAP_EQUAL:
1162
// In-place transpose
1163
DISABLE;
1164
case OVERLAP_PARTIAL:
1165
DISABLE;
1166
case OVERLAP_NONE:
1167
default:
1168
break;
1169
}
1170
1171
for (int a = 0; a < n; a++) {
1172
for (int b = 0; b < n; b++) {
1173
fpr.MapDirtyInV(dregs[a * 4 + b], sregs[a * 4 + b]);
1174
fp.FMOV(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[a * 4 + b]));
1175
}
1176
}
1177
fpr.ReleaseSpillLocksAndDiscardTemps();
1178
}
1179
1180
void Arm64Jit::Comp_VScl(MIPSOpcode op) {
1181
CONDITIONAL_DISABLE(VFPU_VEC);
1182
if (js.HasUnknownPrefix()) {
1183
DISABLE;
1184
}
1185
1186
VectorSize sz = GetVecSize(op);
1187
int n = GetNumVectorElements(sz);
1188
1189
u8 sregs[4], dregs[4], treg;
1190
GetVectorRegsPrefixS(sregs, sz, _VS);
1191
// TODO: Prefixes seem strange...
1192
GetVectorRegsPrefixT(&treg, V_Single, _VT);
1193
GetVectorRegsPrefixD(dregs, sz, _VD);
1194
1195
// Move to S0 early, so we don't have to worry about overlap with scale.
1196
fpr.LoadToRegV(S0, treg);
1197
1198
// For prefixes to work, we just have to ensure that none of the output registers spill
1199
// and that there's no overlap.
1200
MIPSReg tempregs[4];
1201
for (int i = 0; i < n; ++i) {
1202
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
1203
// Need to use temp regs
1204
tempregs[i] = fpr.GetTempV();
1205
} else {
1206
tempregs[i] = dregs[i];
1207
}
1208
}
1209
1210
// The meat of the function!
1211
for (int i = 0; i < n; i++) {
1212
fpr.MapDirtyInV(tempregs[i], sregs[i]);
1213
fp.FMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), S0);
1214
}
1215
1216
for (int i = 0; i < n; i++) {
1217
// All must be mapped for prefixes to work.
1218
if (dregs[i] != tempregs[i]) {
1219
fpr.MapDirtyInV(dregs[i], tempregs[i]);
1220
fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
1221
}
1222
}
1223
1224
ApplyPrefixD(dregs, sz);
1225
1226
fpr.ReleaseSpillLocksAndDiscardTemps();
1227
}
1228
1229
void Arm64Jit::Comp_Vmmul(MIPSOpcode op) {
1230
CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
1231
if (!js.HasNoPrefix()) {
1232
DISABLE;
1233
}
1234
1235
if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
1236
// Fall back to interpreter, which has the accurate implementation.
1237
// Later we might do something more optimized here.
1238
DISABLE;
1239
}
1240
1241
MatrixSize sz = GetMtxSize(op);
1242
int n = GetMatrixSide(sz);
1243
1244
u8 sregs[16], tregs[16], dregs[16];
1245
GetMatrixRegs(sregs, sz, _VS);
1246
GetMatrixRegs(tregs, sz, _VT);
1247
GetMatrixRegs(dregs, sz, _VD);
1248
1249
MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz);
1250
MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz);
1251
1252
if (soverlap || toverlap) {
1253
DISABLE;
1254
} else {
1255
for (int a = 0; a < n; a++) {
1256
for (int b = 0; b < n; b++) {
1257
fpr.MapDirtyInInV(dregs[a * 4 + b], sregs[b * 4], tregs[a * 4], true);
1258
fp.FMUL(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[b * 4]), fpr.V(tregs[a * 4]));
1259
for (int c = 1; c < n; c++) {
1260
fpr.MapDirtyInInV(dregs[a * 4 + b], sregs[b * 4 + c], tregs[a * 4 + c], false);
1261
fp.FMUL(S0, fpr.V(sregs[b * 4 + c]), fpr.V(tregs[a * 4 + c]));
1262
fp.FADD(fpr.V(dregs[a * 4 + b]), fpr.V(dregs[a * 4 + b]), S0);
1263
}
1264
}
1265
}
1266
fpr.ReleaseSpillLocksAndDiscardTemps();
1267
}
1268
}
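// The index arithmetic above computes, per output element, a dot product of a row of S with a
// row of T, accumulating left to right. A scalar sketch of the same order; the helper name and
// the 4x4 backing layout (mirroring the *regs[x * 4 + y] register indexing) are illustrative
// assumptions only:
static void VmmulSketch(float d[4][4], const float s[4][4], const float t[4][4], int n) {
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			float sum = s[b][0] * t[a][0];
			for (int c = 1; c < n; c++)
				sum += s[b][c] * t[a][c];
			d[a][b] = sum;
		}
	}
}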
1269
1270
void Arm64Jit::Comp_Vmscl(MIPSOpcode op) {
1271
DISABLE;
1272
}
1273
1274
void Arm64Jit::Comp_Vtfm(MIPSOpcode op) {
1275
CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
1276
if (!js.HasNoPrefix()) {
1277
DISABLE;
1278
}
1279
1280
VectorSize sz = GetVecSize(op);
1281
MatrixSize msz = GetMtxSize(op);
1282
int n = GetNumVectorElements(sz);
1283
int ins = (op >> 23) & 7;
1284
1285
bool homogenous = false;
1286
if (n == ins) {
1287
n++;
1288
sz = (VectorSize)((int)(sz)+1);
1289
msz = (MatrixSize)((int)(msz)+1);
1290
homogenous = true;
1291
}
1292
// Otherwise, n should already be ins + 1.
1293
else if (n != ins + 1) {
1294
DISABLE;
1295
}
1296
1297
u8 sregs[16], dregs[4], tregs[4];
1298
GetMatrixRegs(sregs, msz, _VS);
1299
GetVectorRegs(tregs, sz, _VT);
1300
GetVectorRegs(dregs, sz, _VD);
1301
1302
MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, msz);
1303
MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, msz);
1304
1305
int tempregs[4];
1306
for (int i = 0; i < n; i++) {
1307
if (soverlap || toverlap) {
1308
tempregs[i] = fpr.GetTempV();
1309
} else {
1310
tempregs[i] = dregs[i];
1311
}
1312
fpr.SpillLockV(tempregs[i]);
1313
}
1314
for (int i = 0; i < n; i++) {
1315
fpr.MapRegV(tempregs[i], MAP_NOINIT);
1316
fpr.MapInInV(sregs[i * 4], tregs[0]);
1317
fp.FMUL(fpr.V(tempregs[i]), fpr.V(sregs[i * 4]), fpr.V(tregs[0]));
1318
for (int k = 1; k < n; k++) {
1319
if (!homogenous || k != n - 1) {
1320
fpr.MapInInV(sregs[i * 4 + k], tregs[k]);
1321
fp.FMADD(fpr.V(tempregs[i]), fpr.V(sregs[i * 4 + k]), fpr.V(tregs[k]), fpr.V(tempregs[i]));
1322
} else {
1323
fpr.MapRegV(sregs[i * 4 + k]);
1324
fp.FADD(fpr.V(tempregs[i]), fpr.V(tempregs[i]), fpr.V(sregs[i * 4 + k]));
1325
}
1326
}
1327
}
1328
for (int i = 0; i < n; i++) {
1329
u8 temp = tempregs[i];
1330
if (temp != dregs[i]) {
1331
fpr.MapDirtyInV(dregs[i], temp, true);
1332
fp.FMOV(fpr.V(dregs[i]), fpr.V(temp));
1333
}
1334
}
1335
1336
fpr.ReleaseSpillLocksAndDiscardTemps();
1337
}
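// Scalar sketch of the transform emitted above: each output lane is a dot product of a matrix
// row with the vector, and in the homogeneous case the last vector lane is treated as 1.0 so
// the matrix element is simply added (hypothetical helper, illustrative only):
static void VtfmSketch(float *d, const float s[4][4], const float *t, int n, bool homogenous) {
	for (int i = 0; i < n; i++) {
		float sum = s[i][0] * t[0];
		for (int k = 1; k < n; k++)
			sum += (homogenous && k == n - 1) ? s[i][k] : s[i][k] * t[k];
		d[i] = sum;
	}
}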
1338
1339
void Arm64Jit::Comp_VCrs(MIPSOpcode op) {
1340
DISABLE;
1341
}
1342
1343
void Arm64Jit::Comp_VDet(MIPSOpcode op) {
1344
DISABLE;
1345
}
1346
1347
void Arm64Jit::Comp_Vi2x(MIPSOpcode op) {
1348
CONDITIONAL_DISABLE(VFPU_VEC);
1349
if (js.HasUnknownPrefix())
1350
DISABLE;
1351
1352
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
1353
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
1354
1355
// These instructions pack pairs or quads of integers into 32 bits.
1356
// The unsigned (u) versions skip the sign bit when packing.
1357
VectorSize sz = GetVecSize(op);
1358
VectorSize outsize;
1359
if (bits == 8) {
1360
outsize = V_Single;
1361
if (sz != V_Quad) {
1362
DISABLE;
1363
}
1364
} else {
1365
switch (sz) {
1366
case V_Pair:
1367
outsize = V_Single;
1368
break;
1369
case V_Quad:
1370
outsize = V_Pair;
1371
break;
1372
default:
1373
DISABLE;
1374
}
1375
}
1376
1377
u8 sregs[4], dregs[4];
1378
GetVectorRegsPrefixS(sregs, sz, _VS);
1379
GetVectorRegsPrefixD(dregs, outsize, _VD);
1380
1381
int n = GetNumVectorElements(sz);
1382
int nOut = GetNumVectorElements(outsize);
1383
1384
// Take the single registers and combine them to a D or Q register.
1385
for (int i = 0; i < n; i++) {
1386
fpr.MapRegV(sregs[i], sz);
1387
fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);
1388
}
1389
1390
if (unsignedOp) {
1391
// What's the best way to zero a Q reg?
1392
fp.EOR(Q1, Q1, Q1);
1393
fp.SMAX(32, Q0, Q0, Q1);
1394
}
1395
1396
// At this point, we simply need to collect the high bits of each 32-bit lane into one register.
1397
if (bits == 8) {
1398
// Really want to do a SHRN(..., 23/24) but that can't be encoded. So we synthesize it.
1399
fp.USHR(32, Q0, Q0, 16);
1400
fp.SHRN(16, D0, Q0, unsignedOp ? 7 : 8);
1401
fp.XTN(8, D0, Q0);
1402
} else {
1403
fp.SHRN(16, D0, Q0, unsignedOp ? 15 : 16);
1404
}
1405
1406
// Split apart again.
1407
for (int i = 0; i < nOut; i++) {
1408
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1409
fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);
1410
}
1411
1412
ApplyPrefixD(dregs, outsize);
1413
fpr.ReleaseSpillLocksAndDiscardTemps();
1414
}
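// Per-lane scalar sketch of the 16-bit narrowing performed above: vi2s keeps the top 16 bits
// of each lane, while vi2us first clamps negative lanes to zero (the SMAX against a zeroed Q1)
// and then shifts by 15. The 8-bit vi2c/vi2uc variants do the same with 24/23-bit shifts.
// Hypothetical helper, illustrative only; assumes <cstdint>:
static uint16_t Vi2sLaneSketch(int32_t s, bool unsignedOp) {
	if (unsignedOp && s < 0)
		s = 0;
	return (uint16_t)((uint32_t)s >> (unsignedOp ? 15 : 16));
}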
1415
1416
void Arm64Jit::Comp_Vx2i(MIPSOpcode op) {
1417
CONDITIONAL_DISABLE(VFPU_VEC);
1418
if (js.HasUnknownPrefix())
1419
DISABLE;
1420
1421
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
1422
bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)
1423
1424
// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
1425
// at the top. vus2i shifts it an extra bit right afterward.
1426
// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
1427
// at the top too. vuc2i is a bit special (see below.)
1428
// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
1429
// then use it for both.
1430
1431
VectorSize sz = GetVecSize(op);
1432
VectorSize outsize;
1433
if (bits == 8) {
1434
outsize = V_Quad;
1435
} else {
1436
switch (sz) {
1437
case V_Single:
1438
outsize = V_Pair;
1439
break;
1440
case V_Pair:
1441
outsize = V_Quad;
1442
break;
1443
default:
1444
DISABLE;
1445
}
1446
}
1447
1448
u8 sregs[4], dregs[4];
1449
GetVectorRegsPrefixS(sregs, sz, _VS);
1450
GetVectorRegsPrefixD(dregs, outsize, _VD);
1451
1452
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
1453
int n = 1;
1454
if (sz == V_Single) {
1455
n = 1;
1456
} else if (sz == V_Pair) {
1457
n = 2;
1458
} else if (bits == 8) {
1459
n = 1;
1460
}
1461
1462
// Take the single registers and combine them to a D or Q register.
1463
for (int i = 0; i < n; i++) {
1464
fpr.MapRegV(sregs[i], sz);
1465
fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);
1466
}
1467
1468
if (bits == 16) {
1469
// Simply expand, to upper bits.
1470
// Hm, can't find a USHLL equivalent that works with shift == size?
1471
fp.UXTL(16, Q0, D0);
1472
fp.SHL(32, Q0, Q0, 16);
1473
} else if (bits == 8) {
1474
fp.UXTL(8, Q0, D0);
1475
fp.UXTL(16, Q0, D0);
1476
fp.SHL(32, Q0, D0, 24);
1477
if (unsignedOp) {
1478
// vuc2i is a bit special. It spreads out the bits like this:
1479
// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
1480
fp.USHR(32, Q1, Q0, 8);
1481
fp.ORR(Q0, Q0, Q1);
1482
fp.USHR(32, Q1, Q0, 16);
1483
fp.ORR(Q0, Q0, Q1);
1484
}
1485
}
1486
1487
// At this point we have the regs in the 4 lanes.
1488
// In the "u" mode, we need to shift it out of the sign bit.
1489
if (unsignedOp) {
1490
Arm64Gen::ARM64Reg reg = (outsize == V_Quad) ? Q0 : D0;
1491
fp.USHR(32, reg, reg, 1);
1492
}
1493
1494
fpr.MapRegsAndSpillLockV(dregs, outsize, MAP_NOINIT);
1495
1496
int nOut = 2;
1497
if (outsize == V_Quad)
1498
nOut = 4;
1499
1500
// Split apart again.
1501
for (int i = 0; i < nOut; i++) {
1502
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1503
fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);
1504
}
1505
1506
ApplyPrefixD(dregs, outsize);
1507
fpr.ReleaseSpillLocksAndDiscardTemps();
1508
}
1509
1510
void Arm64Jit::Comp_VCrossQuat(MIPSOpcode op) {
1511
// This op does not support prefixes anyway.
1512
CONDITIONAL_DISABLE(VFPU_VEC);
1513
if (!js.HasNoPrefix())
1514
DISABLE;
1515
1516
VectorSize sz = GetVecSize(op);
1517
int n = GetNumVectorElements(sz);
1518
1519
u8 sregs[4], tregs[4], dregs[4];
1520
GetVectorRegs(sregs, sz, _VS);
1521
GetVectorRegs(tregs, sz, _VT);
1522
GetVectorRegs(dregs, sz, _VD);
1523
1524
// Map everything into registers.
1525
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
1526
fpr.MapRegsAndSpillLockV(tregs, sz, 0);
1527
1528
if (sz == V_Triple) {
1529
MIPSReg temp3 = fpr.GetTempV();
1530
MIPSReg temp4 = fpr.GetTempV();
1531
fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
1532
fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);
1533
// Cross product vcrsp.t
1534
1535
// Note: using FMSUB here causes accuracy issues, see #18203.
1536
// Compute X: s[1] * t[2] - s[2] * t[1]
1537
fp.FMUL(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[2]));
1538
fp.FMUL(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[1]));
1539
fp.FSUB(S0, fpr.V(temp3), fpr.V(temp4));
1540
1541
// Compute Y: s[2] * t[0] - s[0] * t[2]
1542
fp.FMUL(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[0]));
1543
fp.FMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[2]));
1544
fp.FSUB(S1, fpr.V(temp3), fpr.V(temp4));
1545
1546
// Compute Z: s[0] * t[1] - s[1] * t[0]
1547
fp.FMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
1548
fp.FMUL(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[0]));
1549
fp.FSUB(fpr.V(temp3), fpr.V(temp3), fpr.V(temp4));
1550
1551
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
1552
fp.FMOV(fpr.V(dregs[0]), S0);
1553
fp.FMOV(fpr.V(dregs[1]), S1);
1554
fp.FMOV(fpr.V(dregs[2]), fpr.V(temp3));
1555
} else if (sz == V_Quad) {
1556
MIPSReg temp3 = fpr.GetTempV();
1557
MIPSReg temp4 = fpr.GetTempV();
1558
fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
1559
fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);
1560
1561
// Quaternion product vqmul.q untested
1562
// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];
1563
fp.FMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[3]));
1564
fp.FMADD(S0, fpr.V(sregs[1]), fpr.V(tregs[2]), S0);
1565
fp.FMSUB(S0, fpr.V(sregs[2]), fpr.V(tregs[1]), S0);
1566
fp.FMADD(S0, fpr.V(sregs[3]), fpr.V(tregs[0]), S0);
1567
1568
//d[1] = -s[0] * t[2] + s[1] * t[3] + s[2] * t[0] + s[3] * t[1];
1569
fp.FNMUL(S1, fpr.V(sregs[0]), fpr.V(tregs[2]));
1570
fp.FMADD(S1, fpr.V(sregs[1]), fpr.V(tregs[3]), S1);
1571
fp.FMADD(S1, fpr.V(sregs[2]), fpr.V(tregs[0]), S1);
1572
fp.FMADD(S1, fpr.V(sregs[3]), fpr.V(tregs[1]), S1);
1573
1574
//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];
1575
fp.FMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
1576
fp.FMSUB(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]), fpr.V(temp3));
1577
fp.FMADD(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[3]), fpr.V(temp3));
1578
fp.FMADD(fpr.V(temp3), fpr.V(sregs[3]), fpr.V(tregs[2]), fpr.V(temp3));
1579
1580
//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];
1581
fp.FNMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[0]));
1582
fp.FMSUB(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[1]), fpr.V(temp4));
1583
fp.FMSUB(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[2]), fpr.V(temp4));
1584
fp.FMADD(fpr.V(temp4), fpr.V(sregs[3]), fpr.V(tregs[3]), fpr.V(temp4));
1585
1586
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
1587
fp.FMOV(fpr.V(dregs[0]), S0);
1588
fp.FMOV(fpr.V(dregs[1]), S1);
1589
fp.FMOV(fpr.V(dregs[2]), fpr.V(temp3));
1590
fp.FMOV(fpr.V(dregs[3]), fpr.V(temp4));
1591
}
1592
1593
fpr.ReleaseSpillLocksAndDiscardTemps();
1594
}
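// Scalar reference for the vcrsp.t path above. The multiplies and the subtract are kept as
// separate operations rather than fused, matching the FMSUB accuracy note (#18203); a sketch
// of the same ordering (hypothetical helper, illustrative only):
static void CrossProductSketch(float d[3], const float s[3], const float t[3]) {
	d[0] = s[1] * t[2] - s[2] * t[1];
	d[1] = s[2] * t[0] - s[0] * t[2];
	d[2] = s[0] * t[1] - s[1] * t[0];
}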
1595
1596
void Arm64Jit::Comp_Vcmp(MIPSOpcode op) {
1597
CONDITIONAL_DISABLE(VFPU_COMP);
1598
if (js.HasUnknownPrefix())
1599
DISABLE;
1600
1601
VectorSize sz = GetVecSize(op);
1602
int n = GetNumVectorElements(sz);
1603
1604
VCondition cond = (VCondition)(op & 0xF);
1605
1606
u8 sregs[4], tregs[4];
1607
GetVectorRegsPrefixS(sregs, sz, _VS);
1608
GetVectorRegsPrefixT(tregs, sz, _VT);
1609
1610
// Some, we just fall back to the interpreter.
1611
// ES is just really equivalent to (value & 0x7F800000) == 0x7F800000.
1612
1613
switch (cond) {
1614
case VC_EI: // c = my_isinf(s[i]); break;
1615
case VC_NI: // c = !my_isinf(s[i]); break;
1616
DISABLE;
1617
case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection
1618
case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
1619
case VC_EN: // c = my_isnan(s[i]); break;
1620
case VC_NN: // c = !my_isnan(s[i]); break;
1621
if (_VS != _VT)
1622
DISABLE;
1623
break;
1624
1625
case VC_EZ:
1626
case VC_NZ:
1627
break;
1628
default:
1629
;
1630
}
1631
1632
// First, let's get the trivial ones.
1633
int affected_bits = (1 << 4) | (1 << 5); // 4 and 5
1634
1635
MOVI2R(SCRATCH1, 0);
1636
for (int i = 0; i < n; ++i) {
1637
// Let's only handle the easy ones, and fall back on the interpreter for the rest.
1638
CCFlags flag = CC_AL;
1639
switch (cond) {
1640
case VC_FL: // c = 0;
1641
break;
1642
1643
case VC_TR: // c = 1
1644
if (i == 0) {
1645
if (n == 1) {
1646
MOVI2R(SCRATCH1, 0x31);
1647
} else {
1648
MOVI2R(SCRATCH1, 1ULL << i);
1649
}
1650
} else {
1651
ORRI2R(SCRATCH1, SCRATCH1, 1ULL << i);
1652
}
1653
break;
1654
1655
case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection
1656
case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;
1657
// For these, we use the integer ALU as there is no support on ARM for testing for INF.
1658
// Testing for nan or inf is the same as testing for &= 0x7F800000 == 0x7F800000.
1659
// We need an extra temporary register so we store away SCRATCH1.
1660
STR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
1661
fpr.MapRegV(sregs[i], 0);
1662
MOVI2R(SCRATCH1, 0x7F800000);
1663
fp.FMOV(SCRATCH2, fpr.V(sregs[i]));
1664
AND(SCRATCH2, SCRATCH2, SCRATCH1);
1665
CMP(SCRATCH2, SCRATCH1); // (SCRATCH2 & 0x7F800000) == 0x7F800000
1666
flag = cond == VC_ES ? CC_EQ : CC_NEQ;
1667
LDR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
1668
break;
1669
1670
case VC_EN: // c = my_isnan(s[i]); break; // Tekken 6
1671
// Should we involve T? Where I found this used, it compared a register with itself so should be fine.
1672
fpr.MapInInV(sregs[i], tregs[i]);
1673
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1674
flag = CC_VS; // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
1675
break;
1676
1677
case VC_NN: // c = !my_isnan(s[i]); break;
1678
// Should we involve T? Where I found this used, it compared a register with itself so should be fine.
1679
fpr.MapInInV(sregs[i], tregs[i]);
1680
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1681
flag = CC_VC; // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
1682
break;
1683
1684
case VC_EQ: // c = s[i] == t[i]
1685
fpr.MapInInV(sregs[i], tregs[i]);
1686
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1687
flag = CC_EQ;
1688
break;
1689
1690
case VC_LT: // c = s[i] < t[i]
1691
fpr.MapInInV(sregs[i], tregs[i]);
1692
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1693
flag = CC_LO;
1694
break;
1695
1696
case VC_LE: // c = s[i] <= t[i];
1697
fpr.MapInInV(sregs[i], tregs[i]);
1698
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1699
flag = CC_LS;
1700
break;
1701
1702
case VC_NE: // c = s[i] != t[i]
1703
fpr.MapInInV(sregs[i], tregs[i]);
1704
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1705
flag = CC_NEQ;
1706
break;
1707
1708
case VC_GE: // c = s[i] >= t[i]
1709
fpr.MapInInV(sregs[i], tregs[i]);
1710
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1711
flag = CC_GE;
1712
break;
1713
1714
case VC_GT: // c = s[i] > t[i]
1715
fpr.MapInInV(sregs[i], tregs[i]);
1716
fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
1717
flag = CC_GT;
1718
break;
1719
1720
case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f
1721
fpr.MapRegV(sregs[i]);
1722
fp.FCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
1723
flag = CC_EQ;
1724
break;
1725
1726
case VC_NZ: // c = s[i] != 0
1727
fpr.MapRegV(sregs[i]);
1728
fp.FCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
1729
flag = CC_NEQ;
1730
break;
1731
1732
default:
1733
DISABLE;
1734
}
1735
if (flag != CC_AL) {
1736
FixupBranch b = B(InvertCond(flag));
1737
if (i == 0) {
1738
if (n == 1) {
1739
MOVI2R(SCRATCH1, 0x31);
1740
} else {
1741
MOVI2R(SCRATCH1, 1); // 1 << i, but i == 0
1742
}
1743
} else {
1744
ORRI2R(SCRATCH1, SCRATCH1, 1ULL << i);
1745
}
1746
SetJumpTarget(b);
1747
}
1748
1749
affected_bits |= 1 << i;
1750
}
1751
1752
// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison, which is the most common
1753
// after all.
1754
if (n > 1) {
1755
CMP(SCRATCH1, affected_bits & 0xF);
1756
FixupBranch skip1 = B(CC_NEQ);
1757
ORRI2R(SCRATCH1, SCRATCH1, 1 << 5);
1758
SetJumpTarget(skip1);
1759
1760
CMP(SCRATCH1, 0);
1761
FixupBranch skip2 = B(CC_EQ);
1762
ORRI2R(SCRATCH1, SCRATCH1, 1 << 4);
1763
SetJumpTarget(skip2);
1764
}
1765
1766
gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);
1767
ANDI2R(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), ~affected_bits, SCRATCH2);
1768
ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCH1);
1769
1770
fpr.ReleaseSpillLocksAndDiscardTemps();
1771
}
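// The aggregation above builds the VFPU condition code as one bit per lane plus two summary
// bits: bit 4 is set if any lane compared true, bit 5 if all lanes did. A scalar sketch of the
// same packing (hypothetical helper, illustrative only):
static u32 VcmpCCBitsSketch(const bool c[4], int n) {
	u32 cc = 0;
	bool any = false, all = true;
	for (int i = 0; i < n; i++) {
		if (c[i])
			cc |= 1u << i;
		any = any || c[i];
		all = all && c[i];
	}
	if (any)
		cc |= 1u << 4;
	if (all)
		cc |= 1u << 5;
	return cc;
}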
1772
1773
void Arm64Jit::Comp_Vcmov(MIPSOpcode op) {
1774
CONDITIONAL_DISABLE(VFPU_COMP);
1775
if (js.HasUnknownPrefix()) {
1776
DISABLE;
1777
}
1778
1779
VectorSize sz = GetVecSize(op);
1780
int n = GetNumVectorElements(sz);
1781
1782
u8 sregs[4], dregs[4];
1783
GetVectorRegsPrefixS(sregs, sz, _VS);
1784
GetVectorRegsPrefixD(dregs, sz, _VD);
1785
int tf = (op >> 19) & 1;
1786
int imm3 = (op >> 16) & 7;
1787
1788
for (int i = 0; i < n; ++i) {
1789
// Simplification: Disable if overlap unsafe
1790
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
1791
DISABLE;
1792
}
1793
}
1794
1795
if (imm3 < 6) {
1796
// Test one bit of CC. This bit decides whether none or all subregisters are copied.
1797
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
1798
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
1799
gpr.MapReg(MIPS_REG_VFPUCC);
1800
TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1ULL << imm3);
1801
// TODO: Use fsel?
1802
FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
1803
for (int i = 0; i < n; i++) {
1804
fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
1805
}
1806
SetJumpTarget(b);
1807
} else {
1808
// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
1809
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
1810
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
1811
gpr.MapReg(MIPS_REG_VFPUCC);
1812
for (int i = 0; i < n; i++) {
1813
TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1ULL << i);
1814
FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
1815
fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
1816
SetJumpTarget(b);
1817
}
1818
}
1819
1820
ApplyPrefixD(dregs, sz);
1821
fpr.ReleaseSpillLocksAndDiscardTemps();
1822
}
1823
1824
void Arm64Jit::Comp_Viim(MIPSOpcode op) {
1825
CONDITIONAL_DISABLE(VFPU_XFER);
1826
if (js.HasUnknownPrefix()) {
1827
DISABLE;
1828
}
1829
1830
u8 dreg;
1831
GetVectorRegs(&dreg, V_Single, _VT);
1832
1833
s32 imm = SignExtend16ToS32(op);
1834
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
1835
fp.MOVI2F(fpr.V(dreg), (float)imm, SCRATCH1);
1836
1837
ApplyPrefixD(&dreg, V_Single);
1838
fpr.ReleaseSpillLocksAndDiscardTemps();
1839
}
1840
1841
void Arm64Jit::Comp_Vfim(MIPSOpcode op) {
1842
CONDITIONAL_DISABLE(VFPU_XFER);
1843
if (js.HasUnknownPrefix()) {
1844
DISABLE;
1845
}
1846
1847
u8 dreg;
1848
GetVectorRegs(&dreg, V_Single, _VT);
1849
1850
FP16 half;
1851
half.u = op & 0xFFFF;
1852
FP32 fval = half_to_float_fast5(half);
1853
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
1854
fp.MOVI2F(fpr.V(dreg), fval.f, SCRATCH1);
1855
1856
ApplyPrefixD(&dreg, V_Single);
1857
fpr.ReleaseSpillLocksAndDiscardTemps();
1858
}
1859
1860
void Arm64Jit::Comp_Vcst(MIPSOpcode op) {
1861
CONDITIONAL_DISABLE(VFPU_XFER);
1862
if (js.HasUnknownPrefix()) {
1863
DISABLE;
1864
}
1865
1866
int conNum = (op >> 16) & 0x1f;
1867
int vd = _VD;
1868
1869
VectorSize sz = GetVecSize(op);
1870
int n = GetNumVectorElements(sz);
1871
1872
u8 dregs[4];
1873
GetVectorRegsPrefixD(dregs, sz, _VD);
1874
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1875
1876
MOVP2R(SCRATCH1_64, (void *)&cst_constants[conNum]);
1877
fp.LDR(32, INDEX_UNSIGNED, S0, SCRATCH1_64, 0);
1878
for (int i = 0; i < n; ++i)
1879
fp.FMOV(fpr.V(dregs[i]), S0);
1880
1881
ApplyPrefixD(dregs, sz);
1882
fpr.ReleaseSpillLocksAndDiscardTemps();
1883
}
1884
1885
static double SinCos(float angle) {
1886
union { struct { float sin; float cos; }; double out; } sincos;
1887
vfpu_sincos(angle, sincos.sin, sincos.cos);
1888
return sincos.out;
1889
}
1890
1891
static double SinCosNegSin(float angle) {
1892
union { struct { float sin; float cos; }; double out; } sincos;
1893
vfpu_sincos(angle, sincos.sin, sincos.cos);
1894
sincos.sin = -sincos.sin;
1895
return sincos.out;
1896
}
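// SinCos/SinCosNegSin pack both results into one double so the call returns them in a single
// FP register (D0) under the AArch64 calling convention; Comp_VRot then splits the lanes with
// an INS. A scalar sketch of unpacking the pair, assuming a little-endian layout and <cstring>
// (hypothetical helper, illustrative only):
static void UnpackSinCosSketch(double packed, float *sinOut, float *cosOut) {
	float parts[2];
	memcpy(parts, &packed, sizeof(parts));
	*sinOut = parts[0];  // low 32 bits, lane 0 of D0 (S0)
	*cosOut = parts[1];  // high 32 bits, lane 1 (moved to S1 in Comp_VRot)
}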
1897
1898
void Arm64Jit::CompVrotShuffle(u8 *dregs, int imm, VectorSize sz, bool negSin) {
1899
int n = GetNumVectorElements(sz);
1900
char what[4] = { '0', '0', '0', '0' };
1901
if (((imm >> 2) & 3) == (imm & 3)) {
1902
for (int i = 0; i < 4; i++)
1903
what[i] = 'S';
1904
}
1905
what[(imm >> 2) & 3] = 'S';
1906
what[imm & 3] = 'C';
1907
1908
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY | MAP_NOINIT);
1909
for (int i = 0; i < n; i++) {
1910
switch (what[i]) {
1911
case 'C': fp.FMOV(fpr.V(dregs[i]), S1); break;
1912
case 'S': if (negSin) fp.FNEG(fpr.V(dregs[i]), S0); else fp.FMOV(fpr.V(dregs[i]), S0); break;
1913
case '0':
1914
{
1915
fp.MOVI2F(fpr.V(dregs[i]), 0.0f);
1916
break;
1917
}
1918
default:
1919
ERROR_LOG(Log::JIT, "Bad what in vrot");
1920
break;
1921
}
1922
}
1923
}
1924
1925
// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
1926
// calling the math library.
1927
void Arm64Jit::Comp_VRot(MIPSOpcode op) {
1928
// VRot probably doesn't accept prefixes anyway.
1929
CONDITIONAL_DISABLE(VFPU_VEC);
1930
if (js.HasUnknownPrefix()) {
1931
DISABLE;
1932
}
1933
1934
int vd = _VD;
1935
int vs = _VS;
1936
1937
VectorSize sz = GetVecSize(op);
1938
int n = GetNumVectorElements(sz);
1939
1940
u8 dregs[4];
1941
u8 dregs2[4];
1942
1943
MIPSOpcode nextOp = GetOffsetInstruction(1);
1944
int vd2 = -1;
1945
int imm2 = -1;
1946
if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
1947
// Pair of vrot. Let's join them.
1948
vd2 = MIPS_GET_VD(nextOp);
1949
imm2 = (nextOp >> 16) & 0x1f;
1950
// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
1951
}
1952
u8 sreg;
1953
GetVectorRegs(dregs, sz, vd);
1954
if (vd2 >= 0)
1955
GetVectorRegs(dregs2, sz, vd2);
1956
GetVectorRegs(&sreg, V_Single, vs);
1957
1958
int imm = (op >> 16) & 0x1f;
1959
1960
gpr.FlushBeforeCall();
1961
fpr.FlushAll();
1962
1963
// Don't need to SaveStaticRegs here as long as they are all in callee-save regs - this callee won't read them.
1964
1965
bool negSin1 = (imm & 0x10) ? true : false;
1966
1967
fpr.MapRegV(sreg);
1968
fp.FMOV(S0, fpr.V(sreg));
1969
QuickCallFunction(SCRATCH2_64, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);
1970
// Here, sin and cos are stored together in Q0.d. On ARM32 we could use it directly
1971
// but with ARM64's register organization, we need to split it up.
1972
fp.INS(32, Q1, 0, Q0, 1);
1973
1974
CompVrotShuffle(dregs, imm, sz, false);
1975
if (vd2 != -1) {
1976
// If the negsin setting differs between the two joint invocations, we need to flip the second one.
1977
bool negSin2 = (imm2 & 0x10) ? true : false;
1978
CompVrotShuffle(dregs2, imm2, sz, negSin1 != negSin2);
1979
EatInstruction(nextOp);
1980
}
1981
1982
fpr.ReleaseSpillLocksAndDiscardTemps();
1983
}
1984
1985
void Arm64Jit::Comp_Vsgn(MIPSOpcode op) {
1986
DISABLE;
1987
}
1988
1989
void Arm64Jit::Comp_Vocp(MIPSOpcode op) {
1990
CONDITIONAL_DISABLE(VFPU_VEC);
1991
if (js.HasUnknownPrefix()) {
1992
DISABLE;
1993
}
1994
1995
VectorSize sz = GetVecSize(op);
1996
int n = GetNumVectorElements(sz);
1997
1998
// This is a hack that modifies prefixes. We eat them later, so just overwrite.
1999
// S prefix forces the negate flags.
2000
js.prefixS |= 0x000F0000;
2001
// T prefix forces constants on and regnum to 1.
2002
// That means negate still works, and abs activates a different constant.
2003
js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;
2004
2005
u8 sregs[4], tregs[4], dregs[4];
2006
GetVectorRegsPrefixS(sregs, sz, _VS);
2007
GetVectorRegsPrefixT(tregs, sz, _VS);
2008
GetVectorRegsPrefixD(dregs, sz, _VD);
2009
2010
MIPSReg tempregs[4];
2011
for (int i = 0; i < n; ++i) {
2012
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
2013
tempregs[i] = fpr.GetTempV();
2014
} else {
2015
tempregs[i] = dregs[i];
2016
}
2017
}
2018
2019
fp.MOVI2F(S0, 1.0f, SCRATCH1);
2020
for (int i = 0; i < n; ++i) {
2021
fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
2022
fp.FADD(fpr.V(tempregs[i]), fpr.V(tregs[i]), fpr.V(sregs[i]));
2023
}
2024
2025
for (int i = 0; i < n; ++i) {
2026
if (dregs[i] != tempregs[i]) {
2027
fpr.MapDirtyInV(dregs[i], tempregs[i]);
2028
fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
2029
}
2030
}
2031
2032
ApplyPrefixD(dregs, sz);
2033
2034
fpr.ReleaseSpillLocksAndDiscardTemps();
2035
}
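// Net scalar effect of the prefix rewrite above: the S prefix forces negation and the T prefix
// forces the constant 1, so the FADD computes d[i] = 1.0f + (-s[i]). A one-line sketch
// (hypothetical helper, illustrative only):
static float VocpSketch(float s) {
	return 1.0f - s;
}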
2036
2037
void Arm64Jit::Comp_ColorConv(MIPSOpcode op) {
2038
DISABLE;
2039
}
2040
2041
void Arm64Jit::Comp_Vbfy(MIPSOpcode op) {
2042
DISABLE;
2043
}
2044
}
2045
2046
#endif // PPSSPP_ARCH(ARM64)
2047
2048