// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(ARM)

#include <cmath>
#include "Common/CPUDetect.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Math/math_util.h"

#include "Core/Compatibility.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "Core/Reporting.h"
#include "Core/System.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/MIPSCodeUtils.h"

#include "Core/MIPS/ARM/ArmJit.h"
#include "Core/MIPS/ARM/ArmRegCache.h"

// Cool NEON references:
// http://www.delmarnorth.com/microwave/requirements/neon-test-tutorial.pdf

// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non-working ones should have DISABLE.

// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }
#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }
#define DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }

#define _RS MIPS_GET_RS(op)
#define _RT MIPS_GET_RT(op)
#define _RD MIPS_GET_RD(op)
#define _FS MIPS_GET_FS(op)
#define _FT MIPS_GET_FT(op)
#define _FD MIPS_GET_FD(op)
#define _SA MIPS_GET_SA(op)
#define _POS ((op>> 6) & 0x1F)
#define _SIZE ((op>>11) & 0x1F)
#define _IMM16 (signed short)(op & 0xFFFF)
#define _IMM26 (op & 0x03FFFFFF)

namespace MIPSComp
{
using namespace ArmGen;
using namespace ArmJitConstants;

// Vector regs can overlap in all sorts of swizzled ways.
// This does allow a single overlap in sregs[i].
static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)
{
	for (int i = 0; i < sn; ++i)
	{
		if (sregs[i] == dreg && i != di)
			return false;
	}
	for (int i = 0; i < tn; ++i)
	{
		if (tregs[i] == dreg)
			return false;
	}

	// Hurray, no overlap, we can write directly.
	return true;
}

static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)
{
	return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;
}
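
// Example (illustrative annotation, not from the original source): for an op like
// "vadd.q C000, C000, C010", dregs and sregs name the same registers, so
// IsOverlapSafe(dregs[i], i, n, sregs, n, tregs) fails for every lane and results are
// staged in temp regs. IsOverlapSafeAllowS would accept a lane whose only overlap is
// sregs[i] itself, which is fine for ops that read s[i] before writing d[i].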

void ArmJit::Comp_VPFX(MIPSOpcode op)
{
	CONDITIONAL_DISABLE(VFPU_XFER);
	int data = op & 0xFFFFF;
	int regnum = (op >> 24) & 3;
	switch (regnum) {
	case 0: // S
		js.prefixS = data;
		js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 1: // T
		js.prefixT = data;
		js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 2: // D
		js.prefixD = data & 0x00000FFF;
		js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	default:
		ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);
		break;
	}
}
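
// Annotation: layout of the 20-bit S/T prefix word, as decoded by ApplyPrefixST below
// (summarized from the bit tests in that function):
//   bits  0-7  : source lane per output lane i, 2 bits each at position 2*i
//                (or a constant index when the constant flag is set)
//   bits  8-11 : abs flag per lane (selects the upper half of the constant table
//                when the constant flag is set)
//   bits 12-15 : constant flag per lane
//   bits 16-19 : negate flag per lane
// 0xE4 is 11 10 01 00 in binary, i.e. lanes read sources 0,1,2,3 in order - the
// identity swizzle - which is why it is treated as a no-op.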

void ArmJit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
	if (prefix == 0xE4)
		return;

	int n = GetNumVectorElements(sz);
	u8 origV[4];
	static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};

	for (int i = 0; i < n; i++)
		origV[i] = vregs[i];

	for (int i = 0; i < n; i++) {
		int regnum = (prefix >> (i*2)) & 3;
		int abs = (prefix >> (8+i)) & 1;
		int negate = (prefix >> (16+i)) & 1;
		int constants = (prefix >> (12+i)) & 1;

		// Unchanged, hurray.
		if (!constants && regnum == i && !abs && !negate)
			continue;

		// This puts the value into a temp reg, so we won't write the modified value back.
		vregs[i] = fpr.GetTempV();
		if (!constants) {
			fpr.MapDirtyInV(vregs[i], origV[regnum]);
			fpr.SpillLockV(vregs[i]);

			// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
			// TODO: But some ops seem to use const 0 instead?
			if (regnum >= n) {
				WARN_LOG(Log::CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, GetCompilerPC(), MIPSDisasmAt(GetCompilerPC()).c_str());
				regnum = 0;
			}

			if (abs) {
				VABS(fpr.V(vregs[i]), fpr.V(origV[regnum]));
				if (negate)
					VNEG(fpr.V(vregs[i]), fpr.V(vregs[i]));
			} else {
				if (negate)
					VNEG(fpr.V(vregs[i]), fpr.V(origV[regnum]));
				else
					VMOV(fpr.V(vregs[i]), fpr.V(origV[regnum]));
			}
		} else {
			fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT);
			fpr.SpillLockV(vregs[i]);
			MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs<<2)], SCRATCHREG1, negate != 0);
		}
	}
}

void ArmJit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);

	GetVectorRegs(regs, sz, vectorReg);
	if (js.prefixD == 0)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		// Hopefully this is rare, we'll just write it into a reg we drop.
		if (js.VfpuWriteMask(i))
			regs[i] = fpr.GetTempV();
	}
}
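
// For reference (annotation), the two D-prefix saturation modes handled below,
// written as C. Unordered (NaN) compares set neither condition in the emitted
// code, so NaNs pass through unchanged:
//   sat == 1:  x = (x <= 0.0f) ? 0.0f : (x > 1.0f ? 1.0f : x);    // clamp to [0, 1]
//   sat == 3:  x = (x < -1.0f) ? -1.0f : (x > 1.0f ? 1.0f : x);   // clamp to [-1, 1]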
void ArmJit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
	if (!js.prefixD)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		if (js.VfpuWriteMask(i))
			continue;

		int sat = (js.prefixD >> (i * 2)) & 3;
		if (sat == 1) {
			// clamped = x < 0 ? 0 : (x > 1 ? 1 : x)  [0, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);

			MOVI2F(S0, 0.0f, SCRATCHREG1);
			MOVI2F(S1, 1.0f, SCRATCHREG1);
			VCMP(fpr.V(vregs[i]), S0);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_LS);
			VMOV(fpr.V(vregs[i]), S0);
			SetCC(CC_AL);
			VCMP(fpr.V(vregs[i]), S1);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_GT);
			VMOV(fpr.V(vregs[i]), S1);
			SetCC(CC_AL);
		} else if (sat == 3) {
			// clamped = x < -1 ? -1 : (x > 1 ? 1 : x)  [-1, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);

			MOVI2F(S0, -1.0f, SCRATCHREG1);
			MOVI2F(S1, 1.0f, SCRATCHREG1);
			VCMP(fpr.V(vregs[i]), S0);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_LO);
			VMOV(fpr.V(vregs[i]), S0);
			SetCC(CC_AL);
			VCMP(fpr.V(vregs[i]), S1);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_GT);
			VMOV(fpr.V(vregs[i]), S1);
			SetCC(CC_AL);
		}
	}
}
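
// Annotation: lv.s/sv.s rebuild the VFPU register index from two opcode fields,
// vt = ((op >> 16) & 0x1f) | ((op & 3) << 5). Addresses are masked with 0x3FFFFFFF
// to stay within the emulated address space; with fastmem the masked address is
// simply added to Memory::base, otherwise a checked address path sets the CC flags.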
void ArmJit::Comp_SV(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	CheckMemoryBreakpoint();

	s32 offset = (signed short)(op & 0xFFFC);
	int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
	MIPSGPReg rs = _RS;

	bool doCheck = false;
	switch (op >> 26)
	{
	case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
		{
			if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset < 0x400 && offset > -0x400) {
				gpr.MapRegAsPointer(rs);
				fpr.MapRegV(vt, MAP_NOINIT | MAP_DIRTY);
				VLDR(fpr.V(vt), gpr.RPtr(rs), offset);
				break;
			}

			// CC might be set by slow path below, so load regs first.
			fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
			if (gpr.IsImm(rs)) {
				u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
				gpr.SetRegImm(R0, addr + (u32)Memory::base);
			} else {
				gpr.MapReg(rs);
				if (g_Config.bFastMemory) {
					SetR0ToEffectiveAddress(rs, offset);
				} else {
					SetCCAndR0ForSafeAddress(rs, offset, SCRATCHREG2);
					doCheck = true;
				}
				ADD(R0, R0, MEMBASEREG);
			}
#ifdef __ARM_ARCH_7S__
			FixupBranch skip;
			if (doCheck) {
				skip = B_CC(CC_EQ);
			}
			VLDR(fpr.V(vt), R0, 0);
			if (doCheck) {
				SetJumpTarget(skip);
				SetCC(CC_AL);
			}
#else
			VLDR(fpr.V(vt), R0, 0);
			if (doCheck) {
				SetCC(CC_EQ);
				MOVI2F(fpr.V(vt), 0.0f, SCRATCHREG1);
				SetCC(CC_AL);
			}
#endif
		}
		break;

	case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
		{
			if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset < 0x400 && offset > -0x400) {
				gpr.MapRegAsPointer(rs);
				fpr.MapRegV(vt, 0);
				VSTR(fpr.V(vt), gpr.RPtr(rs), offset);
				break;
			}

			// CC might be set by slow path below, so load regs first.
			fpr.MapRegV(vt);
			if (gpr.IsImm(rs)) {
				u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
				gpr.SetRegImm(R0, addr + (u32)Memory::base);
			} else {
				gpr.MapReg(rs);
				if (g_Config.bFastMemory) {
					SetR0ToEffectiveAddress(rs, offset);
				} else {
					SetCCAndR0ForSafeAddress(rs, offset, SCRATCHREG2);
					doCheck = true;
				}
				ADD(R0, R0, MEMBASEREG);
			}
#ifdef __ARM_ARCH_7S__
			FixupBranch skip;
			if (doCheck) {
				skip = B_CC(CC_EQ);
			}
			VSTR(fpr.V(vt), R0, 0);
			if (doCheck) {
				SetJumpTarget(skip);
				SetCC(CC_AL);
			}
#else
			VSTR(fpr.V(vt), R0, 0);
			if (doCheck) {
				SetCC(CC_AL);
			}
#endif
		}
		break;


	default:
		DISABLE;
	}
}
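
// Annotation: lv.q/sv.q transfer four consecutive 32-bit words. When the four VFPU
// registers happen to map to consecutive ARM VFP S-registers, the loops below detect
// that and emit a single VLDMIA/VSTMIA instead of four separate VLDR/VSTR.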
void ArmJit::Comp_SVQ(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	CheckMemoryBreakpoint();

	int imm = (signed short)(op&0xFFFC);
	int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
	MIPSGPReg rs = _RS;

	bool doCheck = false;
	switch (op >> 26)
	{
	case 54: //lv.q
		{
			// CC might be set by slow path below, so load regs first.
			u8 vregs[4];
			GetVectorRegs(vregs, V_Quad, vt);
			fpr.MapRegsAndSpillLockV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);

			if (gpr.IsImm(rs)) {
				u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;
				gpr.SetRegImm(R0, addr + (u32)Memory::base);
			} else {
				gpr.MapReg(rs);
				if (g_Config.bFastMemory) {
					SetR0ToEffectiveAddress(rs, imm);
				} else {
					SetCCAndR0ForSafeAddress(rs, imm, SCRATCHREG2);
					doCheck = true;
				}
				ADD(R0, R0, MEMBASEREG);
			}

#ifdef __ARM_ARCH_7S__
			FixupBranch skip;
			if (doCheck) {
				skip = B_CC(CC_EQ);
			}

			bool consecutive = true;
			for (int i = 0; i < 3 && consecutive; i++)
				if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))
					consecutive = false;
			if (consecutive) {
				VLDMIA(R0, false, fpr.V(vregs[0]), 4);
			} else {
				for (int i = 0; i < 4; i++)
					VLDR(fpr.V(vregs[i]), R0, i * 4);
			}

			if (doCheck) {
				SetJumpTarget(skip);
				SetCC(CC_AL);
			}
#else
			bool consecutive = true;
			for (int i = 0; i < 3 && consecutive; i++)
				if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))
					consecutive = false;
			if (consecutive) {
				VLDMIA(R0, false, fpr.V(vregs[0]), 4);
			} else {
				for (int i = 0; i < 4; i++)
					VLDR(fpr.V(vregs[i]), R0, i * 4);
			}

			if (doCheck) {
				SetCC(CC_EQ);
				MOVI2R(SCRATCHREG1, 0);
				for (int i = 0; i < 4; i++)
					VMOV(fpr.V(vregs[i]), SCRATCHREG1);
				SetCC(CC_AL);
			}
#endif
		}
		break;

	case 62: //sv.q
		{
			// CC might be set by slow path below, so load regs first.
			u8 vregs[4];
			GetVectorRegs(vregs, V_Quad, vt);
			fpr.MapRegsAndSpillLockV(vregs, V_Quad, 0);

			if (gpr.IsImm(rs)) {
				u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;
				gpr.SetRegImm(R0, addr + (u32)Memory::base);
			} else {
				gpr.MapReg(rs);
				if (g_Config.bFastMemory) {
					SetR0ToEffectiveAddress(rs, imm);
				} else {
					SetCCAndR0ForSafeAddress(rs, imm, SCRATCHREG2);
					doCheck = true;
				}
				ADD(R0, R0, MEMBASEREG);
			}

#ifdef __ARM_ARCH_7S__
			FixupBranch skip;
			if (doCheck) {
				skip = B_CC(CC_EQ);
			}

			bool consecutive = true;
			for (int i = 0; i < 3 && consecutive; i++)
				if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))
					consecutive = false;
			if (consecutive) {
				VSTMIA(R0, false, fpr.V(vregs[0]), 4);
			} else {
				for (int i = 0; i < 4; i++)
					VSTR(fpr.V(vregs[i]), R0, i * 4);
			}

			if (doCheck) {
				SetJumpTarget(skip);
				SetCC(CC_AL);
			}
#else
			bool consecutive = true;
			for (int i = 0; i < 3 && consecutive; i++)
				if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))
					consecutive = false;
			if (consecutive) {
				VSTMIA(R0, false, fpr.V(vregs[0]), 4);
			} else {
				for (int i = 0; i < 4; i++)
					VSTR(fpr.V(vregs[i]), R0, i * 4);
			}

			if (doCheck) {
				SetCC(CC_AL);
			}
#endif
		}
		break;

	default:
		DISABLE;
		break;
	}
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VVectorInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	// WARNING: No prefix support!
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	switch ((op >> 16) & 0xF)
	{
	case 6: // v=zeros; break; //vzero
		MOVI2F(S0, 0.0f, SCRATCHREG1);
		break;
	case 7: // v=ones; break; //vone
		MOVI2F(S0, 1.0f, SCRATCHREG1);
		break;
	default:
		DISABLE;
		break;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	for (int i = 0; i < n; ++i)
		VMOV(fpr.V(dregs[i]), S0);

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VIdt(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	MOVI2F(S0, 0.0f, SCRATCHREG1);
	MOVI2F(S1, 1.0f, SCRATCHREG1);
	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
	switch (sz)
	{
	case V_Pair:
		VMOV(fpr.V(dregs[0]), (vd&1)==0 ? S1 : S0);
		VMOV(fpr.V(dregs[1]), (vd&1)==1 ? S1 : S0);
		break;
	case V_Quad:
		VMOV(fpr.V(dregs[0]), (vd&3)==0 ? S1 : S0);
		VMOV(fpr.V(dregs[1]), (vd&3)==1 ? S1 : S0);
		VMOV(fpr.V(dregs[2]), (vd&3)==2 ? S1 : S0);
		VMOV(fpr.V(dregs[3]), (vd&3)==3 ? S1 : S0);
		break;
	default:
		_dbg_assert_msg_(false, "Trying to interpret instruction that can't be interpreted");
		break;
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VMatrixInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		// Don't think matrix init ops care about prefixes.
		// DISABLE;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 dregs[16];
	GetMatrixRegs(dregs, sz, _VD);

	switch ((op >> 16) & 0xF) {
	case 3: // vmidt
		MOVI2F(S0, 0.0f, SCRATCHREG1);
		MOVI2F(S1, 1.0f, SCRATCHREG1);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				VMOV(fpr.V(dregs[a * 4 + b]), a == b ? S1 : S0);
			}
		}
		break;
	case 6: // vmzero
		MOVI2F(S0, 0.0f, SCRATCHREG1);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				VMOV(fpr.V(dregs[a * 4 + b]), S0);
			}
		}
		break;
	case 7: // vmone
		MOVI2F(S1, 1.0f, SCRATCHREG1);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				VMOV(fpr.V(dregs[a * 4 + b]), S1);
			}
		}
		break;
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VHdp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	fpr.MapRegsAndSpillLockV(tregs, sz, 0);
	VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));

	int n = GetNumVectorElements(sz);
	for (int i = 1; i < n; i++) {
		// sum += s[i]*t[i];
		if (i == n - 1) {
			VADD(S0, S0, fpr.V(tregs[i]));
		} else {
			VMLA(S0, fpr.V(sregs[i]), fpr.V(tregs[i]));
		}
	}
	fpr.ReleaseSpillLocksAndDiscardTemps();

	fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);

	VMOV(fpr.V(dregs[0]), S0);
	ApplyPrefixD(dregs, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
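
// Annotation: Comp_Vhoriz implements the horizontal ops vfad (sum of all lanes) and
// vavg (average of all lanes). For vavg the accumulated sum is multiplied by
// vavg_table[n - 1] = 1/n for an n-element vector.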
void ArmJit::Comp_Vhoriz(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);

	int n = GetNumVectorElements(sz);

	bool is_vavg = ((op >> 16) & 0x1f) == 7;
	if (is_vavg) {
		MOVI2F(S1, vavg_table[n - 1], R0);
	}
	// Have to start at +0.000 for the correct sign.
	MOVI2F(S0, 0.0f, SCRATCHREG1);
	for (int i = 0; i < n; i++) {
		// sum += s[i];
		VADD(S0, S0, fpr.V(sregs[i]));
	}

	fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);
	if (is_vavg) {
		VMUL(fpr.V(dregs[0]), S0, S1);
	} else {
		VMOV(fpr.V(dregs[0]), S0);
	}
	ApplyPrefixD(dregs, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VDot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	fpr.MapRegsAndSpillLockV(tregs, sz, 0);
	VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));

	int n = GetNumVectorElements(sz);
	for (int i = 1; i < n; i++) {
		// sum += s[i]*t[i];
		VMLA(S0, fpr.V(sregs[i]), fpr.V(tregs[i]));
	}
	fpr.ReleaseSpillLocksAndDiscardTemps();

	fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);

	VMOV(fpr.V(dregs[0]), S0);
	ApplyPrefixD(dregs, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}
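
// Annotation for vmin/vmax below: plain ARM VFP (without NEON) has no scalar
// VMIN/VMAX, and VCMP is unordered for NaN inputs. When the compare is unordered,
// the code falls back to comparing the raw IEEE 754 bit patterns as integers,
// reversing the comparison when both values are negative, which matches ordering
// floats by their bit patterns.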
void ArmJit::Comp_VecDo3(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; i++) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Map first, then work. This will allow us to use VLDMIA more often
	// (when we add the appropriate map function) and the instruction ordering
	// will improve.
	// Note that mapping like this (instead of first all sregs, then all tregs etc)
	// reduces the amount of continuous registers a lot :(
	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
		fpr.SpillLockV(tempregs[i]);
		fpr.SpillLockV(sregs[i]);
		fpr.SpillLockV(tregs[i]);
	}

	for (int i = 0; i < n; i++) {
		switch (op >> 26) {
		case 24: //VFPU0
			switch ((op >> 23)&7) {
			case 0: // d[i] = s[i] + t[i]; break; //vadd
				VADD(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			case 1: // d[i] = s[i] - t[i]; break; //vsub
				VSUB(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			case 7: // d[i] = s[i] / t[i]; break; //vdiv
				VDIV(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			default:
				DISABLE;
			}
			break;
		case 25: //VFPU1
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] * t[i]; break; //vmul
				VMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			default:
				DISABLE;
			}
			break;
		// Unfortunately there is no VMIN/VMAX on ARM without NEON.
		case 27: //VFPU3
			switch ((op >> 23) & 7) {
			case 2: // vmin
			{
				VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
				VMRS_APSR();
				FixupBranch skipNAN = B_CC(CC_VC);
				VMOV(SCRATCHREG1, fpr.V(sregs[i]));
				VMOV(SCRATCHREG2, fpr.V(tregs[i]));
				// If both are negative, we reverse the comparison. We want the highest mantissa then.
				// Also, between -NAN and -5.0, we want -NAN to be less.
				TST(SCRATCHREG1, SCRATCHREG2);
				FixupBranch cmpPositive = B_CC(CC_PL);
				CMP(SCRATCHREG2, SCRATCHREG1);
				FixupBranch skipPositive = B();
				SetJumpTarget(cmpPositive);
				CMP(SCRATCHREG1, SCRATCHREG2);
				SetJumpTarget(skipPositive);
				SetCC(CC_AL);
				SetJumpTarget(skipNAN);
				SetCC(CC_LT);
				VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
				SetCC(CC_GE);
				VMOV(fpr.V(tempregs[i]), fpr.V(tregs[i]));
				SetCC(CC_AL);
				break;
			}
			case 3: // vmax
			{
				VCMP(fpr.V(tregs[i]), fpr.V(sregs[i]));
				VMRS_APSR();
				FixupBranch skipNAN = B_CC(CC_VC);
				VMOV(SCRATCHREG1, fpr.V(sregs[i]));
				VMOV(SCRATCHREG2, fpr.V(tregs[i]));
				// If both are negative, we reverse the comparison. We want the lowest mantissa then.
				// Also, between -NAN and -5.0, we want -5.0 to be greater.
				TST(SCRATCHREG2, SCRATCHREG1);
				FixupBranch cmpPositive = B_CC(CC_PL);
				CMP(SCRATCHREG1, SCRATCHREG2);
				FixupBranch skipPositive = B();
				SetJumpTarget(cmpPositive);
				CMP(SCRATCHREG2, SCRATCHREG1);
				SetJumpTarget(skipPositive);
				SetCC(CC_AL);
				SetJumpTarget(skipNAN);
				SetCC(CC_LT);
				VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
				SetCC(CC_GE);
				VMOV(fpr.V(tempregs[i]), fpr.V(tregs[i]));
				SetCC(CC_AL);
				break;
			}
			case 6: // vsge
				DISABLE; // pending testing
				VCMP(fpr.V(tregs[i]), fpr.V(sregs[i]));
				VMRS_APSR();
				// Unordered is always 0.
				SetCC(CC_GE);
				MOVI2F(fpr.V(tempregs[i]), 1.0f, SCRATCHREG1);
				SetCC(CC_LT);
				MOVI2F(fpr.V(tempregs[i]), 0.0f, SCRATCHREG1);
				SetCC(CC_AL);
				break;
			case 7: // vslt
				DISABLE; // pending testing
				VCMP(fpr.V(tregs[i]), fpr.V(sregs[i]));
				VMRS_APSR();
				// Unordered is always 0.
				SetCC(CC_LO);
				MOVI2F(fpr.V(tempregs[i]), 1.0f, SCRATCHREG1);
				SetCC(CC_HS);
				MOVI2F(fpr.V(tempregs[i]), 0.0f, SCRATCHREG1);
				SetCC(CC_AL);
				break;
			}
			break;

		default:
			DISABLE;
		}
	}

	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}
	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}
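
// Annotation: the vasin case below evaluates an Abramowitz & Stegun style polynomial
// approximation, asin(x) ~= pi/2 - sqrt(1 - |x|) * (a0 + a1*|x| + a2*|x|^2 + a3*|x|^3),
// negates the result for negative inputs, and finally multiplies by 2/pi because the
// PSP's vasin returns results in units of pi/2.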
void ArmJit::Comp_VV2Op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
	if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
		return;
	}

	// Catch the disabled operations immediately so we don't map registers unnecessarily later.
	// Move these down to the big switch below as they are implemented.
	switch ((op >> 16) & 0x1f) {
	case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
		DISABLE;
		break;
	case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
		DISABLE;
		break;
	case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
		DISABLE;
		break;
	case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
		DISABLE;
		break;
	case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
		DISABLE;
		break;
	case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
		DISABLE;
		break;
	default:
		;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Get some extra temps, used by vasin only.
	ARMReg t2 = INVALID_REG, t3 = INVALID_REG, t4 = INVALID_REG;
	if (((op >> 16) & 0x1f) == 23) {
		// Only get here on vasin.
		int t[3] = { fpr.GetTempV(), fpr.GetTempV(), fpr.GetTempV() };
		fpr.MapRegV(t[0], MAP_NOINIT);
		fpr.MapRegV(t[1], MAP_NOINIT);
		fpr.MapRegV(t[2], MAP_NOINIT);
		t2 = fpr.V(t[0]);
		t3 = fpr.V(t[1]);
		t4 = fpr.V(t[2]);
	}

	// Pre-map the registers to get better instruction ordering.
	// Note that mapping like this (instead of first all sregs, then all tempregs etc)
	// reduces the amount of continuous registers a lot :(
	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		fpr.SpillLockV(tempregs[i]);
		fpr.SpillLockV(sregs[i]);
	}

	// Warning: sregs[i] and tempregs[i] may be the same reg.
	// Helps for vmov, hurts for vrcp, etc.
	for (int i = 0; i < n; i++) {
		switch ((op >> 16) & 0x1f) {
		case 0: // d[i] = s[i]; break; //vmov
			// Probably for swizzle.
			VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			break;
		case 1: // d[i] = fabsf(s[i]); break; //vabs
			VABS(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			break;
		case 2: // d[i] = -s[i]; break; //vneg
			VNEG(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			break;
		case 4: // if (s[i] < 0) d[i] = 0; else { if (s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i]; } break; // vsat0
			if (i == 0) {
				MOVI2F(S0, 0.0f, SCRATCHREG1);
				MOVI2F(S1, 1.0f, SCRATCHREG1);
			}
			VCMP(fpr.V(sregs[i]), S0);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			SetCC(CC_LS);
			VMOV(fpr.V(tempregs[i]), S0);
			SetCC(CC_AL);
			VCMP(fpr.V(sregs[i]), S1);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_GT);
			VMOV(fpr.V(tempregs[i]), S1);
			SetCC(CC_AL);
			break;
		case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else { if (s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i]; } break; // vsat1
			if (i == 0) {
				MOVI2F(S0, -1.0f, SCRATCHREG1);
				MOVI2F(S1, 1.0f, SCRATCHREG1);
			}
			VCMP(fpr.V(sregs[i]), S0);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			SetCC(CC_LO);
			VMOV(fpr.V(tempregs[i]), S0);
			SetCC(CC_AL);
			VCMP(fpr.V(sregs[i]), S1);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_GT);
			VMOV(fpr.V(tempregs[i]), S1);
			SetCC(CC_AL);
			break;
		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
			if (i == 0) {
				MOVI2F(S0, 1.0f, SCRATCHREG1);
			}
			VDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
			break;
		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
			if (i == 0) {
				MOVI2F(S0, 1.0f, SCRATCHREG1);
			}
			VSQRT(S1, fpr.V(sregs[i]));
			VDIV(fpr.V(tempregs[i]), S0, S1);
			break;
		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
			VSQRT(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			VABS(fpr.V(tempregs[i]), fpr.V(tempregs[i]));
			break;
		case 23: // d[i] = asinf(s[i] * (float)M_2_PI); break; //vasin
			// Seems to work well enough but can disable if it becomes a problem.
			// Should be easy enough to translate to NEON. There we can load all the constants
			// in one go of course.
			VCMP(fpr.V(sregs[i]));       // flags = sign(sregs[i])
			VMRS_APSR();
			MOVI2F(S0, 1.0f, SCRATCHREG1);
			VABS(t4, fpr.V(sregs[i]));   // t4 = |sregs[i]|
			VSUB(t3, S0, t4);
			VSQRT(t3, t3);               // t3 = sqrt(1 - |sregs[i]|)
			MOVI2F(S1, -0.0187293f, SCRATCHREG1);
			MOVI2F(t2, 0.0742610f, SCRATCHREG1);
			VMLA(t2, t4, S1);
			MOVI2F(S1, -0.2121144f, SCRATCHREG1);
			VMLA(S1, t4, t2);
			MOVI2F(t2, 1.5707288f, SCRATCHREG1);
			VMLA(t2, t4, S1);
			MOVI2F(fpr.V(tempregs[i]), M_PI / 2, SCRATCHREG1);
			VMLS(fpr.V(tempregs[i]), t2, t3);  // tr[i] = M_PI / 2 - t2 * t3
			{
				FixupBranch br = B_CC(CC_GE);
				VNEG(fpr.V(tempregs[i]), fpr.V(tempregs[i]));
				SetJumpTarget(br);
			}
			// Correction factor for PSP range. Could be baked into the calculation above?
			MOVI2F(S1, 1.0f / (M_PI / 2), SCRATCHREG1);
			VMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
			break;
		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
			if (i == 0) {
				MOVI2F(S0, -1.0f, SCRATCHREG1);
			}
			VDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
			break;
		default:
			ERROR_LOG(Log::JIT, "case missing in vfpu vv2op");
			DISABLE;
			break;
		}
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}
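
// Annotation: vi2f converts signed integers to floats with an optional fixed-point
// scale, d[i] = (float)s[i] * (1.0f / (1 << imm)); imm is effectively the number of
// fractional bits in the source value.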
void ArmJit::Comp_Vi2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	const float mult = 1.0f / (float)(1UL << imm);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	if (mult != 1.0f)
		MOVI2F(S0, mult, SCRATCHREG1);

	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		VCVT(fpr.V(tempregs[i]), fpr.V(sregs[i]), TO_FLOAT | IS_SIGNED);
		if (mult != 1.0f)
			VMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_Vh2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	// This multi-VCVT.F32.F16 is only available in the VFPv4 extension.
	// The VFPv3 one is VCVTB, VCVTT which we don't yet have support for.
	if (!(cpu_info.bHalf && cpu_info.bVFPv4)) {
		// No hardware support for half-to-float, fall back to the interpreter.
		// TODO: Translate the fast SSE solution to standard integer/VFP stuff
		// for the weaker CPUs.
		DISABLE;
	}

	u8 sregs[4], dregs[4];
	VectorSize sz = GetVecSize(op);
	VectorSize outSz;

	switch (sz) {
	case V_Single:
		outSz = V_Pair;
		break;
	case V_Pair:
		outSz = V_Quad;
		break;
	default:
		DISABLE;
	}

	int n = GetNumVectorElements(sz);
	int nOut = n * 2;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outSz, _VD);

	static const ARMReg tmp[4] = { S0, S1, S2, S3 };

	for (int i = 0; i < n; i++) {
		fpr.MapRegV(sregs[i], sz);
		VMOV(tmp[i], fpr.V(sregs[i]));
	}

	// This always converts four 16-bit floats in D0 to four 32-bit floats
	// in Q0. If we are dealing with a pair here, we just ignore the upper two outputs.
	// There are also a couple of other instructions that do it one at a time, but it
	// doesn't seem worth the trouble.
	VCVTF32F16(Q0, D0);

	for (int i = 0; i < nOut; i++) {
		fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
		VMOV(fpr.V(dregs[i]), tmp[i]);
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_Vf2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix()) {
		DISABLE;
	}
	DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	float mult = (float)(1ULL << imm);

	switch ((op >> 21) & 0x1f)
	{
	case 17:
		break; //z - truncate. Easy to support.
	case 16:
	case 18:
	case 19:
		DISABLE;
		break;
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	if (mult != 1.0f)
		MOVI2F(S1, mult, SCRATCHREG1);

	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		switch ((op >> 21) & 0x1f) {
		case 16: /* TODO */ break; //n
		case 17:
			if (mult != 1.0f) {
				VMUL(S0, fpr.V(sregs[i]), S1);
				VCVT(fpr.V(tempregs[i]), S0, TO_INT | ROUND_TO_ZERO);
			} else {
				VCVT(fpr.V(tempregs[i]), fpr.V(sregs[i]), TO_INT | ROUND_TO_ZERO);
			}
			break;
		case 18: /* TODO */ break; //u
		case 19: /* TODO */ break; //d
		}
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}
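
// Annotation: in the mfv/mtv encodings, imm values below 128 address the 128 VFPU
// data registers, while 128 .. 128 + VFPU_CTRL_MAX address the control registers.
// Writes to the S/T/D prefix control registers must also invalidate the JIT's
// cached prefix state, which is what the PREFIX_UNKNOWN assignments below do.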
void ArmJit::Comp_Mftv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	int imm = op & 0xFF;
	MIPSGPReg rt = _RT;
	switch ((op >> 21) & 0x1f) {
	case 3: //mfv / mfvc
		// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
		if (rt != 0) {
			if (imm < 128) { //R(rt) = VI(imm);
				fpr.MapRegV(imm, 0);
				gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
				VMOV(gpr.R(rt), fpr.V(imm));
			} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
				if (imm - 128 == VFPU_CTRL_CC) {
					if (gpr.IsImm(MIPS_REG_VFPUCC)) {
						gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
					} else {
						gpr.MapDirtyIn(rt, MIPS_REG_VFPUCC);
						MOV(gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
					}
				} else {
					// In case we have a saved prefix.
					FlushPrefixV();
					gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
					LDR(gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
				}
			} else {
				//ERROR - maybe need to make this value too an "interlock" value?
				ERROR_LOG(Log::CPU, "mfv - invalid register %i", imm);
			}
		}
		break;

	case 7: // mtv
		if (imm < 128) {
			gpr.MapReg(rt);
			fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
			VMOV(fpr.V(imm), gpr.R(rt));
		} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
			if (imm - 128 == VFPU_CTRL_CC) {
				if (gpr.IsImm(rt)) {
					gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
				} else {
					gpr.MapDirtyIn(MIPS_REG_VFPUCC, rt);
					MOV(gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
				}
			} else {
				gpr.MapReg(rt);
				STR(gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
			}

			// TODO: Optimization if rt is Imm?
			// Set these BEFORE disable!
			if (imm - 128 == VFPU_CTRL_SPREFIX) {
				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			}
		} else {
			//ERROR
			_dbg_assert_msg_(false, "mtv - invalid register");
		}
		break;

	default:
		DISABLE;
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_Vmfvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	int vd = _VD;
	int imm = (op >> 8) & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vd);
		if (imm == VFPU_CTRL_CC) {
			gpr.MapReg(MIPS_REG_VFPUCC, 0);
			VMOV(fpr.V(vd), gpr.R(MIPS_REG_VFPUCC));
		} else {
			ADDI2R(SCRATCHREG1, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCHREG2);
			VLDR(fpr.V(vd), SCRATCHREG1, 0);
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	} else {
		fpr.MapRegV(vd);
		MOVI2F(fpr.V(vd), 0.0f, SCRATCHREG1);
	}
}

void ArmJit::Comp_Vmtvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	int vs = _VS;
	int imm = op & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vs);
		if (imm == VFPU_CTRL_CC) {
			gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY | MAP_NOINIT);
			VMOV(gpr.R(MIPS_REG_VFPUCC), fpr.V(vs));
		} else {
			ADDI2R(SCRATCHREG1, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCHREG2);
			VSTR(fpr.V(vs), SCRATCHREG1, 0);
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();

		if (imm == VFPU_CTRL_SPREFIX) {
			js.prefixSFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_TPREFIX) {
			js.prefixTFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_DPREFIX) {
			js.prefixDFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		}
	}
}

void ArmJit::Comp_Vmmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);

	// This probably ignores prefixes for all sane intents and purposes.
	if (_VS == _VD) {
		// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
		return;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, _VS);
	GetMatrixRegs(dregs, sz, _VD);

	// Rough overlap check.
	bool overlap = false;
	if (GetMtx(_VS) == GetMtx(_VD)) {
		// Potential overlap (guaranteed for 3x3 or more).
		overlap = true;
	}

	if (overlap) {
		// Not so common, fallback.
		DISABLE;
	} else {
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapDirtyInV(dregs[a * 4 + b], sregs[a * 4 + b]);
				VMOV(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[a * 4 + b]));
			}
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	}
}

void ArmJit::Comp_VScl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4], treg;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	// TODO: Prefixes seem strange...
	GetVectorRegsPrefixT(&treg, V_Single, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Move to S0 early, so we don't have to worry about overlap with scale.
	fpr.LoadToRegV(S0, treg);

	// For prefixes to work, we just have to ensure that none of the output registers spill
	// and that there's no overlap.
	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			// Need to use temp regs
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// The meat of the function!
	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		VMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), S0);
	}

	for (int i = 0; i < n; i++) {
		// All must be mapped for prefixes to work.
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}
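
// Annotation: in the vmmul loop below, each output element dregs[a * 4 + b] is the
// dot product of row b of the S matrix with row a of the T matrix, built from one
// VMUL plus n - 1 VMLA accumulations in S0.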
void ArmJit::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, _VS);
	GetMatrixRegs(tregs, sz, _VT);
	GetMatrixRegs(dregs, sz, _VD);

	// Rough overlap check.
	bool overlap = false;
	if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) {
		// Potential overlap (guaranteed for 3x3 or more).
		overlap = true;
	}

	if (overlap) {
		DISABLE;
	} else {
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapInInV(sregs[b * 4], tregs[a * 4]);
				VMUL(S0, fpr.V(sregs[b * 4]), fpr.V(tregs[a * 4]));
				for (int c = 1; c < n; c++) {
					fpr.MapInInV(sregs[b * 4 + c], tregs[a * 4 + c]);
					VMLA(S0, fpr.V(sregs[b * 4 + c]), fpr.V(tregs[a * 4 + c]));
				}
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				VMOV(fpr.V(dregs[a * 4 + b]), S0);
			}
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	}
}

void ArmJit::Comp_Vmscl(MIPSOpcode op) {
	DISABLE;
}

void ArmJit::Comp_Vtfm(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	// TODO: This probably ignores prefixes? Or maybe uses D?

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz) + 1);
		msz = (MatrixSize)((int)(msz) + 1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}

	u8 sregs[16], dregs[4], tregs[4];
	GetMatrixRegs(sregs, msz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// TODO: test overlap, optimize.
	int tempregs[4];
	for (int i = 0; i < n; i++) {
		fpr.MapInInV(sregs[i * 4], tregs[0]);
		VMUL(S0, fpr.V(sregs[i * 4]), fpr.V(tregs[0]));
		for (int k = 1; k < n; k++) {
			if (!homogenous || k != n - 1) {
				fpr.MapInInV(sregs[i * 4 + k], tregs[k]);
				VMLA(S0, fpr.V(sregs[i * 4 + k]), fpr.V(tregs[k]));
			} else {
				fpr.MapRegV(sregs[i * 4 + k]);
				VADD(S0, S0, fpr.V(sregs[i * 4 + k]));
			}
		}

		int temp = fpr.GetTempV();
		fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
		fpr.SpillLockV(temp);
		VMOV(fpr.V(temp), S0);
		tempregs[i] = temp;
	}
	for (int i = 0; i < n; i++) {
		u8 temp = tempregs[i];
		fpr.MapRegV(dregs[i], MAP_NOINIT | MAP_DIRTY);
		VMOV(fpr.V(dregs[i]), fpr.V(temp));
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VCrs(MIPSOpcode op) {
	DISABLE;
}

void ArmJit::Comp_VDet(MIPSOpcode op) {
	DISABLE;
}

void ArmJit::Comp_Vi2x(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

	if (unsignedOp) {
		// Requires a tricky clamp operation that we can't do without more temps, see below
		DISABLE;
	}

	// These instructions pack pairs or quads of integers into 32 bits.
	// The unsigned (u) versions skip the sign bit when packing.
	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	// First, let's assemble the sregs into lanes of either D0 (pair) or Q0 (quad).
	bool quad = sz == V_Quad;
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	VMOV(S0, fpr.V(sregs[0]));
	VMOV(S1, fpr.V(sregs[1]));
	if (quad) {
		VMOV(S2, fpr.V(sregs[2]));
		VMOV(S3, fpr.V(sregs[3]));
	}

	// TODO: For "u" type ops, we clamp to zero and shift off the sign bit first.
	// Need some temp regs to do that efficiently, right?

	// At this point, we simply need to collect the high bits of each 32-bit lane into one register.
	if (bits == 8) {
		// Really want to do a VSHRN(..., 24) but that can't be encoded. So we synthesize it.
		VSHR(I_32, Q0, Q0, 16);
		VSHRN(I_32, D0, Q0, 8);
		VMOVN(I_16, D0, Q0);
	} else {
		VSHRN(I_32, D0, Q0, 16);
	}

	fpr.MapRegsAndSpillLockV(dregs, outsize, MAP_DIRTY|MAP_NOINIT);
	VMOV(fpr.V(dregs[0]), S0);
	if (outsize == V_Pair) {
		VMOV(fpr.V(dregs[1]), S1);
	}

	ApplyPrefixD(dregs, outsize);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_Vx2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)

	if (bits == 8 && unsignedOp) {
		// vuc2i is odd and needs temp registers for implementation.
		DISABLE;
	}
	// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
	// at the top. vus2i shifts it an extra bit right afterward.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
	// at the top too. vuc2i is a bit special (see below.)
	// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
	// then use it for both.

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	if (sz == V_Single) {
		VMOV(S0, fpr.V(sregs[0]));
	} else if (sz == V_Pair) {
		VMOV(S0, fpr.V(sregs[0]));
		VMOV(S1, fpr.V(sregs[1]));
	} else if (bits == 8) {
		// For some reason, sz is quad on vc2i.
		VMOV(S0, fpr.V(sregs[0]));
	}

	if (bits == 16) {
		// Simply expand, to upper bits.
		VSHLL(I_16, Q0, D0, 16);
	} else if (bits == 8) {
		if (unsignedOp) {
			// vuc2i is a bit special. It spreads out the bits like this:
			// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
			// TODO
		} else {
			VSHLL(I_8, Q0, D0, 8);
			VSHLL(I_16, Q0, D0, 16);
		}
	}

	// At this point we have the regs in the 4 lanes.
	// In the "u" mode, we need to shift it out of the sign bit.
	if (unsignedOp) {
		ArmGen::ARMReg reg = (outsize == V_Quad) ? Q0 : D0;
		VSHR(I_32 | I_UNSIGNED, reg, reg, 1);
	}

	fpr.MapRegsAndSpillLockV(dregs, outsize, MAP_NOINIT);

	VMOV(fpr.V(dregs[0]), S0);
	VMOV(fpr.V(dregs[1]), S1);
	if (outsize == V_Quad) {
		VMOV(fpr.V(dregs[2]), S2);
		VMOV(fpr.V(dregs[3]), S3);
	}

	ApplyPrefixD(dregs, outsize);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

void ArmJit::Comp_VCrossQuat(MIPSOpcode op) {
	// This op does not support prefixes anyway.
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegs(sregs, sz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// Map everything into registers.
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	fpr.MapRegsAndSpillLockV(tregs, sz, 0);

	if (sz == V_Triple) {
		MIPSReg temp3 = fpr.GetTempV();
		fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
		// Cross product vcrsp.t

		// Compute X
		VMUL(S0, fpr.V(sregs[1]), fpr.V(tregs[2]));
		VMLS(S0, fpr.V(sregs[2]), fpr.V(tregs[1]));

		// Compute Y
		VMUL(S1, fpr.V(sregs[2]), fpr.V(tregs[0]));
		VMLS(S1, fpr.V(sregs[0]), fpr.V(tregs[2]));

		// Compute Z
		VMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
		VMLS(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]));

		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
		VMOV(fpr.V(dregs[0]), S0);
		VMOV(fpr.V(dregs[1]), S1);
		VMOV(fpr.V(dregs[2]), fpr.V(temp3));
	} else if (sz == V_Quad) {
		MIPSReg temp3 = fpr.GetTempV();
		MIPSReg temp4 = fpr.GetTempV();
		fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
		fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);

		// Quaternion product vqmul.q untested
		// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];
		VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[3]));
		VMLA(S0, fpr.V(sregs[1]), fpr.V(tregs[2]));
		VMLS(S0, fpr.V(sregs[2]), fpr.V(tregs[1]));
		VMLA(S0, fpr.V(sregs[3]), fpr.V(tregs[0]));

		//d[1] = -s[0] * t[2] + s[1] * t[3] + s[2] * t[0] + s[3] * t[1];
		VNMUL(S1, fpr.V(sregs[0]), fpr.V(tregs[2]));
		VMLA(S1, fpr.V(sregs[1]), fpr.V(tregs[3]));
		VMLA(S1, fpr.V(sregs[2]), fpr.V(tregs[0]));
		VMLA(S1, fpr.V(sregs[3]), fpr.V(tregs[1]));

		//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];
		VMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
		VMLS(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]));
		VMLA(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[3]));
		VMLA(fpr.V(temp3), fpr.V(sregs[3]), fpr.V(tregs[2]));

		//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];
		VNMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[0]));
		VMLS(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[1]));
		VMLS(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[2]));
		VMLA(fpr.V(temp4), fpr.V(sregs[3]), fpr.V(tregs[3]));

		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
		VMOV(fpr.V(dregs[0]), S0);
		VMOV(fpr.V(dregs[1]), S1);
		VMOV(fpr.V(dregs[2]), fpr.V(temp3));
		VMOV(fpr.V(dregs[3]), fpr.V(temp4));
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}
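
// Annotation: the VFPU condition code register packs one compare result per lane in
// bits 0-3, plus two aggregate bits: bit 4 ("any lane true") and bit 5 ("all lanes
// true"). Comp_Vcmp below accumulates the lane bits in SCRATCHREG1, derives the
// aggregates, then merges everything into MIPS_REG_VFPUCC under the affected_bits mask.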
void ArmJit::Comp_Vcmp(MIPSOpcode op) {
1810
CONDITIONAL_DISABLE(VFPU_COMP);
1811
if (js.HasUnknownPrefix())
1812
DISABLE;
1813
1814
VectorSize sz = GetVecSize(op);
1815
int n = GetNumVectorElements(sz);
1816
1817
VCondition cond = (VCondition)(op & 0xF);
1818
1819
u8 sregs[4], tregs[4];
1820
GetVectorRegsPrefixS(sregs, sz, _VS);
1821
GetVectorRegsPrefixT(tregs, sz, _VT);
1822
1823
// Some, we just fall back to the interpreter.
1824
// ES is just really equivalent to (value & 0x7F800000) == 0x7F800000.
1825
1826
switch (cond) {
1827
case VC_EI: // c = my_isinf(s[i]); break;
1828
case VC_NI: // c = !my_isinf(s[i]); break;
1829
DISABLE;
1830
case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection
1831
case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
1832
case VC_EN: // c = my_isnan(s[i]); break;
1833
case VC_NN: // c = !my_isnan(s[i]); break;
1834
if (_VS != _VT)
1835
DISABLE;
1836
break;
1837
1838
case VC_EZ:
1839
case VC_NZ:
1840
break;
1841
default:
1842
;
1843
}
1844
1845
	// First, let's get the trivial ones.
	int affected_bits = (1 << 4) | (1 << 5);  // 4 and 5

	MOVI2R(SCRATCHREG1, 0);
	for (int i = 0; i < n; ++i) {
		// Let's only handle the easy ones, and fall back on the interpreter for the rest.
		CCFlags flag = CC_AL;
		switch (cond) {
		case VC_FL: // c = 0;
			break;

		case VC_TR: // c = 1
			if (i == 0) {
				if (n == 1) {
					MOVI2R(SCRATCHREG1, 0x31);
				} else {
					MOVI2R(SCRATCHREG1, 1 << i);
				}
			} else {
				ORR(SCRATCHREG1, SCRATCHREG1, 1 << i);
			}
			break;

		case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;   // Tekken Dark Resurrection
		case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;
			// For these, we use the integer ALU as there is no support on ARM for testing for INF.
			// Testing for nan or inf is the same as testing (bits & 0x7F800000) == 0x7F800000.
			// We need an extra temporary register, so we store away SCRATCHREG1.
			STR(SCRATCHREG1, CTXREG, offsetof(MIPSState, temp));
			fpr.MapRegV(sregs[i], 0);
			MOVI2R(SCRATCHREG1, 0x7F800000);
			VMOV(SCRATCHREG2, fpr.V(sregs[i]));
			AND(SCRATCHREG2, SCRATCHREG2, SCRATCHREG1);
			CMP(SCRATCHREG2, SCRATCHREG1);  // (SCRATCHREG2 & 0x7F800000) == 0x7F800000
			flag = cond == VC_ES ? CC_EQ : CC_NEQ;
			LDR(SCRATCHREG1, CTXREG, offsetof(MIPSState, temp));
			break;

		case VC_EN: // c = my_isnan(s[i]); break;   // Tekken 6
			// Should we involve T? Where I found this used, it compared a register with itself, so it should be fine.
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_VS;  // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
			break;

		case VC_NN: // c = !my_isnan(s[i]); break;
			// Should we involve T? Where I found this used, it compared a register with itself, so it should be fine.
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_VC;  // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
			break;

		case VC_EQ: // c = s[i] == t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_EQ;
			break;

		case VC_LT: // c = s[i] < t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_LO;
			break;

		case VC_LE: // c = s[i] <= t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_LS;
			break;

		case VC_NE: // c = s[i] != t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_NEQ;
			break;

		case VC_GE: // c = s[i] >= t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_GE;
			break;

		case VC_GT: // c = s[i] > t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_GT;
			break;

		case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f
			fpr.MapRegV(sregs[i]);
			VCMP(fpr.V(sregs[i]));  // vcmp(sregs[i], #0.0)
			VMRS_APSR();
			flag = CC_EQ;
			break;

		case VC_NZ: // c = s[i] != 0
			fpr.MapRegV(sregs[i]);
			VCMP(fpr.V(sregs[i]));  // vcmp(sregs[i], #0.0)
			VMRS_APSR();
			flag = CC_NEQ;
			break;

		default:
			DISABLE;
		}
		if (flag != CC_AL) {
			SetCC(flag);
			if (i == 0) {
				if (n == 1) {
					MOVI2R(SCRATCHREG1, 0x31);
				} else {
					MOVI2R(SCRATCHREG1, 1);  // 1 << i, but i == 0
				}
			} else {
				ORR(SCRATCHREG1, SCRATCHREG1, 1 << i);
			}
			SetCC(CC_AL);
		}

		affected_bits |= 1 << i;
	}

	// Aggregate the bits. Urgh, expensive. Can optimize for the case of one
	// comparison, which is the most common after all.
	if (n > 1) {
		CMP(SCRATCHREG1, affected_bits & 0xF);
		SetCC(CC_EQ);
		ORR(SCRATCHREG1, SCRATCHREG1, 1 << 5);
		SetCC(CC_AL);

		CMP(SCRATCHREG1, 0);
		SetCC(CC_NEQ);
		ORR(SCRATCHREG1, SCRATCHREG1, 1 << 4);
		SetCC(CC_AL);
	}

	gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);
	BIC(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), affected_bits);
	ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCHREG1);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

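// For reference, a hedged sketch of the interpreter-style CC update the block above
// emits (the `lane` array and `vfpuCC` name are illustrative, not actual variables):
//
//   u32 mask = affected_bits & 0xF;         // lane bits 0..n-1
//   u32 cc = 0;
//   for (int i = 0; i < n; ++i)
//     cc |= (lane[i] ? 1 : 0) << i;         // per-lane compare results
//   if ((cc & mask) == mask) cc |= 1 << 5;  // bit 5: all lanes true
//   if (cc & mask)           cc |= 1 << 4;  // bit 4: any lane true
//   vfpuCC = (vfpuCC & ~affected_bits) | cc;
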
void ArmJit::Comp_Vcmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);
	int tf = (op >> 19) & 1;
	int imm3 = (op >> 16) & 7;

	for (int i = 0; i < n; ++i) {
		// Simplification: Disable if overlap unsafe
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			DISABLE;
		}
	}

	if (imm3 < 6) {
		// Test one bit of CC. This bit decides whether none or all subregisters are copied.
		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
		fpr.MapRegsAndSpillLockV(sregs, sz, 0);
		gpr.MapReg(MIPS_REG_VFPUCC);
		TST(gpr.R(MIPS_REG_VFPUCC), 1 << imm3);
		SetCC(tf ? CC_EQ : CC_NEQ);
		for (int i = 0; i < n; i++) {
			VMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
		}
		SetCC(CC_AL);
	} else {
		// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
		fpr.MapRegsAndSpillLockV(sregs, sz, 0);
		gpr.MapReg(MIPS_REG_VFPUCC);
		for (int i = 0; i < n; i++) {
			TST(gpr.R(MIPS_REG_VFPUCC), 1 << i);
			SetCC(tf ? CC_EQ : CC_NEQ);
			VMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
			SetCC(CC_AL);
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

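// A hedged note on the conditional moves above: vcmovt (tf == 0) copies a lane when
// the tested CC bit is set, vcmovf (tf == 1) when it is clear. TST sets Z when the
// tested bit is 0, so the VMOVs execute under CC_NEQ for vcmovt and CC_EQ for vcmovf.
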
void ArmJit::Comp_Viim(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	u8 dreg;
	GetVectorRegs(&dreg, V_Single, _VT);

	s32 imm = SignExtend16ToS32(op);
	fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
	MOVI2F(fpr.V(dreg), (float)imm, SCRATCHREG1);

	ApplyPrefixD(&dreg, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

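// Hedged usage sketch (assembler syntax approximate): "viim.s S000, -3" sign-extends
// the 16-bit immediate and writes -3.0f to S000, which is exactly the MOVI2F above.
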
void ArmJit::Comp_Vfim(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	u8 dreg;
	GetVectorRegs(&dreg, V_Single, _VT);

	FP16 half;
	half.u = op & 0xFFFF;
	FP32 fval = half_to_float_fast5(half);
	fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
	MOVI2F(fpr.V(dreg), fval.f, SCRATCHREG1);

	ApplyPrefixD(&dreg, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

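// For reference, the immediate decoded above is an IEEE 754 half float
// (1 sign, 5 exponent, 10 mantissa bits, bias 15); a hedged example:
// 0x3C00 decodes to 1.0f and 0xC000 to -2.0f.
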
void ArmJit::Comp_Vcst(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int conNum = (op >> 16) & 0x1f;
	int vd = _VD;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	gpr.SetRegImm(SCRATCHREG1, (u32)(void *)&cst_constants[conNum]);
	VLDR(S0, SCRATCHREG1, 0);
	for (int i = 0; i < n; ++i)
		VMOV(fpr.V(dregs[i]), S0);

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

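// Hedged usage sketch: vcst broadcasts one entry of the cst_constants table to every
// lane of the destination, so a quad-sized vcst selecting e.g. the pi constant
// (assembler syntax approximate: "vcst.q C000, VFPU_PI") fills all four lanes with
// the single value loaded by the VLDR above.
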
static double SinCos(float angle) {
	union { struct { float sin; float cos; }; double out; } sincos;
	vfpu_sincos(angle, sincos.sin, sincos.cos);
	return sincos.out;
}

static double SinCosNegSin(float angle) {
	union { struct { float sin; float cos; }; double out; } sincos;
	vfpu_sincos(angle, sincos.sin, sincos.cos);
	sincos.sin = -sincos.sin;
	return sincos.out;
}

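// The unions above pack (sin, cos) into a single double return value: on softfp the
// pair comes back in R0/R1, on hardfp in D0. Either way, once moved into D0 in
// Comp_VRot below, S0 holds sin and S1 holds cos for CompVrotShuffle to pick from.
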
void ArmJit::CompVrotShuffle(u8 *dregs, int imm, VectorSize sz, bool negSin) {
	int n = GetNumVectorElements(sz);
	char what[4] = {'0', '0', '0', '0'};
	if (((imm >> 2) & 3) == (imm & 3)) {
		for (int i = 0; i < 4; i++)
			what[i] = 'S';
	}
	what[(imm >> 2) & 3] = 'S';
	what[imm & 3] = 'C';

	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY | MAP_NOINIT);
	for (int i = 0; i < n; i++) {
		switch (what[i]) {
		case 'C': VMOV(fpr.V(dregs[i]), S1); break;
		case 'S': if (negSin) VNEG(fpr.V(dregs[i]), S0); else VMOV(fpr.V(dregs[i]), S0); break;
		case '0':
			{
				MOVI2F(fpr.V(dregs[i]), 0.0f, SCRATCHREG1);
				break;
			}
		default:
			ERROR_LOG(Log::JIT, "Bad what in vrot");
			break;
		}
	}
}

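// Hedged worked example of the imm decoding above: imm = 0b0001 selects lane 1 for
// cos and lane 0 for sin, producing {S, C, 0, 0}. When (imm & 3) == ((imm >> 2) & 3),
// e.g. imm = 0b0101, every lane is first filled with 'S' and the shared lane is then
// overwritten with 'C', producing {S, C, S, S}.
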
// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
// calling the math library.
// Apparently this may not work on hardfp. I don't think we have any platforms using this though.
void ArmJit::Comp_VRot(MIPSOpcode op) {
	// VRot probably doesn't accept prefixes anyway.
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

#if PPSSPP_ARCH(ARM_HARDFP)
	DISABLE;
#endif

	int vd = _VD;
	int vs = _VS;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	u8 dregs2[4];

	MIPSOpcode nextOp = GetOffsetInstruction(1);
	int vd2 = -1;
	int imm2 = -1;
	if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
		// Pair of vrot. Let's join them.
		vd2 = MIPS_GET_VD(nextOp);
		imm2 = (nextOp >> 16) & 0x1f;
		// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
	}
	u8 sreg;
	GetVectorRegs(dregs, sz, vd);
	if (vd2 >= 0)
		GetVectorRegs(dregs2, sz, vd2);
	GetVectorRegs(&sreg, V_Single, vs);

	int imm = (op >> 16) & 0x1f;

	gpr.FlushBeforeCall();
	fpr.FlushAll();

	bool negSin1 = (imm & 0x10) ? true : false;

	fpr.MapRegV(sreg);
	// We should write a custom pure-asm function instead.
#if defined(__ARM_PCS_VFP) // Hardfp
	VMOV(S0, fpr.V(sreg));
#else // Softfp
	VMOV(R0, fpr.V(sreg));
#endif
	// FlushBeforeCall saves R1.
	QuickCallFunction(R1, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);
#if !defined(__ARM_PCS_VFP)
	// Returns in D0 on hardfp and in R0,R1 on softfp, due to the union joining the two floats.
	VMOV(D0, R0, R1);
#endif
	CompVrotShuffle(dregs, imm, sz, false);
	if (vd2 != -1) {
		// If the negsin setting differs between the two joint invocations, we need to flip the second one.
		bool negSin2 = (imm2 & 0x10) ? true : false;
		CompVrotShuffle(dregs2, imm2, sz, negSin1 != negSin2);
		EatInstruction(nextOp);
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

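// A hedged note on the pairing above: code that rotates by one angle typically needs
// both sin and cos of the same source value, issued as two adjacent vrot instructions
// with the same VS and different shuffle immediates. Detecting that pattern lets a
// single SinCos call feed both CompVrotShuffle invocations, with only a VNEG when the
// two negsin bits differ.
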
void ArmJit::Comp_Vsgn(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		VCMP(fpr.V(sregs[i]));  // vcmp(sregs[i], #0.0)
		VMOV(SCRATCHREG1, fpr.V(sregs[i]));
		VMRS_APSR();  // Move FP flags from FPSCR to APSR (regular flags).
		SetCC(CC_NEQ);
		AND(SCRATCHREG1, SCRATCHREG1, AssumeMakeOperand2(0x80000000));
		ORR(SCRATCHREG1, SCRATCHREG1, AssumeMakeOperand2(0x3F800000));
		SetCC(CC_EQ);
		MOV(SCRATCHREG1, AssumeMakeOperand2(0x0));
		SetCC(CC_AL);
		VMOV(fpr.V(tempregs[i]), SCRATCHREG1);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

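// The integer trick above, as a hedged C sketch: keep the sign bit and OR in the bit
// pattern of 1.0f (0x3F800000), so nonzero inputs map to +/-1.0f and zero maps to 0.0f.
// float_bits/float_from_bits are illustrative helpers, not real functions here:
//
//   u32 bits = float_bits(s);
//   u32 sign = bits & 0x80000000;
//   float result = (s != 0.0f) ? float_from_bits(sign | 0x3F800000) : 0.0f;
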
void ArmJit::Comp_Vocp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// This is a hack that modifies prefixes. We eat them later, so just overwrite.
	// S prefix forces the negate flags.
	js.prefixS |= 0x000F0000;
	// T prefix forces constants on and regnum to 1.
	// That means negate still works, and abs activates a different constant.
	js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
		VADD(fpr.V(tempregs[i]), fpr.V(tregs[i]), fpr.V(sregs[i]));
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

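// For reference, a hedged reading of the prefix hack above: vocp computes
// 1.0f - s[i] per lane. The forced S prefix makes every S read come through negated
// (-s[i]) and the forced T prefix makes every T read come through as the constant
// 1.0f, so the plain per-lane VADD evaluates 1.0f + (-s[i]) == 1.0f - s[i].
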
void ArmJit::Comp_ColorConv(MIPSOpcode op) {
	DISABLE;
}

void ArmJit::Comp_Vbfy(MIPSOpcode op) {
	DISABLE;
}
}

#endif // PPSSPP_ARCH(ARM)