GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/CompVFPU.cpp
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
// Table 13.10 in http://agner.org/optimize/optimizing_assembly.pdf is cool - generate constants with
19
// short instruction sequences. Surprisingly many are possible.
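// For example, an all-ones mask can be made without a memory load:
//   PCMPEQW(XMM1, R(XMM1));  // every bit of XMM1 set
//   XORPS(XMM0, R(XMM1));    // invert XMM0
// That pair is used below in Comp_Vcmp to invert a compare result.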
20
21
#include "ppsspp_config.h"
22
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
23
24
#include <cmath>
25
#include <limits>
26
#include <emmintrin.h>
27
28
#include "Common/Math/math_util.h"
29
30
#include "Common/CPUDetect.h"
31
#include "Common/Log.h"
32
#include "Core/Compatibility.h"
33
#include "Core/Config.h"
34
#include "Core/MemMap.h"
35
#include "Core/Reporting.h"
36
#include "Core/System.h"
37
#include "Core/MIPS/MIPSAnalyst.h"
38
#include "Core/MIPS/MIPSCodeUtils.h"
39
#include "Core/MIPS/MIPSVFPUUtils.h"
40
#include "Core/MIPS/x86/Jit.h"
41
#include "Core/MIPS/x86/RegCache.h"
42
43
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
44
// Currently known non-working ones should have DISABLE.
45
46
// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }
47
#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }
48
#define DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }
49
50
#define _RS MIPS_GET_RS(op)
51
#define _RT MIPS_GET_RT(op)
52
#define _RD MIPS_GET_RD(op)
53
#define _FS MIPS_GET_FS(op)
54
#define _FT MIPS_GET_FT(op)
55
#define _FD MIPS_GET_FD(op)
56
#define _SA MIPS_GET_SA(op)
57
#define _POS ((op>> 6) & 0x1F)
58
#define _SIZE ((op>>11) & 0x1F)
59
#define _IMM16 (signed short)(op & 0xFFFF)
60
#define _IMM26 (op & 0x03FFFFFF)
61
62
namespace MIPSComp
63
{
64
using namespace Gen;
65
using namespace X64JitConstants;
66
67
static const float one = 1.0f;
68
static const float minus_one = -1.0f;
69
70
alignas(16) const u32 noSignMask[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
71
alignas(16) const u32 signBitAll[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
72
alignas(16) const u32 signBitLower[4] = {0x80000000, 0, 0, 0};
73
alignas(16) const float oneOneOneOne[4] = {1.0f, 1.0f, 1.0f, 1.0f};
74
alignas(16) const u32 fourinfnan[4] = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
75
alignas(16) const float identityMatrix[4][4] = { { 1.0f, 0, 0, 0 }, { 0, 1.0f, 0, 0 }, { 0, 0, 1.0f, 0 }, { 0, 0, 0, 1.0f} };
76
77
void Jit::Comp_VPFX(MIPSOpcode op)
78
{
79
CONDITIONAL_DISABLE(VFPU_XFER);
80
int data = op & 0xFFFFF;
81
int regnum = (op >> 24) & 3;
82
switch (regnum) {
83
case 0: // S
84
js.prefixS = data;
85
js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
86
break;
87
case 1: // T
88
js.prefixT = data;
89
js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
90
break;
91
case 2: // D
92
js.prefixD = data & 0x00000FFF;
93
js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
94
break;
95
}
96
}
97
98
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
99
if (prefix == 0xE4) return;
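// Prefix bit layout per element i, as decoded in the loop below:
//   bits 2i..2i+1 : source swizzle (which input element to read)
//   bit  8 + i    : absolute value
//   bit  12 + i   : read from constantArray instead of a register
//   bit  16 + i   : negate
// 0xE4 == 0b11100100 is the identity swizzle (x, y, z, w in order) with no
// abs/negate/constant bits set, so there is nothing to apply.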
100
101
int n = GetNumVectorElements(sz);
102
u8 origV[4];
103
static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
104
105
for (int i = 0; i < n; i++)
106
origV[i] = vregs[i];
107
108
for (int i = 0; i < n; i++) {
109
int regnum = (prefix >> (i*2)) & 3;
110
int abs = (prefix >> (8+i)) & 1;
111
int negate = (prefix >> (16+i)) & 1;
112
int constants = (prefix >> (12+i)) & 1;
113
114
// Unchanged, hurray.
115
if (!constants && regnum == i && !abs && !negate)
116
continue;
117
118
// This puts the value into a temp reg, so we won't write the modified value back.
119
vregs[i] = fpr.GetTempV();
120
fpr.MapRegV(vregs[i], MAP_NOINIT | MAP_DIRTY);
121
122
if (!constants) {
123
// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
124
// TODO: But some ops seem to use const 0 instead?
125
if (regnum >= n) {
126
ERROR_LOG_REPORT(Log::CPU, "Invalid VFPU swizzle: %08x / %d", prefix, sz);
127
regnum = 0;
128
}
129
fpr.SimpleRegV(origV[regnum], 0);
130
MOVSS(fpr.VX(vregs[i]), fpr.V(origV[regnum]));
131
if (abs) {
132
if (RipAccessible(&noSignMask)) {
133
ANDPS(fpr.VX(vregs[i]), M(&noSignMask)); // rip accessible
134
} else {
135
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
136
ANDPS(fpr.VX(vregs[i]), MatR(TEMPREG));
137
}
138
}
139
} else {
140
if (RipAccessible(constantArray)) {
141
MOVSS(fpr.VX(vregs[i]), M(&constantArray[regnum + (abs << 2)])); // rip accessible
142
} else {
143
MOV(PTRBITS, R(TEMPREG), ImmPtr(&constantArray[regnum + (abs << 2)]));
144
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
145
}
146
}
147
148
if (negate) {
149
if (RipAccessible(&signBitLower)) {
150
XORPS(fpr.VX(vregs[i]), M(&signBitLower)); // rip accessible
151
} else {
152
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
153
XORPS(fpr.VX(vregs[i]), MatR(TEMPREG));
154
}
155
}
156
// TODO: This probably means it will swap out soon, inefficiently...
157
fpr.ReleaseSpillLockV(vregs[i]);
158
}
159
}
160
161
void Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
162
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
163
164
GetVectorRegs(regs, sz, vectorReg);
165
if (js.prefixD == 0)
166
return;
167
168
int n = GetNumVectorElements(sz);
169
for (int i = 0; i < n; i++) {
170
// Hopefully this is rare, we'll just write it into a reg we drop.
171
if (js.VfpuWriteMask(i))
172
regs[i] = fpr.GetTempV();
173
}
174
}
175
176
void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
177
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
178
if (!js.prefixD) return;
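// The D prefix encodes per-element saturation: sat == 1 clamps to [0, 1] and
// sat == 3 clamps to [-1, 1]. The sequences below clamp with compare/AND/MIN
// tricks that retain NaNs instead of forcing them to a bound.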
179
180
int n = GetNumVectorElements(sz);
181
for (int i = 0; i < n; i++) {
182
if (js.VfpuWriteMask(i))
183
continue;
184
185
int sat = (js.prefixD >> (i * 2)) & 3;
186
if (sat == 1) {
187
fpr.MapRegV(vregs[i], MAP_DIRTY);
188
189
// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
190
MOVSS(R(XMM0), fpr.VX(vregs[i]));
191
XORPS(XMM1, R(XMM1));
192
CMPLESS(XMM0, R(XMM1));
193
ANDNPS(XMM0, fpr.V(vregs[i]));
194
195
// Retain a NAN in XMM0 (must be second operand.)
196
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
197
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
198
MINSS(fpr.VX(vregs[i]), R(XMM0));
199
} else if (sat == 3) {
200
fpr.MapRegV(vregs[i], MAP_DIRTY);
201
202
// Check for < -1.0f, but careful of NANs.
203
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
204
MOVSS(XMM1, MatR(TEMPREG));
205
MOVSS(R(XMM0), fpr.VX(vregs[i]));
206
CMPLESS(XMM0, R(XMM1));
207
// If it was NOT less, the three ops below do nothing.
208
// Otherwise, they replace the value with -1.0f.
209
ANDPS(XMM1, R(XMM0));
210
ANDNPS(XMM0, fpr.V(vregs[i]));
211
ORPS(XMM0, R(XMM1));
212
213
// Retain a NAN in XMM0 (must be second operand.)
214
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
215
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
216
MINSS(fpr.VX(vregs[i]), R(XMM0));
217
}
218
}
219
}
220
221
// Vector regs can overlap in all sorts of swizzled ways.
222
// This does allow a single overlap: dreg may alias sregs[di].
223
bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
224
for (int i = 0; i < sn; ++i) {
225
if (sregs[i] == dreg && i != di)
226
return false;
227
}
228
for (int i = 0; i < tn; ++i) {
229
if (tregs[i] == dreg)
230
return false;
231
}
232
233
// Hurray, no overlap, we can write directly.
234
return true;
235
}
236
237
bool IsOverlapSafe(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
238
return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;
239
}
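// Example: if dregs[0] aliases sregs[1], writing d[0] before d[1] has been computed
// would clobber an input that is still needed, so the result must go through a temp
// register (or the op is disabled entirely, as in Comp_Vcmov).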
240
241
void Jit::Comp_SV(MIPSOpcode op) {
242
CONDITIONAL_DISABLE(LSU_VFPU);
243
244
s32 imm = (signed short)(op&0xFFFC);
245
int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
246
MIPSGPReg rs = _RS;
247
248
CheckMemoryBreakpoint(0, rs, imm);
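// Both cases below follow the JitSafeMem fast/slow pattern used throughout this file:
// PrepareRead/PrepareWrite emits the direct fast-memory access, PrepareSlowRead/
// PrepareSlowWrite emits the fallback through safeMemFuncs, and Finish() joins the paths.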
249
250
switch (op >> 26) {
251
case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
252
{
253
gpr.Lock(rs);
254
fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
255
256
JitSafeMem safe(this, rs, imm);
257
OpArg src;
258
if (safe.PrepareRead(src, 4)) {
259
MOVSS(fpr.VX(vt), safe.NextFastAddress(0));
260
}
261
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
262
MOVD_xmm(fpr.VX(vt), R(EAX));
263
}
264
safe.Finish();
265
266
gpr.UnlockAll();
267
fpr.ReleaseSpillLocks();
268
}
269
break;
270
271
case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
272
{
273
gpr.Lock(rs);
274
275
fpr.MapRegV(vt, 0);
276
277
JitSafeMem safe(this, rs, imm);
278
OpArg dest;
279
if (safe.PrepareWrite(dest, 4)) {
280
MOVSS(safe.NextFastAddress(0), fpr.VX(vt));
281
}
282
if (safe.PrepareSlowWrite()) {
283
MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vt));
284
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), 0);
285
}
286
safe.Finish();
287
288
fpr.ReleaseSpillLocks();
289
gpr.UnlockAll();
290
}
291
break;
292
293
default:
294
DISABLE;
295
}
296
}
297
298
void Jit::Comp_SVQ(MIPSOpcode op) {
299
CONDITIONAL_DISABLE(LSU_VFPU);
300
301
int imm = (signed short)(op&0xFFFC);
302
int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
303
MIPSGPReg rs = _RS;
304
305
CheckMemoryBreakpoint(0, rs, imm);
306
307
switch (op >> 26) {
308
case 53: //lvl.q/lvr.q
309
{
310
if (!g_Config.bFastMemory) {
311
DISABLE;
312
}
313
DISABLE;
314
315
gpr.MapReg(rs, true, false);
316
gpr.FlushLockX(ECX);
317
u8 vregs[4];
318
GetVectorRegs(vregs, V_Quad, vt);
319
MOV(32, R(EAX), gpr.R(rs));
320
ADD(32, R(EAX), Imm32(imm));
321
#ifdef MASKED_PSP_MEMORY
322
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
323
#endif
324
MOV(32, R(ECX), R(EAX));
325
SHR(32, R(EAX), Imm8(2));
326
AND(32, R(EAX), Imm32(0x3));
327
CMP(32, R(EAX), Imm32(0));
328
FixupBranch next = J_CC(CC_NE);
329
330
auto PSPMemAddr = [](X64Reg scaled, int offset) {
331
#if PPSSPP_ARCH(X86)
332
return MDisp(scaled, (u32)Memory::base + offset);
333
#else
334
return MComplex(MEMBASEREG, scaled, 1, offset);
335
#endif
336
};
337
338
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY);
339
340
// Offset = 0
341
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 0));
342
343
FixupBranch skip0 = J();
344
SetJumpTarget(next);
345
CMP(32, R(EAX), Imm32(1));
346
next = J_CC(CC_NE);
347
348
// Offset = 1
349
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 4));
350
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 0));
351
352
FixupBranch skip1 = J();
353
SetJumpTarget(next);
354
CMP(32, R(EAX), Imm32(2));
355
next = J_CC(CC_NE);
356
357
// Offset = 2
358
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 8));
359
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 4));
360
MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 0));
361
362
FixupBranch skip2 = J();
363
SetJumpTarget(next);
364
CMP(32, R(EAX), Imm32(3));
365
next = J_CC(CC_NE);
366
367
// Offset = 3
368
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 12));
369
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 8));
370
MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 4));
371
MOVSS(fpr.RX(vregs[0]), PSPMemAddr(EAX, 0));
372
373
SetJumpTarget(next);
374
SetJumpTarget(skip0);
375
SetJumpTarget(skip1);
376
SetJumpTarget(skip2);
377
378
gpr.UnlockAll();
379
fpr.ReleaseSpillLocks();
380
}
381
break;
382
383
case 54: //lv.q
384
{
385
gpr.Lock(rs);
386
// This must be in a reg or an immediate.
387
// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().
388
if (!gpr.IsImm(rs))
389
gpr.MapReg(rs, true, false);
390
391
u8 vregs[4];
392
GetVectorRegs(vregs, V_Quad, vt);
393
394
if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) {
395
JitSafeMem safe(this, rs, imm);
396
OpArg src;
397
if (safe.PrepareRead(src, 16)) {
398
// Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode.
399
if (g_Config.bFastMemory) {
400
MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0));
401
} else {
402
MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0));
403
}
404
}
405
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
406
for (int i = 0; i < 4; i++) {
407
safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
408
// We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits.
409
MOVD_xmm(XMM0, R(EAX));
410
MOVSS(fpr.VSX(vregs), R(XMM0));
411
// Rotate things so we can read in the next higher float.
412
// By the end (4 rotates), they'll all be back into place.
413
SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1));
414
}
415
}
416
safe.Finish();
417
gpr.UnlockAll();
418
fpr.ReleaseSpillLocks();
419
return;
420
}
421
422
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
423
424
JitSafeMem safe(this, rs, imm);
425
OpArg src;
426
if (safe.PrepareRead(src, 16)) {
427
// Just copy 4 words the easiest way while not wasting registers.
428
for (int i = 0; i < 4; i++)
429
MOVSS(fpr.VX(vregs[i]), safe.NextFastAddress(i * 4));
430
}
431
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
432
for (int i = 0; i < 4; i++) {
433
safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
434
MOVD_xmm(fpr.VX(vregs[i]), R(EAX));
435
}
436
}
437
safe.Finish();
438
439
gpr.UnlockAll();
440
fpr.ReleaseSpillLocks();
441
}
442
break;
443
444
case 62: //sv.q
445
{
446
gpr.Lock(rs);
447
// This must be in a reg or an immediate.
448
// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().
449
if (!gpr.IsImm(rs))
450
gpr.MapReg(rs, true, false);
451
452
u8 vregs[4];
453
GetVectorRegs(vregs, V_Quad, vt);
454
455
if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) {
456
JitSafeMem safe(this, rs, imm);
457
OpArg dest;
458
if (safe.PrepareWrite(dest, 16)) {
459
// Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode.
460
if (g_Config.bFastMemory) {
461
MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs));
462
} else {
463
MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs));
464
}
465
}
466
if (safe.PrepareSlowWrite()) {
467
MOVAPS(XMM0, fpr.VS(vregs));
468
for (int i = 0; i < 4; i++) {
469
MOVSS(MIPSSTATE_VAR(temp), XMM0);
470
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
471
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
472
}
473
}
474
safe.Finish();
475
gpr.UnlockAll();
476
fpr.ReleaseSpillLocks();
477
return;
478
}
479
480
// Even if we don't use real SIMD, there are still 8 or 16 scalar float registers.
481
fpr.MapRegsV(vregs, V_Quad, 0);
482
483
JitSafeMem safe(this, rs, imm);
484
OpArg dest;
485
if (safe.PrepareWrite(dest, 16)) {
486
for (int i = 0; i < 4; i++)
487
MOVSS(safe.NextFastAddress(i * 4), fpr.VX(vregs[i]));
488
}
489
if (safe.PrepareSlowWrite()) {
490
for (int i = 0; i < 4; i++) {
491
MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vregs[i]));
492
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
493
}
494
}
495
safe.Finish();
496
497
gpr.UnlockAll();
498
fpr.ReleaseSpillLocks();
499
}
500
break;
501
502
default:
503
DISABLE;
504
break;
505
}
506
}
507
508
void Jit::Comp_VVectorInit(MIPSOpcode op) {
509
CONDITIONAL_DISABLE(VFPU_XFER);
510
511
if (js.HasUnknownPrefix())
512
DISABLE;
513
514
VectorSize sz = GetVecSize(op);
515
int type = (op >> 16) & 0xF;
516
u8 dregs[4];
517
GetVectorRegsPrefixD(dregs, sz, _VD);
518
519
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
520
if (type == 6) {
521
XORPS(fpr.VSX(dregs), fpr.VS(dregs));
522
} else if (type == 7) {
523
if (RipAccessible(&oneOneOneOne)) {
524
MOVAPS(fpr.VSX(dregs), M(&oneOneOneOne)); // rip accessible
525
} else {
526
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
527
MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
528
}
529
} else {
530
DISABLE;
531
}
532
ApplyPrefixD(dregs, sz);
533
fpr.ReleaseSpillLocks();
534
return;
535
}
536
537
switch (type) {
538
case 6: // v=zeros; break; //vzero
539
XORPS(XMM0, R(XMM0));
540
break;
541
case 7: // v=ones; break; //vone
542
if (RipAccessible(&one)) {
543
MOVSS(XMM0, M(&one)); // rip accessible
544
} else {
545
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
546
MOVSS(XMM0, MatR(TEMPREG));
547
}
548
break;
549
default:
550
DISABLE;
551
break;
552
}
553
554
int n = GetNumVectorElements(sz);
555
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
556
for (int i = 0; i < n; ++i)
557
MOVSS(fpr.VX(dregs[i]), R(XMM0));
558
ApplyPrefixD(dregs, sz);
559
560
fpr.ReleaseSpillLocks();
561
}
562
563
void Jit::Comp_VIdt(MIPSOpcode op) {
564
CONDITIONAL_DISABLE(VFPU_XFER);
565
if (js.HasUnknownPrefix())
566
DISABLE;
567
568
int vd = _VD;
569
VectorSize sz = GetVecSize(op);
570
int n = GetNumVectorElements(sz);
571
572
u8 dregs[4];
573
GetVectorRegsPrefixD(dregs, sz, _VD);
574
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
575
int row = vd & (n - 1);
576
if (RipAccessible(identityMatrix)) {
577
MOVAPS(fpr.VSX(dregs), M(identityMatrix[row])); // rip accessible
578
} else {
579
MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[row]));
580
MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
581
}
582
ApplyPrefixD(dregs, sz);
583
fpr.ReleaseSpillLocks();
584
return;
585
}
586
587
XORPS(XMM0, R(XMM0));
588
if (RipAccessible(&one)) {
589
MOVSS(XMM1, M(&one)); // rip accessible
590
} else {
591
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
592
MOVSS(XMM1, MatR(TEMPREG));
593
}
594
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
595
switch (sz) {
596
case V_Pair:
597
MOVSS(fpr.VX(dregs[0]), R((vd&1)==0 ? XMM1 : XMM0));
598
MOVSS(fpr.VX(dregs[1]), R((vd&1)==1 ? XMM1 : XMM0));
599
break;
600
case V_Quad:
601
MOVSS(fpr.VX(dregs[0]), R((vd&3)==0 ? XMM1 : XMM0));
602
MOVSS(fpr.VX(dregs[1]), R((vd&3)==1 ? XMM1 : XMM0));
603
MOVSS(fpr.VX(dregs[2]), R((vd&3)==2 ? XMM1 : XMM0));
604
MOVSS(fpr.VX(dregs[3]), R((vd&3)==3 ? XMM1 : XMM0));
605
break;
606
default:
607
_dbg_assert_msg_(false,"Trying to interpret instruction that can't be interpreted");
608
break;
609
}
610
ApplyPrefixD(dregs, sz);
611
fpr.ReleaseSpillLocks();
612
}
613
614
void Jit::Comp_VDot(MIPSOpcode op) {
615
CONDITIONAL_DISABLE(VFPU_VEC);
616
617
if (js.HasUnknownPrefix())
618
DISABLE;
619
620
VectorSize sz = GetVecSize(op);
621
int n = GetNumVectorElements(sz);
622
623
// TODO: Force read one of them into regs? probably not.
624
u8 sregs[4], tregs[4], dregs[1];
625
GetVectorRegsPrefixS(sregs, sz, _VS);
626
GetVectorRegsPrefixT(tregs, sz, _VT);
627
GetVectorRegsPrefixD(dregs, V_Single, _VD);
628
629
// With SSE2, these won't really give any performance benefit on their own, but may reduce
630
// conversion costs from/to SIMD form. However, the SSE4.1 DPPS may be worth it.
631
// Benchmarking will have to decide whether to enable this on < SSE4.1. Also a HADDPS version
632
// for SSE3 could be written.
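// DPPS imm8: the high nibble selects which input lanes enter the dot product and the
// low nibble selects which result lanes receive the sum, so 0x31/0x71/0xF1 below mean
// "multiply the low 2/3/4 lanes, write the result to lane 0 only".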
633
if (fpr.TryMapDirtyInInVS(dregs, V_Single, sregs, sz, tregs, sz)) {
634
switch (sz) {
635
case V_Pair:
636
if (cpu_info.bSSE4_1) {
637
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
638
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
639
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x31);
640
} else {
641
MOVAPS(XMM0, fpr.VS(sregs));
642
DPPS(XMM0, fpr.VS(tregs), 0x31);
643
MOVAPS(fpr.VSX(dregs), R(XMM0));
644
}
645
} else {
646
MOVAPS(XMM0, fpr.VS(sregs));
647
MULPS(XMM0, fpr.VS(tregs));
648
MOVAPS(R(XMM1), XMM0);
649
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(1, 1, 1, 1));
650
ADDPS(XMM1, R(XMM0));
651
MOVAPS(fpr.VS(dregs), XMM1);
652
}
653
break;
654
case V_Triple:
655
if (cpu_info.bSSE4_1) {
656
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
657
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
658
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x71);
659
} else {
660
MOVAPS(XMM0, fpr.VS(sregs));
661
DPPS(XMM0, fpr.VS(tregs), 0x71);
662
MOVAPS(fpr.VSX(dregs), R(XMM0));
663
}
664
} else {
665
MOVAPS(XMM0, fpr.VS(sregs));
666
MULPS(XMM0, fpr.VS(tregs));
667
MOVAPS(R(XMM1), XMM0);
668
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(3, 2, 1, 1));
669
ADDSS(XMM1, R(XMM0));
670
SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(3, 2, 2, 2));
671
ADDSS(XMM1, R(XMM0));
672
MOVAPS(fpr.VS(dregs), XMM1);
673
}
674
break;
675
case V_Quad:
676
if (cpu_info.bSSE4_1) {
677
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
678
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
679
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0xF1);
680
} else {
681
MOVAPS(XMM0, fpr.VS(sregs));
682
DPPS(XMM0, fpr.VS(tregs), 0xF1);
683
MOVAPS(fpr.VSX(dregs), R(XMM0));
684
}
685
} /* else if (cpu_info.bSSE3) { // This is slower than the SSE2 solution on my Ivy!
686
MOVAPS(XMM0, fpr.VS(sregs));
687
MOVAPS(XMM1, fpr.VS(tregs));
688
HADDPS(XMM0, R(XMM1));
689
HADDPS(XMM0, R(XMM0));
690
MOVAPS(fpr.VSX(dregs), R(XMM0));
691
} */ else {
692
MOVAPS(XMM0, fpr.VS(sregs));
693
MOVAPS(XMM1, fpr.VS(tregs));
694
MULPS(XMM0, R(XMM1));
695
MOVAPS(XMM1, R(XMM0));
696
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(2, 3, 0, 1));
697
ADDPS(XMM0, R(XMM1));
698
MOVAPS(XMM1, R(XMM0));
699
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 1, 2, 3));
700
ADDSS(XMM0, R(XMM1));
701
MOVAPS(fpr.VSX(dregs), R(XMM0));
702
}
703
break;
704
default:
705
DISABLE;
706
}
707
ApplyPrefixD(dregs, V_Single);
708
fpr.ReleaseSpillLocks();
709
return;
710
}
711
712
// Flush SIMD.
713
fpr.SimpleRegsV(sregs, sz, 0);
714
fpr.SimpleRegsV(tregs, sz, 0);
715
fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
716
717
X64Reg tempxreg = XMM0;
718
if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {
719
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
720
tempxreg = fpr.VX(dregs[0]);
721
}
722
723
// Need to start with +0.0f so it doesn't result in -0.0f.
724
MOVSS(tempxreg, fpr.V(sregs[0]));
725
MULSS(tempxreg, fpr.V(tregs[0]));
726
for (int i = 1; i < n; i++)
727
{
728
// sum += s[i]*t[i];
729
MOVSS(XMM1, fpr.V(sregs[i]));
730
MULSS(XMM1, fpr.V(tregs[i]));
731
ADDSS(tempxreg, R(XMM1));
732
}
733
734
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {
735
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
736
MOVSS(fpr.V(dregs[0]), tempxreg);
737
}
738
739
ApplyPrefixD(dregs, V_Single);
740
741
fpr.ReleaseSpillLocks();
742
}
743
744
745
void Jit::Comp_VHdp(MIPSOpcode op) {
746
CONDITIONAL_DISABLE(VFPU_VEC);
747
748
if (js.HasUnknownPrefix())
749
DISABLE;
750
751
VectorSize sz = GetVecSize(op);
752
int n = GetNumVectorElements(sz);
753
754
u8 sregs[4], tregs[4], dregs[1];
755
GetVectorRegsPrefixS(sregs, sz, _VS);
756
GetVectorRegsPrefixT(tregs, sz, _VT);
757
GetVectorRegsPrefixD(dregs, V_Single, _VD);
758
759
// Flush SIMD.
760
fpr.SimpleRegsV(sregs, sz, 0);
761
fpr.SimpleRegsV(tregs, sz, 0);
762
fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
763
764
X64Reg tempxreg = XMM0;
765
if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {
766
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
767
tempxreg = fpr.VX(dregs[0]);
768
}
769
770
// Need to start with +0.0f so it doesn't result in -0.0f.
771
MOVSS(tempxreg, fpr.V(sregs[0]));
772
MULSS(tempxreg, fpr.V(tregs[0]));
773
for (int i = 1; i < n; i++) {
774
// sum += (i == n-1) ? t[i] : s[i]*t[i];
775
if (i == n - 1) {
776
ADDSS(tempxreg, fpr.V(tregs[i]));
777
} else {
778
MOVSS(XMM1, fpr.V(sregs[i]));
779
MULSS(XMM1, fpr.V(tregs[i]));
780
ADDSS(tempxreg, R(XMM1));
781
}
782
}
783
784
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {
785
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
786
MOVSS(fpr.V(dregs[0]), tempxreg);
787
}
788
789
ApplyPrefixD(dregs, V_Single);
790
791
fpr.ReleaseSpillLocks();
792
}
793
794
void Jit::Comp_VCrossQuat(MIPSOpcode op) {
795
CONDITIONAL_DISABLE(VFPU_VEC);
796
797
if (js.HasUnknownPrefix())
798
DISABLE;
799
800
VectorSize sz = GetVecSize(op);
801
802
u8 sregs[4], tregs[4], dregs[4];
803
GetVectorRegs(sregs, sz, _VS);
804
GetVectorRegs(tregs, sz, _VT);
805
GetVectorRegs(dregs, sz, _VD);
806
807
if (sz == V_Triple) {
808
// Cross product vcrsp.t
809
if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {
810
MOVAPS(XMM0, fpr.VS(tregs));
811
MOVAPS(XMM1, fpr.VS(sregs));
812
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));
813
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 0, 2, 1));
814
MULPS(XMM0, fpr.VS(sregs));
815
MULPS(XMM1, fpr.VS(tregs));
816
SUBPS(XMM0, R(XMM1));
817
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));
818
MOVAPS(fpr.VS(dregs), XMM0);
819
fpr.ReleaseSpillLocks();
820
return;
821
}
822
823
// Flush SIMD.
824
fpr.SimpleRegsV(sregs, sz, 0);
825
fpr.SimpleRegsV(tregs, sz, 0);
826
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
827
828
fpr.MapRegsV(sregs, sz, 0);
829
830
// Compute X
831
MOVSS(XMM0, fpr.V(sregs[1]));
832
MULSS(XMM0, fpr.V(tregs[2]));
833
MOVSS(XMM1, fpr.V(sregs[2]));
834
MULSS(XMM1, fpr.V(tregs[1]));
835
SUBSS(XMM0, R(XMM1));
836
MOVSS(fpr.V(dregs[0]), XMM0);
837
838
// Compute Y
839
MOVSS(XMM0, fpr.V(sregs[2]));
840
MULSS(XMM0, fpr.V(tregs[0]));
841
MOVSS(XMM1, fpr.V(sregs[0]));
842
MULSS(XMM1, fpr.V(tregs[2]));
843
SUBSS(XMM0, R(XMM1));
844
MOVSS(fpr.V(dregs[1]), XMM0);
845
846
// Compute Z
847
MOVSS(XMM0, fpr.V(sregs[0]));
848
MULSS(XMM0, fpr.V(tregs[1]));
849
MOVSS(XMM1, fpr.V(sregs[1]));
850
MULSS(XMM1, fpr.V(tregs[0]));
851
SUBSS(XMM0, R(XMM1));
852
MOVSS(fpr.V(dregs[2]), XMM0);
853
} else if (sz == V_Quad) {
854
// Flush SIMD.
855
fpr.SimpleRegsV(sregs, sz, 0);
856
fpr.SimpleRegsV(tregs, sz, 0);
857
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
858
859
// Quaternion product vqmul.q
860
fpr.MapRegsV(sregs, sz, 0);
861
862
// Compute X
863
// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];
864
MOVSS(XMM0, fpr.V(sregs[0]));
865
MULSS(XMM0, fpr.V(tregs[3]));
866
MOVSS(XMM1, fpr.V(sregs[1]));
867
MULSS(XMM1, fpr.V(tregs[2]));
868
ADDSS(XMM0, R(XMM1));
869
MOVSS(XMM1, fpr.V(sregs[2]));
870
MULSS(XMM1, fpr.V(tregs[1]));
871
SUBSS(XMM0, R(XMM1));
872
MOVSS(XMM1, fpr.V(sregs[3]));
873
MULSS(XMM1, fpr.V(tregs[0]));
874
ADDSS(XMM0, R(XMM1));
875
MOVSS(fpr.V(dregs[0]), XMM0);
876
877
// Compute Y
878
//d[1] = s[1] * t[3] + s[2] * t[0] + s[3] * t[1] - s[0] * t[2];
879
MOVSS(XMM0, fpr.V(sregs[1]));
880
MULSS(XMM0, fpr.V(tregs[3]));
881
MOVSS(XMM1, fpr.V(sregs[2]));
882
MULSS(XMM1, fpr.V(tregs[0]));
883
ADDSS(XMM0, R(XMM1));
884
MOVSS(XMM1, fpr.V(sregs[3]));
885
MULSS(XMM1, fpr.V(tregs[1]));
886
ADDSS(XMM0, R(XMM1));
887
MOVSS(XMM1, fpr.V(sregs[0]));
888
MULSS(XMM1, fpr.V(tregs[2]));
889
SUBSS(XMM0, R(XMM1));
890
MOVSS(fpr.V(dregs[1]), XMM0);
891
892
// Compute Z
893
//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];
894
MOVSS(XMM0, fpr.V(sregs[0]));
895
MULSS(XMM0, fpr.V(tregs[1]));
896
MOVSS(XMM1, fpr.V(sregs[1]));
897
MULSS(XMM1, fpr.V(tregs[0]));
898
SUBSS(XMM0, R(XMM1));
899
MOVSS(XMM1, fpr.V(sregs[2]));
900
MULSS(XMM1, fpr.V(tregs[3]));
901
ADDSS(XMM0, R(XMM1));
902
MOVSS(XMM1, fpr.V(sregs[3]));
903
MULSS(XMM1, fpr.V(tregs[2]));
904
ADDSS(XMM0, R(XMM1));
905
MOVSS(fpr.V(dregs[2]), XMM0);
906
907
// Compute W
908
//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];
909
MOVSS(XMM0, fpr.V(sregs[3]));
910
MULSS(XMM0, fpr.V(tregs[3]));
911
MOVSS(XMM1, fpr.V(sregs[1]));
912
MULSS(XMM1, fpr.V(tregs[1]));
913
SUBSS(XMM0, R(XMM1));
914
MOVSS(XMM1, fpr.V(sregs[2]));
915
MULSS(XMM1, fpr.V(tregs[2]));
916
SUBSS(XMM0, R(XMM1));
917
MOVSS(XMM1, fpr.V(sregs[0]));
918
MULSS(XMM1, fpr.V(tregs[0]));
919
SUBSS(XMM0, R(XMM1));
920
MOVSS(fpr.V(dregs[3]), XMM0);
921
}
922
923
fpr.ReleaseSpillLocks();
924
}
925
926
void Jit::Comp_Vcmov(MIPSOpcode op) {
927
CONDITIONAL_DISABLE(VFPU_COMP);
928
929
if (js.HasUnknownPrefix())
930
DISABLE;
931
932
VectorSize sz = GetVecSize(op);
933
int n = GetNumVectorElements(sz);
934
935
u8 sregs[4], dregs[4];
936
GetVectorRegsPrefixS(sregs, sz, _VS);
937
GetVectorRegsPrefixD(dregs, sz, _VD);
938
int tf = (op >> 19) & 1;
939
int imm3 = (op >> 16) & 7;
940
941
// Flush SIMD.
942
fpr.SimpleRegsV(sregs, sz, 0);
943
944
for (int i = 0; i < n; ++i) {
945
// Simplification: Disable if overlap unsafe
946
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
947
DISABLE;
948
}
949
}
950
951
if (imm3 < 6) {
952
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
953
fpr.MapRegsV(dregs, sz, MAP_DIRTY);
954
// Test one bit of CC. This bit decides whether none or all subregisters are copied.
955
TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << imm3));
956
FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);
957
for (int i = 0; i < n; i++) {
958
MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));
959
}
960
SetJumpTarget(skip);
961
} else {
962
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
963
fpr.MapRegsV(dregs, sz, MAP_DIRTY);
964
// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
965
for (int i = 0; i < n; i++) {
966
TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << i));
967
FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);
968
MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));
969
SetJumpTarget(skip);
970
}
971
}
972
973
ApplyPrefixD(dregs, sz);
974
975
fpr.ReleaseSpillLocks();
976
}
977
978
static s32 DoVminSS(s32 treg) {
979
s32 sreg = currentMIPS->temp;
980
981
// If both are negative, we flip the comparison (float bit patterns are sign-magnitude, not two's complement.)
982
if (sreg < 0 && treg < 0) {
983
// If at least one side is NAN, we take the highest mantissa bits.
984
return treg < sreg ? sreg : treg;
985
} else {
986
// Otherwise, we take the lowest value (negative or lowest mantissa.)
987
return treg > sreg ? sreg : treg;
988
}
989
}
990
991
static s32 DoVmaxSS(s32 treg) {
992
s32 sreg = currentMIPS->temp;
993
994
// This is the same logic as vmin, just reversed.
995
if (sreg < 0 && treg < 0) {
996
return treg < sreg ? treg : sreg;
997
} else {
998
return treg > sreg ? treg : sreg;
999
}
1000
}
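// Both helpers work on the raw float bit patterns: for two non-negative floats the
// plain s32 compare matches the float ordering, while for two negative floats it is
// reversed (hence the sign check above). They are only called on the unordered path,
// so NaNs get a defined result, unlike MINSS/MAXSS which simply return the second operand.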
1001
1002
void Jit::Comp_VecDo3(MIPSOpcode op) {
1003
CONDITIONAL_DISABLE(VFPU_VEC);
1004
1005
if (js.HasUnknownPrefix())
1006
DISABLE;
1007
1008
// Check that we can support the ops, and prepare temporary values for ops that need it.
1009
bool allowSIMD = true;
1010
switch (op >> 26) {
1011
case 24: //VFPU0
1012
switch ((op >> 23) & 7) {
1013
case 0: // d[i] = s[i] + t[i]; break; //vadd
1014
case 1: // d[i] = s[i] - t[i]; break; //vsub
1015
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1016
break;
1017
default:
1018
DISABLE;
1019
}
1020
break;
1021
case 25: //VFPU1
1022
switch ((op >> 23) & 7) {
1023
case 0: // d[i] = s[i] * t[i]; break; //vmul
1024
break;
1025
default:
1026
DISABLE;
1027
}
1028
break;
1029
case 27: //VFPU3
1030
switch ((op >> 23) & 7) {
1031
case 2: // vmin
1032
case 3: // vmax
1033
allowSIMD = false;
1034
break;
1035
case 6: // vsge
1036
case 7: // vslt
1037
break;
1038
default:
1039
DISABLE;
1040
}
1041
break;
1042
default:
1043
DISABLE;
1044
break;
1045
}
1046
1047
VectorSize sz = GetVecSize(op);
1048
int n = GetNumVectorElements(sz);
1049
1050
u8 sregs[4], tregs[4], dregs[4];
1051
GetVectorRegsPrefixS(sregs, sz, _VS);
1052
GetVectorRegsPrefixT(tregs, sz, _VT);
1053
GetVectorRegsPrefixD(dregs, sz, _VD);
1054
1055
if (allowSIMD && fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {
1056
void (XEmitter::*opFunc)(X64Reg, OpArg) = nullptr;
1057
bool symmetric = false;
1058
switch (op >> 26) {
1059
case 24: //VFPU0
1060
switch ((op >> 23) & 7) {
1061
case 0: // d[i] = s[i] + t[i]; break; //vadd
1062
opFunc = &XEmitter::ADDPS;
1063
symmetric = true;
1064
break;
1065
case 1: // d[i] = s[i] - t[i]; break; //vsub
1066
opFunc = &XEmitter::SUBPS;
1067
break;
1068
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1069
opFunc = &XEmitter::DIVPS;
1070
break;
1071
}
1072
break;
1073
case 25: //VFPU1
1074
switch ((op >> 23) & 7)
1075
{
1076
case 0: // d[i] = s[i] * t[i]; break; //vmul
1077
opFunc = &XEmitter::MULPS;
1078
symmetric = true;
1079
break;
1080
}
1081
break;
1082
case 27: //VFPU3
1083
switch ((op >> 23) & 7)
1084
{
1085
case 2: // vmin
1086
// TODO: Mishandles NaN. Disabled for now.
1087
MOVAPS(XMM1, fpr.VS(sregs));
1088
MINPS(XMM1, fpr.VS(tregs));
1089
MOVAPS(fpr.VSX(dregs), R(XMM1));
1090
break;
1091
case 3: // vmax
1092
// TODO: Mishandles NaN. Disabled for now.
1093
MOVAPS(XMM1, fpr.VS(sregs));
1094
MAXPS(XMM1, fpr.VS(tregs));
1095
MOVAPS(fpr.VSX(dregs), R(XMM1));
1096
break;
1097
case 6: // vsge
1098
MOVAPS(XMM0, fpr.VS(tregs));
1099
MOVAPS(XMM1, fpr.VS(sregs));
1100
CMPPS(XMM0, R(XMM1), CMP_ORD);
1101
CMPPS(XMM1, fpr.VS(tregs), CMP_NLT);
1102
1103
ANDPS(XMM1, R(XMM0));
1104
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1105
ANDPS(XMM1, MatR(TEMPREG));
1106
MOVAPS(fpr.VSX(dregs), R(XMM1));
1107
break;
1108
case 7: // vslt
1109
MOVAPS(XMM1, fpr.VS(sregs));
1110
CMPPS(XMM1, fpr.VS(tregs), CMP_LT);
1111
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1112
ANDPS(XMM1, MatR(TEMPREG));
1113
MOVAPS(fpr.VSX(dregs), R(XMM1));
1114
break;
1115
}
1116
break;
1117
}
1118
1119
if (opFunc != nullptr) {
1120
if (fpr.VSX(dregs) != fpr.VSX(tregs)) {
1121
if (fpr.VSX(dregs) != fpr.VSX(sregs)) {
1122
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
1123
}
1124
(this->*opFunc)(fpr.VSX(dregs), fpr.VS(tregs));
1125
} else if (symmetric) {
1126
// We already know d = t.
1127
(this->*opFunc)(fpr.VSX(dregs), fpr.VS(sregs));
1128
} else {
1129
MOVAPS(XMM1, fpr.VS(sregs));
1130
(this->*opFunc)(XMM1, fpr.VS(tregs));
1131
MOVAPS(fpr.VSX(dregs), R(XMM1));
1132
}
1133
}
1134
1135
ApplyPrefixD(dregs, sz);
1136
fpr.ReleaseSpillLocks();
1137
return;
1138
}
1139
1140
// Flush SIMD.
1141
fpr.SimpleRegsV(sregs, sz, 0);
1142
fpr.SimpleRegsV(tregs, sz, 0);
1143
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1144
1145
X64Reg tempxregs[4];
1146
for (int i = 0; i < n; ++i)
1147
{
1148
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs, n, tregs))
1149
{
1150
// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.
1151
// But for vmin/vmax/vsge, we need XMM0/XMM1, so avoid.
1152
if (i < 2 && (op >> 26) != 27)
1153
tempxregs[i] = (X64Reg) (XMM0 + i);
1154
else
1155
{
1156
int reg = fpr.GetTempV();
1157
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
1158
fpr.SpillLockV(reg);
1159
tempxregs[i] = fpr.VX(reg);
1160
}
1161
}
1162
else
1163
{
1164
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
1165
fpr.SpillLockV(dregs[i]);
1166
tempxregs[i] = fpr.VX(dregs[i]);
1167
}
1168
}
1169
1170
for (int i = 0; i < n; ++i)
1171
{
1172
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
1173
MOVSS(tempxregs[i], fpr.V(sregs[i]));
1174
}
1175
1176
for (int i = 0; i < n; ++i) {
1177
switch (op >> 26) {
1178
case 24: //VFPU0
1179
switch ((op >> 23) & 7) {
1180
case 0: // d[i] = s[i] + t[i]; break; //vadd
1181
ADDSS(tempxregs[i], fpr.V(tregs[i]));
1182
break;
1183
case 1: // d[i] = s[i] - t[i]; break; //vsub
1184
SUBSS(tempxregs[i], fpr.V(tregs[i]));
1185
break;
1186
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1187
DIVSS(tempxregs[i], fpr.V(tregs[i]));
1188
break;
1189
}
1190
break;
1191
case 25: //VFPU1
1192
switch ((op >> 23) & 7)
1193
{
1194
case 0: // d[i] = s[i] * t[i]; break; //vmul
1195
MULSS(tempxregs[i], fpr.V(tregs[i]));
1196
break;
1197
}
1198
break;
1199
case 27: //VFPU3
1200
switch ((op >> 23) & 7)
1201
{
1202
case 2: // vmin
1203
{
1204
MOVSS(XMM0, fpr.V(tregs[i]));
1205
UCOMISS(tempxregs[i], R(XMM0));
1206
FixupBranch skip = J_CC(CC_NP, true);
1207
1208
MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);
1209
MOVD_xmm(R(EAX), XMM0);
1210
CallProtectedFunction(&DoVminSS, R(EAX));
1211
MOVD_xmm(tempxregs[i], R(EAX));
1212
FixupBranch finish = J();
1213
1214
SetJumpTarget(skip);
1215
MINSS(tempxregs[i], R(XMM0));
1216
SetJumpTarget(finish);
1217
}
1218
break;
1219
case 3: // vmax
1220
{
1221
MOVSS(XMM0, fpr.V(tregs[i]));
1222
UCOMISS(tempxregs[i], R(XMM0));
1223
FixupBranch skip = J_CC(CC_NP, true);
1224
1225
MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);
1226
MOVD_xmm(R(EAX), XMM0);
1227
CallProtectedFunction(&DoVmaxSS, R(EAX));
1228
MOVD_xmm(tempxregs[i], R(EAX));
1229
FixupBranch finish = J();
1230
1231
SetJumpTarget(skip);
1232
MAXSS(tempxregs[i], R(XMM0));
1233
SetJumpTarget(finish);
1234
}
1235
break;
1236
case 6: // vsge
1237
// We can't just reverse, because of 0/-0.
1238
MOVSS(XMM0, fpr.V(tregs[i]));
1239
MOVSS(XMM1, R(tempxregs[i]));
1240
CMPORDSS(XMM1, R(XMM0));
1241
CMPNLTSS(tempxregs[i], R(XMM0));
1242
ANDPS(tempxregs[i], R(XMM1));
1243
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1244
ANDPS(tempxregs[i], MatR(TEMPREG));
1245
break;
1246
case 7: // vslt
1247
CMPLTSS(tempxregs[i], fpr.V(tregs[i]));
1248
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1249
ANDPS(tempxregs[i], MatR(TEMPREG));
1250
break;
1251
}
1252
break;
1253
}
1254
}
1255
1256
for (int i = 0; i < n; ++i)
1257
{
1258
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
1259
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
1260
}
1261
1262
ApplyPrefixD(dregs, sz);
1263
1264
fpr.ReleaseSpillLocks();
1265
}
1266
1267
alignas(16) static const u32 vcmpMask[4][4] = {
1268
{0x00000031, 0x00000000, 0x00000000, 0x00000000},
1269
{0x00000011, 0x00000012, 0x00000000, 0x00000000},
1270
{0x00000011, 0x00000012, 0x00000014, 0x00000000},
1271
{0x00000011, 0x00000012, 0x00000014, 0x00000018},
1272
};
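// Each entry selects the per-lane CC bit (1 << i) plus bit 4, the "any lane true"
// aggregate; bit 5 ("all lanes true") is derived afterwards in Comp_Vcmp by comparing
// the OR of the masked lanes against affected_bits. The single-lane entry is 0x31
// because with one element "any" and "all" coincide.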
1273
1274
void Jit::Comp_Vcmp(MIPSOpcode op) {
1275
CONDITIONAL_DISABLE(VFPU_COMP);
1276
1277
if (js.HasUnknownPrefix())
1278
DISABLE;
1279
1280
VectorSize sz = GetVecSize(op);
1281
int n = GetNumVectorElements(sz);
1282
1283
VCondition cond = (VCondition)(op & 0xF);
1284
1285
u8 sregs[4], tregs[4];
1286
GetVectorRegsPrefixS(sregs, sz, _VS);
1287
GetVectorRegsPrefixT(tregs, sz, _VT);
1288
1289
// For some of these, we just fall back to the interpreter.
1290
switch (cond) {
1291
case VC_EI: // c = my_isinf(s[i]); break;
1292
case VC_NI: // c = !my_isinf(s[i]); break;
1293
DISABLE;
1294
break;
1295
case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection
1296
case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
1297
case VC_EN: // c = my_isnan(s[i]); break;
1298
case VC_NN: // c = !my_isnan(s[i]); break;
1299
if (_VS != _VT)
1300
DISABLE;
1301
break;
1302
default:
1303
break;
1304
}
1305
1306
// First, let's get the trivial ones.
1307
1308
static const int true_bits[4] = {0x31, 0x33, 0x37, 0x3f};
1309
1310
if (cond == VC_TR) {
1311
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1312
OR(32, gpr.R(MIPS_REG_VFPUCC), Imm32(true_bits[n-1]));
1313
return;
1314
} else if (cond == VC_FL) {
1315
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1316
AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~true_bits[n-1]));
1317
return;
1318
}
1319
1320
if (n > 1)
1321
gpr.FlushLockX(ECX);
1322
1323
// Start with zero in each lane for the compare to zero.
1324
if (cond == VC_EZ || cond == VC_NZ) {
1325
XORPS(XMM0, R(XMM0));
1326
if (n > 1) {
1327
XORPS(XMM1, R(XMM1));
1328
}
1329
}
1330
1331
bool inverse = false;
1332
1333
if (cond == VC_GE || cond == VC_GT) {
1334
// We flip, and we need them in regs so we don't clear the high lanes.
1335
fpr.SimpleRegsV(sregs, sz, 0);
1336
fpr.MapRegsV(tregs, sz, 0);
1337
} else {
1338
fpr.SimpleRegsV(tregs, sz, 0);
1339
fpr.MapRegsV(sregs, sz, 0);
1340
}
1341
1342
// We go backwards because it's more convenient to put things in the right lanes.
1343
int affected_bits = (1 << 4) | (1 << 5); // 4 and 5
1344
for (int i = n - 1; i >= 0; --i) {
1345
// Alternate between XMM0 and XMM1
1346
X64Reg reg = i == 1 || i == 3 ? XMM1 : XMM0;
1347
if ((i == 0 || i == 1) && n > 2) {
1348
// We need to swap lanes... this also puts them in the right place.
1349
SHUFPS(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 1));
1350
}
1351
1352
// Let's only handle the easy ones, and fall back on the interpreter for the rest.
1353
bool compareTwo = false;
1354
bool compareToZero = false;
1355
int comparison = -1;
1356
bool flip = false;
1357
1358
switch (cond) {
1359
case VC_ES:
1360
comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
1361
MOVSS(reg, fpr.V(sregs[i]));
1362
break;
1363
1364
case VC_NS:
1365
comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
1366
MOVSS(reg, fpr.V(sregs[i]));
1367
// Note that we do this all at once at the end.
1368
inverse = true;
1369
break;
1370
1371
case VC_EN:
1372
comparison = CMP_UNORD;
1373
compareTwo = true;
1374
break;
1375
1376
case VC_NN:
1377
comparison = CMP_UNORD;
1378
compareTwo = true;
1379
// Note that we do this all at once at the end.
1380
inverse = true;
1381
break;
1382
1383
case VC_EQ: // c = s[i] == t[i]; break;
1384
comparison = CMP_EQ;
1385
compareTwo = true;
1386
break;
1387
1388
case VC_LT: // c = s[i] < t[i]; break;
1389
comparison = CMP_LT;
1390
compareTwo = true;
1391
break;
1392
1393
case VC_LE: // c = s[i] <= t[i]; break;
1394
comparison = CMP_LE;
1395
compareTwo = true;
1396
break;
1397
1398
case VC_NE: // c = s[i] != t[i]; break;
1399
comparison = CMP_NEQ;
1400
compareTwo = true;
1401
break;
1402
1403
case VC_GE: // c = s[i] >= t[i]; break;
1404
comparison = CMP_LE;
1405
flip = true;
1406
compareTwo = true;
1407
break;
1408
1409
case VC_GT: // c = s[i] > t[i]; break;
1410
comparison = CMP_LT;
1411
flip = true;
1412
compareTwo = true;
1413
break;
1414
1415
case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f; break;
1416
comparison = CMP_EQ;
1417
compareToZero = true;
1418
break;
1419
1420
case VC_NZ: // c = s[i] != 0; break;
1421
comparison = CMP_NEQ;
1422
compareToZero = true;
1423
break;
1424
1425
default:
1426
DISABLE;
1427
}
1428
1429
if (comparison != -1) {
1430
if (compareTwo) {
1431
if (!flip) {
1432
MOVSS(reg, fpr.V(sregs[i]));
1433
CMPSS(reg, fpr.V(tregs[i]), comparison);
1434
} else {
1435
MOVSS(reg, fpr.V(tregs[i]));
1436
CMPSS(reg, fpr.V(sregs[i]), comparison);
1437
}
1438
} else if (compareToZero) {
1439
CMPSS(reg, fpr.V(sregs[i]), comparison);
1440
}
1441
}
1442
1443
affected_bits |= 1 << i;
1444
}
1445
1446
if (n > 1) {
1447
XOR(32, R(ECX), R(ECX));
1448
1449
// This combines them together.
1450
UNPCKLPS(XMM0, R(XMM1));
1451
1452
// Finalize the comparison for ES/NS.
1453
if (cond == VC_ES || cond == VC_NS) {
1454
MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
1455
ANDPS(XMM0, MatR(TEMPREG));
1456
PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
1457
// It's inverted below for NS.
1458
}
1459
1460
if (inverse) {
1461
// The canonical way to generate a bunch of ones, see https://stackoverflow.com/questions/35085059/what-are-the-best-instruction-sequences-to-generate-vector-constants-on-the-fly
1462
PCMPEQW(XMM1, R(XMM1));
1463
XORPS(XMM0, R(XMM1));
1464
}
1465
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vcmpMask[n - 1]));
1466
ANDPS(XMM0, MatR(TEMPREG));
1467
MOVAPS(MIPSSTATE_VAR(vcmpResult), XMM0);
1468
1469
MOV(32, R(TEMPREG), MIPSSTATE_VAR(vcmpResult[0]));
1470
for (int i = 1; i < n; ++i) {
1471
OR(32, R(TEMPREG), MIPSSTATE_VAR_ELEM32(vcmpResult[0], i));
1472
}
1473
1474
// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison,
1475
// which is the most common after all.
1476
CMP(32, R(TEMPREG), Imm8(affected_bits & 0x1F));
1477
SETcc(CC_E, R(ECX));
1478
SHL(32, R(ECX), Imm8(5));
1479
OR(32, R(TEMPREG), R(ECX));
1480
} else {
1481
// Finalize the comparison for ES/NS.
1482
if (cond == VC_ES || cond == VC_NS) {
1483
MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
1484
ANDPS(XMM0, MatR(TEMPREG));
1485
PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
1486
// It's inverted below for NS.
1487
}
1488
1489
MOVD_xmm(R(TEMPREG), XMM0);
1490
if (inverse) {
1491
XOR(32, R(TEMPREG), Imm32(0xFFFFFFFF));
1492
}
1493
AND(32, R(TEMPREG), Imm32(0x31));
1494
}
1495
1496
gpr.UnlockAllX();
1497
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1498
AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~affected_bits));
1499
OR(32, gpr.R(MIPS_REG_VFPUCC), R(TEMPREG));
1500
1501
fpr.ReleaseSpillLocks();
1502
}
1503
1504
// There are no immediates for floating point, so we need to load these
1505
// from RAM. Might as well have a table ready.
1506
extern const float mulTableVi2f[32] = {
1507
1.0f/(1UL<<0),1.0f/(1UL<<1),1.0f/(1UL<<2),1.0f/(1UL<<3),
1508
1.0f/(1UL<<4),1.0f/(1UL<<5),1.0f/(1UL<<6),1.0f/(1UL<<7),
1509
1.0f/(1UL<<8),1.0f/(1UL<<9),1.0f/(1UL<<10),1.0f/(1UL<<11),
1510
1.0f/(1UL<<12),1.0f/(1UL<<13),1.0f/(1UL<<14),1.0f/(1UL<<15),
1511
1.0f/(1UL<<16),1.0f/(1UL<<17),1.0f/(1UL<<18),1.0f/(1UL<<19),
1512
1.0f/(1UL<<20),1.0f/(1UL<<21),1.0f/(1UL<<22),1.0f/(1UL<<23),
1513
1.0f/(1UL<<24),1.0f/(1UL<<25),1.0f/(1UL<<26),1.0f/(1UL<<27),
1514
1.0f/(1UL<<28),1.0f/(1UL<<29),1.0f/(1UL<<30),1.0f/(1UL<<31),
1515
};
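// mulTableVi2f[imm] == 2^-imm: vi2f converts a fixed-point integer with imm fractional
// bits to float by doing CVTDQ2PS and then multiplying by this scale.
// Example: with imm = 4, the raw value 48 becomes 48 * (1/16) = 3.0f.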
1516
1517
void Jit::Comp_Vi2f(MIPSOpcode op) {
1518
CONDITIONAL_DISABLE(VFPU_VEC);
1519
1520
if (js.HasUnknownPrefix())
1521
DISABLE;
1522
1523
VectorSize sz = GetVecSize(op);
1524
int n = GetNumVectorElements(sz);
1525
1526
int imm = (op >> 16) & 0x1f;
1527
const float *mult = &mulTableVi2f[imm];
1528
1529
u8 sregs[4], dregs[4];
1530
GetVectorRegsPrefixS(sregs, sz, _VS);
1531
GetVectorRegsPrefixD(dregs, sz, _VD);
1532
1533
// Flush SIMD.
1534
fpr.SimpleRegsV(sregs, sz, 0);
1535
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1536
1537
int tempregs[4];
1538
for (int i = 0; i < n; ++i) {
1539
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
1540
tempregs[i] = fpr.GetTempV();
1541
} else {
1542
tempregs[i] = dregs[i];
1543
}
1544
}
1545
1546
if (*mult != 1.0f) {
1547
if (RipAccessible(mult)) {
1548
MOVSS(XMM1, M(mult)); // rip accessible
1549
} else {
1550
MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
1551
MOVSS(XMM1, MatR(TEMPREG));
1552
}
1553
}
1554
for (int i = 0; i < n; i++) {
1555
fpr.MapRegV(tempregs[i], sregs[i] == dregs[i] ? MAP_DIRTY : MAP_NOINIT);
1556
if (fpr.V(sregs[i]).IsSimpleReg()) {
1557
CVTDQ2PS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
1558
} else {
1559
MOVSS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
1560
CVTDQ2PS(fpr.VX(tempregs[i]), R(fpr.VX(tempregs[i])));
1561
}
1562
if (*mult != 1.0f)
1563
MULSS(fpr.VX(tempregs[i]), R(XMM1));
1564
}
1565
1566
for (int i = 0; i < n; ++i) {
1567
if (dregs[i] != tempregs[i]) {
1568
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1569
MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
1570
}
1571
}
1572
1573
ApplyPrefixD(dregs, sz);
1574
fpr.ReleaseSpillLocks();
1575
}
1576
1577
// Planning for true SIMD
1578
1579
// Sequence for gathering sparse registers into one SIMD:
1580
// MOVSS(XMM0, fpr.R(sregs[0]));
1581
// MOVSS(XMM1, fpr.R(sregs[1]));
1582
// MOVSS(XMM2, fpr.R(sregs[2]));
1583
// MOVSS(XMM3, fpr.R(sregs[3]));
1584
// SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); // XMM0 = S1 S1 S0 S0
1585
// SHUFPS(XMM2, R(XMM3), _MM_SHUFFLE(0, 0, 0, 0)); // XMM2 = S3 S3 S2 S2
1586
// SHUFPS(XMM0, R(XMM2), _MM_SHUFFLE(2, 0, 2, 0)); // XMM0 = S3 S2 S1 S0
1587
// Some punpckwd etc would also work.
1588
// Alternatively, MOVSS and three PINSRD (SSE4) with mem source.
1589
// Why PINSRD instead of INSERTPS?
1590
// http://software.intel.com/en-us/blogs/2009/01/07/using-sse41-for-mp3-encoding-quantization
1591
1592
// Sequence for scattering a SIMD register to sparse registers:
1593
// (Very serial though, better methods may be possible)
1594
// MOVSS(fpr.R(sregs[0]), XMM0);
1595
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1596
// MOVSS(fpr.R(sregs[1]), XMM0);
1597
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1598
// MOVSS(fpr.R(sregs[2]), XMM0);
1599
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1600
// MOVSS(fpr.R(sregs[3]), XMM0);
1601
// On SSE4 we should use EXTRACTPS.
1602
1603
// Translation of ryg's half_to_float5_SSE2
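// The idea: a half float has a 5-bit exponent (bias 15) and a 10-bit mantissa. Shifting
// the exponent+mantissa left by 13 puts them in single-precision position, and multiplying
// by "magic" (2^(254-15-127) = 2^112 as a float) rebiases the exponent from 15 to 127.
// Inf/NaN exponents are then patched separately via the was_infnan/exp_infnan masks.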
1604
void Jit::Comp_Vh2f(MIPSOpcode op) {
1605
CONDITIONAL_DISABLE(VFPU_VEC);
1606
if (js.HasUnknownPrefix())
1607
DISABLE;
1608
1609
#define SSE_CONST4(name, val) alignas(16) static const u32 name[4] = { (val), (val), (val), (val) }
1610
1611
SSE_CONST4(mask_nosign, 0x7fff);
1612
SSE_CONST4(nan_mantissa, 0x800003ff);
1613
SSE_CONST4(magic, (254 - 15) << 23);
1614
SSE_CONST4(was_infnan, 0x7bff);
1615
SSE_CONST4(exp_infnan, 255 << 23);
1616
1617
OpArg mask_nosign_arg, nan_mantissa_arg, magic_arg, was_infnan_arg, exp_infnan_arg;
1618
if (RipAccessible(mask_nosign)) {
1619
mask_nosign_arg = M(&mask_nosign[0]);
1620
nan_mantissa_arg = M(&nan_mantissa[0]);
1621
magic_arg = M(&magic[0]);
1622
was_infnan_arg = M(&was_infnan[0]);
1623
exp_infnan_arg = M(&exp_infnan[0]);
1624
} else {
1625
MOV(PTRBITS, R(TEMPREG), ImmPtr(&mask_nosign[0]));
1626
mask_nosign_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &mask_nosign[0]);
1627
nan_mantissa_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &nan_mantissa[0]);
1628
magic_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &magic[0]);
1629
was_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &was_infnan[0]);
1630
exp_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &exp_infnan[0]);
1631
}
1632
1633
#undef SSE_CONST4
1634
VectorSize sz = GetVecSize(op);
1635
VectorSize outsize;
1636
switch (sz) {
1637
case V_Single:
1638
outsize = V_Pair;
1639
break;
1640
case V_Pair:
1641
outsize = V_Quad;
1642
break;
1643
default:
1644
DISABLE;
1645
}
1646
1647
u8 sregs[4], dregs[4];
1648
GetVectorRegsPrefixS(sregs, sz, _VS);
1649
GetVectorRegsPrefixD(dregs, outsize, _VD);
1650
1651
// Flush SIMD.
1652
fpr.SimpleRegsV(sregs, sz, 0);
1653
1654
// Force ourselves an extra xreg as temp space.
1655
X64Reg tempR = fpr.GetFreeXReg();
1656
1657
MOVSS(XMM0, fpr.V(sregs[0]));
1658
if (sz != V_Single) {
1659
MOVSS(XMM1, fpr.V(sregs[1]));
1660
PUNPCKLDQ(XMM0, R(XMM1));
1661
}
1662
XORPS(XMM1, R(XMM1));
1663
PUNPCKLWD(XMM0, R(XMM1));
1664
1665
// OK, 16 bits in each word.
1666
// Let's go. Deep magic here.
1667
MOVAPS(XMM1, R(XMM0));
1668
ANDPS(XMM0, mask_nosign_arg); // xmm0 = expmant
1669
XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0
1670
MOVAPS(tempR, R(XMM0));
1671
PSLLD(XMM0, 13);
1672
MULPS(XMM0, magic_arg); // xmm0 = scaled
1673
PSLLD(XMM1, 16); // xmm1 = sign
1674
ORPS(XMM0, R(XMM1));
1675
1676
// Now create a NAN mask, adding in the sign.
1677
ORPS(XMM1, R(tempR)); // xmm1 = sign + original mantissa.
1678
ANDPS(XMM1, nan_mantissa_arg); // xmm1 = original mantissa
1679
PCMPGTD(tempR, was_infnan_arg); // tempR = b_wasinfnan
1680
ORPS(XMM1, exp_infnan_arg); // xmm1 = infnan result
1681
ANDPS(XMM1, R(tempR)); // xmm1 = infnan result OR zero if not infnan
1682
ANDNPS(tempR, R(XMM0)); // tempR = result OR zero if infnan
1683
ORPS(XMM1, R(tempR));
1684
1685
fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
1686
1687
// TODO: Could apply D-prefix in parallel here...
1688
1689
MOVSS(fpr.V(dregs[0]), XMM1);
1690
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1691
MOVSS(fpr.V(dregs[1]), XMM1);
1692
1693
if (sz != V_Single) {
1694
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1695
MOVSS(fpr.V(dregs[2]), XMM1);
1696
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1697
MOVSS(fpr.V(dregs[3]), XMM1);
1698
}
1699
1700
ApplyPrefixD(dregs, outsize);
1701
gpr.UnlockAllX();
1702
fpr.ReleaseSpillLocks();
1703
}
1704
1705
// The goal is to map (reversed byte order for clarity):
1706
// AABBCCDD -> 000000AA 000000BB 000000CC 000000DD
1707
alignas(16) static s8 vc2i_shuffle[16] = { -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3 };
1708
// AABBCCDD -> AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
1709
alignas(16) static s8 vuc2i_shuffle[16] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 };
1710
1711
void Jit::Comp_Vx2i(MIPSOpcode op) {
1712
CONDITIONAL_DISABLE(VFPU_VEC);
1713
if (js.HasUnknownPrefix())
1714
DISABLE;
1715
1716
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
1717
bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)
1718
1719
// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
1720
// at the top. vus2i shifts it an extra bit right afterward.
1721
// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
1722
// at the top too. vuc2i is a bit special (see below.)
1723
// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
1724
// then use it for both.
1725
1726
VectorSize sz = GetVecSize(op);
1727
VectorSize outsize;
1728
if (bits == 8) {
1729
outsize = V_Quad;
1730
} else {
1731
switch (sz) {
1732
case V_Single:
1733
outsize = V_Pair;
1734
break;
1735
case V_Pair:
1736
outsize = V_Quad;
1737
break;
1738
default:
1739
DISABLE;
1740
}
1741
}
1742
1743
u8 sregs[4], dregs[4];
1744
GetVectorRegsPrefixS(sregs, sz, _VS);
1745
GetVectorRegsPrefixD(dregs, outsize, _VD);
1746
1747
// Flush SIMD.
1748
fpr.SimpleRegsV(sregs, sz, 0);
1749
1750
if (bits == 16) {
1751
MOVSS(XMM1, fpr.V(sregs[0]));
1752
if (sz != V_Single) {
1753
MOVSS(XMM0, fpr.V(sregs[1]));
1754
PUNPCKLDQ(XMM1, R(XMM0));
1755
}
1756
1757
// Unpack 16-bit words into 32-bit words, upper position, and we're done!
1758
PXOR(XMM0, R(XMM0));
1759
PUNPCKLWD(XMM0, R(XMM1));
1760
} else if (bits == 8) {
1761
if (unsignedOp) {
1762
// vuc2i is a bit special. It spreads out the bits like this:
1763
// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
1764
MOVSS(XMM0, fpr.V(sregs[0]));
1765
if (cpu_info.bSSSE3 && RipAccessible(vuc2i_shuffle)) {
1766
// Not really different speed. Generates a bit less code.
1767
PSHUFB(XMM0, M(&vuc2i_shuffle[0])); // rip accessible
1768
} else {
1769
// First, we change 0xDDCCBBAA to 0xDDDDCCCCBBBBAAAA.
1770
PUNPCKLBW(XMM0, R(XMM0));
1771
// Now, interleave each 16 bits so they're all 32 bits wide.
1772
PUNPCKLWD(XMM0, R(XMM0));
1773
}
1774
} else {
1775
if (cpu_info.bSSSE3 && RipAccessible(vc2i_shuffle)) {
1776
MOVSS(XMM0, fpr.V(sregs[0]));
1777
PSHUFB(XMM0, M(&vc2i_shuffle[0]));
1778
} else {
1779
PXOR(XMM1, R(XMM1));
1780
MOVSS(XMM0, fpr.V(sregs[0]));
1781
PUNPCKLBW(XMM1, R(XMM0));
1782
PXOR(XMM0, R(XMM0));
1783
PUNPCKLWD(XMM0, R(XMM1));
1784
}
1785
}
1786
}
1787
1788
// At this point we have the regs in the 4 lanes.
1789
// In the "u" mode, we need to shift it out of the sign bit.
1790
if (unsignedOp) {
1791
PSRLD(XMM0, 1);
1792
}
1793
1794
if (fpr.TryMapRegsVS(dregs, outsize, MAP_NOINIT | MAP_DIRTY)) {
1795
MOVAPS(fpr.VSX(dregs), R(XMM0));
1796
} else {
1797
// Done! TODO: The rest of this should be possible to extract into a function.
1798
fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
1799
1800
// TODO: Could apply D-prefix in parallel here...
1801
1802
MOVSS(fpr.V(dregs[0]), XMM0);
1803
PSRLDQ(XMM0, 4);
1804
MOVSS(fpr.V(dregs[1]), XMM0);
1805
1806
if (outsize != V_Pair) {
1807
PSRLDQ(XMM0, 4);
1808
MOVSS(fpr.V(dregs[2]), XMM0);
1809
PSRLDQ(XMM0, 4);
1810
MOVSS(fpr.V(dregs[3]), XMM0);
1811
}
1812
}
1813
1814
ApplyPrefixD(dregs, outsize);
1815
gpr.UnlockAllX();
1816
fpr.ReleaseSpillLocks();
1817
}
1818
1819
extern const double mulTableVf2i[32] = {
1820
(1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3),
1821
(1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7),
1822
(1ULL<<8),(1ULL<<9),(1ULL<<10),(1ULL<<11),
1823
(1ULL<<12),(1ULL<<13),(1ULL<<14),(1ULL<<15),
1824
(1ULL<<16),(1ULL<<17),(1ULL<<18),(1ULL<<19),
1825
(1ULL<<20),(1ULL<<21),(1ULL<<22),(1ULL<<23),
1826
(1ULL<<24),(1ULL<<25),(1ULL<<26),(1ULL<<27),
1827
(1ULL<<28),(1ULL<<29),(1ULL<<30),(1ULL<<31),
1828
};
1829
1830
static const double maxMinIntAsDouble[2] = { (double)0x7fffffff, (double)(int)0x80000000 }; // the (int) cast makes this -2147483648.0, not +0x80000000
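// mulTableVf2i[imm] == 2^imm (as a double), the inverse of mulTableVi2f: vf2i scales the
// float by 2^imm before rounding to an integer with imm fractional bits. The scaled value
// is clamped against maxMinIntAsDouble in double precision, since a float cannot represent
// 0x7fffffff exactly (see Comp_Vf2i below).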
1831
1832
void Jit::Comp_Vf2i(MIPSOpcode op) {
1833
CONDITIONAL_DISABLE(VFPU_VEC);
1834
if (js.HasUnknownPrefix())
1835
DISABLE;
1836
1837
VectorSize sz = GetVecSize(op);
1838
int n = GetNumVectorElements(sz);
1839
1840
int imm = (op >> 16) & 0x1f;
1841
const double *mult = &mulTableVf2i[imm];
1842
1843
int setMXCSR = -1;
1844
int rmode = (op >> 21) & 0x1f;
1845
switch (rmode) {
1846
case 17:
1847
break; //z - truncate. Easy to support.
1848
case 16:
1849
setMXCSR = 0;
1850
break;
1851
case 18:
1852
setMXCSR = 2;
1853
break;
1854
case 19:
1855
setMXCSR = 1;
1856
break;
1857
}
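// setMXCSR is the value for MXCSR's rounding-control field (bits 13-14):
// 0 = round to nearest, 1 = toward -infinity, 2 = toward +infinity, 3 = toward zero.
// Truncation (rmode 17) needs no MXCSR change because CVTTSD2SI always truncates.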
1858
1859
// Small optimization: 0 is our default mode anyway.
if (setMXCSR == 0 && !js.hasSetRounding) {
setMXCSR = -1;
}
// Except for truncate, we need to update MXCSR to our preferred rounding mode.
if (setMXCSR != -1) {
STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
AND(32, R(TEMPREG), Imm32(~(3 << 13)));
if (setMXCSR != 0) {
OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
}
MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
LDMXCSR(MIPSSTATE_VAR(temp));
}

u8 sregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, sz, _VD);

// Really tricky to SIMD due to double precision requirement...

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_DIRTY | MAP_NOINIT);

u8 tempregs[4];
for (int i = 0; i < n; ++i) {
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
tempregs[i] = fpr.GetTempV();
} else {
tempregs[i] = dregs[i];
}
}

if (*mult != 1.0f) {
if (RipAccessible(mult)) {
MOVSD(XMM1, M(mult)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
MOVSD(XMM1, MatR(TEMPREG));
}
}

fpr.MapRegsV(tempregs, sz, MAP_DIRTY | MAP_NOINIT);
for (int i = 0; i < n; i++) {
// Need to do this in double precision to clamp correctly as float
// doesn't have enough precision to represent 0x7fffffff for example exactly.
MOVSS(XMM0, fpr.V(sregs[i]));
CVTSS2SD(XMM0, R(XMM0)); // convert to double precision
if (*mult != 1.0f) {
MULSD(XMM0, R(XMM1));
}
MOV(PTRBITS, R(TEMPREG), ImmPtr(maxMinIntAsDouble));
MINSD(XMM0, MDisp(TEMPREG, 0));
MAXSD(XMM0, MDisp(TEMPREG, sizeof(double)));
// We've set the rounding mode above, so this part's easy.
switch ((op >> 21) & 0x1f) {
case 16: CVTSD2SI(TEMPREG, R(XMM0)); break; //n
case 17: CVTTSD2SI(TEMPREG, R(XMM0)); break; //z - truncate
case 18: CVTSD2SI(TEMPREG, R(XMM0)); break; //u
case 19: CVTSD2SI(TEMPREG, R(XMM0)); break; //d
}
MOVD_xmm(fpr.VX(tempregs[i]), R(TEMPREG));
}

for (int i = 0; i < n; ++i) {
if (dregs[i] != tempregs[i]) {
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
fpr.DiscardV(tempregs[i]);
}
}

if (setMXCSR != -1) {
LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
}

ApplyPrefixD(dregs, sz);
fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vcst(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);

if (js.HasUnknownPrefix())
DISABLE;

int conNum = (op >> 16) & 0x1f;
int vd = _VD;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

u8 dregs[4];
GetVectorRegsPrefixD(dregs, sz, vd);

if (RipAccessible(cst_constants)) {
MOVSS(XMM0, M(&cst_constants[conNum])); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&cst_constants[conNum]));
MOVSS(XMM0, MatR(TEMPREG));
}

if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0,0,0,0));
MOVAPS(fpr.VS(dregs), XMM0);
fpr.ReleaseSpillLocks();
return;
}

fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
for (int i = 0; i < n; i++) {
MOVSS(fpr.V(dregs[i]), XMM0);
}
ApplyPrefixD(dregs, sz);
fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vsgn(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);

if (js.HasUnknownPrefix())
DISABLE;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

u8 sregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

X64Reg tempxregs[4];
for (int i = 0; i < n; ++i) {
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
int reg = fpr.GetTempV();
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
fpr.SpillLockV(reg);
tempxregs[i] = fpr.VX(reg);
} else {
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}

// Would be nice with more temp regs here so we could put signBitLower and oneOneOneOne into regs...
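// A rough scalar equivalent of the loop below (illustration only, assuming IEEE float bit layout):
//   mask = (s[i] == 0.0f) ? 0xFFFFFFFF : 0;              // CMPEQSS
//   signedOne = (bits(s[i]) & 0x80000000) | bits(1.0f);  // ANDPS signBitLower, ORPS oneOneOneOne
//   d[i] = fromBits(~mask & signedOne);                  // ANDNPS: +/-1.0f, or 0.0f when s[i] was +/-0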
for (int i = 0; i < n; ++i) {
XORPS(XMM0, R(XMM0));
CMPEQSS(XMM0, fpr.V(sregs[i])); // XMM0 = s[i] == 0.0f
MOVSS(XMM1, fpr.V(sregs[i]));
// Preserve sign bit, replace rest with ones
if (RipAccessible(signBitLower)) {
ANDPS(XMM1, M(&signBitLower)); // rip accessible
ORPS(XMM1, M(&oneOneOneOne)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
ANDPS(XMM1, MatR(TEMPREG));
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
ORPS(XMM1, MatR(TEMPREG));
}
// If really was equal to zero, zap. Note that ANDN negates the destination.
ANDNPS(XMM0, R(XMM1));
MOVAPS(tempxregs[i], R(XMM0));
}

for (int i = 0; i < n; ++i) {
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}

ApplyPrefixD(dregs, sz);

fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vocp(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);

if (js.HasUnknownPrefix())
DISABLE;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

// This is a hack that modifies prefixes. We eat them later, so just overwrite.
// S prefix forces the negate flags.
js.prefixS |= 0x000F0000;
// T prefix forces constants on and regnum to 1.
// That means negate still works, and abs activates a different constant.
js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;

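// Put another way (an informal restatement, not new behavior): vocp computes 1.0f - s. With the S
// prefix forced to negate and the T prefix forced to the constant 1, the generic add loop below
// effectively emits t + (-s), which is the same thing.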
u8 sregs[4], tregs[4], dregs[4];
// Actually uses the T prefixes (despite being VS.)
GetVectorRegsPrefixS(sregs, sz, _VS);
if (js.prefixT != 0x0000F055)
GetVectorRegsPrefixT(tregs, sz, _VS);
GetVectorRegsPrefixD(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
if (js.prefixT != 0x0000F055)
fpr.SimpleRegsV(tregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

X64Reg tempxregs[4];
for (int i = 0; i < n; ++i) {
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
int reg = fpr.GetTempV();
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
fpr.SpillLockV(reg);
tempxregs[i] = fpr.VX(reg);
} else {
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}

if (js.prefixT == 0x0000F055) {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(XMM1, MatR(TEMPREG));
}
for (int i = 0; i < n; ++i) {
if (js.prefixT == 0x0000F055) {
MOVSS(XMM0, R(XMM1));
} else {
MOVSS(XMM0, fpr.V(tregs[i]));
}
ADDSS(XMM0, fpr.V(sregs[i]));
MOVSS(tempxregs[i], R(XMM0));
}

for (int i = 0; i < n; ++i) {
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}

ApplyPrefixD(dregs, sz);

fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vbfy(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix())
DISABLE;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
if (n != 2 && n != 4) {
DISABLE;
}

u8 sregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, sz, _VD);
// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

X64Reg tempxregs[4];
for (int i = 0; i < n; ++i) {
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
int reg = fpr.GetTempV();
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
fpr.SpillLockV(reg);
tempxregs[i] = fpr.VX(reg);
} else {
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}

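// Summarizing the two cases below: vbfy1 produces (s0+s1, s0-s1, s2+s3, s2-s3) and
// vbfy2 produces (s0+s2, s1+s3, s0-s2, s1-s3), i.e. one butterfly stage of a small transform.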
int subop = (op >> 16) & 0x1F;
if (subop == 3) {
// vbfy2
MOVSS(tempxregs[0], fpr.V(sregs[0]));
MOVSS(tempxregs[1], fpr.V(sregs[1]));
MOVSS(tempxregs[2], fpr.V(sregs[0]));
MOVSS(tempxregs[3], fpr.V(sregs[1]));
ADDSS(tempxregs[0], fpr.V(sregs[2]));
ADDSS(tempxregs[1], fpr.V(sregs[3]));
SUBSS(tempxregs[2], fpr.V(sregs[2]));
SUBSS(tempxregs[3], fpr.V(sregs[3]));
} else if (subop == 2) {
// vbfy1
MOVSS(tempxregs[0], fpr.V(sregs[0]));
MOVSS(tempxregs[1], fpr.V(sregs[0]));
ADDSS(tempxregs[0], fpr.V(sregs[1]));
SUBSS(tempxregs[1], fpr.V(sregs[1]));
if (n == 4) {
MOVSS(tempxregs[2], fpr.V(sregs[2]));
MOVSS(tempxregs[3], fpr.V(sregs[2]));
ADDSS(tempxregs[2], fpr.V(sregs[3]));
SUBSS(tempxregs[3], fpr.V(sregs[3]));
}
} else {
DISABLE;
}

for (int i = 0; i < n; ++i) {
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}

ApplyPrefixD(dregs, sz);

fpr.ReleaseSpillLocks();
}

union u32float {
u32 u;
float f;

operator float() const {
return f;
}

inline u32float &operator *=(const float &other) {
f *= other;
return *this;
}
};

#if PPSSPP_ARCH(AMD64)
typedef float SinCosArg;
#else
typedef u32float SinCosArg;
#endif

void SinCos(SinCosArg angle, float *output) {
vfpu_sincos(angle, output[0], output[1]);
}

void SinOnly(SinCosArg angle, float *output) {
output[0] = vfpu_sin(angle);
}

void NegSinOnly(SinCosArg angle, float *output) {
output[0] = -vfpu_sin(angle);
}

void CosOnly(SinCosArg angle, float *output) {
output[1] = vfpu_cos(angle);
}

void ASinScaled(SinCosArg sine, float *output) {
output[0] = vfpu_asin(sine);
}

void SinCosNegSin(SinCosArg angle, float *output) {
vfpu_sincos(angle, output[0], output[1]);
output[0] = -output[0];
}

void Exp2(SinCosArg arg, float *output) {
output[0] = vfpu_exp2(arg);
}

void Log2(SinCosArg arg, float *output) {
output[0] = vfpu_log2(arg);
}

void RExp2(SinCosArg arg, float *output) {
output[0] = vfpu_rexp2(arg);
}

void Jit::Comp_VV2Op(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);

if (js.HasUnknownPrefix())
DISABLE;

auto specialFuncCallHelper = [this](void (*specialFunc)(SinCosArg, float *output), u8 sreg) {
#if PPSSPP_ARCH(AMD64)
MOVSS(XMM0, fpr.V(sreg));
// TODO: This reg might be different on Linux...
#ifdef _WIN32
LEA(64, RDX, MIPSSTATE_VAR(sincostemp[0]));
#else
LEA(64, RDI, MIPSSTATE_VAR(sincostemp[0]));
#endif
ABI_CallFunction(thunks.ProtectFunction((const void *)specialFunc, 0));
#else
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
if (fpr.V(sreg).IsSimpleReg()) {
MOVD_xmm(R(EAX), fpr.VX(sreg));
} else {
MOV(32, R(EAX), fpr.V(sreg));
}
CallProtectedFunction((const void *)specialFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
#endif
};

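// The helper above hands the source value to one of the SinCos-style functions defined earlier and
// points them at mips_->sincostemp; the trig/exp cases further down then just load sincostemp[0]
// (or sincostemp[1] for cosine) back into the destination register.
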
// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
return;
}

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

u8 sregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, sz, _VD);

bool canSIMD = false;
// Some can be SIMD'd.
switch ((op >> 16) & 0x1f) {
case 0: // vmov
case 1: // vabs
case 2: // vneg
canSIMD = true;
break;
}

if (canSIMD && fpr.TryMapDirtyInVS(dregs, sz, sregs, sz)) {
switch ((op >> 16) & 0x1f) {
case 0: // vmov
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
break;
case 1: // vabs
if (dregs[0] != sregs[0])
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
if (RipAccessible(&noSignMask)) {
ANDPS(fpr.VSX(dregs), M(&noSignMask)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
ANDPS(fpr.VSX(dregs), MatR(TEMPREG));
}
break;
case 2: // vneg
if (dregs[0] != sregs[0])
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
if (RipAccessible(&signBitAll)) {
XORPS(fpr.VSX(dregs), M(&signBitAll)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitAll));
XORPS(fpr.VSX(dregs), MatR(TEMPREG));
}
break;
}
ApplyPrefixD(dregs, sz);
fpr.ReleaseSpillLocks();
return;
}

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

X64Reg tempxregs[4];
for (int i = 0; i < n; ++i)
{
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs))
{
int reg = fpr.GetTempV();
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
fpr.SpillLockV(reg);
tempxregs[i] = fpr.VX(reg);
}
else
{
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}

// Warning: sregs[i] and tempxregs[i] may be the same reg.
// Helps for vmov, hurts for vrcp, etc.
for (int i = 0; i < n; ++i)
{
switch ((op >> 16) & 0x1f)
{
case 0: // d[i] = s[i]; break; //vmov
// Probably for swizzle.
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));
break;
case 1: // d[i] = fabsf(s[i]); break; //vabs
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));
if (RipAccessible(&noSignMask)) {
ANDPS(tempxregs[i], M(&noSignMask)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
ANDPS(tempxregs[i], MatR(TEMPREG));
}
break;
case 2: // d[i] = -s[i]; break; //vneg
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));
if (RipAccessible(&signBitLower)) {
XORPS(tempxregs[i], M(&signBitLower)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
XORPS(tempxregs[i], MatR(TEMPREG));
}
break;
case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));

// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
MOVSS(R(XMM0), tempxregs[i]);
XORPS(XMM1, R(XMM1));
CMPLESS(XMM0, R(XMM1));
ANDNPS(XMM0, R(tempxregs[i]));

// Retain a NAN in XMM0 (must be second operand.)
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(tempxregs[i], MatR(TEMPREG));
MINSS(tempxregs[i], R(XMM0));
break;
case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));

// Check for < -1.0f, but careful of NANs.
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
MOVSS(XMM1, MatR(TEMPREG));
MOVSS(R(XMM0), tempxregs[i]);
CMPLESS(XMM0, R(XMM1));
// If it was NOT less, the three ops below do nothing.
// Otherwise, they replace the value with -1.0f.
ANDPS(XMM1, R(XMM0));
ANDNPS(XMM0, R(tempxregs[i]));
ORPS(XMM0, R(XMM1));

// Retain a NAN in XMM0 (must be second operand.)
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(tempxregs[i], MatR(TEMPREG));
MINSS(tempxregs[i], R(XMM0));
break;
case 16: // d[i] = 1.0f / s[i]; break; //vrcp
if (RipAccessible(&one)) {
MOVSS(XMM0, M(&one)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(XMM0, MatR(TEMPREG));
}
DIVSS(XMM0, fpr.V(sregs[i]));
MOVSS(tempxregs[i], R(XMM0));
break;
case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
SQRTSS(XMM0, fpr.V(sregs[i]));
if (RipAccessible(&one)) {
MOVSS(tempxregs[i], M(&one)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(tempxregs[i], MatR(TEMPREG));
}
DIVSS(tempxregs[i], R(XMM0));
break;
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
specialFuncCallHelper(&SinOnly, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
specialFuncCallHelper(&CosOnly, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[1]));
break;
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
specialFuncCallHelper(&Exp2, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
specialFuncCallHelper(&Log2, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
SQRTSS(tempxregs[i], fpr.V(sregs[i]));
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
ANDPS(tempxregs[i], MatR(TEMPREG));
break;
case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
specialFuncCallHelper(&ASinScaled, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
// Rare so let's not bother checking for RipAccessible.
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
MOVSS(XMM0, MatR(TEMPREG));
DIVSS(XMM0, fpr.V(sregs[i]));
MOVSS(tempxregs[i], R(XMM0));
break;
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
specialFuncCallHelper(&NegSinOnly, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
specialFuncCallHelper(&RExp2, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
}
}
for (int i = 0; i < n; ++i)
{
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}

ApplyPrefixD(dregs, sz);

fpr.ReleaseSpillLocks();
}

void Jit::Comp_Mftv(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);

int imm = op & 0xFF;
MIPSGPReg rt = _RT;
switch ((op >> 21) & 0x1f)
{
case 3: //mfv / mfvc
// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
if (rt != MIPS_REG_ZERO) {
if (imm < 128) { //R(rt) = VI(imm);
fpr.SimpleRegV(imm, 0);
if (fpr.V(imm).IsSimpleReg()) {
fpr.MapRegV(imm, 0);
gpr.MapReg(rt, false, true);
MOVD_xmm(gpr.R(rt), fpr.VX(imm));
} else {
// Let's not bother mapping the vreg.
gpr.MapReg(rt, false, true);
MOV(32, gpr.R(rt), fpr.V(imm));
}
} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
if (imm - 128 == VFPU_CTRL_CC) {
if (gpr.IsImm(MIPS_REG_VFPUCC)) {
gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
} else {
gpr.Lock(rt, MIPS_REG_VFPUCC);
gpr.MapReg(rt, false, true);
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
MOV(32, gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
gpr.UnlockAll();
}
} else {
// In case we have a saved prefix.
FlushPrefixV();
gpr.MapReg(rt, false, true);
MOV(32, gpr.R(rt), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128));
}
} else {
//ERROR - maybe need to make this value too an "interlock" value?
_dbg_assert_msg_(false,"mfv - invalid register");
}
}
break;

case 7: //mtv
if (imm < 128) { // VI(imm) = R(rt);
fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
// Let's not bother mapping rt if we don't have to.
if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
XORPS(fpr.VX(imm), fpr.V(imm));
} else {
gpr.KillImmediate(rt, true, false);
MOVD_xmm(fpr.VX(imm), gpr.R(rt));
}
} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
if (imm - 128 == VFPU_CTRL_CC) {
if (gpr.IsImm(rt)) {
gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
} else {
gpr.Lock(rt, MIPS_REG_VFPUCC);
gpr.MapReg(rt, true, false);
gpr.MapReg(MIPS_REG_VFPUCC, false, true);
MOV(32, gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
gpr.UnlockAll();
}
} else {
gpr.MapReg(rt, true, false);
MOV(32, MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128), gpr.R(rt));
}

// TODO: Optimization if rt is Imm?
if (imm - 128 == VFPU_CTRL_SPREFIX) {
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
js.blockWrotePrefixes = true;
} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
js.blockWrotePrefixes = true;
} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
js.blockWrotePrefixes = true;
}
} else {
//ERROR
_dbg_assert_msg_(false,"mtv - invalid register");
}
break;

default:
DISABLE;
}
}

void Jit::Comp_Vmfvc(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);
int vd = _VD;
int imm = (op >> 8) & 0x7F;
if (imm < VFPU_CTRL_MAX) {
fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
if (imm == VFPU_CTRL_CC) {
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
MOVD_xmm(fpr.VX(vd), gpr.R(MIPS_REG_VFPUCC));
} else {
MOVSS(fpr.VX(vd), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm));
}
fpr.ReleaseSpillLocks();
} else {
fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
XORPS(fpr.VX(vd), fpr.V(vd));
fpr.ReleaseSpillLocks();
}
}

void Jit::Comp_Vmtvc(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);
int vs = _VS;
int imm = op & 0x7F;
if (imm < VFPU_CTRL_MAX) {
fpr.MapRegV(vs, 0);
if (imm == VFPU_CTRL_CC) {
gpr.MapReg(MIPS_REG_VFPUCC, false, true);
MOVD_xmm(gpr.R(MIPS_REG_VFPUCC), fpr.VX(vs));
} else {
MOVSS(MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm), fpr.VX(vs));
}
fpr.ReleaseSpillLocks();

if (imm == VFPU_CTRL_SPREFIX) {
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
js.blockWrotePrefixes = true;
} else if (imm == VFPU_CTRL_TPREFIX) {
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
js.blockWrotePrefixes = true;
} else if (imm == VFPU_CTRL_DPREFIX) {
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
js.blockWrotePrefixes = true;
}
}
}

void Jit::Comp_VMatrixInit(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);

if (js.HasUnknownPrefix())
DISABLE;

MatrixSize sz = GetMtxSize(op);
int n = GetMatrixSide(sz);

// Not really about trying here, it will work if enabled.
if (jo.enableVFPUSIMD) {
VectorSize vsz = GetVectorSize(sz);
u8 vecs[4];
GetMatrixColumns(_VD, sz, vecs);
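// (op >> 16) & 0xF picks the variant here as well: 3 = vmidt (identity), 6 = vmzero, 7 = vmone,
// matching the commented switch in the scalar fallback further down.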
switch ((op >> 16) & 0xF) {
case 3:
MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[0]));
break;
case 7:
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
MOVAPS(XMM0, MatR(TEMPREG));
break;
}

for (int i = 0; i < n; i++) {
u8 vec[4];
GetVectorRegs(vec, vsz, vecs[i]);
fpr.MapRegsVS(vec, vsz, MAP_NOINIT | MAP_DIRTY);
switch ((op >> 16) & 0xF) {
case 3:
MOVAPS(fpr.VSX(vec), MDisp(TEMPREG, 16 * i));
break;
case 6:
XORPS(fpr.VSX(vec), fpr.VS(vec));
break;
case 7:
MOVAPS(fpr.VSX(vec), R(XMM0));
break;
}
}
fpr.ReleaseSpillLocks();
return;
}

u8 dregs[16];
GetMatrixRegs(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

switch ((op >> 16) & 0xF) {
case 3: // vmidt
XORPS(XMM0, R(XMM0));
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(XMM1, MatR(TEMPREG));
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
MOVSS(fpr.V(dregs[a * 4 + b]), a == b ? XMM1 : XMM0);
}
}
break;
case 6: // vmzero
XORPS(XMM0, R(XMM0));
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
}
}
break;
case 7: // vmone
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
MOVSS(XMM0, MatR(TEMPREG));
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
}
}
break;
}

fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vmmov(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);

// TODO: This probably ignores prefixes?
if (js.HasUnknownPrefix())
DISABLE;

MatrixSize sz = GetMtxSize(op);
int n = GetMatrixSide(sz);

if (jo.enableVFPUSIMD) {
VectorSize vsz = GetVectorSize(sz);
u8 dest[4][4];
MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);

u8 vecs[4];
if (overlap == OVERLAP_NONE) {
GetMatrixColumns(_VD, sz, vecs);
for (int i = 0; i < n; ++i) {
GetVectorRegs(dest[i], vsz, vecs[i]);
}
} else {
for (int i = 0; i < n; ++i) {
fpr.GetTempVS(dest[i], vsz);
}
}

GetMatrixColumns(_VS, sz, vecs);
for (int i = 0; i < n; i++) {
u8 vec[4];
GetVectorRegs(vec, vsz, vecs[i]);
fpr.MapRegsVS(vec, vsz, 0);
fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
fpr.ReleaseSpillLocks();
}

if (overlap != OVERLAP_NONE) {
// Okay, move from the temps to VD now.
GetMatrixColumns(_VD, sz, vecs);
for (int i = 0; i < n; i++) {
u8 vec[4];
GetVectorRegs(vec, vsz, vecs[i]);
fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
fpr.MapRegsVS(dest[i], vsz, 0);
MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
fpr.ReleaseSpillLocks();
}
}

fpr.ReleaseSpillLocks();
return;
}

u8 sregs[16], dregs[16];
GetMatrixRegs(sregs, sz, _VS);
GetMatrixRegs(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

// TODO: gas doesn't allow overlap, what does the PSP do?
// Potentially detect overlap or the safe direction to move in, or just DISABLE?
// This is not very optimal; it blows the regcache every time.
u8 tempregs[16];
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
u8 temp = (u8) fpr.GetTempV();
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
fpr.StoreFromRegisterV(temp);
tempregs[a * 4 + b] = temp;
}
}
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
u8 temp = tempregs[a * 4 + b];
fpr.MapRegV(temp, 0);
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
}
}

fpr.ReleaseSpillLocks();
}

void Jit::Comp_VScl(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);

if (js.HasUnknownPrefix())
DISABLE;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

u8 sregs[4], dregs[4], scale;
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixT(&scale, V_Single, _VT);
GetVectorRegsPrefixD(dregs, sz, _VD);

if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, &scale, V_Single, true)) {
MOVSS(XMM0, fpr.VS(&scale));
if (sz != V_Single)
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
if (dregs[0] != sregs[0]) {
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
}
MULPS(fpr.VSX(dregs), R(XMM0));
ApplyPrefixD(dregs, sz);
fpr.ReleaseSpillLocks();
return;
}

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(&scale, V_Single, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

// Move to XMM0 early, so we don't have to worry about overlap with scale.
MOVSS(XMM0, fpr.V(scale));

X64Reg tempxregs[4];
for (int i = 0; i < n; ++i) {
if (dregs[i] != scale || !IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
int reg = fpr.GetTempV();
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
fpr.SpillLockV(reg);
tempxregs[i] = fpr.VX(reg);
} else {
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}
for (int i = 0; i < n; ++i) {
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));
MULSS(tempxregs[i], R(XMM0));
}
for (int i = 0; i < n; ++i) {
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}
ApplyPrefixD(dregs, sz);

fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vmmul(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
if (!js.HasNoPrefix()) {
DISABLE;
}

if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
// Fall back to interpreter, which has the accurate implementation.
// Later we might do something more optimized here.
DISABLE;
}

MatrixSize sz = GetMtxSize(op);
VectorSize vsz = GetVectorSize(sz);
int n = GetMatrixSide(sz);

MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz);
MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz);
// If these overlap, we won't be able to map T as singles.
MatrixOverlapType stoverlap = GetMatrixOverlap(_VS, _VT, sz);

if (jo.enableVFPUSIMD && !soverlap && !toverlap && !stoverlap) {
u8 scols[4], dcols[4], tregs[16];

int vs = _VS;
int vd = _VD;
int vt = _VT;

bool transposeDest = false;
bool transposeS = false;

if ((vd & 0x20) && sz == M_4x4) {
vd ^= 0x20;
transposeDest = true;
}

// Our algorithm needs a transposed S (which is the usual).
if (!(vs & 0x20) && sz == M_4x4) {
vs ^= 0x20;
transposeS = true;
}

// The T matrix we will address individually.
GetMatrixColumns(vd, sz, dcols);
GetMatrixRows(vs, sz, scols);
memset(tregs, 255, sizeof(tregs));
GetMatrixRegs(tregs, sz, vt);
for (int i = 0; i < 16; i++) {
if (tregs[i] != 255)
fpr.StoreFromRegisterV(tregs[i]);
}

u8 scol[4][4];

// Map all of S's columns into registers.
for (int i = 0; i < n; i++) {
if (transposeS){
fpr.StoreFromRegisterV(scols[i]);
}
GetVectorRegs(scol[i], vsz, scols[i]);
fpr.MapRegsVS(scol[i], vsz, 0);
fpr.SpillLockV(scols[i], vsz);
}

// Shorter than manually stuffing the registers. But it feels like there's room for optimization here...
auto transposeInPlace = [=](u8 col[4][4]) {
MOVAPS(XMM0, fpr.VS(col[0]));
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[2]));
UNPCKHPS(XMM0, fpr.VS(col[2]));

MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[3]));
UNPCKHPS(fpr.VSX(col[2]), fpr.VS(col[3]));

MOVAPS(fpr.VSX(col[3]), fpr.VS(col[0]));
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[1]));
UNPCKHPS(fpr.VSX(col[3]), fpr.VS(col[1]));

MOVAPS(fpr.VSX(col[1]), R(XMM0));
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[2]));
UNPCKHPS(XMM0, fpr.VS(col[2]));

MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
MOVAPS(fpr.VSX(col[1]), fpr.VS(col[3]));
MOVAPS(fpr.VSX(col[3]), R(XMM0));
};

// Some games pass in S as an E matrix (transposed). Let's just transpose the data before we do the multiplication instead.
// This is shorter than trying to combine a discontinuous matrix with lots of shufps.
if (transposeS) {
transposeInPlace(scol);
}

// Now, work our way through the matrix, loading things as we go.
// TODO: With more temp registers, can generate much more efficient code.
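// Roughly, for each output column i the loop below computes dcol[i] = sum over j of T(i, j) * scol[j]:
// each T element is loaded as a scalar, broadcast across the vector with SHUFPS(0,0,0,0),
// multiplied against a whole S column with MULPS and accumulated with ADDPS.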
for (int i = 0; i < n; i++) {
MOVSS(XMM1, fpr.V(tregs[4 * i])); // TODO: AVX broadcastss to replace this and the SHUFPS
MOVSS(XMM0, fpr.V(tregs[4 * i + 1]));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, fpr.VS(scol[0]));
MULPS(XMM0, fpr.VS(scol[1]));
ADDPS(XMM1, R(XMM0));
for (int j = 2; j < n; j++) {
MOVSS(XMM0, fpr.V(tregs[4 * i + j]));
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM0, fpr.VS(scol[j]));
ADDPS(XMM1, R(XMM0));
}
// Map the D column.
u8 dcol[4];
GetVectorRegs(dcol, vsz, dcols[i]);
#if !PPSSPP_ARCH(AMD64)
fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT | MAP_NOLOCK);
#else
fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT);
#endif
MOVAPS(fpr.VS(dcol), XMM1);
}
if (transposeS){
for (int i = 0; i < n; i++){
fpr.DiscardVS(scols[i]);
}
}

#if !PPSSPP_ARCH(AMD64)
fpr.ReleaseSpillLocks();
#endif
if (transposeDest) {
u8 dcol[4][4];
for (int i = 0; i < n; i++) {
GetVectorRegs(dcol[i], vsz, dcols[i]);
fpr.MapRegsVS(dcol[i], vsz, MAP_DIRTY);
}
transposeInPlace(dcol);
}
fpr.ReleaseSpillLocks();
return;
}

u8 sregs[16], tregs[16], dregs[16];
GetMatrixRegs(sregs, sz, _VS);
GetMatrixRegs(tregs, sz, _VT);
GetMatrixRegs(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(tregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

// Rough overlap check.
bool overlap = false;
if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) {
// Potential overlap (guaranteed for 3x3 or more).
overlap = true;
}

if (overlap) {
u8 tempregs[16];
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
MOVSS(XMM0, fpr.V(sregs[b * 4]));
MULSS(XMM0, fpr.V(tregs[a * 4]));
for (int c = 1; c < n; c++) {
MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
ADDSS(XMM0, R(XMM1));
}
u8 temp = (u8) fpr.GetTempV();
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
MOVSS(fpr.VX(temp), R(XMM0));
fpr.StoreFromRegisterV(temp);
tempregs[a * 4 + b] = temp;
}
}
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
u8 temp = tempregs[a * 4 + b];
fpr.MapRegV(temp, 0);
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
}
}
} else {
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
MOVSS(XMM0, fpr.V(sregs[b * 4]));
MULSS(XMM0, fpr.V(tregs[a * 4]));
for (int c = 1; c < n; c++) {
MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
ADDSS(XMM0, R(XMM1));
}
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
}
}
}
fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vmscl(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);

// TODO: This op probably ignores prefixes?
if (js.HasUnknownPrefix())
DISABLE;

MatrixSize sz = GetMtxSize(op);
int n = GetMatrixSide(sz);

u8 sregs[16], dregs[16], scale;
GetMatrixRegs(sregs, sz, _VS);
GetVectorRegs(&scale, V_Single, _VT);
GetMatrixRegs(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(&scale, V_Single, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

// Move to XMM0 early, so we don't have to worry about overlap with scale.
MOVSS(XMM0, fpr.V(scale));

// TODO: test overlap, optimize.
u8 tempregs[16];
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
u8 temp = (u8) fpr.GetTempV();
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
MULSS(fpr.VX(temp), R(XMM0));
fpr.StoreFromRegisterV(temp);
tempregs[a * 4 + b] = temp;
}
}
for (int a = 0; a < n; a++) {
for (int b = 0; b < n; b++) {
u8 temp = tempregs[a * 4 + b];
fpr.MapRegV(temp, 0);
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
}
}

fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vtfm(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_MTX_VTFM);

// TODO: This probably ignores prefixes? Or maybe uses D?
if (js.HasUnknownPrefix())
DISABLE;

VectorSize sz = GetVecSize(op);
MatrixSize msz = GetMtxSize(op);
int n = GetNumVectorElements(sz);
int ins = (op >> 23) & 7;

bool homogenous = false;
if (n == ins) {
n++;
sz = (VectorSize)((int)(sz)+1);
msz = (MatrixSize)((int)(msz)+1);
homogenous = true;
}
// Otherwise, n should already be ins + 1.
else if (n != ins + 1) {
DISABLE;
}

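// In the homogenous case the source vector is treated as if it had a trailing 1.0, so the matrix
// entries that would be multiplied by that 1.0 are simply added instead of multiplied; that is what
// the `!homogenous || ... != n - 1` checks in both code paths below implement.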
if (jo.enableVFPUSIMD) {
u8 scols[4], dcol[4], tregs[4];

int vs = _VS;
int vd = _VD;
int vt = _VT; // vector!

// The T matrix we will address individually.
GetVectorRegs(dcol, sz, vd);
GetMatrixRows(vs, msz, scols);
GetVectorRegs(tregs, sz, vt);
for (int i = 0; i < n; i++) {
fpr.StoreFromRegisterV(tregs[i]);
}

// We need the T regs in individual regs, but they could overlap with S regs.
// If that happens, we copy the T reg to a temp.
auto flushConflictingTRegsToTemps = [&](u8 regs[4]) {
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
if (regs[i] != tregs[j]) {
continue;
}

// They match. Let's replace this treg with a temp reg.
// Note that it will spill if there's contention, unfortunately...
tregs[j] = fpr.GetTempV();
fpr.MapRegV(tregs[j], MAP_NOINIT);
MOVSS(fpr.VX(tregs[j]), fpr.V(regs[i]));
}
}
};

u8 scol[4][4];

// Map all of S's columns into registers.
for (int i = 0; i < n; i++) {
GetVectorRegs(scol[i], sz, scols[i]);
flushConflictingTRegsToTemps(scol[i]);
fpr.MapRegsVS(scol[i], sz, 0);
}

// Now, work our way through the matrix, loading things as we go.
// TODO: With more temp registers, can generate much more efficient code.
MOVSS(XMM1, fpr.V(tregs[0])); // TODO: AVX broadcastss to replace this and the SHUFPS (but take care of temps, unless we force store them.)
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, fpr.VS(scol[0]));
for (int j = 1; j < n; j++) {
if (!homogenous || j != n - 1) {
MOVSS(XMM0, fpr.V(tregs[j]));
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM0, fpr.VS(scol[j]));
ADDPS(XMM1, R(XMM0));
} else {
ADDPS(XMM1, fpr.VS(scol[j]));
}
}
// Map the D column. Release first in case of overlap.
for (int i = 0; i < n; i++) {
fpr.ReleaseSpillLockV(scol[i], sz);
}
fpr.MapRegsVS(dcol, sz, MAP_DIRTY | MAP_NOINIT);
MOVAPS(fpr.VS(dcol), XMM1);
fpr.ReleaseSpillLocks();
return;
}

u8 sregs[16], dregs[4], tregs[4];
GetMatrixRegs(sregs, msz, _VS);
GetVectorRegs(tregs, sz, _VT);
GetVectorRegs(dregs, sz, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, msz, 0);
fpr.SimpleRegsV(tregs, sz, 0);
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

// TODO: test overlap, optimize.
u8 tempregs[4];
for (int i = 0; i < n; i++) {
MOVSS(XMM0, fpr.V(sregs[i * 4]));
MULSS(XMM0, fpr.V(tregs[0]));
for (int k = 1; k < n; k++)
{
MOVSS(XMM1, fpr.V(sregs[i * 4 + k]));
if (!homogenous || k != n - 1)
MULSS(XMM1, fpr.V(tregs[k]));
ADDSS(XMM0, R(XMM1));
}

u8 temp = (u8) fpr.GetTempV();
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
MOVSS(fpr.VX(temp), R(XMM0));
fpr.StoreFromRegisterV(temp);
tempregs[i] = temp;
}
for (int i = 0; i < n; i++) {
u8 temp = tempregs[i];
fpr.MapRegV(temp, 0);
MOVSS(fpr.V(dregs[i]), fpr.VX(temp));
}

fpr.ReleaseSpillLocks();
}

void Jit::Comp_VCrs(MIPSOpcode op) {
DISABLE;
}

void Jit::Comp_VDet(MIPSOpcode op) {
DISABLE;
}

// The goal is to map (reversed byte order for clarity):
// 000000AA 000000BB 000000CC 000000DD -> AABBCCDD
alignas(16) static const s8 vi2xc_shuffle[16] = { 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
// 0000AAAA 0000BBBB 0000CCCC 0000DDDD -> AAAABBBB CCCCDDDD
alignas(16) static const s8 vi2xs_shuffle[16] = { 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 };

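// These instructions keep only the top 8 (vi2c/vi2uc) or top 16 (vi2s/vi2us) bits of each 32-bit
// lane; the PSHUFB masks above pick out exactly those bytes on SSSE3. The unsigned variants first
// clamp negative inputs to zero and shift the sign bit out, which Comp_Vi2x handles below.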
void Jit::Comp_Vi2x(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix())
DISABLE;

int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

// These instructions pack pairs or quads of integers into 32 bits.
// The unsigned (u) versions skip the sign bit when packing.

VectorSize sz = GetVecSize(op);
VectorSize outsize;
if (bits == 8) {
outsize = V_Single;
if (sz != V_Quad) {
DISABLE;
}
} else {
switch (sz) {
case V_Pair:
outsize = V_Single;
break;
case V_Quad:
outsize = V_Pair;
break;
default:
DISABLE;
}
}

u8 sregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, outsize, _VD);

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);

// First, let's assemble the sregs into lanes of a single xmm reg.
// For quad inputs, we need somewhere for the bottom regs. Ideally dregs[0].
X64Reg dst0 = XMM0;
if (sz == V_Quad) {
int vreg = dregs[0];
if (!IsOverlapSafeAllowS(dregs[0], 0, 4, sregs)) {
// Will be discarded on release.
vreg = fpr.GetTempV();
}
fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(vreg);
dst0 = fpr.VX(vreg);
} else {
// Pair, let's check if we should use dregs[0] directly. No temp needed.
int vreg = dregs[0];
if (IsOverlapSafeAllowS(dregs[0], 0, 2, sregs)) {
fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(vreg);
dst0 = fpr.VX(vreg);
}
}

if (!fpr.V(sregs[0]).IsSimpleReg(dst0)) {
MOVSS(dst0, fpr.V(sregs[0]));
}
MOVSS(XMM1, fpr.V(sregs[1]));
// With this, we have the lower half in dst0.
PUNPCKLDQ(dst0, R(XMM1));
if (sz == V_Quad) {
MOVSS(XMM0, fpr.V(sregs[2]));
MOVSS(XMM1, fpr.V(sregs[3]));
PUNPCKLDQ(XMM0, R(XMM1));
// Now we need to combine XMM0 into dst0.
PUNPCKLQDQ(dst0, R(XMM0));
} else {
// Otherwise, we need to zero out the top 2.
// We expect XMM1 to be zero below.
PXOR(XMM1, R(XMM1));
PUNPCKLQDQ(dst0, R(XMM1));
}

// For "u" type ops, we clamp to zero and shift off the sign bit first.
if (unsignedOp) {
if (cpu_info.bSSE4_1) {
if (sz == V_Quad) {
// Zeroed in the other case above.
PXOR(XMM1, R(XMM1));
}
PMAXSD(dst0, R(XMM1));
PSLLD(dst0, 1);
} else {
// Get a mask of the sign bit in dst0, then and in the values. This clamps to 0.
MOVDQA(XMM1, R(dst0));
PSRAD(dst0, 31);
PSLLD(XMM1, 1);
PANDN(dst0, R(XMM1));
}
}

// At this point, everything is aligned in the high bits of our lanes.
if (cpu_info.bSSSE3) {
if (RipAccessible(vi2xc_shuffle)) {
PSHUFB(dst0, bits == 8 ? M(vi2xc_shuffle) : M(vi2xs_shuffle)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), bits == 8 ? ImmPtr(vi2xc_shuffle) : ImmPtr(vi2xs_shuffle));
PSHUFB(dst0, MatR(TEMPREG));
}
} else {
// Let's *arithmetically* shift in the sign so we can use saturating packs.
PSRAD(dst0, 32 - bits);
// XMM1 used for the high part just so there's no dependency. It contains garbage or 0.
PACKSSDW(dst0, R(XMM1));
if (bits == 8) {
PACKSSWB(dst0, R(XMM1));
}
}

if (!fpr.V(dregs[0]).IsSimpleReg(dst0)) {
MOVSS(fpr.V(dregs[0]), dst0);
}
if (outsize == V_Pair) {
fpr.MapRegV(dregs[1], MAP_NOINIT | MAP_DIRTY);
MOVDQA(fpr.V(dregs[1]), dst0);
// Shift out the lower result to get the result we want.
PSRLDQ(fpr.VX(dregs[1]), 4);
}

ApplyPrefixD(dregs, outsize);
fpr.ReleaseSpillLocks();
}

alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };

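// vfad sums the source lanes; vavg does the same and then multiplies by vavg_table[n - 1], i.e. 1/n.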
void Jit::Comp_Vhoriz(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);

if (js.HasUnknownPrefix())
DISABLE;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

u8 sregs[4], dregs[1];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, V_Single, _VD);
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
if (cpu_info.bSSE4_1) {
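// DPPS note (standard SSE4.1 semantics): the high nibble of the immediate selects which lanes take
// part in the dot product and the low nibble selects where the result is written, so the 0x31, 0x71
// and 0xF1 below use 2, 3 or 4 lanes against oneOneOneOne and put the sum in lane 0.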
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
switch (sz) {
case V_Pair:
MOVAPS(XMM0, fpr.VS(sregs));
DPPS(XMM0, MatR(TEMPREG), 0x31);
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Triple:
MOVAPS(XMM0, fpr.VS(sregs));
DPPS(XMM0, MatR(TEMPREG), 0x71);
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Quad:
XORPS(XMM1, R(XMM1));
MOVAPS(XMM0, fpr.VS(sregs));
DPPS(XMM0, MatR(TEMPREG), 0xF1);
// In every other case, +0.0 is selected by the mask and added.
// But, here we need to manually add it to the result.
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
default:
DISABLE;
}
} else {
switch (sz) {
case V_Pair:
XORPS(XMM1, R(XMM1));
MOVAPS(XMM0, fpr.VS(sregs));
ADDPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Triple:
XORPS(XMM1, R(XMM1));
MOVAPS(XMM0, fpr.VS(sregs));
ADDPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
ADDPS(XMM0, R(XMM1));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 2));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Quad:
XORPS(XMM1, R(XMM1));
MOVAPS(XMM0, fpr.VS(sregs));
// This flips the sign of any -0.000.
ADDPS(XMM0, R(XMM1));
MOVHLPS(XMM1, XMM0);
ADDPS(XMM0, R(XMM1));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1, 1, 1, 1));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
default:
DISABLE;
}
}
if (((op >> 16) & 31) == 7) { // vavg
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));
MULSS(fpr.VSX(dregs), MatR(TEMPREG));
}
ApplyPrefixD(dregs, V_Single);
fpr.ReleaseSpillLocks();
return;
}

// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(dregs, V_Single, MAP_NOINIT | MAP_DIRTY);

X64Reg reg = XMM0;
if (IsOverlapSafe(dregs[0], 0, n, sregs)) {
fpr.MapRegV(dregs[0], dregs[0] == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
fpr.SpillLockV(dregs[0]);
reg = fpr.VX(dregs[0]);
}

// We have to start at +0.000 in case any values are -0.000.
XORPS(reg, R(reg));
for (int i = 0; i < n; ++i) {
ADDSS(reg, fpr.V(sregs[i]));
}

switch ((op >> 16) & 31) {
case 6: // vfad
break;
case 7: // vavg
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));
MULSS(reg, MatR(TEMPREG));
break;
}

if (reg == XMM0) {
MOVSS(fpr.V(dregs[0]), XMM0);
}

ApplyPrefixD(dregs, V_Single);
fpr.ReleaseSpillLocks();
}

void Jit::Comp_Viim(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);

if (js.HasUnknownPrefix())
DISABLE;

u8 dreg;
GetVectorRegs(&dreg, V_Single, _VT);

// Flush SIMD.
fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);

s32 imm = SignExtend16ToS32(op);
FP32 fp;
fp.f = (float)imm;
MOV(32, R(TEMPREG), Imm32(fp.u));
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
MOVD_xmm(fpr.VX(dreg), R(TEMPREG));

ApplyPrefixD(&dreg, V_Single);
fpr.ReleaseSpillLocks();
}

void Jit::Comp_Vfim(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_XFER);

if (js.HasUnknownPrefix())
DISABLE;

u8 dreg;
GetVectorRegs(&dreg, V_Single, _VT);

// Flush SIMD.
fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);

FP16 half;
half.u = op & 0xFFFF;
FP32 fval = half_to_float_fast5(half);
MOV(32, R(TEMPREG), Imm32(fval.u));
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
MOVD_xmm(fpr.VX(dreg), R(TEMPREG));

ApplyPrefixD(&dreg, V_Single);
fpr.ReleaseSpillLocks();
}

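// vrot background (as implemented below): imm & 3 picks the lane that receives cos, (imm >> 2) & 3
// the lane that receives sin (every lane gets sin when the two indices match), the remaining lanes
// are zeroed, and bit 4 of imm negates the sine.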
void Jit::CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin) {
char what[4] = { '0', '0', '0', '0' };
if (((imm >> 2) & 3) == (imm & 3)) {
for (int i = 0; i < 4; i++)
what[i] = 'S';
}
what[(imm >> 2) & 3] = 'S';
what[imm & 3] = 'C';

// TODO: shufps SIMD version

for (int i = 0; i < n; i++) {
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
switch (what[i]) {
case 'C': MOVSS(fpr.V(dregs[i]), XMM1); break;
case 'S':
MOVSS(fpr.V(dregs[i]), XMM0);
if (negSin) {
if (RipAccessible(&signBitLower)) {
XORPS(fpr.VX(dregs[i]), M(&signBitLower)); // rip accessible
} else {
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
XORPS(fpr.VX(dregs[i]), MatR(TEMPREG));
}
}
break;
case '0':
{
XORPS(fpr.VX(dregs[i]), fpr.V(dregs[i]));
break;
}
default:
ERROR_LOG(Log::JIT, "Bad what in vrot");
break;
}
}
}

// Very heavily used by FF:CC
void Jit::Comp_VRot(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix()) {
DISABLE;
}
if (!js.HasNoPrefix()) {
// Prefixes work strangely for this, see IRCompVFPU.
WARN_LOG_REPORT(Log::JIT, "vrot instruction using prefixes at %08x", GetCompilerPC());
DISABLE;
}

int vd = _VD;
int vs = _VS;

VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);

u8 dregs[4];
u8 dregs2[4];

MIPSOpcode nextOp = GetOffsetInstruction(1);
int vd2 = -1;
int imm2 = -1;
if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
// Pair of vrot with the same angle argument. Let's join them (can share sin/cos results).
vd2 = MIPS_GET_VD(nextOp);
imm2 = (nextOp >> 16) & 0x1f;
// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
}

u8 sreg;
GetVectorRegs(dregs, sz, vd);
if (vd2 >= 0)
GetVectorRegs(dregs2, sz, vd2);
GetVectorRegs(&sreg, V_Single, vs);

// Flush SIMD.
fpr.SimpleRegsV(&sreg, V_Single, 0);

int imm = (op >> 16) & 0x1f;

gpr.FlushBeforeCall();
fpr.Flush();

bool negSin1 = (imm & 0x10) ? true : false;

#if PPSSPP_ARCH(AMD64)
#ifdef _WIN32
LEA(64, RDX, MIPSSTATE_VAR(sincostemp));
#else
LEA(64, RDI, MIPSSTATE_VAR(sincostemp));
#endif
MOVSS(XMM0, fpr.V(sreg));
ABI_CallFunction(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos);
#else
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
ABI_CallFunctionAC(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos, fpr.V(sreg), (uintptr_t)mips_->sincostemp);
#endif

MOVSS(XMM0, MIPSSTATE_VAR(sincostemp[0]));
MOVSS(XMM1, MIPSSTATE_VAR(sincostemp[1]));

CompVrotShuffle(dregs, imm, n, false);
if (vd2 != -1) {
// If the negsin setting differs between the two joint invocations, we need to flip the second one.
bool negSin2 = (imm2 & 0x10) ? true : false;
CompVrotShuffle(dregs2, imm2, n, negSin1 != negSin2);
EatInstruction(nextOp);
}
fpr.ReleaseSpillLocks();
}

void Jit::Comp_ColorConv(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix())
DISABLE;

int vd = _VD;
int vs = _VS;

DISABLE;
#if 0
VectorSize sz = V_Quad;
int n = GetNumVectorElements(sz);

switch ((op >> 16) & 3) {
case 1:
break;
default:
DISABLE;
}

u8 sregs[4];
u8 dregs[1];
// WARNING: Prefixes.
GetVectorRegs(sregs, sz, vs);
GetVectorRegs(dregs, V_Pair, vd);

if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
switch ((op >> 16) & 3) {
case 1: // 4444
{
//int a = ((in >> 24) & 0xFF) >> 4;
//int b = ((in >> 16) & 0xFF) >> 4;
//int g = ((in >> 8) & 0xFF) >> 4;
//int r = ((in)& 0xFF) >> 4;
//col = (a << 12) | (b << 8) | (g << 4) | (r);
//PACKUSW
break;
}
case 2: // 5551
{
//int a = ((in >> 24) & 0xFF) >> 7;
//int b = ((in >> 16) & 0xFF) >> 3;
//int g = ((in >> 8) & 0xFF) >> 3;
//int r = ((in)& 0xFF) >> 3;
//col = (a << 15) | (b << 10) | (g << 5) | (r);
break;
}
case 3: // 565
{
//int b = ((in >> 16) & 0xFF) >> 3;
//int g = ((in >> 8) & 0xFF) >> 2;
//int r = ((in)& 0xFF) >> 3;
//col = (b << 11) | (g << 5) | (r);
break;
}
}
DISABLE;

// Flush SIMD.
fpr.SimpleRegsV(&sreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
fpr.SimpleRegsV(&dreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
#endif

}
}

#endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)