// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <cmath>

#include "Common/CPUDetect.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Math/math_util.h"
#include "Core/Compatibility.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/MIPSCodeUtils.h"
#include "Core/MIPS/IR/IRFrontend.h"
#include "Core/MIPS/IR/IRRegCache.h"
#include "Core/Reporting.h"
#include "Core/System.h"


// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non-working ones should have DISABLE.

// #define CONDITIONAL_DISABLE(flag) { Comp_Generic(op); return; }
#define CONDITIONAL_DISABLE(flag) if (opts.disableFlags & (uint32_t)JitDisable::flag) { Comp_Generic(op); return; }
#define DISABLE { Comp_Generic(op); return; }
#define INVALIDOP { Comp_Generic(op); return; }

#define _RS MIPS_GET_RS(op)
#define _RT MIPS_GET_RT(op)
#define _RD MIPS_GET_RD(op)
#define _FS MIPS_GET_FS(op)
#define _FT MIPS_GET_FT(op)
#define _FD MIPS_GET_FD(op)
#define _SA MIPS_GET_SA(op)
#define _POS ((op >> 6) & 0x1F)
#define _SIZE ((op >> 11) & 0x1F)
#define _IMM16 (signed short)(op & 0xFFFF)
#define _IMM26 (op & 0x03FFFFFF)

const int vfpuBase = 32; // skip the FP registers
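
// The IR register file is flat: the 32 FPU registers sit immediately before
// this base, so VFPU register N lands at vfpuBase + voffset[N]. voffset
// (defined alongside the other VFPU helpers) flattens the hardware's
// column-major VFPU register numbering into that space.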

namespace MIPSComp {
static void ApplyVoffset(u8 regs[4], int count) {
	for (int i = 0; i < count; i++) {
		regs[i] = vfpuBase + voffset[regs[i]];
	}
}

static bool IsConsecutive2(const u8 regs[2]) {
	return regs[1] == regs[0] + 1;
}

static bool IsConsecutive3(const u8 regs[3]) {
	return IsConsecutive2(regs) && regs[2] == regs[1] + 1;
}

static bool IsConsecutive4(const u8 regs[4]) {
	return IsConsecutive3(regs) && regs[3] == regs[2] + 1;
}

static bool IsVec2(VectorSize sz, const u8 regs[2]) {
	return sz == V_Pair && IsConsecutive2(regs) && (regs[0] & 1) == 0;
}

static bool IsVec4(VectorSize sz, const u8 regs[4]) {
	return sz == V_Quad && IsConsecutive4(regs) && (regs[0] & 3) == 0;
}

static bool IsVec3of4(VectorSize sz, const u8 regs[4]) {
	return sz == V_Triple && IsConsecutive3(regs) && (regs[0] & 3) == 0;
}

static bool IsMatrixVec4(MatrixSize sz, const u8 regs[16]) {
	if (sz != M_4x4)
		return false;
	if (!IsConsecutive4(&regs[0]) || (regs[0] & 3) != 0)
		return false;
	if (!IsConsecutive4(&regs[4]) || (regs[4] & 3) != 0)
		return false;
	if (!IsConsecutive4(&regs[8]) || (regs[8] & 3) != 0)
		return false;
	if (!IsConsecutive4(&regs[12]) || (regs[12] & 3) != 0)
		return false;
	return true;
}

// Vector regs can overlap in all sorts of swizzled ways.
// This does allow a single overlap: dreg may alias sregs[di].
static bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
	for (int i = 0; i < sn; ++i) {
		if (sregs[i] == dreg && i != di)
			return false;
	}
	for (int i = 0; i < tn; ++i) {
		if (tregs[i] == dreg)
			return false;
	}

	// Hurray, no overlap, we can write directly.
	return true;
}
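
// For example, with sregs = {4, 5, 6, 7}, dreg 5 is only safe when di == 1;
// any entry of tregs equal to dreg is always unsafe.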

static bool IsOverlapSafeAllowS(int dn, const u8 dregs[], int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {
	for (int i = 0; i < dn; ++i) {
		if (!IsOverlapSafeAllowS(dregs[i], i, sn, sregs, tn, tregs)) {
			return false;
		}
	}
	return true;
}

static bool IsOverlapSafe(int dreg, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {
	return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs);
}

static bool IsOverlapSafe(int dn, const u8 dregs[], int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {
	for (int i = 0; i < dn; ++i) {
		if (!IsOverlapSafe(dregs[i], sn, sregs, tn, tregs)) {
			return false;
		}
	}
	return true;
}

static bool IsPrefixWithinSize(u32 prefix, VectorSize sz) {
	int n = GetNumVectorElements(sz);
	for (int i = n; i < 4; i++) {
		int regnum = (prefix >> (i * 2)) & 3;
		int abs = (prefix >> (8 + i)) & 1;
		int negate = (prefix >> (16 + i)) & 1;
		int constants = (prefix >> (12 + i)) & 1;
		if (regnum >= n && !constants) {
			if (abs || negate || regnum != i)
				return false;
		}
	}

	return true;
}

static bool IsPrefixWithinSize(u32 prefix, MIPSOpcode op) {
	return IsPrefixWithinSize(prefix, GetVecSize(op));
}

void IRFrontend::Comp_VPFX(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	// This is how prefixes are typically set.
	int data = op & 0xFFFFF;
	int regnum = (op >> 24) & 3;
	switch (regnum) {
	case 0: // S
		js.prefixS = data;
		js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 1: // T
		js.prefixT = data;
		js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 2: // D
		js.prefixD = data & 0x00000FFF;
		js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	default:
		ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);
		break;
	}
}

static void InitRegs(u8 *vregs, int reg) {
	vregs[0] = reg;
	vregs[1] = reg + 1;
	vregs[2] = reg + 2;
	vregs[3] = reg + 3;
}
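
// Source prefix bit layout, as decoded in ApplyPrefixST below:
//   bits 0-7  : swizzle selector, two bits per lane (lane i reads bits 2i..2i+1)
//   bits 8-11 : abs flag per lane (doubles as the constant-select high bit)
//   bits 12-15: constant flag per lane
//   bits 16-19: negate flag per lane
// 0xE4 (binary 11'10'01'00) is the identity swizzle, hence the early-out.
// 0xF00E4 is identity plus all negate bits (a plain Vec4Neg), and 0x00FE4 is
// identity plus all abs bits (a plain Vec4Abs).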
void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) {
	if (prefix == 0xE4)
		return;

	int n = GetNumVectorElements(sz);
	u8 origV[4]{};
	static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };
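	// The constant index below is regnum + (abs << 2), so the abs bit selects
	// the second half of this table.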

	for (int i = 0; i < n; i++)
		origV[i] = vregs[i];

	// Some common vector prefixes
	if (IsVec4(sz, vregs)) {
		if (prefix == 0xF00E4) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Neg, vregs[0], origV[0]);
			return;
		}
		if (prefix == 0x00FE4) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Abs, vregs[0], origV[0]);
			return;
		}
		// Pure shuffle
		if (prefix == (prefix & 0xFF)) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix);
			return;
		}

		if ((prefix & 0x000FF000) == 0x0000F000) {
			// Handle some easy and common cases.
			Vec4Init init = Vec4Init::AllZERO;
			bool useInit;
			switch (prefix & 0xFFF) {
			case 0x00: useInit = true; init = Vec4Init::AllZERO; break;
			case 0x01: useInit = true; init = Vec4Init::Set_1000; break;
			case 0x04: useInit = true; init = Vec4Init::Set_0100; break;
			case 0x10: useInit = true; init = Vec4Init::Set_0010; break;
			case 0x40: useInit = true; init = Vec4Init::Set_0001; break;
			case 0x55: useInit = true; init = Vec4Init::AllONE; break;
			default: useInit = false; break;
			}

			if (useInit) {
				InitRegs(vregs, tempReg);
				ir.Write(IROp::Vec4Init, vregs[0], (int)init);
				return;
			}
		}

		// Check if we're just zeroing certain lanes - this is common.
		u32 zeroedLanes = 0;
		for (int i = 0; i < 4; ++i) {
			int regnum = (prefix >> (i * 2)) & 3;
			int abs = (prefix >> (8 + i)) & 1;
			int negate = (prefix >> (16 + i)) & 1;
			int constants = (prefix >> (12 + i)) & 1;

			if (!constants && regnum == i && !abs && !negate)
				continue;
			if (constants && regnum == 0 && abs == 0 && !negate) {
				zeroedLanes |= 1 << i;
				continue;
			}

			// Nope, it has something else going on.
			zeroedLanes = -1;
			break;
		}

		if (zeroedLanes != -1) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Init, vregs[0], (int)Vec4Init::AllZERO);
			ir.Write(IROp::Vec4Blend, vregs[0], origV[0], vregs[0], zeroedLanes);
			return;
		}
	}

	// Alright, fall back to the generic approach.
	for (int i = 0; i < n; i++) {
		int regnum = (prefix >> (i * 2)) & 3;
		int abs = (prefix >> (8 + i)) & 1;
		int negate = (prefix >> (16 + i)) & 1;
		int constants = (prefix >> (12 + i)) & 1;

		// Unchanged, hurray.
		if (!constants && regnum == i && !abs && !negate)
			continue;

		// This puts the value into a temp reg, so we won't write the modified value back.
		vregs[i] = tempReg + i;
		if (!constants) {
			if (regnum >= n) {
				// Depends on the op, but often zero.
				ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(0.0f));
			} else if (abs) {
				ir.Write(IROp::FAbs, vregs[i], origV[regnum]);
				if (negate)
					ir.Write(IROp::FNeg, vregs[i], vregs[i]);
			} else {
				if (negate)
					ir.Write(IROp::FNeg, vregs[i], origV[regnum]);
				else if (vregs[i] != origV[regnum])
					ir.Write(IROp::FMov, vregs[i], origV[regnum]);
			}
		} else {
			if (negate) {
				ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(-constantArray[regnum + (abs << 2)]));
			} else {
				ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(constantArray[regnum + (abs << 2)]));
			}
		}
	}
}

void IRFrontend::GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
	::GetVectorRegs(regs, N, vectorReg);
	ApplyVoffset(regs, N);
}

void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {
	::GetMatrixRegs(regs, N, matrixReg);
	for (int i = 0; i < GetMatrixSide(N); i++) {
		ApplyVoffset(regs + 4 * i, GetVectorSize(N));
	}
}

void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixSFlag & JitState::PREFIX_KNOWN);
	GetVectorRegs(regs, sz, vectorReg);
	ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S);
}
void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixTFlag & JitState::PREFIX_KNOWN);
	GetVectorRegs(regs, sz, vectorReg);
	ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T);
}

void IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);

	GetVectorRegs(regs, sz, vectorReg);
	int n = GetNumVectorElements(sz);
	if (js.prefixD == 0)
		return;

	if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {
		// Use temps for all, we'll blend in the end (keeping in Vec4.)
		for (int i = 0; i < 4; ++i)
			regs[i] = IRVTEMP_PFX_D + i;
		return;
	}

	for (int i = 0; i < n; i++) {
		// Hopefully this is rare, we'll just write it into a dumping ground reg.
		if (js.VfpuWriteMask(i))
			regs[i] = IRVTEMP_PFX_D + i;
	}
}

inline int GetDSat(int prefix, int i) {
	return (prefix >> (i * 2)) & 3;
}
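
// Destination ("D") prefix layout - 12 bits, masked off in Comp_VPFX above:
//   bits 0-7 : saturation mode, two bits per lane (1 clamps to [0, 1],
//              3 clamps to [-1, 1], 0 does nothing), read by GetDSat.
//   bits 8-11: per-lane write mask, read via js.VfpuWriteMask(i).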

// "D" prefix is really a post process. No need to allocate a temporary register
// (except dummies to simulate the writemask, which is done in GetVectorRegsPrefixD).
void IRFrontend::ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
	if (!js.prefixD)
		return;

	ApplyPrefixDMask(vregs, sz, vectorReg);

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		if (js.VfpuWriteMask(i))
			continue;
		int sat = GetDSat(js.prefixD, i);
		if (sat == 1) {
			// Clamp to [0.0, 1.0].
			ir.Write(IROp::FSat0_1, vregs[i], vregs[i]);
		} else if (sat == 3) {
			ir.Write(IROp::FSatMinus1_1, vregs[i], vregs[i]);
		}
	}
}

void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) {
	if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {
		u8 origV[4];
		GetVectorRegs(origV, sz, vectorReg);

		// Just keep the original values where it was masked.
		ir.Write(IROp::Vec4Blend, origV[0], vregs[0], origV[0], js.VfpuWriteMask());

		// So that saturate works, change it back.
		for (int i = 0; i < 4; ++i)
			vregs[i] = origV[i];
	}
}
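
// A note on Vec4Blend as used throughout this file: the mask constant selects
// the third operand's lane where a bit is set, and the second operand's lane
// otherwise - so above, write-masked lanes keep their original origV values.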

void IRFrontend::Comp_SV(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	s32 offset = (signed short)(op & 0xFFFC);
	int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
	MIPSGPReg rs = _RS;

	CheckMemoryBreakpoint(rs, offset);

	switch (op >> 26) {
	case 50: //lv.s
		ir.Write(IROp::LoadFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));
		break;

	case 58: //sv.s
		ir.Write(IROp::StoreFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));
		break;

	default:
		INVALIDOP;
	}
}

void IRFrontend::Comp_SVQ(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	int imm = (signed short)(op & 0xFFFC);
	int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5);
	MIPSGPReg rs = _RS;

	u8 vregs[4];
	GetVectorRegs(vregs, V_Quad, vt);

	CheckMemoryBreakpoint(rs, imm);

	enum class LSVType {
		INVALID,
		LVQ,
		SVQ,
		LVLQ,
		LVRQ,
		SVLQ,
		SVRQ,
	};

	LSVType optype = LSVType::INVALID;
	switch (op >> 26) {
	case 54: optype = LSVType::LVQ; break; // lv.q
	case 62: optype = LSVType::SVQ; break; // sv.q
	case 53: // lvl/lvr.q - highly unusual
		optype = (op & 2) == 0 ? LSVType::LVLQ : LSVType::LVRQ;
		break;
	case 61: // svl/svr.q - highly unusual
		optype = (op & 2) == 0 ? LSVType::SVLQ : LSVType::SVRQ;
		break;
	}
	if (optype == LSVType::INVALID)
		INVALIDOP;

	if ((optype == LSVType::LVRQ || optype == LSVType::SVRQ) && opts.unalignedLoadStoreVec4) {
		// We don't have a dedicated op for these, but we can fuse the unaligned
		// load/store pairs that games actually emit.
		MIPSOpcode nextOp = GetOffsetInstruction(1);
		if ((nextOp.encoding ^ op.encoding) == 0x0000000E) {
			// Okay, it's an svr.q/svl.q pair, same registers. Treat as lv.q/sv.q.
			EatInstruction(nextOp);
			optype = optype == LSVType::LVRQ ? LSVType::LVQ : LSVType::SVQ;
		}
	}
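	// The XOR check works because a matching pair differs only in the l/r
	// select bit (bit 1) and by 12 in the offset field (bits 2-3), so the
	// two encodings XOR to exactly 0x0000000E.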

	switch (optype) {
	case LSVType::LVQ:
		if (IsVec4(V_Quad, vregs)) {
			ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));
		} else {
			// Let's not even bother with "vertical" loads for now.
			if (!g_Config.bFastMemory)
				ir.Write(IROp::ValidateAddress128, 0, (u8)rs, 0, (u32)imm);
			ir.Write(IROp::LoadFloat, vregs[0], rs, ir.AddConstant(imm));
			ir.Write(IROp::LoadFloat, vregs[1], rs, ir.AddConstant(imm + 4));
			ir.Write(IROp::LoadFloat, vregs[2], rs, ir.AddConstant(imm + 8));
			ir.Write(IROp::LoadFloat, vregs[3], rs, ir.AddConstant(imm + 12));
		}
		break;

	case LSVType::SVQ:
		if (IsVec4(V_Quad, vregs)) {
			ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));
		} else {
			// Let's not even bother with "vertical" stores for now.
			if (!g_Config.bFastMemory)
				ir.Write(IROp::ValidateAddress128, 0, (u8)rs, 1, (u32)imm);
			ir.Write(IROp::StoreFloat, vregs[0], rs, ir.AddConstant(imm));
			ir.Write(IROp::StoreFloat, vregs[1], rs, ir.AddConstant(imm + 4));
			ir.Write(IROp::StoreFloat, vregs[2], rs, ir.AddConstant(imm + 8));
			ir.Write(IROp::StoreFloat, vregs[3], rs, ir.AddConstant(imm + 12));
		}
		break;

	case LSVType::LVLQ:
	case LSVType::LVRQ:
	case LSVType::SVLQ:
	case LSVType::SVRQ:
		// These are pretty uncommon unless paired.
		DISABLE;
		break;

	default:
		INVALIDOP;
	}
}

void IRFrontend::Comp_VVectorInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix() || js.HasSPrefix()) {
		DISABLE;
	}

	// Vector init
	// d[N] = CONST[N]
	// Note: probably implemented as vmov with prefix hack.

	VectorSize sz = GetVecSize(op);
	int type = (op >> 16) & 0xF;
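	// type 6 = vzero, type 7 = vone, matching the vmzero/vmone cases in Comp_VMatrixInit.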
	int vd = _VD;
	int n = GetNumVectorElements(sz);
	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);

	if (IsVec4(sz, dregs)) {
		ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE));
	} else {
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));
		}
	}
	ApplyPrefixD(dregs, sz, vd);
}

void IRFrontend::Comp_VIdt(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix() || js.HasSPrefix()) {
		DISABLE;
	}

	// Vector identity row
	// d[N] = IDENTITY[N,m]
	// Note: probably implemented as vmov with prefix hack.

	int vd = _VD;
	VectorSize sz = GetVecSize(op);
	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);

	if (IsVec4(sz, dregs)) {
		int row = vd & 3;
		Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row);
		ir.Write(IROp::Vec4Init, dregs[0], (int)init);
	} else {
		switch (sz) {
		case V_Pair:
			ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 1) == 0 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 1) == 1 ? 1.0f : 0.0f));
			break;
		case V_Quad:
			ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 3) == 0 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 3) == 1 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[2], ir.AddConstantFloat((vd & 3) == 2 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[3], ir.AddConstantFloat((vd & 3) == 3 ? 1.0f : 0.0f));
			break;
		default:
			INVALIDOP;
		}
	}

	ApplyPrefixD(dregs, sz, vd);
}

void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	MatrixSize sz = GetMtxSize(op);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix init (weird prefixes)
	// d[N,M] = CONST[N,M]

	int vd = _VD;
	if (IsMatrixTransposed(vd)) {
		// All outputs are transpositionally symmetric, so should be fine.
		vd = TransposeMatrixReg(vd);
	}

	if (sz != M_4x4) {
		// 3x3 is decently common. It expands a lot, but let's set each.
		u8 dregs[16];
		GetMatrixRegs(dregs, sz, vd);

		// TODO: It might be worth using Vec4Blend for 3x3 to mask w.
		int n = GetMatrixSide(sz);
		for (int y = 0; y < n; ++y) {
			for (int x = 0; x < n; ++x) {
				switch ((op >> 16) & 0xF) {
				case 3: // vmidt
					if (x == 0 && y == 0)
						ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(1.0f));
					else if (x == y)
						ir.Write(IROp::FMov, dregs[y * 4 + x], dregs[0]);
					else
						ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(0.0f));
					break;
				case 6: // vmzero
					// Likely to be fast.
					ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(0.0f));
					break;
				case 7: // vmone
					if (x == 0 && y == 0)
						ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(1.0f));
					else
						ir.Write(IROp::FMov, dregs[y * 4 + x], dregs[0]);
					break;
				default:
					INVALIDOP;
				}
			}
		}
		return;
	}

	// Not really about trying here, it will work if enabled.
	VectorSize vsz = GetVectorSize(sz);
	u8 vecs[4];
	GetMatrixColumns(vd, sz, vecs);
	for (int i = 0; i < 4; i++) {
		u8 vec[4];
		GetVectorRegs(vec, vsz, vecs[i]);
		// As they are columns, they will be nicely consecutive.
		Vec4Init init;
		switch ((op >> 16) & 0xF) {
		case 3:
			init = Vec4Init((int)Vec4Init::Set_1000 + i);
			break;
		case 6:
			init = Vec4Init::AllZERO;
			break;
		case 7:
			init = Vec4Init::AllONE;
			break;
		default:
			INVALIDOP;
			return;
		}
		ir.Write(IROp::Vec4Init, vec[0], (int)init);
	}
}

void IRFrontend::Comp_VHdp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix() || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector homogeneous dot product
	// d[0] = s[0 .. n-2] dot t[0 .. n-2] + t[n-1]
	// Note: s[n-1] is ignored / treated as 1 via prefix override.

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	if (js.prefixS & (0x0101 << (8 + n - 1)))
		DISABLE;

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]);

	for (int i = 1; i < n; i++) {
		if (i == n - 1) {
			ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, tregs[i]);
		} else {
			ir.Write(IROp::FMul, IRVTEMP_0 + 1, sregs[i], tregs[i]);
			ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, IRVTEMP_0 + 1);
		}
	}

	ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
	ApplyPrefixD(dregs, V_Single, vd);
}

alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
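// Reciprocals of the element count, indexed by n - 1; vavg multiplies the
// horizontal sum by vavg_table[n - 1] below.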

void IRFrontend::Comp_Vhoriz(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector horizontal add
	// d[0] = s[0] + ... s[n-1]
	// Vector horizontal average
	// d[0] = s[0] / n + ... s[n-1] / n
	// Note: Both are implemented as dot products against generated constants.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, V_Single, _VD);

	// We have to start at +0.000 in case any values are -0.000.
	ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(0.0f));
	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, sregs[i]);
	}

	switch ((op >> 16) & 31) {
	case 6: // vfad
		ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
		break;
	case 7: // vavg
		ir.Write(IROp::SetConstF, IRVTEMP_0 + 1, ir.AddConstantFloat(vavg_table[n - 1]));
		ir.Write(IROp::FMul, dregs[0], IRVTEMP_0, IRVTEMP_0 + 1);
		break;
	}

	ApplyPrefixD(dregs, V_Single, _VD);
}

void IRFrontend::Comp_VDot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector dot product
	// d[0] = s[0 .. n-1] dot t[0 .. n-1]

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	if (IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
		if (IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {
			ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);
		} else {
			ir.Write(IROp::Vec4Dot, IRVTEMP_0, sregs[0], tregs[0]);
			ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
		}
		ApplyPrefixD(dregs, V_Single, vd);
		return;
	} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4Dot) {
		// Note: this is often worse than separate multiplies and adds on x86.
		if (IsOverlapSafe(dregs[0], n, tregs) || sregs[0] == tregs[0]) {
			// Nice example of this in Fat Princess (US) in block 088181A0 (hot.)
			// Create a temporary copy of S with the last element zeroed.
			ir.Write(IROp::Vec4Init, IRVTEMP_0, (int)Vec4Init::AllZERO);
			ir.Write(IROp::Vec4Blend, IRVTEMP_0, IRVTEMP_0, sregs[0], 0x7);
			// Now we can just dot like normal, with the last element effectively masked.
			ir.Write(IROp::Vec4Dot, dregs[0], IRVTEMP_0, sregs[0] == tregs[0] ? IRVTEMP_0 : tregs[0]);
			ApplyPrefixD(dregs, V_Single, vd);
			return;
		}
	}

	int temp0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]);
	for (int i = 1; i < n; i++) {
		ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]);
		ir.Write(IROp::FAdd, i == (n - 1) ? dregs[0] : temp0, temp0, temp1);
	}
	ApplyPrefixD(dregs, V_Single, vd);
}

void IRFrontend::Comp_VecDo3(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector arithmetic
	// d[N] = OP(s[N], t[N]) (see below)

	enum class VecDo3Op : uint8_t {
		INVALID,
		VADD,
		VSUB,
		VDIV,
		VMUL,
		VMIN,
		VMAX,
		VSGE,
		VSLT,
	};
	VecDo3Op type = VecDo3Op::INVALID;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// Check that we can support the ops, and prepare temporary values for ops that need it.
	switch (op >> 26) {
	case 24: //VFPU0
		switch ((op >> 23) & 7) {
		case 0: type = VecDo3Op::VADD; break;
		case 1: type = VecDo3Op::VSUB; break;
		case 7: type = VecDo3Op::VDIV; break;
		default: INVALIDOP;
		}
		break;
	case 25: //VFPU1
		switch ((op >> 23) & 7) {
		case 0: type = VecDo3Op::VMUL; break;
		default: INVALIDOP;
		}
		break;
	case 27: //VFPU3
		switch ((op >> 23) & 7) {
		case 2: type = VecDo3Op::VMIN; break;
		case 3: type = VecDo3Op::VMAX; break;
		case 6: type = VecDo3Op::VSGE; break;
		case 7: type = VecDo3Op::VSLT; break;
		default: INVALIDOP;
		}
		break;
	default: INVALIDOP;
	}
	_assert_(type != VecDo3Op::INVALID);

	bool allowSIMD = true;
	switch (type) {
	case VecDo3Op::VADD:
	case VecDo3Op::VSUB:
	case VecDo3Op::VMUL:
		break;
	case VecDo3Op::VDIV:
		if (js.HasUnknownPrefix() || (sz != V_Single && !js.HasNoPrefix()))
			DISABLE;
		// If it's single, we just need to check the prefixes are within the size.
		if (!IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op))
			DISABLE;
		break;
	case VecDo3Op::VMIN:
	case VecDo3Op::VMAX:
	case VecDo3Op::VSGE:
	case VecDo3Op::VSLT:
		allowSIMD = false;
		break;
	case VecDo3Op::INVALID: // Can't happen, but to avoid compiler warnings
		break;
	}

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; i++) {
		if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// If all three are consecutive vec4s, we're safe regardless of whether we use temps, so we don't check that here.
	if (allowSIMD) {
		IROp opFunc = IROp::Nop;
		switch (type) {
		case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd
			opFunc = IROp::Vec4Add;
			break;
		case VecDo3Op::VSUB: // d[i] = s[i] - t[i]; break; //vsub
			opFunc = IROp::Vec4Sub;
			break;
		case VecDo3Op::VDIV: // d[i] = s[i] / t[i]; break; //vdiv
			opFunc = IROp::Vec4Div;
			break;
		case VecDo3Op::VMUL: // d[i] = s[i] * t[i]; break; //vmul
			opFunc = IROp::Vec4Mul;
			break;
		default:
			// Leave it Nop, disabled below.
			break;
		}

		if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
			if (opFunc != IROp::Nop) {
				ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
			} else {
				DISABLE;
			}
			ApplyPrefixD(dregs, sz, _VD);
			return;
		} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
			// This is actually pretty common. Use a temp + blend.
			// We could post-process this, but it's easier to do it here.
			if (opFunc == IROp::Nop)
				DISABLE;
			ir.Write(opFunc, IRVTEMP_0, sregs[0], tregs[0]);
			ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
			ApplyPrefixD(dregs, sz, _VD);
			return;
		}
	}

	if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {
		// TODO: Consider a dedicated op? For now, we abuse FpCond a bit.
		ir.Write(IROp::FpCondToReg, IRTEMP_0);
	}

	for (int i = 0; i < n; ++i) {
		switch (type) {
		case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd
			ir.Write(IROp::FAdd, tempregs[i], sregs[i], tregs[i]);
			break;
		case VecDo3Op::VSUB: // d[i] = s[i] - t[i]; break; //vsub
			ir.Write(IROp::FSub, tempregs[i], sregs[i], tregs[i]);
			break;
		case VecDo3Op::VDIV: // d[i] = s[i] / t[i]; break; //vdiv
			ir.Write(IROp::FDiv, tempregs[i], sregs[i], tregs[i]);
			break;
		case VecDo3Op::VMUL: // d[i] = s[i] * t[i]; break; //vmul
			ir.Write(IROp::FMul, tempregs[i], sregs[i], tregs[i]);
			break;
		case VecDo3Op::VMIN: // vmin
			ir.Write(IROp::FMin, tempregs[i], sregs[i], tregs[i]);
			break;
		case VecDo3Op::VMAX: // vmax
			ir.Write(IROp::FMax, tempregs[i], sregs[i], tregs[i]);
			break;
		case VecDo3Op::VSGE: // vsge
			ir.Write(IROp::FCmp, (int)IRFpCompareMode::LessUnordered, sregs[i], tregs[i]);
			ir.Write(IROp::FpCondToReg, IRTEMP_1);
			ir.Write(IROp::XorConst, IRTEMP_1, IRTEMP_1, ir.AddConstant(1));
			ir.Write(IROp::FMovFromGPR, tempregs[i], IRTEMP_1);
			ir.Write(IROp::FCvtSW, tempregs[i], tempregs[i]);
			break;
		case VecDo3Op::VSLT: // vslt
			ir.Write(IROp::FCmp, (int)IRFpCompareMode::LessOrdered, sregs[i], tregs[i]);
			ir.Write(IROp::FpCondToReg, IRTEMP_1);
			ir.Write(IROp::FMovFromGPR, tempregs[i], IRTEMP_1);
			ir.Write(IROp::FCvtSW, tempregs[i], tempregs[i]);
			break;
		case VecDo3Op::INVALID: // Can't happen, but to avoid compiler warnings
			break;
		}
	}

	if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {
		ir.Write(IROp::FpCondFromReg, IRTEMP_0);
	}

	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, _VD);
}

void IRFrontend::Comp_VV2Op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int optype = (op >> 16) & 0x1f;
	if (optype == 0) {
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op))
			DISABLE;
	} else if (optype == 1 || optype == 2) {
		// D prefix is fine for these, and used sometimes.
		if (js.HasUnknownPrefix() || js.HasSPrefix())
			DISABLE;
	} else if (optype == 5 && js.HasDPrefix()) {
		DISABLE;
	}

	// Vector unary operation
	// d[N] = OP(s[N]) (see below)

	int vs = _VS;
	int vd = _VD;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	if (optype >= 16 && !js.HasNoPrefix()) {
		// Many of these apply the D prefix strangely or override parts of the S prefix.
		if (js.HasUnknownPrefix() || sz != V_Single)
			DISABLE;
		// If it's single, we just need to check the prefixes are within the size.
		if (!IsPrefixWithinSize(js.prefixS, op))
			DISABLE;
		// The negative ones seem to use negate flags as a prefix hack.
		if (optype >= 24 && (js.prefixS & 0x000F0000) != 0)
			DISABLE;
	}

	// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
	if (optype == 0 && vs == vd && js.HasNoPrefix()) {
		return;
	}

	u8 sregs[4]{}, dregs[4]{};
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixD(dregs, sz, vd);

	bool usingTemps = false;
	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			usingTemps = true;
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	bool canSIMD = false;
	// Some can be SIMD'd.
	switch (optype) {
	case 0: // vmov
	case 1: // vabs
	case 2: // vneg
		canSIMD = true;
		break;
	}

	if (canSIMD && !usingTemps) {
		IROp irop = IROp::Nop;
		switch (optype) {
		case 0: // vmov
			irop = IROp::Vec4Mov;
			break;
		case 1: // vabs
			irop = IROp::Vec4Abs;
			break;
		case 2: // vneg
			irop = IROp::Vec4Neg;
			break;
		}
		if (IsVec4(sz, sregs) && IsVec4(sz, dregs) && irop != IROp::Nop) {
			ir.Write(irop, dregs[0], sregs[0]);
			ApplyPrefixD(dregs, sz, vd);
			return;
		} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && irop != IROp::Nop && opts.preferVec4) {
			// This is a simple case of vmov.t, just blend.
			if (irop == IROp::Vec4Mov) {
				ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], sregs[0], 0x7);
			} else {
				ir.Write(irop, IRVTEMP_0, sregs[0]);
				ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
			}
			ApplyPrefixD(dregs, sz, vd);
			return;
		}
	}

	for (int i = 0; i < n; ++i) {
		switch (optype) {
		case 0: // d[i] = s[i]; break; //vmov
			// Probably for swizzle.
			if (tempregs[i] != sregs[i])
				ir.Write(IROp::FMov, tempregs[i], sregs[i]);
			break;
		case 1: // d[i] = fabsf(s[i]); break; //vabs
			ir.Write(IROp::FAbs, tempregs[i], sregs[i]);
			break;
		case 2: // d[i] = -s[i]; break; //vneg
			ir.Write(IROp::FNeg, tempregs[i], sregs[i]);
			break;
		case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
			ir.Write(IROp::FSat0_1, tempregs[i], sregs[i]);
			break;
		case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
			ir.Write(IROp::FSatMinus1_1, tempregs[i], sregs[i]);
			break;
		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
			ir.Write(IROp::FRecip, tempregs[i], sregs[i]);
			break;
		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
			ir.Write(IROp::FRSqrt, tempregs[i], sregs[i]);
			break;
		case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
			ir.Write(IROp::FSin, tempregs[i], sregs[i]);
			break;
		case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
			ir.Write(IROp::FCos, tempregs[i], sregs[i]);
			break;
		case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
			DISABLE;
			break;
		case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
			DISABLE;
			break;
		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
			ir.Write(IROp::FSqrt, tempregs[i], sregs[i]);
			break;
		case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
			ir.Write(IROp::FAsin, tempregs[i], sregs[i]);
			break;
		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
			ir.Write(IROp::FRecip, tempregs[i], sregs[i]);
			ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);
			break;
		case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
			ir.Write(IROp::FSin, tempregs[i], sregs[i]);
			ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);
			break;
		case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
			DISABLE;
			break;
		default:
			INVALIDOP;
		}
	}
	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, vd);
}

void IRFrontend::Comp_Vi2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {
		DISABLE;
	}

	// Vector integer to float
	// d[N] = float(S[N]) * mult

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	uint8_t imm = (op >> 16) & 0x1f;

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	for (int i = 0; i < n; i++) {
		if (imm == 0)
			ir.Write(IROp::FCvtSW, dregs[i], sregs[i]);
		else
			ir.Write(IROp::FCvtScaledSW, dregs[i], sregs[i], imm);
	}
	ApplyPrefixD(dregs, sz, _VD);
}

void IRFrontend::Comp_Vh2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {
		DISABLE;
	}

	// Vector expand half to float
	// d[N*2] = float(lowerhalf(s[N])), d[N*2+1] = float(upperhalf(s[N]))

	DISABLE;
}

void IRFrontend::Comp_Vf2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {
		DISABLE;
	}

	// Vector float to integer
	// d[N] = int(S[N] * mult)
	// Note: saturates on overflow.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	uint8_t imm = (op >> 16) & 0x1f;

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Same values as FCR31.
	uint8_t rmode = (op >> 21) & 3;
	if (((op >> 21) & 0x1C) != 0x10)
		INVALIDOP;

	if (imm != 0) {
		for (int i = 0; i < n; i++)
			ir.Write(IROp::FCvtScaledWS, dregs[i], sregs[i], imm | (rmode << 6));
	} else {
		for (int i = 0; i < n; i++) {
			switch (IRRoundMode(rmode)) {
			case IRRoundMode::RINT_0: // vf2in
				ir.Write(IROp::FRound, dregs[i], sregs[i]);
				break;

			case IRRoundMode::CAST_1: // vf2iz
				ir.Write(IROp::FTrunc, dregs[i], sregs[i]);
				break;

			case IRRoundMode::CEIL_2: // vf2iu
				ir.Write(IROp::FCeil, dregs[i], sregs[i]);
				break;

			case IRRoundMode::FLOOR_3: // vf2id
				ir.Write(IROp::FFloor, dregs[i], sregs[i]);
				break;

			default:
				INVALIDOP;
			}
		}
	}

	ApplyPrefixDMask(dregs, sz, _VD);
}

void IRFrontend::Comp_Mftv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	// Vector move from VFPU / from VFPU ctrl (no prefixes)
	// gpr = S
	// gpr = VFPU_CTRL[i]

	int imm = op & 0xFF;
	MIPSGPReg rt = _RT;
	switch ((op >> 21) & 0x1f) {
	case 3: //mfv / mfvc
		// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
		if (rt != MIPS_REG_ZERO) {
			if (imm < 128) { //R(rt) = VI(imm);
				ir.Write(IROp::FMovToGPR, rt, vfpuBase + voffset[imm]);
			} else {
				switch (imm - 128) {
				case VFPU_CTRL_DPREFIX:
				case VFPU_CTRL_SPREFIX:
				case VFPU_CTRL_TPREFIX:
					FlushPrefixV();
					break;
				}
				if (imm - 128 < VFPU_CTRL_MAX) {
					ir.Write(IROp::VfpuCtrlToReg, rt, imm - 128);
				} else {
					INVALIDOP;
				}
			}
		}
		break;

	case 7: // mtv
		if (imm < 128) {
			ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[imm], rt);
		} else if ((imm - 128) < VFPU_CTRL_MAX) {
			u32 mask;
			if (GetVFPUCtrlMask(imm - 128, &mask)) {
				if (mask != 0xFFFFFFFF) {
					ir.Write(IROp::AndConst, IRTEMP_0, rt, ir.AddConstant(mask));
					ir.Write(IROp::SetCtrlVFPUReg, imm - 128, IRTEMP_0);
				} else {
					ir.Write(IROp::SetCtrlVFPUReg, imm - 128, rt);
				}
			}

			if (imm - 128 == VFPU_CTRL_SPREFIX) {
				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			}
		} else {
			INVALIDOP;
		}
		break;

	default:
		INVALIDOP;
	}
}

void IRFrontend::Comp_Vmfvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	// Vector Move from vector control reg (no prefixes)
	// D[0] = VFPU_CTRL[i]

	int vd = _VD;
	int imm = (op >> 8) & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, imm);
		ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[vd], IRTEMP_0);
	} else {
		INVALIDOP;
	}
}

void IRFrontend::Comp_Vmtvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	// Vector Move to vector control reg (no prefixes)
	// VFPU_CTRL[i] = S[0]

	int vs = _VS;
	int imm = op & 0xFF;
	if (imm < VFPU_CTRL_MAX) {
		u32 mask;
		if (GetVFPUCtrlMask(imm, &mask)) {
			if (mask != 0xFFFFFFFF) {
				ir.Write(IROp::FMovToGPR, IRTEMP_0, vfpuBase + voffset[vs]);
				ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(mask));
				ir.Write(IROp::SetCtrlVFPUReg, imm, IRTEMP_0);
			} else {
				ir.Write(IROp::SetCtrlVFPUFReg, imm, vfpuBase + voffset[vs]);
			}
		}
		if (imm == VFPU_CTRL_SPREFIX) {
			js.prefixSFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_TPREFIX) {
			js.prefixTFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_DPREFIX) {
			js.prefixDFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		}
	} else {
		INVALIDOP;
	}
}

void IRFrontend::Comp_Vmmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix move (weird prefixes)
	// D[N,M] = S[N,M]

	int vs = _VS;
	int vd = _VD;
	// This probably ignores prefixes for all sane intents and purposes.
	if (vs == vd) {
		// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
		return;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);

	switch (GetMatrixOverlap(vs, vd, sz)) {
	case OVERLAP_EQUAL:
		// In-place transpose
		DISABLE;
	case OVERLAP_PARTIAL:
		DISABLE;
	case OVERLAP_NONE:
	default:
		break;
	}
	if (IsMatrixTransposed(vd) == IsMatrixTransposed(vs) && sz == M_4x4) {
		// Untranspose both matrices
		if (IsMatrixTransposed(vd)) {
			vd = TransposeMatrixReg(vd);
			vs = TransposeMatrixReg(vs);
		}
		// Get the columns
		u8 scols[4], dcols[4];
		GetMatrixColumns(vs, sz, scols);
		GetMatrixColumns(vd, sz, dcols);
		for (int i = 0; i < 4; i++) {
			u8 svec[4], dvec[4];
			GetVectorRegs(svec, GetVectorSize(sz), scols[i]);
			GetVectorRegs(dvec, GetVectorSize(sz), dcols[i]);
			ir.Write(IROp::Vec4Mov, dvec[0], svec[0]);
		}
		return;
	}
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			if (dregs[a * 4 + b] != sregs[a * 4 + b])
				ir.Write(IROp::FMov, dregs[a * 4 + b], sregs[a * 4 + b]);
		}
	}
}

void IRFrontend::Comp_Vmscl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix scale, matrix by scalar (weird prefixes)
	// d[N,M] = s[N,M] * t[0]
	// Note: behaves just slightly differently than a series of vscls.

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;

	MatrixSize sz = GetMtxSize(op);
	if (sz != M_4x4) {
		DISABLE;
	}
	if (GetMtx(vt) == GetMtx(vd)) {
		DISABLE;
	}
	int n = GetMatrixSide(sz);

	// The entire matrix is scaled equally, so transpose doesn't matter. Let's normalize.
	if (IsMatrixTransposed(vs) && IsMatrixTransposed(vd)) {
		vs = TransposeMatrixReg(vs);
		vd = TransposeMatrixReg(vd);
	}
	if (IsMatrixTransposed(vs) || IsMatrixTransposed(vd)) {
		DISABLE;
	}

	u8 sregs[16], dregs[16], tregs[1];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);
	GetVectorRegs(tregs, V_Single, vt);

	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::Vec4Scale, dregs[i * 4], sregs[i * 4], tregs[0]);
	}
}

void IRFrontend::Comp_VScl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector scale, vector by scalar
	// d[N] = s[N] * t[0]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	u8 sregs[4], dregs[4], treg;
	GetVectorRegsPrefixS(sregs, sz, vs);
	// T prefixes handled by interp.
	GetVectorRegs(&treg, V_Single, vt);
	GetVectorRegsPrefixD(dregs, sz, vd);

	bool overlap = false;
	// For prefixes to work, we just have to ensure that none of the output registers spill
	// and that there's no overlap.
	u8 tempregs[4];
	memcpy(tempregs, dregs, sizeof(tempregs));
	for (int i = 0; i < n; ++i) {
		// Conservative, can be improved
		if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) {
			// Need to use temp regs
			tempregs[i] = IRVTEMP_0 + i;
			overlap = true;
		}
	}

	if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
		if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
			ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
			ApplyPrefixD(dregs, sz, vd);
			return;
		} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && opts.preferVec4) {
			ir.Write(IROp::Vec4Scale, IRVTEMP_0, sregs[0], treg);
			ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
			ApplyPrefixD(dregs, sz, vd);
			return;
		}
	}

	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FMul, tempregs[i], sregs[i], treg);
	}

	for (int i = 0; i < n; i++) {
		// All must be mapped for prefixes to work.
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, vd);
}

/*
	// Capital = straight, lower case = transposed
	// 8 possibilities:
	ABC   2
	ABc   missing
	AbC   1
	Abc   1

	aBC = ACB   2 + swap
	aBc = AcB   1 + swap
	abC = ACb   missing
	abc = Acb   1 + swap

*/
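
// The counts above appear to map to the METHOD 1 / METHOD 2 paths in
// Comp_Vmmul below ("1" = scale-and-accumulate, "2" = per-row dots), with
// "+ swap" covering the transposed-destination rewrite at the top of it.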

// This may or may not be a win when using the IR interpreter...
// Many more instructions to interpret.
void IRFrontend::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	// Matrix multiply (weird prefixes)
	// D[0 .. N, 0 .. M] = S[0 .. N, 0 .. M]' * T[0 .. N, 0 .. M]
	// Note: Behaves as if it's implemented through a series of vdots.
	// Important: this is a matrix multiply with a pre-transposed S.

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	MatrixOverlapType soverlap = GetMatrixOverlap(vs, vd, sz);
	MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz);

	// A very common arrangement. Rearrange to something we can handle.
	if (IsMatrixTransposed(vd)) {
		// Matrix identity says (At * Bt) = (B * A)t
		// D = S * T
		// Dt = (S * T)t = (Tt * St)
		vd = TransposeMatrixReg(vd);
		std::swap(vs, vt);
	}

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(tregs, sz, vt);
	GetMatrixRegs(dregs, sz, vd);

	if (soverlap || toverlap) {
		DISABLE;
	}

	// dregs are always consecutive, thanks to our transpose trick.
	// However, not sure this is always worth it.
	if (IsMatrixVec4(sz, dregs)) {
		// TODO: The interpreter would like proper matrix ops better. Can generate those, and
		// expand them like this as needed on "real" architectures.
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_T;
		if (!IsMatrixVec4(sz, sregs)) {
			// METHOD 1: Handles AbC and Abc
			for (int j = 0; j < 4; j++) {
				ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]);
				for (int i = 1; i < 4; i++) {
					ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[j * 4 + i]);
					ir.Write(IROp::Vec4Add, s0, s0, s1);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else if (IsMatrixVec4(sz, tregs)) {
			// METHOD 2: Handles ABC only. Not efficient on CPUs that don't do fast dots.
			// Dots only work if tregs are consecutive.
			// TODO: Skip this and resort to method one and transpose the output?
			for (int j = 0; j < 4; j++) {
				for (int i = 0; i < 4; i++) {
					ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[j * 4]);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else {
			// ABc - s consecutive, t not.
			// Tekken uses this.
			// logBlocks = 1;
		}
	}

	// Fallback. Expands a LOT
	int temp0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			ir.Write(IROp::FMul, temp0, sregs[b * 4], tregs[a * 4]);
			for (int c = 1; c < n; c++) {
				ir.Write(IROp::FMul, temp1, sregs[b * 4 + c], tregs[a * 4 + c]);
				ir.Write(IROp::FAdd, (c == n - 1) ? dregs[a * 4 + b] : temp0, temp0, temp1);
			}
		}
	}
}

void IRFrontend::Comp_Vtfm(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Vertex transform, vector by matrix (weird prefixes)
	// d[N] = s[N*m .. N*m + n-1] dot t[0 .. n-1]
	// Homogeneous means t[n-1] is treated as 1.
	// Note: this might be implemented as a series of vdots with special prefixes.

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz)+1);
		msz = (MatrixSize)((int)(msz)+1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}
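	// The homogeneous variants encode ins == n; widening n, sz and msz above
	// lets the loops below supply the implicit t[n-1] = 1.0 term.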
1648
1649
u8 sregs[16], dregs[4], tregs[4];
1650
GetMatrixRegs(sregs, msz, _VS);
1651
GetVectorRegs(tregs, sz, _VT);
1652
GetVectorRegs(dregs, sz, _VD);
1653
1654
// SIMD-optimized implementations - if sregs[0..3] is non-consecutive, it's transposed.
1655
if (msz == M_4x4 && !IsMatrixVec4(msz, sregs)) {
1656
int s0 = IRVTEMP_0;
1657
int s1 = IRVTEMP_PFX_S;
1658
// For this algorithm, we don't care if tregs are consecutive or not,
1659
// they are accessed one at a time. This handles homogenous transforms correctly, as well.
1660
// We take advantage of sregs[0] + 1 being sregs[4] here.
1661
ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
1662
for (int i = 1; i < 4; i++) {
1663
if (!homogenous || (i != n - 1)) {
1664
ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
1665
ir.Write(IROp::Vec4Add, s0, s0, s1);
1666
} else {
1667
ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
1668
}
1669
}
1670
if (IsVec4(sz, dregs)) {
1671
ir.Write(IROp::Vec4Mov, dregs[0], s0);
1672
} else {
1673
for (int i = 0; i < 4; i++) {
1674
ir.Write(IROp::FMov, dregs[i], s0 + i);
1675
}
1676
}
1677
return;
1678
} else if (msz == M_4x4 && IsMatrixVec4(msz, sregs) && IsVec4(sz, tregs)) {
1679
IRReg t = tregs[0];
1680
if (homogenous) {
1681
// This is probably even what the hardware basically does, wiring t[3] to 1.0f.
1682
ir.Write(IROp::Vec4Init, IRVTEMP_PFX_T, (int)Vec4Init::AllONE);
1683
ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, IRVTEMP_PFX_T, t, 0x7);
1684
t = IRVTEMP_PFX_T;
1685
}
1686
for (int i = 0; i < 4; i++)
1687
ir.Write(IROp::Vec4Dot, IRVTEMP_PFX_D + i, sregs[i * 4], t);
1688
for (int i = 0; i < 4; i++)
1689
ir.Write(IROp::FMov, dregs[i], IRVTEMP_PFX_D + i);
1690
return;
1691
}
1692
1693
// TODO: test overlap, optimize.
1694
u8 tempregs[4];
1695
int s0 = IRVTEMP_0;
1696
int temp1 = IRVTEMP_0 + 1;
1697
for (int i = 0; i < n; i++) {
1698
ir.Write(IROp::FMul, s0, sregs[i * 4], tregs[0]);
1699
for (int k = 1; k < n; k++) {
1700
if (!homogenous || k != n - 1) {
1701
ir.Write(IROp::FMul, temp1, sregs[i * 4 + k], tregs[k]);
1702
ir.Write(IROp::FAdd, s0, s0, temp1);
1703
} else {
1704
ir.Write(IROp::FAdd, s0, s0, sregs[i * 4 + k]);
1705
}
1706
}
1707
int temp = IRVTEMP_PFX_T + i;
1708
ir.Write(IROp::FMov, temp, s0);
1709
tempregs[i] = temp;
1710
}
1711
for (int i = 0; i < n; i++) {
1712
if (tempregs[i] != dregs[i])
1713
ir.Write(IROp::FMov, dregs[i], tempregs[i]);
1714
}
1715
}
1716
1717
void IRFrontend::Comp_VCrs(MIPSOpcode op) {
1718
CONDITIONAL_DISABLE(VFPU_VEC);
1719
if (js.HasUnknownPrefix() || js.HasSPrefix() || js.HasTPrefix()) {
1720
DISABLE;
1721
}
1722
1723
// Vector cross (half a cross product, n = 3)
1724
// d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y]
1725
// To do a full cross product: vcrs tmp1, s, t; vcrs tmp2 t, s; vsub d, tmp1, tmp2;
1726
// (or just use vcrsp.)
1727
// Note: this is possibly just a swizzle prefix hack for vmul.
1728
1729
VectorSize sz = GetVecSize(op);
1730
int n = GetNumVectorElements(sz);
1731
if (sz != V_Triple)
1732
DISABLE;
1733
1734
u8 sregs[4], dregs[4], tregs[4];
1735
GetVectorRegsPrefixS(sregs, sz, _VS);
1736
GetVectorRegsPrefixT(tregs, sz, _VT);
1737
GetVectorRegsPrefixD(dregs, sz, _VD);
1738
1739
if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
1740
// Use Vec4 where we can. First, apply shuffles.
1741
ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3));
1742
ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3));
1743
ir.Write(IROp::Vec4Mul, IRVTEMP_0, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
1744
// Now just retain w and blend in our values.
1745
ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
1746
} else {
1747
u8 tempregs[4]{};
1748
if (!IsOverlapSafe(n, dregs, n, sregs, n, tregs)) {
1749
for (int i = 0; i < n; ++i)
1750
tempregs[i] = IRVTEMP_0 + i;
1751
} else {
1752
for (int i = 0; i < n; ++i)
1753
tempregs[i] = dregs[i];
1754
}
1755
1756
ir.Write(IROp::FMul, tempregs[0], sregs[1], tregs[2]);
1757
ir.Write(IROp::FMul, tempregs[1], sregs[2], tregs[0]);
1758
ir.Write(IROp::FMul, tempregs[2], sregs[0], tregs[1]);
1759
1760
for (int i = 0; i < n; i++) {
1761
if (tempregs[i] != dregs[i])
1762
ir.Write(IROp::FMov, dregs[i], tempregs[i]);
1763
}
1764
}
1765
1766
ApplyPrefixD(dregs, sz, _VD);
1767
}
1768
1769
void IRFrontend::Comp_VDet(MIPSOpcode op) {
1770
CONDITIONAL_DISABLE(VFPU_VEC);
1771
if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
1772
DISABLE;
1773
}
1774
1775
// Vector determinant
1776
// d[0] = s[0]*t[1] - s[1]*t[0]
1777
// Note: this operates on two vectors, not a 2x2 matrix.
1778
1779
VectorSize sz = GetVecSize(op);
1780
if (sz != V_Pair)
1781
DISABLE;
1782
1783
u8 sregs[4], dregs[4], tregs[4];
1784
GetVectorRegsPrefixS(sregs, sz, _VS);
1785
GetVectorRegsPrefixT(tregs, sz, _VT);
1786
GetVectorRegsPrefixD(dregs, V_Single, _VD);
1787
1788
ir.Write(IROp::FMul, IRVTEMP_0, sregs[1], tregs[0]);
1789
ir.Write(IROp::FMul, dregs[0], sregs[0], tregs[1]);
1790
ir.Write(IROp::FSub, dregs[0], dregs[0], IRVTEMP_0);
1791
1792
ApplyPrefixD(dregs, V_Single, _VD);
1793
}
1794
1795
void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
1796
CONDITIONAL_DISABLE(VFPU_VEC);
1797
if (js.HasUnknownPrefix() || js.HasSPrefix())
1798
DISABLE;
1799
1800
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
1801
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
1802
1803
// These instructions pack pairs or quads of integers into 32 bits.
1804
// The unsigned (u) versions skip the sign bit when packing, first doing a signed clamp to 0 (so the sign bit won't ever be 1).
1805
1806
	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[2], srcregs[4], tempregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(srcregs, sregs, sizeof(sregs));
	memcpy(tempregs, dregs, sizeof(dregs));

	int nOut = GetNumVectorElements(outsize);

	// If src registers aren't contiguous, make them.
	if (!IsVec2(sz, sregs) && !IsVec4(sz, sregs)) {
		// T prefix is unused.
		for (int i = 0; i < GetNumVectorElements(sz); i++) {
			srcregs[i] = IRVTEMP_PFX_T + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	if (bits == 8) {
		if (unsignedOp) {  // vi2uc
			// Output is only one register.
			ir.Write(IROp::Vec4ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec4Pack31To8, tempregs[0], IRVTEMP_0);
		} else {  // vi2c
			ir.Write(IROp::Vec4Pack32To8, tempregs[0], srcregs[0]);
		}
	} else {
		// bits == 16
		if (unsignedOp) {  // vi2us
			// Pack the low pair first; the high pair follows if the output is a pair.
			ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec2Pack31To16, tempregs[0], IRVTEMP_0);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0 + 2, srcregs[2]);
				ir.Write(IROp::Vec2Pack31To16, tempregs[1], IRVTEMP_0 + 2);
			}
		} else {  // vi2s
			ir.Write(IROp::Vec2Pack32To16, tempregs[0], srcregs[0]);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2Pack32To16, tempregs[1], srcregs[2]);
			}
		}
	}

	for (int i = 0; i < nOut; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, outsize, _VD);
}

void IRFrontend::Comp_Vx2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16;  // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0;  // vuc2i (0), vus2i (2)

	// vs2i and vus2i unpack pairs of 16-bit integers into 32-bit integers, with the
	// values at the top. vus2i then shifts the result an extra bit to the right.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, also with
	// the values at the top. vuc2i is a bit special (see below.)
	// Like h2f, we use a solution that works for both singles and pairs, and apply
	// it to both.
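	// Per element, the unpacking is roughly (a sketch; the interpreter is
	// authoritative):
	//   vs2i:  out = in16 << 16
	//   vus2i: out = in16 << 15
	//   vc2i:  out = in8 << 24
	//   vuc2i: the byte's bits are replicated downward and shifted, see below.
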
	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
		sz = V_Single;  // The opcode encodes sz as Quad here even though only a single input register is read, so override it.
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[2], dregs[4], tempregs[4], srcregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(tempregs, dregs, sizeof(dregs));
	memcpy(srcregs, sregs, sizeof(sregs));

	// Remap source regs to be consecutive. This is not required,
	// but it helps when implementations can join two Vec2Expands.
	if (sz == V_Pair && !IsConsecutive2(srcregs)) {
		for (int i = 0; i < 2; i++) {
			srcregs[i] = IRVTEMP_0 + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	int nIn = GetNumVectorElements(sz);

	int nOut = 2;
	if (outsize == V_Quad)
		nOut = 4;
	// Remap dest regs. PFX_T is unused.
	if (outsize == V_Pair) {
		bool consecutive = IsConsecutive2(dregs);
		// We must have them consecutive, so all temps, or none.
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	} else if (outsize == V_Quad) {
		bool consecutive = IsVec4(outsize, dregs);
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	}

	if (bits == 16) {
		if (unsignedOp) {
			ir.Write(IROp::Vec2Unpack16To31, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To31, tempregs[2], srcregs[1]);
		} else {
			ir.Write(IROp::Vec2Unpack16To32, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To32, tempregs[2], srcregs[1]);
		}
	} else if (bits == 8) {
		if (unsignedOp) {
			// See the interpreter, this one is odd. Hardware bug?
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
			ir.Write(IROp::Vec4DuplicateUpperBitsAndShift1, tempregs[0], tempregs[0]);
		} else {
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
		}
	}

	for (int i = 0; i < nOut; i++) {
		if (tempregs[i] != dregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}
	ApplyPrefixD(dregs, outsize, _VD);
}

void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (!js.HasNoPrefix())
		DISABLE;

	// Vector cross product (n = 3, weird prefixes)
	// d[0 .. 2] = s[0 .. 2] X t[0 .. 2]
	// Vector quaternion product (n = 4, weird prefixes)
	// d[0 .. 2] = t[0 .. 2] X s[0 .. 2] + s[3] * t[0 .. 2] + t[3] * s[0 .. 2]
	// d[3] = s[3]*t[3] - s[0 .. 2] dot t[0 .. 2]
	// Note: Behaves as if it's implemented through a series of vdots.
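	// In the usual Hamilton convention with (x, y, z, w) layout, the formulas
	// above are exactly the components of the quaternion product t * s:
	//   vec(t*s) = t.w*vec(s) + s.w*vec(t) + vec(t) x vec(s)
	//   w(t*s)   = t.w*s.w - dot(vec(t), vec(s))
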
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegs(sregs, sz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	if (sz == V_Triple) {
		u8 tempregs[4]{};
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
				tempregs[i] = IRVTEMP_PFX_T + i;  // IRVTEMP_0 is used for other things below.
			} else {
				tempregs[i] = dregs[i];
			}
		}

		int temp0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		// Compute X
		ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]);
		ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]);
		ir.Write(IROp::FSub, tempregs[0], temp0, temp1);

		// Compute Y
		ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]);
		ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]);
		ir.Write(IROp::FSub, tempregs[1], temp0, temp1);

		// Compute Z
		ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]);
		ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]);
		ir.Write(IROp::FSub, tempregs[2], temp0, temp1);

		for (int i = 0; i < n; i++) {
			if (tempregs[i] != dregs[i])
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	} else if (sz == V_Quad) {
		// Rather than using vdots, we organize this as SIMD multiplies and adds.
		// That means flipping the logic column-wise. Also, luckily, no prefix temps are in use.
		if (!IsConsecutive4(sregs) || !IsConsecutive4(tregs) || !IsConsecutive4(dregs)) {
			DISABLE;
		}

		auto shuffleImm = [](int x, int y, int z, int w) { return x | (y << 2) | (z << 4) | (w << 6); };
		auto blendConst = [](int x, int y, int z, int w) { return x | (y << 1) | (z << 2) | (w << 3); };

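		// shuffleImm packs two bits per output lane: shuffleImm(3, 2, 1, 0) means
		// dst = { src[3], src[2], src[1], src[0] }. blendConst packs one bit per
		// lane; a set bit takes that lane from the second source (the negatives).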
		// Prepare some negatives.
		ir.Write(IROp::Vec4Neg, IRVTEMP_0, tregs[0]);

		// tmp = S[x,x,x,x] * T[w,-z,y,-x]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(1, 0, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(3, 2, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(0, 0, 0, 0));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_D, IRVTEMP_PFX_S, IRVTEMP_PFX_T);

		// tmp += S[y,y,y,y] * T[z,w,-x,-y]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(1, 1, 0, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(2, 3, 0, 1));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(1, 1, 1, 1));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);

		// tmp += S[z,z,z,z] * T[-y,x,w,-z]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(0, 1, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(1, 0, 3, 2));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(2, 2, 2, 2));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);

		// tmp += S[w,w,w,w] * T[x,y,z,w]
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(3, 3, 3, 3));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, tregs[0]);
		ir.Write(IROp::Vec4Add, dregs[0], IRVTEMP_PFX_D, IRVTEMP_PFX_S);
	} else {
		INVALIDOP;
	}
}

void IRFrontend::Comp_Vcmp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector compare
	// VFPU_CC[N] = COMPARE(s[N], t[N])
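	// Each FCmpVfpuBit writes one per-lane CC bit; the aggregate op then derives
	// the "any" and "all" summary bits (VFPU_CC bits 4 and 5) from the masked lanes.
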
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);

	int cond = op & 0xF;
	int mask = 0;
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]);
		mask |= (1 << i);
	}
	ir.Write(IROp::FCmpVfpuAggregate, mask);
}

void IRFrontend::Comp_Vcmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector conditional move
	// imm3 >= 6: d[N] = VFPU_CC[N] == tf ? s[N] : d[N]
	// imm3 < 6: d[N] = VFPU_CC[imm3] == tf ? s[N] : d[N]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);
	int tf = (op >> 19) & 1;
	int imm3 = (op >> 16) & 7;

	if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
		// TODO: Could do a VfpuCC variant of Vec4Blend.
	}

	for (int i = 0; i < n; ++i) {
		// Simplification: Disable if overlap unsafe
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			DISABLE;
		}
	}
	if (imm3 < 6) {
		// Test one bit of CC. This bit decides whether none or all subregisters are copied.
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (imm3) | ((!tf) << 7));
		}
	} else {
		// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7));
		}
	}
	ApplyPrefixD(dregs, sz, _VD);
}

void IRFrontend::Comp_Viim(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix())
		DISABLE;

	// Vector integer immediate
	// d[0] = float(imm)

	s32 imm = SignExtend16ToS32(op);
	u8 dreg;
	GetVectorRegsPrefixD(&dreg, V_Single, _VT);
	ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm));
	ApplyPrefixD(&dreg, V_Single, _VT);
}

void IRFrontend::Comp_Vfim(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix())
		DISABLE;

	// Vector half-float immediate
	// d[0] = float(imm)
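	// For example, the IEEE half encoding 0x3C00 decodes to 1.0f, and 0xC000
	// decodes to -2.0f.
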
	FP16 half;
	half.u = op & 0xFFFF;
	FP32 fval = half_to_float_fast5(half);

	u8 dreg;
	GetVectorRegsPrefixD(&dreg, V_Single, _VT);
	ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f));
	ApplyPrefixD(&dreg, V_Single, _VT);
}

void IRFrontend::Comp_Vcst(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix())
		DISABLE;

	// Vector constant
	// d[N] = CONST

	int conNum = (op >> 16) & 0x1f;
	int vd = _VD;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);

	if (IsVec4(sz, dregs)) {
		ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));
		ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0);
	} else if (IsVec3of4(sz, dregs) && opts.preferVec4) {
		ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_0, IRVTEMP_0, 0);
		ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
	} else {
		for (int i = 0; i < n; i++) {
			// Most of the time, materializing a float is slower than copying from another float.
			if (i == 0)
				ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));
			else
				ir.Write(IROp::FMov, dregs[i], dregs[0]);
		}
	}
	ApplyPrefixD(dregs, sz, vd);
}

// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
// calling the math library.
void IRFrontend::Comp_VRot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (!js.HasNoPrefix()) {
		// Prefixes work strangely for this:
		// * They never apply to cos (whether d or s prefixes.)
		// * They mostly apply to sin/0, e.g. 0:1, M, or |x|.
		DISABLE;
	}

	// Vector rotation matrix (weird prefixes)
	// d[N] = SINCOSVAL(s[0], imm[N])
	// The imm selects: cos index, sin index, 0 or sin for others, sin sign flip.
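	// For example (derived from the decode below), imm = 0x04 gives sineLane = 1
	// and cosineLane = 0, so a quad result is { cos, sin, 0, 0 }; setting bit 4
	// (imm & 0x10) negates the sine.
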
	int vd = _VD;
	int vs = _VS;
	int imm = (op >> 16) & 0x1f;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	int sineLane = (imm >> 2) & 3;
	int cosineLane = imm & 3;
	bool negSin = (imm & 0x10) ? true : false;
	bool broadcastSine = sineLane == cosineLane;

	char d[4] = { '0', '0', '0', '0' };
	if (broadcastSine) {
		for (int i = 0; i < 4; i++)
			d[i] = 's';
	}
	d[sineLane] = 's';
	d[cosineLane] = 'c';

	u8 dregs[4];
	GetVectorRegs(dregs, sz, vd);
	u8 sreg[1];
	GetVectorRegs(sreg, V_Single, vs);

	// If there's overlap, sin is calculated without it, but cosine uses the result.
	// This corresponds with prefix handling, where cosine doesn't get prefixes applied.
	if (broadcastSine || !IsOverlapSafe(n, dregs, 1, sreg)) {
		ir.Write(IROp::FSin, IRVTEMP_0, sreg[0]);
		if (negSin)
			ir.Write(IROp::FNeg, IRVTEMP_0, IRVTEMP_0);
	}

	for (int i = 0; i < n; i++) {
		switch (d[i]) {
		case '0':
			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(0.0f));
			break;
		case 's':
			if (broadcastSine || !IsOverlapSafe(n, dregs, 1, sreg)) {
				ir.Write(IROp::FMov, dregs[i], IRVTEMP_0);
			} else {
				ir.Write(IROp::FSin, dregs[i], sreg[0]);
				if (negSin) {
					ir.Write(IROp::FNeg, dregs[i], dregs[i]);
				}
			}
			break;
		case 'c':
			if (IsOverlapSafe(n, dregs, 1, sreg))
				ir.Write(IROp::FCos, dregs[i], sreg[0]);
			else if (dregs[sineLane] == sreg[0])
				ir.Write(IROp::FCos, dregs[i], IRVTEMP_0);
			else
				ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(1.0f));
			break;
		}
	}
}

void IRFrontend::Comp_Vsgn(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector extract sign
	// d[N] = signum(s[N])

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::FSign, tempregs[i], sregs[i]);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, _VD);
}

void IRFrontend::Comp_Vocp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {
		DISABLE;
	}

	// Vector one's complement
	// d[N] = 1.0 - s[N]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// This is a hack that modifies prefixes. We eat them later, so just overwrite.
	// S prefix forces the negate flags.
	js.prefixS |= 0x000F0000;
	// T prefix forces constants on and regnum to 1.
	// That means negate still works, and abs activates a different constant.
	js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;
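	// Net effect (a sketch of the prefix decode): every s lane reads as -s[N] and
	// every t lane reads as the constant 1.0f, so the adds below compute
	// 1.0f + (-s[N]) = 1.0f - s[N].
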
	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	// There are no bits for t, so just reuse s. It'll be constants only.
	GetVectorRegsPrefixT(tregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
		ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]);
	} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
		ir.Write(IROp::Vec4Add, IRVTEMP_0, tregs[0], sregs[0]);
		ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
	} else {
		u8 tempregs[4];
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs)) {
				tempregs[i] = IRVTEMP_0 + i;
			} else {
				tempregs[i] = dregs[i];
			}
		}

		for (int i = 0; i < n; ++i) {
			ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);
		}
		for (int i = 0; i < n; ++i) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}
	}

	ApplyPrefixD(dregs, sz, _VD);
}

void IRFrontend::Comp_ColorConv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector color conversion
	// d[N] = ConvertTo16(s[N*2]) | (ConvertTo16(s[N*2+1]) << 16)

	DISABLE;
}

void IRFrontend::Comp_Vbfy(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {
		DISABLE;
	}

	// Vector butterfly operation
	// vbfy2: d[0] = s[0] + s[2], d[1] = s[1] + s[3], d[2] = s[0] - s[2], d[3] = s[1] - s[3]
	// vbfy1: d[N*2] = s[N*2] + s[N*2+1], d[N*2+1] = s[N*2] - s[N*2+1]
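	// For example, with s = { 1, 2, 3, 4 }:
	//   vbfy1 -> { 3, -1, 7, -1 } and vbfy2 -> { 4, 6, -2, -2 }.
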
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	if (n != 2 && n != 4) {
		// Bad instructions
		INVALIDOP;
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	int subop = (op >> 16) & 0x1F;
	if (subop == 3 && n == 4) {
		// vbfy2
		ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[2]);
		ir.Write(IROp::FAdd, tempregs[1], sregs[1], sregs[3]);
		ir.Write(IROp::FSub, tempregs[2], sregs[0], sregs[2]);
		ir.Write(IROp::FSub, tempregs[3], sregs[1], sregs[3]);
	} else if (subop == 2) {
		// vbfy1
		ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[1]);
		ir.Write(IROp::FSub, tempregs[1], sregs[0], sregs[1]);
		if (n == 4) {
			ir.Write(IROp::FAdd, tempregs[2], sregs[2], sregs[3]);
			ir.Write(IROp::FSub, tempregs[3], sregs[2], sregs[3]);
		}
	} else {
		INVALIDOP;
	}

	for (int i = 0; i < n; ++i) {
		if (tempregs[i] != dregs[i])
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
	}

	ApplyPrefixD(dregs, sz, _VD);
}
}