CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/IR/IRCompVFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cmath>1819#include "Common/CPUDetect.h"20#include "Common/Data/Convert/SmallDataConvert.h"21#include "Common/Math/math_util.h"22#include "Core/Compatibility.h"23#include "Core/Config.h"24#include "Core/MemMap.h"25#include "Core/MIPS/MIPS.h"26#include "Core/MIPS/MIPSTables.h"27#include "Core/MIPS/MIPSAnalyst.h"28#include "Core/MIPS/MIPSCodeUtils.h"29#include "Core/MIPS/IR/IRFrontend.h"30#include "Core/MIPS/IR/IRRegCache.h"31#include "Core/Reporting.h"32#include "Core/System.h"333435// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.36// Currently known non working ones should have DISABLE.3738// #define CONDITIONAL_DISABLE(flag) { Comp_Generic(op); return; }39#define CONDITIONAL_DISABLE(flag) if (opts.disableFlags & (uint32_t)JitDisable::flag) { Comp_Generic(op); return; }40#define DISABLE { Comp_Generic(op); return; }41#define INVALIDOP { Comp_Generic(op); return; }4243#define _RS MIPS_GET_RS(op)44#define _RT MIPS_GET_RT(op)45#define _RD MIPS_GET_RD(op)46#define _FS MIPS_GET_FS(op)47#define _FT MIPS_GET_FT(op)48#define _FD MIPS_GET_FD(op)49#define _SA MIPS_GET_SA(op)50#define _POS ((op>> 6) & 0x1F)51#define _SIZE ((op>>11) & 0x1F)52#define _IMM16 (signed short)(op & 
0xFFFF)53#define _IMM26 (op & 0x03FFFFFF)5455const int vfpuBase = 32; // skip the FP registers5657namespace MIPSComp {58static void ApplyVoffset(u8 regs[4], int count) {59for (int i = 0; i < count; i++) {60regs[i] = vfpuBase + voffset[regs[i]];61}62}6364static bool IsConsecutive2(const u8 regs[2]) {65return regs[1] == regs[0] + 1;66}6768static bool IsConsecutive3(const u8 regs[3]) {69return IsConsecutive2(regs) && regs[2] == regs[1] + 1;70}7172static bool IsConsecutive4(const u8 regs[4]) {73return IsConsecutive3(regs) && regs[3] == regs[2] + 1;74}7576static bool IsVec2(VectorSize sz, const u8 regs[2]) {77return sz == V_Pair && IsConsecutive2(regs) && (regs[0] & 1) == 0;78}7980static bool IsVec4(VectorSize sz, const u8 regs[4]) {81return sz == V_Quad && IsConsecutive4(regs) && (regs[0] & 3) == 0;82}8384static bool IsVec3of4(VectorSize sz, const u8 regs[4]) {85return sz == V_Triple && IsConsecutive3(regs) && (regs[0] & 3) == 0;86}8788static bool IsMatrixVec4(MatrixSize sz, const u8 regs[16]) {89if (sz != M_4x4)90return false;91if (!IsConsecutive4(®s[0]) || (regs[0] & 3) != 0)92return false;93if (!IsConsecutive4(®s[4]) || (regs[4] & 3) != 0)94return false;95if (!IsConsecutive4(®s[8]) || (regs[8] & 3) != 0)96return false;97if (!IsConsecutive4(®s[12]) || (regs[12] & 3) != 0)98return false;99return true;100}101102// Vector regs can overlap in all sorts of swizzled ways.103// This does allow a single overlap in sregs[i].104static bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {105for (int i = 0; i < sn; ++i) {106if (sregs[i] == dreg && i != di)107return false;108}109for (int i = 0; i < tn; ++i) {110if (tregs[i] == dreg)111return false;112}113114// Hurray, no overlap, we can write directly.115return true;116}117118static bool IsOverlapSafeAllowS(int dn, const u8 dregs[], int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {119for (int i = 0; i < dn; ++i) {120if (!IsOverlapSafeAllowS(dregs[i], i, sn, 
sregs, tn, tregs)) {121return false;122}123}124return true;125}126127static bool IsOverlapSafe(int dreg, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {128return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs);129}130131static bool IsOverlapSafe(int dn, const u8 dregs[], int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {132for (int i = 0; i < dn; ++i) {133if (!IsOverlapSafe(dregs[i], sn, sregs, tn, tregs)) {134return false;135}136}137return true;138}139140static bool IsPrefixWithinSize(u32 prefix, VectorSize sz) {141int n = GetNumVectorElements(sz);142for (int i = n; i < 4; i++) {143int regnum = (prefix >> (i * 2)) & 3;144int abs = (prefix >> (8 + i)) & 1;145int negate = (prefix >> (16 + i)) & 1;146int constants = (prefix >> (12 + i)) & 1;147if (regnum >= n && !constants) {148if (abs || negate || regnum != i)149return false;150}151}152153return true;154}155156static bool IsPrefixWithinSize(u32 prefix, MIPSOpcode op) {157return IsPrefixWithinSize(prefix, GetVecSize(op));158}159160void IRFrontend::Comp_VPFX(MIPSOpcode op) {161CONDITIONAL_DISABLE(VFPU_XFER);162// This is how prefixes are typically set.163int data = op & 0xFFFFF;164int regnum = (op >> 24) & 3;165switch (regnum) {166case 0: // S167js.prefixS = data;168js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;169break;170case 1: // T171js.prefixT = data;172js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;173break;174case 2: // D175js.prefixD = data & 0x00000FFF;176js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;177break;178default:179ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);180break;181}182}183184static void InitRegs(u8 *vregs, int reg) {185vregs[0] = reg;186vregs[1] = reg + 1;187vregs[2] = reg + 2;188vregs[3] = reg + 3;189}190191void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) {192if (prefix == 0xE4)193return;194195int n = GetNumVectorElements(sz);196u8 origV[4]{};197static const float constantArray[8] = { 0.f, 1.f, 
2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };198199for (int i = 0; i < n; i++)200origV[i] = vregs[i];201202// Some common vector prefixes203if (IsVec4(sz, vregs)) {204if (prefix == 0xF00E4) {205InitRegs(vregs, tempReg);206ir.Write(IROp::Vec4Neg, vregs[0], origV[0]);207return;208}209if (prefix == 0x00FE4) {210InitRegs(vregs, tempReg);211ir.Write(IROp::Vec4Abs, vregs[0], origV[0]);212return;213}214// Pure shuffle215if (prefix == (prefix & 0xFF)) {216InitRegs(vregs, tempReg);217ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix);218return;219}220221if ((prefix & 0x000FF000) == 0x0000F000) {222// Handle some easy and common cases.223Vec4Init init = Vec4Init::AllZERO;224bool useInit;225switch (prefix & 0xFFF) {226case 0x00: useInit = true; init = Vec4Init::AllZERO; break;227case 0x01: useInit = true; init = Vec4Init::Set_1000; break;228case 0x04: useInit = true; init = Vec4Init::Set_0100; break;229case 0x10: useInit = true; init = Vec4Init::Set_0010; break;230case 0x40: useInit = true; init = Vec4Init::Set_0001; break;231case 0x55: useInit = true; init = Vec4Init::AllONE; break;232default: useInit = false; break;233}234235if (useInit) {236InitRegs(vregs, tempReg);237ir.Write(IROp::Vec4Init, vregs[0], (int)init);238return;239}240}241242// Check if we're just zeroing certain lanes - this is common.243u32 zeroedLanes = 0;244for (int i = 0; i < 4; ++i) {245int regnum = (prefix >> (i * 2)) & 3;246int abs = (prefix >> (8 + i)) & 1;247int negate = (prefix >> (16 + i)) & 1;248int constants = (prefix >> (12 + i)) & 1;249250if (!constants && regnum == i && !abs && !negate)251continue;252if (constants && regnum == 0 && abs == 0 && !negate) {253zeroedLanes |= 1 << i;254continue;255}256257// Nope, it has something else going on.258zeroedLanes = -1;259break;260}261262if (zeroedLanes != -1) {263InitRegs(vregs, tempReg);264ir.Write(IROp::Vec4Init, vregs[0], (int)Vec4Init::AllZERO);265ir.Write(IROp::Vec4Blend, vregs[0], origV[0], vregs[0], zeroedLanes);266return;267}268}269270// 
Alright, fall back to the generic approach.271for (int i = 0; i < n; i++) {272int regnum = (prefix >> (i * 2)) & 3;273int abs = (prefix >> (8 + i)) & 1;274int negate = (prefix >> (16 + i)) & 1;275int constants = (prefix >> (12 + i)) & 1;276277// Unchanged, hurray.278if (!constants && regnum == i && !abs && !negate)279continue;280281// This puts the value into a temp reg, so we won't write the modified value back.282vregs[i] = tempReg + i;283if (!constants) {284if (regnum >= n) {285// Depends on the op, but often zero.286ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(0.0f));287} else if (abs) {288ir.Write(IROp::FAbs, vregs[i], origV[regnum]);289if (negate)290ir.Write(IROp::FNeg, vregs[i], vregs[i]);291} else {292if (negate)293ir.Write(IROp::FNeg, vregs[i], origV[regnum]);294else if (vregs[i] != origV[regnum])295ir.Write(IROp::FMov, vregs[i], origV[regnum]);296}297} else {298if (negate) {299ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(-constantArray[regnum + (abs << 2)]));300} else {301ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(constantArray[regnum + (abs << 2)]));302}303}304}305}306307void IRFrontend::GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {308::GetVectorRegs(regs, N, vectorReg);309ApplyVoffset(regs, N);310}311312void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {313::GetMatrixRegs(regs, N, matrixReg);314for (int i = 0; i < GetMatrixSide(N); i++) {315ApplyVoffset(regs + 4 * i, GetVectorSize(N));316}317}318319void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) {320_assert_(js.prefixSFlag & JitState::PREFIX_KNOWN);321GetVectorRegs(regs, sz, vectorReg);322ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S);323}324void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) {325_assert_(js.prefixTFlag & JitState::PREFIX_KNOWN);326GetVectorRegs(regs, sz, vectorReg);327ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T);328}329330void 
IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {331_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);332333GetVectorRegs(regs, sz, vectorReg);334int n = GetNumVectorElements(sz);335if (js.prefixD == 0)336return;337338if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {339// Use temps for all, we'll blend in the end (keeping in Vec4.)340for (int i = 0; i < 4; ++i)341regs[i] = IRVTEMP_PFX_D + i;342return;343}344345for (int i = 0; i < n; i++) {346// Hopefully this is rare, we'll just write it into a dumping ground reg.347if (js.VfpuWriteMask(i))348regs[i] = IRVTEMP_PFX_D + i;349}350}351352inline int GetDSat(int prefix, int i) {353return (prefix >> (i * 2)) & 3;354}355356// "D" prefix is really a post process. No need to allocate a temporary register (except357// dummies to simulate writemask, which is done in GetVectorRegsPrefixD358void IRFrontend::ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg) {359_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);360if (!js.prefixD)361return;362363ApplyPrefixDMask(vregs, sz, vectorReg);364365int n = GetNumVectorElements(sz);366for (int i = 0; i < n; i++) {367if (js.VfpuWriteMask(i))368continue;369int sat = GetDSat(js.prefixD, i);370if (sat == 1) {371// clamped = x < 0 ? (x > 1 ? 
1 : x) : x [0, 1]372ir.Write(IROp::FSat0_1, vregs[i], vregs[i]);373} else if (sat == 3) {374ir.Write(IROp::FSatMinus1_1, vregs[i], vregs[i]);375}376}377}378379void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) {380if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {381u8 origV[4];382GetVectorRegs(origV, sz, vectorReg);383384// Just keep the original values where it was masked.385ir.Write(IROp::Vec4Blend, origV[0], vregs[0], origV[0], js.VfpuWriteMask());386387// So that saturate works, change it back.388for (int i = 0; i < 4; ++i)389vregs[i] = origV[i];390}391}392393void IRFrontend::Comp_SV(MIPSOpcode op) {394CONDITIONAL_DISABLE(LSU_VFPU);395s32 offset = (signed short)(op & 0xFFFC);396int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);397MIPSGPReg rs = _RS;398399CheckMemoryBreakpoint(rs, offset);400401switch (op >> 26) {402case 50: //lv.s403ir.Write(IROp::LoadFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));404break;405406case 58: //sv.s407ir.Write(IROp::StoreFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));408break;409410default:411INVALIDOP;412}413}414415void IRFrontend::Comp_SVQ(MIPSOpcode op) {416CONDITIONAL_DISABLE(LSU_VFPU);417int imm = (signed short)(op & 0xFFFC);418int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5);419MIPSGPReg rs = _RS;420421u8 vregs[4];422GetVectorRegs(vregs, V_Quad, vt);423424CheckMemoryBreakpoint(rs, imm);425426enum class LSVType {427INVALID,428LVQ,429SVQ,430LVLQ,431LVRQ,432SVLQ,433SVRQ,434};435436LSVType optype = LSVType::INVALID;437switch (op >> 26) {438case 54: optype = LSVType::LVQ; break; // lv.q439case 62: optype = LSVType::SVQ; break; // sv.q440case 53: // lvl/lvr.q - highly unusual441optype = (op & 2) == 0 ? LSVType::LVLQ : LSVType::LVRQ;442break;443case 61: // svl/svr.q - highly unusual444optype = (op & 2) == 0 ? 
LSVType::SVLQ : LSVType::SVRQ;445break;446}447if (optype == LSVType::INVALID)448INVALIDOP;449450if ((optype == LSVType::LVRQ || optype == LSVType::SVRQ) && opts.unalignedLoadStoreVec4) {451// We don't bother with an op for this, but we do fuse unaligned stores which happen.452MIPSOpcode nextOp = GetOffsetInstruction(1);453if ((nextOp.encoding ^ op.encoding) == 0x0000000E) {454// Okay, it's an svr.q/svl.q pair, same registers. Treat as lv.q/sv.q.455EatInstruction(nextOp);456optype = optype == LSVType::LVRQ ? LSVType::LVQ : LSVType::SVQ;457}458}459460switch (optype) {461case LSVType::LVQ:462if (IsVec4(V_Quad, vregs)) {463ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));464} else {465// Let's not even bother with "vertical" loads for now.466if (!g_Config.bFastMemory)467ir.Write(IROp::ValidateAddress128, 0, (u8)rs, 0, (u32)imm);468ir.Write(IROp::LoadFloat, vregs[0], rs, ir.AddConstant(imm));469ir.Write(IROp::LoadFloat, vregs[1], rs, ir.AddConstant(imm + 4));470ir.Write(IROp::LoadFloat, vregs[2], rs, ir.AddConstant(imm + 8));471ir.Write(IROp::LoadFloat, vregs[3], rs, ir.AddConstant(imm + 12));472}473break;474475case LSVType::SVQ:476if (IsVec4(V_Quad, vregs)) {477ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));478} else {479// Let's not even bother with "vertical" stores for now.480if (!g_Config.bFastMemory)481ir.Write(IROp::ValidateAddress128, 0, (u8)rs, 1, (u32)imm);482ir.Write(IROp::StoreFloat, vregs[0], rs, ir.AddConstant(imm));483ir.Write(IROp::StoreFloat, vregs[1], rs, ir.AddConstant(imm + 4));484ir.Write(IROp::StoreFloat, vregs[2], rs, ir.AddConstant(imm + 8));485ir.Write(IROp::StoreFloat, vregs[3], rs, ir.AddConstant(imm + 12));486}487break;488489case LSVType::LVLQ:490case LSVType::LVRQ:491case LSVType::SVLQ:492case LSVType::SVRQ:493// These are pretty uncommon unless paired.494DISABLE;495break;496497default:498INVALIDOP;499}500}501502void IRFrontend::Comp_VVectorInit(MIPSOpcode op) {503CONDITIONAL_DISABLE(VFPU_XFER);504if 
(js.HasUnknownPrefix() || js.HasSPrefix()) {505DISABLE;506}507508// Vector init509// d[N] = CONST[N]510// Note: probably implemented as vmov with prefix hack.511512VectorSize sz = GetVecSize(op);513int type = (op >> 16) & 0xF;514int vd = _VD;515int n = GetNumVectorElements(sz);516u8 dregs[4];517GetVectorRegsPrefixD(dregs, sz, vd);518519if (IsVec4(sz, dregs)) {520ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE));521} else {522for (int i = 0; i < n; i++) {523ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));524}525}526ApplyPrefixD(dregs, sz, vd);527}528529void IRFrontend::Comp_VIdt(MIPSOpcode op) {530CONDITIONAL_DISABLE(VFPU_XFER);531if (js.HasUnknownPrefix() || js.HasSPrefix()) {532DISABLE;533}534535// Vector identity row536// d[N] = IDENTITY[N,m]537// Note: probably implemented as vmov with prefix hack.538539int vd = _VD;540VectorSize sz = GetVecSize(op);541u8 dregs[4];542GetVectorRegsPrefixD(dregs, sz, vd);543544if (IsVec4(sz, dregs)) {545int row = vd & 3;546Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row);547ir.Write(IROp::Vec4Init, dregs[0], (int)init);548} else {549switch (sz) {550case V_Pair:551ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 1) == 0 ? 1.0f : 0.0f));552ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 1) == 1 ? 1.0f : 0.0f));553break;554case V_Quad:555ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 3) == 0 ? 1.0f : 0.0f));556ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 3) == 1 ? 1.0f : 0.0f));557ir.Write(IROp::SetConstF, dregs[2], ir.AddConstantFloat((vd & 3) == 2 ? 1.0f : 0.0f));558ir.Write(IROp::SetConstF, dregs[3], ir.AddConstantFloat((vd & 3) == 3 ? 
1.0f : 0.0f));559break;560default:561INVALIDOP;562}563}564565ApplyPrefixD(dregs, sz, vd);566}567568void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {569CONDITIONAL_DISABLE(VFPU_XFER);570MatrixSize sz = GetMtxSize(op);571if (!js.HasNoPrefix()) {572DISABLE;573}574575// Matrix init (weird prefixes)576// d[N,M] = CONST[N,M]577578int vd = _VD;579if (IsMatrixTransposed(vd)) {580// All outputs are transpositionally symmetric, so should be fine.581vd = TransposeMatrixReg(vd);582}583584if (sz != M_4x4) {585// 3x3 is decently common. It expands a lot, but let's set each.586u8 dregs[16];587GetMatrixRegs(dregs, sz, vd);588589// TODO: It might be worth using Vec4Blend for 3x3 to mask w.590int n = GetMatrixSide(sz);591for (int y = 0; y < n; ++y) {592for (int x = 0; x < n; ++x) {593switch ((op >> 16) & 0xF) {594case 3: // vmidt595if (x == 0 && y == 0)596ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(1.0f));597else if (x == y)598ir.Write(IROp::FMov, dregs[y * 4 + x], dregs[0]);599else600ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(0.0f));601break;602case 6: // vmzero603// Likely to be fast.604ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(0.0f));605break;606case 7: // vmone607if (x == 0 && y == 0)608ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(1.0f));609else610ir.Write(IROp::FMov, dregs[y * 4 + x], dregs[0]);611break;612default:613INVALIDOP;614}615}616}617return;618}619620// Not really about trying here, it will work if enabled.621VectorSize vsz = GetVectorSize(sz);622u8 vecs[4];623GetMatrixColumns(vd, sz, vecs);624for (int i = 0; i < 4; i++) {625u8 vec[4];626GetVectorRegs(vec, vsz, vecs[i]);627// As they are columns, they will be nicely consecutive.628Vec4Init init;629switch ((op >> 16) & 0xF) {630case 3:631init = Vec4Init((int)Vec4Init::Set_1000 + i);632break;633case 6:634init = Vec4Init::AllZERO;635break;636case 7:637init = 
Vec4Init::AllONE;638break;639default:640INVALIDOP;641return;642}643ir.Write(IROp::Vec4Init, vec[0], (int)init);644}645}646647void IRFrontend::Comp_VHdp(MIPSOpcode op) {648CONDITIONAL_DISABLE(VFPU_VEC);649if (js.HasUnknownPrefix() || js.HasSPrefix() || !IsPrefixWithinSize(js.prefixT, op)) {650DISABLE;651}652653// Vector homogenous dot product654// d[0] = s[0 .. n-2] dot t[0 .. n-2] + t[n-1]655// Note: s[n-1] is ignored / treated as 1 via prefix override.656657int vd = _VD;658int vs = _VS;659int vt = _VT;660VectorSize sz = GetVecSize(op);661int n = GetNumVectorElements(sz);662663if (js.prefixS & (0x0101 << (8 + n - 1)))664DISABLE;665666// TODO: Force read one of them into regs? probably not.667u8 sregs[4], tregs[4], dregs[1];668GetVectorRegsPrefixS(sregs, sz, vs);669GetVectorRegsPrefixT(tregs, sz, vt);670GetVectorRegsPrefixD(dregs, V_Single, vd);671672ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]);673674for (int i = 1; i < n; i++) {675if (i == n - 1) {676ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, tregs[i]);677} else {678ir.Write(IROp::FMul, IRVTEMP_0 + 1, sregs[i], tregs[i]);679ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, IRVTEMP_0 + 1);680}681}682683ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);684ApplyPrefixD(dregs, V_Single, vd);685}686687alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };688689void IRFrontend::Comp_Vhoriz(MIPSOpcode op) {690CONDITIONAL_DISABLE(VFPU_VEC);691if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {692DISABLE;693}694695// Vector horizontal add696// d[0] = s[0] + ... s[n-1]697// Vector horizontal average698// d[0] = s[0] / n + ... 
s[n-1] / n699// Note: Both are implemented as dot products against generated constants.700701VectorSize sz = GetVecSize(op);702int n = GetNumVectorElements(sz);703704u8 sregs[4], dregs[1];705GetVectorRegsPrefixS(sregs, sz, _VS);706GetVectorRegsPrefixD(dregs, V_Single, _VD);707708// We have to start at +0.000 in case any values are -0.000.709ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(0.0f));710for (int i = 0; i < n; ++i) {711ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, sregs[i]);712}713714switch ((op >> 16) & 31) {715case 6: // vfad716ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);717break;718case 7: // vavg719ir.Write(IROp::SetConstF, IRVTEMP_0 + 1, ir.AddConstantFloat(vavg_table[n - 1]));720ir.Write(IROp::FMul, dregs[0], IRVTEMP_0, IRVTEMP_0 + 1);721break;722}723724ApplyPrefixD(dregs, V_Single, _VD);725}726727void IRFrontend::Comp_VDot(MIPSOpcode op) {728CONDITIONAL_DISABLE(VFPU_VEC);729if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {730DISABLE;731}732733// Vector dot product734// d[0] = s[0 .. n-1] dot t[0 .. n-1]735736int vd = _VD;737int vs = _VS;738int vt = _VT;739740VectorSize sz = GetVecSize(op);741int n = GetNumVectorElements(sz);742743// TODO: Force read one of them into regs? 
probably not.744u8 sregs[4], tregs[4], dregs[1];745GetVectorRegsPrefixS(sregs, sz, vs);746GetVectorRegsPrefixT(tregs, sz, vt);747GetVectorRegsPrefixD(dregs, V_Single, vd);748749if (IsVec4(sz, sregs) && IsVec4(sz, tregs)) {750if (IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {751ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);752} else {753ir.Write(IROp::Vec4Dot, IRVTEMP_0, sregs[0], tregs[0]);754ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);755}756ApplyPrefixD(dregs, V_Single, vd);757return;758} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4Dot) {759// Note: this is often worse than separate muliplies and adds on x86.760if (IsOverlapSafe(dregs[0], n, tregs) || sregs[0] == tregs[0]) {761// Nice example of this in Fat Princess (US) in block 088181A0 (hot.)762// Create a temporary copy of S with the last element zeroed.763ir.Write(IROp::Vec4Init, IRVTEMP_0, (int)Vec4Init::AllZERO);764ir.Write(IROp::Vec4Blend, IRVTEMP_0, IRVTEMP_0, sregs[0], 0x7);765// Now we can just dot like normal, with the last element effectively masked.766ir.Write(IROp::Vec4Dot, dregs[0], IRVTEMP_0, sregs[0] == tregs[0] ? IRVTEMP_0 : tregs[0]);767ApplyPrefixD(dregs, V_Single, vd);768return;769}770}771772int temp0 = IRVTEMP_0;773int temp1 = IRVTEMP_0 + 1;774ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]);775for (int i = 1; i < n; i++) {776ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]);777ir.Write(IROp::FAdd, i == (n - 1) ? 
dregs[0] : temp0, temp0, temp1);778}779ApplyPrefixD(dregs, V_Single, vd);780}781782void IRFrontend::Comp_VecDo3(MIPSOpcode op) {783CONDITIONAL_DISABLE(VFPU_VEC);784if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {785DISABLE;786}787788// Vector arithmetic789// d[N] = OP(s[N], t[N]) (see below)790791enum class VecDo3Op : uint8_t {792INVALID,793VADD,794VSUB,795VDIV,796VMUL,797VMIN,798VMAX,799VSGE,800VSLT,801};802VecDo3Op type = VecDo3Op::INVALID;803VectorSize sz = GetVecSize(op);804int n = GetNumVectorElements(sz);805806// Check that we can support the ops, and prepare temporary values for ops that need it.807switch (op >> 26) {808case 24: //VFPU0809switch ((op >> 23) & 7) {810case 0: type = VecDo3Op::VADD; break;811case 1: type = VecDo3Op::VSUB; break;812case 7: type = VecDo3Op::VDIV; break;813default: INVALIDOP;814}815break;816case 25: //VFPU1817switch ((op >> 23) & 7) {818case 0: type = VecDo3Op::VMUL; break;819default: INVALIDOP;820}821break;822case 27: //VFPU3823switch ((op >> 23) & 7) {824case 2: type = VecDo3Op::VMIN; break;825case 3: type = VecDo3Op::VMAX; break;826case 6: type = VecDo3Op::VSGE; break;827case 7: type = VecDo3Op::VSLT; break;828default: INVALIDOP;829}830break;831default: INVALIDOP;832}833_assert_(type != VecDo3Op::INVALID);834835bool allowSIMD = true;836switch (type) {837case VecDo3Op::VADD:838case VecDo3Op::VSUB:839case VecDo3Op::VMUL:840break;841case VecDo3Op::VDIV:842if (js.HasUnknownPrefix() || (sz != V_Single && !js.HasNoPrefix()))843DISABLE;844// If it's single, we just need to check the prefixes are within the size.845if (!IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op))846DISABLE;847break;848case VecDo3Op::VMIN:849case VecDo3Op::VMAX:850case VecDo3Op::VSGE:851case VecDo3Op::VSLT:852allowSIMD = false;853break;854case VecDo3Op::INVALID: // Can't happen, but to avoid compiler warnings855break;856}857858u8 sregs[4], tregs[4], 
dregs[4];859GetVectorRegsPrefixS(sregs, sz, _VS);860GetVectorRegsPrefixT(tregs, sz, _VT);861GetVectorRegsPrefixD(dregs, sz, _VD);862863u8 tempregs[4];864for (int i = 0; i < n; i++) {865if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {866tempregs[i] = IRVTEMP_0 + i;867} else {868tempregs[i] = dregs[i];869}870}871872// If all three are consecutive 4, we're safe regardless of if we use temps so we should not check that here.873if (allowSIMD) {874IROp opFunc = IROp::Nop;875switch (type) {876case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd877opFunc = IROp::Vec4Add;878break;879case VecDo3Op::VSUB: // d[i] = s[i] - t[i]; break; //vsub880opFunc = IROp::Vec4Sub;881break;882case VecDo3Op::VDIV: // d[i] = s[i] / t[i]; break; //vdiv883opFunc = IROp::Vec4Div;884break;885case VecDo3Op::VMUL: // d[i] = s[i] * t[i]; break; //vmul886opFunc = IROp::Vec4Mul;887break;888default:889// Leave it Nop, disabled below.890break;891}892893if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {894if (opFunc != IROp::Nop) {895ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);896} else {897DISABLE;898}899ApplyPrefixD(dregs, sz, _VD);900return;901} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {902// This is actually pretty common. Use a temp + blend.903// We could post-process this, but it's easier to do it here.904if (opFunc == IROp::Nop)905DISABLE;906ir.Write(opFunc, IRVTEMP_0, sregs[0], tregs[0]);907ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);908ApplyPrefixD(dregs, sz, _VD);909return;910}911}912913if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {914// TODO: Consider a dedicated op? 
For now, we abuse FpCond a bit.915ir.Write(IROp::FpCondToReg, IRTEMP_0);916}917918for (int i = 0; i < n; ++i) {919switch (type) {920case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd921ir.Write(IROp::FAdd, tempregs[i], sregs[i], tregs[i]);922break;923case VecDo3Op::VSUB: // d[i] = s[i] - t[i]; break; //vsub924ir.Write(IROp::FSub, tempregs[i], sregs[i], tregs[i]);925break;926case VecDo3Op::VDIV: // d[i] = s[i] / t[i]; break; //vdiv927ir.Write(IROp::FDiv, tempregs[i], sregs[i], tregs[i]);928break;929case VecDo3Op::VMUL: // d[i] = s[i] * t[i]; break; //vmul930ir.Write(IROp::FMul, tempregs[i], sregs[i], tregs[i]);931break;932case VecDo3Op::VMIN: // vmin933ir.Write(IROp::FMin, tempregs[i], sregs[i], tregs[i]);934break;935case VecDo3Op::VMAX: // vmax936ir.Write(IROp::FMax, tempregs[i], sregs[i], tregs[i]);937break;938case VecDo3Op::VSGE: // vsge939ir.Write(IROp::FCmp, (int)IRFpCompareMode::LessUnordered, sregs[i], tregs[i]);940ir.Write(IROp::FpCondToReg, IRTEMP_1);941ir.Write(IROp::XorConst, IRTEMP_1, IRTEMP_1, ir.AddConstant(1));942ir.Write(IROp::FMovFromGPR, tempregs[i], IRTEMP_1);943ir.Write(IROp::FCvtSW, tempregs[i], tempregs[i]);944break;945case VecDo3Op::VSLT: // vslt946ir.Write(IROp::FCmp, (int)IRFpCompareMode::LessOrdered, sregs[i], tregs[i]);947ir.Write(IROp::FpCondToReg, IRTEMP_1);948ir.Write(IROp::FMovFromGPR, tempregs[i], IRTEMP_1);949ir.Write(IROp::FCvtSW, tempregs[i], tempregs[i]);950break;951case VecDo3Op::INVALID: // Can't happen, but to avoid compiler warnings952break;953}954}955956if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {957ir.Write(IROp::FpCondFromReg, IRTEMP_0);958}959960for (int i = 0; i < n; i++) {961if (dregs[i] != tempregs[i]) {962ir.Write(IROp::FMov, dregs[i], tempregs[i]);963}964}965966ApplyPrefixD(dregs, sz, _VD);967}968969void IRFrontend::Comp_VV2Op(MIPSOpcode op) {970CONDITIONAL_DISABLE(VFPU_VEC);971972if (js.HasUnknownPrefix()) {973DISABLE;974}975976int optype = (op >> 16) & 0x1f;977if (optype == 0) {978if 
(js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op))979DISABLE;980} else if (optype == 1 || optype == 2) {981// D prefix is fine for these, and used sometimes.982if (js.HasUnknownPrefix() || js.HasSPrefix())983DISABLE;984} else if (optype == 5 && js.HasDPrefix()) {985DISABLE;986}987988// Vector unary operation989// d[N] = OP(s[N]) (see below)990991int vs = _VS;992int vd = _VD;993VectorSize sz = GetVecSize(op);994int n = GetNumVectorElements(sz);995996if (optype >= 16 && !js.HasNoPrefix()) {997// Many of these apply the D prefix strangely or override parts of the S prefix.998if (js.HasUnknownPrefix() || sz != V_Single)999DISABLE;1000// If it's single, we just need to check the prefixes are within the size.1001if (!IsPrefixWithinSize(js.prefixS, op))1002DISABLE;1003// The negative ones seem to use negate flags as a prefix hack.1004if (optype >= 24 && (js.prefixS & 0x000F0000) != 0)1005DISABLE;1006}10071008// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure1009if (optype == 0 && vs == vd && js.HasNoPrefix()) {1010return;1011}10121013u8 sregs[4]{}, dregs[4]{};1014GetVectorRegsPrefixS(sregs, sz, vs);1015GetVectorRegsPrefixD(dregs, sz, vd);10161017bool usingTemps = false;1018u8 tempregs[4];1019for (int i = 0; i < n; ++i) {1020if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {1021usingTemps = true;1022tempregs[i] = IRVTEMP_0 + i;1023} else {1024tempregs[i] = dregs[i];1025}1026}10271028bool canSIMD = false;1029// Some can be SIMD'd.1030switch (optype) {1031case 0: // vmov1032case 1: // vabs1033case 2: // vneg1034canSIMD = true;1035break;1036}10371038if (canSIMD && !usingTemps) {1039IROp irop = IROp::Nop;1040switch (optype) {1041case 0: // vmov1042irop = IROp::Vec4Mov;1043break;1044case 1: // vabs1045irop = IROp::Vec4Abs;1046break;1047case 2: // vneg1048irop = IROp::Vec4Neg;1049break;1050}1051if (IsVec4(sz, sregs) && IsVec4(sz, dregs) && irop != IROp::Nop) {1052ir.Write(irop, dregs[0], sregs[0]);1053ApplyPrefixD(dregs, sz, vd);1054return;1055} 
else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && irop != IROp::Nop && opts.preferVec4) {1056// This is a simple case of vmov.t, just blend.1057if (irop == IROp::Vec4Mov) {1058ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], sregs[0], 0x7);1059} else {1060ir.Write(irop, IRVTEMP_0, sregs[0]);1061ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);1062}1063ApplyPrefixD(dregs, sz, vd);1064return;1065}1066}10671068for (int i = 0; i < n; ++i) {1069switch (optype) {1070case 0: // d[i] = s[i]; break; //vmov1071// Probably for swizzle.1072if (tempregs[i] != sregs[i])1073ir.Write(IROp::FMov, tempregs[i], sregs[i]);1074break;1075case 1: // d[i] = fabsf(s[i]); break; //vabs1076ir.Write(IROp::FAbs, tempregs[i], sregs[i]);1077break;1078case 2: // d[i] = -s[i]; break; //vneg1079ir.Write(IROp::FNeg, tempregs[i], sregs[i]);1080break;1081case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat01082ir.Write(IROp::FSat0_1, tempregs[i], sregs[i]);1083break;1084case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat11085ir.Write(IROp::FSatMinus1_1, tempregs[i], sregs[i]);1086break;1087case 16: // d[i] = 1.0f / s[i]; break; //vrcp1088ir.Write(IROp::FRecip, tempregs[i], sregs[i]);1089break;1090case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq1091ir.Write(IROp::FRSqrt, tempregs[i], sregs[i]);1092break;1093case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin1094ir.Write(IROp::FSin, tempregs[i], sregs[i]);1095break;1096case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos1097ir.Write(IROp::FCos, tempregs[i], sregs[i]);1098break;1099case 20: // d[i] = powf(2.0f, s[i]); break; //vexp21100DISABLE;1101break;1102case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog21103DISABLE;1104break;1105case 22: // d[i] = sqrtf(s[i]); break; //vsqrt1106ir.Write(IROp::FSqrt, tempregs[i], sregs[i]);1107break;1108case 23: // d[i] = asinf(s[i]) / M_PI_2; break; 
//vasin1109ir.Write(IROp::FAsin, tempregs[i], sregs[i]);1110break;1111case 24: // d[i] = -1.0f / s[i]; break; // vnrcp1112ir.Write(IROp::FRecip, tempregs[i], sregs[i]);1113ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);1114break;1115case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin1116ir.Write(IROp::FSin, tempregs[i], sregs[i]);1117ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);1118break;1119case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp21120DISABLE;1121break;1122default:1123INVALIDOP;1124}1125}1126for (int i = 0; i < n; i++) {1127if (dregs[i] != tempregs[i]) {1128ir.Write(IROp::FMov, dregs[i], tempregs[i]);1129}1130}11311132ApplyPrefixD(dregs, sz, vd);1133}11341135void IRFrontend::Comp_Vi2f(MIPSOpcode op) {1136CONDITIONAL_DISABLE(VFPU_VEC);1137if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {1138DISABLE;1139}11401141// Vector integer to float1142// d[N] = float(S[N]) * mult11431144VectorSize sz = GetVecSize(op);1145int n = GetNumVectorElements(sz);11461147uint8_t imm = (op >> 16) & 0x1f;11481149u8 sregs[4], dregs[4];1150GetVectorRegsPrefixS(sregs, sz, _VS);1151GetVectorRegsPrefixD(dregs, sz, _VD);11521153for (int i = 0; i < n; i++) {1154if (imm == 0)1155ir.Write(IROp::FCvtSW, dregs[i], sregs[i]);1156else1157ir.Write(IROp::FCvtScaledSW, dregs[i], sregs[i], imm);1158}1159ApplyPrefixD(dregs, sz, _VD);1160}11611162void IRFrontend::Comp_Vh2f(MIPSOpcode op) {1163CONDITIONAL_DISABLE(VFPU_VEC);1164if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {1165DISABLE;1166}11671168// Vector expand half to float1169// d[N*2] = float(lowerhalf(s[N])), d[N*2+1] = float(upperhalf(s[N]))11701171DISABLE;1172}11731174void IRFrontend::Comp_Vf2i(MIPSOpcode op) {1175CONDITIONAL_DISABLE(VFPU_VEC);1176if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {1177DISABLE;1178}11791180// Vector float to integer1181// d[N] = int(S[N] * mult)1182// Note: saturates on 
void IRFrontend::Comp_Vf2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	// Bail to the interpreter on unknown prefixes, an S prefix wider than this
	// vector, or any low D prefix bits (the D mask is applied manually below
	// via ApplyPrefixDMask, so other D prefix behavior isn't handled here.)
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {
		DISABLE;
	}

	// Vector float to integer
	// d[N] = int(S[N] * mult)
	// Note: saturates on overflow.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// Scale immediate; 0 selects the unscaled per-rounding-mode ops below.
	uint8_t imm = (op >> 16) & 0x1f;

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Same values as FCR31.
	uint8_t rmode = (op >> 21) & 3;
	if (((op >> 21) & 0x1C) != 0x10)
		INVALIDOP;

	if (imm != 0) {
		// Scaled variant: pack both scale and rounding mode into the IR immediate.
		for (int i = 0; i < n; i++)
			ir.Write(IROp::FCvtScaledWS, dregs[i], sregs[i], imm | (rmode << 6));
	} else {
		for (int i = 0; i < n; i++) {
			// One IR op per hardware rounding mode.
			switch (IRRoundMode(rmode)) {
			case IRRoundMode::RINT_0: // vf2in
				ir.Write(IROp::FRound, dregs[i], sregs[i]);
				break;

			case IRRoundMode::CAST_1: // vf2iz
				ir.Write(IROp::FTrunc, dregs[i], sregs[i]);
				break;

			case IRRoundMode::CEIL_2: // vf2iu
				ir.Write(IROp::FCeil, dregs[i], sregs[i]);
				break;

			case IRRoundMode::FLOOR_3: // vf2id
				ir.Write(IROp::FFloor, dregs[i], sregs[i]);
				break;

			default:
				INVALIDOP;
			}
		}
	}

	// Only the write-mask part of the D prefix is honored (checked above.)
	ApplyPrefixDMask(dregs, sz, _VD);
}
(GetVFPUCtrlMask(imm - 128, &mask)) {1267if (mask != 0xFFFFFFFF) {1268ir.Write(IROp::AndConst, IRTEMP_0, rt, ir.AddConstant(mask));1269ir.Write(IROp::SetCtrlVFPUReg, imm - 128, IRTEMP_0);1270} else {1271ir.Write(IROp::SetCtrlVFPUReg, imm - 128, rt);1272}1273}12741275if (imm - 128 == VFPU_CTRL_SPREFIX) {1276js.prefixSFlag = JitState::PREFIX_UNKNOWN;1277js.blockWrotePrefixes = true;1278} else if (imm - 128 == VFPU_CTRL_TPREFIX) {1279js.prefixTFlag = JitState::PREFIX_UNKNOWN;1280js.blockWrotePrefixes = true;1281} else if (imm - 128 == VFPU_CTRL_DPREFIX) {1282js.prefixDFlag = JitState::PREFIX_UNKNOWN;1283js.blockWrotePrefixes = true;1284}1285} else {1286INVALIDOP;1287}1288break;12891290default:1291INVALIDOP;1292}1293}12941295void IRFrontend::Comp_Vmfvc(MIPSOpcode op) {1296CONDITIONAL_DISABLE(VFPU_XFER);12971298// Vector Move from vector control reg (no prefixes)1299// D[0] = VFPU_CTRL[i]13001301int vd = _VD;1302int imm = (op >> 8) & 0x7F;1303if (imm < VFPU_CTRL_MAX) {1304ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, imm);1305ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[vd], IRTEMP_0);1306} else {1307INVALIDOP;1308}1309}13101311void IRFrontend::Comp_Vmtvc(MIPSOpcode op) {1312CONDITIONAL_DISABLE(VFPU_XFER);13131314// Vector Move to vector control reg (no prefixes)1315// VFPU_CTRL[i] = S[0]13161317int vs = _VS;1318int imm = op & 0xFF;1319if (imm < VFPU_CTRL_MAX) {1320u32 mask;1321if (GetVFPUCtrlMask(imm, &mask)) {1322if (mask != 0xFFFFFFFF) {1323ir.Write(IROp::FMovToGPR, IRTEMP_0, vfpuBase + voffset[imm]);1324ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(mask));1325ir.Write(IROp::SetCtrlVFPUReg, imm, IRTEMP_0);1326} else {1327ir.Write(IROp::SetCtrlVFPUFReg, imm, vfpuBase + voffset[vs]);1328}1329}1330if (imm == VFPU_CTRL_SPREFIX) {1331js.prefixSFlag = JitState::PREFIX_UNKNOWN;1332js.blockWrotePrefixes = true;1333} else if (imm == VFPU_CTRL_TPREFIX) {1334js.prefixTFlag = JitState::PREFIX_UNKNOWN;1335js.blockWrotePrefixes = true;1336} else if (imm == 
void IRFrontend::Comp_Vmmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix move (weird prefixes)
	// D[N,M] = S[N,M]

	int vs = _VS;
	int vd = _VD;
	// This probably ignores prefixes for all sane intents and purposes.
	if (vs == vd) {
		// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
		return;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);

	// Overlapping source/destination matrices aren't handled here; punt to interp.
	switch (GetMatrixOverlap(vs, vd, sz)) {
	case OVERLAP_EQUAL:
		// In-place transpose
		DISABLE;
	case OVERLAP_PARTIAL:
		DISABLE;
	case OVERLAP_NONE:
	default:
		break;
	}
	// Fast path: matching transpose state on a 4x4 lets us copy whole columns.
	if (IsMatrixTransposed(vd) == IsMatrixTransposed(vs) && sz == M_4x4) {
		// Untranspose both matrices
		if (IsMatrixTransposed(vd)) {
			vd = TransposeMatrixReg(vd);
			vs = TransposeMatrixReg(vs);
		}
		// Get the columns
		u8 scols[4], dcols[4];
		GetMatrixColumns(vs, sz, scols);
		GetMatrixColumns(vd, sz, dcols);
		for (int i = 0; i < 4; i++) {
			u8 svec[4], dvec[4];
			GetVectorRegs(svec, GetVectorSize(sz), scols[i]);
			GetVectorRegs(dvec, GetVectorSize(sz), dcols[i]);
			ir.Write(IROp::Vec4Mov, dvec[0], svec[0]);
		}
		return;
	}
	// Slow path: element-by-element copy, skipping already-identical registers.
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			if (dregs[a * 4 + b] != sregs[a * 4 + b])
				ir.Write(IROp::FMov, dregs[a * 4 + b], sregs[a * 4 + b]);
		}
	}
}
void IRFrontend::Comp_Vmscl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix scale, matrix by scalar (weird prefixes)
	// d[N,M] = s[N,M] * t[0]
	// Note: behaves just slightly differently than a series of vscls.

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;

	// Only the 4x4 form is compiled here; smaller sizes go to the interpreter.
	MatrixSize sz = GetMtxSize(op);
	if (sz != M_4x4) {
		DISABLE;
	}
	// Scalar living inside the destination matrix would be clobbered mid-loop.
	if (GetMtx(vt) == GetMtx(vd)) {
		DISABLE;
	}
	int n = GetMatrixSide(sz);

	// The entire matrix is scaled equally, so transpose doesn't matter. Let's normalize.
	if (IsMatrixTransposed(vs) && IsMatrixTransposed(vd)) {
		vs = TransposeMatrixReg(vs);
		vd = TransposeMatrixReg(vd);
	}
	// Mixed transpose state (only one side transposed) is not handled.
	if (IsMatrixTransposed(vs) || IsMatrixTransposed(vd)) {
		DISABLE;
	}

	u8 sregs[16], dregs[16], tregs[1];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);
	GetVectorRegs(tregs, V_Single, vt);

	// One Vec4Scale per row (rows are consecutive after normalization above.)
	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::Vec4Scale, dregs[i * 4], sregs[i * 4], tregs[0]);
	}
}
void IRFrontend::Comp_VScl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector scale, vector by scalar
	// d[N] = s[N] * t[0]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	u8 sregs[4], dregs[4], treg;
	GetVectorRegsPrefixS(sregs, sz, vs);
	// T prefixes handled by interp.
	GetVectorRegs(&treg, V_Single, vt);
	GetVectorRegsPrefixD(dregs, sz, vd);

	bool overlap = false;
	// For prefixes to work, we just have to ensure that none of the output registers spill
	// and that there's no overlap.
	u8 tempregs[4];
	memcpy(tempregs, dregs, sizeof(tempregs));
	for (int i = 0; i < n; ++i) {
		// Conservative, can be improved
		if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) {
			// Need to use temp regs
			tempregs[i] = IRVTEMP_0 + i;
			overlap = true;
		}
	}

	// SIMD fast paths, taken when no temps are needed (or the overlap is benign.)
	if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
		if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
			ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
			ApplyPrefixD(dregs, sz, vd);
			return;
		} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && opts.preferVec4) {
			// Scale into a temp, then blend the low three lanes into d, preserving d's w lane.
			ir.Write(IROp::Vec4Scale, IRVTEMP_0, sregs[0], treg);
			ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
			ApplyPrefixD(dregs, sz, vd);
			return;
		}
	}

	// Scalar fallback: multiply into temps, then copy into the real destinations.
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FMul, tempregs[i], sregs[i], treg);
	}

	for (int i = 0; i < n; i++) {
		// All must be mapped for prefixes to work.
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, vd);
}
// This may or may not be a win when using the IR interpreter...
// Many more instructions to interpret.
void IRFrontend::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	// Matrix multiply (weird prefixes)
	// D[0 .. N, 0 .. M] = S[0 .. N, 0 .. M]' * T[0 .. N, 0 .. M]
	// Note: Behaves as if it's implemented through a series of vdots.
	// Important: this is a matrix multiply with a pre-transposed S.

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	MatrixOverlapType soverlap = GetMatrixOverlap(vs, vd, sz);
	MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz);

	// A very common arrangement. Rearrange to something we can handle.
	if (IsMatrixTransposed(vd)) {
		// Matrix identity says (At * Bt) = (B * A)t
		// D = S * T
		// Dt = (S * T)t = (Tt * St)
		vd = TransposeMatrixReg(vd);
		std::swap(vs, vt);
	}

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(tregs, sz, vt);
	GetMatrixRegs(dregs, sz, vd);

	// Any source/destination overlap would be clobbered mid-computation; punt.
	if (soverlap || toverlap) {
		DISABLE;
	}

	// dregs are always consecutive, thanks to our transpose trick.
	// However, not sure this is always worth it.
	if (IsMatrixVec4(sz, dregs)) {
		// TODO: The interpreter would like proper matrix ops better. Can generate those, and
		// expand them like this as needed on "real" architectures.
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_T;
		if (!IsMatrixVec4(sz, sregs)) {
			// METHOD 1: Handles AbC and Abc
			// Accumulate scaled source columns into s0, one destination column at a time.
			for (int j = 0; j < 4; j++) {
				ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]);
				for (int i = 1; i < 4; i++) {
					ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[j * 4 + i]);
					ir.Write(IROp::Vec4Add, s0, s0, s1);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else if (IsMatrixVec4(sz, tregs)) {
			// METHOD 2: Handles ABC only. Not efficient on CPUs that don't do fast dots.
			// Dots only work if tregs are consecutive.
			// TODO: Skip this and resort to method one and transpose the output?
			for (int j = 0; j < 4; j++) {
				for (int i = 0; i < 4; i++) {
					ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[j * 4]);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else {
			// ABc - s consecutive, t not.
			// Tekken uses this.
			// logBlocks = 1;
		}
	}

	// Fallback. Expands a LOT
	int temp0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			// Scalar dot product of S row b with T column a; the final FAdd
			// writes straight into the destination element.
			ir.Write(IROp::FMul, temp0, sregs[b * 4], tregs[a * 4]);
			for (int c = 1; c < n; c++) {
				ir.Write(IROp::FMul, temp1, sregs[b * 4 + c], tregs[a * 4 + c]);
				ir.Write(IROp::FAdd, (c == n - 1) ? dregs[a * 4 + b] : temp0, temp0, temp1);
			}
		}
	}
}
void IRFrontend::Comp_Vtfm(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Vertex transform, vector by matrix (weird prefixes)
	// d[N] = s[N*m .. N*m + n-1] dot t[0 .. n-1]
	// Homogenous means t[n-1] is treated as 1.
	// Note: this might be implemented as a series of vdots with special prefixes.

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	// Homogenous form: widen the sizes by one; the missing t lane acts as 1.0.
	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz)+1);
		msz = (MatrixSize)((int)(msz)+1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}

	u8 sregs[16], dregs[4], tregs[4];
	GetMatrixRegs(sregs, msz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// SIMD-optimized implementations - if sregs[0..3] is non-consecutive, it's transposed.
	if (msz == M_4x4 && !IsMatrixVec4(msz, sregs)) {
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_S;
		// For this algorithm, we don't care if tregs are consecutive or not,
		// they are accessed one at a time. This handles homogenous transforms correctly, as well.
		// We take advantage of sregs[0] + 1 being sregs[4] here.
		ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
		for (int i = 1; i < 4; i++) {
			if (!homogenous || (i != n - 1)) {
				ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
				ir.Write(IROp::Vec4Add, s0, s0, s1);
			} else {
				// Homogenous last lane: add the column unscaled (implicit t = 1.0).
				ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
			}
		}
		if (IsVec4(sz, dregs)) {
			ir.Write(IROp::Vec4Mov, dregs[0], s0);
		} else {
			for (int i = 0; i < 4; i++) {
				ir.Write(IROp::FMov, dregs[i], s0 + i);
			}
		}
		return;
	} else if (msz == M_4x4 && IsMatrixVec4(msz, sregs) && IsVec4(sz, tregs)) {
		IRReg t = tregs[0];
		if (homogenous) {
			// This is probably even what the hardware basically does, wiring t[3] to 1.0f.
			ir.Write(IROp::Vec4Init, IRVTEMP_PFX_T, (int)Vec4Init::AllONE);
			ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, IRVTEMP_PFX_T, t, 0x7);
			t = IRVTEMP_PFX_T;
		}
		// One dot product per output lane; stage in temps to tolerate overlap.
		for (int i = 0; i < 4; i++)
			ir.Write(IROp::Vec4Dot, IRVTEMP_PFX_D + i, sregs[i * 4], t);
		for (int i = 0; i < 4; i++)
			ir.Write(IROp::FMov, dregs[i], IRVTEMP_PFX_D + i);
		return;
	}

	// TODO: test overlap, optimize.
	u8 tempregs[4];
	int s0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int i = 0; i < n; i++) {
		// Scalar dot of matrix row i with t, accumulated in s0.
		ir.Write(IROp::FMul, s0, sregs[i * 4], tregs[0]);
		for (int k = 1; k < n; k++) {
			if (!homogenous || k != n - 1) {
				ir.Write(IROp::FMul, temp1, sregs[i * 4 + k], tregs[k]);
				ir.Write(IROp::FAdd, s0, s0, temp1);
			} else {
				ir.Write(IROp::FAdd, s0, s0, sregs[i * 4 + k]);
			}
		}
		int temp = IRVTEMP_PFX_T + i;
		ir.Write(IROp::FMov, temp, s0);
		tempregs[i] = temp;
	}
	for (int i = 0; i < n; i++) {
		if (tempregs[i] != dregs[i])
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
	}
}
void IRFrontend::Comp_VCrs(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix() || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector cross (half a cross product, n = 3)
	// d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y]
	// To do a full cross product: vcrs tmp1, s, t; vcrs tmp2 t, s; vsub d, tmp1, tmp2;
	// (or just use vcrsp.)
	// Note: this is possibly just a swizzle prefix hack for vmul.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	if (sz != V_Triple)
		DISABLE;

	u8 sregs[4], dregs[4], tregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
		// Use Vec4 where we can. First, apply shuffles.
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3));
		ir.Write(IROp::Vec4Mul, IRVTEMP_0, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		// Now just retain w and blend in our values.
		ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
	} else {
		// Scalar fallback; route through temps if d overlaps s or t.
		u8 tempregs[4]{};
		if (!IsOverlapSafe(n, dregs, n, sregs, n, tregs)) {
			for (int i = 0; i < n; ++i)
				tempregs[i] = IRVTEMP_0 + i;
		} else {
			for (int i = 0; i < n; ++i)
				tempregs[i] = dregs[i];
		}

		ir.Write(IROp::FMul, tempregs[0], sregs[1], tregs[2]);
		ir.Write(IROp::FMul, tempregs[1], sregs[2], tregs[0]);
		ir.Write(IROp::FMul, tempregs[2], sregs[0], tregs[1]);

		for (int i = 0; i < n; i++) {
			if (tempregs[i] != dregs[i])
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, _VD);
}
void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

	// These instructions pack pairs or quads of integers into 32 bits.
	// The unsigned (u) versions skip the sign bit when packing, first doing a signed clamp to 0 (so the sign bit won't ever be 1).

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		// Quad of ints -> one packed register.
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		// Pair -> one register, quad -> two registers.
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[2], srcregs[4], tempregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(srcregs, sregs, sizeof(sregs));
	memcpy(tempregs, dregs, sizeof(dregs));

	int nOut = GetNumVectorElements(outsize);

	// If src registers aren't contiguous, make them.
	if (!IsVec2(sz, sregs) && !IsVec4(sz, sregs)) {
		// T prefix is unused.
		for (int i = 0; i < GetNumVectorElements(sz); i++) {
			srcregs[i] = IRVTEMP_PFX_T + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	if (bits == 8) {
		if (unsignedOp) { //vi2uc
			// Output is only one register.
			ir.Write(IROp::Vec4ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec4Pack31To8, tempregs[0], IRVTEMP_0);
		} else { //vi2c
			ir.Write(IROp::Vec4Pack32To8, tempregs[0], srcregs[0]);
		}
	} else {
		// bits == 16
		if (unsignedOp) { //vi2us
			// Output is only one register.
			ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec2Pack31To16, tempregs[0], IRVTEMP_0);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0 + 2, srcregs[2]);
				ir.Write(IROp::Vec2Pack31To16, tempregs[1], IRVTEMP_0 + 2);
			}
		} else { //vi2s
			ir.Write(IROp::Vec2Pack32To16, tempregs[0], srcregs[0]);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2Pack32To16, tempregs[1], srcregs[2]);
			}
		}
	}

	// Copy packed results into the real destinations where they differ.
	for (int i = 0; i < nOut; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, outsize, _VD);
}
void IRFrontend::Comp_Vx2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)

	// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
	// at the top. vus2i shifts it an extra bit right afterward.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
	// at the top too. vuc2i is a bit special (see below.)
	// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
	// then use it for both.

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
		sz = V_Single; // For some reason, sz is set to Quad in this case though the outsize is Single.
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[2], dregs[4], tempregs[4], srcregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(tempregs, dregs, sizeof(dregs));
	memcpy(srcregs, sregs, sizeof(sregs));

	// Remap source regs to be consecutive. This is not required
	// but helpful when implementations can join two Vec2Expand.
	if (sz == V_Pair && !IsConsecutive2(srcregs)) {
		for (int i = 0; i < 2; i++) {
			srcregs[i] = IRVTEMP_0 + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	int nIn = GetNumVectorElements(sz);

	int nOut = 2;
	if (outsize == V_Quad)
		nOut = 4;

	// Remap dest regs. PFX_T is unused.
	if (outsize == V_Pair) {
		bool consecutive = IsConsecutive2(dregs);
		// We must have them consecutive, so all temps, or none.
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	} else if (outsize == V_Quad) {
		bool consecutive = IsVec4(outsize, dregs);
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	}

	if (bits == 16) {
		// Each Vec2Unpack expands one source register into two destination lanes.
		if (unsignedOp) {
			ir.Write(IROp::Vec2Unpack16To31, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To31, tempregs[2], srcregs[1]);
		} else {
			ir.Write(IROp::Vec2Unpack16To32, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To32, tempregs[2], srcregs[1]);
		}
	} else if (bits == 8) {
		if (unsignedOp) {
			// See the interpreter, this one is odd. Hardware bug?
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
			ir.Write(IROp::Vec4DuplicateUpperBitsAndShift1, tempregs[0], tempregs[0]);
		} else {
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
		}
	}

	// Copy unpacked results into the real destinations where they differ.
	for (int i = 0; i < nOut; i++) {
		if (tempregs[i] != dregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}
	ApplyPrefixD(dregs, outsize, _VD);
}
void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (!js.HasNoPrefix())
		DISABLE;

	// Vector cross product (n = 3, weird prefixes)
	// d[0 .. 2] = s[0 .. 2] X t[0 .. 2]
	// Vector quaternion product (n = 4, weird prefixes)
	// d[0 .. 2] = t[0 .. 2] X s[0 .. 2] + s[3] * t[0 .. 2] + t[3] * s[0 .. 2]
	// d[3] = s[3]*t[3] - s[0 .. 2] dot t[0 .. 3]
	// Note: Behaves as if it's implemented through a series of vdots.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegs(sregs, sz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	if (sz == V_Triple) {
		// Route through temps when the destination overlaps either source.
		u8 tempregs[4]{};
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
				tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things
			} else {
				tempregs[i] = dregs[i];
			}
		}

		int temp0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		// Compute X
		ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]);
		ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]);
		ir.Write(IROp::FSub, tempregs[0], temp0, temp1);

		// Compute Y
		ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]);
		ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]);
		ir.Write(IROp::FSub, tempregs[1], temp0, temp1);

		// Compute Z
		ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]);
		ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]);
		ir.Write(IROp::FSub, tempregs[2], temp0, temp1);

		for (int i = 0; i < n; i++) {
			if (tempregs[i] != dregs[i])
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	} else if (sz == V_Quad) {
		// Rather than using vdots, we organize this as SIMD multiplies and adds.
		// That means flipping the logic column-wise. Also, luckily no prefix temps used.
		if (!IsConsecutive4(sregs) || !IsConsecutive4(tregs) || !IsConsecutive4(dregs)) {
			DISABLE;
		}

		// shuffleImm packs four 2-bit lane selectors; blendConst packs a 4-bit lane mask.
		auto shuffleImm = [](int x, int y, int z, int w) { return x | (y << 2) | (z << 4) | (w << 6); };
		auto blendConst = [](int x, int y, int z, int w) { return x | (y << 1) | (z << 2) | (w << 3); };

		// Prepare some negatives.
		ir.Write(IROp::Vec4Neg, IRVTEMP_0, tregs[0]);

		// tmp = S[x,x,x,x] * T[w,-z,y,-x]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(1, 0, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(3, 2, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(0, 0, 0, 0));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_D, IRVTEMP_PFX_S, IRVTEMP_PFX_T);

		// tmp += S[y,y,y,y] * T[z,w,-x,-y]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(1, 1, 0, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(2, 3, 0, 1));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(1, 1, 1, 1));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);

		// tmp += S[z,z,z,z] * T[-y,x,w,-z]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(0, 1, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(1, 0, 3, 2));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(2, 2, 2, 2));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);

		// tmp += S[w,w,w,w] * T[x,y,z,w]
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(3, 3, 3, 3));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, tregs[0]);
		ir.Write(IROp::Vec4Add, dregs[0], IRVTEMP_PFX_D, IRVTEMP_PFX_S);
	} else {
		INVALIDOP;
	}
}
IRFrontend::Comp_Vcmp(MIPSOpcode op) {2070CONDITIONAL_DISABLE(VFPU_COMP);2071if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {2072DISABLE;2073}20742075// Vector compare2076// VFPU_CC[N] = COMPARE(s[N], t[N])20772078VectorSize sz = GetVecSize(op);2079int n = GetNumVectorElements(sz);20802081u8 sregs[4], tregs[4];2082GetVectorRegsPrefixS(sregs, sz, _VS);2083GetVectorRegsPrefixT(tregs, sz, _VT);20842085int cond = op & 0xF;2086int mask = 0;2087for (int i = 0; i < n; i++) {2088ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]);2089mask |= (1 << i);2090}2091ir.Write(IROp::FCmpVfpuAggregate, mask);2092}20932094void IRFrontend::Comp_Vcmov(MIPSOpcode op) {2095CONDITIONAL_DISABLE(VFPU_COMP);2096if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {2097DISABLE;2098}20992100// Vector conditional move2101// imm3 >= 6: d[N] = VFPU_CC[N] == tf ? s[N] : d[N]2102// imm3 < 6: d[N] = VFPU_CC[imm3] == tf ? s[N] : d[N]21032104VectorSize sz = GetVecSize(op);2105int n = GetNumVectorElements(sz);21062107u8 sregs[4], dregs[4];2108GetVectorRegsPrefixS(sregs, sz, _VS);2109GetVectorRegsPrefixD(dregs, sz, _VD);2110int tf = (op >> 19) & 1;2111int imm3 = (op >> 16) & 7;21122113if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {2114// TODO: Could do a VfpuCC variant of Vec4Blend.2115}21162117for (int i = 0; i < n; ++i) {2118// Simplification: Disable if overlap unsafe2119if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {2120DISABLE;2121}2122}2123if (imm3 < 6) {2124// Test one bit of CC. 
This bit decides whether none or all subregisters are copied.2125for (int i = 0; i < n; i++) {2126ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (imm3) | ((!tf) << 7));2127}2128} else {2129// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.2130for (int i = 0; i < n; i++) {2131ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7));2132}2133}2134ApplyPrefixD(dregs, sz, _VD);2135}21362137void IRFrontend::Comp_Viim(MIPSOpcode op) {2138CONDITIONAL_DISABLE(VFPU_XFER);2139if (js.HasUnknownPrefix())2140DISABLE;21412142// Vector integer immediate2143// d[0] = float(imm)21442145s32 imm = SignExtend16ToS32(op);2146u8 dreg;2147GetVectorRegsPrefixD(&dreg, V_Single, _VT);2148ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm));2149ApplyPrefixD(&dreg, V_Single, _VT);2150}21512152void IRFrontend::Comp_Vfim(MIPSOpcode op) {2153CONDITIONAL_DISABLE(VFPU_XFER);2154if (js.HasUnknownPrefix())2155DISABLE;21562157// Vector half-float immediate2158// d[0] = float(imm)21592160FP16 half;2161half.u = op & 0xFFFF;2162FP32 fval = half_to_float_fast5(half);21632164u8 dreg;2165GetVectorRegsPrefixD(&dreg, V_Single, _VT);2166ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f));2167ApplyPrefixD(&dreg, V_Single, _VT);2168}21692170void IRFrontend::Comp_Vcst(MIPSOpcode op) {2171CONDITIONAL_DISABLE(VFPU_XFER);2172if (js.HasUnknownPrefix())2173DISABLE;21742175// Vector constant2176// d[N] = CONST21772178int conNum = (op >> 16) & 0x1f;2179int vd = _VD;21802181VectorSize sz = GetVecSize(op);2182int n = GetNumVectorElements(sz);21832184u8 dregs[4];2185GetVectorRegsPrefixD(dregs, sz, vd);21862187if (IsVec4(sz, dregs)) {2188ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));2189ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0);2190} else if (IsVec3of4(sz, dregs) && opts.preferVec4) {2191ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));2192ir.Write(IROp::Vec4Shuffle, 
IRVTEMP_0, IRVTEMP_0, 0);2193ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);2194} else {2195for (int i = 0; i < n; i++) {2196// Most of the time, materializing a float is slower than copying from another float.2197if (i == 0)2198ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));2199else2200ir.Write(IROp::FMov, dregs[i], dregs[0]);2201}2202}2203ApplyPrefixD(dregs, sz, vd);2204}22052206// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of2207// calling the math library.2208void IRFrontend::Comp_VRot(MIPSOpcode op) {2209CONDITIONAL_DISABLE(VFPU_VEC);2210if (!js.HasNoPrefix()) {2211// Prefixes work strangely for this:2212// * They never apply to cos (whether d or s prefixes.)2213// * They mostly apply to sin/0, e.g. 0:1, M, or |x|.2214DISABLE;2215}22162217// Vector rotation matrix (weird prefixes)2218// d[N] = SINCOSVAL(s[0], imm[N])2219// The imm selects: cos index, sin index, 0 or sin for others, sin sign flip.22202221int vd = _VD;2222int vs = _VS;2223int imm = (op >> 16) & 0x1f;2224VectorSize sz = GetVecSize(op);2225int n = GetNumVectorElements(sz);2226int sineLane = (imm >> 2) & 3;2227int cosineLane = imm & 3;2228bool negSin = (imm & 0x10) ? 
true : false;2229bool broadcastSine = sineLane == cosineLane;22302231char d[4] = { '0', '0', '0', '0' };2232if (broadcastSine) {2233for (int i = 0; i < 4; i++)2234d[i] = 's';2235}2236d[sineLane] = 's';2237d[cosineLane] = 'c';22382239u8 dregs[4];2240GetVectorRegs(dregs, sz, vd);2241u8 sreg[1];2242GetVectorRegs(sreg, V_Single, vs);22432244// If there's overlap, sin is calculated without it, but cosine uses the result.2245// This corresponds with prefix handling, where cosine doesn't get in prefixes.2246if (broadcastSine || !IsOverlapSafe(n, dregs, 1, sreg)) {2247ir.Write(IROp::FSin, IRVTEMP_0, sreg[0]);2248if (negSin)2249ir.Write(IROp::FNeg, IRVTEMP_0, IRVTEMP_0);2250}22512252for (int i = 0; i < n; i++) {2253switch (d[i]) {2254case '0':2255ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(0.0f));2256break;2257case 's':2258if (broadcastSine || !IsOverlapSafe(n, dregs, 1, sreg)) {2259ir.Write(IROp::FMov, dregs[i], IRVTEMP_0);2260} else {2261ir.Write(IROp::FSin, dregs[i], sreg[0]);2262if (negSin) {2263ir.Write(IROp::FNeg, dregs[i], dregs[i]);2264}2265}2266break;2267case 'c':2268if (IsOverlapSafe(n, dregs, 1, sreg))2269ir.Write(IROp::FCos, dregs[i], sreg[0]);2270else if (dregs[sineLane] == sreg[0])2271ir.Write(IROp::FCos, dregs[i], IRVTEMP_0);2272else2273ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(1.0f));2274break;2275}2276}2277}22782279void IRFrontend::Comp_Vsgn(MIPSOpcode op) {2280CONDITIONAL_DISABLE(VFPU_VEC);2281if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {2282DISABLE;2283}22842285// Vector extract sign2286// d[N] = signum(s[N])22872288VectorSize sz = GetVecSize(op);2289int n = GetNumVectorElements(sz);22902291u8 sregs[4], dregs[4];2292GetVectorRegsPrefixS(sregs, sz, _VS);2293GetVectorRegsPrefixD(dregs, sz, _VD);22942295u8 tempregs[4];2296for (int i = 0; i < n; ++i) {2297if (!IsOverlapSafe(dregs[i], n, sregs)) {2298tempregs[i] = IRTEMP_0 + i;2299} else {2300tempregs[i] = dregs[i];2301}2302}23032304for (int i 
= 0; i < n; ++i) {2305ir.Write(IROp::FSign, tempregs[i], sregs[i]);2306}23072308for (int i = 0; i < n; ++i) {2309if (dregs[i] != tempregs[i]) {2310ir.Write(IROp::FMov, dregs[i], tempregs[i]);2311}2312}23132314ApplyPrefixD(dregs, sz, _VD);2315}23162317void IRFrontend::Comp_Vocp(MIPSOpcode op) {2318CONDITIONAL_DISABLE(VFPU_VEC);2319if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {2320DISABLE;2321}23222323// Vector one's complement2324// d[N] = 1.0 - s[N]23252326VectorSize sz = GetVecSize(op);2327int n = GetNumVectorElements(sz);23282329// This is a hack that modifies prefixes. We eat them later, so just overwrite.2330// S prefix forces the negate flags.2331js.prefixS |= 0x000F0000;2332// T prefix forces constants on and regnum to 1.2333// That means negate still works, and abs activates a different constant.2334js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;23352336u8 sregs[4], tregs[4], dregs[4];2337GetVectorRegsPrefixS(sregs, sz, _VS);2338// There's no bits for t, so just reuse s. 
It'll be constants only.2339GetVectorRegsPrefixT(tregs, sz, _VS);2340GetVectorRegsPrefixD(dregs, sz, _VD);23412342if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {2343ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]);2344} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {2345ir.Write(IROp::Vec4Add, IRVTEMP_0, tregs[0], sregs[0]);2346ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);2347} else {2348u8 tempregs[4];2349for (int i = 0; i < n; ++i) {2350if (!IsOverlapSafe(dregs[i], n, sregs)) {2351tempregs[i] = IRVTEMP_0 + i;2352} else {2353tempregs[i] = dregs[i];2354}2355}23562357for (int i = 0; i < n; ++i) {2358ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);2359}2360for (int i = 0; i < n; ++i) {2361if (dregs[i] != tempregs[i]) {2362ir.Write(IROp::FMov, dregs[i], tempregs[i]);2363}2364}2365}23662367ApplyPrefixD(dregs, sz, _VD);2368}23692370void IRFrontend::Comp_ColorConv(MIPSOpcode op) {2371CONDITIONAL_DISABLE(VFPU_VEC);2372if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {2373DISABLE;2374}23752376// Vector color conversion2377// d[N] = ConvertTo16(s[N*2]) | (ConvertTo16(s[N*2+1]) << 16)23782379DISABLE;2380}23812382void IRFrontend::Comp_Vbfy(MIPSOpcode op) {2383CONDITIONAL_DISABLE(VFPU_VEC);2384if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {2385DISABLE;2386}23872388// Vector butterfly operation2389// vbfy2: d[0] = s[0] + s[2], d[1] = s[1] + s[3], d[2] = s[0] - s[2], d[3] = s[1] - s[3]2390// vbfy1: d[N*2] = s[N*2] + s[N*2+1], d[N*2+1] = s[N*2] - s[N*2+1]23912392VectorSize sz = GetVecSize(op);2393int n = GetNumVectorElements(sz);2394if (n != 2 && n != 4) {2395// Bad instructions2396INVALIDOP;2397}23982399u8 sregs[4], dregs[4];2400GetVectorRegsPrefixS(sregs, sz, _VS);2401GetVectorRegsPrefixD(dregs, sz, _VD);24022403u8 tempregs[4];2404for (int i = 0; i < n; ++i) 
{2405if (!IsOverlapSafe(dregs[i], n, sregs)) {2406tempregs[i] = IRVTEMP_0 + i;2407} else {2408tempregs[i] = dregs[i];2409}2410}24112412int subop = (op >> 16) & 0x1F;2413if (subop == 3 && n == 4) {2414// vbfy22415ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[2]);2416ir.Write(IROp::FAdd, tempregs[1], sregs[1], sregs[3]);2417ir.Write(IROp::FSub, tempregs[2], sregs[0], sregs[2]);2418ir.Write(IROp::FSub, tempregs[3], sregs[1], sregs[3]);2419} else if (subop == 2) {2420// vbfy12421ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[1]);2422ir.Write(IROp::FSub, tempregs[1], sregs[0], sregs[1]);2423if (n == 4) {2424ir.Write(IROp::FAdd, tempregs[2], sregs[2], sregs[3]);2425ir.Write(IROp::FSub, tempregs[3], sregs[2], sregs[3]);2426}2427} else {2428INVALIDOP;2429}24302431for (int i = 0; i < n; ++i) {2432if (tempregs[i] != dregs[i])2433ir.Write(IROp::FMov, dregs[i], tempregs[i]);2434}24352436ApplyPrefixD(dregs, sz, _VD);2437}2438}243924402441