CoCalc -- IRInterpreter.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/IR/IRInterpreter.cpp
Views: ¹⁴⁰¹
1
#include <algorithm>
2
#include <cmath>
3

4
#include "ppsspp_config.h"
5
#include "Common/BitSet.h"
6
#include "Common/BitScan.h"
7
#include "Common/Common.h"
8
#include "Common/Data/Convert/SmallDataConvert.h"
9
#include "Common/Math/math_util.h"
10

11
#ifdef _M_SSE
12
#include <emmintrin.h>
13
#endif
14

15
#if PPSSPP_ARCH(ARM_NEON)
16
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
17
#include <arm64_neon.h>
18
#else
19
#include <arm_neon.h>
20
#endif
21
#endif
22

23
#include "Core/Core.h"
24
#include "Core/CoreTiming.h"
25
#include "Core/Debugger/Breakpoints.h"
26
#include "Core/HLE/HLE.h"
27
#include "Core/HLE/ReplaceTables.h"
28
#include "Core/MemMap.h"
29
#include "Core/MIPS/MIPS.h"
30
#include "Core/MIPS/MIPSTables.h"
31
#include "Core/MIPS/MIPSVFPUUtils.h"
32
#include "Core/MIPS/IR/IRInst.h"
33
#include "Core/MIPS/IR/IRInterpreter.h"
34
#include "Core/System.h"
35
#include "Core/MIPS/MIPSTracer.h"
36

37
#ifdef mips
// Some MIPS compilers predefine the very generic name `mips` as a macro, which would rewrite
// every `mips` identifier in this file (e.g. the MIPSState *mips parameters below).
// Drop that expansion, then re-define the name as itself so it still counts as "defined"
// for any #ifdef checks while no longer changing the identifier.
#undef mips
#define mips mips
#endif
42

43
// Constant 4-float vectors copied into the register file by IROp::Vec4Init, indexed by the
// instruction's src1 field. 16-byte aligned so a whole row can be moved as one vector.
// Only seven rows are spelled out; the eighth is implicitly zero-initialized.
alignas(16) static const float vec4InitValues[8][4] = {
	{ 0.0f, 0.0f, 0.0f, 0.0f },     // all zero
	{ 1.0f, 1.0f, 1.0f, 1.0f },     // all one
	{ -1.0f, -1.0f, -1.0f, -1.0f }, // all minus one
	{ 1.0f, 0.0f, 0.0f, 0.0f },     // unit vector, lane 0
	{ 0.0f, 1.0f, 0.0f, 0.0f },     // unit vector, lane 1
	{ 0.0f, 0.0f, 1.0f, 0.0f },     // unit vector, lane 2
	{ 0.0f, 0.0f, 0.0f, 1.0f },     // unit vector, lane 3
};
52

53
// Four lanes with only the IEEE-754 single-precision sign bit set. XORing a vector with this
// flips the sign of every lane (used by the SSE path of IROp::Vec4Neg).
alignas(16) static const uint32_t signBits[4] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
};
56

57
// Four lanes with every bit except the sign bit set. ANDing a vector with this clears the
// sign of every lane (used by the SSE path of IROp::Vec4Abs).
alignas(16) static const uint32_t noSignMask[4] = {
	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
};
60

61
// Four lanes that keep only the lowest byte of each 32-bit element when ANDed with a vector.
// NOTE(review): the user of this mask is not in the portion of the file reviewed here.
alignas(16) static const uint32_t lowBytesMask[4] = {
	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
};
64

65
// Handles an execution breakpoint at `pc`.
//
// Returns 0 when the breakpoint is skipped (the "skip first" address matches either `pc` or the
// current MIPS pc), or when coreState is still CORE_RUNNING after the breakpoint was executed.
// Returns 1 when coreState was already neither CORE_RUNNING nor CORE_NEXTFRAME on entry, or when
// it is no longer CORE_RUNNING afterwards.
// NOTE(review): presumably a nonzero result tells the caller to stop running the block - confirm
// against the call site.
u32 IRRunBreakpoint(u32 pc) {
	// Should we skip this breakpoint?
	const uint32_t skipFirst = CBreakPoints::CheckSkipFirst();
	if (skipFirst == pc || skipFirst == currentMIPS->pc)
		return 0;

	// Did we already hit one?
	if (coreState != CORE_RUNNING && coreState != CORE_NEXTFRAME)
		return 1;

	CBreakPoints::ExecBreakPoint(pc);
	return coreState != CORE_RUNNING ? 1 : 0;
}
78

79
// Handles a memory check for an access to `addr` made by the instruction at `pc`.
//
// Returns 0 when the check is skipped (the "skip first" address matches either `pc` or the
// current MIPS pc), or when coreState is still CORE_RUNNING after the memcheck was executed.
// Returns 1 when coreState was already neither CORE_RUNNING nor CORE_NEXTFRAME on entry, or when
// it is no longer CORE_RUNNING afterwards.
// NOTE(review): presumably a nonzero result tells the caller to stop running the block - confirm
// against the call site.
u32 IRRunMemCheck(u32 pc, u32 addr) {
	// Should we skip this memory check?
	const uint32_t skipFirst = CBreakPoints::CheckSkipFirst();
	if (skipFirst == pc || skipFirst == currentMIPS->pc)
		return 0;

	// Did we already hit one?
	if (coreState != CORE_RUNNING && coreState != CORE_NEXTFRAME)
		return 1;

	// Note the argument order: address first, then pc.
	CBreakPoints::ExecOpMemCheck(addr, pc);
	return coreState != CORE_RUNNING ? 1 : 0;
}
92

93
// Applies the guest's FPU rounding mode and flush-to-zero setting to the host FPU.
//
// Reads mips->fcr31: bits 0-1 select the rounding mode and bit 24 requests flush-to-zero.
// When all of those bits are clear, the host state is left untouched. On hosts that are neither
// SSE2 nor (non-Windows) ARM64, nothing is changed.
void IRApplyRounding(MIPSState *mips) {
	u32 fcr31Bits = mips->fcr31 & 0x01000003;
	// If these are 0, we just leave things as they are.
	if (fcr31Bits) {
		int rmode = fcr31Bits & 3;
		bool ftz = (fcr31Bits & 0x01000000) != 0;
#if PPSSPP_ARCH(SSE2)
		// Start from the current MXCSR with the rounding-control field (bits 13-14) cleared.
		u32 csr = _mm_getcsr() & ~0x6000;
		// Translate the rounding mode bits to X86, the same way as in Asm.cpp.
		// Modes 0 and 2 are kept as they are; modes 1 and 3 trade places.
		if (rmode & 1) {
			rmode ^= 2;
		}
		csr |= rmode << 13;

		if (ftz) {
			// Flush to zero
			csr |= 0x8000;
		}
		_mm_setcsr(csr);
#elif PPSSPP_ARCH(ARM64) && !PPSSPP_PLATFORM(WINDOWS)
		// On ARM64 we need to use inline assembly for a portable solution.
		// Unfortunately we don't have this possibility on Windows with MSVC, so ifdeffed out above.
		// Note that in the JIT, for fcvts, we use specific conversions. We could use the FCVTS variants
		// directly through inline assembly.
		u64 fpcr;  // not really 64-bit, just to match the register size.
		asm volatile ("mrs %0, fpcr" : "=r" (fpcr));

		// Translate MIPS to ARM rounding mode
		static const u8 lookup[4] = {0, 3, 1, 2};

		fpcr &= ~(3 << 22);    // Clear bits [23:22]
		fpcr |= (lookup[rmode] << 22);

		if (ftz) {
			// Bit 24 is the flush-to-zero bit.
			fpcr |= 1 << 24;
		}
		// Write back the modified FPCR
		asm volatile ("msr fpcr, %0" : : "r" (fpcr));
#endif
	}
}
134

135
// Undoes IRApplyRounding by clearing the host's rounding-mode and flush-to-zero bits, which
// leaves the rounding field at 0 and flush-to-zero off. On hosts that are neither SSE2 nor
// (non-Windows) ARM64, this does nothing.
void IRRestoreRounding() {
#if PPSSPP_ARCH(SSE2)
	// TODO: We should avoid this if we didn't apply rounding in the first place.
	// In the meantime, clear out FTZ and rounding mode bits.
	u32 csr = _mm_getcsr();
	csr &= ~(7 << 13);     // Clear bits [14:13] for rounding, 15 for FTZ
	_mm_setcsr(csr);
#elif PPSSPP_ARCH(ARM64) && !PPSSPP_PLATFORM(WINDOWS)
	u64 fpcr;  // not really 64-bit, just to match the register size.
	asm volatile ("mrs %0, fpcr" : "=r" (fpcr));
	fpcr &= ~(7 << 22);    // Clear bits [23:22] for rounding, 24 for FTZ
	// Write back the modified FPCR
	asm volatile ("msr fpcr, %0" : : "r" (fpcr));
#endif
}
150

151
// We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64.
152
u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
153
	while (true) {
154
		switch (inst->op) {
155
		case IROp::SetConst:
156
			mips->r[inst->dest] = inst->constant;
157
			break;
158
		case IROp::SetConstF:
159
			memcpy(&mips->f[inst->dest], &inst->constant, 4);
160
			break;
161
		case IROp::Add:
162
			mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2];
163
			break;
164
		case IROp::Sub:
165
			mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2];
166
			break;
167
		case IROp::And:
168
			mips->r[inst->dest] = mips->r[inst->src1] & mips->r[inst->src2];
169
			break;
170
		case IROp::Or:
171
			mips->r[inst->dest] = mips->r[inst->src1] | mips->r[inst->src2];
172
			break;
173
		case IROp::Xor:
174
			mips->r[inst->dest] = mips->r[inst->src1] ^ mips->r[inst->src2];
175
			break;
176
		case IROp::Mov:
177
			mips->r[inst->dest] = mips->r[inst->src1];
178
			break;
179
		case IROp::AddConst:
180
			mips->r[inst->dest] = mips->r[inst->src1] + inst->constant;
181
			break;
182
		case IROp::OptAddConst:  // For this one, it's worth having a "unary" variant of the above that only needs to read one register param.
183
			mips->r[inst->dest] += inst->constant;
184
			break;
185
		case IROp::SubConst:
186
			mips->r[inst->dest] = mips->r[inst->src1] - inst->constant;
187
			break;
188
		case IROp::AndConst:
189
			mips->r[inst->dest] = mips->r[inst->src1] & inst->constant;
190
			break;
191
		case IROp::OptAndConst:  // For this one, it's worth having a "unary" variant of the above that only needs to read one register param.
192
			mips->r[inst->dest] &= inst->constant;
193
			break;
194
		case IROp::OrConst:
195
			mips->r[inst->dest] = mips->r[inst->src1] | inst->constant;
196
			break;
197
		case IROp::OptOrConst:
198
			mips->r[inst->dest] |= inst->constant;
199
			break;
200
		case IROp::XorConst:
201
			mips->r[inst->dest] = mips->r[inst->src1] ^ inst->constant;
202
			break;
203
		case IROp::Neg:
204
			mips->r[inst->dest] = (u32)(-(s32)mips->r[inst->src1]);
205
			break;
206
		case IROp::Not:
207
			mips->r[inst->dest] = ~mips->r[inst->src1];
208
			break;
209
		case IROp::Ext8to32:
210
			mips->r[inst->dest] = SignExtend8ToU32(mips->r[inst->src1]);
211
			break;
212
		case IROp::Ext16to32:
213
			mips->r[inst->dest] = SignExtend16ToU32(mips->r[inst->src1]);
214
			break;
215
		case IROp::ReverseBits:
216
			mips->r[inst->dest] = ReverseBits32(mips->r[inst->src1]);
217
			break;
218

219
		case IROp::Load8:
220
			mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + inst->constant);
221
			break;
222
		case IROp::Load8Ext:
223
			mips->r[inst->dest] = SignExtend8ToU32(Memory::ReadUnchecked_U8(mips->r[inst->src1] + inst->constant));
224
			break;
225
		case IROp::Load16:
226
			mips->r[inst->dest] = Memory::ReadUnchecked_U16(mips->r[inst->src1] + inst->constant);
227
			break;
228
		case IROp::Load16Ext:
229
			mips->r[inst->dest] = SignExtend16ToU32(Memory::ReadUnchecked_U16(mips->r[inst->src1] + inst->constant));
230
			break;
231
		case IROp::Load32:
232
			mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + inst->constant);
233
			break;
234
		case IROp::Load32Left:
235
		{
236
			u32 addr = mips->r[inst->src1] + inst->constant;
237
			u32 shift = (addr & 3) * 8;
238
			u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);
239
			u32 destMask = 0x00ffffff >> shift;
240
			mips->r[inst->dest] = (mips->r[inst->dest] & destMask) | (mem << (24 - shift));
241
			break;
242
		}
243
		case IROp::Load32Right:
244
		{
245
			u32 addr = mips->r[inst->src1] + inst->constant;
246
			u32 shift = (addr & 3) * 8;
247
			u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);
248
			u32 destMask = 0xffffff00 << (24 - shift);
249
			mips->r[inst->dest] = (mips->r[inst->dest] & destMask) | (mem >> shift);
250
			break;
251
		}
252
		case IROp::Load32Linked:
253
			if (inst->dest != MIPS_REG_ZERO)
254
				mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + inst->constant);
255
			mips->llBit = 1;
256
			break;
257
		case IROp::LoadFloat:
258
			mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + inst->constant);
259
			break;
260

261
		case IROp::Store8:
262
			Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);
263
			break;
264
		case IROp::Store16:
265
			Memory::WriteUnchecked_U16(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);
266
			break;
267
		case IROp::Store32:
268
			Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);
269
			break;
270
		case IROp::Store32Left:
271
		{
272
			u32 addr = mips->r[inst->src1] + inst->constant;
273
			u32 shift = (addr & 3) * 8;
274
			u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);
275
			u32 memMask = 0xffffff00 << shift;
276
			u32 result = (mips->r[inst->src3] >> (24 - shift)) | (mem & memMask);
277
			Memory::WriteUnchecked_U32(result, addr & 0xfffffffc);
278
			break;
279
		}
280
		case IROp::Store32Right:
281
		{
282
			u32 addr = mips->r[inst->src1] + inst->constant;
283
			u32 shift = (addr & 3) * 8;
284
			u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);
285
			u32 memMask = 0x00ffffff >> (24 - shift);
286
			u32 result = (mips->r[inst->src3] << shift) | (mem & memMask);
287
			Memory::WriteUnchecked_U32(result, addr & 0xfffffffc);
288
			break;
289
		}
290
		case IROp::Store32Conditional:
291
			if (mips->llBit) {
292
				Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);
293
				if (inst->dest != MIPS_REG_ZERO) {
294
					mips->r[inst->dest] = 1;
295
				}
296
			} else if (inst->dest != MIPS_REG_ZERO) {
297
				mips->r[inst->dest] = 0;
298
			}
299
			break;
300
		case IROp::StoreFloat:
301
			Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + inst->constant);
302
			break;
303

304
		case IROp::LoadVec4:
305
		{
306
			u32 base = mips->r[inst->src1] + inst->constant;
307
			// This compiles to a nice SSE load/store on x86, and hopefully similar on ARM.
308
			memcpy(&mips->f[inst->dest], Memory::GetPointerUnchecked(base), 4 * 4);
309
			break;
310
		}
311
		case IROp::StoreVec4:
312
		{
313
			u32 base = mips->r[inst->src1] + inst->constant;
314
			memcpy((float *)Memory::GetPointerUnchecked(base), &mips->f[inst->dest], 4 * 4);
315
			break;
316
		}
317

318
		case IROp::Vec4Init:
319
		{
320
			memcpy(&mips->f[inst->dest], vec4InitValues[inst->src1], 4 * sizeof(float));
321
			break;
322
		}
323

324
		case IROp::Vec4Shuffle:
325
		{
326
			// Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though,
327
			// or a big switch - there are only 256 shuffles possible (4^4)
328
			float temp[4];
329
			for (int i = 0; i < 4; i++)
330
				temp[i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)];
331
			const int dest = inst->dest;
332
			for (int i = 0; i < 4; i++)
333
				mips->f[dest + i] = temp[i];
334
			break;
335
		}
336

337
		case IROp::Vec4Blend:
338
		{
339
			const int dest = inst->dest;
340
			const int src1 = inst->src1;
341
			const int src2 = inst->src2;
342
			const int constant = inst->constant;
343
			// 90% of calls to this is inst->constant == 7 or inst->constant == 8. Some are 1 and 4, others very rare.
344
			// Could use _mm_blendv_ps (SSE4+BMI), vbslq_f32 (ARM), __riscv_vmerge_vvm (RISC-V)
345
			for (int i = 0; i < 4; i++)
346
				mips->f[dest + i] = ((constant >> i) & 1) ? mips->f[src2 + i] : mips->f[src1 + i];
347
			break;
348
		}
349

350
		case IROp::Vec4Mov:
351
		{
352
#if defined(_M_SSE)
353
			_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
354
#elif PPSSPP_ARCH(ARM_NEON)
355
			vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
356
#else
357
			memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
358
#endif
359
			break;
360
		}
361

362
		case IROp::Vec4Add:
363
		{
364
#if defined(_M_SSE)
365
			_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
366
#elif PPSSPP_ARCH(ARM_NEON)
367
			vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
368
#else
369
			for (int i = 0; i < 4; i++)
370
				mips->f[inst->dest + i] = mips->f[inst->src1 + i] + mips->f[inst->src2 + i];
371
#endif
372
			break;
373
		}
374

375
		case IROp::Vec4Sub:
376
		{
377
#if defined(_M_SSE)
378
			_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
379
#elif PPSSPP_ARCH(ARM_NEON)
380
			vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
381
#else
382
			for (int i = 0; i < 4; i++)
383
				mips->f[inst->dest + i] = mips->f[inst->src1 + i] - mips->f[inst->src2 + i];
384
#endif
385
			break;
386
		}
387

388
		case IROp::Vec4Mul:
389
		{
390
#if defined(_M_SSE)
391
			_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
392
#elif PPSSPP_ARCH(ARM_NEON)
393
			vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
394
#else
395
			for (int i = 0; i < 4; i++)
396
				mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i];
397
#endif
398
			break;
399
		}
400

401
		case IROp::Vec4Div:
402
		{
403
#if defined(_M_SSE)
404
			_mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
405
#elif PPSSPP_ARCH(ARM64_NEON)
406
			vst1q_f32(&mips->f[inst->dest], vdivq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
407
#else
408
			for (int i = 0; i < 4; i++)
409
				mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i];
410
#endif
411
			break;
412
		}
413

414
		case IROp::Vec4Scale:
415
		{
416
#if defined(_M_SSE)
417
			_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2])));
418
#elif PPSSPP_ARCH(ARM_NEON)
419
			vst1q_f32(&mips->f[inst->dest], vmulq_lane_f32(vld1q_f32(&mips->f[inst->src1]), vdup_n_f32(mips->f[inst->src2]), 0));
420
#else
421
			const float factor = mips->f[inst->src2];
422
			for (int i = 0; i < 4; i++)
423
				mips->f[inst->dest + i] = mips->f[inst->src1 + i] * factor;
424
#endif
425
			break;
426
		}
427

428
		case IROp::Vec4Neg:
429
		{
430
#if defined(_M_SSE)
431
			_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
432
#elif PPSSPP_ARCH(ARM_NEON)
433
			vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
434
#else
435
			for (int i = 0; i < 4; i++)
436
				mips->f[inst->dest + i] = -mips->f[inst->src1 + i];
437
#endif
438
			break;
439
		}
440

441
		case IROp::Vec4Abs:
442
		{
443
#if defined(_M_SSE)
444
			_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
445
#elif PPSSPP_ARCH(ARM_NEON)
446
			vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
447
#else
448
			for (int i = 0; i < 4; i++)
449
				mips->f[inst->dest + i] = fabsf(mips->f[inst->src1 + i]);
450
#endif
451
			break;
452
		}
453

454
		case IROp::Vec2Unpack16To31:
455
		{
456
			const int dest = inst->dest;
457
			const int src1 = inst->src1;
458
			mips->fi[dest] = (mips->fi[src1] << 16) >> 1;
459
			mips->fi[dest + 1] = (mips->fi[src1] & 0xFFFF0000) >> 1;
460
			break;
461
		}
462

463
		case IROp::Vec2Unpack16To32:
464
		{
465
			const int dest = inst->dest;
466
			const int src1 = inst->src1;
467
			mips->fi[dest] = (mips->fi[src1] << 16);
468
			mips->fi[dest + 1] = (mips->fi[src1] & 0xFFFF0000);
469
			break;
470
		}
471

472
		case IROp::Vec4Unpack8To32:
473
		{
474
#if defined(_M_SSE)
475
			__m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]);
476
			src = _mm_unpacklo_epi8(src, _mm_setzero_si128());
477
			src = _mm_unpacklo_epi16(src, _mm_setzero_si128());
478
			_mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24));
479
#elif PPSSPP_ARCH(ARM_NEON) && 0 // Untested
480
			const uint8x8_t value = (uint8x8_t)vdup_n_u32(mips->fi[inst->src1]);
481
			const uint16x8_t value16 = vmovl_u8(value);
482
			const uint32x4_t value32 = vshll_n_u16(vget_low_u16(value16), 24);
483
			vst1q_u32(&mips->fi[inst->dest], value32);
484
#else
485
			mips->fi[inst->dest] = (mips->fi[inst->src1] << 24);
486
			mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000;
487
			mips->fi[inst->dest + 2] = (mips->fi[inst->src1] << 8) & 0xFF000000;
488
			mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000;
489
#endif
490
			break;
491
		}
492

493
		case IROp::Vec2Pack32To16:
494
		{
495
			u32 val = mips->fi[inst->src1] >> 16;
496
			mips->fi[inst->dest] = (mips->fi[inst->src1 + 1] & 0xFFFF0000) | val;
497
			break;
498
		}
499

500
		case IROp::Vec2Pack31To16:
501
		{
502
			// Used in Tekken 6
503

504
			u32 val = (mips->fi[inst->src1] >> 15) & 0xFFFF;
505
			val |= (mips->fi[inst->src1 + 1] << 1) & 0xFFFF0000;
506
			mips->fi[inst->dest] = val;
507
			break;
508
		}
509

510
		case IROp::Vec4Pack32To8:
511
		{
512
			// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.
513
			// pshufb or SSE4 instructions can be used instead.
514
			u32 val = mips->fi[inst->src1] >> 24;
515
			val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00;
516
			val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000;
517
			val |= (mips->fi[inst->src1 + 3]) & 0xFF000000;
518
			mips->fi[inst->dest] = val;
519
			break;
520
		}
521

522
		case IROp::Vec4Pack31To8:
523
		{
524
			// Used in Tekken 6
525

526
			// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.
527
			// pshufb or SSE4 instructions can be used instead.
528
#if PPSSPP_ARCH(ARM_NEON) && 0
529
			// Untested
530
			uint32x4_t value = vld1q_u32(&mips->fi[inst->src1]);
531
			value = vshlq_n_u32(value, 1);
532
			uint32x2_t halved = vshrn_n_u32(value, 8);
533
			uint32x2_t halvedAgain = vshrn_n_u32(vcombine_u32(halved, vdup_n_u32(0)), 8);
534
			mips->fi[inst->dest] = vget_lane_u32(halvedAgain, 0);
535
#else
536
			u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
537
			val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;
538
			val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;
539
			val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;
540
			mips->fi[inst->dest] = val;
541
#endif
542
			break;
543
		}
544

545
		case IROp::Vec2ClampToZero:
546
		{
547
			for (int i = 0; i < 2; i++) {
548
				u32 val = mips->fi[inst->src1 + i];
549
				mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0;
550
			}
551
			break;
552
		}
553

554
		case IROp::Vec4ClampToZero:
555
		{
556
#if defined(_M_SSE)
557
			// Trickery: Expand the sign bit, and use andnot to zero negative values.
558
			__m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]);
559
			__m128i mask = _mm_srai_epi32(val, 31);
560
			val = _mm_andnot_si128(mask, val);
561
			_mm_store_si128((__m128i *)&mips->fi[inst->dest], val);
562
#else
563
			const int src1 = inst->src1;
564
			const int dest = inst->dest;
565
			for (int i = 0; i < 4; i++) {
566
				u32 val = mips->fi[src1 + i];
567
				mips->fi[dest + i] = (int)val >= 0 ? val : 0;
568
			}
569
#endif
570
			break;
571
		}
572

573
		case IROp::Vec4DuplicateUpperBitsAndShift1:  // For vuc2i, the weird one.
574
		{
575
			const int src1 = inst->src1;
576
			const int dest = inst->dest;
577
			for (int i = 0; i < 4; i++) {
578
				u32 val = mips->fi[src1 + i];
579
				val = val | (val >> 8);
580
				val = val | (val >> 16);
581
				val >>= 1;
582
				mips->fi[dest + i] = val;
583
			}
584
			break;
585
		}
586

587
		case IROp::FCmpVfpuBit:
588
		{
589
			const int op = inst->dest & 0xF;
590
			const int bit = inst->dest >> 4;
591
			int result = 0;
592
			switch (op) {
593
			case VC_EQ: result = mips->f[inst->src1] == mips->f[inst->src2]; break;
594
			case VC_NE: result = mips->f[inst->src1] != mips->f[inst->src2]; break;
595
			case VC_LT: result = mips->f[inst->src1] < mips->f[inst->src2]; break;
596
			case VC_LE: result = mips->f[inst->src1] <= mips->f[inst->src2]; break;
597
			case VC_GT: result = mips->f[inst->src1] > mips->f[inst->src2]; break;
598
			case VC_GE: result = mips->f[inst->src1] >= mips->f[inst->src2]; break;
599
			case VC_EZ: result = mips->f[inst->src1] == 0.0f; break;
600
			case VC_NZ: result = mips->f[inst->src1] != 0.0f; break;
601
			case VC_EN: result = my_isnan(mips->f[inst->src1]); break;
602
			case VC_NN: result = !my_isnan(mips->f[inst->src1]); break;
603
			case VC_EI: result = my_isinf(mips->f[inst->src1]); break;
604
			case VC_NI: result = !my_isinf(mips->f[inst->src1]); break;
605
			case VC_ES: result = my_isnanorinf(mips->f[inst->src1]); break;
606
			case VC_NS: result = !my_isnanorinf(mips->f[inst->src1]); break;
607
			case VC_TR: result = 1; break;
608
			case VC_FL: result = 0; break;
609
			default:
610
				result = 0;
611
			}
612
			if (result != 0) {
613
				mips->vfpuCtrl[VFPU_CTRL_CC] |= (1 << bit);
614
			} else {
615
				mips->vfpuCtrl[VFPU_CTRL_CC] &= ~(1 << bit);
616
			}
617
			break;
618
		}
619

620
		case IROp::FCmpVfpuAggregate:
621
		{
622
			const u32 mask = inst->dest;
623
			const u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC];
624
			int anyBit = (cc & mask) ? 0x10 : 0x00;
625
			int allBit = (cc & mask) == mask ? 0x20 : 0x00;
626
			mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | anyBit | allBit;
627
			break;
628
		}
629

630
		case IROp::FCmovVfpuCC:
631
			if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0xf)) & 1) == ((u32)inst->src2 >> 7)) {
632
				mips->f[inst->dest] = mips->f[inst->src1];
633
			}
634
			break;
635

636
		case IROp::Vec4Dot:
637
		{
638
			// Not quickly implementable on all platforms, unfortunately.
639
			// Though, this is still pretty fast compared to one split into multiple IR instructions.
640
			// This might be good though: https://gist.github.com/rikusalminen/3040241
641
			float dot = mips->f[inst->src1] * mips->f[inst->src2];
642
			for (int i = 1; i < 4; i++)
643
				dot += mips->f[inst->src1 + i] * mips->f[inst->src2 + i];
644
			mips->f[inst->dest] = dot;
645
			break;
646
		}
647

648
		case IROp::FSin:
649
			mips->f[inst->dest] = vfpu_sin(mips->f[inst->src1]);
650
			break;
651
		case IROp::FCos:
652
			mips->f[inst->dest] = vfpu_cos(mips->f[inst->src1]);
653
			break;
654
		case IROp::FRSqrt:
655
			mips->f[inst->dest] = 1.0f / sqrtf(mips->f[inst->src1]);
656
			break;
657
		case IROp::FRecip:
658
			mips->f[inst->dest] = 1.0f / mips->f[inst->src1];
659
			break;
660
		case IROp::FAsin:
661
			mips->f[inst->dest] = vfpu_asin(mips->f[inst->src1]);
662
			break;
663

664
		case IROp::ShlImm:
665
			mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2;
666
			break;
667
		case IROp::ShrImm:
668
			mips->r[inst->dest] = mips->r[inst->src1] >> (int)inst->src2;
669
			break;
670
		case IROp::SarImm:
671
			mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (int)inst->src2;
672
			break;
673
		case IROp::RorImm:
674
		{
675
			u32 x = mips->r[inst->src1];
676
			int sa = inst->src2;
677
			mips->r[inst->dest] = (x >> sa) | (x << (32 - sa));
678
		}
679
		break;
680

681
		case IROp::Shl:
682
			mips->r[inst->dest] = mips->r[inst->src1] << (mips->r[inst->src2] & 31);
683
			break;
684
		case IROp::Shr:
685
			mips->r[inst->dest] = mips->r[inst->src1] >> (mips->r[inst->src2] & 31);
686
			break;
687
		case IROp::Sar:
688
			mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (mips->r[inst->src2] & 31);
689
			break;
690
		case IROp::Ror:
691
		{
692
			u32 x = mips->r[inst->src1];
693
			int sa = mips->r[inst->src2] & 31;
694
			mips->r[inst->dest] = (x >> sa) | (x << (32 - sa));
695
			break;
696
		}
697

698
		case IROp::Clz:
699
		{
700
			mips->r[inst->dest] = clz32(mips->r[inst->src1]);
701
			break;
702
		}
703

704
		case IROp::Slt:
705
			mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2];
706
			break;
707

708
		case IROp::SltU:
709
			mips->r[inst->dest] = mips->r[inst->src1] < mips->r[inst->src2];
710
			break;
711

712
		case IROp::SltConst:
713
			mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)inst->constant;
714
			break;
715

716
		case IROp::SltUConst:
717
			mips->r[inst->dest] = mips->r[inst->src1] < inst->constant;
718
			break;
719

720
		case IROp::MovZ:
721
			if (mips->r[inst->src1] == 0)
722
				mips->r[inst->dest] = mips->r[inst->src2];
723
			break;
724
		case IROp::MovNZ:
725
			if (mips->r[inst->src1] != 0)
726
				mips->r[inst->dest] = mips->r[inst->src2];
727
			break;
728

729
		case IROp::Max:
730
			mips->r[inst->dest] = (s32)mips->r[inst->src1] > (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2];
731
			break;
732
		case IROp::Min:
733
			mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2];
734
			break;
735

736
		case IROp::MtLo:
737
			mips->lo = mips->r[inst->src1];
738
			break;
739
		case IROp::MtHi:
740
			mips->hi = mips->r[inst->src1];
741
			break;
742
		case IROp::MfLo:
743
			mips->r[inst->dest] = mips->lo;
744
			break;
745
		case IROp::MfHi:
746
			mips->r[inst->dest] = mips->hi;
747
			break;
748

749
		case IROp::Mult:
750
		{
751
			s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2];
752
			memcpy(&mips->lo, &result, 8);
753
			break;
754
		}
755
		case IROp::MultU:
756
		{
757
			u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2];
758
			memcpy(&mips->lo, &result, 8);
759
			break;
760
		}
761
		case IROp::Madd:
762
		{
763
			s64 result;
764
			memcpy(&result, &mips->lo, 8);
765
			result += (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2];
766
			memcpy(&mips->lo, &result, 8);
767
			break;
768
		}
769
		case IROp::MaddU:
770
		{
771
			s64 result;
772
			memcpy(&result, &mips->lo, 8);
773
			result += (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2];
774
			memcpy(&mips->lo, &result, 8);
775
			break;
776
		}
777
		case IROp::Msub:
778
		{
779
			s64 result;
780
			memcpy(&result, &mips->lo, 8);
781
			result -= (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2];
782
			memcpy(&mips->lo, &result, 8);
783
			break;
784
		}
785
		case IROp::MsubU:
786
		{
787
			s64 result;
788
			memcpy(&result, &mips->lo, 8);
789
			result -= (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2];
790
			memcpy(&mips->lo, &result, 8);
791
			break;
792
		}
793

794
		case IROp::Div:
795
		{
796
			s32 numerator = (s32)mips->r[inst->src1];
797
			s32 denominator = (s32)mips->r[inst->src2];
798
			if (numerator == (s32)0x80000000 && denominator == -1) {
799
				mips->lo = 0x80000000;
800
				mips->hi = -1;
801
			} else if (denominator != 0) {
802
				mips->lo = (u32)(numerator / denominator);
803
				mips->hi = (u32)(numerator % denominator);
804
			} else {
805
				mips->lo = numerator < 0 ? 1 : -1;
806
				mips->hi = numerator;
807
			}
808
			break;
809
		}
810
		case IROp::DivU:
811
		{
812
			u32 numerator = mips->r[inst->src1];
813
			u32 denominator = mips->r[inst->src2];
814
			if (denominator != 0) {
815
				mips->lo = numerator / denominator;
816
				mips->hi = numerator % denominator;
817
			} else {
818
				mips->lo = numerator <= 0xFFFF ? 0xFFFF : -1;
819
				mips->hi = numerator;
820
			}
821
			break;
822
		}
823

824
		case IROp::BSwap16:
825
		{
826
			u32 x = mips->r[inst->src1];
827
			// Don't think we can beat this with intrinsics.
828
			mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8);
829
			break;
830
		}
831
		case IROp::BSwap32:
832
		{
833
			mips->r[inst->dest] = swap32(mips->r[inst->src1]);
834
			break;
835
		}
836

837
		case IROp::FAdd:
838
			mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2];
839
			break;
840
		case IROp::FSub:
841
			mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2];
842
			break;
843
		case IROp::FMul:
844
#if 1
845
		{
846
			float a = mips->f[inst->src1];
847
			float b = mips->f[inst->src2];
848
			if ((b == 0.0f && my_isinf(a)) || (a == 0.0f && my_isinf(b))) {
849
				mips->fi[inst->dest] = 0x7fc00000;
850
			} else {
851
				mips->f[inst->dest] = a * b;
852
			}
853
		}
854
			break;
855
#else
856
			// Not sure if faster since it needs to load the operands twice? But the code is simpler.
857
			{
858
				// Takes care of negative zero by masking away the top bit, which also makes the inf check shorter.
859
				u32 a = mips->fi[inst->src1] & 0x7FFFFFFF;
860
				u32 b = mips->fi[inst->src2] & 0x7FFFFFFF;
861
				if ((a == 0 && b == 0x7F800000) || (b == 0 && a == 0x7F800000)) {
862
					mips->fi[inst->dest] = 0x7fc00000;
863
				} else {
864
					mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2];
865
				}
866
				break;
867
			}
868
#endif
869
		case IROp::FDiv:
870
			mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2];
871
			break;
872
		case IROp::FMin:
873
			if (my_isnan(mips->f[inst->src1]) || my_isnan(mips->f[inst->src2])) {
874
				// See interpreter for this logic: this is for vmin, we're comparing mantissa+exp.
875
				if (mips->fs[inst->src1] < 0 && mips->fs[inst->src2] < 0) {
876
					mips->fs[inst->dest] = std::max(mips->fs[inst->src1], mips->fs[inst->src2]);
877
				} else {
878
					mips->fs[inst->dest] = std::min(mips->fs[inst->src1], mips->fs[inst->src2]);
879
				}
880
			} else {
881
				mips->f[inst->dest] = std::min(mips->f[inst->src1], mips->f[inst->src2]);
882
			}
883
			break;
884
		case IROp::FMax:
885
			if (my_isnan(mips->f[inst->src1]) || my_isnan(mips->f[inst->src2])) {
886
				// See interpreter for this logic: this is for vmax, we're comparing mantissa+exp.
887
				if (mips->fs[inst->src1] < 0 && mips->fs[inst->src2] < 0) {
888
					mips->fs[inst->dest] = std::min(mips->fs[inst->src1], mips->fs[inst->src2]);
889
				} else {
890
					mips->fs[inst->dest] = std::max(mips->fs[inst->src1], mips->fs[inst->src2]);
891
				}
892
			} else {
893
				mips->f[inst->dest] = std::max(mips->f[inst->src1], mips->f[inst->src2]);
894
			}
895
			break;
896

897
		case IROp::FMov:
898
			mips->f[inst->dest] = mips->f[inst->src1];
899
			break;
900
		case IROp::FAbs:
901
			mips->f[inst->dest] = fabsf(mips->f[inst->src1]);
902
			break;
903
		case IROp::FSqrt:
904
			mips->f[inst->dest] = sqrtf(mips->f[inst->src1]);
905
			break;
906
		case IROp::FNeg:
907
			mips->f[inst->dest] = -mips->f[inst->src1];
908
			break;
909
		case IROp::FSat0_1:
910
			// We have to do this carefully to handle NAN and -0.0f.
911
			mips->f[inst->dest] = vfpu_clamp(mips->f[inst->src1], 0.0f, 1.0f);
912
			break;
913
		case IROp::FSatMinus1_1:
914
			mips->f[inst->dest] = vfpu_clamp(mips->f[inst->src1], -1.0f, 1.0f);
915
			break;
916

917
		case IROp::FSign:
918
		{
919
			// Bitwise trickery
920
			u32 val;
921
			memcpy(&val, &mips->f[inst->src1], sizeof(u32));
922
			if (val == 0 || val == 0x80000000)
923
				mips->f[inst->dest] = 0.0f;
924
			else if ((val >> 31) == 0)
925
				mips->f[inst->dest] = 1.0f;
926
			else
927
				mips->f[inst->dest] = -1.0f;
928
			break;
929
		}
930

931
		case IROp::FpCondFromReg:
932
			mips->fpcond = mips->r[inst->dest];
933
			break;
934
		case IROp::FpCondToReg:
935
			mips->r[inst->dest] = mips->fpcond;
936
			break;
937
		case IROp::FpCtrlFromReg:
938
			mips->fcr31 = mips->r[inst->src1] & 0x0181FFFF;
939
			// Extract the new fpcond value.
940
			// TODO: Is it really helping us to keep it separate?
941
			mips->fpcond = (mips->fcr31 >> 23) & 1;
942
			break;
943
		case IROp::FpCtrlToReg:
944
			// Update the fpcond bit first.
945
			mips->fcr31 = (mips->fcr31 & ~(1 << 23)) | ((mips->fpcond & 1) << 23);
946
			mips->r[inst->dest] = mips->fcr31;
947
			break;
948
		case IROp::VfpuCtrlToReg:
949
			mips->r[inst->dest] = mips->vfpuCtrl[inst->src1];
950
			break;
951
		case IROp::FRound:
952
		{
953
			float value = mips->f[inst->src1];
954
			if (my_isnanorinf(value)) {
955
				mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;
956
				break;
957
			} else {
958
				mips->fs[inst->dest] = (int)round_ieee_754(value);
959
			}
960
			break;
961
		}
962
		case IROp::FTrunc:
963
		{
964
			float value = mips->f[inst->src1];
965
			if (my_isnanorinf(value)) {
966
				mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;
967
				break;
968
			} else {
969
				if (value >= 0.0f) {
970
					mips->fs[inst->dest] = (int)floorf(value);
971
					// Overflow, but it was positive.
972
					if (mips->fs[inst->dest] == -2147483648LL) {
973
						mips->fs[inst->dest] = 2147483647LL;
974
					}
975
				} else {
976
					// Overflow happens to be the right value anyway.
977
					mips->fs[inst->dest] = (int)ceilf(value);
978
				}
979
				break;
980
			}
981
		}
982
		case IROp::FCeil:
983
		{
984
			float value = mips->f[inst->src1];
985
			if (my_isnanorinf(value)) {
986
				mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;
987
				break;
988
			} else {
989
				mips->fs[inst->dest] = (int)ceilf(value);
990
			}
991
			break;
992
		}
993
		case IROp::FFloor:
994
		{
995
			float value = mips->f[inst->src1];
996
			if (my_isnanorinf(value)) {
997
				mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;
998
				break;
999
			} else {
1000
				mips->fs[inst->dest] = (int)floorf(value);
1001
			}
1002
			break;
1003
		}
1004
		case IROp::FCmp:
1005
			switch (inst->dest) {
1006
			case IRFpCompareMode::False:
1007
				mips->fpcond = 0;
1008
				break;
1009
			case IRFpCompareMode::EitherUnordered:
1010
			{
1011
				float a = mips->f[inst->src1];
1012
				float b = mips->f[inst->src2];
1013
				mips->fpcond = !(a > b || a < b || a == b);
1014
				break;
1015
			}
1016
			case IRFpCompareMode::EqualOrdered:
1017
				mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2];
1018
				break;
1019
			case IRFpCompareMode::EqualUnordered:
1020
				mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2] || my_isnan(mips->f[inst->src1]) || my_isnan(mips->f[inst->src2]);
1021
				break;
1022
			case IRFpCompareMode::LessEqualOrdered:
1023
				mips->fpcond = mips->f[inst->src1] <= mips->f[inst->src2];
1024
				break;
1025
			case IRFpCompareMode::LessEqualUnordered:
1026
				mips->fpcond = !(mips->f[inst->src1] > mips->f[inst->src2]);
1027
				break;
1028
			case IRFpCompareMode::LessOrdered:
1029
				mips->fpcond = mips->f[inst->src1] < mips->f[inst->src2];
1030
				break;
1031
			case IRFpCompareMode::LessUnordered:
1032
				mips->fpcond = !(mips->f[inst->src1] >= mips->f[inst->src2]);
1033
				break;
1034
			}
1035
			break;
1036

1037
		case IROp::FCvtSW:
1038
			mips->f[inst->dest] = (float)mips->fs[inst->src1];
1039
			break;
1040
		case IROp::FCvtWS:
1041
		{
1042
			float src = mips->f[inst->src1];
1043
			if (my_isnanorinf(src)) {
1044
				mips->fs[inst->dest] = my_isinf(src) && src < 0.0f ? -2147483648LL : 2147483647LL;
1045
				break;
1046
			}
1047
			// TODO: Inline assembly to use here would be better.
1048
			switch (IRRoundMode(mips->fcr31 & 3)) {
1049
			case IRRoundMode::RINT_0: mips->fs[inst->dest] = (int)round_ieee_754(src); break;
1050
			case IRRoundMode::CAST_1: mips->fs[inst->dest] = (int)src; break;
1051
			case IRRoundMode::CEIL_2: mips->fs[inst->dest] = (int)ceilf(src); break;
1052
			case IRRoundMode::FLOOR_3: mips->fs[inst->dest] = (int)floorf(src); break;
1053
			}
1054
			break; //cvt.w.s
1055
		}
1056
		case IROp::FCvtScaledSW:
1057
			mips->f[inst->dest] = (float)mips->fs[inst->src1] * (1.0f / (1UL << (inst->src2 & 0x1F)));
1058
			break;
1059
		case IROp::FCvtScaledWS:
1060
		{
1061
			float src = mips->f[inst->src1];
1062
			if (my_isnan(src)) {
1063
				// TODO: True for negatives too?
1064
				mips->fs[inst->dest] = 2147483647L;
1065
				break;
1066
			}
1067

1068
			float mult = (float)(1UL << (inst->src2 & 0x1F));
1069
			double sv = src * mult; // (float)0x7fffffff == (float)0x80000000
1070
			// Cap/floor it to 0x7fffffff / 0x80000000
1071
			if (sv > (double)0x7fffffff) {
1072
				mips->fs[inst->dest] = 0x7fffffff;
1073
			} else if (sv <= (double)(int)0x80000000) {
1074
				mips->fs[inst->dest] = 0x80000000;
1075
			} else {
1076
				switch (IRRoundMode(inst->src2 >> 6)) {
1077
				case IRRoundMode::RINT_0: mips->fs[inst->dest] = (int)round_ieee_754(sv); break;
1078
				case IRRoundMode::CAST_1: mips->fs[inst->dest] = src >= 0 ? (int)floor(sv) : (int)ceil(sv); break;
1079
				case IRRoundMode::CEIL_2: mips->fs[inst->dest] = (int)ceil(sv); break;
1080
				case IRRoundMode::FLOOR_3: mips->fs[inst->dest] = (int)floor(sv); break;
1081
				}
1082
			}
1083
			break;
1084
		}
1085

1086
		case IROp::FMovFromGPR:
1087
			memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4);
1088
			break;
1089
		case IROp::OptFCvtSWFromGPR:
1090
			mips->f[inst->dest] = (float)(int)mips->r[inst->src1];
1091
			break;
1092
		case IROp::FMovToGPR:
1093
			memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4);
1094
			break;
1095
		case IROp::OptFMovToGPRShr8:
1096
		{
1097
			u32 temp;
1098
			memcpy(&temp, &mips->f[inst->src1], 4);
1099
			mips->r[inst->dest] = temp >> 8;
1100
			break;
1101
		}
1102

1103
		case IROp::ExitToConst:
1104
			return inst->constant;
1105

1106
		case IROp::ExitToReg:
1107
			return mips->r[inst->src1];
1108

1109
		case IROp::ExitToConstIfEq:
1110
			if (mips->r[inst->src1] == mips->r[inst->src2])
1111
				return inst->constant;
1112
			break;
1113
		case IROp::ExitToConstIfNeq:
1114
			if (mips->r[inst->src1] != mips->r[inst->src2])
1115
				return inst->constant;
1116
			break;
1117
		case IROp::ExitToConstIfGtZ:
1118
			if ((s32)mips->r[inst->src1] > 0)
1119
				return inst->constant;
1120
			break;
1121
		case IROp::ExitToConstIfGeZ:
1122
			if ((s32)mips->r[inst->src1] >= 0)
1123
				return inst->constant;
1124
			break;
1125
		case IROp::ExitToConstIfLtZ:
1126
			if ((s32)mips->r[inst->src1] < 0)
1127
				return inst->constant;
1128
			break;
1129
		case IROp::ExitToConstIfLeZ:
1130
			if ((s32)mips->r[inst->src1] <= 0)
1131
				return inst->constant;
1132
			break;
1133

1134
		case IROp::Downcount:
1135
			mips->downcount -= (int)inst->constant;
1136
			break;
1137

1138
		case IROp::SetPC:
1139
			mips->pc = mips->r[inst->src1];
1140
			break;
1141

1142
		case IROp::SetPCConst:
1143
			mips->pc = inst->constant;
1144
			break;
1145

1146
		case IROp::Syscall:
1147
			// IROp::SetPC was (hopefully) executed before.
1148
		{
1149
			MIPSOpcode op(inst->constant);
1150
			CallSyscall(op);
1151
			if (coreState != CORE_RUNNING)
1152
				CoreTiming::ForceCheck();
1153
			break;
1154
		}
1155

1156
		case IROp::ExitToPC:
1157
			return mips->pc;
1158

1159
		case IROp::Interpret:  // SLOW fallback. Can be made faster. Ideally should be removed but may be useful for debugging.
1160
		{
1161
			MIPSOpcode op(inst->constant);
1162
			MIPSInterpret(op);
1163
			break;
1164
		}
1165

1166
		case IROp::CallReplacement:
1167
		{
1168
			int funcIndex = inst->constant;
1169
			const ReplacementTableEntry *f = GetReplacementFunc(funcIndex);
1170
			int cycles = f->replaceFunc();
1171
			mips->r[inst->dest] = cycles < 0 ? -1 : 0;
1172
			mips->downcount -= cycles < 0 ? -cycles : cycles;
1173
			break;
1174
		}
1175

1176
		case IROp::SetCtrlVFPU:
1177
			mips->vfpuCtrl[inst->dest] = inst->constant;
1178
			break;
1179

1180
		case IROp::SetCtrlVFPUReg:
1181
			mips->vfpuCtrl[inst->dest] = mips->r[inst->src1];
1182
			break;
1183

1184
		case IROp::SetCtrlVFPUFReg:
1185
			memcpy(&mips->vfpuCtrl[inst->dest], &mips->f[inst->src1], 4);
1186
			break;
1187

1188
		case IROp::ApplyRoundingMode:
1189
			IRApplyRounding(mips);
1190
			break;
1191
		case IROp::RestoreRoundingMode:
1192
			IRRestoreRounding();
1193
			break;
1194
		case IROp::UpdateRoundingMode:
1195
			// TODO: Implement
1196
			break;
1197

1198
		case IROp::Break:
1199
			Core_Break(mips->pc);
1200
			return mips->pc + 4;
1201

1202
		case IROp::Breakpoint:
1203
			if (IRRunBreakpoint(inst->constant)) {
1204
				CoreTiming::ForceCheck();
1205
				return mips->pc;
1206
			}
1207
			break;
1208

1209
		case IROp::MemoryCheck:
1210
			if (IRRunMemCheck(mips->pc + inst->dest, mips->r[inst->src1] + inst->constant)) {
1211
				CoreTiming::ForceCheck();
1212
				return mips->pc;
1213
			}
1214
			break;
1215

1216
		case IROp::ValidateAddress8:
1217
			if (RunValidateAddress<1>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
1218
				CoreTiming::ForceCheck();
1219
				return mips->pc;
1220
			}
1221
			break;
1222
		case IROp::ValidateAddress16:
1223
			if (RunValidateAddress<2>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
1224
				CoreTiming::ForceCheck();
1225
				return mips->pc;
1226
			}
1227
			break;
1228
		case IROp::ValidateAddress32:
1229
			if (RunValidateAddress<4>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
1230
				CoreTiming::ForceCheck();
1231
				return mips->pc;
1232
			}
1233
			break;
1234
		case IROp::ValidateAddress128:
1235
			if (RunValidateAddress<16>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
1236
				CoreTiming::ForceCheck();
1237
				return mips->pc;
1238
			}
1239
			break;
1240
		case IROp::LogIRBlock:
1241
			if (mipsTracer.tracing_enabled) {
1242
				mipsTracer.executed_blocks.push_back(inst->constant);
1243
			}
1244
			break;
1245

1246
		case IROp::Nop: // TODO: This shouldn't crash, but for now we should not emit nops, so...
1247
		case IROp::Bad:
1248
		default:
1249
			Crash();
1250
			break;
1251
			// Unimplemented IR op. Bad.
1252
		}
1253

1254
#ifdef _DEBUG
1255
		if (mips->r[0] != 0)
1256
			Crash();
1257
#endif
1258
		inst++;
1259
	}
1260

1261
	// We should not reach here anymore.
1262
	return 0;
1263
}
1264

1265
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company