Path: blob/master/Core/MIPS/IR/IRPassSimplify.cpp
#include <algorithm>
#include <cstring>
#include <map>
#include <utility>

#include "Common/BitSet.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Log.h"
#include "Core/Config.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
#include "Core/MIPS/IR/IRAnalysis.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/IR/IRPassSimplify.h"
#include "Core/MIPS/IR/IRRegCache.h"

// #define CONDITIONAL_DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }
#define CONDITIONAL_DISABLE
#define DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }

u32 Evaluate(u32 a, u32 b, IROp op) {
	switch (op) {
	case IROp::Add: case IROp::AddConst: return a + b;
	case IROp::Sub: case IROp::SubConst: return a - b;
	case IROp::And: case IROp::AndConst: return a & b;
	case IROp::Or: case IROp::OrConst: return a | b;
	case IROp::Xor: case IROp::XorConst: return a ^ b;
	case IROp::Shr: case IROp::ShrImm: return a >> b;
	case IROp::Sar: case IROp::SarImm: return (s32)a >> b;
	case IROp::Ror: case IROp::RorImm: return (a >> b) | (a << (32 - b));
	case IROp::Shl: case IROp::ShlImm: return a << b;
	case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b);
	case IROp::SltU: case IROp::SltUConst: return (a < b);
	default:
		_assert_msg_(false, "Unable to evaluate two op %d", (int)op);
		return -1;
	}
}

u32 Evaluate(u32 a, IROp op) {
	switch (op) {
	case IROp::Not: return ~a;
	case IROp::Neg: return -(s32)a;
	case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8);
	case IROp::BSwap32: return swap32(a);
	case IROp::Ext8to32: return SignExtend8ToU32(a);
	case IROp::Ext16to32: return SignExtend16ToU32(a);
	case IROp::ReverseBits: return ReverseBits32(a);
	case IROp::Clz: {
		int x = 31;
		int count = 0;
		while (x >= 0 && !(a & (1 << x))) {
			count++;
			x--;
		}
		return count;
	}
	default:
		_assert_msg_(false, "Unable to evaluate one op %d", (int)op);
		return -1;
	}
}

IROp ArithToArithConst(IROp op) {
	switch (op) {
	case IROp::Add: return IROp::AddConst;
	case IROp::Sub: return IROp::SubConst;
	case IROp::And: return IROp::AndConst;
	case IROp::Or: return IROp::OrConst;
	case IROp::Xor: return IROp::XorConst;
	case IROp::Slt: return IROp::SltConst;
	case IROp::SltU: return IROp::SltUConst;
	default:
		_assert_msg_(false, "Invalid ArithToArithConst for op %d", (int)op);
		return (IROp)-1;
	}
}

IROp ShiftToShiftImm(IROp op) {
	switch (op) {
	case IROp::Shl: return IROp::ShlImm;
	case IROp::Shr: return IROp::ShrImm;
	case IROp::Ror: return IROp::RorImm;
	case IROp::Sar: return IROp::SarImm;
	default:
		_assert_msg_(false, "Invalid ShiftToShiftImm for op %d", (int)op);
		return (IROp)-1;
	}
}

bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out, const IROptions &opts) {
	out.Reserve(in.GetInstructions().size());

	if (c == 1) {
		return passes[0](in, out, opts);
	}

	bool logBlocks = false;

	IRWriter temp[2];
	const IRWriter *nextIn = &in;
	IRWriter *nextOut = &temp[1];
	temp[1].Reserve(nextIn->GetInstructions().size());
	for (size_t i = 0; i < c - 1; ++i) {
		if (passes[i](*nextIn, *nextOut, opts)) {
			logBlocks = true;
		}

		temp[0] = std::move(temp[1]);
		nextIn = &temp[0];

		temp[1].Clear();
		temp[1].Reserve(nextIn->GetInstructions().size());
	}

	out.Reserve(nextIn->GetInstructions().size());
	if (passes[c - 1](*nextIn, out, opts)) {
		logBlocks = true;
	}

	return logBlocks;
}

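// Usage sketch (hypothetical pass list; the real pass arrays live in the
// backends that call this): each pass consumes `in` and appends to `out`,
// and the temp[0]/temp[1] ping-pong above chains them without extra copies.
//
//   static const IRPassFunc passes[] = {
//       &PropagateConstants,
//       &PurgeTemps,
//   };
//   bool log = IRApplyPasses(passes, ARRAY_SIZE(passes), in, out, opts);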
bool OptimizeFPMoves(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	IRInst prev{ IROp::Nop };

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::FMovFromGPR:
			//FMovToGPR a0, f12
			//FMovFromGPR f14, a0
			// to
			//FMovToGPR a0, f12
			//FMov f14, f12
			if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) {
				inst.op = IROp::FMov;
				inst.src1 = prev.src1;
				// Skip it entirely if it's just a copy to and back.
				if (inst.dest != inst.src1)
					out.Write(inst);
			} else {
				out.Write(inst);
			}
			break;

		// This will need to scan forward or keep track of more information to be useful.
		// Just doing one isn't.
		/*
		case IROp::LoadVec4:
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, a0, 0x0
			// to
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, sp, 0x30
			if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) {
				inst.constant += prev.constant;
				inst.src1 = prev.src1;
				logBlocks = 1;
			} else {
				goto doDefault;
			}
			out.Write(inst);
			break;
		*/

		default:
			out.Write(inst);
			break;
		}
		prev = inst;
	}
	return logBlocks;
}

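// Illustrative rewrite done by ThreeOpToTwoOp below (hypothetical registers):
// a three-operand "Add a0, a1, a2" becomes the two-operand friendly pair
//   Mov a0, a1
//   Add a0, a0, a2
// which maps more directly onto x86-style destructive instructions.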
// Might be useful later on x86.
bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;
		case IROp::FMul:
		case IROp::FAdd:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::FMov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Vec4Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

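// RemoveLoadStoreLeftRight below rewrites MIPS lwl/lwr (and swl/swr) pairs.
// Example (hypothetical registers): the classic unaligned-load idiom
//   Load32Left  t0, a0, 3
//   Load32Right t0, a0, 0
// collapses into a single "Load32 t0, a0, 0" when unaligned loads are
// allowed, and otherwise into two aligned loads plus shift/mask merging.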
bool RemoveLoadStoreLeftRight(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	bool letThroughHalves = false;
	if (opts.optimizeForInterpreter) {
		// If we're using the interpreter, which can handle these instructions directly,
		// don't break "half" instructions up.
		// Of course, we still want to combine if possible.
		letThroughHalves = true;
	}

	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; ++i) {
		const IRInst &inst = in.GetInstructions()[i];

		// TODO: Reorder or look ahead to combine?

		auto nextOp = [&]() -> const IRInst & {
			return in.GetInstructions()[i + 1];
		};

		auto combineOpposite = [&](IROp matchOp, int matchOff, IROp replaceOp, int replaceOff) {
			if (i + 1 >= n)
				return false;
			const IRInst &next = nextOp();
			if (next.op != matchOp || next.dest != inst.dest || next.src1 != inst.src1)
				return false;
			if (inst.constant + matchOff != next.constant)
				return false;

			if (opts.unalignedLoadStore) {
				// Write out one unaligned op.
				out.Write(replaceOp, inst.dest, inst.src1, out.AddConstant(inst.constant + replaceOff));
			} else if (replaceOp == IROp::Load32) {
				// We can still combine to a simpler set of two loads.
				// We start by isolating the address and shift amount.

				// IRTEMP_LR_ADDR = rs + imm
				out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant + replaceOff));
				// IRTEMP_LR_SHIFT = (addr & 3) * 8
				out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
				out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
				// IRTEMP_LR_ADDR = addr & 0xfffffffc
				out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
				// IRTEMP_LR_VALUE = low_word, dest = high_word
				out.Write(IROp::Load32, inst.dest, IRTEMP_LR_ADDR, out.AddConstant(0));
				out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(4));

				// Now we just need to adjust and combine dest and IRTEMP_LR_VALUE.
				// inst.dest >>= shift (putting its bits in the right spot.)
				out.Write(IROp::Shr, inst.dest, inst.dest, IRTEMP_LR_SHIFT);
				// We can't shift by 32, so we compromise by shifting twice.
				out.Write(IROp::ShlImm, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, 8);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);

				// At this point the values are aligned, and we just merge.
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);
			} else {
				return false;
			}
			// Skip the next one, replaced.
			i++;
			return true;
		};

		auto addCommonProlog = [&]() {
			// IRTEMP_LR_ADDR = rs + imm
			out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant));
			// IRTEMP_LR_SHIFT = (addr & 3) * 8
			out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
			out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
			// IRTEMP_LR_ADDR = addr & 0xfffffffc (for stores, later)
			out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
			// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR)
			out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(0));
		};
		auto addCommonStore = [&](int off = 0) {
			// RAM(IRTEMP_LR_ADDR) = IRTEMP_LR_VALUE
			out.Write(IROp::Store32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(off));
		};

		switch (inst.op) {
		case IROp::Load32Left:
			if (!combineOpposite(IROp::Load32Right, -3, IROp::Load32, -3)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}

				addCommonProlog();
				// dest &= (0x00ffffff >> shift)
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// IRTEMP_LR_VALUE <<= (24 - shift)
					out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Load32Right:
			if (!combineOpposite(IROp::Load32Left, 3, IROp::Load32, 0)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE >>= shift
				out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// dest &= (0xffffff00 << (24 - shift))
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				// Building display lists sometimes involves a bunch of lwr in a row.
				// We can generate more optimal code by combining.
				bool shiftNeedsReverse = true;
				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					if (shiftNeedsReverse) {
						// IRTEMP_LR_SHIFT = shift again
						out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
						out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
						shiftNeedsReverse = false;
					}
					// IRTEMP_LR_VALUE >>= IRTEMP_LR_SHIFT
					out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;
		case IROp::Store32Left:
			if (!combineOpposite(IROp::Store32Right, -3, IROp::Store32, -3)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0xffffff00 << shift
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 >> (24 - shift)
				out.Write(IROp::Shr, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		case IROp::Store32Right:
			if (!combineOpposite(IROp::Store32Left, 3, IROp::Store32, 0)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0x00ffffff << (24 - shift)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 << shift
				out.Write(IROp::Shl, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}

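// Example of the folding PropagateConstants below performs (hypothetical
// block): given
//   SetConst a0, 0x08000000
//   AddConst a0, a0, 0x10
//   Load32   t0, a0, 0
// the add folds into the cached immediate and the load is emitted against
// register 0 with an absolute constant; a0's final value is flushed as a
// single SetConst only when needed (e.g. at the block exit).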
bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	IRImmRegCache gpr(&out);

	bool logBlocks = false;
	bool skipNextExitToConst = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		bool symmetric = true;
		switch (inst.op) {
		case IROp::SetConst:
			gpr.SetImm(inst.dest, inst.constant);
			break;
		case IROp::SetConstF:
			goto doDefault;

		case IROp::Sub:
			if (gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0 && !gpr.IsImm(inst.src2)) {
				// Morph into a Neg.
				gpr.MapDirtyIn(inst.dest, inst.src2);
				out.Write(IROp::Neg, inst.dest, inst.src2);
				break;
			} else if (inst.src1 == inst.src2) {
				// Seen sometimes, yet another way of producing zero.
				gpr.SetImm(inst.dest, 0);
				break;
			}
#if __cplusplus >= 201703 || _MSC_VER > 1910
			[[fallthrough]];
#endif
		case IROp::Slt:
		case IROp::SltU:
			symmetric = false;
#if __cplusplus >= 201703 || _MSC_VER > 1910
			[[fallthrough]];
#endif
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			// Regularize, for the add/or check below.
			if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) {
				std::swap(inst.src1, inst.src2);
			}
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src2) && gpr.GetImm(inst.src2) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src2)) {
				const u32 imm2 = gpr.GetImm(inst.src2);
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Sub || inst.op == IROp::Or || inst.op == IROp::Xor)) {
					// Add / Sub / Or / Xor with zero is just a Mov. Add / Or are most common.
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2));
				}
			} else if (symmetric && gpr.IsImm(inst.src1)) {
				const u32 imm1 = gpr.GetImm(inst.src1);
				gpr.MapDirtyIn(inst.dest, inst.src2);
				if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or || inst.op == IROp::Xor)) {
					// Add / Or / Xor with zero is just a Mov.
					if (inst.dest != inst.src2)
						out.Write(IROp::Mov, inst.dest, inst.src2);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1));
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
			// And 0 is otherwise set to 0. Happens when optimizing lwl.
			if (inst.op == IROp::AndConst && inst.constant == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.constant, inst.op));
			} else if (inst.constant == 0 && (inst.op == IROp::AddConst || inst.op == IROp::SubConst || inst.op == IROp::OrConst || inst.op == IROp::XorConst)) {
				// Convert an Add/Sub/Or/Xor with a constant zero to a Mov (just like with reg zero.)
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (inst.dest != inst.src1)
					out.Write(IROp::Mov, inst.dest, inst.src1);
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (gpr.IsImm(inst.src2)) {
				const u8 sa = gpr.GetImm(inst.src2) & 31;
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (sa == 0) {
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa);
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mov:
			if (inst.dest == inst.src1) {
				// Nop
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, gpr.GetImm(inst.src1));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			gpr.MapInIn(inst.src1, inst.src2);
			goto doDefault;

		case IROp::MovZ:
		case IROp::MovNZ:
			gpr.MapInInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::Min:
		case IROp::Max:
			gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::FMovFromGPR:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::FMovToGPR:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MfHi:
		case IROp::MfLo:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MtHi:
		case IROp::MtLo:
			gpr.MapIn(inst.src1);
			goto doDefault;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
		case IROp::Store32Conditional:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::StoreFloat:
		case IROp::StoreVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Linked:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapDirty(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::LoadFloat:
		case IROp::LoadVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		case IROp::Load32Left:
		case IROp::Load32Right:
			if (gpr.IsImm(inst.src1)) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::ValidateAddress8:
		case IROp::ValidateAddress16:
		case IROp::ValidateAddress32:
		case IROp::ValidateAddress128:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Downcount:
		case IROp::SetPCConst:
			goto doDefault;

		case IROp::SetPC:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		// FP-only instructions don't need to flush immediates.
		case IROp::FAdd:
		case IROp::FMul:
			// Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1)
			if (inst.src2 == inst.dest && inst.src1 != inst.src2)
				std::swap(inst.src1, inst.src2);
			out.Write(inst);
			break;

		case IROp::FSub:
		case IROp::FDiv:
		case IROp::FNeg:
		case IROp::FAbs:
		case IROp::FMov:
		case IROp::FRound:
		case IROp::FTrunc:
		case IROp::FCeil:
		case IROp::FFloor:
		case IROp::FCvtSW:
		case IROp::FCvtScaledWS:
		case IROp::FCvtScaledSW:
		case IROp::FSin:
		case IROp::FCos:
		case IROp::FSqrt:
		case IROp::FRSqrt:
		case IROp::FRecip:
		case IROp::FAsin:
			out.Write(inst);
			break;

		case IROp::SetCtrlVFPU:
			gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest);
			goto doDefault;

		case IROp::SetCtrlVFPUReg:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetCtrlVFPU, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapDirtyIn(IRREG_VFPU_CTRL_BASE + inst.dest, inst.src1);
				out.Write(inst);
			}
			break;

		case IROp::SetCtrlVFPUFReg:
			gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest);
			goto doDefault;

		case IROp::FCvtWS:
			// TODO: Actually, this should just use the currently set rounding mode.
			// Move up with FCvtSW when that's implemented.
			gpr.MapIn(IRREG_FCR31);
			out.Write(inst);
			break;

		case IROp::FpCondFromReg:
			gpr.MapDirtyIn(IRREG_FPCOND, inst.src1);
			out.Write(inst);
			break;
		case IROp::FpCondToReg:
			if (gpr.IsImm(IRREG_FPCOND)) {
				gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND));
			} else {
				gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
				out.Write(inst);
			}
			break;
		case IROp::FpCtrlFromReg:
			gpr.MapDirtyIn(IRREG_FCR31, inst.src1);
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;
		case IROp::FpCtrlToReg:
			gpr.MapDirtyInIn(inst.dest, IRREG_FPCOND, IRREG_FCR31);
			goto doDefault;

		case IROp::Vec4Init:
		case IROp::Vec4Mov:
		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
		case IROp::Vec4Dot:
		case IROp::Vec4Scale:
		case IROp::Vec4Shuffle:
		case IROp::Vec4Blend:
		case IROp::Vec4Neg:
		case IROp::Vec4Abs:
		case IROp::Vec4Pack31To8:
		case IROp::Vec4Pack32To8:
		case IROp::Vec2Pack32To16:
		case IROp::Vec4Unpack8To32:
		case IROp::Vec2Unpack16To32:
		case IROp::Vec4DuplicateUpperBitsAndShift1:
		case IROp::Vec2ClampToZero:
		case IROp::Vec4ClampToZero:
			out.Write(inst);
			break;

		case IROp::FCmp:
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;

		case IROp::RestoreRoundingMode:
		case IROp::ApplyRoundingMode:
		case IROp::UpdateRoundingMode:
			goto doDefault;

		case IROp::VfpuCtrlToReg:
			gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
			goto doDefault;

		case IROp::FCmpVfpuBit:
			gpr.MapDirty(IRREG_VFPU_CC);
			goto doDefault;

		case IROp::FCmovVfpuCC:
			gpr.MapIn(IRREG_VFPU_CC);
			goto doDefault;

		case IROp::FCmpVfpuAggregate:
			gpr.MapDirtyIn(IRREG_VFPU_CC, IRREG_VFPU_CC);
			goto doDefault;

		case IROp::ExitToConstIfEq:
		case IROp::ExitToConstIfNeq:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				bool passed = false;
				switch (inst.op) {
				case IROp::ExitToConstIfEq: passed = gpr.GetImm(inst.src1) == gpr.GetImm(inst.src2); break;
				case IROp::ExitToConstIfNeq: passed = gpr.GetImm(inst.src1) != gpr.GetImm(inst.src2); break;
				default: _assert_(false); break;
				}

				// This is a bit common for the first cycle of loops.
				// Reduce bloat by skipping on fail, and const exit on pass.
				if (passed) {
					gpr.FlushAll();
					out.Write(IROp::ExitToConst, out.AddConstant(inst.constant));
					skipNextExitToConst = true;
				}
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToConstIfGtZ:
		case IROp::ExitToConstIfGeZ:
		case IROp::ExitToConstIfLtZ:
		case IROp::ExitToConstIfLeZ:
			if (gpr.IsImm(inst.src1)) {
				bool passed = false;
				switch (inst.op) {
				case IROp::ExitToConstIfGtZ: passed = (s32)gpr.GetImm(inst.src1) > 0; break;
				case IROp::ExitToConstIfGeZ: passed = (s32)gpr.GetImm(inst.src1) >= 0; break;
				case IROp::ExitToConstIfLtZ: passed = (s32)gpr.GetImm(inst.src1) < 0; break;
				case IROp::ExitToConstIfLeZ: passed = (s32)gpr.GetImm(inst.src1) <= 0; break;
				default: _assert_(false); break;
				}

				if (passed) {
					gpr.FlushAll();
					out.Write(IROp::ExitToConst, out.AddConstant(inst.constant));
					skipNextExitToConst = true;
				}
				break;
			}
			gpr.FlushAll();
			goto doDefault;
		case IROp::ExitToConst:
			if (skipNextExitToConst) {
				skipNextExitToConst = false;
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToReg:
			if (gpr.IsImm(inst.src1)) {
				// This happens sometimes near loops.
				// Prefer ExitToConst to allow block linking.
				u32 dest = gpr.GetImm(inst.src1);
				gpr.FlushAll();
				out.Write(IROp::ExitToConst, out.AddConstant(dest));
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::CallReplacement:
		case IROp::Break:
		case IROp::Syscall:
		case IROp::Interpret:
		case IROp::ExitToConstIfFpFalse:
		case IROp::ExitToConstIfFpTrue:
		case IROp::Breakpoint:
		case IROp::MemoryCheck:
		default:
		{
			gpr.FlushAll();
		doDefault:
			out.Write(inst);
			break;
		}
		}
	}
	gpr.FlushAll();
	return logBlocks;
}

IRInstMeta IRReplaceSrcGPR(const IRInstMeta &inst, int fromReg, int toReg) {
	IRInstMeta newInst = inst;

	if (inst.m.types[1] == 'G' && inst.src1 == fromReg) {
		newInst.src1 = toReg;
	}
	if (inst.m.types[2] == 'G' && inst.src2 == fromReg) {
		newInst.src2 = toReg;
	}
	if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && inst.m.types[0] == 'G' && inst.src3 == fromReg) {
		newInst.src3 = toReg;
	}
	return newInst;
}

IRInstMeta IRReplaceDestGPR(const IRInstMeta &inst, int fromReg, int toReg) {
	IRInstMeta newInst = inst;

	if ((inst.m.flags & IRFLAG_SRC3) == 0 && inst.m.types[0] == 'G' && inst.dest == fromReg) {
		newInst.dest = toReg;
	}
	return newInst;
}

bool IRMutatesDestGPR(const IRInstMeta &inst, int reg) {
	return (inst.m.flags & IRFLAG_SRC3DST) != 0 && inst.m.types[0] == 'G' && inst.src3 == reg;
}

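// Example of the kind of rewrite PurgeTemps below performs (hypothetical
// registers): given
//   Mov IRTEMP_0, a0
//   Add v0, v0, IRTEMP_0
// the Add is redirected to read a0 directly, and the now-dead Mov is turned
// into "Mov 0, 0", which the final loop filters out when writing the block.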
bool PurgeTemps(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	std::vector<IRInstMeta> insts;
	insts.reserve(in.GetInstructions().size());

	// We track writes both to rename regs and to purge dead stores.
	struct Check {
		Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) {
		}

		// Register this instruction wrote to.
		int reg;
		// Only other than -1 when it's a Mov, equivalent reg at this point.
		int srcReg = -1;
		// Index into insts for this op.
		int index;
		// Whether the dest reg is read by any Exit.
		bool readByExit;
		int8_t fplen = 0;
	};
	std::vector<Check> checks;
	checks.reserve(insts.size() / 2);

	// This tracks the last index at which each reg was modified.
	int lastWrittenTo[256];
	int lastReadFrom[256];
	memset(lastWrittenTo, -1, sizeof(lastWrittenTo));
	memset(lastReadFrom, -1, sizeof(lastReadFrom));

	auto readsFromFPRCheck = [](IRInstMeta &inst, Check &check, bool *directly) {
		if (check.reg < 32)
			return false;

		bool result = false;
		*directly = true;
		for (int i = 0; i < 4; ++i) {
			bool laneDirectly;
			if (check.fplen >= i + 1 && IRReadsFromFPR(inst, check.reg - 32 + i, &laneDirectly)) {
				result = true;
				if (!laneDirectly) {
					*directly = false;
					break;
				}
			}
		}
		return result;
	};

	bool logBlocks = false;
	size_t firstCheck = 0;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInstMeta inst = GetIRMeta(in.GetInstructions()[i]);

		// It helps to skip through rechecking ones we already discarded.
		for (size_t ch = firstCheck; ch < checks.size(); ++ch) {
			Check &check = checks[ch];
			if (check.reg != 0) {
				firstCheck = ch;
				break;
			}
		}

		// Check if we can optimize by running through all the writes we've previously found.
		for (size_t ch = firstCheck; ch < checks.size(); ++ch) {
			Check &check = checks[ch];
			if (check.reg == 0) {
				// This means we already optimized this or a later inst depends on it.
				continue;
			}

			bool readsDirectly;
			if (IRReadsFromGPR(inst, check.reg, &readsDirectly)) {
				// If this reads from the reg, we either depend on it or we can fold or swap.
				// That's determined below.

				// If this reads and writes the reg (e.g. MovZ, Load32Left), we can't just swap.
				bool mutatesReg = IRMutatesDestGPR(inst, check.reg);
				// If this doesn't directly read (i.e. Interpret), we can't swap.
				bool cannotReplace = !readsDirectly;
				if (!mutatesReg && !cannotReplace && check.srcReg >= 0 && lastWrittenTo[check.srcReg] < check.index) {
					// Replace with the srcReg instead. This happens with non-nice delay slots.
					// We're changing "Mov A, B; Add C, C, A" to "Mov A, B; Add C, C, B" here.
					// srcReg should only be set when it was a Mov.
					inst = IRReplaceSrcGPR(inst, check.reg, check.srcReg);

					// If the Mov modified the same reg as this instruction, we can't optimize from it anymore.
					if (inst.dest == check.reg) {
						check.reg = 0;
						// We can also optimize it out since we've essentially moved now.
						insts[check.index].op = IROp::Mov;
						insts[check.index].dest = 0;
						insts[check.index].src1 = 0;
					}
				} else if (!IRMutatesDestGPR(insts[check.index], check.reg) && inst.op == IROp::Mov && i == check.index + 1) {
					// As long as the previous inst wasn't modifying its dest reg, and this is a Mov, we can swap.
					// We're changing "Add A, B, C; Mov B, A" to "Add B, B, C; Mov A, B" here.

					// This happens with lwl/lwr temps. Replace the original dest.
					insts[check.index] = IRReplaceDestGPR(insts[check.index], check.reg, inst.dest);
					lastWrittenTo[inst.dest] = check.index;
					// If it's being read from (by inst now), we can't optimize out.
					check.reg = 0;
					// Update the read by exit flag to match the new reg.
					check.readByExit = inst.dest < IRTEMP_0 || inst.dest > IRTEMP_LR_SHIFT;
					// And swap the args for this mov, since we changed the other dest. We'll optimize this out later.
					std::swap(inst.dest, inst.src1);
				} else {
					// Legitimately read from, so we can't optimize out.
					// Unless this is an exit and a temp not read directly by the exit.
					if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly)
						check.reg = 0;
				}
			} else if (check.fplen >= 1 && readsFromFPRCheck(inst, check, &readsDirectly)) {
				// If one or the other is a Vec, they must match.
				bool lenMismatch = false;

				auto checkMismatch = [&check, &lenMismatch](IRReg src, char type) {
					int srclen = 1;
					if (type == 'V')
						srclen = 4;
					else if (type == '2')
						srclen = 2;
					else if (type != 'F')
						return;

					if (src + 32 + srclen > check.reg && src + 32 < check.reg + check.fplen) {
						if (src + 32 != check.reg || srclen != check.fplen)
							lenMismatch = true;
					}
				};

				checkMismatch(inst.src1, inst.m.types[1]);
				checkMismatch(inst.src2, inst.m.types[2]);
				if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0)
					checkMismatch(inst.src3, inst.m.types[3]);

				bool cannotReplace = !readsDirectly || lenMismatch;
				if (!cannotReplace && check.srcReg >= 32 && lastWrittenTo[check.srcReg] < check.index) {
					// This is probably not worth doing unless we can get rid of a temp.
					if (!check.readByExit) {
						if (insts[check.index].dest == inst.src1)
							inst.src1 = check.srcReg - 32;
						else if (insts[check.index].dest == inst.src2)
							inst.src2 = check.srcReg - 32;
						else
							_assert_msg_(false, "Unexpected src3 read of FPR");

						// Check if we've clobbered it entirely.
						if (inst.dest == check.reg) {
							check.reg = 0;
							insts[check.index].op = IROp::Mov;
							insts[check.index].dest = 0;
							insts[check.index].src1 = 0;
						}
					} else {
						// Let's not bother.
						check.reg = 0;
					}
				} else if ((inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) && !lenMismatch) {
					// A swap could be profitable if this is a temp, and maybe in other cases.
					// These can happen a lot from mask regs, etc.
					// But make sure no other changes happened between.
					bool destNotChanged = true;
					for (int j = 0; j < check.fplen; ++j)
						destNotChanged = destNotChanged && lastWrittenTo[inst.dest + 32 + j] < check.index;

					bool destNotRead = true;
					for (int j = 0; j < check.fplen; ++j)
						destNotRead = destNotRead && lastReadFrom[inst.dest + 32 + j] <= check.index;

					if (!check.readByExit && destNotChanged && destNotRead) {
						_dbg_assert_(insts[check.index].dest == inst.src1);
						insts[check.index].dest = inst.dest;
						for (int j = 0; j < check.fplen; ++j)
							lastWrittenTo[inst.dest + 32 + j] = check.index;
						// If it's being read from (by inst now), we can't optimize out.
						check.reg = 0;
						// Swap the dest and src1 so we can optimize this out later, maybe.
						std::swap(inst.dest, inst.src1);
					} else {
						// Doesn't look like a good candidate.
						check.reg = 0;
					}
				} else {
					// Legitimately read from, so we can't optimize out.
					if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly)
						check.reg = 0;
				}
			} else if (check.readByExit && (inst.m.flags & IRFLAG_EXIT) != 0) {
				// This is an exit, and the reg is read by any exit. Clear it.
				check.reg = 0;
			} else if (IRDestGPR(inst) == check.reg) {
				// Clobbered, we can optimize out.
				// This happens sometimes with temporaries used for constant addresses.
				insts[check.index].op = IROp::Mov;
				insts[check.index].dest = 0;
				insts[check.index].src1 = 0;
				check.reg = 0;
			} else if (IRWritesToFPR(inst, check.reg - 32) && check.fplen >= 1) {
				IRReg destFPRs[4];
				int numFPRs = IRDestFPRs(inst, destFPRs);

				if (numFPRs == check.fplen && inst.dest + 32 == check.reg) {
					// This means we've clobbered it, and with full overlap.
					// Sometimes this happens for non-temps, i.e. vmmov + vinit last row.
					insts[check.index].op = IROp::Mov;
					insts[check.index].dest = 0;
					insts[check.index].src1 = 0;
					check.reg = 0;
				} else {
					// Since there's an overlap, we simply cannot optimize.
					check.reg = 0;
				}
			}
		}

		int dest = IRDestGPR(inst);
		switch (dest) {
		case IRTEMP_0:
		case IRTEMP_1:
		case IRTEMP_2:
		case IRTEMP_3:
		case IRTEMP_LHS:
		case IRTEMP_RHS:
		case IRTEMP_LR_ADDR:
		case IRTEMP_LR_VALUE:
		case IRTEMP_LR_MASK:
		case IRTEMP_LR_SHIFT:
			// Check that it's not a barrier instruction (like CallReplacement). Don't want to even consider optimizing those.
			if (!(inst.m.flags & IRFLAG_BARRIER)) {
				// Unlike other registers, these don't need to persist between blocks.
				// So we consider them not read unless proven read.
				lastWrittenTo[dest] = i;
				// If this is a copy, we might be able to optimize out the copy.
				if (inst.op == IROp::Mov) {
					Check check(dest, i, false);
					check.srcReg = inst.src1;
					checks.push_back(check);
				} else {
					checks.push_back(Check(dest, i, false));
				}
			} else {
				lastWrittenTo[dest] = i;
			}
			break;

		default:
			lastWrittenTo[dest] = i;
			if (dest > IRTEMP_LR_SHIFT) {
				// These might sometimes be implicitly read/written by other instructions.
				break;
			}
			checks.push_back(Check(dest, i, true));
			break;

		// Not a GPR output.
		case 0:
		case -1:
			break;
		}

		IRReg regs[16];
		int readGPRs = IRReadsFromGPRs(inst, regs);
		if (readGPRs == -1) {
			for (int j = 0; j < 256; ++j)
				lastReadFrom[j] = i;
		} else {
			for (int j = 0; j < readGPRs; ++j)
				lastReadFrom[regs[j]] = i;
		}

		int readFPRs = IRReadsFromFPRs(inst, regs);
		if (readFPRs == -1) {
			for (int j = 0; j < 256; ++j)
				lastReadFrom[j] = i;
		} else {
			for (int j = 0; j < readFPRs; ++j)
				lastReadFrom[regs[j] + 32] = i;
		}

		int destFPRs = IRDestFPRs(inst, regs);
		for (int j = 0; j < destFPRs; ++j)
			lastWrittenTo[regs[j] + 32] = i;

		dest = destFPRs > 0 ? regs[0] + 32 : -1;
		if (dest >= 32 && dest < IRTEMP_0) {
			// Standard FPU or VFPU reg.
			Check check(dest, i, true);
			check.fplen = (int8_t)destFPRs;
			checks.push_back(check);
		} else if (dest >= IRVTEMP_PFX_S + 32 && dest < IRVTEMP_PFX_S + 32 + 16) {
			// These are temporary regs and not read by exits.
			Check check(dest, i, false);
			check.fplen = (int8_t)destFPRs;
			if (inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) {
				check.srcReg = inst.src1 + 32;
			}
			checks.push_back(check);
		} else if (dest != -1) {
			_assert_msg_(false, "Unexpected FPR output %d", dest);
		}

		insts.push_back(inst);
	}

	// Since we're done with the instructions, all remaining can be nuked.
	for (Check &check : checks) {
		if (!check.readByExit && check.reg > 0) {
			insts[check.index].op = IROp::Mov;
			insts[check.index].dest = 0;
			insts[check.index].src1 = 0;
		}
	}

	for (const IRInstMeta &inst : insts) {
		// Simply skip any Mov 0, 0 instructions, since that's how we nuke one.
		if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) {
			out.Write(inst.i);
		}
	}

	return logBlocks;
}

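// Example of the narrowing ReduceLoads below performs (hypothetical
// registers): given
//   Load32   t0, a0, 0
//   AndConst t0, t0, 0xFF
// the load shrinks to "Load8 t0, a0, 0", and since the mask exactly matches
// the new load width, the AndConst itself is skipped via nextSkip.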
bool ReduceLoads(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) {
			int dest = IRDestGPR(GetIRMeta(inst));
			for (int j = i + 1; j < n; j++) {
				const IRInstMeta laterInst = GetIRMeta(in.GetInstructions()[j]);

				if ((laterInst.m.flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) {
					// Exit, so we can't do the optimization.
					break;
				}
				if (IRReadsFromGPR(laterInst, dest)) {
					if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) {
						const u32 mask = laterInst.constant;
						// Here we are, maybe we can reduce the load size based on the mask.
						if ((mask & 0xffffff00) == 0) {
							inst.op = IROp::Load8;
							if (mask == 0xff) {
								nextSkip = j;
							}
						} else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) {
							inst.op = IROp::Load16;
							if (mask == 0xffff) {
								nextSkip = j;
							}
						}
					}
					// If it was read, we can't do the optimization.
					break;
				}
				if (IRDestGPR(laterInst) == dest) {
					// Someone else wrote, so we can't do the optimization.
					break;
				}
			}
		}

		if (i != nextSkip) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

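// Illustrative effect of the reordering helpers below (hypothetical offsets):
// a run like "Load32 t0, a0, 8; Load32 t1, a0, 0; Load32 t2, a0, 4" is
// stable-sorted by offset into 0, 4, 8 so that later passes and backends see
// sequential addresses they can combine more easily.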
static std::vector<IRInst> ReorderLoadStoreOps(std::vector<IRInst> &ops) {
	if (ops.size() < 2) {
		return ops;
	}

	bool modifiedRegs[256] = {};

	for (size_t i = 0, n = ops.size(); i < n - 1; ++i) {
		bool modifiesReg = false;
		bool usesFloatReg = false;
		switch (ops[i].op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			modifiesReg = true;
			if (ops[i].src1 == ops[i].dest) {
				// Can't ever reorder these, since it changes.
				continue;
			}
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			break;

		case IROp::LoadFloat:
		case IROp::LoadVec4:
			usesFloatReg = true;
			modifiesReg = true;
			break;

		case IROp::StoreFloat:
		case IROp::StoreVec4:
			usesFloatReg = true;
			break;

		default:
			continue;
		}

		memset(modifiedRegs, 0, sizeof(modifiedRegs));
		size_t start = i;
		size_t j;
		for (j = i; j < n; ++j) {
			if (ops[start].op != ops[j].op || ops[start].src1 != ops[j].src1) {
				// Incompatible ops, so let's not reorder.
				break;
			}
			if (modifiedRegs[ops[j].dest] || (!usesFloatReg && modifiedRegs[ops[j].src1])) {
				// Can't reorder, this reg was modified.
				break;
			}
			if (modifiesReg) {
				// Modifies itself, can't reorder this.
				if (!usesFloatReg && ops[j].dest == ops[j].src1) {
					break;
				}
				modifiedRegs[ops[j].dest] = true;
			}

			// Keep going, these operations are compatible.
		}

		// Everything up to (but not including) j will be sorted, so skip them.
		i = j - 1;
		size_t end = j;
		if (start + 1 < end) {
			std::stable_sort(ops.begin() + start, ops.begin() + end, [&](const IRInst &a, const IRInst &b) {
				return a.constant < b.constant;
			});
		}
	}

	return ops;
}

bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	enum class RegState : u8 {
		UNUSED = 0,
		READ = 1,
		CHANGED = 2,
	};

	bool queuing = false;
	std::vector<IRInst> loadStoreQueue;
	std::vector<IRInst> otherQueue;
	RegState otherRegs[256] = {};

	auto flushQueue = [&]() {
		if (!queuing) {
			return;
		}

		std::vector<IRInst> loadStoreUnsorted = loadStoreQueue;
		std::vector<IRInst> loadStoreSorted = ReorderLoadStoreOps(loadStoreQueue);
		if (memcmp(&loadStoreSorted[0], &loadStoreUnsorted[0], sizeof(IRInst) * loadStoreSorted.size()) != 0) {
			logBlocks = true;
		}

		queuing = false;
		for (IRInst queued : loadStoreSorted) {
			out.Write(queued);
		}
		for (IRInst queued : otherQueue) {
			out.Write(queued);
		}
		loadStoreQueue.clear();
		otherQueue.clear();
		memset(otherRegs, 0, sizeof(otherRegs));
	};

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			// To move a load up, its dest can't be changed by things we move down.
			if (otherRegs[inst.dest] != RegState::UNUSED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			// A store can move above even if it's read, as long as it's not changed by the other ops.
			if (otherRegs[inst.src3] == RegState::CHANGED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::LoadVec4:
		case IROp::LoadFloat:
		case IROp::StoreVec4:
		case IROp::StoreFloat:
			// Floats can always move as long as their address is safe.
			if (otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
		case IROp::MovZ:
		case IROp::MovNZ:
		case IROp::Max:
		case IROp::Min:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
		case IROp::Mov:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::SetConst:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MfHi:
		case IROp::MfLo:
		case IROp::FpCondToReg:
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MtHi:
		case IROp::MtLo:
		case IROp::FpCondFromReg:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Nop:
		case IROp::Downcount:
			if (queuing) {
				// These are freebies. Sometimes helps with delay slots.
				otherQueue.push_back(inst);
			} else {
				out.Write(inst);
			}
			break;

		default:
			flushQueue();
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

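// Example of the merging done by MergeLoadStore below (hypothetical regs):
// four adjacent byte stores of the zero register,
//   Store8 0, a0, 0 / Store8 0, a0, 1 / Store8 0, a0, 2 / Store8 0, a0, 3
// become a single "Store32 0, a0, 0" when unaligned stores are allowed.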
bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	auto opsCompatible = [&](const IRInst &a, const IRInst &b, int dist) {
		if (a.op != b.op || a.src1 != b.src1) {
			// Not similar enough at all.
			return false;
		}
		u32 off1 = a.constant;
		u32 off2 = b.constant;
		if (off1 + dist != off2) {
			// Not immediately sequential.
			return false;
		}

		return true;
	};

	IRInst prev = { IROp::Nop };
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		int c = 0;
		switch (inst.op) {
		case IROp::Store8:
			for (c = 1; c < 4 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c)) {
					break;
				}
			}
			if ((c == 2 || c == 3) && opts.unalignedLoadStore) {
				inst.op = IROp::Store16;
				out.Write(inst);
				prev = inst;
				// Skip the next one (the 3rd will be separate.)
				++i;
				continue;
			}
			if (c == 4 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip all 4.
				i += 3;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Store16:
			for (c = 1; c < 2 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c * 2)) {
					break;
				}
			}
			if (c == 2 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip the next one.
				++i;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Load32:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load. This is sadly common in minis.
				if (prev.op == IROp::Store32 && prev.src3 == inst.dest) {
					// Even the same reg, a volatile variable? Skip it.
					continue;
				}

				// Store16 and Store8 in rare cases happen... could be made AndConst, but not worth the trouble.
				if (prev.op == IROp::Store32) {
					inst.op = IROp::Mov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMovToGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::LoadFloat:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load, of a float.
				if (prev.op == IROp::StoreFloat && prev.src3 == inst.dest) {
					// Volatile float, I suppose?
					continue;
				}

				if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::Store32) {
					inst.op = IROp::FMovFromGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		default:
			out.Write(inst);
			prev = inst;
			break;
		}
	}
	return logBlocks;
}

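// ApplyMemoryValidation below inserts ValidateAddress* ops ahead of each
// memory access when fast memory is off. Example (hypothetical block): if
// every sp-relative word access falls in a small non-negative range and sp
// is never rewritten mid-block, the whole range is validated once up front
// (lowest word and highest word) instead of once per access.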
struct IRMemoryOpInfo {
	int size;
	bool isWrite;
	bool isWordLR;
};

static IRMemoryOpInfo IROpMemoryAccessSize(IROp op) {
	// Assumes all take src1 + constant.
	switch (op) {
	case IROp::Load8:
	case IROp::Load8Ext:
	case IROp::Store8:
		return { 1, op == IROp::Store8 };

	case IROp::Load16:
	case IROp::Load16Ext:
	case IROp::Store16:
		return { 2, op == IROp::Store16 };

	case IROp::Load32:
	case IROp::Load32Linked:
	case IROp::LoadFloat:
	case IROp::Store32:
	case IROp::Store32Conditional:
	case IROp::StoreFloat:
		return { 4, op == IROp::Store32 || op == IROp::Store32Conditional || op == IROp::StoreFloat };

	case IROp::LoadVec4:
	case IROp::StoreVec4:
		return { 16, op == IROp::StoreVec4 };

	case IROp::Load32Left:
	case IROp::Load32Right:
	case IROp::Store32Left:
	case IROp::Store32Right:
		// This explicitly does not require alignment, so validate as an 8-bit operation.
		return { 1, op == IROp::Store32Left || op == IROp::Store32Right, true };

	default:
		return { 0 };
	}
}

bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	if (g_Config.bFastMemory)
		DISABLE;

	int spLower = 0;
	int spUpper = -1;
	bool spWrite = false;
	bool spModified = false;
	for (IRInst inst : in.GetInstructions()) {
		IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
		// Note: we only combine word aligned accesses.
		if (info.size != 0 && inst.src1 == MIPS_REG_SP && info.size == 4) {
			if (spModified) {
				// No good, it was modified and then we did more accesses. Can't combine.
				spUpper = -1;
				break;
			}
			if ((int)inst.constant < 0 || (int)inst.constant >= 0x4000) {
				// Let's assume this might cross boundaries or something. Uncommon.
				spUpper = -1;
				break;
			}

			spLower = std::min(spLower, (int)inst.constant);
			spUpper = std::max(spUpper, (int)inst.constant + info.size);
			spWrite = spWrite || info.isWrite;
		}

		const IRMeta *m = GetIRMeta(inst.op);
		if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0 && inst.dest == MIPS_REG_SP) {
			// We only care if it changes after we start combining.
			spModified = spUpper != -1;
		}
	}

	bool skipSP = spUpper != -1;
	bool flushedSP = false;

	std::map<uint64_t, uint8_t> checks;
	const auto addValidate = [&](IROp validate, uint8_t sz, const IRInst &inst, bool isStore) {
		if (inst.src1 == MIPS_REG_SP && skipSP && validate == IROp::ValidateAddress32) {
			if (!flushedSP) {
				out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spLower);
				if (spUpper > spLower + 4)
					out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spUpper - 4);
				flushedSP = true;
			}
			return;
		}

		uint64_t key = ((uint64_t)inst.src1 << 32) | inst.constant;
		auto it = checks.find(key);
		if (it == checks.end() || it->second < sz) {
			out.Write(validate, 0, inst.src1, isStore ? 1U : 0U, inst.constant);
			checks[key] = sz;
		}
	};
	bool logBlocks = false;
	for (IRInst inst : in.GetInstructions()) {
		IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
		IROp validateOp = IROp::Nop;
		switch (info.size) {
		case 1: validateOp = IROp::ValidateAddress8; break;
		case 2: validateOp = IROp::ValidateAddress16; break;
		case 4: validateOp = IROp::ValidateAddress32; break;
		case 16: validateOp = IROp::ValidateAddress128; break;
		case 0: break;
		default: _assert_msg_(false, "Unexpected memory access size");
		}

		if (validateOp != IROp::Nop) {
			addValidate(validateOp, info.size, inst, info.isWrite);
		}

		const IRMeta *m = GetIRMeta(inst.op);
		if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0) {
			uint64_t key = (uint64_t)inst.dest << 32;
			// Wipe out all the already done checks since this was modified.
			checks.erase(checks.lower_bound(key), checks.upper_bound(key | 0xFFFFFFFFULL));
		}

		// Always write out the original. We're only adding.
		out.Write(inst);
	}
	return logBlocks;
}

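// Example of the rewrite ReduceVec4Flush below performs (hypothetical regs):
// when a register quad is live as a Vec4, a scalar "SetConstF f1, 1.0" is
// turned into
//   Vec4Init  vtemp, AllONE
//   Vec4Blend f0..f3, f0..f3, vtemp, 0b0010
// so the quad stays in a SIMD register instead of being flushed to scalars.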
bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// Only do this when using a SIMD backend.
	if (!opts.preferVec4) {
		DISABLE;
	}

	bool isVec4[256]{};
	bool isUsed[256]{};
	bool isVec4Dirty[256]{};
	auto updateVec4 = [&](char type, IRReg r) {
		bool downgraded = false;
		switch (type) {
		case 'F':
			downgraded = isVec4[r & ~3];
			isVec4[r & ~3] = false;
			isUsed[r] = true;
			break;

		case 'V':
			_dbg_assert_((r & 3) == 0);
			isVec4[r] = true;
			for (int i = 0; i < 4; ++i)
				isUsed[r + i] = true;
			break;

		case '2':
			downgraded = isVec4[r & ~3];
			isVec4[r & ~3] = false;
			for (int i = 0; i < 2; ++i)
				isUsed[r + i] = true;
			break;

		default:
			break;
		}

		return downgraded;
	};
	auto updateVec4Dest = [&](char type, IRReg r, uint32_t flags) {
		if ((flags & IRFLAG_SRC3) == 0) {
			switch (type) {
			case 'F':
				isVec4Dirty[r & ~3] = false;
				break;

			case 'V':
				_dbg_assert_((r & 3) == 0);
				isVec4Dirty[r] = true;
				break;

			case '2':
				isVec4Dirty[r & ~3] = false;
				break;

			default:
				break;
			}
		}
		return updateVec4(type, r);
	};

	// Checks overlap from r1 to other params.
	auto overlapped = [](IRReg r1, int l1, IRReg r2, int l2, IRReg r3 = IRREG_INVALID, int l3 = 0) {
		if (r1 < r2 + l2 && r1 + l1 > r2)
			return true;
		if (r1 < r3 + l3 && r1 + l1 > r3)
			return true;
		return false;
	};

	bool logBlocks = false;
	int inCount = (int)in.GetInstructions().size();
	for (int i = 0; i < inCount; ++i) {
		IRInst inst = in.GetInstructions()[i];
		const IRMeta *m = GetIRMeta(inst.op);

		if ((m->flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) {
			memset(isVec4, 0, sizeof(isVec4));
			out.Write(inst);
			continue;
		}

		IRReg temp = IRREG_INVALID;
		auto findAvailTempVec4 = [&]() {
			// If it's not used yet in this block, we can use it.
			// Note: even if the instruction uses it to write, that should be fine.
			for (IRReg r = IRVTEMP_PFX_S; r < IRVTEMP_0 + 4; r += 4) {
				if (isUsed[r])
					continue;

				bool usable = true;
				for (int j = 1; j < 4; ++j)
					usable = usable && !isUsed[r + j];

				if (usable) {
					temp = r;
					// We don't update isUsed because our temporary doesn't need to last.
					return true;
				}
			}

			return false;
		};

		auto usedLaterAsVec4 = [&](IRReg r) {
			for (int j = i + 1; j < inCount; ++j) {
				IRInst inst = in.GetInstructions()[j];
				const IRMeta *m = GetIRMeta(inst.op);
				if (m->types[0] == 'V' && inst.dest == r)
					return true;
				if (m->types[1] == 'V' && inst.src1 == r)
					return true;
				if (m->types[2] == 'V' && inst.src2 == r)
					return true;
			}
			return false;
		};

		bool skip = false;
		switch (inst.op) {
		case IROp::SetConstF:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				// Check if we're setting multiple in a row, this is a bit common.
				u8 blendMask = 1 << (inst.dest & 3);
				while (i + 1 < inCount) {
					IRInst next = in.GetInstructions()[i + 1];
					if (next.op != IROp::SetConstF || (next.dest & ~3) != (inst.dest & ~3))
						break;
					if (next.constant != inst.constant)
						break;

					blendMask |= 1 << (next.dest & 3);
					i++;
				}

				if (inst.constant == 0) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllZERO);
				} else if (inst.constant == 0x3F800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllONE);

		bool skip = false;
		switch (inst.op) {
		case IROp::SetConstF:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				// Check if we're setting multiple in a row; this is fairly common.
				u8 blendMask = 1 << (inst.dest & 3);
				while (i + 1 < inCount) {
					IRInst next = in.GetInstructions()[i + 1];
					if (next.op != IROp::SetConstF || (next.dest & ~3) != (inst.dest & ~3))
						break;
					if (next.constant != inst.constant)
						break;

					blendMask |= 1 << (next.dest & 3);
					i++;
				}

				if (inst.constant == 0) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllZERO);
				} else if (inst.constant == 0x3F800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllONE);
				} else if (inst.constant == 0xBF800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllMinusONE);
				} else {
					out.Write(IROp::SetConstF, temp, out.AddConstant(inst.constant));
					out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				}
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::FMovFromGPR:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				u8 blendMask = 1 << (inst.dest & 3);
				out.Write(IROp::FMovFromGPR, temp, inst.src1);
				out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::LoadFloat:
			if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3) && findAvailTempVec4()) {
				u8 blendMask = 1 << (inst.dest & 3);
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::StoreFloat:
			if (isVec4[inst.src3 & ~3] && isVec4Dirty[inst.src3 & ~3] && usedLaterAsVec4(inst.src3 & ~3) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src3, 0);
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				continue;
			}
			break;

		case IROp::FMov:
			if (isVec4[inst.dest & ~3] && (inst.dest & ~3) == (inst.src1 & ~3)) {
				// Oh, actually a shuffle?
				uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(0, 1, 2, 3);
				uint8_t destShift = (inst.dest & 3) * 2;
				shuffle = (shuffle & ~(3 << destShift)) | ((inst.src1 & 3) << destShift);
				out.Write(IROp::Vec4Shuffle, inst.dest & ~3, inst.dest & ~3, shuffle);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			} else if (isVec4[inst.dest & ~3] && (inst.dest & 3) == (inst.src1 & 3)) {
				// We can turn this directly into a blend, since it's the same lane.
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, inst.src1 & ~3, 1 << (inst.dest & 3));
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			} else if (isVec4[inst.dest & ~3] && isVec4[inst.src1 & ~3] && findAvailTempVec4()) {
				// For this, we'll need a temporary to move to the right lane.
				int lane = inst.src1 & 3;
				uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(lane, lane, lane, lane);
				out.Write(IROp::Vec4Shuffle, temp, inst.src1 & ~3, shuffle);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3));
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::FAdd:
		case IROp::FSub:
		case IROp::FMul:
		case IROp::FDiv:
			if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3)) {
				if (!overlapped(inst.dest & ~3, 4, inst.src1, 1, inst.src2, 1) && findAvailTempVec4()) {
					u8 blendMask = 1 << (inst.dest & 3);
					out.Write(inst.op, temp, inst.src1, inst.src2);
					out.Write(IROp::Vec4Shuffle, temp, temp, 0);
					out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
					updateVec4('F', inst.src1);
					updateVec4('F', inst.src2);
					isVec4Dirty[inst.dest & ~3] = true;
					continue;
				}
			}
			break;
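
		// Overlap example for Vec4Dot below (illustrative, made-up registers):
		// given Vec4Dot f1, f0, f4, the scalar dest f1 lives inside the f0..f3
		// source quad, so writing the result straight to f1 could clobber a
		// source lane mid-operation on a SIMD backend. Instead, the dot goes to
		// a temp quad first and is blended or moved back into place afterwards.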
		case IROp::Vec4Dot:
			if (overlapped(inst.dest, 1, inst.src1, 4, inst.src2, 4) && findAvailTempVec4()) {
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				if (usedLaterAsVec4(inst.dest & ~3)) {
					// Broadcast to other lanes if needed.
					if ((inst.dest & 3) != 0)
						out.Write(IROp::Vec4Shuffle, temp, temp, 0);
					out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3));
					// It's overlapped, so it'll get marked as Vec4 and used anyway.
					isVec4Dirty[inst.dest & ~3] = true;
					inst.dest = IRREG_INVALID;
				} else {
					out.Write(IROp::FMov, inst.dest, temp);
				}
				skip = true;
			}
			break;

		case IROp::Vec4Scale:
			if (overlapped(inst.src2, 1, inst.src1, 4, inst.dest, 4) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src2);
				out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant);
				skip = true;
				inst.src2 = IRREG_INVALID;
			} else if (isVec4[inst.src2 & ~3] && usedLaterAsVec4(inst.src2 & ~3) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src2);
				out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant);
				skip = true;
				inst.src2 = IRREG_INVALID;
			}
			break;

		default:
			break;
		}

		bool downgrade = false;
		if (inst.src1 != IRREG_INVALID && updateVec4(m->types[1], inst.src1))
			downgrade = true;
		if (inst.src2 != IRREG_INVALID && updateVec4(m->types[2], inst.src2))
			downgrade = true;
		if (inst.dest != IRREG_INVALID && updateVec4Dest(m->types[0], inst.dest, m->flags))
			downgrade = true;

		if (downgrade) {
			//WARN_LOG(Log::JIT, "Vec4 downgrade by: %s", m->name);
		}

		if (!skip)
			out.Write(inst);
	}
	return logBlocks;
}
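
// Illustrative example for the pass below (hypothetical registers and offset):
//
//   Store32 a0, a1, 0x30
//   Load32  a0, a1, 0x30
//
// The load reads back exactly the value just stored (same value register, base,
// and offset), so it can be dropped and only the store kept. A StoreVec4
// followed by a matching LoadVec4 is handled the same way.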

// This optimizes away redundant loads after stores, which are more common than
// you might expect.
bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		// Just copy the last instruction.
		if (i == n - 1) {
			out.Write(inst);
			break;
		}

		out.Write(inst);

		IRInst next = in.GetInstructions()[i + 1];
		switch (inst.op) {
		case IROp::Store32:
			if (next.op == IROp::Load32 &&
				next.constant == inst.constant &&
				next.dest == inst.dest &&
				next.src1 == inst.src1) {
				// The upcoming load is completely redundant. Skip it.
				i++;
			}
			break;
		case IROp::StoreVec4:
			if (next.op == IROp::LoadVec4 &&
				next.constant == inst.constant &&
				next.dest == inst.dest &&
				next.src1 == inst.src1) {
				// The upcoming load is completely redundant. These are common in Wipeout.
				// Skip it.
				// NOTE: It looks like vector loads/stores use different register
				// assignments, but there's a union between dest and src3.
				i++;
			}
			break;
		default:
			break;
		}
	}

	return logBlocks;
}

bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	// We also move the downcount to the top so the interpreter can assume that it's there.
	bool foundDowncount = false;
	out.Write(IROp::Downcount);

	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		bool last = i == n - 1;

		// Specialize some instructions.
		switch (inst.op) {
		case IROp::Downcount:
			if (!foundDowncount) {
				// Move the value into the initial Downcount.
				foundDowncount = true;
				out.ReplaceConstant(0, inst.constant);
			} else {
				// Already had a downcount. Let's just re-emit it.
				out.Write(inst);
			}
			break;
		case IROp::AddConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptAddConst;
			}
			out.Write(inst);
			break;
		case IROp::AndConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptAndConst;
			}
			out.Write(inst);
			break;
		case IROp::OrConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptOrConst;
			}
			out.Write(inst);
			break;
		case IROp::FMovToGPR:
			if (!last) {
				IRInst next = in.GetInstructions()[i + 1];
				if (next.op == IROp::ShrImm && next.src2 == 8 && next.src1 == next.dest && next.src1 == inst.dest) {
					// Heavily used when writing display lists.
					inst.op = IROp::OptFMovToGPRShr8;
					i++;  // Skip the next instruction.
				}
			}
			out.Write(inst);
			break;
		case IROp::FMovFromGPR:
			if (!last) {
				IRInst next = in.GetInstructions()[i + 1];
				if (next.op == IROp::FCvtSW && next.src1 == inst.dest && next.dest == inst.dest) {
					inst.op = IROp::OptFCvtSWFromGPR;
					i++;  // Skip the next instruction.
				}
			}
			out.Write(inst);
			break;
		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}
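
// Illustrative before/after for OptimizeForInterpreter (made-up registers): the
// interpreter assumes every block starts with a Downcount, so one is written up
// front and the first real Downcount's constant is folded into it. In-place ops
// and common pairs are also fused into specialized forms, e.g.:
//
//   AddConst  a0, a0, 0x10                ->  OptAddConst a0, 0x10
//   FMovToGPR a0, f0  ; ShrImm a0, a0, 8  ->  OptFMovToGPRShr8 a0, f0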