GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/IR/IRPassSimplify.cpp
#include <algorithm>
#include <cstring>
#include <utility>

#include "Common/BitSet.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Log.h"
#include "Core/Config.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
#include "Core/MIPS/IR/IRAnalysis.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/IR/IRPassSimplify.h"
#include "Core/MIPS/IR/IRRegCache.h"

// #define CONDITIONAL_DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }
#define CONDITIONAL_DISABLE
#define DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }

u32 Evaluate(u32 a, u32 b, IROp op) {
	switch (op) {
	case IROp::Add: case IROp::AddConst: return a + b;
	case IROp::Sub: case IROp::SubConst: return a - b;
	case IROp::And: case IROp::AndConst: return a & b;
	case IROp::Or: case IROp::OrConst: return a | b;
	case IROp::Xor: case IROp::XorConst: return a ^ b;
	case IROp::Shr: case IROp::ShrImm: return a >> b;
	case IROp::Sar: case IROp::SarImm: return (s32)a >> b;
	case IROp::Ror: case IROp::RorImm: return (a >> b) | (a << (32 - b));
	case IROp::Shl: case IROp::ShlImm: return a << b;
	case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b);
	case IROp::SltU: case IROp::SltUConst: return (a < b);
	default:
		_assert_msg_(false, "Unable to evaluate two op %d", (int)op);
		return -1;
	}
}

u32 Evaluate(u32 a, IROp op) {
	switch (op) {
	case IROp::Not: return ~a;
	case IROp::Neg: return -(s32)a;
	case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8);
	case IROp::BSwap32: return swap32(a);
	case IROp::Ext8to32: return SignExtend8ToU32(a);
	case IROp::Ext16to32: return SignExtend16ToU32(a);
	case IROp::ReverseBits: return ReverseBits32(a);
	case IROp::Clz: {
		int x = 31;
		int count = 0;
		while (x >= 0 && !(a & (1 << x))) {
			count++;
			x--;
		}
		return count;
	}
	default:
		_assert_msg_(false, "Unable to evaluate one op %d", (int)op);
		return -1;
	}
}

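// Illustrative note (not from the original source): the Evaluate() helpers above are what
// PropagateConstants further down uses to fold operations whose inputs are tracked as
// immediates. For example, if both sources of an Add are known constants, e.g.
//   Add v0, a0, a1       ; with a0 == 0x10 and a1 == 0x20 known
// the pass simply records v0 == 0x30 instead of emitting any arithmetic.
// Register names and values here are only examples.
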
IROp ArithToArithConst(IROp op) {
	switch (op) {
	case IROp::Add: return IROp::AddConst;
	case IROp::Sub: return IROp::SubConst;
	case IROp::And: return IROp::AndConst;
	case IROp::Or: return IROp::OrConst;
	case IROp::Xor: return IROp::XorConst;
	case IROp::Slt: return IROp::SltConst;
	case IROp::SltU: return IROp::SltUConst;
	default:
		_assert_msg_(false, "Invalid ArithToArithConst for op %d", (int)op);
		return (IROp)-1;
	}
}

IROp ShiftToShiftImm(IROp op) {
	switch (op) {
	case IROp::Shl: return IROp::ShlImm;
	case IROp::Shr: return IROp::ShrImm;
	case IROp::Ror: return IROp::RorImm;
	case IROp::Sar: return IROp::SarImm;
	default:
		_assert_msg_(false, "Invalid ShiftToShiftImm for op %d", (int)op);
		return (IROp)-1;
	}
}

bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out, const IROptions &opts) {
	out.Reserve(in.GetInstructions().size());

	if (c == 1) {
		return passes[0](in, out, opts);
	}

	bool logBlocks = false;

	IRWriter temp[2];
	const IRWriter *nextIn = &in;
	IRWriter *nextOut = &temp[1];
	temp[1].Reserve(nextIn->GetInstructions().size());
	for (size_t i = 0; i < c - 1; ++i) {
		if (passes[i](*nextIn, *nextOut, opts)) {
			logBlocks = true;
		}

		temp[0] = std::move(temp[1]);
		nextIn = &temp[0];

		temp[1].Clear();
		temp[1].Reserve(nextIn->GetInstructions().size());
	}

	out.Reserve(nextIn->GetInstructions().size());
	if (passes[c - 1](*nextIn, out, opts)) {
		logBlocks = true;
	}

	return logBlocks;
}

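// Illustrative usage sketch (not part of this file): a caller chains passes roughly like
// this, with the ping-pong temp[] buffers above feeding each pass's output into the next
// pass's input. The particular pass list is just an example.
//
//   static const IRPassFunc passes[] = { &PropagateConstants, &PurgeTemps, &ReorderLoadStore };
//   bool logBlocks = IRApplyPasses(passes, sizeof(passes) / sizeof(passes[0]), in, out, opts);
//
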
bool OptimizeFPMoves(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	IRInst prev{ IROp::Nop };

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::FMovFromGPR:
			//FMovToGPR a0, f12
			//FMovFromGPR f14, a0
			// to
			//FMovToGPR a0, f12
			//FMov f14, f12
			if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) {
				inst.op = IROp::FMov;
				inst.src1 = prev.src1;
				// Skip it entirely if it's just a copy to and back.
				if (inst.dest != inst.src1)
					out.Write(inst);
			} else {
				out.Write(inst);
			}
			break;

		// This will need to scan forward or keep track of more information to be useful.
		// Just doing one isn't worth it.
		/*
		case IROp::LoadVec4:
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, a0, 0x0
			// to
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, sp, 0x30
			if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) {
				inst.constant += prev.constant;
				inst.src1 = prev.src1;
				logBlocks = 1;
			} else {
				goto doDefault;
			}
			out.Write(inst);
			break;
		*/
		default:
			out.Write(inst);
			break;
		}
		prev = inst;
	}
	return logBlocks;
}

// Might be useful later on x86.
bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;
		case IROp::FMul:
		case IROp::FAdd:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::FMov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Vec4Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

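// Illustrative sketch (not from the original source) of the rewrite ThreeOpToTwoOp performs
// for backends that prefer two-operand forms: when the dest differs from both sources,
//   Add v0, a0, a1
// becomes
//   Mov v0, a0
//   Add v0, v0, a1
// Register names are only examples.
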
bool RemoveLoadStoreLeftRight(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	bool letThroughHalves = false;
	if (opts.optimizeForInterpreter) {
		// If we're using the interpreter, which can handle these instructions directly,
		// don't break "half" instructions up.
		// Of course, we still want to combine if possible.
		letThroughHalves = true;
	}

	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; ++i) {
		const IRInst &inst = in.GetInstructions()[i];

		// TODO: Reorder or look ahead to combine?

		auto nextOp = [&]() -> const IRInst &{
			return in.GetInstructions()[i + 1];
		};

		auto combineOpposite = [&](IROp matchOp, int matchOff, IROp replaceOp, int replaceOff) {
			if (i + 1 >= n)
				return false;
			const IRInst &next = nextOp();
			if (next.op != matchOp || next.dest != inst.dest || next.src1 != inst.src1)
				return false;
			if (inst.constant + matchOff != next.constant)
				return false;

			if (opts.unalignedLoadStore) {
				// Write out one unaligned op.
				out.Write(replaceOp, inst.dest, inst.src1, out.AddConstant(inst.constant + replaceOff));
			} else if (replaceOp == IROp::Load32) {
				// We can still combine to a simpler set of two loads.
				// We start by isolating the address and shift amount.

				// IRTEMP_LR_ADDR = rs + imm
				out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant + replaceOff));
				// IRTEMP_LR_SHIFT = (addr & 3) * 8
				out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
				out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
				// IRTEMP_LR_ADDR = addr & 0xfffffffc
				out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
				// IRTEMP_LR_VALUE = low_word, dest = high_word
				out.Write(IROp::Load32, inst.dest, IRTEMP_LR_ADDR, out.AddConstant(0));
				out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(4));

				// Now we just need to adjust and combine dest and IRTEMP_LR_VALUE.
				// inst.dest >>= shift (putting its bits in the right spot.)
				out.Write(IROp::Shr, inst.dest, inst.dest, IRTEMP_LR_SHIFT);
				// We can't shift by 32, so we compromise by shifting twice.
				out.Write(IROp::ShlImm, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, 8);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);

				// At this point the values are aligned, and we just merge.
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);
			} else {
				return false;
			}
			// Skip the next one, replaced.
			i++;
			return true;
		};

		auto addCommonProlog = [&]() {
			// IRTEMP_LR_ADDR = rs + imm
			out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant));
			// IRTEMP_LR_SHIFT = (addr & 3) * 8
			out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
			out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
			// IRTEMP_LR_ADDR = addr & 0xfffffffc (for stores, later)
			out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
			// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR)
			out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(0));
		};
		auto addCommonStore = [&](int off = 0) {
			// RAM(IRTEMP_LR_ADDR) = IRTEMP_LR_VALUE
			out.Write(IROp::Store32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(off));
		};

		switch (inst.op) {
		case IROp::Load32Left:
			if (!combineOpposite(IROp::Load32Right, -3, IROp::Load32, -3)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}

				addCommonProlog();
				// dest &= (0x00ffffff >> shift)
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// IRTEMP_LR_VALUE <<= (24 - shift)
					out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Load32Right:
			if (!combineOpposite(IROp::Load32Left, 3, IROp::Load32, 0)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE >>= shift
				out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// dest &= (0xffffff00 << (24 - shift))
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				// Building display lists sometimes involves a bunch of lwr in a row.
				// We can generate more optimal code by combining.
				bool shiftNeedsReverse = true;
				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					if (shiftNeedsReverse) {
						// IRTEMP_LR_SHIFT = shift again
						out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
						out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
						shiftNeedsReverse = false;
					}
					// IRTEMP_LR_VALUE >>= IRTEMP_LR_SHIFT
					out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Store32Left:
			if (!combineOpposite(IROp::Store32Right, -3, IROp::Store32, -3)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0xffffff00 << shift
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 >> (24 - shift)
				out.Write(IROp::Shr, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		case IROp::Store32Right:
			if (!combineOpposite(IROp::Store32Left, 3, IROp::Store32, 0)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0x00ffffff << (24 - shift)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 << shift
				out.Write(IROp::Shl, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}

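// Illustrative sketch (not from the original source): with opts.unalignedLoadStore set,
// combineOpposite() above turns an adjacent lwl/lwr pair such as
//   Load32Left  v0, a0, 0x3
//   Load32Right v0, a0, 0x0
// into a single unaligned
//   Load32 v0, a0, 0x0
// Register names and offsets are only examples.
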
bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	IRImmRegCache gpr(&out);

	bool logBlocks = false;
	bool skipNextExitToConst = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		bool symmetric = true;
		switch (inst.op) {
		case IROp::SetConst:
			gpr.SetImm(inst.dest, inst.constant);
			break;
		case IROp::SetConstF:
			goto doDefault;

		case IROp::Sub:
			if (gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0 && !gpr.IsImm(inst.src2)) {
				// Morph into a Neg.
				gpr.MapDirtyIn(inst.dest, inst.src2);
				out.Write(IROp::Neg, inst.dest, inst.src2);
				break;
			} else if (inst.src1 == inst.src2) {
				// Seen sometimes, yet another way of producing zero.
				gpr.SetImm(inst.dest, 0);
				break;
			}
#if __cplusplus >= 201703 || _MSC_VER > 1910
			[[fallthrough]];
#endif
		case IROp::Slt:
		case IROp::SltU:
			symmetric = false;
#if __cplusplus >= 201703 || _MSC_VER > 1910
			[[fallthrough]];
#endif
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			// Regularize, for the add/or check below.
			if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) {
				std::swap(inst.src1, inst.src2);
			}
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src2) && gpr.GetImm(inst.src2) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src2)) {
				const u32 imm2 = gpr.GetImm(inst.src2);
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Sub || inst.op == IROp::Or || inst.op == IROp::Xor)) {
					// Add / Sub / Or / Xor with zero is just a Mov. Add / Or are most common.
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2));
				}
			} else if (symmetric && gpr.IsImm(inst.src1)) {
				const u32 imm1 = gpr.GetImm(inst.src1);
				gpr.MapDirtyIn(inst.dest, inst.src2);
				if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or || inst.op == IROp::Xor)) {
					// Add / Or / Xor with zero is just a Mov.
					if (inst.dest != inst.src2)
						out.Write(IROp::Mov, inst.dest, inst.src2);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1));
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
			// ANDing with a constant 0 just sets the dest to 0. Happens when optimizing lwl.
			if (inst.op == IROp::AndConst && inst.constant == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.constant, inst.op));
			} else if (inst.constant == 0 && (inst.op == IROp::AddConst || inst.op == IROp::SubConst || inst.op == IROp::OrConst || inst.op == IROp::XorConst)) {
				// Convert an Add/Sub/Or/Xor with a constant zero to a Mov (just like with reg zero.)
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (inst.dest != inst.src1)
					out.Write(IROp::Mov, inst.dest, inst.src1);
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (gpr.IsImm(inst.src2)) {
				const u8 sa = gpr.GetImm(inst.src2) & 31;
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (sa == 0) {
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa);
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mov:
			if (inst.dest == inst.src1) {
				// Nop
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, gpr.GetImm(inst.src1));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			gpr.MapInIn(inst.src1, inst.src2);
			goto doDefault;

		case IROp::MovZ:
		case IROp::MovNZ:
			gpr.MapInInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::Min:
		case IROp::Max:
			gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::FMovFromGPR:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::FMovToGPR:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MfHi:
		case IROp::MfLo:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MtHi:
		case IROp::MtLo:
			gpr.MapIn(inst.src1);
			goto doDefault;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
		case IROp::Store32Conditional:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::StoreFloat:
		case IROp::StoreVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Linked:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapDirty(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::LoadFloat:
		case IROp::LoadVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		case IROp::Load32Left:
		case IROp::Load32Right:
			if (gpr.IsImm(inst.src1)) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::ValidateAddress8:
		case IROp::ValidateAddress16:
		case IROp::ValidateAddress32:
		case IROp::ValidateAddress128:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Downcount:
		case IROp::SetPCConst:
			goto doDefault;

		case IROp::SetPC:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		// FP-only instructions don't need to flush immediates.
		case IROp::FAdd:
		case IROp::FMul:
			// Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1)
			if (inst.src2 == inst.dest && inst.src1 != inst.src2)
				std::swap(inst.src1, inst.src2);
			out.Write(inst);
			break;

		case IROp::FSub:
		case IROp::FDiv:
		case IROp::FNeg:
		case IROp::FAbs:
		case IROp::FMov:
		case IROp::FRound:
		case IROp::FTrunc:
		case IROp::FCeil:
		case IROp::FFloor:
		case IROp::FCvtSW:
		case IROp::FCvtScaledWS:
		case IROp::FCvtScaledSW:
		case IROp::FSin:
		case IROp::FCos:
		case IROp::FSqrt:
		case IROp::FRSqrt:
		case IROp::FRecip:
		case IROp::FAsin:
			out.Write(inst);
			break;

		case IROp::SetCtrlVFPU:
			gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest);
			goto doDefault;

		case IROp::SetCtrlVFPUReg:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetCtrlVFPU, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapDirtyIn(IRREG_VFPU_CTRL_BASE + inst.dest, inst.src1);
				out.Write(inst);
			}
			break;

		case IROp::SetCtrlVFPUFReg:
			gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest);
			goto doDefault;

		case IROp::FCvtWS:
			// TODO: Actually, this should just use the currently set rounding mode.
			// Move up with FCvtSW when that's implemented.
			gpr.MapIn(IRREG_FCR31);
			out.Write(inst);
			break;

		case IROp::FpCondFromReg:
			gpr.MapDirtyIn(IRREG_FPCOND, inst.src1);
			out.Write(inst);
			break;
		case IROp::FpCondToReg:
			if (gpr.IsImm(IRREG_FPCOND)) {
				gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND));
			} else {
				gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
				out.Write(inst);
			}
			break;
		case IROp::FpCtrlFromReg:
			gpr.MapDirtyIn(IRREG_FCR31, inst.src1);
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;
		case IROp::FpCtrlToReg:
			gpr.MapDirtyInIn(inst.dest, IRREG_FPCOND, IRREG_FCR31);
			goto doDefault;

		case IROp::Vec4Init:
		case IROp::Vec4Mov:
		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
		case IROp::Vec4Dot:
		case IROp::Vec4Scale:
		case IROp::Vec4Shuffle:
		case IROp::Vec4Blend:
		case IROp::Vec4Neg:
		case IROp::Vec4Abs:
		case IROp::Vec4Pack31To8:
		case IROp::Vec4Pack32To8:
		case IROp::Vec2Pack32To16:
		case IROp::Vec4Unpack8To32:
		case IROp::Vec2Unpack16To32:
		case IROp::Vec4DuplicateUpperBitsAndShift1:
		case IROp::Vec2ClampToZero:
		case IROp::Vec4ClampToZero:
			out.Write(inst);
			break;

		case IROp::FCmp:
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;

		case IROp::RestoreRoundingMode:
		case IROp::ApplyRoundingMode:
		case IROp::UpdateRoundingMode:
			goto doDefault;

		case IROp::VfpuCtrlToReg:
			gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
			goto doDefault;

		case IROp::FCmpVfpuBit:
			gpr.MapDirty(IRREG_VFPU_CC);
			goto doDefault;

		case IROp::FCmovVfpuCC:
			gpr.MapIn(IRREG_VFPU_CC);
			goto doDefault;

		case IROp::FCmpVfpuAggregate:
			gpr.MapDirtyIn(IRREG_VFPU_CC, IRREG_VFPU_CC);
			goto doDefault;

		case IROp::ExitToConstIfEq:
		case IROp::ExitToConstIfNeq:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				bool passed = false;
				switch (inst.op) {
				case IROp::ExitToConstIfEq: passed = gpr.GetImm(inst.src1) == gpr.GetImm(inst.src2); break;
				case IROp::ExitToConstIfNeq: passed = gpr.GetImm(inst.src1) != gpr.GetImm(inst.src2); break;
				default: _assert_(false); break;
				}

				// This is a bit common for the first cycle of loops.
				// Reduce bloat by skipping on fail, and const exit on pass.
				if (passed) {
					gpr.FlushAll();
					out.Write(IROp::ExitToConst, out.AddConstant(inst.constant));
					skipNextExitToConst = true;
				}
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToConstIfGtZ:
		case IROp::ExitToConstIfGeZ:
		case IROp::ExitToConstIfLtZ:
		case IROp::ExitToConstIfLeZ:
			if (gpr.IsImm(inst.src1)) {
				bool passed = false;
				switch (inst.op) {
				case IROp::ExitToConstIfGtZ: passed = (s32)gpr.GetImm(inst.src1) > 0; break;
				case IROp::ExitToConstIfGeZ: passed = (s32)gpr.GetImm(inst.src1) >= 0; break;
				case IROp::ExitToConstIfLtZ: passed = (s32)gpr.GetImm(inst.src1) < 0; break;
				case IROp::ExitToConstIfLeZ: passed = (s32)gpr.GetImm(inst.src1) <= 0; break;
				default: _assert_(false); break;
				}

				if (passed) {
					gpr.FlushAll();
					out.Write(IROp::ExitToConst, out.AddConstant(inst.constant));
					skipNextExitToConst = true;
				}
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToConst:
			if (skipNextExitToConst) {
				skipNextExitToConst = false;
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToReg:
			if (gpr.IsImm(inst.src1)) {
				// This happens sometimes near loops.
				// Prefer ExitToConst to allow block linking.
				u32 dest = gpr.GetImm(inst.src1);
				gpr.FlushAll();
				out.Write(IROp::ExitToConst, out.AddConstant(dest));
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::CallReplacement:
		case IROp::Break:
		case IROp::Syscall:
		case IROp::Interpret:
		case IROp::ExitToConstIfFpFalse:
		case IROp::ExitToConstIfFpTrue:
		case IROp::Breakpoint:
		case IROp::MemoryCheck:
		default:
		{
			gpr.FlushAll();
		doDefault:
			out.Write(inst);
			break;
		}
		}
	}
	gpr.FlushAll();
	return logBlocks;
}

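// Illustrative before/after (not from the original source) of what PropagateConstants can
// do to a constant-address access:
//   SetConst a0, 0x08400000
//   AddConst a0, a0, 0x10
//   Load32   v0, a0, 0x0
// With a0 tracked as an immediate (and only flushed when actually needed), the load is
// rewritten against register 0 with the folded constant:
//   Load32   v0, r0, 0x08400010
// Register names and constants are only examples.
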
IRInstMeta IRReplaceSrcGPR(const IRInstMeta &inst, int fromReg, int toReg) {
	IRInstMeta newInst = inst;

	if (inst.m.types[1] == 'G' && inst.src1 == fromReg) {
		newInst.src1 = toReg;
	}
	if (inst.m.types[2] == 'G' && inst.src2 == fromReg) {
		newInst.src2 = toReg;
	}
	if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && inst.m.types[0] == 'G' && inst.src3 == fromReg) {
		newInst.src3 = toReg;
	}
	return newInst;
}

IRInstMeta IRReplaceDestGPR(const IRInstMeta &inst, int fromReg, int toReg) {
	IRInstMeta newInst = inst;

	if ((inst.m.flags & IRFLAG_SRC3) == 0 && inst.m.types[0] == 'G' && inst.dest == fromReg) {
		newInst.dest = toReg;
	}
	return newInst;
}

bool IRMutatesDestGPR(const IRInstMeta &inst, int reg) {
	return (inst.m.flags & IRFLAG_SRC3DST) != 0 && inst.m.types[0] == 'G' && inst.src3 == reg;
}

bool PurgeTemps(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	std::vector<IRInstMeta> insts;
	insts.reserve(in.GetInstructions().size());

	// We track writes both to rename regs and to purge dead stores.
	struct Check {
		Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) {
		}

		// Register this instruction wrote to.
		int reg;
		// Set to something other than -1 only when the instruction is a Mov; it names the equivalent reg at this point.
		int srcReg = -1;
		// Index into insts for this op.
		int index;
		// Whether the dest reg is read by any Exit.
		bool readByExit;
		int8_t fplen = 0;
	};
	std::vector<Check> checks;
	checks.reserve(insts.size() / 2);

	// This tracks the last index at which each reg was modified.
	int lastWrittenTo[256];
	int lastReadFrom[256];
	memset(lastWrittenTo, -1, sizeof(lastWrittenTo));
	memset(lastReadFrom, -1, sizeof(lastReadFrom));

	auto readsFromFPRCheck = [](IRInstMeta &inst, Check &check, bool *directly) {
		if (check.reg < 32)
			return false;

		bool result = false;
		*directly = true;
		for (int i = 0; i < 4; ++i) {
			bool laneDirectly;
			if (check.fplen >= i + 1 && IRReadsFromFPR(inst, check.reg - 32 + i, &laneDirectly)) {
				result = true;
				if (!laneDirectly) {
					*directly = false;
					break;
				}
			}
		}
		return result;
	};

	bool logBlocks = false;
	size_t firstCheck = 0;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInstMeta inst = GetIRMeta(in.GetInstructions()[i]);

		// It helps to skip through rechecking ones we already discarded.
		for (size_t ch = firstCheck; ch < checks.size(); ++ch) {
			Check &check = checks[ch];
			if (check.reg != 0) {
				firstCheck = ch;
				break;
			}
		}

		// Check if we can optimize by running through all the writes we've previously found.
		for (size_t ch = firstCheck; ch < checks.size(); ++ch) {
			Check &check = checks[ch];
			if (check.reg == 0) {
				// This means we already optimized this or a later inst depends on it.
				continue;
			}

			bool readsDirectly;
			if (IRReadsFromGPR(inst, check.reg, &readsDirectly)) {
				// If this reads from the reg, we either depend on it or we can fold or swap.
				// That's determined below.

				// If this reads and writes the reg (e.g. MovZ, Load32Left), we can't just swap.
				bool mutatesReg = IRMutatesDestGPR(inst, check.reg);
				// If this doesn't directly read (i.e. Interpret), we can't swap.
				bool cannotReplace = !readsDirectly;
				if (!mutatesReg && !cannotReplace && check.srcReg >= 0 && lastWrittenTo[check.srcReg] < check.index) {
					// Replace with the srcReg instead. This happens with non-nice delay slots.
					// We're changing "Mov A, B; Add C, C, A" to "Mov A, B; Add C, C, B" here.
					// srcReg should only be set when it was a Mov.
					inst = IRReplaceSrcGPR(inst, check.reg, check.srcReg);

					// If the Mov modified the same reg as this instruction, we can't optimize from it anymore.
					if (inst.dest == check.reg) {
						check.reg = 0;
						// We can also optimize it out since we've essentially moved now.
						insts[check.index].op = IROp::Mov;
						insts[check.index].dest = 0;
						insts[check.index].src1 = 0;
					}
				} else if (!IRMutatesDestGPR(insts[check.index], check.reg) && inst.op == IROp::Mov && i == check.index + 1) {
					// As long as the previous inst wasn't modifying its dest reg, and this is a Mov, we can swap.
					// We're changing "Add A, B, C; Mov B, A" to "Add B, B, C; Mov A, B" here.

					// This happens with lwl/lwr temps. Replace the original dest.
					insts[check.index] = IRReplaceDestGPR(insts[check.index], check.reg, inst.dest);
					lastWrittenTo[inst.dest] = check.index;
					// If it's being read from (by inst now), we can't optimize out.
					check.reg = 0;
					// Update the read by exit flag to match the new reg.
					check.readByExit = inst.dest < IRTEMP_0 || inst.dest > IRTEMP_LR_SHIFT;
					// And swap the args for this mov, since we changed the other dest. We'll optimize this out later.
					std::swap(inst.dest, inst.src1);
				} else {
					// Legitimately read from, so we can't optimize out.
					// Unless this is an exit and a temp not read directly by the exit.
					if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly)
						check.reg = 0;
				}
			} else if (check.fplen >= 1 && readsFromFPRCheck(inst, check, &readsDirectly)) {
				// If one or the other is a Vec, they must match.
				bool lenMismatch = false;

				auto checkMismatch = [&check, &lenMismatch](IRReg src, char type) {
					int srclen = 1;
					if (type == 'V')
						srclen = 4;
					else if (type == '2')
						srclen = 2;
					else if (type != 'F')
						return;

					if (src + 32 + srclen > check.reg && src + 32 < check.reg + check.fplen) {
						if (src + 32 != check.reg || srclen != check.fplen)
							lenMismatch = true;
					}
				};

				checkMismatch(inst.src1, inst.m.types[1]);
				checkMismatch(inst.src2, inst.m.types[2]);
				if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0)
					checkMismatch(inst.src3, inst.m.types[3]);

				bool cannotReplace = !readsDirectly || lenMismatch;
				if (!cannotReplace && check.srcReg >= 32 && lastWrittenTo[check.srcReg] < check.index) {
					// This is probably not worth doing unless we can get rid of a temp.
					if (!check.readByExit) {
						if (insts[check.index].dest == inst.src1)
							inst.src1 = check.srcReg - 32;
						else if (insts[check.index].dest == inst.src2)
							inst.src2 = check.srcReg - 32;
						else
							_assert_msg_(false, "Unexpected src3 read of FPR");

						// Check if we've clobbered it entirely.
						if (inst.dest == check.reg) {
							check.reg = 0;
							insts[check.index].op = IROp::Mov;
							insts[check.index].dest = 0;
							insts[check.index].src1 = 0;
						}
					} else {
						// Let's not bother.
						check.reg = 0;
					}
				} else if ((inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) && !lenMismatch) {
					// A swap could be profitable if this is a temp, and maybe in other cases.
					// These can happen a lot from mask regs, etc.
					// But make sure no other changes happened between.
					bool destNotChanged = true;
					for (int j = 0; j < check.fplen; ++j)
						destNotChanged = destNotChanged && lastWrittenTo[inst.dest + 32 + j] < check.index;

					bool destNotRead = true;
					for (int j = 0; j < check.fplen; ++j)
						destNotRead = destNotRead && lastReadFrom[inst.dest + 32 + j] <= check.index;

					if (!check.readByExit && destNotChanged && destNotRead) {
						_dbg_assert_(insts[check.index].dest == inst.src1);
						insts[check.index].dest = inst.dest;
						for (int j = 0; j < check.fplen; ++j)
							lastWrittenTo[inst.dest + 32 + j] = check.index;
						// If it's being read from (by inst now), we can't optimize out.
						check.reg = 0;
						// Swap the dest and src1 so we can optimize this out later, maybe.
						std::swap(inst.dest, inst.src1);
					} else {
						// Doesn't look like a good candidate.
						check.reg = 0;
					}
				} else {
					// Legitimately read from, so we can't optimize out.
					if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly)
						check.reg = 0;
				}
			} else if (check.readByExit && (inst.m.flags & IRFLAG_EXIT) != 0) {
				// This is an exit, and the reg is read by any exit. Clear it.
				check.reg = 0;
			} else if (IRDestGPR(inst) == check.reg) {
				// Clobbered, we can optimize out.
				// This happens sometimes with temporaries used for constant addresses.
				insts[check.index].op = IROp::Mov;
				insts[check.index].dest = 0;
				insts[check.index].src1 = 0;
				check.reg = 0;
			} else if (IRWritesToFPR(inst, check.reg - 32) && check.fplen >= 1) {
				IRReg destFPRs[4];
				int numFPRs = IRDestFPRs(inst, destFPRs);

				if (numFPRs == check.fplen && inst.dest + 32 == check.reg) {
					// This means we've clobbered it, and with full overlap.
					// Sometimes this happens for non-temps, i.e. vmmov + vinit last row.
					insts[check.index].op = IROp::Mov;
					insts[check.index].dest = 0;
					insts[check.index].src1 = 0;
					check.reg = 0;
				} else {
					// Since there's an overlap, we simply cannot optimize.
					check.reg = 0;
				}
			}
		}

		int dest = IRDestGPR(inst);
		switch (dest) {
		case IRTEMP_0:
		case IRTEMP_1:
		case IRTEMP_2:
		case IRTEMP_3:
		case IRTEMP_LHS:
		case IRTEMP_RHS:
		case IRTEMP_LR_ADDR:
		case IRTEMP_LR_VALUE:
		case IRTEMP_LR_MASK:
		case IRTEMP_LR_SHIFT:
			// Check that it's not a barrier instruction (like CallReplacement). Don't want to even consider optimizing those.
			if (!(inst.m.flags & IRFLAG_BARRIER)) {
				// Unlike other registers, these don't need to persist between blocks.
				// So we consider them not read unless proven read.
				lastWrittenTo[dest] = i;
				// If this is a copy, we might be able to optimize out the copy.
				if (inst.op == IROp::Mov) {
					Check check(dest, i, false);
					check.srcReg = inst.src1;
					checks.push_back(check);
				} else {
					checks.push_back(Check(dest, i, false));
				}
			} else {
				lastWrittenTo[dest] = i;
			}
			break;

		default:
			lastWrittenTo[dest] = i;
			if (dest > IRTEMP_LR_SHIFT) {
				// These might sometimes be implicitly read/written by other instructions.
				break;
			}
			checks.push_back(Check(dest, i, true));
			break;

		// Not a GPR output.
		case 0:
		case -1:
			break;
		}

		IRReg regs[16];
		int readGPRs = IRReadsFromGPRs(inst, regs);
		if (readGPRs == -1) {
			for (int j = 0; j < 256; ++j)
				lastReadFrom[j] = i;
		} else {
			for (int j = 0; j < readGPRs; ++j)
				lastReadFrom[regs[j]] = i;
		}

		int readFPRs = IRReadsFromFPRs(inst, regs);
		if (readFPRs == -1) {
			for (int j = 0; j < 256; ++j)
				lastReadFrom[j] = i;
		} else {
			for (int j = 0; j < readFPRs; ++j)
				lastReadFrom[regs[j] + 32] = i;
		}

		int destFPRs = IRDestFPRs(inst, regs);
		for (int j = 0; j < destFPRs; ++j)
			lastWrittenTo[regs[j] + 32] = i;

		dest = destFPRs > 0 ? regs[0] + 32 : -1;
		if (dest >= 32 && dest < IRTEMP_0) {
			// Standard FPU or VFPU reg.
			Check check(dest, i, true);
			check.fplen = (int8_t)destFPRs;
			checks.push_back(check);
		} else if (dest >= IRVTEMP_PFX_S + 32 && dest < IRVTEMP_PFX_S + 32 + 16) {
			// These are temporary regs and not read by exits.
			Check check(dest, i, false);
			check.fplen = (int8_t)destFPRs;
			if (inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) {
				check.srcReg = inst.src1 + 32;
			}
			checks.push_back(check);
		} else if (dest != -1) {
			_assert_msg_(false, "Unexpected FPR output %d", dest);
		}

		insts.push_back(inst);
	}

	// Since we're done with the instructions, all remaining can be nuked.
	for (Check &check : checks) {
		if (!check.readByExit && check.reg > 0) {
			insts[check.index].op = IROp::Mov;
			insts[check.index].dest = 0;
			insts[check.index].src1 = 0;
		}
	}

	for (const IRInstMeta &inst : insts) {
		// Simply skip any Mov 0, 0 instructions, since that's how we nuke one.
		if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) {
			out.Write(inst.i);
		}
	}

	return logBlocks;
}

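// Illustrative sketch (not from the original source) of the renaming PurgeTemps aims for:
//   Add IRTEMP_0, a0, a1
//   Mov v0, IRTEMP_0        ; IRTEMP_0 never read again
// can effectively become
//   Add v0, a0, a1
// with the leftover copy turned into a "Mov 0, 0" placeholder that is skipped when writing
// the block back out. Register names are only examples.
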
bool ReduceLoads(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) {
			int dest = IRDestGPR(GetIRMeta(inst));
			for (int j = i + 1; j < n; j++) {
				const IRInstMeta laterInst = GetIRMeta(in.GetInstructions()[j]);

				if ((laterInst.m.flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) {
					// Exit, so we can't do the optimization.
					break;
				}
				if (IRReadsFromGPR(laterInst, dest)) {
					if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) {
						const u32 mask = laterInst.constant;
						// Here we are, maybe we can reduce the load size based on the mask.
						if ((mask & 0xffffff00) == 0) {
							inst.op = IROp::Load8;
							if (mask == 0xff) {
								nextSkip = j;
							}
						} else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) {
							inst.op = IROp::Load16;
							if (mask == 0xffff) {
								nextSkip = j;
							}
						}
					}
					// If it was read, we can't do the optimization.
					break;
				}
				if (IRDestGPR(laterInst) == dest) {
					// Someone else wrote, so we can't do the optimization.
					break;
				}
			}
		}

		if (i != nextSkip) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

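// Illustrative sketch (not from the original source): ReduceLoads narrows a load whose
// result is immediately masked down, e.g.
//   Load32   v0, a0, 0x0
//   AndConst v0, v0, 0xFF
// becomes
//   Load8    v0, a0, 0x0
// and the now-redundant AndConst is skipped. Register names are only examples.
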
static std::vector<IRInst> ReorderLoadStoreOps(std::vector<IRInst> &ops) {
	if (ops.size() < 2) {
		return ops;
	}

	bool modifiedRegs[256] = {};

	for (size_t i = 0, n = ops.size(); i < n - 1; ++i) {
		bool modifiesReg = false;
		bool usesFloatReg = false;
		switch (ops[i].op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			modifiesReg = true;
			if (ops[i].src1 == ops[i].dest) {
				// Can't ever reorder these, since the base register itself changes.
				continue;
			}
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			break;

		case IROp::LoadFloat:
		case IROp::LoadVec4:
			usesFloatReg = true;
			modifiesReg = true;
			break;

		case IROp::StoreFloat:
		case IROp::StoreVec4:
			usesFloatReg = true;
			break;

		default:
			continue;
		}

		memset(modifiedRegs, 0, sizeof(modifiedRegs));
		size_t start = i;
		size_t j;
		for (j = i; j < n; ++j) {
			if (ops[start].op != ops[j].op || ops[start].src1 != ops[j].src1) {
				// Incompatible ops, so let's not reorder.
				break;
			}
			if (modifiedRegs[ops[j].dest] || (!usesFloatReg && modifiedRegs[ops[j].src1])) {
				// Can't reorder, this reg was modified.
				break;
			}
			if (modifiesReg) {
				// Modifies itself, can't reorder this.
				if (!usesFloatReg && ops[j].dest == ops[j].src1) {
					break;
				}
				modifiedRegs[ops[j].dest] = true;
			}

			// Keep going, these operations are compatible.
		}

		// Everything up to (but not including) j will be sorted, so skip them.
		i = j - 1;
		size_t end = j;
		if (start + 1 < end) {
			std::stable_sort(ops.begin() + start, ops.begin() + end, [&](const IRInst &a, const IRInst &b) {
				return a.constant < b.constant;
			});
		}
	}

	return ops;
}

bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	enum class RegState : u8 {
		UNUSED = 0,
		READ = 1,
		CHANGED = 2,
	};

	bool queuing = false;
	std::vector<IRInst> loadStoreQueue;
	std::vector<IRInst> otherQueue;
	RegState otherRegs[256] = {};

	auto flushQueue = [&]() {
		if (!queuing) {
			return;
		}

		std::vector<IRInst> loadStoreUnsorted = loadStoreQueue;
		std::vector<IRInst> loadStoreSorted = ReorderLoadStoreOps(loadStoreQueue);
		if (memcmp(&loadStoreSorted[0], &loadStoreUnsorted[0], sizeof(IRInst) * loadStoreSorted.size()) != 0) {
			logBlocks = true;
		}

		queuing = false;
		for (IRInst queued : loadStoreSorted) {
			out.Write(queued);
		}
		for (IRInst queued : otherQueue) {
			out.Write(queued);
		}
		loadStoreQueue.clear();
		otherQueue.clear();
		memset(otherRegs, 0, sizeof(otherRegs));
	};

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			// To move a load up, its dest can't be changed by things we move down.
			if (otherRegs[inst.dest] != RegState::UNUSED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			// A store can move above even if it's read, as long as it's not changed by the other ops.
			if (otherRegs[inst.src3] == RegState::CHANGED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::LoadVec4:
		case IROp::LoadFloat:
		case IROp::StoreVec4:
		case IROp::StoreFloat:
			// Floats can always move as long as their address is safe.
			if (otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
		case IROp::MovZ:
		case IROp::MovNZ:
		case IROp::Max:
		case IROp::Min:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
		case IROp::Mov:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::SetConst:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MfHi:
		case IROp::MfLo:
		case IROp::FpCondToReg:
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MtHi:
		case IROp::MtLo:
		case IROp::FpCondFromReg:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Nop:
		case IROp::Downcount:
			if (queuing) {
				// These are freebies. Sometimes helps with delay slots.
				otherQueue.push_back(inst);
			} else {
				out.Write(inst);
			}
			break;

		default:
			flushQueue();
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

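// Illustrative sketch (not from the original source): ReorderLoadStore queues runs of
// memory ops and ReorderLoadStoreOps() sorts compatible ones by offset, so
//   Load32 v0, a0, 0x8
//   Load32 v1, a0, 0x0
//   Load32 v2, a0, 0x4
// is emitted in 0x0, 0x4, 0x8 order, which later passes and backends can take better
// advantage of. Register names and offsets are only examples.
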
bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	auto opsCompatible = [&](const IRInst &a, const IRInst &b, int dist) {
		if (a.op != b.op || a.src1 != b.src1) {
			// Not similar enough at all.
			return false;
		}
		u32 off1 = a.constant;
		u32 off2 = b.constant;
		if (off1 + dist != off2) {
			// Not immediately sequential.
			return false;
		}

		return true;
	};

	IRInst prev = { IROp::Nop };
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		int c = 0;
		switch (inst.op) {
		case IROp::Store8:
			for (c = 1; c < 4 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c)) {
					break;
				}
			}
			if ((c == 2 || c == 3) && opts.unalignedLoadStore) {
				inst.op = IROp::Store16;
				out.Write(inst);
				prev = inst;
				// Skip the next one (the 3rd will be separate.)
				++i;
				continue;
			}
			if (c == 4 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip all 4.
				i += 3;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Store16:
			for (c = 1; c < 2 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c * 2)) {
					break;
				}
			}
			if (c == 2 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip the next one.
				++i;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Load32:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load. This is sadly common in minis.
				if (prev.op == IROp::Store32 && prev.src3 == inst.dest) {
					// Even the same reg, a volatile variable? Skip it.
					continue;
				}

				// Store16 and Store8 happen in rare cases... could be made AndConst, but not worth the trouble.
				if (prev.op == IROp::Store32) {
					inst.op = IROp::Mov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMovToGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::LoadFloat:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load, of a float.
				if (prev.op == IROp::StoreFloat && prev.src3 == inst.dest) {
					// Volatile float, I suppose?
					continue;
				}

				if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::Store32) {
					inst.op = IROp::FMovFromGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		default:
			out.Write(inst);
			prev = inst;
			break;
		}
	}
	return logBlocks;
}

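// Illustrative sketch (not from the original source): with opts.unalignedLoadStore set,
// MergeLoadStore combines adjacent byte stores of the zero register, e.g.
//   Store8 r0, a0, 0x0
//   Store8 r0, a0, 0x1
//   Store8 r0, a0, 0x2
//   Store8 r0, a0, 0x3
// into a single
//   Store32 r0, a0, 0x0
// Register names and offsets are only examples.
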
struct IRMemoryOpInfo {
	int size;
	bool isWrite;
	bool isWordLR;
};

static IRMemoryOpInfo IROpMemoryAccessSize(IROp op) {
	// Assumes all take src1 + constant.
	switch (op) {
	case IROp::Load8:
	case IROp::Load8Ext:
	case IROp::Store8:
		return { 1, op == IROp::Store8 };

	case IROp::Load16:
	case IROp::Load16Ext:
	case IROp::Store16:
		return { 2, op == IROp::Store16 };

	case IROp::Load32:
	case IROp::Load32Linked:
	case IROp::LoadFloat:
	case IROp::Store32:
	case IROp::Store32Conditional:
	case IROp::StoreFloat:
		return { 4, op == IROp::Store32 || op == IROp::Store32Conditional || op == IROp::StoreFloat };

	case IROp::LoadVec4:
	case IROp::StoreVec4:
		return { 16, op == IROp::StoreVec4 };

	case IROp::Load32Left:
	case IROp::Load32Right:
	case IROp::Store32Left:
	case IROp::Store32Right:
		// This explicitly does not require alignment, so validate as an 8-bit operation.
		return { 1, op == IROp::Store32Left || op == IROp::Store32Right, true };

	default:
		return { 0 };
	}
}

bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	if (g_Config.bFastMemory)
		DISABLE;

	int spLower = 0;
	int spUpper = -1;
	bool spWrite = false;
	bool spModified = false;
	for (IRInst inst : in.GetInstructions()) {
		IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
		// Note: we only combine word aligned accesses.
		if (info.size != 0 && inst.src1 == MIPS_REG_SP && info.size == 4) {
			if (spModified) {
				// No good, it was modified and then we did more accesses. Can't combine.
				spUpper = -1;
				break;
			}
			if ((int)inst.constant < 0 || (int)inst.constant >= 0x4000) {
				// Let's assume this might cross boundaries or something. Uncommon.
				spUpper = -1;
				break;
			}

			spLower = std::min(spLower, (int)inst.constant);
			spUpper = std::max(spUpper, (int)inst.constant + info.size);
			spWrite = spWrite || info.isWrite;
		}

		const IRMeta *m = GetIRMeta(inst.op);
		if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0 && inst.dest == MIPS_REG_SP) {
			// We only care if it changes after we start combining.
			spModified = spUpper != -1;
		}
	}

	bool skipSP = spUpper != -1;
	bool flushedSP = false;

	std::map<uint64_t, uint8_t> checks;
	const auto addValidate = [&](IROp validate, uint8_t sz, const IRInst &inst, bool isStore) {
		if (inst.src1 == MIPS_REG_SP && skipSP && validate == IROp::ValidateAddress32) {
			if (!flushedSP) {
				out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spLower);
				if (spUpper > spLower + 4)
					out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spUpper - 4);
				flushedSP = true;
			}
			return;
		}

		uint64_t key = ((uint64_t)inst.src1 << 32) | inst.constant;
		auto it = checks.find(key);
		if (it == checks.end() || it->second < sz) {
			out.Write(validate, 0, inst.src1, isStore ? 1U : 0U, inst.constant);
			checks[key] = sz;
		}
	};

	bool logBlocks = false;
	for (IRInst inst : in.GetInstructions()) {
		IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
		IROp validateOp = IROp::Nop;
		switch (info.size) {
		case 1: validateOp = IROp::ValidateAddress8; break;
		case 2: validateOp = IROp::ValidateAddress16; break;
		case 4: validateOp = IROp::ValidateAddress32; break;
		case 16: validateOp = IROp::ValidateAddress128; break;
		case 0: break;
		default: _assert_msg_(false, "Unexpected memory access size");
		}

		if (validateOp != IROp::Nop) {
			addValidate(validateOp, info.size, inst, info.isWrite);
		}

		const IRMeta *m = GetIRMeta(inst.op);
		if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0) {
			uint64_t key = (uint64_t)inst.dest << 32;
			// Wipe out the checks already recorded for this register, since it was just modified.
			checks.erase(checks.lower_bound(key), checks.upper_bound(key | 0xFFFFFFFFULL));
		}

		// Always write out the original. We're only adding.
		out.Write(inst);
	}
	return logBlocks;
}

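// Tries to keep whole VFPU rows (Vec4 registers) in vector form: scalar float ops that
// poke individual lanes of a row currently live as a Vec4 are rewritten into a temp row
// plus Vec4Shuffle/Vec4Blend, so the backend doesn't have to flush the vector to scalars.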
bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// Only do this when using a SIMD backend.
	if (!opts.preferVec4) {
		DISABLE;
	}

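	// Per-register tracking for this block: isVec4 marks the base reg of a row currently
	// treated as a Vec4, isUsed marks any reg referenced so far (used to find free temp rows),
	// and isVec4Dirty marks rows whose value was last written as a whole Vec4.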
	bool isVec4[256]{};
	bool isUsed[256]{};
	bool isVec4Dirty[256]{};
	auto updateVec4 = [&](char type, IRReg r) {
		bool downgraded = false;
		switch (type) {
		case 'F':
			downgraded = isVec4[r & ~3];
			isVec4[r & ~3] = false;
			isUsed[r] = true;
			break;

		case 'V':
			_dbg_assert_((r & 3) == 0);
			isVec4[r] = true;
			for (int i = 0; i < 4; ++i)
				isUsed[r + i] = true;
			break;

		case '2':
			downgraded = isVec4[r & ~3];
			isVec4[r & ~3] = false;
			for (int i = 0; i < 2; ++i)
				isUsed[r + i] = true;
			break;

		default:
			break;
		}

		return downgraded;
	};
	auto updateVec4Dest = [&](char type, IRReg r, uint32_t flags) {
		if ((flags & IRFLAG_SRC3) == 0) {
			switch (type) {
			case 'F':
				isVec4Dirty[r & ~3] = false;
				break;

			case 'V':
				_dbg_assert_((r & 3) == 0);
				isVec4Dirty[r] = true;
				break;

			case '2':
				isVec4Dirty[r & ~3] = false;
				break;

			default:
				break;
			}
		}
		return updateVec4(type, r);
	};

	// Checks overlap from r1 to other params.
	auto overlapped = [](IRReg r1, int l1, IRReg r2, int l2, IRReg r3 = IRREG_INVALID, int l3 = 0) {
		if (r1 < r2 + l2 && r1 + l1 > r2)
			return true;
		if (r1 < r3 + l3 && r1 + l1 > r3)
			return true;
		return false;
	};

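	// Main pass: exits and barriers reset the tracking; other instructions may be rewritten
	// below to keep Vec4 rows intact, and every operand updates the tracking either way.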
	bool logBlocks = false;
	int inCount = (int)in.GetInstructions().size();
	for (int i = 0; i < inCount; ++i) {
		IRInst inst = in.GetInstructions()[i];
		const IRMeta *m = GetIRMeta(inst.op);

		if ((m->flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) {
			memset(isVec4, 0, sizeof(isVec4));
			out.Write(inst);
			continue;
		}

		IRReg temp = IRREG_INVALID;
		auto findAvailTempVec4 = [&]() {
			// If it's not used yet in this block, we can use it.
			// Note: even if the instruction uses it to write, that should be fine.
			for (IRReg r = IRVTEMP_PFX_S; r < IRVTEMP_0 + 4; r += 4) {
				if (isUsed[r])
					continue;

				bool usable = true;
				for (int j = 1; j < 4; ++j)
					usable = usable && !isUsed[r + j];

				if (usable) {
					temp = r;
					// We don't update isUsed because our temporary doesn't need to last.
					return true;
				}
			}

			return false;
		};

		auto usedLaterAsVec4 = [&](IRReg r) {
			for (int j = i + 1; j < inCount; ++j) {
				IRInst inst = in.GetInstructions()[j];
				const IRMeta *m = GetIRMeta(inst.op);
				if (m->types[0] == 'V' && inst.dest == r)
					return true;
				if (m->types[1] == 'V' && inst.src1 == r)
					return true;
				if (m->types[2] == 'V' && inst.src2 == r)
					return true;
			}
			return false;
		};

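		// Per-op specializations. When a case emits a rewritten sequence itself, it either
		// continues (dropping the original instruction) or sets skip so the original isn't
		// re-emitted at the bottom of the loop.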
		bool skip = false;
		switch (inst.op) {
		case IROp::SetConstF:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				// Check if we're setting multiple lanes in a row; this is fairly common.
				u8 blendMask = 1 << (inst.dest & 3);
				while (i + 1 < inCount) {
					IRInst next = in.GetInstructions()[i + 1];
					if (next.op != IROp::SetConstF || (next.dest & ~3) != (inst.dest & ~3))
						break;
					if (next.constant != inst.constant)
						break;

					blendMask |= 1 << (next.dest & 3);
					i++;
				}

				if (inst.constant == 0) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllZERO);
				} else if (inst.constant == 0x3F800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllONE);
				} else if (inst.constant == 0xBF800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllMinusONE);
				} else {
					out.Write(IROp::SetConstF, temp, out.AddConstant(inst.constant));
					out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				}
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::FMovFromGPR:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				u8 blendMask = 1 << (inst.dest & 3);
				out.Write(IROp::FMovFromGPR, temp, inst.src1);
				out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::LoadFloat:
			if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3) && findAvailTempVec4()) {
				u8 blendMask = 1 << (inst.dest & 3);
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::StoreFloat:
			if (isVec4[inst.src3 & ~3] && isVec4Dirty[inst.src3 & ~3] && usedLaterAsVec4(inst.src3 & ~3) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src3, 0);
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				continue;
			}
			break;

		case IROp::FMov:
			if (isVec4[inst.dest & ~3] && (inst.dest & ~3) == (inst.src1 & ~3)) {
				// Oh, actually a shuffle?
				uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(0, 1, 2, 3);
				uint8_t destShift = (inst.dest & 3) * 2;
				shuffle = (shuffle & ~(3 << destShift)) | ((inst.src1 & 3) << destShift);
				out.Write(IROp::Vec4Shuffle, inst.dest & ~3, inst.dest & ~3, shuffle);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			} else if (isVec4[inst.dest & ~3] && (inst.dest & 3) == (inst.src1 & 3)) {
				// We can turn this directly into a blend, since it's the same lane.
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, inst.src1 & ~3, 1 << (inst.dest & 3));
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			} else if (isVec4[inst.dest & ~3] && isVec4[inst.src1 & ~3] && findAvailTempVec4()) {
				// For this, we'll need a temporary to move to the right lane.
				int lane = inst.src1 & 3;
				uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(lane, lane, lane, lane);
				out.Write(IROp::Vec4Shuffle, temp, inst.src1 & ~3, shuffle);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3));
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::FAdd:
		case IROp::FSub:
		case IROp::FMul:
		case IROp::FDiv:
			if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3)) {
				if (!overlapped(inst.dest & ~3, 4, inst.src1, 1, inst.src2, 1) && findAvailTempVec4()) {
					u8 blendMask = 1 << (inst.dest & 3);
					out.Write(inst.op, temp, inst.src1, inst.src2);
					out.Write(IROp::Vec4Shuffle, temp, temp, 0);
					out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
					updateVec4('F', inst.src1);
					updateVec4('F', inst.src2);
					isVec4Dirty[inst.dest & ~3] = true;
					continue;
				}
			}
			break;

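		// Vec4Dot and Vec4Scale read or write a scalar that can overlap their vector operands;
		// route the scalar through a temp so the backend doesn't have to break up the vectors.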
		case IROp::Vec4Dot:
			if (overlapped(inst.dest, 1, inst.src1, 4, inst.src2, 4) && findAvailTempVec4()) {
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				if (usedLaterAsVec4(inst.dest & ~3)) {
					// Broadcast to other lanes if needed.
					if ((inst.dest & 3) != 0)
						out.Write(IROp::Vec4Shuffle, temp, temp, 0);
					out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3));
					// It's overlapped, so it'll get marked as Vec4 and used anyway.
					isVec4Dirty[inst.dest & ~3] = true;
					inst.dest = IRREG_INVALID;
				} else {
					out.Write(IROp::FMov, inst.dest, temp);
				}
				skip = true;
			}
			break;

		case IROp::Vec4Scale:
			if (overlapped(inst.src2, 1, inst.src1, 4, inst.dest, 4) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src2);
				out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant);
				skip = true;
				inst.src2 = IRREG_INVALID;
			} else if (isVec4[inst.src2 & 3] && usedLaterAsVec4(inst.src2 & ~3) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src2);
				out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant);
				skip = true;
				inst.src2 = IRREG_INVALID;
			}
			break;

		default:
			break;
		}

		bool downgrade = false;
		if (inst.src1 != IRREG_INVALID && updateVec4(m->types[1], inst.src1))
			downgrade = true;
		if (inst.src2 != IRREG_INVALID && updateVec4(m->types[2], inst.src2))
			downgrade = true;
		if (inst.dest != IRREG_INVALID && updateVec4Dest(m->types[0], inst.dest, m->flags))
			downgrade = true;

		if (downgrade) {
			//WARN_LOG(Log::JIT, "Vec4 downgrade by: %s", m->name);
		}

		if (!skip)
			out.Write(inst);
	}
	return logBlocks;
}

// This optimizes away redundant loads-after-stores, which turn out to be surprisingly common.
bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		// Just copy the last instruction.
		if (i == n - 1) {
			out.Write(inst);
			break;
		}

		out.Write(inst);

		IRInst next = in.GetInstructions()[i + 1];
		switch (inst.op) {
		case IROp::Store32:
			if (next.op == IROp::Load32 &&
				next.constant == inst.constant &&
				next.dest == inst.dest &&
				next.src1 == inst.src1) {
				// The upcoming load is completely redundant.
				// Skip it.
				i++;
			}
			break;
		case IROp::StoreVec4:
			if (next.op == IROp::LoadVec4 &&
				next.constant == inst.constant &&
				next.dest == inst.dest &&
				next.src1 == inst.src1) {
				// The upcoming load is completely redundant. These are common in Wipeout.
				// Skip it. NOTE: It looks like vector loads/stores use different register assignments, but there's a union between dest and src3.
				i++;
			}
			break;
		default:
			break;
		}
	}

	return logBlocks;
}

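// Interpreter-specific tweaks: hoist the block's Downcount to the very top (the IR
// interpreter assumes it comes first) and fuse common patterns into Opt* pseudo-ops,
// e.g. AddConst/AndConst/OrConst where src and dest are the same register, or an
// FMovToGPR followed by a ShrImm of 8.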
bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	// We also move the downcount to the top so the interpreter can assume that it's there.
	bool foundDowncount = false;
	out.Write(IROp::Downcount);

	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		bool last = i == n - 1;

		// Specialize some instructions.
		switch (inst.op) {
		case IROp::Downcount:
			if (!foundDowncount) {
				// Move the value into the initial Downcount.
				foundDowncount = true;
				out.ReplaceConstant(0, inst.constant);
			} else {
				// Already had a downcount. Let's just re-emit it.
				out.Write(inst);
			}
			break;
		case IROp::AddConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptAddConst;
			}
			out.Write(inst);
			break;
		case IROp::AndConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptAndConst;
			}
			out.Write(inst);
			break;
		case IROp::OrConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptOrConst;
			}
			out.Write(inst);
			break;
		case IROp::FMovToGPR:
			if (!last) {
				IRInst next = in.GetInstructions()[i + 1];
				if (next.op == IROp::ShrImm && next.src2 == 8 && next.src1 == next.dest && next.src1 == inst.dest) {
					// Heavily used when writing display lists.
					inst.op = IROp::OptFMovToGPRShr8;
					i++;  // Skip the next instruction.
				}
			}
			out.Write(inst);
			break;
		case IROp::FMovFromGPR:
			if (!last) {
				IRInst next = in.GetInstructions()[i + 1];
				if (next.op == IROp::FCvtSW && next.src1 == inst.dest && next.dest == inst.dest) {
					inst.op = IROp::OptFCvtSWFromGPR;
					i++;  // Skip the next instruction.
				}
			}
			out.Write(inst);
			break;
		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}