Path: blob/master/Core/MIPS/IR/IRPassSimplify.cpp
#include <algorithm>
#include <cstring>
#include <map>
#include <utility>

#include "Common/BitSet.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Log.h"
#include "Core/Config.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
#include "Core/MIPS/IR/IRAnalysis.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/IR/IRPassSimplify.h"
#include "Core/MIPS/IR/IRRegCache.h"

// #define CONDITIONAL_DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }
#define CONDITIONAL_DISABLE
#define DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }

u32 Evaluate(u32 a, u32 b, IROp op) {
	switch (op) {
	case IROp::Add: case IROp::AddConst: return a + b;
	case IROp::Sub: case IROp::SubConst: return a - b;
	case IROp::And: case IROp::AndConst: return a & b;
	case IROp::Or: case IROp::OrConst: return a | b;
	case IROp::Xor: case IROp::XorConst: return a ^ b;
	case IROp::Shr: case IROp::ShrImm: return a >> b;
	case IROp::Sar: case IROp::SarImm: return (s32)a >> b;
	case IROp::Ror: case IROp::RorImm: return (a >> b) | (a << (32 - b));
	case IROp::Shl: case IROp::ShlImm: return a << b;
	case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b);
	case IROp::SltU: case IROp::SltUConst: return (a < b);
	default:
		_assert_msg_(false, "Unable to evaluate two op %d", (int)op);
		return -1;
	}
}

u32 Evaluate(u32 a, IROp op) {
	switch (op) {
	case IROp::Not: return ~a;
	case IROp::Neg: return -(s32)a;
	case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8);
	case IROp::BSwap32: return swap32(a);
	case IROp::Ext8to32: return SignExtend8ToU32(a);
	case IROp::Ext16to32: return SignExtend16ToU32(a);
	case IROp::ReverseBits: return ReverseBits32(a);
	case IROp::Clz: {
		int x = 31;
		int count = 0;
		while (x >= 0 && !(a & (1 << x))) {
			count++;
			x--;
		}
		return count;
	}
	default:
		_assert_msg_(false, "Unable to evaluate one op %d", (int)op);
		return -1;
	}
}

IROp ArithToArithConst(IROp op) {
	switch (op) {
	case IROp::Add: return IROp::AddConst;
	case IROp::Sub: return IROp::SubConst;
	case IROp::And: return IROp::AndConst;
	case IROp::Or: return IROp::OrConst;
	case IROp::Xor: return IROp::XorConst;
	case IROp::Slt: return IROp::SltConst;
	case IROp::SltU: return IROp::SltUConst;
	default:
		_assert_msg_(false, "Invalid ArithToArithConst for op %d", (int)op);
		return (IROp)-1;
	}
}

IROp ShiftToShiftImm(IROp op) {
	switch (op) {
	case IROp::Shl: return IROp::ShlImm;
	case IROp::Shr: return IROp::ShrImm;
	case IROp::Ror: return IROp::RorImm;
	case IROp::Sar: return IROp::SarImm;
	default:
		_assert_msg_(false, "Invalid ShiftToShiftImm for op %d", (int)op);
		return (IROp)-1;
	}
}

bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out, const IROptions &opts) {
	out.Reserve(in.GetInstructions().size());

	if (c == 1) {
		return passes[0](in, out, opts);
	}

	bool logBlocks = false;

	IRWriter temp[2];
	const IRWriter *nextIn = &in;
	IRWriter *nextOut = &temp[1];
	temp[1].Reserve(nextIn->GetInstructions().size());
	for (size_t i = 0; i < c - 1; ++i) {
		if (passes[i](*nextIn, *nextOut, opts)) {
			logBlocks = true;
		}

		temp[0] = std::move(temp[1]);
		nextIn = &temp[0];

		temp[1].Clear();
		temp[1].Reserve(nextIn->GetInstructions().size());
	}

	out.Reserve(nextIn->GetInstructions().size());
	if (passes[c - 1](*nextIn, out, opts)) {
		logBlocks = true;
	}

	return logBlocks;
}

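// Usage sketch (hypothetical pass list; the real pass arrays live in the
// backends that call this): each pass consumes `in` and appends to `out`,
// and the temp[0]/temp[1] ping-pong above chains them without extra copies.
//
//   static const IRPassFunc passes[] = {
//       &PropagateConstants,
//       &PurgeTemps,
//   };
//   bool log = IRApplyPasses(passes, ARRAY_SIZE(passes), in, out, opts);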
bool OptimizeFPMoves(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	IRInst prev{ IROp::Nop };

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::FMovFromGPR:
			//FMovToGPR a0, f12
			//FMovFromGPR f14, a0
			// to
			//FMovToGPR a0, f12
			//FMov f14, f12
			if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) {
				inst.op = IROp::FMov;
				inst.src1 = prev.src1;
				// Skip it entirely if it's just a copy to and back.
				if (inst.dest != inst.src1)
					out.Write(inst);
			} else {
				out.Write(inst);
			}
			break;

		// This will need to scan forward or keep track of more information to be useful.
		// Just doing one isn't.
		/*
		case IROp::LoadVec4:
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, a0, 0x0
			// to
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, sp, 0x30
			if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) {
				inst.constant += prev.constant;
				inst.src1 = prev.src1;
				logBlocks = 1;
			} else {
				goto doDefault;
			}
			out.Write(inst);
			break;
		*/

		default:
			out.Write(inst);
			break;
		}
		prev = inst;
	}
	return logBlocks;
}

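// Illustrative rewrite done by ThreeOpToTwoOp below (hypothetical registers):
// a three-operand "Add a0, a1, a2" becomes the two-operand friendly pair
//   Mov a0, a1
//   Add a0, a0, a2
// which maps more directly onto x86-style destructive instructions.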
// Might be useful later on x86.
bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;
		case IROp::FMul:
		case IROp::FAdd:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::FMov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Vec4Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

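// RemoveLoadStoreLeftRight below rewrites MIPS lwl/lwr (and swl/swr) pairs.
// Example (hypothetical registers): the classic unaligned-load idiom
//   Load32Left  t0, a0, 3
//   Load32Right t0, a0, 0
// collapses into a single "Load32 t0, a0, 0" when unaligned loads are
// allowed, and otherwise into two aligned loads plus shift/mask merging.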
bool RemoveLoadStoreLeftRight(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	bool letThroughHalves = false;
	if (opts.optimizeForInterpreter) {
		// If we're using the interpreter, which can handle these instructions directly,
		// don't break "half" instructions up.
		// Of course, we still want to combine if possible.
		letThroughHalves = true;
	}

	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; ++i) {
		const IRInst &inst = in.GetInstructions()[i];

		// TODO: Reorder or look ahead to combine?

		auto nextOp = [&]() -> const IRInst & {
			return in.GetInstructions()[i + 1];
		};

		auto combineOpposite = [&](IROp matchOp, int matchOff, IROp replaceOp, int replaceOff) {
			if (i + 1 >= n)
				return false;
			const IRInst &next = nextOp();
			if (next.op != matchOp || next.dest != inst.dest || next.src1 != inst.src1)
				return false;
			if (inst.constant + matchOff != next.constant)
				return false;

			if (opts.unalignedLoadStore) {
				// Write out one unaligned op.
				out.Write(replaceOp, inst.dest, inst.src1, out.AddConstant(inst.constant + replaceOff));
			} else if (replaceOp == IROp::Load32) {
				// We can still combine to a simpler set of two loads.
				// We start by isolating the address and shift amount.

				// IRTEMP_LR_ADDR = rs + imm
				out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant + replaceOff));
				// IRTEMP_LR_SHIFT = (addr & 3) * 8
				out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
				out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
				// IRTEMP_LR_ADDR = addr & 0xfffffffc
				out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
				// IRTEMP_LR_VALUE = low_word, dest = high_word
				out.Write(IROp::Load32, inst.dest, IRTEMP_LR_ADDR, out.AddConstant(0));
				out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(4));

				// Now we just need to adjust and combine dest and IRTEMP_LR_VALUE.
				// inst.dest >>= shift (putting its bits in the right spot.)
				out.Write(IROp::Shr, inst.dest, inst.dest, IRTEMP_LR_SHIFT);
				// We can't shift by 32, so we compromise by shifting twice.
				out.Write(IROp::ShlImm, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, 8);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);

				// At this point the values are aligned, and we just merge.
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);
			} else {
				return false;
			}
			// Skip the next one, replaced.
			i++;
			return true;
		};

		auto addCommonProlog = [&]() {
			// IRTEMP_LR_ADDR = rs + imm
			out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant));
			// IRTEMP_LR_SHIFT = (addr & 3) * 8
			out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
			out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
			// IRTEMP_LR_ADDR = addr & 0xfffffffc (for stores, later)
			out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
			// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR)
			out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(0));
		};
		auto addCommonStore = [&](int off = 0) {
			// RAM(IRTEMP_LR_ADDR) = IRTEMP_LR_VALUE
			out.Write(IROp::Store32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(off));
		};

		switch (inst.op) {
		case IROp::Load32Left:
			if (!combineOpposite(IROp::Load32Right, -3, IROp::Load32, -3)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}

				addCommonProlog();
				// dest &= (0x00ffffff >> shift)
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// IRTEMP_LR_VALUE <<= (24 - shift)
					out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Load32Right:
			if (!combineOpposite(IROp::Load32Left, 3, IROp::Load32, 0)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE >>= shift
				out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// dest &= (0xffffff00 << (24 - shift))
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				// Building display lists sometimes involves a bunch of lwr in a row.
				// We can generate more optimal code by combining.
				bool shiftNeedsReverse = true;
				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					if (shiftNeedsReverse) {
						// IRTEMP_LR_SHIFT = shift again
						out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
						out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
						shiftNeedsReverse = false;
					}
					// IRTEMP_LR_VALUE >>= IRTEMP_LR_SHIFT
					out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;
		case IROp::Store32Left:
			if (!combineOpposite(IROp::Store32Right, -3, IROp::Store32, -3)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0xffffff00 << shift
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 >> (24 - shift)
				out.Write(IROp::Shr, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		case IROp::Store32Right:
			if (!combineOpposite(IROp::Store32Left, 3, IROp::Store32, 0)) {
				if (letThroughHalves) {
					out.Write(inst);
					break;
				}
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0x00ffffff << (24 - shift)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 << shift
				out.Write(IROp::Shl, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}

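// Example of the folding PropagateConstants below performs (hypothetical
// block): given
//   SetConst a0, 0x08000000
//   AddConst a0, a0, 0x10
//   Load32   t0, a0, 0
// the add folds into the cached immediate and the load is emitted against
// register 0 with an absolute constant; a0's final value is flushed as a
// single SetConst only when needed (e.g. at the block exit).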
bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	IRImmRegCache gpr(&out);

	bool logBlocks = false;
	bool skipNextExitToConst = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		bool symmetric = true;
		switch (inst.op) {
		case IROp::SetConst:
			gpr.SetImm(inst.dest, inst.constant);
			break;
		case IROp::SetConstF:
			goto doDefault;

		case IROp::Sub:
			if (gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0 && !gpr.IsImm(inst.src2)) {
				// Morph into a Neg.
				gpr.MapDirtyIn(inst.dest, inst.src2);
				out.Write(IROp::Neg, inst.dest, inst.src2);
				break;
			} else if (inst.src1 == inst.src2) {
				// Seen sometimes, yet another way of producing zero.
				gpr.SetImm(inst.dest, 0);
				break;
			}
#if __cplusplus >= 201703 || _MSC_VER > 1910
			[[fallthrough]];
#endif
		case IROp::Slt:
		case IROp::SltU:
			symmetric = false;
#if __cplusplus >= 201703 || _MSC_VER > 1910
			[[fallthrough]];
#endif
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			// Regularize, for the add/or check below.
			if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) {
				std::swap(inst.src1, inst.src2);
			}
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src2) && gpr.GetImm(inst.src2) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src2)) {
				const u32 imm2 = gpr.GetImm(inst.src2);
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Sub || inst.op == IROp::Or || inst.op == IROp::Xor)) {
					// Add / Sub / Or / Xor with zero is just a Mov. Add / Or are most common.
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2));
				}
			} else if (symmetric && gpr.IsImm(inst.src1)) {
				const u32 imm1 = gpr.GetImm(inst.src1);
				gpr.MapDirtyIn(inst.dest, inst.src2);
				if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or || inst.op == IROp::Xor)) {
					// Add / Or / Xor with zero is just a Mov.
					if (inst.dest != inst.src2)
						out.Write(IROp::Mov, inst.dest, inst.src2);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1));
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
			// And 0 is otherwise set to 0. Happens when optimizing lwl.
			if (inst.op == IROp::AndConst && inst.constant == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.constant, inst.op));
			} else if (inst.constant == 0 && (inst.op == IROp::AddConst || inst.op == IROp::SubConst || inst.op == IROp::OrConst || inst.op == IROp::XorConst)) {
				// Convert an Add/Sub/Or/Xor with a constant zero to a Mov (just like with reg zero.)
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (inst.dest != inst.src1)
					out.Write(IROp::Mov, inst.dest, inst.src1);
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (gpr.IsImm(inst.src2)) {
				const u8 sa = gpr.GetImm(inst.src2) & 31;
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (sa == 0) {
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa);
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mov:
			if (inst.dest == inst.src1) {
				// Nop
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, gpr.GetImm(inst.src1));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			gpr.MapInIn(inst.src1, inst.src2);
			goto doDefault;

		case IROp::MovZ:
		case IROp::MovNZ:
			gpr.MapInInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::Min:
		case IROp::Max:
			gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::FMovFromGPR:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::FMovToGPR:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MfHi:
		case IROp::MfLo:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MtHi:
		case IROp::MtLo:
			gpr.MapIn(inst.src1);
			goto doDefault;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
		case IROp::Store32Conditional:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::StoreFloat:
		case IROp::StoreVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Linked:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapDirty(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::LoadFloat:
		case IROp::LoadVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		case IROp::Load32Left:
		case IROp::Load32Right:
			if (gpr.IsImm(inst.src1)) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::ValidateAddress8:
		case IROp::ValidateAddress16:
		case IROp::ValidateAddress32:
		case IROp::ValidateAddress128:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Downcount:
		case IROp::SetPCConst:
			goto doDefault;

		case IROp::SetPC:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		// FP-only instructions don't need to flush immediates.
		case IROp::FAdd:
		case IROp::FMul:
			// Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1)
			if (inst.src2 == inst.dest && inst.src1 != inst.src2)
				std::swap(inst.src1, inst.src2);
			out.Write(inst);
			break;

		case IROp::FSub:
		case IROp::FDiv:
		case IROp::FNeg:
		case IROp::FAbs:
		case IROp::FMov:
		case IROp::FRound:
		case IROp::FTrunc:
		case IROp::FCeil:
		case IROp::FFloor:
		case IROp::FCvtSW:
		case IROp::FCvtScaledWS:
		case IROp::FCvtScaledSW:
		case IROp::FSin:
		case IROp::FCos:
		case IROp::FSqrt:
		case IROp::FRSqrt:
		case IROp::FRecip:
		case IROp::FAsin:
			out.Write(inst);
			break;

		case IROp::SetCtrlVFPU:
			gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest);
			goto doDefault;

		case IROp::SetCtrlVFPUReg:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetCtrlVFPU, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapDirtyIn(IRREG_VFPU_CTRL_BASE + inst.dest, inst.src1);
				out.Write(inst);
			}
			break;

		case IROp::SetCtrlVFPUFReg:
			gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest);
			goto doDefault;

		case IROp::FCvtWS:
			// TODO: Actually, this should just use the currently set rounding mode.
			// Move up with FCvtSW when that's implemented.
			gpr.MapIn(IRREG_FCR31);
			out.Write(inst);
			break;

		case IROp::FpCondFromReg:
			gpr.MapDirtyIn(IRREG_FPCOND, inst.src1);
			out.Write(inst);
			break;
		case IROp::FpCondToReg:
			if (gpr.IsImm(IRREG_FPCOND)) {
				gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND));
			} else {
				gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
				out.Write(inst);
			}
			break;
		case IROp::FpCtrlFromReg:
			gpr.MapDirtyIn(IRREG_FCR31, inst.src1);
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;
		case IROp::FpCtrlToReg:
			gpr.MapDirtyInIn(inst.dest, IRREG_FPCOND, IRREG_FCR31);
			goto doDefault;

		case IROp::Vec4Init:
		case IROp::Vec4Mov:
		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
		case IROp::Vec4Dot:
		case IROp::Vec4Scale:
		case IROp::Vec4Shuffle:
		case IROp::Vec4Blend:
		case IROp::Vec4Neg:
		case IROp::Vec4Abs:
		case IROp::Vec4Pack31To8:
		case IROp::Vec4Pack32To8:
		case IROp::Vec2Pack32To16:
		case IROp::Vec4Unpack8To32:
		case IROp::Vec2Unpack16To32:
		case IROp::Vec4DuplicateUpperBitsAndShift1:
		case IROp::Vec2ClampToZero:
		case IROp::Vec4ClampToZero:
			out.Write(inst);
			break;

		case IROp::FCmp:
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;

		case IROp::RestoreRoundingMode:
		case IROp::ApplyRoundingMode:
		case IROp::UpdateRoundingMode:
			goto doDefault;

		case IROp::VfpuCtrlToReg:
			gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
			goto doDefault;

		case IROp::FCmpVfpuBit:
			gpr.MapDirty(IRREG_VFPU_CC);
			goto doDefault;

		case IROp::FCmovVfpuCC:
			gpr.MapIn(IRREG_VFPU_CC);
			goto doDefault;

		case IROp::FCmpVfpuAggregate:
			gpr.MapDirtyIn(IRREG_VFPU_CC, IRREG_VFPU_CC);
			goto doDefault;

		case IROp::ExitToConstIfEq:
		case IROp::ExitToConstIfNeq:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				bool passed = false;
				switch (inst.op) {
				case IROp::ExitToConstIfEq: passed = gpr.GetImm(inst.src1) == gpr.GetImm(inst.src2); break;
				case IROp::ExitToConstIfNeq: passed = gpr.GetImm(inst.src1) != gpr.GetImm(inst.src2); break;
				default: _assert_(false); break;
				}

				// This is a bit common for the first cycle of loops.
				// Reduce bloat by skipping on fail, and const exit on pass.
				if (passed) {
					gpr.FlushAll();
					out.Write(IROp::ExitToConst, out.AddConstant(inst.constant));
					skipNextExitToConst = true;
				}
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToConstIfGtZ:
		case IROp::ExitToConstIfGeZ:
		case IROp::ExitToConstIfLtZ:
		case IROp::ExitToConstIfLeZ:
			if (gpr.IsImm(inst.src1)) {
				bool passed = false;
				switch (inst.op) {
				case IROp::ExitToConstIfGtZ: passed = (s32)gpr.GetImm(inst.src1) > 0; break;
				case IROp::ExitToConstIfGeZ: passed = (s32)gpr.GetImm(inst.src1) >= 0; break;
				case IROp::ExitToConstIfLtZ: passed = (s32)gpr.GetImm(inst.src1) < 0; break;
				case IROp::ExitToConstIfLeZ: passed = (s32)gpr.GetImm(inst.src1) <= 0; break;
				default: _assert_(false); break;
				}

				if (passed) {
					gpr.FlushAll();
					out.Write(IROp::ExitToConst, out.AddConstant(inst.constant));
					skipNextExitToConst = true;
				}
				break;
			}
			gpr.FlushAll();
			goto doDefault;
		case IROp::ExitToConst:
			if (skipNextExitToConst) {
				skipNextExitToConst = false;
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::ExitToReg:
			if (gpr.IsImm(inst.src1)) {
				// This happens sometimes near loops.
				// Prefer ExitToConst to allow block linking.
				u32 dest = gpr.GetImm(inst.src1);
				gpr.FlushAll();
				out.Write(IROp::ExitToConst, out.AddConstant(dest));
				break;
			}
			gpr.FlushAll();
			goto doDefault;

		case IROp::CallReplacement:
		case IROp::Break:
		case IROp::Syscall:
		case IROp::Interpret:
		case IROp::ExitToConstIfFpFalse:
		case IROp::ExitToConstIfFpTrue:
		case IROp::Breakpoint:
		case IROp::MemoryCheck:
		default:
		{
			gpr.FlushAll();
		doDefault:
			out.Write(inst);
			break;
		}
		}
	}
	gpr.FlushAll();
	return logBlocks;
}

IRInstMeta IRReplaceSrcGPR(const IRInstMeta &inst, int fromReg, int toReg) {
	IRInstMeta newInst = inst;

	if (inst.m.types[1] == 'G' && inst.src1 == fromReg) {
		newInst.src1 = toReg;
	}
	if (inst.m.types[2] == 'G' && inst.src2 == fromReg) {
		newInst.src2 = toReg;
	}
	if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && inst.m.types[0] == 'G' && inst.src3 == fromReg) {
		newInst.src3 = toReg;
	}
	return newInst;
}

IRInstMeta IRReplaceDestGPR(const IRInstMeta &inst, int fromReg, int toReg) {
	IRInstMeta newInst = inst;

	if ((inst.m.flags & IRFLAG_SRC3) == 0 && inst.m.types[0] == 'G' && inst.dest == fromReg) {
		newInst.dest = toReg;
	}
	return newInst;
}

bool IRMutatesDestGPR(const IRInstMeta &inst, int reg) {
	return (inst.m.flags & IRFLAG_SRC3DST) != 0 && inst.m.types[0] == 'G' && inst.src3 == reg;
}

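// Example of the kind of rewrite PurgeTemps below performs (hypothetical
// registers): given
//   Mov IRTEMP_0, a0
//   Add v0, v0, IRTEMP_0
// the Add is redirected to read a0 directly, and the now-dead Mov is turned
// into "Mov 0, 0", which the final loop filters out when writing the block.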
bool PurgeTemps(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	std::vector<IRInstMeta> insts;
	insts.reserve(in.GetInstructions().size());

	// We track writes both to rename regs and to purge dead stores.
	struct Check {
		Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) {
		}

		// Register this instruction wrote to.
		int reg;
		// Only other than -1 when it's a Mov, equivalent reg at this point.
		int srcReg = -1;
		// Index into insts for this op.
		int index;
		// Whether the dest reg is read by any Exit.
		bool readByExit;
		int8_t fplen = 0;
	};
	std::vector<Check> checks;
	checks.reserve(insts.size() / 2);

	// This tracks the last index at which each reg was modified.
	int lastWrittenTo[256];
	int lastReadFrom[256];
	memset(lastWrittenTo, -1, sizeof(lastWrittenTo));
	memset(lastReadFrom, -1, sizeof(lastReadFrom));

	auto readsFromFPRCheck = [](IRInstMeta &inst, Check &check, bool *directly) {
		if (check.reg < 32)
			return false;

		bool result = false;
		*directly = true;
		for (int i = 0; i < 4; ++i) {
			bool laneDirectly;
			if (check.fplen >= i + 1 && IRReadsFromFPR(inst, check.reg - 32 + i, &laneDirectly)) {
				result = true;
				if (!laneDirectly) {
					*directly = false;
					break;
				}
			}
		}
		return result;
	};

	bool logBlocks = false;
	size_t firstCheck = 0;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInstMeta inst = GetIRMeta(in.GetInstructions()[i]);

		// It helps to skip through rechecking ones we already discarded.
		for (size_t ch = firstCheck; ch < checks.size(); ++ch) {
			Check &check = checks[ch];
			if (check.reg != 0) {
				firstCheck = ch;
				break;
			}
		}

		// Check if we can optimize by running through all the writes we've previously found.
		for (size_t ch = firstCheck; ch < checks.size(); ++ch) {
			Check &check = checks[ch];
			if (check.reg == 0) {
				// This means we already optimized this or a later inst depends on it.
				continue;
			}

			bool readsDirectly;
			if (IRReadsFromGPR(inst, check.reg, &readsDirectly)) {
				// If this reads from the reg, we either depend on it or we can fold or swap.
				// That's determined below.

				// If this reads and writes the reg (e.g. MovZ, Load32Left), we can't just swap.
				bool mutatesReg = IRMutatesDestGPR(inst, check.reg);
				// If this doesn't directly read (i.e. Interpret), we can't swap.
				bool cannotReplace = !readsDirectly;
				if (!mutatesReg && !cannotReplace && check.srcReg >= 0 && lastWrittenTo[check.srcReg] < check.index) {
					// Replace with the srcReg instead. This happens with non-nice delay slots.
					// We're changing "Mov A, B; Add C, C, A" to "Mov A, B; Add C, C, B" here.
					// srcReg should only be set when it was a Mov.
					inst = IRReplaceSrcGPR(inst, check.reg, check.srcReg);

					// If the Mov modified the same reg as this instruction, we can't optimize from it anymore.
					if (inst.dest == check.reg) {
						check.reg = 0;
						// We can also optimize it out since we've essentially moved now.
						insts[check.index].op = IROp::Mov;
						insts[check.index].dest = 0;
						insts[check.index].src1 = 0;
					}
				} else if (!IRMutatesDestGPR(insts[check.index], check.reg) && inst.op == IROp::Mov && i == check.index + 1) {
					// As long as the previous inst wasn't modifying its dest reg, and this is a Mov, we can swap.
					// We're changing "Add A, B, C; Mov B, A" to "Add B, B, C; Mov A, B" here.

					// This happens with lwl/lwr temps. Replace the original dest.
					insts[check.index] = IRReplaceDestGPR(insts[check.index], check.reg, inst.dest);
					lastWrittenTo[inst.dest] = check.index;
					// If it's being read from (by inst now), we can't optimize out.
					check.reg = 0;
					// Update the read by exit flag to match the new reg.
					check.readByExit = inst.dest < IRTEMP_0 || inst.dest > IRTEMP_LR_SHIFT;
					// And swap the args for this mov, since we changed the other dest. We'll optimize this out later.
					std::swap(inst.dest, inst.src1);
				} else {
					// Legitimately read from, so we can't optimize out.
					// Unless this is an exit and a temp not read directly by the exit.
					if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly)
						check.reg = 0;
				}
			} else if (check.fplen >= 1 && readsFromFPRCheck(inst, check, &readsDirectly)) {
				// If one or the other is a Vec, they must match.
				bool lenMismatch = false;

				auto checkMismatch = [&check, &lenMismatch](IRReg src, char type) {
					int srclen = 1;
					if (type == 'V')
						srclen = 4;
					else if (type == '2')
						srclen = 2;
					else if (type != 'F')
						return;

					if (src + 32 + srclen > check.reg && src + 32 < check.reg + check.fplen) {
						if (src + 32 != check.reg || srclen != check.fplen)
							lenMismatch = true;
					}
				};

				checkMismatch(inst.src1, inst.m.types[1]);
				checkMismatch(inst.src2, inst.m.types[2]);
				if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0)
					checkMismatch(inst.src3, inst.m.types[3]);

				bool cannotReplace = !readsDirectly || lenMismatch;
				if (!cannotReplace && check.srcReg >= 32 && lastWrittenTo[check.srcReg] < check.index) {
					// This is probably not worth doing unless we can get rid of a temp.
					if (!check.readByExit) {
						if (insts[check.index].dest == inst.src1)
							inst.src1 = check.srcReg - 32;
						else if (insts[check.index].dest == inst.src2)
							inst.src2 = check.srcReg - 32;
						else
							_assert_msg_(false, "Unexpected src3 read of FPR");

						// Check if we've clobbered it entirely.
						if (inst.dest == check.reg) {
							check.reg = 0;
							insts[check.index].op = IROp::Mov;
							insts[check.index].dest = 0;
							insts[check.index].src1 = 0;
						}
					} else {
						// Let's not bother.
						check.reg = 0;
					}
				} else if ((inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) && !lenMismatch) {
					// A swap could be profitable if this is a temp, and maybe in other cases.
					// These can happen a lot from mask regs, etc.
					// But make sure no other changes happened between.
					bool destNotChanged = true;
					for (int j = 0; j < check.fplen; ++j)
						destNotChanged = destNotChanged && lastWrittenTo[inst.dest + 32 + j] < check.index;

					bool destNotRead = true;
					for (int j = 0; j < check.fplen; ++j)
						destNotRead = destNotRead && lastReadFrom[inst.dest + 32 + j] <= check.index;

					if (!check.readByExit && destNotChanged && destNotRead) {
						_dbg_assert_(insts[check.index].dest == inst.src1);
						insts[check.index].dest = inst.dest;
						for (int j = 0; j < check.fplen; ++j)
							lastWrittenTo[inst.dest + 32 + j] = check.index;
						// If it's being read from (by inst now), we can't optimize out.
						check.reg = 0;
						// Swap the dest and src1 so we can optimize this out later, maybe.
						std::swap(inst.dest, inst.src1);
					} else {
						// Doesn't look like a good candidate.
						check.reg = 0;
					}
				} else {
					// Legitimately read from, so we can't optimize out.
					if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly)
						check.reg = 0;
				}
			} else if (check.readByExit && (inst.m.flags & IRFLAG_EXIT) != 0) {
				// This is an exit, and the reg is read by any exit. Clear it.
				check.reg = 0;
			} else if (IRDestGPR(inst) == check.reg) {
				// Clobbered, we can optimize out.
				// This happens sometimes with temporaries used for constant addresses.
				insts[check.index].op = IROp::Mov;
				insts[check.index].dest = 0;
				insts[check.index].src1 = 0;
				check.reg = 0;
			} else if (IRWritesToFPR(inst, check.reg - 32) && check.fplen >= 1) {
				IRReg destFPRs[4];
				int numFPRs = IRDestFPRs(inst, destFPRs);

				if (numFPRs == check.fplen && inst.dest + 32 == check.reg) {
					// This means we've clobbered it, and with full overlap.
					// Sometimes this happens for non-temps, i.e. vmmov + vinit last row.
					insts[check.index].op = IROp::Mov;
					insts[check.index].dest = 0;
					insts[check.index].src1 = 0;
					check.reg = 0;
				} else {
					// Since there's an overlap, we simply cannot optimize.
					check.reg = 0;
				}
			}
		}

		int dest = IRDestGPR(inst);
		switch (dest) {
		case IRTEMP_0:
		case IRTEMP_1:
		case IRTEMP_2:
		case IRTEMP_3:
		case IRTEMP_LHS:
		case IRTEMP_RHS:
		case IRTEMP_LR_ADDR:
		case IRTEMP_LR_VALUE:
		case IRTEMP_LR_MASK:
		case IRTEMP_LR_SHIFT:
			// Check that it's not a barrier instruction (like CallReplacement). Don't want to even consider optimizing those.
			if (!(inst.m.flags & IRFLAG_BARRIER)) {
				// Unlike other registers, these don't need to persist between blocks.
				// So we consider them not read unless proven read.
				lastWrittenTo[dest] = i;
				// If this is a copy, we might be able to optimize out the copy.
				if (inst.op == IROp::Mov) {
					Check check(dest, i, false);
					check.srcReg = inst.src1;
					checks.push_back(check);
				} else {
					checks.push_back(Check(dest, i, false));
				}
			} else {
				lastWrittenTo[dest] = i;
			}
			break;

		default:
			lastWrittenTo[dest] = i;
			if (dest > IRTEMP_LR_SHIFT) {
				// These might sometimes be implicitly read/written by other instructions.
				break;
			}
			checks.push_back(Check(dest, i, true));
			break;

		// Not a GPR output.
		case 0:
		case -1:
			break;
		}

		IRReg regs[16];
		int readGPRs = IRReadsFromGPRs(inst, regs);
		if (readGPRs == -1) {
			for (int j = 0; j < 256; ++j)
				lastReadFrom[j] = i;
		} else {
			for (int j = 0; j < readGPRs; ++j)
				lastReadFrom[regs[j]] = i;
		}

		int readFPRs = IRReadsFromFPRs(inst, regs);
		if (readFPRs == -1) {
			for (int j = 0; j < 256; ++j)
				lastReadFrom[j] = i;
		} else {
			for (int j = 0; j < readFPRs; ++j)
				lastReadFrom[regs[j] + 32] = i;
		}

		int destFPRs = IRDestFPRs(inst, regs);
		for (int j = 0; j < destFPRs; ++j)
			lastWrittenTo[regs[j] + 32] = i;

		dest = destFPRs > 0 ? regs[0] + 32 : -1;
		if (dest >= 32 && dest < IRTEMP_0) {
			// Standard FPU or VFPU reg.
			Check check(dest, i, true);
			check.fplen = (int8_t)destFPRs;
			checks.push_back(check);
		} else if (dest >= IRVTEMP_PFX_S + 32 && dest < IRVTEMP_PFX_S + 32 + 16) {
			// These are temporary regs and not read by exits.
			Check check(dest, i, false);
			check.fplen = (int8_t)destFPRs;
			if (inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) {
				check.srcReg = inst.src1 + 32;
			}
			checks.push_back(check);
		} else if (dest != -1) {
			_assert_msg_(false, "Unexpected FPR output %d", dest);
		}

		insts.push_back(inst);
	}

	// Since we're done with the instructions, all remaining can be nuked.
	for (Check &check : checks) {
		if (!check.readByExit && check.reg > 0) {
			insts[check.index].op = IROp::Mov;
			insts[check.index].dest = 0;
			insts[check.index].src1 = 0;
		}
	}

	for (const IRInstMeta &inst : insts) {
		// Simply skip any Mov 0, 0 instructions, since that's how we nuke one.
		if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) {
			out.Write(inst.i);
		}
	}

	return logBlocks;
}

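// Example of the narrowing ReduceLoads below performs (hypothetical
// registers): given
//   Load32   t0, a0, 0
//   AndConst t0, t0, 0xFF
// the load shrinks to "Load8 t0, a0, 0", and since the mask exactly matches
// the new load width, the AndConst itself is skipped via nextSkip.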
bool ReduceLoads(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) {
			int dest = IRDestGPR(GetIRMeta(inst));
			for (int j = i + 1; j < n; j++) {
				const IRInstMeta laterInst = GetIRMeta(in.GetInstructions()[j]);

				if ((laterInst.m.flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) {
					// Exit, so we can't do the optimization.
					break;
				}
				if (IRReadsFromGPR(laterInst, dest)) {
					if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) {
						const u32 mask = laterInst.constant;
						// Here we are, maybe we can reduce the load size based on the mask.
						if ((mask & 0xffffff00) == 0) {
							inst.op = IROp::Load8;
							if (mask == 0xff) {
								nextSkip = j;
							}
						} else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) {
							inst.op = IROp::Load16;
							if (mask == 0xffff) {
								nextSkip = j;
							}
						}
					}
					// If it was read, we can't do the optimization.
					break;
				}
				if (IRDestGPR(laterInst) == dest) {
					// Someone else wrote, so we can't do the optimization.
					break;
				}
			}
		}

		if (i != nextSkip) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

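// Illustrative effect of the reordering helpers below (hypothetical offsets):
// a run like "Load32 t0, a0, 8; Load32 t1, a0, 0; Load32 t2, a0, 4" is
// stable-sorted by offset into 0, 4, 8 so that later passes and backends see
// sequential addresses they can combine more easily.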
static std::vector<IRInst> ReorderLoadStoreOps(std::vector<IRInst> &ops) {
	if (ops.size() < 2) {
		return ops;
	}

	bool modifiedRegs[256] = {};

	for (size_t i = 0, n = ops.size(); i < n - 1; ++i) {
		bool modifiesReg = false;
		bool usesFloatReg = false;
		switch (ops[i].op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			modifiesReg = true;
			if (ops[i].src1 == ops[i].dest) {
				// Can't ever reorder these, since it changes.
				continue;
			}
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			break;

		case IROp::LoadFloat:
		case IROp::LoadVec4:
			usesFloatReg = true;
			modifiesReg = true;
			break;

		case IROp::StoreFloat:
		case IROp::StoreVec4:
			usesFloatReg = true;
			break;

		default:
			continue;
		}

		memset(modifiedRegs, 0, sizeof(modifiedRegs));
		size_t start = i;
		size_t j;
		for (j = i; j < n; ++j) {
			if (ops[start].op != ops[j].op || ops[start].src1 != ops[j].src1) {
				// Incompatible ops, so let's not reorder.
				break;
			}
			if (modifiedRegs[ops[j].dest] || (!usesFloatReg && modifiedRegs[ops[j].src1])) {
				// Can't reorder, this reg was modified.
				break;
			}
			if (modifiesReg) {
				// Modifies itself, can't reorder this.
				if (!usesFloatReg && ops[j].dest == ops[j].src1) {
					break;
				}
				modifiedRegs[ops[j].dest] = true;
			}

			// Keep going, these operations are compatible.
		}

		// Everything up to (but not including) j will be sorted, so skip them.
		i = j - 1;
		size_t end = j;
		if (start + 1 < end) {
			std::stable_sort(ops.begin() + start, ops.begin() + end, [&](const IRInst &a, const IRInst &b) {
				return a.constant < b.constant;
			});
		}
	}

	return ops;
}

bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	enum class RegState : u8 {
		UNUSED = 0,
		READ = 1,
		CHANGED = 2,
	};

	bool queuing = false;
	std::vector<IRInst> loadStoreQueue;
	std::vector<IRInst> otherQueue;
	RegState otherRegs[256] = {};

	auto flushQueue = [&]() {
		if (!queuing) {
			return;
		}

		std::vector<IRInst> loadStoreUnsorted = loadStoreQueue;
		std::vector<IRInst> loadStoreSorted = ReorderLoadStoreOps(loadStoreQueue);
		if (memcmp(&loadStoreSorted[0], &loadStoreUnsorted[0], sizeof(IRInst) * loadStoreSorted.size()) != 0) {
			logBlocks = true;
		}

		queuing = false;
		for (IRInst queued : loadStoreSorted) {
			out.Write(queued);
		}
		for (IRInst queued : otherQueue) {
			out.Write(queued);
		}
		loadStoreQueue.clear();
		otherQueue.clear();
		memset(otherRegs, 0, sizeof(otherRegs));
	};

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			// To move a load up, its dest can't be changed by things we move down.
			if (otherRegs[inst.dest] != RegState::UNUSED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			// A store can move above even if it's read, as long as it's not changed by the other ops.
			if (otherRegs[inst.src3] == RegState::CHANGED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::LoadVec4:
		case IROp::LoadFloat:
		case IROp::StoreVec4:
		case IROp::StoreFloat:
			// Floats can always move as long as their address is safe.
			if (otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
		case IROp::MovZ:
		case IROp::MovNZ:
		case IROp::Max:
		case IROp::Min:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
		case IROp::Mov:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::SetConst:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MfHi:
		case IROp::MfLo:
		case IROp::FpCondToReg:
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MtHi:
		case IROp::MtLo:
		case IROp::FpCondFromReg:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Nop:
		case IROp::Downcount:
			if (queuing) {
				// These are freebies. Sometimes helps with delay slots.
				otherQueue.push_back(inst);
			} else {
				out.Write(inst);
			}
			break;

		default:
			flushQueue();
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

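// Example of the merging done by MergeLoadStore below (hypothetical regs):
// four adjacent byte stores of the zero register,
//   Store8 0, a0, 0 / Store8 0, a0, 1 / Store8 0, a0, 2 / Store8 0, a0, 3
// become a single "Store32 0, a0, 0" when unaligned stores are allowed.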
bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	auto opsCompatible = [&](const IRInst &a, const IRInst &b, int dist) {
		if (a.op != b.op || a.src1 != b.src1) {
			// Not similar enough at all.
			return false;
		}
		u32 off1 = a.constant;
		u32 off2 = b.constant;
		if (off1 + dist != off2) {
			// Not immediately sequential.
			return false;
		}

		return true;
	};

	IRInst prev = { IROp::Nop };
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		int c = 0;
		switch (inst.op) {
		case IROp::Store8:
			for (c = 1; c < 4 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c)) {
					break;
				}
			}
			if ((c == 2 || c == 3) && opts.unalignedLoadStore) {
				inst.op = IROp::Store16;
				out.Write(inst);
				prev = inst;
				// Skip the next one (the 3rd will be separate.)
				++i;
				continue;
			}
			if (c == 4 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip all 4.
				i += 3;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Store16:
			for (c = 1; c < 2 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c * 2)) {
					break;
				}
			}
			if (c == 2 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip the next one.
				++i;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Load32:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load. This is sadly common in minis.
				if (prev.op == IROp::Store32 && prev.src3 == inst.dest) {
					// Even the same reg, a volatile variable? Skip it.
					continue;
				}

				// Store16 and Store8 in rare cases happen... could be made AndConst, but not worth the trouble.
				if (prev.op == IROp::Store32) {
					inst.op = IROp::Mov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMovToGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::LoadFloat:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load, of a float.
				if (prev.op == IROp::StoreFloat && prev.src3 == inst.dest) {
					// Volatile float, I suppose?
					continue;
				}

				if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::Store32) {
					inst.op = IROp::FMovFromGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		default:
			out.Write(inst);
			prev = inst;
			break;
		}
	}
	return logBlocks;
}

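// ApplyMemoryValidation below inserts ValidateAddress* ops ahead of each
// memory access when fast memory is off. Example (hypothetical block): if
// every sp-relative word access falls in a small non-negative range and sp
// is never rewritten mid-block, the whole range is validated once up front
// (lowest word and highest word) instead of once per access.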
struct IRMemoryOpInfo {
	int size;
	bool isWrite;
	bool isWordLR;
};

static IRMemoryOpInfo IROpMemoryAccessSize(IROp op) {
	// Assumes all take src1 + constant.
	switch (op) {
	case IROp::Load8:
	case IROp::Load8Ext:
	case IROp::Store8:
		return { 1, op == IROp::Store8 };

	case IROp::Load16:
	case IROp::Load16Ext:
	case IROp::Store16:
		return { 2, op == IROp::Store16 };

	case IROp::Load32:
	case IROp::Load32Linked:
	case IROp::LoadFloat:
	case IROp::Store32:
	case IROp::Store32Conditional:
	case IROp::StoreFloat:
		return { 4, op == IROp::Store32 || op == IROp::Store32Conditional || op == IROp::StoreFloat };

	case IROp::LoadVec4:
	case IROp::StoreVec4:
		return { 16, op == IROp::StoreVec4 };

	case IROp::Load32Left:
	case IROp::Load32Right:
	case IROp::Store32Left:
	case IROp::Store32Right:
		// This explicitly does not require alignment, so validate as an 8-bit operation.
		return { 1, op == IROp::Store32Left || op == IROp::Store32Right, true };

	default:
		return { 0 };
	}
}

bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	if (g_Config.bFastMemory)
		DISABLE;

	int spLower = 0;
	int spUpper = -1;
	bool spWrite = false;
	bool spModified = false;
	for (IRInst inst : in.GetInstructions()) {
		IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
		// Note: we only combine word aligned accesses.
		if (info.size != 0 && inst.src1 == MIPS_REG_SP && info.size == 4) {
			if (spModified) {
				// No good, it was modified and then we did more accesses. Can't combine.
				spUpper = -1;
				break;
			}
			if ((int)inst.constant < 0 || (int)inst.constant >= 0x4000) {
				// Let's assume this might cross boundaries or something. Uncommon.
				spUpper = -1;
				break;
			}

			spLower = std::min(spLower, (int)inst.constant);
			spUpper = std::max(spUpper, (int)inst.constant + info.size);
			spWrite = spWrite || info.isWrite;
		}

		const IRMeta *m = GetIRMeta(inst.op);
		if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0 && inst.dest == MIPS_REG_SP) {
			// We only care if it changes after we start combining.
			spModified = spUpper != -1;
		}
	}

	bool skipSP = spUpper != -1;
	bool flushedSP = false;

	std::map<uint64_t, uint8_t> checks;
	const auto addValidate = [&](IROp validate, uint8_t sz, const IRInst &inst, bool isStore) {
		if (inst.src1 == MIPS_REG_SP && skipSP && validate == IROp::ValidateAddress32) {
			if (!flushedSP) {
				out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spLower);
				if (spUpper > spLower + 4)
					out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spUpper - 4);
				flushedSP = true;
			}
			return;
		}

		uint64_t key = ((uint64_t)inst.src1 << 32) | inst.constant;
		auto it = checks.find(key);
		if (it == checks.end() || it->second < sz) {
			out.Write(validate, 0, inst.src1, isStore ? 1U : 0U, inst.constant);
			checks[key] = sz;
		}
	};
	bool logBlocks = false;
	for (IRInst inst : in.GetInstructions()) {
		IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
		IROp validateOp = IROp::Nop;
		switch (info.size) {
		case 1: validateOp = IROp::ValidateAddress8; break;
		case 2: validateOp = IROp::ValidateAddress16; break;
		case 4: validateOp = IROp::ValidateAddress32; break;
		case 16: validateOp = IROp::ValidateAddress128; break;
		case 0: break;
		default: _assert_msg_(false, "Unexpected memory access size");
		}

		if (validateOp != IROp::Nop) {
			addValidate(validateOp, info.size, inst, info.isWrite);
		}

		const IRMeta *m = GetIRMeta(inst.op);
		if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0) {
			uint64_t key = (uint64_t)inst.dest << 32;
			// Wipe out all the already done checks since this was modified.
			checks.erase(checks.lower_bound(key), checks.upper_bound(key | 0xFFFFFFFFULL));
		}

		// Always write out the original. We're only adding.
		out.Write(inst);
	}
	return logBlocks;
}

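// Example of the rewrite ReduceVec4Flush below performs (hypothetical regs):
// when a register quad is live as a Vec4, a scalar "SetConstF f1, 1.0" is
// turned into
//   Vec4Init  vtemp, AllONE
//   Vec4Blend f0..f3, f0..f3, vtemp, 0b0010
// so the quad stays in a SIMD register instead of being flushed to scalars.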
bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// Only do this when using a SIMD backend.
	if (!opts.preferVec4) {
		DISABLE;
	}

	bool isVec4[256]{};
	bool isUsed[256]{};
	bool isVec4Dirty[256]{};
	auto updateVec4 = [&](char type, IRReg r) {
		bool downgraded = false;
		switch (type) {
		case 'F':
			downgraded = isVec4[r & ~3];
			isVec4[r & ~3] = false;
			isUsed[r] = true;
			break;

		case 'V':
			_dbg_assert_((r & 3) == 0);
			isVec4[r] = true;
			for (int i = 0; i < 4; ++i)
				isUsed[r + i] = true;
			break;

		case '2':
			downgraded = isVec4[r & ~3];
			isVec4[r & ~3] = false;
			for (int i = 0; i < 2; ++i)
				isUsed[r + i] = true;
			break;

		default:
			break;
		}

		return downgraded;
	};
	auto updateVec4Dest = [&](char type, IRReg r, uint32_t flags) {
		if ((flags & IRFLAG_SRC3) == 0) {
			switch (type) {
			case 'F':
				isVec4Dirty[r & ~3] = false;
				break;

			case 'V':
				_dbg_assert_((r & 3) == 0);
				isVec4Dirty[r] = true;
				break;

			case '2':
				isVec4Dirty[r & ~3] = false;
				break;

			default:
				break;
			}
		}
		return updateVec4(type, r);
	};

	// Checks overlap from r1 to other params.
	auto overlapped = [](IRReg r1, int l1, IRReg r2, int l2, IRReg r3 = IRREG_INVALID, int l3 = 0) {
		if (r1 < r2 + l2 && r1 + l1 > r2)
			return true;
		if (r1 < r3 + l3 && r1 + l1 > r3)
			return true;
		return false;
	};

	bool logBlocks = false;
	int inCount = (int)in.GetInstructions().size();
	for (int i = 0; i < inCount; ++i) {
		IRInst inst = in.GetInstructions()[i];
		const IRMeta *m = GetIRMeta(inst.op);

		if ((m->flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) {
			memset(isVec4, 0, sizeof(isVec4));
			out.Write(inst);
			continue;
		}

		IRReg temp = IRREG_INVALID;
		auto findAvailTempVec4 = [&]() {
			// If it's not used yet in this block, we can use it.
			// Note: even if the instruction uses it to write, that should be fine.
			for (IRReg r = IRVTEMP_PFX_S; r < IRVTEMP_0 + 4; r += 4) {
				if (isUsed[r])
					continue;

				bool usable = true;
				for (int j = 1; j < 4; ++j)
					usable = usable && !isUsed[r + j];

				if (usable) {
					temp = r;
					// We don't update isUsed because our temporary doesn't need to last.
					return true;
				}
			}

			return false;
		};

		auto usedLaterAsVec4 = [&](IRReg r) {
			for (int j = i + 1; j < inCount; ++j) {
				IRInst inst = in.GetInstructions()[j];
				const IRMeta *m = GetIRMeta(inst.op);
				if (m->types[0] == 'V' && inst.dest == r)
					return true;
				if (m->types[1] == 'V' && inst.src1 == r)
					return true;
				if (m->types[2] == 'V' && inst.src2 == r)
					return true;
			}
			return false;
		};

		bool skip = false;
		switch (inst.op) {
		case IROp::SetConstF:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				// Check if we're setting multiple in a row, this is a bit common.
				u8 blendMask = 1 << (inst.dest & 3);
				while (i + 1 < inCount) {
					IRInst next = in.GetInstructions()[i + 1];
					if (next.op != IROp::SetConstF || (next.dest & ~3) != (inst.dest & ~3))
						break;
					if (next.constant != inst.constant)
						break;

					blendMask |= 1 << (next.dest & 3);
					i++;
				}

				if (inst.constant == 0) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllZERO);
				} else if (inst.constant == 0x3F800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllONE);

		bool skip = false;
		switch (inst.op) {
		case IROp::SetConstF:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				// Check if we're setting multiple in a row; this is fairly common.
				u8 blendMask = 1 << (inst.dest & 3);
				while (i + 1 < inCount) {
					IRInst next = in.GetInstructions()[i + 1];
					if (next.op != IROp::SetConstF || (next.dest & ~3) != (inst.dest & ~3))
						break;
					if (next.constant != inst.constant)
						break;

					blendMask |= 1 << (next.dest & 3);
					i++;
				}

				if (inst.constant == 0) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllZERO);
				} else if (inst.constant == 0x3F800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllONE);
				} else if (inst.constant == 0xBF800000) {
					out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllMinusONE);
				} else {
					out.Write(IROp::SetConstF, temp, out.AddConstant(inst.constant));
					out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				}
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::FMovFromGPR:
			if (isVec4[inst.dest & ~3] && findAvailTempVec4()) {
				u8 blendMask = 1 << (inst.dest & 3);
				out.Write(IROp::FMovFromGPR, temp, inst.src1);
				out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::LoadFloat:
			if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3) && findAvailTempVec4()) {
				u8 blendMask = 1 << (inst.dest & 3);
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				out.Write(IROp::Vec4Shuffle, temp, temp, 0);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::StoreFloat:
			if (isVec4[inst.src3 & ~3] && isVec4Dirty[inst.src3 & ~3] && usedLaterAsVec4(inst.src3 & ~3) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src3, 0);
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				continue;
			}
			break;

		case IROp::FMov:
			if (isVec4[inst.dest & ~3] && (inst.dest & ~3) == (inst.src1 & ~3)) {
				// Oh, actually a shuffle?
				uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(0, 1, 2, 3);
				uint8_t destShift = (inst.dest & 3) * 2;
				shuffle = (shuffle & ~(3 << destShift)) | ((inst.src1 & 3) << destShift);
				out.Write(IROp::Vec4Shuffle, inst.dest & ~3, inst.dest & ~3, shuffle);
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			} else if (isVec4[inst.dest & ~3] && (inst.dest & 3) == (inst.src1 & 3)) {
				// We can turn this directly into a blend, since it's the same lane.
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, inst.src1 & ~3, 1 << (inst.dest & 3));
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			} else if (isVec4[inst.dest & ~3] && isVec4[inst.src1 & ~3] && findAvailTempVec4()) {
				// For this, we'll need a temporary to move to the right lane.
				int lane = inst.src1 & 3;
				uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(lane, lane, lane, lane);
				out.Write(IROp::Vec4Shuffle, temp, inst.src1 & ~3, shuffle);
				out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3));
				isVec4Dirty[inst.dest & ~3] = true;
				continue;
			}
			break;

		case IROp::FAdd:
		case IROp::FSub:
		case IROp::FMul:
		case IROp::FDiv:
			if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3)) {
				if (!overlapped(inst.dest & ~3, 4, inst.src1, 1, inst.src2, 1) && findAvailTempVec4()) {
					u8 blendMask = 1 << (inst.dest & 3);
					out.Write(inst.op, temp, inst.src1, inst.src2);
					out.Write(IROp::Vec4Shuffle, temp, temp, 0);
					out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask);
					updateVec4('F', inst.src1);
					updateVec4('F', inst.src2);
					isVec4Dirty[inst.dest & ~3] = true;
					continue;
				}
			}
			break;
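
		// Overlap example for Vec4Dot below (illustrative, made-up registers):
		// given Vec4Dot f1, f0, f4, the scalar dest f1 lives inside the f0..f3
		// source quad, so writing the result straight to f1 could clobber a
		// source lane mid-operation on a SIMD backend. Instead, the dot goes to
		// a temp quad first and is blended or moved back into place afterwards.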
		case IROp::Vec4Dot:
			if (overlapped(inst.dest, 1, inst.src1, 4, inst.src2, 4) && findAvailTempVec4()) {
				out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant);
				if (usedLaterAsVec4(inst.dest & ~3)) {
					// Broadcast to other lanes if needed.
					if ((inst.dest & 3) != 0)
						out.Write(IROp::Vec4Shuffle, temp, temp, 0);
					out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3));
					// It's overlapped, so it'll get marked as Vec4 and used anyway.
					isVec4Dirty[inst.dest & ~3] = true;
					inst.dest = IRREG_INVALID;
				} else {
					out.Write(IROp::FMov, inst.dest, temp);
				}
				skip = true;
			}
			break;

		case IROp::Vec4Scale:
			if (overlapped(inst.src2, 1, inst.src1, 4, inst.dest, 4) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src2);
				out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant);
				skip = true;
				inst.src2 = IRREG_INVALID;
			} else if (isVec4[inst.src2 & ~3] && usedLaterAsVec4(inst.src2 & ~3) && findAvailTempVec4()) {
				out.Write(IROp::FMov, temp, inst.src2);
				out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant);
				skip = true;
				inst.src2 = IRREG_INVALID;
			}
			break;

		default:
			break;
		}

		bool downgrade = false;
		if (inst.src1 != IRREG_INVALID && updateVec4(m->types[1], inst.src1))
			downgrade = true;
		if (inst.src2 != IRREG_INVALID && updateVec4(m->types[2], inst.src2))
			downgrade = true;
		if (inst.dest != IRREG_INVALID && updateVec4Dest(m->types[0], inst.dest, m->flags))
			downgrade = true;

		if (downgrade) {
			//WARN_LOG(Log::JIT, "Vec4 downgrade by: %s", m->name);
		}

		if (!skip)
			out.Write(inst);
	}
	return logBlocks;
}
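
// Illustrative example for the pass below (hypothetical registers and offset):
//
//   Store32 a0, a1, 0x30
//   Load32  a0, a1, 0x30
//
// The load reads back exactly the value just stored (same value register, base,
// and offset), so it can be dropped and only the store kept. A StoreVec4
// followed by a matching LoadVec4 is handled the same way.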

// This optimizes away redundant loads after stores, which are more common than
// you might expect.
bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		// Just copy the last instruction.
		if (i == n - 1) {
			out.Write(inst);
			break;
		}

		out.Write(inst);

		IRInst next = in.GetInstructions()[i + 1];
		switch (inst.op) {
		case IROp::Store32:
			if (next.op == IROp::Load32 &&
				next.constant == inst.constant &&
				next.dest == inst.dest &&
				next.src1 == inst.src1) {
				// The upcoming load is completely redundant. Skip it.
				i++;
			}
			break;
		case IROp::StoreVec4:
			if (next.op == IROp::LoadVec4 &&
				next.constant == inst.constant &&
				next.dest == inst.dest &&
				next.src1 == inst.src1) {
				// The upcoming load is completely redundant. These are common in Wipeout.
				// Skip it.
				// NOTE: It looks like vector loads/stores use different register
				// assignments, but there's a union between dest and src3.
				i++;
			}
			break;
		default:
			break;
		}
	}

	return logBlocks;
}

bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	// We also move the downcount to the top so the interpreter can assume that it's there.
	bool foundDowncount = false;
	out.Write(IROp::Downcount);

	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		bool last = i == n - 1;

		// Specialize some instructions.
		switch (inst.op) {
		case IROp::Downcount:
			if (!foundDowncount) {
				// Move the value into the initial Downcount.
				foundDowncount = true;
				out.ReplaceConstant(0, inst.constant);
			} else {
				// Already had a downcount. Let's just re-emit it.
				out.Write(inst);
			}
			break;
		case IROp::AddConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptAddConst;
			}
			out.Write(inst);
			break;
		case IROp::AndConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptAndConst;
			}
			out.Write(inst);
			break;
		case IROp::OrConst:
			if (inst.src1 == inst.dest) {
				inst.op = IROp::OptOrConst;
			}
			out.Write(inst);
			break;
		case IROp::FMovToGPR:
			if (!last) {
				IRInst next = in.GetInstructions()[i + 1];
				if (next.op == IROp::ShrImm && next.src2 == 8 && next.src1 == next.dest && next.src1 == inst.dest) {
					// Heavily used when writing display lists.
					inst.op = IROp::OptFMovToGPRShr8;
					i++;  // Skip the next instruction.
				}
			}
			out.Write(inst);
			break;
		case IROp::FMovFromGPR:
			if (!last) {
				IRInst next = in.GetInstructions()[i + 1];
				if (next.op == IROp::FCvtSW && next.src1 == inst.dest && next.dest == inst.dest) {
					inst.op = IROp::OptFCvtSWFromGPR;
					i++;  // Skip the next instruction.
				}
			}
			out.Write(inst);
			break;
		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}
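
// Illustrative before/after for OptimizeForInterpreter (made-up registers): the
// interpreter assumes every block starts with a Downcount, so one is written up
// front and the first real Downcount's constant is folded into it. In-place ops
// and common pairs are also fused into specialized forms, e.g.:
//
//   AddConst  a0, a0, 0x10                ->  OptAddConst a0, 0x10
//   FMovToGPR a0, f0  ; ShrImm a0, a0, 8  ->  OptFMovToGPRShr8 a0, f0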