CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM/ArmRegCacheFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cstring>1819#include "Common/CPUDetect.h"20#include "Common/Log.h"21#include "Core/MIPS/MIPS.h"22#include "Core/MIPS/ARM/ArmRegCacheFPU.h"23#include "Core/MIPS/ARM/ArmJit.h"24#include "Core/MIPS/MIPSTables.h"2526using namespace ArmGen;27using namespace ArmJitConstants;2829ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {}3031void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {32if (!initialReady) {33SetupInitialRegs();34initialReady = true;35}3637memcpy(ar, arInitial, sizeof(ar));38memcpy(mr, mrInitial, sizeof(mr));39pendingFlush = false;40}4142void ArmRegCacheFPU::SetupInitialRegs() {43for (int i = 0; i < NUM_ARMFPUREG; i++) {44arInitial[i].mipsReg = -1;45arInitial[i].isDirty = false;46}47for (int i = 0; i < NUM_MIPSFPUREG; i++) {48mrInitial[i].loc = ML_MEM;49mrInitial[i].reg = INVALID_REG;50mrInitial[i].spillLock = false;51mrInitial[i].tempLock = false;52}53for (int i = 0; i < NUM_ARMQUADS; i++) {54qr[i].isDirty = false;55qr[i].mipsVec = -1;56qr[i].sz = V_Invalid;57qr[i].spillLock = false;58qr[i].isTemp = false;59memset(qr[i].vregs, 0xff, 4);60}61}6263const ARMReg 
*ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {64// VFP mapping65// VFPU registers and regular FP registers are mapped interchangably on top of the standard66// 16 FPU registers.6768// NEON mapping69// We map FPU and VFPU registers entirely separately. FPU is mapped to 12 of the bottom 16 S registers.70// VFPU is mapped to the upper 48 regs, 32 of which can only be reached through NEON71// (or D16-D31 as doubles, but not relevant).72// Might consider shifting the split in the future, giving more regs to NEON allowing it to map more quads.7374// We should attempt to map scalars to low Q registers and wider things to high registers,75// as the NEON instructions are all 2-vector or 4-vector, they don't do scalar, we want to be76// able to use regular VFP instructions too.77static const ARMReg allocationOrderNEON[] = {78// Reserve four temp registers. Useful when building quads until we really figure out79// how to do that best.80S4, S5, S6, S7, // Q181S8, S9, S10, S11, // Q282S12, S13, S14, S15, // Q383S16, S17, S18, S19, // Q484S20, S21, S22, S23, // Q585S24, S25, S26, S27, // Q686S28, S29, S30, S31, // Q787// Q8-Q15 free for NEON tricks88};8990static const ARMReg allocationOrderNEONVFPU[] = {91// Reserve four temp registers. 
Useful when building quads until we really figure out92// how to do that best.93S4, S5, S6, S7, // Q194S8, S9, S10, S11, // Q295S12, S13, S14, S15, // Q396// Q4-Q15 free for VFPU97};9899// NOTE: It's important that S2/S3 are not allocated with bNEON, even if !useNEONVFPU.100// They are used by a few instructions, like vh2f.101if (jo_->useNEONVFPU) {102count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg);103return allocationOrderNEONVFPU;104} else {105count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);106return allocationOrderNEON;107}108}109110bool ArmRegCacheFPU::IsMapped(MIPSReg r) {111return mr[r].loc == ML_ARMREG;112}113114ARMReg ArmRegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) {115// INFO_LOG(Log::JIT, "FPR MapReg: %i flags=%i", mipsReg, mapFlags);116if (jo_->useNEONVFPU && mipsReg >= 32) {117ERROR_LOG(Log::JIT, "Cannot map VFPU registers to ARM VFP registers in NEON mode. PC=%08x", js_->compilerPC);118return S0;119}120121pendingFlush = true;122// Let's see if it's already mapped. If so we just need to update the dirty flag.123// We don't need to check for ML_NOINIT because we assume that anyone who maps124// with that flag immediately writes a "known" value to the register.125if (mr[mipsReg].loc == ML_ARMREG) {126if (ar[mr[mipsReg].reg].mipsReg != mipsReg) {127ERROR_LOG(Log::JIT, "Reg mapping out of sync! MR %i", mipsReg);128}129if (mapFlags & MAP_DIRTY) {130ar[mr[mipsReg].reg].isDirty = true;131}132//INFO_LOG(Log::JIT, "Already mapped %i to %i", mipsReg, mr[mipsReg].reg);133return (ARMReg)(mr[mipsReg].reg + S0);134}135136// Okay, not mapped, so we need to allocate an ARM register.137138int allocCount;139const ARMReg *allocOrder = GetMIPSAllocationOrder(allocCount);140141allocate:142for (int i = 0; i < allocCount; i++) {143int reg = allocOrder[i] - S0;144145if (ar[reg].mipsReg == -1) {146// That means it's free. Grab it, and load the value into it (if requested).147ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? 
true : false;148if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {149if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) {150emit_->VLDR((ARMReg)(reg + S0), CTXREG, GetMipsRegOffset(mipsReg));151}152}153ar[reg].mipsReg = mipsReg;154mr[mipsReg].loc = ML_ARMREG;155mr[mipsReg].reg = reg;156//INFO_LOG(Log::JIT, "Mapped %i to %i", mipsReg, mr[mipsReg].reg);157return (ARMReg)(reg + S0);158}159}160161162// Still nothing. Let's spill a reg and goto 10.163// TODO: Use age or something to choose which register to spill?164// TODO: Spill dirty regs first? or opposite?165int bestToSpill = -1;166for (int i = 0; i < allocCount; i++) {167int reg = allocOrder[i] - S0;168if (ar[reg].mipsReg != -1 && (mr[ar[reg].mipsReg].spillLock || mr[ar[reg].mipsReg].tempLock))169continue;170bestToSpill = reg;171break;172}173174if (bestToSpill != -1) {175FlushArmReg((ARMReg)(S0 + bestToSpill));176goto allocate;177}178179// Uh oh, we have all them spilllocked....180ERROR_LOG(Log::JIT, "Out of spillable registers at PC %08x!!!", js_->compilerPC);181return INVALID_REG;182}183184void ArmRegCacheFPU::MapInIn(MIPSReg rd, MIPSReg rs) {185SpillLock(rd, rs);186MapReg(rd);187MapReg(rs);188ReleaseSpillLock(rd);189ReleaseSpillLock(rs);190}191192void ArmRegCacheFPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) {193SpillLock(rd, rs);194bool load = !avoidLoad || rd == rs;195MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT);196MapReg(rs);197ReleaseSpillLock(rd);198ReleaseSpillLock(rs);199}200201void ArmRegCacheFPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) {202SpillLock(rd, rs, rt);203bool load = !avoidLoad || (rd == rs || rd == rt);204MapReg(rd, load ? 
MAP_DIRTY : MAP_NOINIT);205MapReg(rt);206MapReg(rs);207ReleaseSpillLock(rd);208ReleaseSpillLock(rs);209ReleaseSpillLock(rt);210}211212void ArmRegCacheFPU::SpillLockV(const u8 *v, VectorSize sz) {213for (int i = 0; i < GetNumVectorElements(sz); i++) {214vr[v[i]].spillLock = true;215}216}217218void ArmRegCacheFPU::SpillLockV(int vec, VectorSize sz) {219u8 v[4];220GetVectorRegs(v, sz, vec);221SpillLockV(v, sz);222}223224void ArmRegCacheFPU::MapRegV(int vreg, int flags) {225MapReg(vreg + 32, flags);226}227228void ArmRegCacheFPU::LoadToRegV(ARMReg armReg, int vreg) {229if (vr[vreg].loc == ML_ARMREG) {230emit_->VMOV(armReg, (ARMReg)(S0 + vr[vreg].reg));231} else {232MapRegV(vreg);233emit_->VMOV(armReg, V(vreg));234}235}236237void ArmRegCacheFPU::MapRegsAndSpillLockV(int vec, VectorSize sz, int flags) {238u8 v[4];239GetVectorRegs(v, sz, vec);240SpillLockV(v, sz);241for (int i = 0; i < GetNumVectorElements(sz); i++) {242MapRegV(v[i], flags);243}244}245246void ArmRegCacheFPU::MapRegsAndSpillLockV(const u8 *v, VectorSize sz, int flags) {247SpillLockV(v, sz);248for (int i = 0; i < GetNumVectorElements(sz); i++) {249MapRegV(v[i], flags);250}251}252253void ArmRegCacheFPU::MapInInV(int vs, int vt) {254SpillLockV(vs);255SpillLockV(vt);256MapRegV(vs);257MapRegV(vt);258ReleaseSpillLockV(vs);259ReleaseSpillLockV(vt);260}261262void ArmRegCacheFPU::MapDirtyInV(int vd, int vs, bool avoidLoad) {263bool load = !avoidLoad || (vd == vs);264SpillLockV(vd);265SpillLockV(vs);266MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);267MapRegV(vs);268ReleaseSpillLockV(vd);269ReleaseSpillLockV(vs);270}271272void ArmRegCacheFPU::MapDirtyInInV(int vd, int vs, int vt, bool avoidLoad) {273bool load = !avoidLoad || (vd == vs || vd == vt);274SpillLockV(vd);275SpillLockV(vs);276SpillLockV(vt);277MapRegV(vd, load ? 
MAP_DIRTY : MAP_NOINIT);278MapRegV(vs);279MapRegV(vt);280ReleaseSpillLockV(vd);281ReleaseSpillLockV(vs);282ReleaseSpillLockV(vt);283}284285void ArmRegCacheFPU::FlushArmReg(ARMReg r) {286if (r >= S0 && r <= S31) {287int reg = r - S0;288if (ar[reg].mipsReg == -1) {289// Nothing to do, reg not mapped.290return;291}292if (ar[reg].mipsReg != -1) {293if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_ARMREG)294{295//INFO_LOG(Log::JIT, "Flushing ARM reg %i", reg);296emit_->VSTR(r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));297}298// IMMs won't be in an ARM reg.299mr[ar[reg].mipsReg].loc = ML_MEM;300mr[ar[reg].mipsReg].reg = INVALID_REG;301} else {302ERROR_LOG(Log::JIT, "Dirty but no mipsreg?");303}304ar[reg].isDirty = false;305ar[reg].mipsReg = -1;306} else if (r >= D0 && r <= D31) {307// TODO: Convert to S regs and flush them individually.308} else if (r >= Q0 && r <= Q15) {309QFlush(r);310}311}312313void ArmRegCacheFPU::FlushV(MIPSReg r) {314FlushR(r + 32);315}316317/*318void ArmRegCacheFPU::FlushQWithV(MIPSReg r) {319// Look for it in all the quads. 
If it's in any, flush that quad clean.320int flushCount = 0;321for (int i = 0; i < MAX_ARMQUADS; i++) {322if (qr[i].sz == V_Invalid)323continue;324325int n = qr[i].sz;326bool flushThis = false;327for (int j = 0; j < n; j++) {328if (qr[i].vregs[j] == r) {329flushThis = true;330}331}332333if (flushThis) {334QFlush(i);335flushCount++;336}337}338339if (flushCount > 1) {340WARN_LOG(Log::JIT, "ERROR: More than one quad was flushed to flush reg %i", r);341}342}343*/344345void ArmRegCacheFPU::FlushR(MIPSReg r) {346switch (mr[r].loc) {347case ML_IMM:348// IMM is always "dirty".349// IMM is not allowed for FP (yet).350ERROR_LOG(Log::JIT, "Imm in FP register?");351break;352353case ML_ARMREG:354if (mr[r].reg == INVALID_REG) {355ERROR_LOG(Log::JIT, "FlushR: MipsReg had bad ArmReg");356}357358if (mr[r].reg >= Q0 && mr[r].reg <= Q15) {359// This should happen rarely, but occasionally we need to flush a single stray360// mipsreg that's been part of a quad.361int quad = mr[r].reg - Q0;362if (qr[quad].isDirty) {363WARN_LOG(Log::JIT, "FlushR found quad register %i - PC=%08x", quad, js_->compilerPC);364emit_->ADDI2R(R0, CTXREG, GetMipsRegOffset(r), R1);365emit_->VST1_lane(F_32, (ARMReg)mr[r].reg, R0, mr[r].lane, true);366}367} else {368if (ar[mr[r].reg].isDirty) {369//INFO_LOG(Log::JIT, "Flushing dirty reg %i", mr[r].reg);370emit_->VSTR((ARMReg)(mr[r].reg + S0), CTXREG, GetMipsRegOffset(r));371ar[mr[r].reg].isDirty = false;372}373ar[mr[r].reg].mipsReg = -1;374}375break;376377case ML_MEM:378// Already there, nothing to do.379break;380381default:382//BAD383break;384}385mr[r].loc = ML_MEM;386mr[r].reg = (int)INVALID_REG;387}388389// Scalar only. 
Need a similar one for sequential Q vectors.390int ArmRegCacheFPU::FlushGetSequential(int a) {391int c = 1;392int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);393a++;394while (a < 32) {395if (!ar[a].isDirty || ar[a].mipsReg == -1)396break;397int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);398if (mipsOffset != lastMipsOffset + 4) {399break;400}401402lastMipsOffset = mipsOffset;403a++;404c++;405}406return c;407}408409void ArmRegCacheFPU::FlushAll() {410if (!pendingFlush) {411// Nothing allocated. FPU regs are not nearly as common as GPR.412return;413}414415// Discard temps!416for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) {417DiscardR(i);418}419420// Flush quads!421// These could also use sequential detection.422for (int i = 4; i < NUM_ARMQUADS; i++) {423QFlush(i);424}425426// Loop through the ARM registers, then use GetMipsRegOffset to determine if MIPS registers are427// sequential. This is necessary because we store VFPU registers in a staggered order to get428// columns sequential (most VFPU math in nearly all games is in columns, not rows).429430int numArmRegs;431// We rely on the allocation order being sequential.432const ARMReg baseReg = GetMIPSAllocationOrder(numArmRegs)[0];433434for (int i = 0; i < numArmRegs; i++) {435int a = (baseReg - S0) + i;436int m = ar[a].mipsReg;437438if (ar[a].isDirty) {439if (m == -1) {440INFO_LOG(Log::JIT, "ARM reg %i is dirty but has no mipsreg", a);441continue;442}443444int c = FlushGetSequential(a);445if (c == 1) {446// INFO_LOG(Log::JIT, "Got single register: %i (%i)", a, m);447emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));448} else if (c == 2) {449// Probably not worth using VSTMIA for two.450int offset = GetMipsRegOffset(m);451emit_->VSTR((ARMReg)(a + S0), CTXREG, offset);452emit_->VSTR((ARMReg)(a + 1 + S0), CTXREG, offset + 4);453} else {454// INFO_LOG(Log::JIT, "Got sequence: %i at %i (%i)", c, a, m);455emit_->ADDI2R(SCRATCHREG1, CTXREG, GetMipsRegOffset(m), SCRATCHREG2);456// INFO_LOG(Log::JIT, "VSTMIA 
R0, %i, %i", a, c);457emit_->VSTMIA(SCRATCHREG1, false, (ARMReg)(S0 + a), c);458}459460// Skip past, and mark as non-dirty.461for (int j = 0; j < c; j++) {462int b = a + j;463mr[ar[b].mipsReg].loc = ML_MEM;464mr[ar[b].mipsReg].reg = (int)INVALID_REG;465ar[a + j].mipsReg = -1;466ar[a + j].isDirty = false;467}468i += c - 1;469} else {470if (m != -1) {471mr[m].loc = ML_MEM;472mr[m].reg = (int)INVALID_REG;473}474ar[a].mipsReg = -1;475// already not dirty476}477}478479// Sanity check480for (int i = 0; i < NUM_ARMFPUREG; i++) {481if (ar[i].mipsReg != -1) {482ERROR_LOG(Log::JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);483}484}485pendingFlush = false;486}487488void ArmRegCacheFPU::DiscardR(MIPSReg r) {489switch (mr[r].loc) {490case ML_IMM:491// IMM is always "dirty".492// IMM is not allowed for FP (yet).493ERROR_LOG(Log::JIT, "Imm in FP register?");494break;495496case ML_ARMREG:497if (mr[r].reg == INVALID_REG) {498ERROR_LOG(Log::JIT, "DiscardR: MipsReg had bad ArmReg");499} else {500// Note that we DO NOT write it back here. That's the whole point of Discard.501ar[mr[r].reg].isDirty = false;502ar[mr[r].reg].mipsReg = -1;503}504break;505506case ML_MEM:507// Already there, nothing to do.508break;509510default:511//BAD512break;513}514mr[r].loc = ML_MEM;515mr[r].reg = (int)INVALID_REG;516mr[r].tempLock = false;517mr[r].spillLock = false;518}519520bool ArmRegCacheFPU::IsTempX(ARMReg r) const {521return ar[r - S0].mipsReg >= TEMP0;522}523524int ArmRegCacheFPU::GetTempR() {525if (jo_->useNEONVFPU) {526ERROR_LOG(Log::JIT, "VFP temps not allowed in NEON mode");527return 0;528}529pendingFlush = true;530for (int r = TEMP0; r < TEMP0 + NUM_TEMPS; ++r) {531if (mr[r].loc == ML_MEM && !mr[r].tempLock) {532mr[r].tempLock = true;533return r;534}535}536537ERROR_LOG(Log::CPU, "Out of temp regs! 
Might need to DiscardR() some");538_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");539return -1;540}541542int ArmRegCacheFPU::GetMipsRegOffset(MIPSReg r) {543// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.544if (r < 0 || r > 32 + 128 + NUM_TEMPS) {545ERROR_LOG(Log::JIT, "bad mips register %i, out of range", r);546return 0; // or what?547}548549if (r < 32 || r >= 32 + 128) {550return (32 + r) << 2;551} else {552// r is between 32 and 128 + 32553return (32 + 32 + voffset[r - 32]) << 2;554}555}556557void ArmRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {558mr[r1].spillLock = true;559if (r2 != -1) mr[r2].spillLock = true;560if (r3 != -1) mr[r3].spillLock = true;561if (r4 != -1) mr[r4].spillLock = true;562}563564// This is actually pretty slow with all the 160 regs...565void ArmRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {566for (int i = 0; i < NUM_MIPSFPUREG; i++) {567mr[i].spillLock = false;568}569for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {570DiscardR(i);571}572for (int i = 0; i < NUM_ARMQUADS; i++) {573qr[i].spillLock = false;574if (qr[i].isTemp) {575qr[i].isTemp = false;576qr[i].sz = V_Invalid;577}578}579}580581ARMReg ArmRegCacheFPU::R(int mipsReg) {582if (mr[mipsReg].loc == ML_ARMREG) {583return (ARMReg)(mr[mipsReg].reg + S0);584} else {585if (mipsReg < 32) {586ERROR_LOG(Log::JIT, "FReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());587} else if (mipsReg < 32 + 128) {588ERROR_LOG(Log::JIT, "VReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());589} else {590ERROR_LOG(Log::JIT, "Tempreg %i not in ARM reg. 
compilerPC = %08x : %s", mipsReg - 128 - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());591}592return INVALID_REG; // BAAAD593}594}595596inline ARMReg QuadAsD(int quad) {597return (ARMReg)(D0 + quad * 2);598}599600inline ARMReg QuadAsQ(int quad) {601return (ARMReg)(Q0 + quad);602}603604bool MappableQ(int quad) {605return quad >= 4;606}607608void ArmRegCacheFPU::QLoad4x4(MIPSGPReg regPtr, int vquads[4]) {609ERROR_LOG(Log::JIT, "QLoad4x4 not implemented");610// TODO611}612613void ArmRegCacheFPU::QFlush(int quad) {614if (!MappableQ(quad)) {615ERROR_LOG(Log::JIT, "Cannot flush non-mappable quad %i", quad);616return;617}618619if (qr[quad].isDirty && !qr[quad].isTemp) {620INFO_LOG(Log::JIT, "Flushing Q%i (%s)", quad, GetVectorNotation(qr[quad].mipsVec, qr[quad].sz).c_str());621622ARMReg q = QuadAsQ(quad);623// Unlike reads, when writing to the register file we need to be careful to write the correct624// number of floats.625626switch (qr[quad].sz) {627case V_Single:628emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);629emit_->VST1_lane(F_32, q, R0, 0, true);630// WARN_LOG(Log::JIT, "S: Falling back to individual flush: pc=%08x", js_->compilerPC);631break;632case V_Pair:633if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1])) {634// Can combine, it's a column!635emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);636emit_->VST1(F_32, q, R0, 1, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable637} else {638// WARN_LOG(Log::JIT, "P: Falling back to individual flush: pc=%08x", js_->compilerPC);639emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);640emit_->VST1_lane(F_32, q, R0, 0, true);641emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);642emit_->VST1_lane(F_32, q, R0, 1, true);643}644break;645case V_Triple:646if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1], qr[quad].vregs[2])) {647emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);648emit_->VST1(F_32, 
QuadAsD(quad), R0, 1, ALIGN_NONE, REG_UPDATE); // TODO: Allow ALIGN_64 when applicable649emit_->VST1_lane(F_32, q, R0, 2, true);650} else {651// WARN_LOG(Log::JIT, "T: Falling back to individual flush: pc=%08x", js_->compilerPC);652emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);653emit_->VST1_lane(F_32, q, R0, 0, true);654emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);655emit_->VST1_lane(F_32, q, R0, 1, true);656emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[2]), R1);657emit_->VST1_lane(F_32, q, R0, 2, true);658}659break;660case V_Quad:661if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1], qr[quad].vregs[2], qr[quad].vregs[3])) {662emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);663emit_->VST1(F_32, QuadAsD(quad), R0, 2, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable664} else {665// WARN_LOG(Log::JIT, "Q: Falling back to individual flush: pc=%08x", js_->compilerPC);666emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);667emit_->VST1_lane(F_32, q, R0, 0, true);668emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);669emit_->VST1_lane(F_32, q, R0, 1, true);670emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[2]), R1);671emit_->VST1_lane(F_32, q, R0, 2, true);672emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[3]), R1);673emit_->VST1_lane(F_32, q, R0, 3, true);674}675break;676default:677ERROR_LOG(Log::JIT, "Unknown quad size %i", qr[quad].sz);678break;679}680681qr[quad].isDirty = false;682683int n = GetNumVectorElements(qr[quad].sz);684for (int i = 0; i < n; i++) {685int vr = qr[quad].vregs[i];686if (vr < 0 || vr > 128) {687ERROR_LOG(Log::JIT, "Bad vr %i", vr);688}689FPURegMIPS &m = mr[32 + vr];690m.loc = ML_MEM;691m.lane = -1;692m.reg = -1;693}694695} else {696if (qr[quad].isTemp) {697WARN_LOG(Log::JIT, "Not flushing quad %i; dirty = %i, isTemp = %i", quad, qr[quad].isDirty, qr[quad].isTemp);698}699}700701qr[quad].isTemp = 
false;702qr[quad].mipsVec = -1;703qr[quad].sz = V_Invalid;704memset(qr[quad].vregs, 0xFF, 4);705}706707int ArmRegCacheFPU::QGetFreeQuad(int start, int count, const char *reason) {708// Search for a free quad. A quad is free if the first register in it is free.709for (int i = 0; i < count; i++) {710int q = (i + start) & 15;711712if (!MappableQ(q))713continue;714715// Don't steal temp quads!716if (qr[q].mipsVec == (int)INVALID_REG && !qr[q].isTemp) {717// INFO_LOG(Log::JIT, "Free quad: %i", q);718// Oh yeah! Free quad!719return q;720}721}722723// Okay, find the "best scoring" reg to replace. Scoring algorithm TBD but may include some724// sort of age.725int bestQuad = -1;726int bestScore = -1;727for (int i = 0; i < count; i++) {728int q = (i + start) & 15;729730if (!MappableQ(q))731continue;732if (qr[q].spillLock)733continue;734if (qr[q].isTemp)735continue;736737int score = 0;738if (!qr[q].isDirty) {739score += 5;740}741742if (score > bestScore) {743bestQuad = q;744bestScore = score;745}746}747748if (bestQuad == -1) {749ERROR_LOG(Log::JIT, "Failed finding a free quad. Things will now go haywire!");750return -1;751} else {752INFO_LOG(Log::JIT, "No register found in %i and the next %i, kicked out #%i (%s)", start, count, bestQuad, reason ? 
reason : "no reason");753QFlush(bestQuad);754return bestQuad;755}756}757758ARMReg ArmRegCacheFPU::QAllocTemp(VectorSize sz) {759int q = QGetFreeQuad(8, 16, "allocating temporary"); // Prefer high quads as temps760if (q < 0) {761ERROR_LOG(Log::JIT, "Failed to allocate temp quad");762q = 0;763}764qr[q].spillLock = true;765qr[q].isTemp = true;766qr[q].sz = sz;767qr[q].isDirty = false; // doesn't matter768769INFO_LOG(Log::JIT, "Allocated temp quad %i", q);770771if (sz == V_Single || sz == V_Pair) {772return D_0(ARMReg(Q0 + q));773} else {774return ARMReg(Q0 + q);775}776}777778bool ArmRegCacheFPU::Consecutive(int v1, int v2) const {779return (voffset[v1] + 1) == voffset[v2];780}781782bool ArmRegCacheFPU::Consecutive(int v1, int v2, int v3) const {783return Consecutive(v1, v2) && Consecutive(v2, v3);784}785786bool ArmRegCacheFPU::Consecutive(int v1, int v2, int v3, int v4) const {787return Consecutive(v1, v2) && Consecutive(v2, v3) && Consecutive(v3, v4);788}789790void ArmRegCacheFPU::QMapMatrix(ARMReg *regs, int matrix, MatrixSize mz, int flags) {791u8 vregs[4];792if (flags & MAP_MTX_TRANSPOSED) {793GetMatrixRows(matrix, mz, vregs);794} else {795GetMatrixColumns(matrix, mz, vregs);796}797798// TODO: Zap existing mappings, reserve 4 consecutive regs, then do a fast load.799int n = GetMatrixSide(mz);800VectorSize vsz = GetVectorSize(mz);801for (int i = 0; i < n; i++) {802regs[i] = QMapReg(vregs[i], vsz, flags);803}804}805806ARMReg ArmRegCacheFPU::QMapReg(int vreg, VectorSize sz, int flags) {807qTime_++;808809int n = GetNumVectorElements(sz);810u8 vregs[4];811GetVectorRegs(vregs, sz, vreg);812813// Range of registers to consider814int start = 0;815int count = 16;816817if (flags & MAP_PREFER_HIGH) {818start = 8;819} else if (flags & MAP_PREFER_LOW) {820start = 4;821} else if (flags & MAP_FORCE_LOW) {822start = 4;823count = 4;824} else if (flags & MAP_FORCE_HIGH) {825start = 8;826count = 8;827}828829// Let's check if they are all mapped in a quad somewhere.830// At the same 
time, check for the quad already being mapped.831// Later we can check for possible transposes as well.832833// First just loop over all registers. If it's here and not in range, or overlapped, kick.834std::vector<int> quadsToFlush;835for (int i = 0; i < 16; i++) {836int q = (i + start) & 15;837if (!MappableQ(q))838continue;839840// Skip unmapped quads.841if (qr[q].sz == V_Invalid)842continue;843844// Check if completely there already. If so, set spill-lock, transfer dirty flag and exit.845if (vreg == qr[q].mipsVec && sz == qr[q].sz) {846if (i < count) {847INFO_LOG(Log::JIT, "Quad already mapped: %i : %i (size %i)", q, vreg, sz);848qr[q].isDirty = qr[q].isDirty || (flags & MAP_DIRTY);849qr[q].spillLock = true;850851// Sanity check vregs852for (int i = 0; i < n; i++) {853if (vregs[i] != qr[q].vregs[i]) {854ERROR_LOG(Log::JIT, "Sanity check failed: %i vs %i", vregs[i], qr[q].vregs[i]);855}856}857858return (ARMReg)(Q0 + q);859} else {860INFO_LOG(Log::JIT, "Quad already mapped at %i which is out of requested range [%i-%i) (count = %i), needs moving. For now we flush.", q, start, start+count, count);861quadsToFlush.push_back(q);862continue;863}864}865866// Check for any overlap. Overlap == flush.867int origN = GetNumVectorElements(qr[q].sz);868for (int a = 0; a < n; a++) {869for (int b = 0; b < origN; b++) {870if (vregs[a] == qr[q].vregs[b]) {871quadsToFlush.push_back(q);872goto doubleBreak;873}874}875}876doubleBreak:877;878}879880// We didn't find the extra register, but we got a list of regs to flush. 
Flush 'em.881// Here we can check for opportunities to do a "transpose-flush" of row vectors, etc.882if (!quadsToFlush.empty()) {883INFO_LOG(Log::JIT, "New mapping %s collided with %d quads, flushing them.", GetVectorNotation(vreg, sz).c_str(), (int)quadsToFlush.size());884}885for (size_t i = 0; i < quadsToFlush.size(); i++) {886QFlush(quadsToFlush[i]);887}888889// Find where we want to map it, obeying the constraints we gave.890int quad = QGetFreeQuad(start, count, "mapping");891if (quad < 0)892return INVALID_REG;893894// If parts of our register are elsewhere, and we are dirty, we need to flush them895// before we reload in a new location.896// This may be problematic if inputs overlap irregularly with output, say:897// vdot S700, R000, C000898// It might still work by accident...899if (flags & MAP_DIRTY) {900for (int i = 0; i < n; i++) {901FlushV(vregs[i]);902}903}904905qr[quad].sz = sz;906qr[quad].mipsVec = vreg;907908if ((flags & MAP_NOINIT) != MAP_NOINIT) {909// Okay, now we will try to load the whole thing in one go. 
This is possible910// if it's a row and easy if it's a single.911// Rows are rare, columns are common - but thanks to our register reordering,912// columns are actually in-order in memory.913switch (sz) {914case V_Single:915emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);916emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);917break;918case V_Pair:919if (Consecutive(vregs[0], vregs[1])) {920// Can combine, it's a column!921emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);922emit_->VLD1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable923} else {924emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);925emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);926emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);927emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);928}929break;930case V_Triple:931if (Consecutive(vregs[0], vregs[1], vregs[2])) {932emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);933emit_->VLD1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE, REG_UPDATE); // TODO: Allow ALIGN_64 when applicable934emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);935} else {936emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);937emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);938emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);939emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);940emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[2]), R1);941emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);942}943break;944case V_Quad:945if (Consecutive(vregs[0], vregs[1], vregs[2], vregs[3])) {946emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);947emit_->VLD1(F_32, QuadAsD(quad), R0, 2, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable948} else {949emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);950emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);951emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);952emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, 
true);953emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[2]), R1);954emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);955emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[3]), R1);956emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 3, true);957}958break;959default:960;961}962}963964// OK, let's fill out the arrays to confirm that we have grabbed these registers.965for (int i = 0; i < n; i++) {966int mipsReg = 32 + vregs[i];967mr[mipsReg].loc = ML_ARMREG;968mr[mipsReg].reg = QuadAsQ(quad);969mr[mipsReg].lane = i;970qr[quad].vregs[i] = vregs[i];971}972qr[quad].isDirty = (flags & MAP_DIRTY) != 0;973qr[quad].spillLock = true;974975INFO_LOG(Log::JIT, "Mapped Q%i to vfpu %i (%s), sz=%i, dirty=%i", quad, vreg, GetVectorNotation(vreg, sz).c_str(), (int)sz, qr[quad].isDirty);976if (sz == V_Single || sz == V_Pair) {977return D_0(QuadAsQ(quad));978} else {979return QuadAsQ(quad);980}981}982983984985