CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM64/Arm64RegCacheFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#if PPSSPP_ARCH(ARM64)1920#include <cstring>2122#include "Common/CPUDetect.h"23#include "Common/Log.h"24#include "Core/Reporting.h"25#include "Core/MIPS/MIPS.h"26#include "Core/MIPS/ARM64/Arm64RegCacheFPU.h"27#include "Core/MIPS/ARM64/Arm64Jit.h"28#include "Core/MIPS/MIPSTables.h"2930using namespace Arm64Gen;31using namespace Arm64JitConstants;3233Arm64RegCacheFPU::Arm64RegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), vr(mr + 32), js_(js), jo_(jo) {34numARMFpuReg_ = 32;35}3637void Arm64RegCacheFPU::Init(Arm64Gen::ARM64XEmitter *emit, Arm64Gen::ARM64FloatEmitter *fp) {38emit_ = emit;39fp_ = fp;40}4142void Arm64RegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {43if (!initialReady) {44SetupInitialRegs();45initialReady = true;46}4748memcpy(ar, arInitial, sizeof(ar));49memcpy(mr, mrInitial, sizeof(mr));50pendingFlush = false;51}5253void Arm64RegCacheFPU::SetupInitialRegs() {54for (int i = 0; i < numARMFpuReg_; i++) {55arInitial[i].mipsReg = -1;56arInitial[i].isDirty = false;57}58for (int i = 0; i < NUM_MIPSFPUREG; i++) {59mrInitial[i].loc = ML_MEM;60mrInitial[i].reg = INVALID_REG;61mrInitial[i].spillLock = false;62mrInitial[i].tempLock = false;63}64}6566const ARM64Reg *Arm64RegCacheFPU::GetMIPSAllocationOrder(int &count) {67// On ARM64, all 32 registers are fully 128-bit and fully interchangable so we don't68// have to care about upper or lower registers. However, only S8-S15 are callee-save, and69// only the bottom 64 bits of those. So we should allocate into these when we call70// C functions, although we don't currently do so...7172static const ARM64Reg allocationOrder[] = {73// Reserve four full 128-bit temp registers, should be plenty.74S4, S5, S6, S7,75S8, S9, S10, S11, // Partially callee-save (bottom 64 bits)76S12, S13, S14, S15, // Partially callee-save (bottom 64 bits)77S16, S17, S18, S19,78S20, S21, S22, S23,79S24, S25, S26, S27,80S28, S29, S30, S31,81};8283static const ARM64Reg allocationOrderNEONVFPU[] = {84// Reserve four full 128-bit temp registers, should be plenty.8586// Then let's use 12 register as singles87S4, S5, S6, S7,88S8, S9, S10, S11,89S12, S13, S14, S15,9091// And do quads in the rest? Or use a strategy more similar to what we do on x86?92};9394if (jo_->useASIMDVFPU) {95count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARM64Reg);96return allocationOrderNEONVFPU;97} else {98count = sizeof(allocationOrder) / sizeof(const ARM64Reg);99return allocationOrder;100}101}102103bool Arm64RegCacheFPU::IsMapped(MIPSReg r) {104return mr[r].loc == ML_ARMREG;105}106107bool Arm64RegCacheFPU::IsInRAM(MIPSReg r) {108return mr[r].loc == ML_MEM;109}110111ARM64Reg Arm64RegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) {112// INFO_LOG(Log::JIT, "FPR MapReg: %i flags=%i", mipsReg, mapFlags);113if (jo_->useASIMDVFPU && mipsReg >= 32) {114ERROR_LOG(Log::JIT, "Cannot map VFPU registers to ARM VFP registers in NEON mode. PC=%08x", js_->compilerPC);115return S0;116}117118pendingFlush = true;119// Let's see if it's already mapped. If so we just need to update the dirty flag.120// We don't need to check for ML_NOINIT because we assume that anyone who maps121// with that flag immediately writes a "known" value to the register.122if (mr[mipsReg].loc == ML_ARMREG) {123if (ar[mr[mipsReg].reg].mipsReg != mipsReg) {124ERROR_LOG(Log::JIT, "Reg mapping out of sync! MR %i", mipsReg);125}126if (mapFlags & MAP_DIRTY) {127ar[mr[mipsReg].reg].isDirty = true;128}129//INFO_LOG(Log::JIT, "Already mapped %i to %i", mipsReg, mr[mipsReg].reg);130return (ARM64Reg)(mr[mipsReg].reg + S0);131}132133// Okay, not mapped, so we need to allocate an ARM register.134135int allocCount;136const ARM64Reg *allocOrder = GetMIPSAllocationOrder(allocCount);137138allocate:139for (int i = 0; i < allocCount; i++) {140int reg = DecodeReg(allocOrder[i]);141142if (ar[reg].mipsReg == -1) {143// That means it's free. Grab it, and load the value into it (if requested).144ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false;145if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {146if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) {147fp_->LDR(32, INDEX_UNSIGNED, (ARM64Reg)(reg + S0), CTXREG, GetMipsRegOffset(mipsReg));148}149}150ar[reg].mipsReg = mipsReg;151mr[mipsReg].loc = ML_ARMREG;152mr[mipsReg].reg = reg;153//INFO_LOG(Log::JIT, "Mapped %i to %i", mipsReg, mr[mipsReg].reg);154return (ARM64Reg)(reg + S0);155}156}157158159// Still nothing. Let's spill a reg and goto 10.160// TODO: Use age or something to choose which register to spill?161// TODO: Spill dirty regs first? or opposite?162int bestToSpill = -1;163for (int i = 0; i < allocCount; i++) {164int reg = allocOrder[i] - S0;165if (ar[reg].mipsReg != -1 && (mr[ar[reg].mipsReg].spillLock || mr[ar[reg].mipsReg].tempLock))166continue;167bestToSpill = reg;168break;169}170171if (bestToSpill != -1) {172FlushArmReg((ARM64Reg)(S0 + bestToSpill));173goto allocate;174}175176// Uh oh, we have all them spilllocked....177ERROR_LOG(Log::JIT, "Out of spillable registers at PC %08x!!!", js_->compilerPC);178return INVALID_REG;179}180181void Arm64RegCacheFPU::MapInIn(MIPSReg rd, MIPSReg rs) {182SpillLock(rd, rs);183MapReg(rd);184MapReg(rs);185ReleaseSpillLock(rd);186ReleaseSpillLock(rs);187}188189void Arm64RegCacheFPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) {190SpillLock(rd, rs);191bool load = !avoidLoad || rd == rs;192MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT);193MapReg(rs);194ReleaseSpillLock(rd);195ReleaseSpillLock(rs);196}197198void Arm64RegCacheFPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) {199SpillLock(rd, rs, rt);200bool load = !avoidLoad || (rd == rs || rd == rt);201MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT);202MapReg(rt);203MapReg(rs);204ReleaseSpillLock(rd);205ReleaseSpillLock(rs);206ReleaseSpillLock(rt);207}208209void Arm64RegCacheFPU::SpillLockV(const u8 *v, VectorSize sz) {210for (int i = 0; i < GetNumVectorElements(sz); i++) {211vr[v[i]].spillLock = true;212}213}214215void Arm64RegCacheFPU::SpillLockV(int vec, VectorSize sz) {216u8 v[4];217GetVectorRegs(v, sz, vec);218SpillLockV(v, sz);219}220221void Arm64RegCacheFPU::MapRegV(int vreg, int flags) {222MapReg(vreg + 32, flags);223}224225void Arm64RegCacheFPU::LoadToRegV(ARM64Reg armReg, int vreg) {226if (vr[vreg].loc == ML_ARMREG) {227fp_->FMOV(armReg, (ARM64Reg)(S0 + vr[vreg].reg));228} else {229MapRegV(vreg);230fp_->FMOV(armReg, V(vreg));231}232}233234void Arm64RegCacheFPU::MapRegsAndSpillLockV(int vec, VectorSize sz, int flags) {235u8 v[4];236GetVectorRegs(v, sz, vec);237SpillLockV(v, sz);238for (int i = 0; i < GetNumVectorElements(sz); i++) {239MapRegV(v[i], flags);240}241}242243void Arm64RegCacheFPU::MapRegsAndSpillLockV(const u8 *v, VectorSize sz, int flags) {244SpillLockV(v, sz);245for (int i = 0; i < GetNumVectorElements(sz); i++) {246MapRegV(v[i], flags);247}248}249250void Arm64RegCacheFPU::MapInInV(int vs, int vt) {251SpillLockV(vs);252SpillLockV(vt);253MapRegV(vs);254MapRegV(vt);255ReleaseSpillLockV(vs);256ReleaseSpillLockV(vt);257}258259void Arm64RegCacheFPU::MapDirtyInV(int vd, int vs, bool avoidLoad) {260bool load = !avoidLoad || (vd == vs);261SpillLockV(vd);262SpillLockV(vs);263MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);264MapRegV(vs);265ReleaseSpillLockV(vd);266ReleaseSpillLockV(vs);267}268269void Arm64RegCacheFPU::MapDirtyInInV(int vd, int vs, int vt, bool avoidLoad) {270bool load = !avoidLoad || (vd == vs || vd == vt);271SpillLockV(vd);272SpillLockV(vs);273SpillLockV(vt);274MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);275MapRegV(vs);276MapRegV(vt);277ReleaseSpillLockV(vd);278ReleaseSpillLockV(vs);279ReleaseSpillLockV(vt);280}281282void Arm64RegCacheFPU::FlushArmReg(ARM64Reg r) {283if (r >= S0 && r <= S31) {284int reg = r - S0;285if (ar[reg].mipsReg == -1) {286// Nothing to do, reg not mapped.287return;288}289if (ar[reg].mipsReg != -1) {290if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_ARMREG){291//INFO_LOG(Log::JIT, "Flushing ARM reg %i", reg);292fp_->STR(32, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));293}294// IMMs won't be in an ARM reg.295mr[ar[reg].mipsReg].loc = ML_MEM;296mr[ar[reg].mipsReg].reg = INVALID_REG;297} else {298ERROR_LOG(Log::JIT, "Dirty but no mipsreg?");299}300ar[reg].mipsReg = -1;301ar[reg].isDirty = false;302}303}304305void Arm64RegCacheFPU::FlushV(MIPSReg r) {306FlushR(r + 32);307}308309void Arm64RegCacheFPU::FlushR(MIPSReg r) {310switch (mr[r].loc) {311case ML_IMM:312// IMM is always "dirty".313// IMM is not allowed for FP (yet).314ERROR_LOG(Log::JIT, "Imm in FP register?");315break;316317case ML_ARMREG:318if (mr[r].reg == INVALID_REG) {319ERROR_LOG(Log::JIT, "FlushR: MipsReg had bad ArmReg");320}321FlushArmReg((ARM64Reg)(S0 + mr[r].reg));322break;323324case ML_MEM:325// Already there, nothing to do.326break;327328default:329//BAD330break;331}332}333334Arm64Gen::ARM64Reg Arm64RegCacheFPU::ARM64RegForFlush(int r) {335switch (mr[r].loc) {336case ML_IMM:337// IMM is always "dirty".338// IMM is not allowed for FP (yet).339ERROR_LOG(Log::JIT, "Imm in FP register?");340return INVALID_REG;341342case ML_ARMREG:343if (mr[r].reg == INVALID_REG) {344ERROR_LOG_REPORT(Log::JIT, "ARM64RegForFlush: MipsReg %d had bad ArmReg", r);345return INVALID_REG;346}347// No need to flush if it's not dirty.348if (!ar[mr[r].reg].isDirty) {349return INVALID_REG;350}351return (ARM64Reg)(S0 + mr[r].reg);352353case ML_MEM:354return INVALID_REG;355356default:357ERROR_LOG_REPORT(Log::JIT, "ARM64RegForFlush: MipsReg %d with invalid location %d", r, mr[r].loc);358return INVALID_REG;359}360}361362void Arm64RegCacheFPU::FlushAll() {363if (!pendingFlush) {364// Nothing allocated. FPU regs are not nearly as common as GPR.365return;366}367368// Discard temps!369for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) {370DiscardR(i);371}372373int numArmRegs = 0;374375const ARM64Reg *order = GetMIPSAllocationOrder(numArmRegs);376377// Flush pairs first when possible. Note that STP's offset can't reach more than 256 bytes so378// most VFPU registers cannot be flushed this way, unless we are willing to generate another offset pointer379// (which we could actually do right here, point right in the middle of the VFPU stuff and would reach it all)... TODO380for (int i = 0; i < 31; i++) {381int mr1 = i;382int mr2 = i + 1;383ARM64Reg ar1 = ARM64RegForFlush(mr1);384ARM64Reg ar2 = ARM64RegForFlush(mr2);385386if (ar1 != INVALID_REG && ar2 != INVALID_REG) {387fp_->STP(32, INDEX_SIGNED, ar1, ar2, CTXREG, GetMipsRegOffset(mr1));388DiscardR(mr1);389DiscardR(mr2);390}391}392393// Then flush one by one.394395for (int i = 0; i < numArmRegs; i++) {396int a = DecodeReg(order[i]);397int m = ar[a].mipsReg;398399if (ar[a].isDirty) {400if (m == -1) {401INFO_LOG(Log::JIT, "ARM reg %d is dirty but has no mipsreg", a);402continue;403}404405fp_->STR(32, INDEX_UNSIGNED, (ARM64Reg)(a + S0), CTXREG, GetMipsRegOffset(m));406407mr[m].loc = ML_MEM;408mr[m].reg = (int)INVALID_REG;409ar[a].mipsReg = -1;410ar[a].isDirty = false;411} else {412if (m != -1) {413mr[m].loc = ML_MEM;414mr[m].reg = (int)INVALID_REG;415}416ar[a].mipsReg = -1;417// already not dirty418}419}420421// Sanity check422for (int i = 0; i < numARMFpuReg_; i++) {423if (ar[i].mipsReg != -1) {424ERROR_LOG(Log::JIT, "Flush fail: ar[%d].mipsReg=%d", i, ar[i].mipsReg);425}426}427pendingFlush = false;428}429430void Arm64RegCacheFPU::DiscardR(MIPSReg r) {431switch (mr[r].loc) {432case ML_IMM:433// IMM is always "dirty".434// IMM is not allowed for FP (yet).435ERROR_LOG(Log::JIT, "Imm in FP register?");436break;437438case ML_ARMREG:439if (mr[r].reg == INVALID_REG) {440ERROR_LOG(Log::JIT, "DiscardR: MipsReg had bad ArmReg");441} else {442// Note that we DO NOT write it back here. That's the whole point of Discard.443ar[mr[r].reg].isDirty = false;444ar[mr[r].reg].mipsReg = -1;445}446break;447448case ML_MEM:449// Already there, nothing to do.450break;451452default:453//BAD454break;455}456mr[r].loc = ML_MEM;457mr[r].reg = (int)INVALID_REG;458mr[r].tempLock = false;459mr[r].spillLock = false;460}461462bool Arm64RegCacheFPU::IsTempX(ARM64Reg r) const {463return ar[r - S0].mipsReg >= TEMP0;464}465466int Arm64RegCacheFPU::GetTempR() {467if (jo_->useASIMDVFPU) {468ERROR_LOG(Log::JIT, "VFP temps not allowed in NEON mode");469return 0;470}471pendingFlush = true;472for (int r = TEMP0; r < TEMP0 + NUM_TEMPS; ++r) {473if (mr[r].loc == ML_MEM && !mr[r].tempLock) {474mr[r].tempLock = true;475return r;476}477}478479ERROR_LOG(Log::CPU, "Out of temp regs! Might need to DiscardR() some");480_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");481return -1;482}483484int Arm64RegCacheFPU::GetMipsRegOffset(MIPSReg r) {485// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.486if (r < 0 || r > 32 + 128 + NUM_TEMPS) {487ERROR_LOG(Log::JIT, "bad mips register %i, out of range", r);488return 0; // or what?489}490491if (r < 32 || r >= 32 + 128) {492return (32 + r) << 2;493} else {494// r is between 32 and 128 + 32495return (32 + 32 + voffset[r - 32]) << 2;496}497}498499void Arm64RegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {500mr[r1].spillLock = true;501if (r2 != -1) mr[r2].spillLock = true;502if (r3 != -1) mr[r3].spillLock = true;503if (r4 != -1) mr[r4].spillLock = true;504}505506// This is actually pretty slow with all the 160 regs...507void Arm64RegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {508for (int i = 0; i < NUM_MIPSFPUREG; i++) {509mr[i].spillLock = false;510}511for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {512DiscardR(i);513}514}515516ARM64Reg Arm64RegCacheFPU::R(int mipsReg) {517if (mr[mipsReg].loc == ML_ARMREG) {518return (ARM64Reg)(mr[mipsReg].reg + S0);519} else {520if (mipsReg < 32) {521ERROR_LOG(Log::JIT, "FReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());522} else if (mipsReg < 32 + 128) {523ERROR_LOG(Log::JIT, "VReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());524} else {525ERROR_LOG(Log::JIT, "Tempreg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 128 - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());526}527return INVALID_REG; // BAAAD528}529}530531#endif // PPSSPP_ARCH(ARM64)532533534