CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/x86/RegCacheFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <cstring>
#include <emmintrin.h>

#include "Common/Log.h"
#include "Common/x64Emitter.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/x86/Jit.h"
#include "Core/MIPS/x86/RegCache.h"
#include "Core/MIPS/x86/RegCacheFPU.h"

using namespace Gen;
using namespace X64JitConstants;

FPURegCache::FPURegCache() {
	// vregs aliases into regs: MIPS FPRs occupy indices [0, 32), VFPU regs
	// (and temps) follow at index 32 and up, so vregs[v] == regs[v + 32].
	vregs = regs + 32;
}

// Reset the cache for a new block: restore the precomputed "everything in
// memory, nothing dirty" initial state (built lazily on first call).
void FPURegCache::Start(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo, MIPSAnalyst::AnalysisResults &stats, bool useRip) {
	mips_ = mipsState;
	useRip_ = useRip;
	if (!initialReady) {
		SetupInitialRegs();
		initialReady = true;
	}

	memcpy(xregs, xregsInitial, sizeof(xregs));
	memcpy(regs, regsInitial, sizeof(regs));
	pendingFlush = false;

	js_ = js;
	jo_ = jo;
}

// Build the template initial state copied in by Start(): every x86 reg free
// and clean, every MIPS reg located at its home slot in MIPSState memory.
void FPURegCache::SetupInitialRegs() {
	for (int i = 0; i < NUM_X_FPREGS; i++) {
		memset(xregsInitial[i].mipsRegs, -1, sizeof(xregsInitial[i].mipsRegs));
		xregsInitial[i].dirty = false;
	}
	memset(regsInitial, 0, sizeof(regsInitial));
	// FPRs [0, 32) are contiguous floats, so compute one base OpArg and step
	// it instead of calling GetDefaultLocation 32 times.
	OpArg base = GetDefaultLocation(0);
	for (int i = 0; i < 32; i++) {
		regsInitial[i].location = base;
		base.IncreaseOffset(sizeof(float));
	}
	// VFPU regs [32, 32+128) are scattered per voffset[], so each needs its
	// own location lookup.
	for (int i = 32; i < 32 + 128; i++) {
		regsInitial[i].location = GetDefaultLocation(i);
	}
	// Temps after the VFPU regs are contiguous again.
	base = GetDefaultLocation(32 + 128);
	for (int i = 32 + 128; i < NUM_MIPS_FPRS; i++) {
		regsInitial[i].location = base;
		base.IncreaseOffset(sizeof(float));
	}
}

// Lock up to four regs against spilling; 0xFF marks an unused slot.
void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
	regs[p1].locked++;
	if (p2 != 0xFF) regs[p2].locked++;
	if (p3 != 0xFF) regs[p3].locked++;
	if (p4 != 0xFF) regs[p4].locked++;
}

// Spill-lock each element of an already-expanded vector reg list.
void FPURegCache::SpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked++;
	}
}

// Spill-lock a vector given by its base reg; expands to elements first.
void FPURegCache::SpillLockV(int vec, VectorSize sz) {
	u8 r[4];
	GetVectorRegs(r, sz, vec);
	SpillLockV(r, sz);
}

// Fully release (not just decrement) the spill locks on a vector.
void FPURegCache::ReleaseSpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked = 0;
	}
}

// Decrement one reg's lock count (undo a single SpillLock).
void FPURegCache::ReduceSpillLock(int mipsreg) {
	regs[mipsreg].locked--;
}

// Decrement each element's lock count (undo a single SpillLockV).
void FPURegCache::ReduceSpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked--;
	}
}

// Store oldreg to memory, then remap its x86 register to hold newreg instead
// (marked dirty). Used when a value is computed in place of another.
void FPURegCache::FlushRemap(int oldreg, int newreg) {
	OpArg oldLocation = regs[oldreg].location;
	_assert_msg_(oldLocation.IsSimpleReg(), "FlushRemap: Must already be in an x86 SSE register");
	_assert_msg_(regs[oldreg].lane == 0, "FlushRemap only supports FPR registers");

	X64Reg xr = oldLocation.GetSimpleReg();
	if (oldreg == newreg) {
		xregs[xr].dirty = true;
		return;
	}

	StoreFromRegister(oldreg);

	// Now, if newreg already was mapped somewhere, get rid of that.
	DiscardR(newreg);

	// Now, take over the old register.
	regs[newreg].location = oldLocation;
	regs[newreg].away = true;
	regs[newreg].locked = true;
	regs[newreg].lane = 0;
	xregs[xr].mipsReg = newreg;
	xregs[xr].dirty = true;
}

// Map a single VFPU reg (vregs index) into an x86 reg.
// Note: MAP_NOINIT is a multi-bit mask, hence the != MAP_NOINIT comparison —
// loading is skipped only when ALL noinit bits are set.
void FPURegCache::MapRegV(int vreg, int flags) {
	MapReg(vreg + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
}

// Map every element of a vector (by base reg) into individual x86 regs.
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
	u8 r[4];
	GetVectorRegs(r, sz, vec);
	SpillLockV(r, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
	}
	if ((flags & MAP_NOLOCK) != 0) {
		// We have to lock so the sz won't spill, so we unlock after.
		// If they were already locked, we only reduce the lock we added above.
		ReduceSpillLockV(r, sz);
	}
}

// Same as above, but for an already-expanded element list.
void FPURegCache::MapRegsV(const u8 *r, VectorSize sz, int flags) {
	SpillLockV(r, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
	}
	if ((flags & MAP_NOLOCK) != 0) {
		// We have to lock so the sz won't spill, so we unlock after.
		// If they were already locked, we only reduce the lock we added above.
		ReduceSpillLockV(r, sz);
	}
}

// True if the whole vector is already mapped as ONE SIMD x86 reg, with each
// element in the expected lane (lane is 1-based; 0 means "not SIMD").
bool FPURegCache::IsMappedVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	// Make sure the first reg is at least mapped in the right place.
	if (!IsMappedVS(v[0]))
		return false;
	if (vregs[v[0]].lane != 1)
		return false;

	// And make sure the rest are mapped to the same reg in the right positions.
	X64Reg xr = VSX(v);
	for (int i = 1; i < n; ++i) {
		u8 vi = v[i];
		if (!IsMappedVS(vi) || VSX(&vi) != xr)
			return false;
		if (vregs[vi].lane != i + 1)
			return false;
	}
	// TODO: Optimize this case? It happens.
	// Reject if extra regs are packed in beyond the vector's n elements.
	for (int i = n; i < 4; ++i) {
		if (xregs[xr].mipsRegs[i] != -1) {
			return false;
		}
	}
	return true;
}

// Map a vector as a single SIMD reg, storing conflicting singles first if the
// initial attempt fails. Asserts if it still can't be done after that.
void FPURegCache::MapRegsVS(const u8 *r, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);

	_dbg_assert_msg_(jo_->enableVFPUSIMD, "Should not map simd regs when option is off.");

	if (!TryMapRegsVS(r, vsz, flags)) {
		// TODO: Could be more optimal.
		for (int i = 0; i < n; ++i) {
			StoreFromRegisterV(r[i]);
		}
		if (!TryMapRegsVS(r, vsz, flags)) {
			_dbg_assert_msg_(false, "MapRegsVS() failed on second try.");
		}
	}
}

// Check whether a vector CAN be mapped as SIMD without disturbing locked or
// already-SIMD-mapped registers. Does not modify any state.
bool FPURegCache::CanMapVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	if (!jo_->enableVFPUSIMD) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		return true;
	} else if (vregs[v[0]].lane != 0) {
		const MIPSCachedFPReg &v0 = vregs[v[0]];
		_dbg_assert_msg_(v0.away, "Must be away when lane != 0");
		_dbg_assert_msg_(v0.location.IsSimpleReg(), "Must be is register when lane != 0");

		// Already in a different simd set.
		return false;
	}

	if (vregs[v[0]].locked) {
		// If it's locked, we can't mess with it.
		return false;
	}

	// Next, fail if any of the other regs are in simd currently.
	// TODO: Only if locked? Not sure if it will be worth breaking them anyway.
	for (int i = 1; i < n; ++i) {
		if (vregs[v[i]].lane != 0) {
			return false;
		}
		// If it's locked, in simd or not, we can't use it.
		if (vregs[v[i]].locked) {
			return false;
		}
		_assert_msg_(!vregs[v[i]].location.IsImm(), "Cannot handle imms in fp cache.");
	}

	return true;
}

// Try to map a vector into one SIMD x86 reg. Returns false if CanMapVS says
// no; otherwise loads/assembles the elements and updates all bookkeeping.
bool FPURegCache::TryMapRegsVS(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);

	if (!CanMapVS(v, vsz)) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		// Already mapped then, perfect. Just mark dirty.
		if ((flags & MAP_DIRTY) != 0)
			xregs[VSX(v)].dirty = true;
		if ((flags & MAP_NOLOCK) == 0)
			SpillLockV(v, vsz);
		return true;
	}

	// At this point, some or all are in single regs or memory, and they're not locked there.

	if (n == 1) {
		// Single is easy, just map normally but track as a SIMD reg.
		// This way V/VS can warn about improper usage properly.
		MapRegV(v[0], flags);
		X64Reg vx = VX(v[0]);
		if (vx == INVALID_REG)
			return false;

		vregs[v[0]].lane = 1;
		if ((flags & MAP_DIRTY) != 0)
			xregs[vx].dirty = true;
		if ((flags & MAP_NOLOCK) == 0)
			SpillLockV(v, vsz);
		Invariant();
		return true;
	}

	X64Reg xr;
	if ((flags & MAP_NOINIT) != MAP_NOINIT) {
		xr = LoadRegsVS(v, n);
	} else {
		xr = GetFreeXReg();
	}

	// Victory, now let's clean up everything.
	OpArg newloc = Gen::R(xr);
	bool dirty = (flags & MAP_DIRTY) != 0;
	for (int i = 0; i < n; ++i) {
		MIPSCachedFPReg &vr = vregs[v[i]];
		if (vr.away) {
			// Clear the xreg it was in before.
			X64Reg oldXReg = vr.location.GetSimpleReg();
			if (oldXReg != xr) {
				xregs[oldXReg].mipsReg = -1;
			}
			if (xregs[oldXReg].dirty) {
				// Inherit the "dirtiness" (ultimately set below for all regs.)
				dirty = true;
				xregs[oldXReg].dirty = false;
			}
		}
		xregs[xr].mipsRegs[i] = v[i] + 32;
		vr.location = newloc;
		vr.lane = i + 1;
		vr.away = true;
	}
	xregs[xr].dirty = dirty;

	if ((flags & MAP_NOLOCK) == 0) {
		SpillLockV(v, vsz);
	}

	Invariant();
	return true;
}

// Gather n (2..4) vector elements into one SIMD reg, choosing between a
// single aligned/unaligned vector load (when the elements are sequential in
// memory) and assembling lanes with MOVSS/UNPCKLPS/SHUFPS. Returns the reg
// holding the result; bookkeeping is left to the caller (TryMapRegsVS).
X64Reg FPURegCache::LoadRegsVS(const u8 *v, int n) {
	int regsAvail = 0;
	int regsLoaded = 0;
	X64Reg xrs[4] = {INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG};
	bool xrsLoaded[4] = {false, false, false, false};

	_dbg_assert_msg_(n >= 2 && n <= 4, "LoadRegsVS is only implemented for simd loads.");

	// First pass: note which elements are already in x86 regs we can reuse.
	for (int i = 0; i < n; ++i) {
		const MIPSCachedFPReg &mr = vregs[v[i]];
		if (mr.away) {
			X64Reg mrx = mr.location.GetSimpleReg();
			// If it's not simd, or lanes 1+ are clear, we can use it.
			if (mr.lane == 0 || xregs[mrx].mipsRegs[1] == -1) {
				// Okay, there's nothing else in this reg, so we can use it.
				xrsLoaded[i] = true;
				xrs[i] = mrx;
				++regsLoaded;
				++regsAvail;
			} else if (mr.lane != 0) {
				_dbg_assert_msg_(false, "LoadRegsVS is not able to handle simd remapping yet, store first.");
			}
		}
	}

	if (regsAvail < n) {
		// Try to grab some without spilling.
		X64Reg xrFree[4];
		int obtained = GetFreeXRegs(xrFree, n - regsAvail, false);
		int pos = 0;
		for (int i = 0; i < n && pos < obtained; ++i) {
			if (xrs[i] == INVALID_REG) {
				// Okay, it's not loaded but we have a reg for this slot.
				xrs[i] = xrFree[pos++];
				++regsAvail;
			}
		}
	}

	// Let's also check if the memory addresses are sequential.
	int sequential = 1;
	for (int i = 1; i < n; ++i) {
		if (v[i] < 128 && v[i - 1] < 128) {
			if (voffset[v[i]] != voffset[v[i - 1]] + 1) {
				break;
			}
		} else if (v[i] >= 128 && v[i - 1] >= 128) {
			if (v[i] != v[i - 1] + 1) {
				break;
			}
		} else {
			// Temps can't be sequential with non-temps.
			break;
		}
		++sequential;
	}

	// Did we end up with enough regs?
	// TODO: Not handling the case of some regs avail and some loaded right now.
	if (regsAvail < n && (sequential != n || regsLoaded == n || regsAvail == 0)) {
		// Force-spill to get two regs; the assembly paths below need at most two.
		regsAvail = GetFreeXRegs(xrs, 2, true);
		_dbg_assert_msg_(regsAvail >= 2, "Ran out of fp regs for loading simd regs with.");
		_dbg_assert_msg_(xrs[0] != xrs[1], "Regs for simd load are the same, bad things await.");
		// We spilled, so we assume that all our regs are screwed up now anyway.
		for (int i = 0; i < 4; ++i) {
			xrsLoaded[i] = false;
		}
		for (int i = 2; i < n; ++i){
			xrs[i] = INVALID_REG;
		}
		regsLoaded = 0;
	}

	// If they're sequential, and we wouldn't need to store them all, use a single load.
	// But if they're already loaded, we'd have to store, not worth it.
	X64Reg res = INVALID_REG;
	if (sequential == n && regsLoaded < n) {
		// TODO: What should we do if some are in regs? Better to assemble?
		for (int i = 0; i < n; ++i) {
			StoreFromRegisterV(v[i]);
		}

		// Grab any available reg.
		for (int i = 0; i < n; ++i) {
			if (xrs[i] != INVALID_REG) {
				res = xrs[i];
				break;
			}
		}
		const float *f = v[0] < 128 ? &mips_->v[voffset[v[0]]] : &mips_->tempValues[v[0] - 128];
		if (((intptr_t)f & 0x7) == 0 && n == 2) {
			emit->MOVQ_xmm(res, vregs[v[0]].location);
		} else if (((intptr_t)f & 0xf) == 0) {
			// On modern processors, MOVUPS on aligned is fast, but maybe not on older ones.
			emit->MOVAPS(res, vregs[v[0]].location);
		} else {
			emit->MOVUPS(res, vregs[v[0]].location);
		}
	} else if (regsAvail >= n) {
		// Have enough regs, potentially all in regs.
		auto loadXR = [&](int l) {
			if (!xrsLoaded[l] && n >= l + 1) {
				emit->MOVSS(xrs[l], vregs[v[l]].location);
			}
		};
		// The order here is intentional.
		loadXR(3);
		loadXR(1);
		loadXR(2);
		loadXR(0);
		if (n == 4) {
			// This gives us [w, y] in the y reg.
			emit->UNPCKLPS(xrs[1], Gen::R(xrs[3]));
		}
		if (n >= 3) {
			// This gives us [z, x]. Then we combine with y.
			emit->UNPCKLPS(xrs[0], Gen::R(xrs[2]));
		}
		if (n >= 2) {
			emit->UNPCKLPS(xrs[0], Gen::R(xrs[1]));
		}
		res = xrs[0];
	} else {
		_dbg_assert_msg_(n > 2, "2 should not be possible here.");

		// Available regs are less than n, and some may be loaded.
		// Let's grab the most optimal unloaded ones.
		X64Reg xr1 = n == 3 ? xrs[1] : xrs[3];
		X64Reg xr2 = xrs[2];
		if (xr1 == INVALID_REG) {
			// Not one of the available ones. Grab another.
			for (int i = n - 1; i >= 0; --i) {
				if (xrs[i] != INVALID_REG && xrs[i] != xr2) {
					StoreFromRegisterV(v[i]);
					xr1 = xrs[i];
					break;
				}
			}
		}
		if (xr2 == INVALID_REG) {
			// Not one of the available ones. Grab another.
			for (int i = n - 1; i >= 0; --i) {
				if (xrs[i] != INVALID_REG && xrs[i] != xr1) {
					StoreFromRegisterV(v[i]);
					xr2 = xrs[i];
					break;
				}
			}
		}

		if (n == 3) {
			if (!vregs[v[2]].location.IsSimpleReg(xr2))
				emit->MOVSS(xr2, vregs[v[2]].location);
			if (!vregs[v[1]].location.IsSimpleReg(xr1))
				emit->MOVSS(xr1, vregs[v[1]].location);
			emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(3, 0, 0, 0));
			emit->MOVSS(xr2, vregs[v[0]].location);
			emit->MOVSS(xr1, Gen::R(xr2));
		} else if (n == 4) {
			if (!vregs[v[2]].location.IsSimpleReg(xr2))
				emit->MOVSS(xr2, vregs[v[2]].location);
			if (!vregs[v[3]].location.IsSimpleReg(xr1))
				emit->MOVSS(xr1, vregs[v[3]].location);
			emit->UNPCKLPS(xr2, Gen::R(xr1));
			emit->MOVSS(xr1, vregs[v[1]].location);
			emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(1, 0, 0, 3));
			emit->MOVSS(xr2, vregs[v[0]].location);
			emit->MOVSS(xr1, Gen::R(xr2));
		}
		res = xr1;
	}

	return res;
}

// Map a destination and a source vector as SIMD in one shot; vd is mapped
// dirty (or noinit when avoidLoad). Releases the spill locks either way.
bool FPURegCache::TryMapDirtyInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, bool avoidLoad) {
	// Don't waste time mapping if some will for sure fail.
	if (!CanMapVS(vd, vdsz) || !CanMapVS(vs, vssz)) {
		return false;
	}
	// But, they could still fail based on overlap. Hopefully not common...
	bool success = TryMapRegsVS(vs, vssz, 0);
	if (success) {
		success = TryMapRegsVS(vd, vdsz, avoidLoad ? MAP_NOINIT : MAP_DIRTY);
	}
	ReleaseSpillLockV(vs, vssz);
	ReleaseSpillLockV(vd, vdsz);

	_dbg_assert_msg_(!success || IsMappedVS(vd, vdsz), "vd should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vs, vssz), "vs should be mapped now");

	return success;
}

// Like TryMapDirtyInVS, but with two source vectors (vs and vt).
bool FPURegCache::TryMapDirtyInInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, const u8 *vt, VectorSize vtsz, bool avoidLoad) {
	// Don't waste time mapping if some will for sure fail.
	if (!CanMapVS(vd, vdsz) || !CanMapVS(vs, vssz) || !CanMapVS(vt, vtsz)) {
		return false;
	}

	// But, they could still fail based on overlap. Hopefully not common...
	bool success = TryMapRegsVS(vs, vssz, 0);
	if (success) {
		success = TryMapRegsVS(vt, vtsz, 0);
	}
	if (success) {
		success = TryMapRegsVS(vd, vdsz, avoidLoad ? MAP_NOINIT : MAP_DIRTY);
	}
	ReleaseSpillLockV(vd, vdsz);
	ReleaseSpillLockV(vs, vssz);
	ReleaseSpillLockV(vt, vtsz);

	_dbg_assert_msg_(!success || IsMappedVS(vd, vdsz), "vd should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vs, vssz), "vs should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vt, vtsz), "vt should be mapped now");

	return success;
}

// Demote each element of a vector from SIMD to "simple" (single-lane) state.
void FPURegCache::SimpleRegsV(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);
	// TODO: Could be more optimal (in case of Discard or etc.)
	for (int i = 0; i < n; ++i) {
		SimpleRegV(v[i], flags);
	}
}

// Matrix variant: demote all n*n elements (column-major stride of 4).
void FPURegCache::SimpleRegsV(const u8 *v, MatrixSize msz, int flags) {
	const int n = GetMatrixSide(msz);
	// TODO: Could be more optimal (in case of Discard or etc.)
	for (int i = 0; i < n; ++i) {
		for (int j = 0; j < n; ++j) {
			SimpleRegV(v[j * 4 + i], flags);
		}
	}
}

// Ensure a single VFPU reg is NOT in a multi-lane SIMD reg, storing or
// discarding as needed so per-lane code can treat it as a scalar.
void FPURegCache::SimpleRegV(const u8 v, int flags) {
	MIPSCachedFPReg &vr = vregs[v];
	// Special optimization: if it's in a single simd, we can keep it there.
	if (vr.lane == 1 && xregs[VSX(&v)].mipsRegs[1] == -1) {
		if (flags & MAP_DIRTY) {
			xregs[VSX(&v)].dirty = true;
		}
		// Just change the lane to 0.
		vr.lane = 0;
	} else if (vr.lane != 0) {
		// This will never end up in a register this way, so ignore dirty.
		if ((flags & MAP_NOINIT) == MAP_NOINIT) {
			// This will discard only this reg, and store the others.
			DiscardV(v);
		} else {
			StoreFromRegisterV(v);
		}
	} else if (vr.away) {
		// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
		if (flags & MAP_DIRTY) {
			xregs[VX(v)].dirty = true;
		}
		_assert_msg_(vr.location.IsSimpleReg(), "not loaded and not simple.");
	}
	Invariant();
}

// Clear the spill lock on one reg entirely.
void FPURegCache::ReleaseSpillLock(int mipsreg) {
	regs[mipsreg].locked = 0;
}

// Clear all spill locks and discard the JIT temp regs.
void FPURegCache::ReleaseSpillLocks() {
	for (int i = 0; i < NUM_MIPS_FPRS; i++)
		regs[i].locked = 0;
	for (int i = TEMP0; i < TEMP0 + NUM_X86_FPU_TEMPS; ++i)
		DiscardR(i);
}

// Map MIPS reg i into an x86 reg, optionally loading its current value and
// optionally marking it dirty. Flushes a SIMD mapping first if needed.
void FPURegCache::MapReg(const int i, bool doLoad, bool makeDirty) {
	pendingFlush = true;
	_assert_msg_(!regs[i].location.IsImm(), "WTF - FPURegCache::MapReg - imm");
	_assert_msg_(i >= 0 && i < NUM_MIPS_FPRS, "WTF - FPURegCache::MapReg - invalid mips reg %d", i);

	if (!regs[i].away) {
		// Reg is at home in the memory register file. Let's pull it out.
		X64Reg xr = GetFreeXReg();
		_assert_msg_(xr < NUM_X_FPREGS, "WTF - FPURegCache::MapReg - invalid reg %d", (int)xr);
		xregs[xr].mipsReg = i;
		xregs[xr].dirty = makeDirty;
		OpArg newloc = ::Gen::R(xr);
		if (doLoad) {
			emit->MOVSS(xr, regs[i].location);
		}
		regs[i].location = newloc;
		regs[i].lane = 0;
		regs[i].away = true;
	} else if (regs[i].lane != 0) {
		// Well, darn. This means we need to flush it.
		// TODO: This could be more optimal. Also check flags.
		StoreFromRegister(i);
		MapReg(i, doLoad, makeDirty);
	} else {
		// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
		xregs[RX(i)].dirty |= makeDirty;
		_assert_msg_(regs[i].location.IsSimpleReg(), "not loaded and not simple.");
	}
	Invariant();
}

// Build a SHUFPS immediate that swaps the given lane into lane 0 (and back,
// since the swap is symmetric), leaving other lanes rearranged accordingly.
static int MMShuffleSwapTo0(int lane) {
	if (lane == 0) {
		return _MM_SHUFFLE(3, 2, 1, 0);
	} else if (lane == 1) {
		return _MM_SHUFFLE(3, 2, 0, 1);
	} else if (lane == 2) {
		return _MM_SHUFFLE(3, 0, 1, 2);
	} else if (lane == 3) {
		return _MM_SHUFFLE(0, 2, 1, 3);
	} else {
		_assert_msg_(false, "MMShuffleSwapTo0: Invalid lane %d", lane);
		return 0;
	}
}

// Write reg i (and, for SIMD mappings, its lane-mates) back to memory and
// free the x86 register. Uses MOVAPS/MOVUPS/MOVQ multistores when the lane
// members are sequential in memory; otherwise rotates lanes out with SHUFPS.
void FPURegCache::StoreFromRegister(int i) {
	_assert_msg_(!regs[i].location.IsImm(), "WTF - FPURegCache::StoreFromRegister - it's an imm");
	_assert_msg_(i >= 0 && i < NUM_MIPS_FPRS, "WTF - FPURegCache::StoreFromRegister - invalid mipsreg %i PC=%08x", i, js_->compilerPC);

	if (regs[i].away) {
		X64Reg xr = regs[i].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "WTF - FPURegCache::StoreFromRegister - invalid reg: x %i (mr: %i). PC=%08x", (int)xr, i, js_->compilerPC);
		if (regs[i].lane != 0) {
			// Count how many lane members are sequential in memory (temps and
			// VFPU regs can't mix; VFPU adjacency goes through voffset[]).
			const int *mri = xregs[xr].mipsRegs;
			int seq = 1;
			for (int j = 1; j < 4; ++j) {
				if (mri[j] == -1) {
					break;
				}
				if (mri[j] - 32 >= 128 && mri[j - 1] - 32 >= 128 && mri[j] == mri[j - 1] + 1) {
					seq++;
				} else if (mri[j] - 32 < 128 && mri[j - 1] - 32 < 128 && voffset[mri[j] - 32] == voffset[mri[j - 1] - 32] + 1) {
					seq++;
				} else {
					break;
				}
			}

			const float *f = mri[0] - 32 < 128 ? &mips_->v[voffset[mri[0] - 32]] : &mips_->tempValues[mri[0] - 32 - 128];
			int align = (intptr_t)f & 0xf;

			// If we can do a multistore...
			if ((seq == 2 && (align & 0x7) == 0) || seq == 4) {
				OpArg newLoc = GetDefaultLocation(mri[0]);
				if (xregs[xr].dirty) {
					if (seq == 4 && align == 0)
						emit->MOVAPS(newLoc, xr);
					else if (seq == 4)
						emit->MOVUPS(newLoc, xr);
					else
						emit->MOVQ_xmm(newLoc, xr);
				}
				for (int j = 0; j < seq; ++j) {
					int mr = xregs[xr].mipsRegs[j];
					if (mr == -1) {
						continue;
					}
					OpArg newLoc = GetDefaultLocation(mr);
					regs[mr].location = newLoc;
					regs[mr].away = false;
					regs[mr].lane = 0;
					xregs[xr].mipsRegs[j] = -1;
				}
			} else {
				seq = 0;
			}
			// Store the rest one lane at a time, swapping each into lane 0 first.
			for (int j = seq; j < 4; ++j) {
				int mr = xregs[xr].mipsRegs[j];
				if (mr == -1) {
					continue;
				}
				if (j != 0 && xregs[xr].dirty) {
					emit->SHUFPS(xr, Gen::R(xr), MMShuffleSwapTo0(j));
				}
				OpArg newLoc = GetDefaultLocation(mr);
				if (xregs[xr].dirty) {
					emit->MOVSS(newLoc, xr);
				}
				regs[mr].location = newLoc;
				regs[mr].away = false;
				regs[mr].lane = 0;
				xregs[xr].mipsRegs[j] = -1;
			}
		} else {
			OpArg newLoc = GetDefaultLocation(i);
			xregs[xr].mipsReg = -1;
			if (xregs[xr].dirty) {
				emit->MOVSS(newLoc, xr);
			}
			regs[i].location = newLoc;
		}
		xregs[xr].dirty = false;
		regs[i].away = false;
	} else {
		//	_assert_msg_(false,"already stored");
	}
	Invariant();
}

// Drop reg i's cached value WITHOUT writing it back. For SIMD mappings, only
// this reg's lane is discarded; the other lanes are stored normally.
void FPURegCache::DiscardR(int i) {
	_assert_msg_(!regs[i].location.IsImm(), "FPU can't handle imm yet.");
	if (regs[i].away) {
		X64Reg xr = regs[i].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
		// Note that we DO NOT write it back here. That's the whole point of Discard.
		if (regs[i].lane != 0) {
			// But we can't just discard all of them in SIMD, just the one lane.
			// TODO: Potentially this could be more optimal (MOVQ or etc.)
			xregs[xr].mipsRegs[regs[i].lane - 1] = -1;
			regs[i].lane = 0;
			for (int j = 0; j < 4; ++j) {
				int mr = xregs[xr].mipsRegs[j];
				if (mr == -1) {
					continue;
				}
				if (j != 0 && xregs[xr].dirty) {
					emit->SHUFPS(xr, Gen::R(xr), MMShuffleSwapTo0(j));
				}

				OpArg newLoc = GetDefaultLocation(mr);
				if (xregs[xr].dirty) {
					emit->MOVSS(newLoc, xr);
				}
				regs[mr].location = newLoc;
				regs[mr].away = false;
				regs[mr].lane = 0;
				xregs[xr].mipsRegs[j] = -1;
			}
		} else {
			xregs[xr].mipsReg = -1;
		}
		xregs[xr].dirty = false;
		regs[i].location = GetDefaultLocation(i);
		regs[i].away = false;
		regs[i].tempLocked = false;
	} else {
		//	_assert_msg_(false,"already stored");
		regs[i].tempLocked = false;
	}
	Invariant();
}

// Discard an entire SIMD set by any member vreg — all lanes are dropped
// without any writeback (unlike DiscardR, which stores the other lanes).
void FPURegCache::DiscardVS(int vreg) {
	_assert_msg_(!vregs[vreg].location.IsImm(), "FPU can't handle imm yet.");

	if (vregs[vreg].away) {
		_assert_msg_(vregs[vreg].lane != 0, "VS expects a SIMD reg.");
		X64Reg xr = vregs[vreg].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
		// Note that we DO NOT write it back here. That's the whole point of Discard.
		for (int i = 0; i < 4; ++i) {
			int mr = xregs[xr].mipsRegs[i];
			if (mr != -1) {
				regs[mr].location = GetDefaultLocation(mr);
				regs[mr].away = false;
				regs[mr].tempLocked = false;
				regs[mr].lane = 0;
			}
			xregs[xr].mipsRegs[i] = -1;
		}
		xregs[xr].dirty = false;
	} else {
		vregs[vreg].tempLocked = false;
	}
	Invariant();
}

// True if this x86 reg currently holds one of the JIT temp regs.
bool FPURegCache::IsTempX(X64Reg xr) {
	return xregs[xr].mipsReg >= TEMP0;
}

// Reserve one free temp reg slot; asserts and returns -1 if none remain.
int FPURegCache::GetTempR() {
	pendingFlush = true;
	for (int r = TEMP0; r < TEMP0 + NUM_X86_FPU_TEMPS; ++r) {
		if (!regs[r].away && !regs[r].tempLocked) {
			regs[r].tempLocked = true;
			return r;
		}
	}

	_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
	return -1;
}

// Reserve n temp reg slots for a vector, preferring n consecutive slots
// (so they can later be stored/loaded as a sequence). Fills v with vreg
// indices (temp index minus 32). Returns 0 on success, -1 on failure.
int FPURegCache::GetTempVS(u8 *v, VectorSize vsz) {
	pendingFlush = true;
	const int n = GetNumVectorElements(vsz);

	// Let's collect regs as we go, but try for n free in a row.
	int found = 0;
	for (int r = TEMP0; r <= TEMP0 + NUM_X86_FPU_TEMPS - n; ++r) {
		if (regs[r].away || regs[r].tempLocked) {
			continue;
		}

		// How many free siblings does this have?
		int seq = 1;
		for (int i = 1; i < n; ++i) {
			if (regs[r + i].away || regs[r + i].tempLocked) {
				break;
			}
			++seq;
		}

		if (seq == n) {
			// Got 'em. Exactly as many as we need.
			for (int i = 0; i < n; ++i) {
				v[i] = r + i - 32;
			}
			found = n;
			break;
		}

		if (found < n) {
			v[found++] = r - 32;
		}
	}

	if (found != n) {
		_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
		return -1;
	}

	for (int i = 0; i < n; ++i) {
		regs[v[i] + 32].tempLocked = true;
	}

	return 0; // ??
}

// Write every dirty cached reg back to memory and free all x86 regs.
// No-op if nothing was mapped since the last flush (pendingFlush).
void FPURegCache::Flush() {
	if (!pendingFlush) {
		return;
	}
	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
		_assert_msg_(!regs[i].locked, "Somebody forgot to unlock MIPS reg %d.", i);
		if (regs[i].away) {
			if (regs[i].location.IsSimpleReg()) {
				X64Reg xr = RX(i);
				StoreFromRegister(i);
				xregs[xr].dirty = false;
			} else if (regs[i].location.IsImm()) {
				StoreFromRegister(i);
			} else {
				_assert_msg_(false, "Jit64 - Flush unhandled case, reg %i PC: %08x", i, mips_->pc);
			}
		}
	}
	pendingFlush = false;
	Invariant();
}

// Compute the in-memory home of a reg index: FPRs [0,32) via a small context
// displacement, VFPU regs via voffset[], temps at the end. Uses RIP-relative
// addressing when available (useRip_), which saves a register on x64.
OpArg FPURegCache::GetDefaultLocation(int reg) const {
	if (reg < 32) {
		// Smaller than RIP addressing since we can use a byte offset.
		return MDisp(CTXREG, reg * 4);
	} else if (reg < 32 + 128) {
		// Here, RIP has the advantage so let's use it when possible
		if (useRip_) {
			return M(&mips_->v[voffset[reg - 32]]);  // rip accessible
		} else {
			return MIPSSTATE_VAR_ELEM32(v[0], voffset[reg - 32]);
		}
	} else {
		if (useRip_) {
			return M(&mips_->tempValues[reg - 32 - 128]);  // rip accessible
		} else {
			return MIPSSTATE_VAR_ELEM32(tempValues[0], reg - 32 - 128);
		}
	}
}

// Debug hook: run SanityCheck after every state mutation when enabled.
void FPURegCache::Invariant() const {
#if 0
	_assert_msg_(SanityCheck() == 0, "Sanity check failed: %d", SanityCheck());
#endif
}

// Decode a VFPU matrix index from a cache reg index (-1 for non-VFPU regs).
static int GetMRMtx(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 2) & 7;
}

// Decode a VFPU row index from a cache reg index (-1 for non-VFPU regs).
static int GetMRRow(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 0) & 3;
}

// Decode a VFPU column index from a cache reg index (-1 for non-VFPU regs).
static int GetMRCol(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 5) & 3;
}

// True if the cache reg index refers to a JIT temp (past the VFPU range).
static bool IsMRTemp(int mr) {
	return mr >= 128 + 32;
}

// Cross-check regs[] against xregs[]; returns 0 when consistent, otherwise a
// small code identifying which invariant failed (useful in Invariant()).
int FPURegCache::SanityCheck() const {
	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
		const MIPSCachedFPReg &mr = regs[i];

		// FPR can never have imms.
		if (mr.location.IsImm())
			return 1;

		bool reallyAway = mr.location.IsSimpleReg();
		if (reallyAway != mr.away)
			return 2;

		if (mr.lane < 0 || mr.lane > 4)
			return 3;
		if (mr.lane != 0 && !reallyAway)
			return 4;

		if (mr.away) {
			Gen::X64Reg simple = mr.location.GetSimpleReg();
			if (mr.lane == 0) {
				if (xregs[simple].mipsReg != i)
					return 5;
				for (int j = 1; j < 4; ++j) {
					if (xregs[simple].mipsRegs[j] != -1)
						return 6;
				}
			} else {
				if (xregs[simple].mipsRegs[mr.lane - 1] != i)
					return 7;
			}
		}
	}

	for (int i = 0; i < NUM_X_FPREGS; ++i) {
		const X64CachedFPReg &xr = xregs[i];
		bool hasReg = xr.mipsReg != -1;
		if (!hasReg && xr.dirty)
			return 8;

		bool hasMoreRegs = hasReg;
		int mtx = -2;
		int row = -2;
		int col = -2;
		bool rowMatched = true;
		bool colMatched = true;
		for (int j = 0; j < 4; ++j) {
			if (xr.mipsRegs[j] == -1) {
				hasMoreRegs = false;
				continue;
			}
			if (xr.mipsRegs[j] >= NUM_MIPS_FPRS) {
				return 13;
			}
			// We can't have a hole in the middle / front.
			if (!hasMoreRegs)
				return 9;

			const MIPSCachedFPReg &mr = regs[xr.mipsRegs[j]];
			if (!mr.location.IsSimpleReg(X64Reg(i)))
				return 10;

			if (!IsMRTemp(xr.mipsRegs[j])) {
				if (mtx == -2)
					mtx = GetMRMtx(xr.mipsRegs[j]);
				else if (mtx != GetMRMtx(xr.mipsRegs[j]))
					return 11;

				if (row == -2)
					row = GetMRRow(xr.mipsRegs[j]);
				else if (row != GetMRRow(xr.mipsRegs[j]))
					rowMatched = false;

				if (col == -2)
					col = GetMRCol(xr.mipsRegs[j]);
				else if (col != GetMRCol(xr.mipsRegs[j]))
					colMatched = false;
			}
		}
		// A SIMD set must stay within one row OR one column of a matrix.
		if (!rowMatched && !colMatched) {
			return 12;
		}
	}

	return 0;
}

// Allocation order for x86 SSE regs. On x64, caller-saved high regs first;
// XMM0/XMM1 are reserved for scratch in both cases.
const int *FPURegCache::GetAllocationOrder(int &count) {
	static const int allocationOrder[] = {
#if PPSSPP_ARCH(AMD64)
		XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5
#elif PPSSPP_ARCH(X86)
		XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
#endif
	};
	count = sizeof(allocationOrder) / sizeof(int);
	return allocationOrder;
}

// Get a single free x86 reg, spilling if necessary; asserts on failure.
X64Reg FPURegCache::GetFreeXReg() {
	X64Reg res;
	int obtained = GetFreeXRegs(&res, 1);

	_assert_msg_(obtained == 1, "Regcache ran out of regs");
	return res;
}

// Collect up to n free x86 regs into res, optionally spilling unlocked
// mapped regs to make room. Unfilled slots are set to INVALID_REG; returns
// how many were actually obtained.
int FPURegCache::GetFreeXRegs(X64Reg *res, int n, bool spill) {
	pendingFlush = true;
	int aCount;
	const int *aOrder = GetAllocationOrder(aCount);

	_dbg_assert_msg_(n <= NUM_X_FPREGS - 2, "Cannot obtain that many regs.");

	int r = 0;

	for (int i = 0; i < aCount; i++) {
		X64Reg xr = (X64Reg)aOrder[i];
		if (xregs[xr].mipsReg == -1) {
			res[r++] = (X64Reg)xr;
			if (r >= n) {
				break;
			}
		}
	}

	if (r < n && spill) {
		// Okay, not found :(... Force grab one.
		// TODO - add a pass to grab xregs whose mipsreg is not used in the next 3 instructions.
		for (int i = 0; i < aCount; i++) {
			X64Reg xr = (X64Reg)aOrder[i];
			int preg = xregs[xr].mipsReg;
			_assert_msg_(preg >= -1 && preg < NUM_MIPS_FPRS, "WTF - FPURegCache::GetFreeXRegs - invalid mips reg %d in xr %d", preg, (int)xr);

			// We're only spilling here, so don't overlap.
			if (preg != -1 && !regs[preg].locked) {
				StoreFromRegister(preg);
				res[r++] = xr;
				if (r >= n) {
					break;
				}
			}
		}
	}

	for (int i = r; i < n; ++i) {
		res[i] = INVALID_REG;
	}
	return r;
}

// Store whatever MIPS reg occupies the given x86 reg back to memory.
void FPURegCache::FlushX(X64Reg reg) {
	if (reg >= NUM_X_FPREGS) {
		_assert_msg_(false, "Flushing non existent reg");
	} else if (xregs[reg].mipsReg != -1) {
		StoreFromRegister(xregs[reg].mipsReg);
	}
}

// Snapshot the full cache state (used around conditional code paths).
void FPURegCache::GetState(FPURegCacheState &state) const {
	memcpy(state.regs, regs, sizeof(regs));
	memcpy(state.xregs, xregs, sizeof(xregs));
}

// Restore a snapshot taken by GetState; forces a flush on next Flush().
void FPURegCache::RestoreState(const FPURegCacheState& state) {
	memcpy(regs, state.regs, sizeof(regs));
	memcpy(xregs, state.xregs, sizeof(xregs));
	pendingFlush = true;
}

#endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)