CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Software/RasterizerRegCache.cpp
Views: 1401
// Copyright (c) 2021- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "GPU/Software/RasterizerRegCache.h"1819#include "Common/Arm64Emitter.h"2021namespace Rasterizer {2223void RegCache::SetupABI(const std::vector<Purpose> &args, bool forceRetain) {24#if PPSSPP_ARCH(ARM)25_assert_msg_(false, "Not yet implemented");26#elif PPSSPP_ARCH(ARM64_NEON)27using namespace Arm64Gen;2829// ARM64 has a generous allotment of registers.30static const Reg genArgs[] = { X0, X1, X2, X3, X4, X5, X6, X7 };31static const Reg vecArgs[] = { Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7 };32size_t genIndex = 0;33size_t vecIndex = 0;3435for (const Purpose &p : args) {36if ((p & FLAG_GEN) != 0) {37if (genIndex < ARRAY_SIZE(genArgs)) {38Add(genArgs[genIndex++], p);39if (forceRetain)40ForceRetain(p);41}42} else {43if (vecIndex < ARRAY_SIZE(vecArgs)) {44Add(vecArgs[vecIndex++], p);45if (forceRetain)46ForceRetain(p);47}48}49}5051// Any others are free and purposeless.52for (size_t i = genIndex; i < ARRAY_SIZE(genArgs); ++i)53Add(genArgs[i], GEN_INVALID);54for (size_t i = vecIndex; i < ARRAY_SIZE(vecArgs); ++i)55Add(vecArgs[i], VEC_INVALID);5657// Add all other caller saved regs without purposes yet.58static const Reg genTemps[] = { X8, X9, X10, X11, X12, X13, X14, X15 };59for (Reg r : genTemps)60Add(r, GEN_INVALID);61static const Reg vecTemps[] = { Q16, Q17, Q18, Q19, Q20, Q21, Q22, Q23 };62for (Reg r : vecTemps)63Add(r, VEC_INVALID);64// We also have X16-17 and Q24-Q31, but leave those for ordered paired instructions.65#elif PPSSPP_ARCH(X86)66_assert_msg_(false, "Not yet implemented");67#elif PPSSPP_ARCH(AMD64)68using namespace Gen;6970#if PPSSPP_PLATFORM(WINDOWS)71// The Windows convention is annoying, as it wastes registers and keeps to "positions."72Reg genArgs[] = { RCX, RDX, R8, R9 };73Reg vecArgs[] = { XMM0, XMM1, XMM2, XMM3, XMM4, XMM5 };7475for (size_t i = 0; i < args.size(); ++i) {76const Purpose &p = args[i];77if ((p & FLAG_GEN) != 0) {78if (i < ARRAY_SIZE(genArgs)) {79Add(genArgs[i], p);80genArgs[i] = INVALID_REG;81if (forceRetain)82ForceRetain(p);83}84} else {85if (i < ARRAY_SIZE(vecArgs)) {86Add(vecArgs[i], p);87vecArgs[i] = INVALID_REG;88if (forceRetain)89ForceRetain(p);90}91}92}9394// Any unused regs can be used freely as temps.95for (Reg r : genArgs) {96if (r != INVALID_REG)97Add(r, GEN_INVALID);98}99for (Reg r : vecArgs) {100if (r != INVALID_REG)101Add(r, VEC_INVALID);102}103104// Additionally, these three are volatile.105// Must save: RBX, RSP, RBP, RDI, RSI, R12-R15, XMM6-15106static const Reg genTemps[] = { RAX, R10, R11 };107for (Reg r : genTemps)108Add(r, GEN_INVALID);109#else110// Okay, first, allocate args. SystemV gives to the first of each usable pool.111static const Reg genArgs[] = { RDI, RSI, RDX, RCX, R8, R9 };112static const Reg vecArgs[] = { XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7 };113size_t genIndex = 0;114size_t vecIndex = 0;115116for (const Purpose &p : args) {117if ((p & FLAG_GEN) != 0) {118if (genIndex < ARRAY_SIZE(genArgs)) {119Add(genArgs[genIndex++], p);120if (forceRetain)121ForceRetain(p);122}123} else {124if (vecIndex < ARRAY_SIZE(vecArgs)) {125Add(vecArgs[vecIndex++], p);126if (forceRetain)127ForceRetain(p);128}129}130}131132// Any others are free and purposeless.133for (size_t i = genIndex; i < ARRAY_SIZE(genArgs); ++i)134Add(genArgs[i], GEN_INVALID);135for (size_t i = vecIndex; i < ARRAY_SIZE(vecArgs); ++i)136Add(vecArgs[i], VEC_INVALID);137138// Add all other caller saved regs without purposes yet.139// Must save: RBX, RSP, RBP, R12-R15140static const Reg genTemps[] = { RAX, R10, R11 };141for (Reg r : genTemps)142Add(r, GEN_INVALID);143static const Reg vecTemps[] = { XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15 };144for (Reg r : vecTemps)145Add(r, VEC_INVALID);146#endif147#elif PPSSPP_ARCH(RISCV64)148_assert_msg_(false, "Not yet implemented (no vector calling standard yet)");149#elif PPSSPP_ARCH(MIPS)150_assert_msg_(false, "Not yet implemented");151#else152_assert_msg_(false, "Not yet implemented");153#endif154}155156void RegCache::Reset(bool validate) {157if (validate) {158for (auto ® : regs) {159_assert_msg_(reg.locked == 0, "softjit: Reset() with reg still locked (%04X)", reg.purpose);160_assert_msg_(!reg.forceRetained, "softjit: Reset() with reg force retained (%04X)", reg.purpose);161}162}163regs.clear();164}165166void RegCache::Add(Reg r, Purpose p) {167for (auto ® : regs) {168if (reg.reg == r && (reg.purpose & FLAG_GEN) == (p & FLAG_GEN)) {169_assert_msg_(false, "softjit Add() reg duplicate (%04X)", p);170}171}172_assert_msg_(r != REG_INVALID_VALUE, "softjit Add() invalid reg (%04X)", p);173174RegStatus newStatus;175newStatus.reg = r;176newStatus.purpose = p;177regs.push_back(newStatus);178}179180void RegCache::Change(Purpose history, Purpose destiny) {181for (auto ® : regs) {182if (reg.purpose == history) {183reg.purpose = destiny;184return;185}186}187188_assert_msg_(false, "softjit Change() reg that isn't there (%04X)", history);189}190191void RegCache::Release(Reg &r, Purpose p) {192RegStatus *status = FindReg(r, p);193_assert_msg_(status != nullptr, "softjit Release() reg that isn't there (%04X)", p);194_assert_msg_(status->locked > 0, "softjit Release() reg that isn't locked (%04X)", p);195_assert_msg_(!status->forceRetained, "softjit Release() reg that is force retained (%04X)", p);196197status->locked--;198if (status->locked == 0) {199if ((status->purpose & FLAG_GEN) != 0)200status->purpose = GEN_INVALID;201else202status->purpose = VEC_INVALID;203}204205r = REG_INVALID_VALUE;206}207208void RegCache::Unlock(Reg &r, Purpose p) {209_assert_msg_((p & FLAG_TEMP) == 0, "softjit Unlock() temp reg (%04X)", p);210RegStatus *status = FindReg(r, p);211if (status) {212_assert_msg_(status->locked > 0, "softjit Unlock() reg that isn't locked (%04X)", p);213status->locked--;214r = REG_INVALID_VALUE;215return;216}217218_assert_msg_(false, "softjit Unlock() reg that isn't there (%04X)", p);219}220221bool RegCache::Has(Purpose p) {222for (auto ® : regs) {223if (reg.purpose == p) {224return true;225}226}227return false;228}229230RegCache::Reg RegCache::Find(Purpose p) {231for (auto ® : regs) {232if (reg.purpose == p) {233_assert_msg_(reg.locked <= 255, "softjit Find() reg has lots of locks (%04X)", p);234reg.locked++;235reg.everLocked = true;236return reg.reg;237}238}239_assert_msg_(false, "softjit Find() reg that isn't there (%04X)", p);240return REG_INVALID_VALUE;241}242243RegCache::Reg RegCache::Alloc(Purpose p) {244_assert_msg_(!Has(p), "softjit Alloc() reg duplicate (%04X)", p);245RegStatus *best = nullptr;246for (auto ® : regs) {247if (reg.locked != 0 || reg.forceRetained)248continue;249// Needs to be the same type.250if ((reg.purpose & FLAG_GEN) != (p & FLAG_GEN))251continue;252253if (best == nullptr)254best = ®255// Prefer a free/purposeless reg (includes INVALID.)256if ((reg.purpose & FLAG_TEMP) != 0) {257best = ®258break;259}260// But also prefer a lower priority reg.261if (reg.purpose < best->purpose)262best = ®263}264265if (best) {266best->locked = 1;267best->everLocked = true;268best->purpose = p;269return best->reg;270}271272_assert_msg_(false, "softjit Alloc() reg with none free (%04X)", p);273return REG_INVALID_VALUE;274}275276void RegCache::ForceRetain(Purpose p) {277for (auto ® : regs) {278if (reg.purpose == p) {279reg.forceRetained = true;280return;281}282}283284_assert_msg_(false, "softjit ForceRetain() reg that isn't there (%04X)", p);285}286287void RegCache::ForceRelease(Purpose p) {288for (auto ® : regs) {289if (reg.purpose == p) {290_assert_msg_(reg.locked == 0, "softjit ForceRelease() while locked (%04X)", p);291reg.forceRetained = false;292if ((reg.purpose & FLAG_GEN) != 0)293reg.purpose = GEN_INVALID;294else295reg.purpose = VEC_INVALID;296return;297}298}299300_assert_msg_(false, "softjit ForceRelease() reg that isn't there (%04X)", p);301}302303void RegCache::GrabReg(Reg r, Purpose p, bool &needsSwap, Reg swapReg, Purpose swapPurpose) {304for (auto ® : regs) {305if (reg.reg != r)306continue;307if ((reg.purpose & FLAG_GEN) != (p & FLAG_GEN))308continue;309310// Easy version, it's free.311if (reg.locked == 0 && !reg.forceRetained) {312needsSwap = false;313reg.purpose = p;314reg.locked = 1;315reg.everLocked = true;316return;317}318319// Okay, we need to swap. Find that reg.320needsSwap = true;321RegStatus *swap = FindReg(swapReg, swapPurpose);322if (swap) {323swap->purpose = reg.purpose;324swap->forceRetained = reg.forceRetained;325swap->locked = reg.locked;326swap->everLocked = true;327} else {328_assert_msg_(!Has(swapPurpose), "softjit GrabReg() wrong purpose (%04X)", swapPurpose);329RegStatus newStatus = reg;330newStatus.reg = swapReg;331newStatus.everLocked = true;332regs.push_back(newStatus);333}334335reg.purpose = p;336reg.locked = 1;337reg.everLocked = true;338reg.forceRetained = false;339return;340}341342_assert_msg_(false, "softjit GrabReg() reg that isn't there");343}344345bool RegCache::ChangeReg(Reg r, Purpose p) {346for (auto ® : regs) {347if (reg.reg != r)348continue;349if ((reg.purpose & FLAG_GEN) != (p & FLAG_GEN))350continue;351352if (reg.purpose == p)353return true;354_assert_msg_(!Has(p), "softjit ChangeReg() duplicate purpose (%04X)", p);355356if (reg.locked != 0 || reg.forceRetained)357return false;358359reg.purpose = p;360// Since we're setting it's purpose, we must've used it.361reg.everLocked = true;362return true;363}364365_assert_msg_(false, "softjit ChangeReg() reg that isn't there");366return false;367}368369bool RegCache::UsedReg(Reg r, Purpose flag) {370for (auto ® : regs) {371if (reg.reg != r)372continue;373if ((reg.purpose & FLAG_GEN) != (flag & FLAG_GEN))374continue;375return reg.everLocked;376}377378_assert_msg_(false, "softjit UsedReg() reg that isn't there");379return false;380}381382RegCache::RegStatus *RegCache::FindReg(Reg r, Purpose p) {383for (auto ® : regs) {384if (reg.reg == r && reg.purpose == p) {385return ®386}387}388389return nullptr;390}391392CodeBlock::CodeBlock(int size)393#if PPSSPP_ARCH(ARM64_NEON)394: fp(this)395#endif396{397AllocCodeSpace(size);398ClearCodeSpace(0);399400// Add some random code to "help" MSVC's buggy disassembler :(401#if defined(_WIN32) && (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !PPSSPP_PLATFORM(UWP)402using namespace Gen;403for (int i = 0; i < 100; i++) {404MOV(32, R(EAX), R(EBX));405RET();406}407#elif PPSSPP_ARCH(ARM)408BKPT(0);409BKPT(0);410#endif411}412413int CodeBlock::WriteProlog(int extraStack, const std::vector<RegCache::Reg> &vec, const std::vector<RegCache::Reg> &gen) {414savedStack_ = 0;415firstVecStack_ = extraStack;416prologVec_ = vec;417prologGen_ = gen;418419int totalStack = 0;420421#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)422using namespace Gen;423424BeginWrite(32768);425AlignCode16();426lastPrologStart_ = GetWritableCodePtr();427428for (X64Reg r : gen) {429PUSH(r);430regCache_.Add(r, RegCache::GEN_INVALID);431totalStack += 8;432}433434savedStack_ = 16 * (int)vec.size() + extraStack;435// We want to align if possible. It starts out unaligned.436if ((totalStack & 8) == 0)437savedStack_ += 8;438totalStack += savedStack_;439if (savedStack_ != 0)440SUB(64, R(RSP), Imm32(savedStack_));441442int nextOffset = extraStack;443for (X64Reg r : vec) {444MOVUPS(MDisp(RSP, nextOffset), r);445regCache_.Add(r, RegCache::VEC_INVALID);446nextOffset += 16;447}448449lastPrologEnd_ = GetWritableCodePtr();450#else451_assert_msg_(false, "Not yet implemented");452#endif453454return totalStack;455}456457const u8 *CodeBlock::WriteFinalizedEpilog() {458u8 *prologPtr = lastPrologStart_;459ptrdiff_t prologMaxSize = lastPrologEnd_ - lastPrologStart_;460lastPrologStart_ = nullptr;461lastPrologEnd_ = nullptr;462463#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)464using namespace Gen;465466bool prologChange = false;467int nextOffset = firstVecStack_;468for (X64Reg r : prologVec_) {469if (regCache_.UsedReg(r, RegCache::VEC_INVALID)) {470MOVUPS(r, MDisp(RSP, nextOffset));471nextOffset += 16;472} else {473prologChange = true;474}475}476477// We use the stack offset in generated code, so maintain any difference.478int unusedGenSpace = 0;479for (X64Reg r : prologGen_) {480if (!regCache_.UsedReg(r, RegCache::GEN_INVALID))481unusedGenSpace += 8;482}483if (unusedGenSpace != 0)484prologChange = true;485486if (savedStack_ + unusedGenSpace != 0)487ADD(64, R(RSP), Imm32(savedStack_ + unusedGenSpace));488for (int i = (int)prologGen_.size(); i > 0; --i) {489X64Reg r = prologGen_[i - 1];490if (regCache_.UsedReg(r, RegCache::GEN_INVALID))491POP(r);492}493494RET();495EndWrite();496497if (prologChange) {498// Okay, now let's rewrite the prolog since we didn't need all those regs.499XEmitter prolog(prologPtr);500if (PlatformIsWXExclusive()) {501ProtectMemoryPages(prologPtr, 128, MEM_PROT_READ | MEM_PROT_WRITE);502}503504// First, write the new prolog at the original position.505for (X64Reg r : prologGen_) {506if (regCache_.UsedReg(r, RegCache::GEN_INVALID))507prolog.PUSH(r);508}509510// Even if less of the stack is actually used, we want the number to match to references.511if (savedStack_ + unusedGenSpace != 0)512prolog.SUB(64, R(RSP), Imm32(savedStack_ + unusedGenSpace));513514nextOffset = firstVecStack_;515for (X64Reg r : prologVec_) {516if (regCache_.UsedReg(r, RegCache::VEC_INVALID)) {517prolog.MOVUPS(MDisp(RSP, nextOffset), r);518nextOffset += 16;519}520}521522ptrdiff_t prologLen = prolog.GetWritableCodePtr() - prologPtr;523if (prologLen < prologMaxSize) {524// We wrote it at the start, but we actually want it at the end.525u8 *oldPrologPtr = prologPtr;526prologPtr += prologMaxSize - prologLen;527memmove(prologPtr, oldPrologPtr, prologLen);528// Set INT3s before the new start to be safe.529memset(oldPrologPtr, 0xCC, prologMaxSize - prologLen);530}531532if (PlatformIsWXExclusive()) {533ProtectMemoryPages(prologPtr, 128, MEM_PROT_READ | MEM_PROT_EXEC);534}535}536#else537_assert_msg_(false, "Not yet implemented");538#endif539540return prologPtr;541}542543RegCache::Reg CodeBlock::GetZeroVec() {544if (!regCache_.Has(RegCache::VEC_ZERO)) {545#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)546using namespace Gen;547X64Reg r = regCache_.Alloc(RegCache::VEC_ZERO);548PXOR(r, R(r));549return r;550#else551return RegCache::REG_INVALID_VALUE;552#endif553}554return regCache_.Find(RegCache::VEC_ZERO);555}556557void CodeBlock::Describe(const std::string &message) {558descriptions_[GetCodePointer()] = message;559}560561std::string CodeBlock::DescribeCodePtr(const u8 *ptr) {562ptrdiff_t dist = 0x7FFFFFFF;563std::string found;564for (const auto &it : descriptions_) {565ptrdiff_t it_dist = ptr - it.first;566if (it_dist >= 0 && it_dist < dist) {567found = it.second;568dist = it_dist;569}570}571return found;572}573574void CodeBlock::Clear() {575ClearCodeSpace(0);576descriptions_.clear();577}578579void CodeBlock::WriteSimpleConst16x8(const u8 *&ptr, uint8_t value) {580if (ptr == nullptr)581WriteDynamicConst16x8(ptr, value);582}583584void CodeBlock::WriteSimpleConst8x16(const u8 *&ptr, uint16_t value) {585if (ptr == nullptr)586WriteDynamicConst8x16(ptr, value);587}588589void CodeBlock::WriteSimpleConst4x32(const u8 *&ptr, uint32_t value) {590if (ptr == nullptr)591WriteDynamicConst4x32(ptr, value);592}593594void CodeBlock::WriteDynamicConst16x8(const u8 *&ptr, uint8_t value) {595#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)596ptr = AlignCode16();597for (int i = 0; i < 16; ++i)598Write8(value);599#else600_assert_msg_(false, "Not yet implemented");601#endif602}603604void CodeBlock::WriteDynamicConst8x16(const u8 *&ptr, uint16_t value) {605#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)606ptr = AlignCode16();607for (int i = 0; i < 8; ++i)608Write16(value);609#else610_assert_msg_(false, "Not yet implemented");611#endif612}613614void CodeBlock::WriteDynamicConst4x32(const u8 *&ptr, uint32_t value) {615#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)616ptr = AlignCode16();617for (int i = 0; i < 4; ++i)618Write32(value);619#else620_assert_msg_(false, "Not yet implemented");621#endif622}623624};625626627