Path: blob/master/src/core/cpu_recompiler_arm64.cpp
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "cpu_recompiler_arm64.h"
#include "cpu_core_private.h"
#include "cpu_pgxp.h"
#include "gte.h"
#include "settings.h"
#include "timing_event.h"

#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "common/string_util.h"

#include <limits>

#ifdef CPU_ARCH_ARM64

#include "vixl/aarch64/constants-aarch64.h"

#ifdef ENABLE_HOST_DISASSEMBLY
#include "vixl/aarch64/disasm-aarch64.h"
#endif

LOG_CHANNEL(Recompiler);

#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))

#define RWRET vixl::aarch64::w0
#define RXRET vixl::aarch64::x0
#define RWARG1 vixl::aarch64::w0
#define RXARG1 vixl::aarch64::x0
#define RWARG2 vixl::aarch64::w1
#define RXARG2 vixl::aarch64::x1
#define RWARG3 vixl::aarch64::w2
#define RXARG3 vixl::aarch64::x2
#define RWSCRATCH vixl::aarch64::w16
#define RXSCRATCH vixl::aarch64::x16
#define RSTATE vixl::aarch64::x19
#define RMEMBASE vixl::aarch64::x20

static bool armIsCallerSavedRegister(u32 id);
static s64 armGetPCDisplacement(const void* current, const void* target);
static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
                           bool sign_extend_word = false);
static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
                            const vixl::aarch64::Register& tempreg = RXSCRATCH);
static u8* armGetJumpTrampoline(const void* target);
static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);

static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
static std::unordered_map<const void*, u32> s_trampoline_targets;
static u8* s_trampoline_start_ptr = nullptr;
static u32 s_trampoline_used = 0;

namespace CPU {

using namespace vixl::aarch64;

static ARM64Recompiler s_instance;
Recompiler* g_compiler = &s_instance;

} // namespace CPU

bool armIsCallerSavedRegister(u32 id)
{
  // same on both linux and windows
  return (id <= 18);
}

void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm)
{
  // From vixl macro assembler.
  DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());
  DebugAssert(rd.GetCode() != vixl::aarch64::sp.GetCode());

  if (imm == 0)
  {
    armAsm->mov(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd));
    return;
  }

  // The worst case for size is mov 64-bit immediate to sp:
  // * up to 4 instructions to materialise the constant
  // * 1 instruction to move to sp

  // Immediates on Aarch64 can be produced using an initial value, and zero to
  // three move keep operations.
  //
  // Initial values can be generated with:
  // 1. 64-bit move zero (movz).
  // 2. 32-bit move inverted (movn).
  // 3. 64-bit move inverted.
  // 4. 32-bit orr immediate.
  // 5. 64-bit orr immediate.
  // Move-keep may then be used to modify each of the 16-bit half words.
  //
  // The code below supports all five initial value generators, and
  // applying move-keep operations to move-zero and move-inverted initial
  // values.

  // Try to move the immediate in one instruction, and if that fails, switch to
  // using multiple instructions.
  const unsigned reg_size = rd.GetSizeInBits();

  if (vixl::aarch64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())
  {
    // Immediate can be represented in a move zero instruction. Movz can't write
    // to the stack pointer.
    armAsm->movz(rd, imm);
    return;
  }
  else if (vixl::aarch64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())
  {
    // Immediate can be represented in a move negative instruction. Movn can't
    // write to the stack pointer.
    armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & vixl::aarch64::kWRegMask));
    return;
  }
  else if (vixl::aarch64::Assembler::IsImmLogical(imm, reg_size))
  {
    // Immediate can be represented in a logical orr instruction.
    DebugAssert(!rd.IsZero());
    armAsm->orr(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd), imm);
    return;
  }

  // Generic immediate case. Imm will be represented by
  // [imm3, imm2, imm1, imm0], where each imm is 16 bits.
  // A move-zero or move-inverted is generated for the first non-zero or
  // non-0xffff immX, and a move-keep for subsequent non-zero immX.

  uint64_t ignored_halfword = 0;
  bool invert_move = false;
  // If the number of 0xffff halfwords is greater than the number of 0x0000
  // halfwords, it's more efficient to use move-inverted.
  if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))
  {
    ignored_halfword = 0xffff;
    invert_move = true;
  }

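  // As a worked example of the generic path below (illustrative only, the
  // destination register depends on the caller): moving the 32-bit constant
  // 0x00012345, which fits none of the single-instruction forms above, should
  // come out as two instructions:
  //   movz w0, #0x2345           ; first non-ignored halfword
  //   movk w0, #0x1, lsl #16     ; patch the upper halfword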
  // Iterate through the halfwords. Use movn/movz for the first non-ignored
  // halfword, and movk for subsequent halfwords.
  DebugAssert((reg_size % 16) == 0);
  bool first_mov_done = false;
  for (unsigned i = 0; i < (reg_size / 16); i++)
  {
    uint64_t imm16 = (imm >> (16 * i)) & 0xffff;
    if (imm16 != ignored_halfword)
    {
      if (!first_mov_done)
      {
        if (invert_move)
          armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);
        else
          armAsm->movz(rd, imm16, 16 * i);
        first_mov_done = true;
      }
      else
      {
        // Construct a wider constant.
        armAsm->movk(rd, imm16, 16 * i);
      }
    }
  }

  DebugAssert(first_mov_done);
}

s64 armGetPCDisplacement(const void* current, const void* target)
{
  // pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
  // pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
  return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
}

bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);

  return (vixl::IsInt21(page_displacement) && (vixl::aarch64::Assembler::IsImmAddSub(page_offset) ||
                                               vixl::aarch64::Assembler::IsImmLogical(page_offset, 64)));
}

void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr)
{
  DebugAssert(reg.IsX());

  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
  if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmAddSub(page_offset))
  {
    armAsm->adrp(reg, page_displacement);
    armAsm->add(reg, reg, page_offset);
  }
  else if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmLogical(page_offset, 64))
  {
    armAsm->adrp(reg, page_displacement);
    armAsm->orr(reg, reg, page_offset);
  }
  else
  {
    armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));
  }
}

void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  s64 displacement = armGetPCDisplacement(cur, ptr);
  bool use_blr = !vixl::IsInt26(displacement);
  bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
  if (use_blr && use_trampoline && !force_inline)
  {
    if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    {
      displacement = armGetPCDisplacement(cur, trampoline);
      use_blr = !vixl::IsInt26(displacement);
    }
  }

  if (use_blr)
  {
    armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
    armAsm->br(RXSCRATCH);
  }
  else
  {
    armAsm->b(displacement);
  }
}

void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  s64 displacement = armGetPCDisplacement(cur, ptr);
  bool use_blr = !vixl::IsInt26(displacement);
  bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
  if (use_blr && use_trampoline && !force_inline)
  {
    if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    {
      displacement = armGetPCDisplacement(cur, trampoline);
      use_blr = !vixl::IsInt26(displacement);
    }
  }

  if (use_blr)
  {
    armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
    armAsm->blr(RXSCRATCH);
  }
  else
  {
    armAsm->bl(displacement);
  }
}

void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr)
{
  const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
                                             reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
  // pxAssert(Common::IsAligned(jump_distance, 4));

  if (vixl::aarch64::Instruction::IsValidImmPCOffset(vixl::aarch64::CondBranchType, jump_distance >> 2))
  {
    armAsm->b(jump_distance >> 2, cond);
  }
  else
  {
    vixl::aarch64::Label branch_not_taken;
    armAsm->b(&branch_not_taken, InvertCondition(cond));

    const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
                                                   reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
    armAsm->b(new_jump_distance >> 2);
    armAsm->bind(&branch_not_taken);
  }
}

void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
                    bool sign_extend_word)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
  vixl::aarch64::MemOperand memop;

  const vixl::aarch64::Register xreg = reg.X();
  if (vixl::IsInt21(page_displacement))
  {
    armAsm->adrp(xreg, page_displacement);
    memop = vixl::aarch64::MemOperand(xreg, static_cast<int64_t>(page_offset));
  }
  else
  {
    armMoveAddressToReg(armAsm, xreg, addr);
    memop = vixl::aarch64::MemOperand(xreg);
  }

  if (sign_extend_word)
    armAsm->ldrsw(reg, memop);
  else
    armAsm->ldr(reg, memop);
}

[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
                                      const void* addr, const vixl::aarch64::Register& tempreg)
{
  DebugAssert(tempreg.IsX());

  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);

  if (vixl::IsInt21(page_displacement))
  {
    armAsm->adrp(tempreg, page_displacement);
    armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast<int64_t>(page_offset)));
  }
  else
  {
    armMoveAddressToReg(armAsm, tempreg, addr);
    armAsm->str(reg, vixl::aarch64::MemOperand(tempreg));
  }
}

u8* armGetJumpTrampoline(const void* target)
{
  auto it = s_trampoline_targets.find(target);
  if (it != s_trampoline_targets.end())
    return s_trampoline_start_ptr + it->second;

  // align to 16 bytes?
  const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);

  // 4 movs plus a jump
  if (TRAMPOLINE_AREA_SIZE - offset < 20)
  {
    Panic("Ran out of space in constant pool");
    return nullptr;
  }

  u8* start = s_trampoline_start_ptr + offset;
  vixl::aarch64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope armAsmCheck(&armAsm, TRAMPOLINE_AREA_SIZE - offset,
                                         vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif
  armMoveAddressToReg(&armAsm, RXSCRATCH, target);
  armAsm.br(RXSCRATCH);
  armAsm.FinalizeCode();

  const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
  DebugAssert(size < 20);
  s_trampoline_targets.emplace(target, offset);
  s_trampoline_used = offset + static_cast<u32>(size);

  MemMap::FlushInstructionCache(start, size);
  return start;
}

void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
{
  size_t addr = armAsm->GetCursorAddress<size_t>();
  const size_t end_addr = Common::AlignUpPow2(addr, alignment);
  while (addr != end_addr)
  {
    armAsm->nop();
    addr += vixl::aarch64::kInstructionSize;
  }
}

void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
  class MyDisassembler : public vixl::aarch64::Disassembler
  {
  protected:
    void ProcessOutput(const vixl::aarch64::Instruction* instr) override
    {
      DEBUG_LOG("0x{:016X} {:08X}\t\t{}", reinterpret_cast<uint64_t>(instr), instr->GetInstructionBits(), GetOutput());
    }
  };

  vixl::aarch64::Decoder decoder;
  MyDisassembler disas;
  decoder.AppendVisitor(&disas);
  decoder.Decode(static_cast<const vixl::aarch64::Instruction*>(start),
                 reinterpret_cast<const vixl::aarch64::Instruction*>(static_cast<const u8*>(start) + size));
#else
  ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
#endif
}

u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
  return size / vixl::aarch64::kInstructionSize;
}

u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
  using namespace vixl::aarch64;

  const s64 disp = armGetPCDisplacement(code, dst);
  DebugAssert(vixl::IsInt26(disp));

  const u32 new_code = B | Assembler::ImmUncondBranch(disp);
  std::memcpy(code, &new_code, sizeof(new_code));
  if (flush_icache)
    MemMap::FlushInstructionCache(code, kInstructionSize);

  return kInstructionSize;
}

u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
  using namespace vixl::aarch64;

  Assembler actual_asm(static_cast<u8*>(code), code_size);
  Assembler* RESTRICT armAsm = &actual_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  Label dispatch;
  Label run_events_and_dispatch;

  g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
  {
    // Need the CPU state for basically everything :-)
    armMoveAddressToReg(armAsm, RSTATE, &g_state);

    // Fastmem setup, oldrec doesn't need it
    if (IsUsingFastmem())
      armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));

    // Fall through to event dispatcher
  }

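  // The stubs below roughly implement the following loop (sketch, not emitted
  // verbatim):
  //   for (;;) {
  //     if (g_state.pending_ticks >= g_state.downcount)
  //       TimingEvents::RunEvents();
  //     jump to g_code_lut[pc >> 16][(pc & 0xFFFF) >> 2];
  //   }
  // with the compile/revalidate, discard, and interpret stubs all re-entering
  // at the dispatch label.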
  // check events then for frame done
  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RWARG2, PTR(&g_state.downcount));
    armAsm->cmp(RWARG1, RWARG2);
    armAsm->b(&dispatch, lt);

    g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
    armAsm->bind(&run_events_and_dispatch);
    armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_dispatcher = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->bind(&dispatch);

    // x9 <- s_fast_map[pc >> 16]
    armAsm->ldr(RWARG1, PTR(&g_state.pc));
    armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
    armAsm->lsr(RWARG2, RWARG1, 16);
    armAsm->ubfx(RWARG1, RWARG1, 2, 14);
    armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));

    // blr(x9[pc * 2]) (fast_map[pc >> 2])
    armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));
    armAsm->br(RXARG1);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pc));
    armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
    armAsm->b(&dispatch);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pc));
    armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
    armAsm->b(&dispatch);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_interpret_block = armAsm->GetCursorAddress<const void*>();
  {
    armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RWARG2, PTR(&g_state.downcount));
    armAsm->cmp(RWARG1, RWARG2);
    armAsm->b(&run_events_and_dispatch, ge);
    armAsm->b(&dispatch);
  }

  armAsm->FinalizeCode();

  s_trampoline_targets.clear();
  s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
  s_trampoline_used = 0;

  return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
}

void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
  constexpr u8 padding_value = 0x00;
  std::memset(dst, padding_value, size);
}

CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
{
}

CPU::ARM64Recompiler::~ARM64Recompiler() = default;

const void* CPU::ARM64Recompiler::GetCurrentCodePointer()
{
  return armAsm->GetCursorAddress<const void*>();
}

void CPU::ARM64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
                                 u32 far_code_space)
{
  Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);

  // TODO: don't recreate this every time..
  DebugAssert(!armAsm);
  m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
  m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
  armAsm = &m_emitter;

#ifdef VIXL_DEBUG
  m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,
                                                                 vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
  m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
    &m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

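  // Register usability assumptions for the loop below: w0-w2 double as the
  // argument/return registers, w16 is the scratch register, x19 holds the CPU
  // state pointer, x20 holds the fastmem base (when enabled), x18 is reserved
  // as the platform register, and indices 30 and up (lr/sp) are never
  // allocated. Everything else is handed to the register allocator and tagged
  // callee-saved where appropriate.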
  // Need to wipe it out so it's correct when toggling fastmem.
  m_host_regs = {};

  const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    HostRegAlloc& ra = m_host_regs[i];

    if (i == RWARG1.GetCode() || i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||
        i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30)
    {
      continue;
    }

    ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
  }
}

void CPU::ARM64Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
{
  DebugAssert(armAsm == &m_emitter);
  if (emit_jump)
  {
    const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    if (cond != Condition::al)
    {
      if (vixl::IsInt19(disp))
      {
        armAsm->b(disp, cond);
      }
      else
      {
        Label skip;
        armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
        armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
        armAsm->bind(&skip);
      }
    }
    else
    {
      armAsm->b(disp);
    }
  }
  armAsm = &m_far_emitter;
}

void CPU::ARM64Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
{
  const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
  if (vixl::IsInt14(disp))
  {
    armAsm->tbnz(reg, bit, disp);
  }
  else
  {
    Label skip;
    armAsm->tbz(reg, bit, &skip);
    armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    armAsm->bind(&skip);
  }

  armAsm = &m_far_emitter;
}

void CPU::ARM64Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
{
  const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
  if (vixl::IsInt19(disp))
  {
    nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
  }
  else
  {
    Label skip;
    nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
    armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    armAsm->bind(&skip);
  }

  armAsm = &m_far_emitter;
}

void CPU::ARM64Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
{
  DebugAssert(armAsm == &m_far_emitter);
  if (emit_jump)
  {
    const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
    (cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
  }
  armAsm = &m_emitter;
}

void CPU::ARM64Recompiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)
{
  armEmitMov(armAsm, dst, val);
}

void CPU::ARM64Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
{
  armEmitCall(armAsm, ptr, force_inline);
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(s32 val)
{
  if (Assembler::IsImmAddSub(val))
    return vixl::aarch64::Operand(static_cast<int64_t>(val));

  EmitMov(RWSCRATCH, static_cast<u32>(val));
  return vixl::aarch64::Operand(RWSCRATCH);
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(u32 val)
{
  return armCheckAddSubConstant(static_cast<s32>(val));
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckCompareConstant(s32 val)
{
  if (Assembler::IsImmConditionalCompare(val))
    return vixl::aarch64::Operand(static_cast<int64_t>(val));

  EmitMov(RWSCRATCH, static_cast<u32>(val));
  return vixl::aarch64::Operand(RWSCRATCH);
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckLogicalConstant(u32 val)
{
  if (Assembler::IsImmLogical(val, 32))
    return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));

  EmitMov(RWSCRATCH, val);
  return vixl::aarch64::Operand(RWSCRATCH);
}

void CPU::ARM64Recompiler::BeginBlock()
{
  Recompiler::BeginBlock();
}

void CPU::ARM64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
  // store it first to reduce code size, because we can offset
  armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
  armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);

  bool first = true;
  u32 offset = 0;
  Label block_changed;

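  // Sketch of the comparison strategy below: compare the block's RAM against
  // the shadow copy 16 bytes at a time with NEON (cmeq results accumulated
  // into v0), then fall back to 8- and 4-byte scalar compares for the tail.
  // Any mismatch branches to block_changed, which jumps to the
  // discard-and-recompile stub.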
  while (size >= 16)
  {
    const VRegister vtmp = v2.V4S();
    const VRegister dst = first ? v0.V4S() : v1.V4S();
    armAsm->ldr(dst, MemOperand(RXARG1, offset));
    armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
    armAsm->cmeq(dst, dst, vtmp);
    if (!first)
      armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());
    else
      first = false;

    offset += 16;
    size -= 16;
  }

  if (!first)
  {
    // TODO: make sure this doesn't choke on ffffffff
    armAsm->uminv(s0, v0.V4S());
    armAsm->fcmp(s0, 0.0);
    armAsm->b(&block_changed, eq);
  }

  while (size >= 8)
  {
    armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
    armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
    armAsm->cmp(RXARG3, RXSCRATCH);
    armAsm->b(&block_changed, ne);
    offset += 8;
    size -= 8;
  }

  while (size >= 4)
  {
    armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
    armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
    armAsm->cmp(RWARG3, RWSCRATCH);
    armAsm->b(&block_changed, ne);
    offset += 4;
    size -= 4;
  }

  DebugAssert(size == 0);

  Label block_unchanged;
  armAsm->b(&block_unchanged);
  armAsm->bind(&block_changed);
  armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
  armAsm->bind(&block_unchanged);
}

void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
{
  if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
  {
    if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    {
      armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());
      armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
      armEmitMov(armAsm, RWARG3, m_block->size);
      armAsm->mul(RWARG2, RWARG2, RWARG3);
      armAsm->add(RWARG1, RWARG1, RWARG2);
      armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    }
    else
    {
      armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
      armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
      armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    }
  }
  else if (m_block->icache_line_count > 0)
  {
    const auto& ticks_reg = RWARG1;
    const auto& current_tag_reg = RWARG2;
    const auto& existing_tag_reg = RWARG3;
    const auto& fill_ticks_reg = w4;
    const auto& ticks_to_add_reg = w5;

    VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    if (fill_ticks <= 0)
      return;

    armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
    armEmitMov(armAsm, current_tag_reg, current_pc);
    armEmitMov(armAsm, fill_ticks_reg, fill_ticks);

    for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    {
      const u32 line = GetICacheLine(current_pc);
      const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));

      Label cache_hit;
      armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
      armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
      armAsm->cmp(existing_tag_reg, current_tag_reg);
      armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
      armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);

      if (i != (m_block->icache_line_count - 1))
        armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
    }

    armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
  }
}

void CPU::ARM64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
                                        s32 arg3reg /*= -1*/)
{
  if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
    armAsm->mov(RXARG1, XRegister(arg1reg));
  if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
    armAsm->mov(RXARG2, XRegister(arg2reg));
  if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
    armAsm->mov(RXARG3, XRegister(arg3reg));
  EmitCall(func);
}

void CPU::ARM64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
{
  if (newpc.has_value())
  {
    if (m_dirty_pc || m_compiler_pc != newpc)
    {
      EmitMov(RWSCRATCH, newpc.value());
      armAsm->str(RWSCRATCH, PTR(&g_state.pc));
    }
  }
  m_dirty_pc = false;

  // flush regs
  Flush(FLUSH_END_BLOCK);
  EndAndLinkBlock(newpc, do_event_test, false);
}

void CPU::ARM64Recompiler::EndBlockWithException(Exception excode)
{
  // flush regs, but not pc, it's going to get overwritten
  // flush cycles because of the GTE instruction stuff...
  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // TODO: flush load delay

  EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
                                                              inst->cop.cop_n));
  EmitMov(RWARG2, m_current_instruction_pc);
  if (excode != Exception::BP)
  {
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
  }
  else
  {
    EmitMov(RWARG3, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
  }
  m_dirty_pc = false;

  EndAndLinkBlock(std::nullopt, true, false);
}

void CPU::ARM64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
{
  // event test
  // pc should've been flushed
  DebugAssert(!m_dirty_pc && !m_block_ended);
  m_block_ended = true;

  // TODO: try extracting this to a function

  // save cycles for event test
  const TickCount cycles = std::exchange(m_cycles, 0);

  // pending_ticks += cycles
  // if (pending_ticks >= downcount) { dispatch_event(); }
  if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
  if (do_event_test)
    armAsm->ldr(RWARG2, PTR(&g_state.downcount));
  if (cycles > 0)
    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
  if (m_gte_done_cycle > cycles)
  {
    armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
    armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
  }
  if (do_event_test)
    armAsm->cmp(RWARG1, RWARG2);
  if (cycles > 0)
    armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
  if (do_event_test)
    armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);

  // jump to dispatcher or next block
  if (force_run_events)
  {
    armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
  }
  else if (!newpc.has_value())
  {
    armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
  }
  else
  {
    const void* target = (newpc.value() == m_block->pc) ?
                           CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
                                                          armAsm->GetBuffer()->GetStartAddress<const void*>()) :
                           CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
    armEmitJmp(armAsm, target, true);
  }
}

const void* CPU::ARM64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
{
#ifdef VIXL_DEBUG
  m_emitter_check.reset();
  m_far_emitter_check.reset();
#endif

  m_emitter.FinalizeCode();
  m_far_emitter.FinalizeCode();

  u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
  *code_size = static_cast<u32>(m_emitter.GetCursorOffset());
  *far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
  armAsm = nullptr;
  return code;
}

const char* CPU::ARM64Recompiler::GetHostRegName(u32 reg) const
{
  static constexpr std::array<const char*, 32> reg64_names = {
    {"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
     "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
  return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
}

void CPU::ARM64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
{
  EmitMov(WRegister(reg), val);
}

void CPU::ARM64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
{
  armAsm->ldr(WRegister(reg), PTR(ptr));
}

void CPU::ARM64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
{
  armAsm->str(WRegister(reg), PTR(ptr));
}

void CPU::ARM64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
{
  if (val == 0)
  {
    armAsm->str(wzr, PTR(ptr));
    return;
  }

  EmitMov(RWSCRATCH, val);
  armAsm->str(RWSCRATCH, PTR(ptr));
}

void CPU::ARM64Recompiler::CopyHostReg(u32 dst, u32 src)
{
  if (src != dst)
    armAsm->mov(WRegister(dst), WRegister(src));
}

void CPU::ARM64Recompiler::AssertRegOrConstS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s || cf.const_s);
}

void CPU::ARM64Recompiler::AssertRegOrConstT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t || cf.const_t);
}

vixl::aarch64::MemOperand CPU::ARM64Recompiler::MipsPtr(Reg r) const
{
  DebugAssert(r < Reg::count);
  return PTR(&g_state.regs.r[static_cast<u32>(r)]);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegD(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_d);
  return WRegister(cf.host_d);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s);
  return WRegister(cf.host_s);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t);
  return WRegister(cf.host_t);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegLO(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_lo);
  return WRegister(cf.host_lo);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegHI(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_hi);
  return WRegister(cf.host_hi);
}

void CPU::ARM64Recompiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
{
  DebugAssert(dst.IsW());
  if (cf.valid_host_s)
  {
    if (cf.host_s != dst.GetCode())
      armAsm->mov(dst, WRegister(cf.host_s));
  }
  else if (cf.const_s)
  {
    const u32 cv = GetConstantRegU32(cf.MipsS());
    if (cv == 0)
      armAsm->mov(dst, wzr);
    else
      EmitMov(dst, cv);
  }
  else
  {
    WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
    armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
  }
}

void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
{
  DebugAssert(dst.IsW());
  if (cf.valid_host_t)
  {
    if (cf.host_t != dst.GetCode())
      armAsm->mov(dst, WRegister(cf.host_t));
  }
  else if (cf.const_t)
  {
    const u32 cv = GetConstantRegU32(cf.MipsT());
    if (cv == 0)
      armAsm->mov(dst, wzr);
    else
      EmitMov(dst, cv);
  }
  else
  {
    WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
    armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
  }
}

void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg)
{
  DebugAssert(reg < Reg::count && dst.IsW());
  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
    armAsm->mov(dst, WRegister(hreg.value()));
  else if (HasConstantReg(reg))
    EmitMov(dst, GetConstantRegU32(reg));
  else
    armAsm->ldr(dst, MipsPtr(reg));
}

void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
                                                        Reg arg3reg /* = Reg::count */)
{
  DebugAssert(g_settings.gpu_pgxp_enable);

  Flush(FLUSH_FOR_C_CALL);

  if (arg2reg != Reg::count)
    MoveMIPSRegToReg(RWARG2, arg2reg);
  if (arg3reg != Reg::count)
    MoveMIPSRegToReg(RWARG3, arg3reg);

  EmitMov(RWARG1, arg1val);
  EmitCall(func);
}

void CPU::ARM64Recompiler::Flush(u32 flags)
{
  Recompiler::Flush(flags);

  if (flags & FLUSH_PC && m_dirty_pc)
  {
    StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
    m_dirty_pc = false;
  }

  if (flags & FLUSH_INSTRUCTION_BITS)
  {
    // This sucks, but it's only used for fallbacks.
    EmitMov(RWARG1, inst->bits);
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
    armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
    armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
    armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
  }

  if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
  {
    // This sucks :(
    // TODO: make it a function?
    armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
    armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
    EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
    armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
    armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
    EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
    armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
    m_load_delay_dirty = false;
  }

  if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
  {
    if (m_load_delay_value_register != NUM_HOST_REGS)
      FreeHostReg(m_load_delay_value_register);

    EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
    armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
    m_load_delay_register = Reg::count;
    m_load_delay_dirty = true;
  }

  if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
  {
    // May as well flush cycles while we're here.
    // GTE spanning blocks is very rare, we _could_ disable this for speed.
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
    if (m_cycles > 0)
    {
      armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
      m_cycles = 0;
    }
    armAsm->cmp(RWARG2, RWARG1);
    armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
    armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    m_dirty_gte_done_cycle = false;
  }

  if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));

    // update cycles at the same time
    if (flags & FLUSH_CYCLES && m_cycles > 0)
    {
      armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
      armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
      m_gte_done_cycle -= m_cycles;
      m_cycles = 0;
    }

    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
    armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
    m_gte_done_cycle = 0;
    m_dirty_gte_done_cycle = true;
  }

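  // FLUSH_CYCLES: fold the cycles accumulated for this block so far into
  // pending_ticks, keeping m_gte_done_cycle relative to the new baseline.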
  if (flags & FLUSH_CYCLES && m_cycles > 0)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
    armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    m_cycles = 0;
  }
}

void CPU::ARM64Recompiler::Compile_Fallback()
{
  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
              inst->bits);

  Flush(FLUSH_FOR_INTERPRETER);

  EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));

  // TODO: make me less garbage
  // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
  // but nothing should be going through here..
  Label no_load_delay;
  armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
  armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
  armAsm->b(&no_load_delay, eq);
  armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
  armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
  armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
  EmitMov(RWARG1, static_cast<u32>(Reg::count));
  armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
  armAsm->bind(&no_load_delay);

  m_load_delay_dirty = EMULATE_LOAD_DELAYS;
}

void CPU::ARM64Recompiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)
{
  DebugAssert(pcreg.IsW());
  if (!g_settings.cpu_recompiler_memory_exceptions)
    return;

  armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
  SwitchToFarCode(true, ne);

  BackupHostState();
  EndBlockWithException(Exception::AdEL);

  RestoreHostState();
  SwitchToNearCode(false);
}

void CPU::ARM64Recompiler::Compile_jr(CompileFlags cf)
{
  const Register pcreg = CFGetRegS(cf);
  CheckBranchTarget(pcreg);

  armAsm->str(pcreg, PTR(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::ARM64Recompiler::Compile_jalr(CompileFlags cf)
{
  const Register pcreg = CFGetRegS(cf);
  if (MipsD() != Reg::zero)
    SetConstantReg(MipsD(), GetBranchReturnAddress(cf));

  CheckBranchTarget(pcreg);
  armAsm->str(pcreg, PTR(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::ARM64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
{
  AssertRegOrConstS(cf);

  const u32 taken_pc = GetConditionalBranchTarget(cf);

  Flush(FLUSH_FOR_BRANCH);

  DebugAssert(cf.valid_host_s);

  // MipsT() here should equal zero for zero branches.
  DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);

  Label taken;
  const Register rs = CFGetRegS(cf);
  switch (cond)
  {
    case BranchCondition::Equal:
    case BranchCondition::NotEqual:
    {
      AssertRegOrConstT(cf);
      if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
      {
        (cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
      }
      else
      {
        if (cf.valid_host_t)
          armAsm->cmp(rs, CFGetRegT(cf));
        else if (cf.const_t)
          armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));

        armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
      }
    }
    break;

    case BranchCondition::GreaterThanZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, gt);
    }
    break;

    case BranchCondition::GreaterEqualZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, ge);
    }
    break;

    case BranchCondition::LessThanZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, lt);
    }
    break;

    case BranchCondition::LessEqualZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, le);
    }
    break;
  }

  BackupHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(m_compiler_pc, true);

  armAsm->bind(&taken);

  RestoreHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(taken_pc, true);
}

void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf, bool overflow)
{
  const Register rs = CFGetRegS(cf);
  const Register rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_sext32(); imm != 0)
  {
    if (!overflow)
    {
      armAsm->add(rt, rs, armCheckAddSubConstant(imm));
    }
    else
    {
      armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
      TestOverflow(rt);
    }
  }
  else if (rt.GetCode() != rs.GetCode())
  {
    armAsm->mov(rt, rs);
  }
}

void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf)
{
  Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::ARM64Recompiler::Compile_addiu(CompileFlags cf)
{
  Compile_addi(cf, false);
}

void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf)
{
  Compile_slti(cf, true);
}

void CPU::ARM64Recompiler::Compile_sltiu(CompileFlags cf)
{
  Compile_slti(cf, false);
}

void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf, bool sign)
{
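  // slti/sltiu: compare rs against the sign-extended immediate and set rt to
  // 0/1 via cset, using the signed (lt) or unsigned (lo) condition.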
  armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
  armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
}

void CPU::ARM64Recompiler::Compile_andi(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
  else
    armAsm->mov(rt, wzr);
}

void CPU::ARM64Recompiler::Compile_ori(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  const Register rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
  else if (rt.GetCode() != rs.GetCode())
    armAsm->mov(rt, rs);
}

void CPU::ARM64Recompiler::Compile_xori(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  const Register rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
  else if (rt.GetCode() != rs.GetCode())
    armAsm->mov(rt, rs);
}

void CPU::ARM64Recompiler::Compile_shift(CompileFlags cf,
                                         void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
                                                                              const vixl::aarch64::Register&, unsigned))
{
  const Register rd = CFGetRegD(cf);
  const Register rt = CFGetRegT(cf);
  if (inst->r.shamt > 0)
    (armAsm->*op)(rd, rt, inst->r.shamt);
  else if (rd.GetCode() != rt.GetCode())
    armAsm->mov(rd, rt);
}

void CPU::ARM64Recompiler::Compile_sll(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::lsl);
}

void CPU::ARM64Recompiler::Compile_srl(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::lsr);
}

void CPU::ARM64Recompiler::Compile_sra(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::asr);
}

void CPU::ARM64Recompiler::Compile_variable_shift(
  CompileFlags cf,
  void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
                                       const vixl::aarch64::Register&),
  void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
{
  const Register rd = CFGetRegD(cf);

  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  if (cf.const_s)
  {
    if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
      (armAsm->*op_const)(rd, rt, shift);
    else if (rd.GetCode() != rt.GetCode())
      armAsm->mov(rd, rt);
  }
  else
  {
    (armAsm->*op)(rd, rt, CFGetRegS(cf));
  }
}

void CPU::ARM64Recompiler::Compile_sllv(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
}

void CPU::ARM64Recompiler::Compile_srlv(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
}

void CPU::ARM64Recompiler::Compile_srav(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
}

void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf, bool sign)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  // TODO: if lo/hi gets killed, we can use a 32-bit multiply
  const Register lo = CFGetRegLO(cf);
  const Register hi = CFGetRegHI(cf);

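  // MIPS mult/multu produces a 64-bit product split across HI/LO. A single
  // smull/umull into the 64-bit view of LO followed by a 32-bit right shift
  // into HI covers both halves; the 32-bit W views of lo/hi then hold the
  // architectural results.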
  (sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
  armAsm->lsr(hi.X(), lo.X(), 32);
}

void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf)
{
  Compile_mult(cf, true);
}

void CPU::ARM64Recompiler::Compile_multu(CompileFlags cf)
{
  Compile_mult(cf, false);
}

void CPU::ARM64Recompiler::Compile_div(CompileFlags cf)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Register rlo = CFGetRegLO(cf);
  const Register rhi = CFGetRegHI(cf);

  // TODO: This could be slightly more optimal
  Label done;
  Label not_divide_by_zero;
  armAsm->cbnz(rt, &not_divide_by_zero);
  armAsm->mov(rhi, rs); // hi = num
  EmitMov(rlo, 1);
  EmitMov(RWSCRATCH, static_cast<u32>(-1));
  armAsm->cmp(rs, 0);
  armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
  armAsm->b(&done);

  armAsm->bind(&not_divide_by_zero);
  Label not_unrepresentable;
  armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
  armAsm->b(&not_unrepresentable, ne);
  armAsm->cmp(rt, armCheckCompareConstant(-1));
  armAsm->b(&not_unrepresentable, ne);

  EmitMov(rlo, 0x80000000u);
  EmitMov(rhi, 0);
  armAsm->b(&done);

  armAsm->bind(&not_unrepresentable);

  armAsm->sdiv(rlo, rs, rt);

  // TODO: skip when hi is dead
  armAsm->msub(rhi, rlo, rt, rs);

  armAsm->bind(&done);
}

void CPU::ARM64Recompiler::Compile_divu(CompileFlags cf)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Register rlo = CFGetRegLO(cf);
  const Register rhi = CFGetRegHI(cf);

  Label done;
  Label not_divide_by_zero;
  armAsm->cbnz(rt, &not_divide_by_zero);
  EmitMov(rlo, static_cast<u32>(-1));
  armAsm->mov(rhi, rs);
  armAsm->b(&done);

  armAsm->bind(&not_divide_by_zero);

  armAsm->udiv(rlo, rs, rt);

  // TODO: skip when hi is dead
  armAsm->msub(rhi, rlo, rt, rs);

  armAsm->bind(&done);
}

void CPU::ARM64Recompiler::TestOverflow(const vixl::aarch64::Register& result)
{
  DebugAssert(result.IsW());
  SwitchToFarCode(true, vs);

  BackupHostState();

  // toss the result
  ClearHostReg(result.GetCode());

  EndBlockWithException(Exception::Ov);

  RestoreHostState();

  SwitchToNearCode(false);
}

void CPU::ARM64Recompiler::Compile_dst_op(CompileFlags cf,
                                          void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
                                                                               const vixl::aarch64::Register&,
                                                                               const vixl::aarch64::Operand&),
                                          bool commutative, bool logical, bool overflow)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rd = CFGetRegD(cf);
  if (cf.valid_host_s && cf.valid_host_t)
  {
    (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
  }
  else if (commutative && (cf.const_s || cf.const_t))
  {
    const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != src.GetCode())
        armAsm->mov(rd, src);
      overflow = false;
    }
  }
  else if (cf.const_s)
  {
    // TODO: Check where we can use wzr here
    EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
    (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    const Register rs = CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != rs.GetCode())
        armAsm->mov(rd, rs);
      overflow = false;
    }
  }

  if (overflow)
    TestOverflow(rd);
}

void CPU::ARM64Recompiler::Compile_add(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::adds, true, false, true);
  else
    Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_sub(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::subs, false, false, true);
  else
    Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_and(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // special cases - and with self -> self, and with 0 -> 0
  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    armAsm->mov(regd, CFGetRegS(cf));
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    armAsm->mov(regd, wzr);
    return;
  }

  Compile_dst_op(cf, &Assembler::and_, true, true, false);
}

void CPU::ARM64Recompiler::Compile_or(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // or/nor with 0 -> no effect
  const Register regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::orr, true, true, false);
}

void CPU::ARM64Recompiler::Compile_xor(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    armAsm->mov(regd, wzr);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::eor, true, true, false);
}

void CPU::ARM64Recompiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::ARM64Recompiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf, bool sign)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // TODO: swap and reverse op for constants
  if (cf.const_s)
  {
    EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
    armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
  }
  else
  {
    armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
  }

  armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
}

vixl::aarch64::Register
CPU::ARM64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
                                                 const std::optional<const vixl::aarch64::Register>& reg)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const Register dst = reg.has_value() ? reg.value() : RWARG1;
  if (address.has_value())
  {
    EmitMov(dst, address.value());
  }
  else if (imm == 0)
  {
    if (cf.valid_host_s)
    {
      if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
        armAsm->mov(dst, CFGetRegS(cf));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
    }
  }
  else
  {
    if (cf.valid_host_s)
    {
      armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
      armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
  }

  return dst;
}

template<typename RegAllocFn>
vixl::aarch64::Register CPU::ARM64Recompiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,
                                                           MemoryAccessSize size, bool sign, bool use_fastmem,
                                                           const RegAllocFn& dst_reg_alloc)
{
  DebugAssert(addr_reg.IsW());
  if (use_fastmem)
  {
    m_cycles += Bus::RAM_READ_TICKS;

    const Register dst = dst_reg_alloc();

    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
        break;

      case MemoryAccessSize::HalfWord:
        sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->ldr(dst, mem);
        break;
    }

    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
    return dst;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);

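  // Slow path: call into the memory thunks with the address in w0. With memory
  // exceptions enabled the checked variants are used, and a 64-bit return
  // value with bit 63 set signals a failed access; the far-code block below
  // converts the negated value into the CAUSE exception bits and raises the
  // exception.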
  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfBitSet(RXRET, 63);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->neg(temp.X(), RXRET);
    armAsm->lsl(temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const Register dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      if (dst_reg.GetCode() != RWRET.GetCode())
        armAsm->mov(dst_reg, RWRET);
    }
    break;
  }

  return dst_reg;
}

void CPU::ARM64Recompiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
                                         const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
                                         bool use_fastmem)
{
  DebugAssert(addr_reg.IsW() && value_reg.IsW());
  if (use_fastmem)
  {
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        armAsm->strb(value_reg, mem);
        break;

      case MemoryAccessSize::HalfWord:
        armAsm->strh(value_reg, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->str(value_reg, mem);
        break;
    }
    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
    return;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);
  if (value_reg.GetCode() != RWARG2.GetCode())
    armAsm->mov(RWARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryByte) :1999reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte));2000}2001break;2002case MemoryAccessSize::HalfWord:2003{2004EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryHalfWord) :2005reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord));2006}2007break;2008case MemoryAccessSize::Word:2009{2010EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryWord) :2011reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord));2012}2013break;2014}20152016// TODO: turn this into an asm function instead2017if (checked)2018{2019SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);2020BackupHostState();20212022// Need to stash this in a temp because of the flush.2023const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));2024armAsm->lsl(temp, RWRET, 2);20252026Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);20272028// cause_bits = (result << 2) | BD | cop_n2029armAsm->orr(RWARG1, temp,2030armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(2031static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));2032EmitMov(RWARG2, m_current_instruction_pc);2033EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));2034FreeHostReg(temp.GetCode());2035EndBlock(std::nullopt, true);20362037RestoreHostState();2038SwitchToNearCode(false);2039}2040}20412042void CPU::ARM64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2043const std::optional<VirtualMemoryAddress>& address)2044{2045const std::optional<WRegister> addr_reg =2046g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :2047std::optional<WRegister>();2048FlushForLoadStore(address, false, use_fastmem);2049const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);2050const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {2051if (cf.MipsT() == Reg::zero)2052return RWRET;20532054return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),2055EMULATE_LOAD_DELAYS ? 
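// With load delay emulation enabled, the loaded value is tracked as the next load-delay value
// rather than being committed straight to the guest register.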
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,2056cf.MipsT()));2057});20582059if (g_settings.gpu_pgxp_enable)2060{2061Flush(FLUSH_FOR_C_CALL);20622063EmitMov(RWARG1, inst->bits);2064armAsm->mov(RWARG2, addr);2065armAsm->mov(RWARG3, data);2066EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);2067FreeHostReg(addr_reg.value().GetCode());2068}2069}20702071void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2072const std::optional<VirtualMemoryAddress>& address)2073{2074DebugAssert(size == MemoryAccessSize::Word && !sign);20752076const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));2077FlushForLoadStore(address, false, use_fastmem);20782079// TODO: if address is constant, this can be simplified..20802081// If we're coming from another block, just flush the load delay and hope for the best..2082if (m_load_delay_dirty)2083UpdateLoadDelay();20842085// We'd need to be careful here if we weren't overwriting it..2086ComputeLoadStoreAddressArg(cf, address, addr);2087armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));2088GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });20892090if (inst->r.rt == Reg::zero)2091{2092FreeHostReg(addr.GetCode());2093return;2094}20952096// lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is2097// never written back. NOTE: can't trust T in cf because of the flush2098const Reg rt = inst->r.rt;2099Register value;2100if (m_load_delay_register == rt)2101{2102const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?2103AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :2104m_load_delay_value_register;2105RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);2106value = WRegister(existing_ld_rt);2107}2108else2109{2110if constexpr (EMULATE_LOAD_DELAYS)2111{2112value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));2113if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())2114armAsm->mov(value, WRegister(rtreg.value()));2115else if (HasConstantReg(rt))2116EmitMov(value, GetConstantRegU32(rt));2117else2118armAsm->ldr(value, MipsPtr(rt));2119}2120else2121{2122value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));2123}2124}21252126DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());2127armAsm->and_(RWARG2, addr, 3);2128armAsm->lsl(RWARG2, RWARG2, 3); // *82129EmitMov(RWARG3, 24);2130armAsm->sub(RWARG3, RWARG3, RWARG2);21312132if (inst->op == InstructionOp::lwl)2133{2134// const u32 mask = UINT32_C(0x00FFFFFF) >> shift;2135// new_value = (value & mask) | (RWRET << (24 - shift));2136EmitMov(RWSCRATCH, 0xFFFFFFu);2137armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);2138armAsm->and_(value, value, RWSCRATCH);2139armAsm->lslv(RWRET, RWRET, RWARG3);2140armAsm->orr(value, value, RWRET);2141}2142else2143{2144// const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);2145// new_value = (value & mask) | (RWRET >> shift);2146armAsm->lsrv(RWRET, RWRET, RWARG2);2147EmitMov(RWSCRATCH, 0xFFFFFF00u);2148armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);2149armAsm->and_(value, value, RWSCRATCH);2150armAsm->orr(value, value, RWRET);2151}21522153FreeHostReg(addr.GetCode());21542155if (g_settings.gpu_pgxp_enable)2156{2157Flush(FLUSH_FOR_C_CALL);2158armAsm->mov(RWARG3, value);2159armAsm->and_(RWARG2, addr, 
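// PGXP is notified with the word-aligned address in RWARG2; the merged lwl/lwr result goes in RWARG3.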
armCheckLogicalConstant(~0x3u));2160EmitMov(RWARG1, inst->bits);2161EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));2162}2163}21642165void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2166const std::optional<VirtualMemoryAddress>& address)2167{2168const u32 index = static_cast<u32>(inst->r.rt.GetValue());2169const auto [ptr, action] = GetGTERegisterPointer(index, true);2170const std::optional<WRegister> addr_reg =2171g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :2172std::optional<WRegister>();2173FlushForLoadStore(address, false, use_fastmem);2174const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);2175const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {2176return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?2177WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :2178RWRET;2179});21802181switch (action)2182{2183case GTERegisterAccessAction::Ignore:2184{2185break;2186}21872188case GTERegisterAccessAction::Direct:2189{2190armAsm->str(value, PTR(ptr));2191break;2192}21932194case GTERegisterAccessAction::SignExtend16:2195{2196armAsm->sxth(RWARG3, value);2197armAsm->str(RWARG3, PTR(ptr));2198break;2199}22002201case GTERegisterAccessAction::ZeroExtend16:2202{2203armAsm->uxth(RWARG3, value);2204armAsm->str(RWARG3, PTR(ptr));2205break;2206}22072208case GTERegisterAccessAction::CallHandler:2209{2210Flush(FLUSH_FOR_C_CALL);2211armAsm->mov(RWARG2, value);2212EmitMov(RWARG1, index);2213EmitCall(reinterpret_cast<const void*>(>E::WriteRegister));2214break;2215}22162217case GTERegisterAccessAction::PushFIFO:2218{2219// SXY0 <- SXY12220// SXY1 <- SXY22221// SXY2 <- SXYP2222DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());2223armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));2224armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));2225armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));2226armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));2227armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));2228break;2229}22302231default:2232{2233Panic("Unknown action");2234return;2235}2236}22372238if (g_settings.gpu_pgxp_enable)2239{2240Flush(FLUSH_FOR_C_CALL);2241armAsm->mov(RWARG3, value);2242if (value.GetCode() != RWRET.GetCode())2243FreeHostReg(value.GetCode());2244armAsm->mov(RWARG2, addr);2245FreeHostReg(addr_reg.value().GetCode());2246EmitMov(RWARG1, inst->bits);2247EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));2248}2249}22502251void CPU::ARM64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2252const std::optional<VirtualMemoryAddress>& address)2253{2254AssertRegOrConstS(cf);2255AssertRegOrConstT(cf);22562257const std::optional<WRegister> addr_reg =2258g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :2259std::optional<WRegister>();2260FlushForLoadStore(address, true, use_fastmem);2261const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);2262const Register data = cf.valid_host_t ? 
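// Store data comes from the already-allocated host register for rt when it is valid,
// otherwise it is materialised into RWARG2 just below.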
CFGetRegT(cf) : RWARG2;2263if (!cf.valid_host_t)2264MoveTToReg(RWARG2, cf);22652266GenerateStore(addr, data, size, use_fastmem);22672268if (g_settings.gpu_pgxp_enable)2269{2270Flush(FLUSH_FOR_C_CALL);2271MoveMIPSRegToReg(RWARG3, cf.MipsT());2272armAsm->mov(RWARG2, addr);2273EmitMov(RWARG1, inst->bits);2274EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);2275FreeHostReg(addr_reg.value().GetCode());2276}2277}22782279void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2280const std::optional<VirtualMemoryAddress>& address)2281{2282DebugAssert(size == MemoryAccessSize::Word && !sign);22832284// TODO: this can take over rt's value if it's no longer needed2285// NOTE: can't trust T in cf because of the alloc2286const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));2287const Register value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;2288if (g_settings.gpu_pgxp_enable)2289MoveMIPSRegToReg(value, inst->r.rt);22902291FlushForLoadStore(address, true, use_fastmem);22922293// TODO: if address is constant, this can be simplified..2294// We'd need to be careful here if we weren't overwriting it..2295ComputeLoadStoreAddressArg(cf, address, addr);2296armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));2297GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });22982299armAsm->and_(RWSCRATCH, addr, 3);2300armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *82301armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));23022303// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.2304if (!g_settings.gpu_pgxp_enable)2305MoveMIPSRegToReg(value, inst->r.rt);23062307if (inst->op == InstructionOp::swl)2308{2309// const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;2310// new_value = (RWRET & mem_mask) | (value >> (24 - shift));2311EmitMov(RWARG3, 0xFFFFFF00u);2312armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);2313armAsm->and_(RWRET, RWRET, RWARG3);23142315EmitMov(RWARG3, 24);2316armAsm->sub(RWARG3, RWARG3, RWSCRATCH);2317armAsm->lsrv(value, value, RWARG3);2318armAsm->orr(value, value, RWRET);2319}2320else2321{2322// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);2323// new_value = (RWRET & mem_mask) | (value << shift);2324armAsm->lslv(value, value, RWSCRATCH);23252326EmitMov(RWARG3, 24);2327armAsm->sub(RWARG3, RWARG3, RWSCRATCH);2328EmitMov(RWSCRATCH, 0x00FFFFFFu);2329armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);2330armAsm->and_(RWRET, RWRET, RWSCRATCH);2331armAsm->orr(value, value, RWRET);2332}23332334if (!g_settings.gpu_pgxp_enable)2335{2336GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);2337FreeHostReg(addr.GetCode());2338}2339else2340{2341GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);23422343Flush(FLUSH_FOR_C_CALL);2344armAsm->mov(RWARG3, value);2345FreeHostReg(value.GetCode());2346armAsm->mov(RWARG2, addr);2347FreeHostReg(addr.GetCode());2348EmitMov(RWARG1, inst->bits);2349EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));2350}2351}23522353void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2354const std::optional<VirtualMemoryAddress>& address)2355{2356const u32 index = static_cast<u32>(inst->r.rt.GetValue());2357const auto [ptr, action] = GetGTERegisterPointer(index, false);2358const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) 
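// The address has to survive the GTE::ReadRegister and/or PGXP calls,
// so keep it in a callee-saved temporary in those cases.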
?2359WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :2360RWARG1;2361const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;2362FlushForLoadStore(address, true, use_fastmem);2363ComputeLoadStoreAddressArg(cf, address, addr);23642365switch (action)2366{2367case GTERegisterAccessAction::Direct:2368{2369armAsm->ldr(data, PTR(ptr));2370}2371break;23722373case GTERegisterAccessAction::CallHandler:2374{2375// should already be flushed.. except in fastmem case2376Flush(FLUSH_FOR_C_CALL);2377EmitMov(RWARG1, index);2378EmitCall(reinterpret_cast<const void*>(>E::ReadRegister));2379armAsm->mov(data, RWRET);2380}2381break;23822383default:2384{2385Panic("Unknown action");2386}2387break;2388}23892390GenerateStore(addr, data, size, use_fastmem);2391if (!g_settings.gpu_pgxp_enable)2392{2393if (addr.GetCode() != RWARG1.GetCode())2394FreeHostReg(addr.GetCode());2395}2396else2397{2398// TODO: This can be simplified because we don't need to validate in PGXP..2399Flush(FLUSH_FOR_C_CALL);2400armAsm->mov(RWARG3, data);2401FreeHostReg(data.GetCode());2402armAsm->mov(RWARG2, addr);2403FreeHostReg(addr.GetCode());2404EmitMov(RWARG1, inst->bits);2405EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));2406}2407}24082409void CPU::ARM64Recompiler::Compile_mtc0(CompileFlags cf)2410{2411// TODO: we need better constant setting here.. which will need backprop2412AssertRegOrConstT(cf);24132414const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());2415const u32* ptr = GetCop0RegPtr(reg);2416const u32 mask = GetCop0RegWriteMask(reg);2417if (!ptr)2418{2419Compile_Fallback();2420return;2421}24222423if (mask == 0)2424{2425// if it's a read-only register, ignore2426DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));2427return;2428}24292430// for some registers, we need to test certain bits2431const bool needs_bit_test = (reg == Cop0Reg::SR);2432const Register new_value = RWARG1;2433const Register old_value = RWARG2;2434const Register changed_bits = RWARG3;2435const Register mask_reg = RWSCRATCH;24362437// Load old value2438armAsm->ldr(old_value, PTR(ptr));24392440// No way we fit this in an immediate..2441EmitMov(mask_reg, mask);24422443// update value2444if (cf.valid_host_t)2445armAsm->and_(new_value, CFGetRegT(cf), mask_reg);2446else2447EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);24482449if (needs_bit_test)2450armAsm->eor(changed_bits, old_value, new_value);2451armAsm->bic(old_value, old_value, mask_reg);2452armAsm->orr(new_value, old_value, new_value);2453armAsm->str(new_value, PTR(ptr));24542455if (reg == Cop0Reg::SR)2456{2457// TODO: replace with register backup2458// We could just inline the whole thing..2459Flush(FLUSH_FOR_C_CALL);24602461Label caches_unchanged;2462armAsm->tbz(changed_bits, 16, &caches_unchanged);2463EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));2464armAsm->ldr(RWARG1, PTR(ptr)); // reload value for interrupt test below2465if (CodeCache::IsUsingFastmem())2466armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));2467armAsm->bind(&caches_unchanged);24682469TestInterrupts(RWARG1);2470}2471else if (reg == Cop0Reg::CAUSE)2472{2473armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));2474TestInterrupts(RWARG1);2475}2476else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)2477{2478// need to check whether we're switching to debug mode2479Flush(FLUSH_FOR_C_CALL);2480EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));2481SwitchToFarCodeIfRegZeroOrNonZero(RWRET, 
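// A non-zero return from UpdateDebugDispatcherFlag() means the dispatcher needs to change,
// so the far path leaves the recompiled code entirely via ExitExecution().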
true);2482BackupHostState();2483Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);2484EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return2485RestoreHostState();2486SwitchToNearCode(false);2487}2488}24892490void CPU::ARM64Recompiler::Compile_rfe(CompileFlags cf)2491{2492// shift mode bits right two, preserving upper bits2493armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));2494armAsm->bfxil(RWARG1, RWARG1, 2, 4);2495armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));24962497TestInterrupts(RWARG1);2498}24992500void CPU::ARM64Recompiler::TestInterrupts(const vixl::aarch64::Register& sr)2501{2502DebugAssert(sr.IsW());25032504// if Iec == 0 then goto no_interrupt2505Label no_interrupt;2506armAsm->tbz(sr, 0, &no_interrupt);25072508// sr & cause2509armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));2510armAsm->and_(sr, sr, RWSCRATCH);25112512// ((sr & cause) & 0xff00) == 0 goto no_interrupt2513armAsm->tst(sr, 0xFF00);25142515SwitchToFarCode(true, ne);2516BackupHostState();25172518// Update load delay, this normally happens at the end of an instruction, but we're finishing it early.2519UpdateLoadDelay();25202521Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);25222523// Can't use EndBlockWithException() here, because it'll use the wrong PC.2524// Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.2525if (!iinfo->is_last_instruction)2526{2527EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,2528(inst + 1)->cop.cop_n));2529EmitMov(RWARG2, m_compiler_pc);2530EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));2531m_dirty_pc = false;2532EndAndLinkBlock(std::nullopt, true, false);2533}2534else2535{2536if (m_dirty_pc)2537EmitMov(RWARG1, m_compiler_pc);2538armAsm->str(wzr, PTR(&g_state.downcount));2539if (m_dirty_pc)2540armAsm->str(RWARG1, PTR(&g_state.pc));2541m_dirty_pc = false;2542EndAndLinkBlock(std::nullopt, false, true);2543}25442545RestoreHostState();2546SwitchToNearCode(false);25472548armAsm->bind(&no_interrupt);2549}25502551void CPU::ARM64Recompiler::Compile_mfc2(CompileFlags cf)2552{2553const u32 index = inst->cop.Cop2Index();2554const Reg rt = inst->r.rt;25552556const auto [ptr, action] = GetGTERegisterPointer(index, false);2557if (action == GTERegisterAccessAction::Ignore)2558return;25592560u32 hreg;2561if (action == GTERegisterAccessAction::Direct)2562{2563hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),2564EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);2565armAsm->ldr(WRegister(hreg), PTR(ptr));2566}2567else if (action == GTERegisterAccessAction::CallHandler)2568{2569Flush(FLUSH_FOR_C_CALL);2570EmitMov(RWARG1, index);2571EmitCall(reinterpret_cast<const void*>(>E::ReadRegister));25722573hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),2574EMULATE_LOAD_DELAYS ? 
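// Like an ordinary memory load, the GTE read result is routed through the load delay slot
// when load delay emulation is enabled.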
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);2575armAsm->mov(WRegister(hreg), RWRET);2576}2577else2578{2579Panic("Unknown action");2580return;2581}25822583if (g_settings.gpu_pgxp_enable)2584{2585Flush(FLUSH_FOR_C_CALL);2586EmitMov(RWARG1, inst->bits);2587armAsm->mov(RWARG2, WRegister(hreg));2588EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));2589}2590}25912592void CPU::ARM64Recompiler::Compile_mtc2(CompileFlags cf)2593{2594const u32 index = inst->cop.Cop2Index();2595const auto [ptr, action] = GetGTERegisterPointer(index, true);2596if (action == GTERegisterAccessAction::Ignore)2597return;25982599if (action == GTERegisterAccessAction::Direct)2600{2601if (cf.const_t)2602StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);2603else2604armAsm->str(CFGetRegT(cf), PTR(ptr));2605}2606else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)2607{2608const bool sign = (action == GTERegisterAccessAction::SignExtend16);2609if (cf.valid_host_t)2610{2611sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));2612armAsm->str(RWARG1, PTR(ptr));2613}2614else if (cf.const_t)2615{2616const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));2617StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);2618}2619else2620{2621Panic("Unsupported setup");2622}2623}2624else if (action == GTERegisterAccessAction::CallHandler)2625{2626Flush(FLUSH_FOR_C_CALL);2627EmitMov(RWARG1, index);2628MoveTToReg(RWARG2, cf);2629EmitCall(reinterpret_cast<const void*>(>E::WriteRegister));2630}2631else if (action == GTERegisterAccessAction::PushFIFO)2632{2633// SXY0 <- SXY12634// SXY1 <- SXY22635// SXY2 <- SXYP2636DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());2637armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));2638armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));2639armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));2640armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));2641if (cf.valid_host_t)2642armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));2643else if (cf.const_t)2644StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);2645else2646Panic("Unsupported setup");2647}2648else2649{2650Panic("Unknown action");2651}2652}26532654void CPU::ARM64Recompiler::Compile_cop2(CompileFlags cf)2655{2656TickCount func_ticks;2657GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);26582659Flush(FLUSH_FOR_C_CALL);2660EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);2661EmitCall(reinterpret_cast<const void*>(func));26622663AddGTETicks(func_ticks);2664}26652666u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,2667TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,2668u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,2669bool is_load)2670{2671Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);2672Assembler* armAsm = &arm_asm;26732674#ifdef VIXL_DEBUG2675vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);2676#endif26772678static constexpr u32 GPR_SIZE = 8;26792680// save regs2681u32 num_gprs = 0;26822683for (u32 i = 0; i < NUM_HOST_REGS; i++)2684{2685if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))2686num_gprs++;2687}26882689const u32 stack_size = (((num_gprs + 1) & ~1u) * 
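// Round the saved-register count up to an even number so the stack stays 16-byte aligned.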
GPR_SIZE);26902691// TODO: use stp+ldp, vixl helper?26922693if (stack_size > 0)2694{2695armAsm->sub(sp, sp, stack_size);26962697u32 stack_offset = 0;2698for (u32 i = 0; i < NUM_HOST_REGS; i++)2699{2700if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))2701{2702armAsm->str(XRegister(i), MemOperand(sp, stack_offset));2703stack_offset += GPR_SIZE;2704}2705}2706}27072708if (cycles_to_add != 0)2709{2710// NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles2711Assert(Assembler::IsImmAddSub(cycles_to_add));2712armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));2713armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);2714armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));2715}27162717if (address_register != static_cast<u8>(RWARG1.GetCode()))2718armAsm->mov(RWARG1, WRegister(address_register));27192720if (!is_load)2721{2722if (data_register != static_cast<u8>(RWARG2.GetCode()))2723armAsm->mov(RWARG2, WRegister(data_register));2724}27252726switch (size)2727{2728case MemoryAccessSize::Byte:2729{2730armEmitCall(armAsm,2731is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte) :2732reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte),2733false);2734}2735break;2736case MemoryAccessSize::HalfWord:2737{2738armEmitCall(armAsm,2739is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord) :2740reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord),2741false);2742}2743break;2744case MemoryAccessSize::Word:2745{2746armEmitCall(armAsm,2747is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord) :2748reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord),2749false);2750}2751break;2752}27532754if (is_load)2755{2756const WRegister dst = WRegister(data_register);2757switch (size)2758{2759case MemoryAccessSize::Byte:2760{2761is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);2762}2763break;2764case MemoryAccessSize::HalfWord:2765{2766is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);2767}2768break;2769case MemoryAccessSize::Word:2770{2771if (dst.GetCode() != RWRET.GetCode())2772armAsm->mov(dst, RWRET);2773}2774break;2775}2776}27772778if (cycles_to_remove != 0)2779{2780Assert(Assembler::IsImmAddSub(cycles_to_remove));2781armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));2782armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);2783armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));2784}27852786// restore regs2787if (stack_size > 0)2788{2789u32 stack_offset = 0;2790for (u32 i = 0; i < NUM_HOST_REGS; i++)2791{2792if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))2793{2794armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));2795stack_offset += GPR_SIZE;2796}2797}27982799armAsm->add(sp, sp, stack_size);2800}28012802armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);2803armAsm->FinalizeCode();28042805return static_cast<u32>(armAsm->GetCursorOffset());2806}28072808#endif // CPU_ARCH_ARM64280928102811