Path: blob/master/src/core/cpu_recompiler_arm64.cpp
7429 views
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>1// SPDX-License-Identifier: CC-BY-NC-ND-4.023#include "cpu_recompiler_arm64.h"4#include "cpu_core_private.h"5#include "cpu_pgxp.h"6#include "gte.h"7#include "settings.h"8#include "timing_event.h"910#include "common/align.h"11#include "common/assert.h"12#include "common/log.h"13#include "common/memmap.h"14#include "common/string_util.h"1516#include <limits>1718#ifdef CPU_ARCH_ARM641920#include "vixl/aarch64/constants-aarch64.h"2122#ifdef ENABLE_HOST_DISASSEMBLY23#include "vixl/aarch64/disasm-aarch64.h"24#endif2526LOG_CHANNEL(Recompiler);2728#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))2930#define RWRET vixl::aarch64::w031#define RXRET vixl::aarch64::x032#define RWARG1 vixl::aarch64::w033#define RXARG1 vixl::aarch64::x034#define RWARG2 vixl::aarch64::w135#define RXARG2 vixl::aarch64::x136#define RWARG3 vixl::aarch64::w237#define RXARG3 vixl::aarch64::x238#define RWSCRATCH vixl::aarch64::w1639#define RXSCRATCH vixl::aarch64::x1640#define RSTATE vixl::aarch64::x1941#define RMEMBASE vixl::aarch64::x204243static bool armIsCallerSavedRegister(u32 id);44static s64 armGetPCDisplacement(const void* current, const void* target);45static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);46static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);47static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);48static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);49static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);50static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);51static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,52bool sign_extend_word = false);53static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,54const vixl::aarch64::Register& tempreg = RXSCRATCH);55static u8* armGetJumpTrampoline(const void* target);56static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);5758static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;59static std::unordered_map<const void*, u32> s_trampoline_targets;60static u8* s_trampoline_start_ptr = nullptr;61static u32 s_trampoline_used = 0;6263namespace CPU {6465using namespace vixl::aarch64;6667static ARM64Recompiler s_instance;68Recompiler* g_compiler = &s_instance;6970} // namespace CPU7172bool armIsCallerSavedRegister(u32 id)73{74// same on both linux and windows75return (id <= 18);76}7778void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm)79{80// From vixl macro assembler.81DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());82DebugAssert(rd.GetCode() != vixl::aarch64::sp.GetCode());8384if (imm == 0)85{86armAsm->mov(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd));87return;88}8990// The worst case for size is mov 64-bit immediate to sp:91// * up to 4 instructions to materialise the constant92// * 1 instruction to move to sp9394// Immediates on Aarch64 can be produced using an initial value, and zero to95// three move keep operations.96//97// Initial values can be generated with:98// 1. 64-bit move zero (movz).99// 2. 32-bit move inverted (movn).100// 3. 64-bit move inverted.101// 4. 
32-bit orr immediate.102// 5. 64-bit orr immediate.103// Move-keep may then be used to modify each of the 16-bit half words.104//105// The code below supports all five initial value generators, and106// applying move-keep operations to move-zero and move-inverted initial107// values.108109// Try to move the immediate in one instruction, and if that fails, switch to110// using multiple instructions.111const unsigned reg_size = rd.GetSizeInBits();112113if (vixl::aarch64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())114{115// Immediate can be represented in a move zero instruction. Movz can't write116// to the stack pointer.117armAsm->movz(rd, imm);118return;119}120else if (vixl::aarch64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())121{122// Immediate can be represented in a move negative instruction. Movn can't123// write to the stack pointer.124armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & vixl::aarch64::kWRegMask));125return;126}127else if (vixl::aarch64::Assembler::IsImmLogical(imm, reg_size))128{129// Immediate can be represented in a logical orr instruction.130DebugAssert(!rd.IsZero());131armAsm->orr(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd), imm);132return;133}134135// Generic immediate case. Imm will be represented by136// [imm3, imm2, imm1, imm0], where each imm is 16 bits.137// A move-zero or move-inverted is generated for the first non-zero or138// non-0xffff immX, and a move-keep for subsequent non-zero immX.139140uint64_t ignored_halfword = 0;141bool invert_move = false;142// If the number of 0xffff halfwords is greater than the number of 0x0000143// halfwords, it's more efficient to use move-inverted.144if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))145{146ignored_halfword = 0xffff;147invert_move = true;148}149150// Iterate through the halfwords. 
Use movn/movz for the first non-ignored151// halfword, and movk for subsequent halfwords.152DebugAssert((reg_size % 16) == 0);153bool first_mov_done = false;154for (unsigned i = 0; i < (reg_size / 16); i++)155{156uint64_t imm16 = (imm >> (16 * i)) & 0xffff;157if (imm16 != ignored_halfword)158{159if (!first_mov_done)160{161if (invert_move)162armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);163else164armAsm->movz(rd, imm16, 16 * i);165first_mov_done = true;166}167else168{169// Construct a wider constant.170armAsm->movk(rd, imm16, 16 * i);171}172}173}174175DebugAssert(first_mov_done);176}177178s64 armGetPCDisplacement(const void* current, const void* target)179{180// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));181// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));182return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);183}184185bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr)186{187const void* cur = armAsm->GetCursorAddress<const void*>();188const void* current_code_ptr_page =189reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));190const void* ptr_page =191reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));192const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;193const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);194195return (vixl::IsInt21(page_displacement) && (vixl::aarch64::Assembler::IsImmAddSub(page_offset) ||196vixl::aarch64::Assembler::IsImmLogical(page_offset, 64)));197}198199void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr)200{201DebugAssert(reg.IsX());202203const void* cur = armAsm->GetCursorAddress<const void*>();204const void* current_code_ptr_page =205reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));206const void* ptr_page =207reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));208const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;209const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);210if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmAddSub(page_offset))211{212armAsm->adrp(reg, page_displacement);213armAsm->add(reg, reg, page_offset);214}215else if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmLogical(page_offset, 64))216{217armAsm->adrp(reg, page_displacement);218armAsm->orr(reg, reg, page_offset);219}220else221{222armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));223}224}225226void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)227{228const void* cur = armAsm->GetCursorAddress<const void*>();229s64 displacement = armGetPCDisplacement(cur, ptr);230bool use_blr = !vixl::IsInt26(displacement);231bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);232if (use_blr && use_trampoline && !force_inline)233{234if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)235{236displacement = armGetPCDisplacement(cur, trampoline);237use_blr = !vixl::IsInt26(displacement);238}239}240241if (use_blr)242{243armMoveAddressToReg(armAsm, RXSCRATCH, ptr);244armAsm->br(RXSCRATCH);245}246else247{248armAsm->b(displacement);249}250}251252void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* 
ptr, bool force_inline)253{254const void* cur = armAsm->GetCursorAddress<const void*>();255s64 displacement = armGetPCDisplacement(cur, ptr);256bool use_blr = !vixl::IsInt26(displacement);257bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);258if (use_blr && use_trampoline && !force_inline)259{260if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)261{262displacement = armGetPCDisplacement(cur, trampoline);263use_blr = !vixl::IsInt26(displacement);264}265}266267if (use_blr)268{269armMoveAddressToReg(armAsm, RXSCRATCH, ptr);270armAsm->blr(RXSCRATCH);271}272else273{274armAsm->bl(displacement);275}276}277278void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr)279{280const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -281reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));282// pxAssert(Common::IsAligned(jump_distance, 4));283284if (vixl::aarch64::Instruction::IsValidImmPCOffset(vixl::aarch64::CondBranchType, jump_distance >> 2))285{286armAsm->b(jump_distance >> 2, cond);287}288else289{290vixl::aarch64::Label branch_not_taken;291armAsm->b(&branch_not_taken, InvertCondition(cond));292293const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -294reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));295armAsm->b(new_jump_distance >> 2);296armAsm->bind(&branch_not_taken);297}298}299300void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,301bool sign_extend_word)302{303const void* cur = armAsm->GetCursorAddress<const void*>();304const void* current_code_ptr_page =305reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));306const void* ptr_page =307reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));308const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;309const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);310vixl::aarch64::MemOperand memop;311312const vixl::aarch64::Register xreg = reg.X();313if (vixl::IsInt21(page_displacement))314{315armAsm->adrp(xreg, page_displacement);316memop = vixl::aarch64::MemOperand(xreg, static_cast<int64_t>(page_offset));317}318else319{320armMoveAddressToReg(armAsm, xreg, addr);321memop = vixl::aarch64::MemOperand(xreg);322}323324if (sign_extend_word)325armAsm->ldrsw(reg, memop);326else327armAsm->ldr(reg, memop);328}329330[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,331const void* addr, const vixl::aarch64::Register& tempreg)332{333DebugAssert(tempreg.IsX());334335const void* cur = armAsm->GetCursorAddress<const void*>();336const void* current_code_ptr_page =337reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));338const void* ptr_page =339reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));340const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;341const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);342343if (vixl::IsInt21(page_displacement))344{345armAsm->adrp(tempreg, page_displacement);346armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast<int64_t>(page_offset)));347}348else349{350armMoveAddressToReg(armAsm, tempreg, addr);351armAsm->str(reg, 
vixl::aarch64::MemOperand(tempreg));352}353}354355u8* armGetJumpTrampoline(const void* target)356{357auto it = s_trampoline_targets.find(target);358if (it != s_trampoline_targets.end())359return s_trampoline_start_ptr + it->second;360361// align to 16 bytes?362const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);363364// 4 movs plus a jump365if (TRAMPOLINE_AREA_SIZE - offset < 20)366{367Panic("Ran out of space in constant pool");368return nullptr;369}370371u8* start = s_trampoline_start_ptr + offset;372vixl::aarch64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);373#ifdef VIXL_DEBUG374vixl::CodeBufferCheckScope armAsmCheck(&armAsm, TRAMPOLINE_AREA_SIZE - offset,375vixl::CodeBufferCheckScope::kDontReserveBufferSpace);376#endif377armMoveAddressToReg(&armAsm, RXSCRATCH, target);378armAsm.br(RXSCRATCH);379armAsm.FinalizeCode();380381const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());382DebugAssert(size < 20);383s_trampoline_targets.emplace(target, offset);384s_trampoline_used = offset + static_cast<u32>(size);385386MemMap::FlushInstructionCache(start, size);387return start;388}389390void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)391{392size_t addr = armAsm->GetCursorAddress<size_t>();393const size_t end_addr = Common::AlignUpPow2(addr, alignment);394while (addr != end_addr)395{396armAsm->nop();397addr += vixl::aarch64::kInstructionSize;398}399}400401void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)402{403#ifdef ENABLE_HOST_DISASSEMBLY404class MyDisassembler : public vixl::aarch64::Disassembler405{406protected:407void ProcessOutput(const vixl::aarch64::Instruction* instr) override408{409DEBUG_LOG("0x{:016X} {:08X}\t\t{}", reinterpret_cast<uint64_t>(instr), instr->GetInstructionBits(), GetOutput());410}411};412413vixl::aarch64::Decoder decoder;414MyDisassembler disas;415decoder.AppendVisitor(&disas);416decoder.Decode(static_cast<const vixl::aarch64::Instruction*>(start),417reinterpret_cast<const vixl::aarch64::Instruction*>(static_cast<const u8*>(start) + size));418#else419ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");420#endif421}422423u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)424{425return size / vixl::aarch64::kInstructionSize;426}427428u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)429{430using namespace vixl::aarch64;431432const s64 disp = armGetPCDisplacement(code, dst);433DebugAssert(vixl::IsInt26(disp));434435const u32 new_code = B | Assembler::ImmUncondBranch(disp);436std::memcpy(code, &new_code, sizeof(new_code));437if (flush_icache)438MemMap::FlushInstructionCache(code, kInstructionSize);439440return kInstructionSize;441}442443u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)444{445using namespace vixl::aarch64;446447Assembler actual_asm(static_cast<u8*>(code), code_size);448Assembler* RESTRICT armAsm = &actual_asm;449450#ifdef VIXL_DEBUG451vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);452#endif453454Label dispatch;455Label run_events_and_dispatch;456457g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();458{459#ifdef _WIN32460// Frame pointer setup is needed on Windows461armAsm->stp(x29, x30, MemOperand(sp, -16, PreIndex));462armAsm->mov(x29, sp);463#endif464465// Need the CPU state for basically everything :-)466armMoveAddressToReg(armAsm, RSTATE, &g_state);467468// Fastmem setup, oldrec doesn't need 
it469if (IsUsingFastmem())470armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));471472// Fall through to event dispatcher473}474475// check events then for frame done476armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);477{478armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));479armAsm->ldr(RWARG2, PTR(&g_state.downcount));480armAsm->cmp(RWARG1, RWARG2);481armAsm->b(&dispatch, lt);482483g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();484armAsm->bind(&run_events_and_dispatch);485armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);486}487488armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);489g_dispatcher = armAsm->GetCursorAddress<const void*>();490{491armAsm->bind(&dispatch);492493// x9 <- s_fast_map[pc >> 16]494armAsm->ldr(RWARG1, PTR(&g_state.pc));495armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());496armAsm->lsr(RWARG2, RWARG1, 16);497armAsm->ubfx(RWARG1, RWARG1, 2, 14);498armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));499500// blr(x9[pc * 2]) (fast_map[pc >> 2])501armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));502armAsm->br(RXARG1);503}504505armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);506g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();507{508armAsm->ldr(RWARG1, PTR(&g_state.pc));509armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);510armAsm->b(&dispatch);511}512513armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);514g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();515{516armAsm->ldr(RWARG1, PTR(&g_state.pc));517armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);518armAsm->b(&dispatch);519}520521armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);522g_interpret_block = armAsm->GetCursorAddress<const void*>();523{524armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);525armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));526armAsm->ldr(RWARG2, PTR(&g_state.downcount));527armAsm->cmp(RWARG1, RWARG2);528armAsm->b(&run_events_and_dispatch, ge);529armAsm->b(&dispatch);530}531532armAsm->FinalizeCode();533534s_trampoline_targets.clear();535s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();536s_trampoline_used = 0;537538return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;539}540541void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)542{543constexpr u8 padding_value = 0x00;544std::memset(dst, padding_value, size);545}546547CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)548{549}550551CPU::ARM64Recompiler::~ARM64Recompiler() = default;552553const void* CPU::ARM64Recompiler::GetCurrentCodePointer()554{555return armAsm->GetCursorAddress<const void*>();556}557558void CPU::ARM64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,559u32 far_code_space)560{561Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);562563// TODO: don't recreate this every time..564DebugAssert(!armAsm);565m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);566m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);567armAsm = &m_emitter;568569#ifdef VIXL_DEBUG570m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,571vixl::CodeBufferCheckScope::kDontReserveBufferSpace);572m_far_emitter_check = 
std::make_unique<vixl::CodeBufferCheckScope>(573&m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);574#endif575576// Need to wipe it out so it's correct when toggling fastmem.577m_host_regs = {};578579// Frame pointer must be valid on Windows.580#ifdef _WIN32581constexpr u32 max_reg_idx = 28;582#else583constexpr u32 max_reg_idx = 29;584#endif585586const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;587for (u32 i = 0; i < NUM_HOST_REGS; i++)588{589HostRegAlloc& ra = m_host_regs[i];590591if (i == RWARG1.GetCode() || i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||592i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i > max_reg_idx)593{594continue;595}596597ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);598}599}600601void CPU::ARM64Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)602{603DebugAssert(armAsm == &m_emitter);604if (emit_jump)605{606const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());607if (cond != Condition::al)608{609if (vixl::IsInt19(disp))610{611armAsm->b(disp, cond);612}613else614{615Label skip;616armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));617armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));618armAsm->bind(&skip);619}620}621else622{623armAsm->b(disp);624}625}626armAsm = &m_far_emitter;627}628629void CPU::ARM64Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)630{631const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());632if (vixl::IsInt14(disp))633{634armAsm->tbnz(reg, bit, disp);635}636else637{638Label skip;639armAsm->tbz(reg, bit, &skip);640armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));641armAsm->bind(&skip);642}643644armAsm = &m_far_emitter;645}646647void CPU::ARM64Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)648{649const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());650if (vixl::IsInt19(disp))651{652nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);653}654else655{656Label skip;657nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);658armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));659armAsm->bind(&skip);660}661662armAsm = &m_far_emitter;663}664665void CPU::ARM64Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)666{667DebugAssert(armAsm == &m_far_emitter);668if (emit_jump)669{670const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());671(cond != Condition::al) ? 
armAsm->b(disp, cond) : armAsm->b(disp);672}673armAsm = &m_emitter;674}675676void CPU::ARM64Recompiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)677{678armEmitMov(armAsm, dst, val);679}680681void CPU::ARM64Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)682{683armEmitCall(armAsm, ptr, force_inline);684}685686vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(s32 val)687{688if (Assembler::IsImmAddSub(val))689return vixl::aarch64::Operand(static_cast<int64_t>(val));690691EmitMov(RWSCRATCH, static_cast<u32>(val));692return vixl::aarch64::Operand(RWSCRATCH);693}694695vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(u32 val)696{697return armCheckAddSubConstant(static_cast<s32>(val));698}699700vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckCompareConstant(s32 val)701{702if (Assembler::IsImmConditionalCompare(val))703return vixl::aarch64::Operand(static_cast<int64_t>(val));704705EmitMov(RWSCRATCH, static_cast<u32>(val));706return vixl::aarch64::Operand(RWSCRATCH);707}708709vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckLogicalConstant(u32 val)710{711if (Assembler::IsImmLogical(val, 32))712return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));713714EmitMov(RWSCRATCH, val);715return vixl::aarch64::Operand(RWSCRATCH);716}717718void CPU::ARM64Recompiler::BeginBlock()719{720Recompiler::BeginBlock();721}722723void CPU::ARM64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)724{725// store it first to reduce code size, because we can offset726armMoveAddressToReg(armAsm, RXARG1, ram_ptr);727armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);728729bool first = true;730u32 offset = 0;731Label block_changed;732733while (size >= 16)734{735const VRegister vtmp = v2.V4S();736const VRegister dst = first ? 
v0.V4S() : v1.V4S();737armAsm->ldr(dst, MemOperand(RXARG1, offset));738armAsm->ldr(vtmp, MemOperand(RXARG2, offset));739armAsm->cmeq(dst, dst, vtmp);740if (!first)741armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());742else743first = false;744745offset += 16;746size -= 16;747}748749if (!first)750{751// TODO: make sure this doesn't choke on ffffffff752armAsm->uminv(s0, v0.V4S());753armAsm->fcmp(s0, 0.0);754armAsm->b(&block_changed, eq);755}756757while (size >= 8)758{759armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));760armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));761armAsm->cmp(RXARG3, RXSCRATCH);762armAsm->b(&block_changed, ne);763offset += 8;764size -= 8;765}766767while (size >= 4)768{769armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));770armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));771armAsm->cmp(RWARG3, RWSCRATCH);772armAsm->b(&block_changed, ne);773offset += 4;774size -= 4;775}776777DebugAssert(size == 0);778779Label block_unchanged;780armAsm->b(&block_unchanged);781armAsm->bind(&block_changed);782armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);783armAsm->bind(&block_unchanged);784}785786void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()787{788if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))789{790if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))791{792armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());793armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));794armEmitMov(armAsm, RWARG3, m_block->size);795armAsm->mul(RWARG2, RWARG2, RWARG3);796armAsm->add(RWARG1, RWARG1, RWARG2);797armAsm->str(RWARG1, PTR(&g_state.pending_ticks));798}799else800{801armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));802armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));803armAsm->str(RWARG1, PTR(&g_state.pending_ticks));804}805}806else if (m_block->icache_line_count > 0)807{808const auto& ticks_reg = RWARG1;809const auto& current_tag_reg = RWARG2;810const auto& existing_tag_reg = RWARG3;811const auto& fill_ticks_reg = w4;812const auto& ticks_to_add_reg = w5;813814VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;815const TickCount fill_ticks = GetICacheFillTicks(current_pc);816if (fill_ticks <= 0)817return;818819armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));820armEmitMov(armAsm, current_tag_reg, current_pc);821armEmitMov(armAsm, fill_ticks_reg, fill_ticks);822823for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)824{825const u32 line = GetICacheLine(current_pc);826const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));827828Label cache_hit;829armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));830armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));831armAsm->cmp(existing_tag_reg, current_tag_reg);832armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);833armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);834835if (i != (m_block->icache_line_count - 1))836armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));837}838839armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));840}841}842843void CPU::ARM64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,844s32 arg3reg /*= -1*/)845{846if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))847armAsm->mov(RXARG1, XRegister(arg1reg));848if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))849armAsm->mov(RXARG2, XRegister(arg2reg));850if (arg3reg >= 0 && arg3reg != 
static_cast<s32>(RXARG3.GetCode()))851armAsm->mov(RXARG3, XRegister(arg3reg));852EmitCall(func);853}854855void CPU::ARM64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)856{857if (newpc.has_value())858{859if (m_dirty_pc || m_compiler_pc != newpc)860{861EmitMov(RWSCRATCH, newpc.value());862armAsm->str(RWSCRATCH, PTR(&g_state.pc));863}864}865m_dirty_pc = false;866867// flush regs868Flush(FLUSH_END_BLOCK);869EndAndLinkBlock(newpc, do_event_test, false);870}871872void CPU::ARM64Recompiler::EndBlockWithException(Exception excode)873{874// flush regs, but not pc, it's going to get overwritten875// flush cycles because of the GTE instruction stuff...876Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);877878// TODO: flush load delay879880EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,881inst->cop.cop_n));882EmitMov(RWARG2, m_current_instruction_pc);883if (excode != Exception::BP)884{885EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));886}887else888{889EmitMov(RWARG3, inst->bits);890EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));891}892m_dirty_pc = false;893894EndAndLinkBlock(std::nullopt, true, false);895}896897void CPU::ARM64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)898{899// event test900// pc should've been flushed901DebugAssert(!m_dirty_pc && !m_block_ended);902m_block_ended = true;903904// TODO: try extracting this to a function905906// save cycles for event test907const TickCount cycles = std::exchange(m_cycles, 0);908909// pending_ticks += cycles910// if (pending_ticks >= downcount) { dispatch_event(); }911if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)912armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));913if (do_event_test)914armAsm->ldr(RWARG2, PTR(&g_state.downcount));915if (cycles > 0)916armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));917if (m_gte_done_cycle > cycles)918{919armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));920armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));921}922if (do_event_test)923armAsm->cmp(RWARG1, RWARG2);924if (cycles > 0)925armAsm->str(RWARG1, PTR(&g_state.pending_ticks));926if (do_event_test)927armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);928929// jump to dispatcher or next block930if (force_run_events)931{932armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);933}934else if (!newpc.has_value())935{936armEmitJmp(armAsm, CodeCache::g_dispatcher, false);937}938else939{940const void* target = (newpc.value() == m_block->pc) ?941CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),942armAsm->GetBuffer()->GetStartAddress<const void*>()) :943CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());944armEmitJmp(armAsm, target, true);945}946}947948const void* CPU::ARM64Recompiler::EndCompile(u32* code_size, u32* far_code_size)949{950#ifdef VIXL_DEBUG951m_emitter_check.reset();952m_far_emitter_check.reset();953#endif954955m_emitter.FinalizeCode();956m_far_emitter.FinalizeCode();957958u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();959*code_size = static_cast<u32>(m_emitter.GetCursorOffset());960*far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());961armAsm = nullptr;962return code;963}964965const char* CPU::ARM64Recompiler::GetHostRegName(u32 reg) const966{967static 
constexpr std::array<const char*, 32> reg64_names = {968{"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",969"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};970return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";971}972973void CPU::ARM64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)974{975EmitMov(WRegister(reg), val);976}977978void CPU::ARM64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)979{980armAsm->ldr(WRegister(reg), PTR(ptr));981}982983void CPU::ARM64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)984{985armAsm->str(WRegister(reg), PTR(ptr));986}987988void CPU::ARM64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)989{990if (val == 0)991{992armAsm->str(wzr, PTR(ptr));993return;994}995996EmitMov(RWSCRATCH, val);997armAsm->str(RWSCRATCH, PTR(ptr));998}9991000void CPU::ARM64Recompiler::CopyHostReg(u32 dst, u32 src)1001{1002if (src != dst)1003armAsm->mov(WRegister(dst), WRegister(src));1004}10051006void CPU::ARM64Recompiler::AssertRegOrConstS(CompileFlags cf) const1007{1008DebugAssert(cf.valid_host_s || cf.const_s);1009}10101011void CPU::ARM64Recompiler::AssertRegOrConstT(CompileFlags cf) const1012{1013DebugAssert(cf.valid_host_t || cf.const_t);1014}10151016vixl::aarch64::MemOperand CPU::ARM64Recompiler::MipsPtr(Reg r) const1017{1018DebugAssert(r < Reg::count);1019return PTR(&g_state.regs.r[static_cast<u32>(r)]);1020}10211022vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegD(CompileFlags cf) const1023{1024DebugAssert(cf.valid_host_d);1025return WRegister(cf.host_d);1026}10271028vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegS(CompileFlags cf) const1029{1030DebugAssert(cf.valid_host_s);1031return WRegister(cf.host_s);1032}10331034vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegT(CompileFlags cf) const1035{1036DebugAssert(cf.valid_host_t);1037return WRegister(cf.host_t);1038}10391040vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegLO(CompileFlags cf) const1041{1042DebugAssert(cf.valid_host_lo);1043return WRegister(cf.host_lo);1044}10451046vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegHI(CompileFlags cf) const1047{1048DebugAssert(cf.valid_host_hi);1049return WRegister(cf.host_hi);1050}10511052void CPU::ARM64Recompiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)1053{1054DebugAssert(dst.IsW());1055if (cf.valid_host_s)1056{1057if (cf.host_s != dst.GetCode())1058armAsm->mov(dst, WRegister(cf.host_s));1059}1060else if (cf.const_s)1061{1062const u32 cv = GetConstantRegU32(cf.MipsS());1063if (cv == 0)1064armAsm->mov(dst, wzr);1065else1066EmitMov(dst, cv);1067}1068else1069{1070WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));1071armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));1072}1073}10741075void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)1076{1077DebugAssert(dst.IsW());1078if (cf.valid_host_t)1079{1080if (cf.host_t != dst.GetCode())1081armAsm->mov(dst, WRegister(cf.host_t));1082}1083else if (cf.const_t)1084{1085const u32 cv = GetConstantRegU32(cf.MipsT());1086if (cv == 0)1087armAsm->mov(dst, wzr);1088else1089EmitMov(dst, cv);1090}1091else1092{1093WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));1094armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));1095}1096}10971098void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool 
ignore_load_delays)1099{1100DebugAssert(reg < Reg::count && dst.IsW());1101if (ignore_load_delays && m_load_delay_register == reg)1102{1103if (m_load_delay_value_register == NUM_HOST_REGS)1104armAsm->ldr(dst, PTR(&g_state.load_delay_value));1105else1106armAsm->mov(dst, WRegister(m_load_delay_value_register));1107}1108else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))1109{1110armAsm->mov(dst, WRegister(hreg.value()));1111}1112else if (HasConstantReg(reg))1113{1114EmitMov(dst, GetConstantRegU32(reg));1115}1116else1117{1118armAsm->ldr(dst, MipsPtr(reg));1119}1120}11211122void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,1123Reg arg3reg /* = Reg::count */)1124{1125DebugAssert(g_settings.gpu_pgxp_enable);11261127Flush(FLUSH_FOR_C_CALL);11281129if (arg2reg != Reg::count)1130MoveMIPSRegToReg(RWARG2, arg2reg);1131if (arg3reg != Reg::count)1132MoveMIPSRegToReg(RWARG3, arg3reg);11331134EmitMov(RWARG1, arg1val);1135EmitCall(func);1136}11371138void CPU::ARM64Recompiler::Flush(u32 flags)1139{1140Recompiler::Flush(flags);11411142if (flags & FLUSH_PC && m_dirty_pc)1143{1144StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);1145m_dirty_pc = false;1146}11471148if (flags & FLUSH_INSTRUCTION_BITS)1149{1150// This sucks, but it's only used for fallbacks.1151EmitMov(RWARG1, inst->bits);1152EmitMov(RWARG2, m_current_instruction_pc);1153EmitMov(RWARG3, m_current_instruction_branch_delay_slot);1154armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));1155armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));1156armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));1157}11581159if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)1160{1161// This sucks :(1162// TODO: make it a function?1163armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));1164armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));1165EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));1166armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));1167armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));1168EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));1169armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));1170m_load_delay_dirty = false;1171}11721173if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)1174{1175if (m_load_delay_value_register != NUM_HOST_REGS)1176FreeHostReg(m_load_delay_value_register);11771178EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));1179armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));1180m_load_delay_register = Reg::count;1181m_load_delay_dirty = true;1182}11831184if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)1185{1186// May as well flush cycles while we're here.1187// GTE spanning blocks is very rare, we _could_ disable this for speed.1188armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));1189armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));1190if (m_cycles > 0)1191{1192armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));1193m_cycles = 0;1194}1195armAsm->cmp(RWARG2, RWARG1);1196armAsm->csel(RWARG1, RWARG2, RWARG1, hs);1197armAsm->str(RWARG1, PTR(&g_state.pending_ticks));1198m_dirty_gte_done_cycle = false;1199}12001201if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)1202{1203armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));12041205// update cycles at the same time1206if (flags & FLUSH_CYCLES && m_cycles > 0)1207{1208armAsm->add(RWARG1, RWARG1, 
armCheckAddSubConstant(m_cycles));1209armAsm->str(RWARG1, PTR(&g_state.pending_ticks));1210m_gte_done_cycle -= m_cycles;1211m_cycles = 0;1212}12131214armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));1215armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));1216m_gte_done_cycle = 0;1217m_dirty_gte_done_cycle = true;1218}12191220if (flags & FLUSH_CYCLES && m_cycles > 0)1221{1222armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));1223armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));1224armAsm->str(RWARG1, PTR(&g_state.pending_ticks));1225m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);1226m_cycles = 0;1227}1228}12291230void CPU::ARM64Recompiler::Compile_Fallback()1231{1232WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,1233inst->bits);12341235Flush(FLUSH_FOR_INTERPRETER);12361237EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));12381239// TODO: make me less garbage1240// TODO: this is wrong, it flushes the load delay on the same cycle when we return.1241// but nothing should be going through here..1242Label no_load_delay;1243armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));1244armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));1245armAsm->b(&no_load_delay, eq);1246armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));1247armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));1248armAsm->str(RWARG2, PTR(&g_state.load_delay_value));1249EmitMov(RWARG1, static_cast<u32>(Reg::count));1250armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));1251armAsm->bind(&no_load_delay);12521253m_load_delay_dirty = EMULATE_LOAD_DELAYS;1254}12551256void CPU::ARM64Recompiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)1257{1258DebugAssert(pcreg.IsW());1259if (!g_settings.cpu_recompiler_memory_exceptions)1260return;12611262armAsm->tst(pcreg, armCheckLogicalConstant(0x3));1263SwitchToFarCode(true, ne);12641265BackupHostState();1266EndBlockWithException(Exception::AdEL);12671268RestoreHostState();1269SwitchToNearCode(false);1270}12711272void CPU::ARM64Recompiler::Compile_jr(CompileFlags cf)1273{1274const Register pcreg = CFGetRegS(cf);1275CheckBranchTarget(pcreg);12761277armAsm->str(pcreg, PTR(&g_state.pc));12781279CompileBranchDelaySlot(false);1280EndBlock(std::nullopt, true);1281}12821283void CPU::ARM64Recompiler::Compile_jalr(CompileFlags cf)1284{1285const Register pcreg = CFGetRegS(cf);1286if (MipsD() != Reg::zero)1287SetConstantReg(MipsD(), GetBranchReturnAddress(cf));12881289CheckBranchTarget(pcreg);1290armAsm->str(pcreg, PTR(&g_state.pc));12911292CompileBranchDelaySlot(false);1293EndBlock(std::nullopt, true);1294}12951296void CPU::ARM64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)1297{1298AssertRegOrConstS(cf);12991300const u32 taken_pc = GetConditionalBranchTarget(cf);13011302Flush(FLUSH_FOR_BRANCH);13031304DebugAssert(cf.valid_host_s);13051306// MipsT() here should equal zero for zero branches.1307DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);13081309Label taken;1310const Register rs = CFGetRegS(cf);1311switch (cond)1312{1313case BranchCondition::Equal:1314case BranchCondition::NotEqual:1315{1316AssertRegOrConstT(cf);1317if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))1318{1319(cond == BranchCondition::Equal) ? 
armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);1320}1321else1322{1323if (cf.valid_host_t)1324armAsm->cmp(rs, CFGetRegT(cf));1325else if (cf.const_t)1326armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));13271328armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);1329}1330}1331break;13321333case BranchCondition::GreaterThanZero:1334{1335armAsm->cmp(rs, 0);1336armAsm->b(&taken, gt);1337}1338break;13391340case BranchCondition::GreaterEqualZero:1341{1342armAsm->cmp(rs, 0);1343armAsm->b(&taken, ge);1344}1345break;13461347case BranchCondition::LessThanZero:1348{1349armAsm->cmp(rs, 0);1350armAsm->b(&taken, lt);1351}1352break;13531354case BranchCondition::LessEqualZero:1355{1356armAsm->cmp(rs, 0);1357armAsm->b(&taken, le);1358}1359break;1360}13611362BackupHostState();1363if (!cf.delay_slot_swapped)1364CompileBranchDelaySlot();13651366EndBlock(m_compiler_pc, true);13671368armAsm->bind(&taken);13691370RestoreHostState();1371if (!cf.delay_slot_swapped)1372CompileBranchDelaySlot();13731374EndBlock(taken_pc, true);1375}13761377void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf, bool overflow)1378{1379const Register rs = CFGetRegS(cf);1380const Register rt = CFGetRegT(cf);1381if (const u32 imm = inst->i.imm_sext32(); imm != 0)1382{1383if (!overflow)1384{1385armAsm->add(rt, rs, armCheckAddSubConstant(imm));1386}1387else1388{1389armAsm->adds(rt, rs, armCheckAddSubConstant(imm));1390TestOverflow(rt);1391}1392}1393else if (rt.GetCode() != rs.GetCode())1394{1395armAsm->mov(rt, rs);1396}1397}13981399void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf)1400{1401Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);1402}14031404void CPU::ARM64Recompiler::Compile_addiu(CompileFlags cf)1405{1406Compile_addi(cf, false);1407}14081409void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf)1410{1411Compile_slti(cf, true);1412}14131414void CPU::ARM64Recompiler::Compile_sltiu(CompileFlags cf)1415{1416Compile_slti(cf, false);1417}14181419void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf, bool sign)1420{1421armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));1422armAsm->cset(CFGetRegT(cf), sign ? 
lt : lo);1423}14241425void CPU::ARM64Recompiler::Compile_andi(CompileFlags cf)1426{1427const Register rt = CFGetRegT(cf);1428if (const u32 imm = inst->i.imm_zext32(); imm != 0)1429armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));1430else1431armAsm->mov(rt, wzr);1432}14331434void CPU::ARM64Recompiler::Compile_ori(CompileFlags cf)1435{1436const Register rt = CFGetRegT(cf);1437const Register rs = CFGetRegS(cf);1438if (const u32 imm = inst->i.imm_zext32(); imm != 0)1439armAsm->orr(rt, rs, armCheckLogicalConstant(imm));1440else if (rt.GetCode() != rs.GetCode())1441armAsm->mov(rt, rs);1442}14431444void CPU::ARM64Recompiler::Compile_xori(CompileFlags cf)1445{1446const Register rt = CFGetRegT(cf);1447const Register rs = CFGetRegS(cf);1448if (const u32 imm = inst->i.imm_zext32(); imm != 0)1449armAsm->eor(rt, rs, armCheckLogicalConstant(imm));1450else if (rt.GetCode() != rs.GetCode())1451armAsm->mov(rt, rs);1452}14531454void CPU::ARM64Recompiler::Compile_shift(CompileFlags cf,1455void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,1456const vixl::aarch64::Register&, unsigned))1457{1458const Register rd = CFGetRegD(cf);1459const Register rt = CFGetRegT(cf);1460if (inst->r.shamt > 0)1461(armAsm->*op)(rd, rt, inst->r.shamt);1462else if (rd.GetCode() != rt.GetCode())1463armAsm->mov(rd, rt);1464}14651466void CPU::ARM64Recompiler::Compile_sll(CompileFlags cf)1467{1468Compile_shift(cf, &Assembler::lsl);1469}14701471void CPU::ARM64Recompiler::Compile_srl(CompileFlags cf)1472{1473Compile_shift(cf, &Assembler::lsr);1474}14751476void CPU::ARM64Recompiler::Compile_sra(CompileFlags cf)1477{1478Compile_shift(cf, &Assembler::asr);1479}14801481void CPU::ARM64Recompiler::Compile_variable_shift(1482CompileFlags cf,1483void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,1484const vixl::aarch64::Register&),1485void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))1486{1487const Register rd = CFGetRegD(cf);14881489AssertRegOrConstS(cf);1490AssertRegOrConstT(cf);14911492const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;1493if (!cf.valid_host_t)1494MoveTToReg(rt, cf);14951496if (cf.const_s)1497{1498if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)1499(armAsm->*op_const)(rd, rt, shift);1500else if (rd.GetCode() != rt.GetCode())1501armAsm->mov(rd, rt);1502}1503else1504{1505(armAsm->*op)(rd, rt, CFGetRegS(cf));1506}1507}15081509void CPU::ARM64Recompiler::Compile_sllv(CompileFlags cf)1510{1511Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);1512}15131514void CPU::ARM64Recompiler::Compile_srlv(CompileFlags cf)1515{1516Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);1517}15181519void CPU::ARM64Recompiler::Compile_srav(CompileFlags cf)1520{1521Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);1522}15231524void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf, bool sign)1525{1526const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;1527if (!cf.valid_host_s)1528MoveSToReg(rs, cf);15291530const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;1531if (!cf.valid_host_t)1532MoveTToReg(rt, cf);15331534// TODO: if lo/hi gets killed, we can use a 32-bit multiply1535const Register lo = CFGetRegLO(cf);1536const Register hi = CFGetRegHI(cf);15371538(sign) ? 
armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);1539armAsm->lsr(hi.X(), lo.X(), 32);1540}15411542void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf)1543{1544Compile_mult(cf, true);1545}15461547void CPU::ARM64Recompiler::Compile_multu(CompileFlags cf)1548{1549Compile_mult(cf, false);1550}15511552void CPU::ARM64Recompiler::Compile_div(CompileFlags cf)1553{1554const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;1555if (!cf.valid_host_s)1556MoveSToReg(rs, cf);15571558const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;1559if (!cf.valid_host_t)1560MoveTToReg(rt, cf);15611562const Register rlo = CFGetRegLO(cf);1563const Register rhi = CFGetRegHI(cf);15641565// TODO: This could be slightly more optimal1566Label done;1567Label not_divide_by_zero;1568armAsm->cbnz(rt, ¬_divide_by_zero);1569armAsm->mov(rhi, rs); // hi = num1570EmitMov(rlo, 1);1571EmitMov(RWSCRATCH, static_cast<u32>(-1));1572armAsm->cmp(rs, 0);1573armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 11574armAsm->b(&done);15751576armAsm->bind(¬_divide_by_zero);1577Label not_unrepresentable;1578armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));1579armAsm->b(¬_unrepresentable, ne);1580armAsm->cmp(rt, armCheckCompareConstant(-1));1581armAsm->b(¬_unrepresentable, ne);15821583EmitMov(rlo, 0x80000000u);1584EmitMov(rhi, 0);1585armAsm->b(&done);15861587armAsm->bind(¬_unrepresentable);15881589armAsm->sdiv(rlo, rs, rt);15901591// TODO: skip when hi is dead1592armAsm->msub(rhi, rlo, rt, rs);15931594armAsm->bind(&done);1595}15961597void CPU::ARM64Recompiler::Compile_divu(CompileFlags cf)1598{1599const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;1600if (!cf.valid_host_s)1601MoveSToReg(rs, cf);16021603const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;1604if (!cf.valid_host_t)1605MoveTToReg(rt, cf);16061607const Register rlo = CFGetRegLO(cf);1608const Register rhi = CFGetRegHI(cf);16091610Label done;1611Label not_divide_by_zero;1612armAsm->cbnz(rt, ¬_divide_by_zero);1613EmitMov(rlo, static_cast<u32>(-1));1614armAsm->mov(rhi, rs);1615armAsm->b(&done);16161617armAsm->bind(¬_divide_by_zero);16181619armAsm->udiv(rlo, rs, rt);16201621// TODO: skip when hi is dead1622armAsm->msub(rhi, rlo, rt, rs);16231624armAsm->bind(&done);1625}16261627void CPU::ARM64Recompiler::TestOverflow(const vixl::aarch64::Register& result)1628{1629DebugAssert(result.IsW());1630SwitchToFarCode(true, vs);16311632BackupHostState();16331634// toss the result1635ClearHostReg(result.GetCode());16361637EndBlockWithException(Exception::Ov);16381639RestoreHostState();16401641SwitchToNearCode(false);1642}16431644void CPU::ARM64Recompiler::Compile_dst_op(CompileFlags cf,1645void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,1646const vixl::aarch64::Register&,1647const vixl::aarch64::Operand&),1648bool commutative, bool logical, bool overflow)1649{1650AssertRegOrConstS(cf);1651AssertRegOrConstT(cf);16521653const Register rd = CFGetRegD(cf);1654if (cf.valid_host_s && cf.valid_host_t)1655{1656(armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));1657}1658else if (commutative && (cf.const_s || cf.const_t))1659{1660const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);1661if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)1662{1663(armAsm->*op)(rd, src, logical ? 
armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));1664}1665else1666{1667if (rd.GetCode() != src.GetCode())1668armAsm->mov(rd, src);1669overflow = false;1670}1671}1672else if (cf.const_s)1673{1674// TODO: Check where we can use wzr here1675EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));1676(armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));1677}1678else if (cf.const_t)1679{1680const Register rs = CFGetRegS(cf);1681if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)1682{1683(armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));1684}1685else1686{1687if (rd.GetCode() != rs.GetCode())1688armAsm->mov(rd, rs);1689overflow = false;1690}1691}16921693if (overflow)1694TestOverflow(rd);1695}16961697void CPU::ARM64Recompiler::Compile_add(CompileFlags cf)1698{1699if (g_settings.cpu_recompiler_memory_exceptions)1700Compile_dst_op(cf, &Assembler::adds, true, false, true);1701else1702Compile_dst_op(cf, &Assembler::add, true, false, false);1703}17041705void CPU::ARM64Recompiler::Compile_addu(CompileFlags cf)1706{1707Compile_dst_op(cf, &Assembler::add, true, false, false);1708}17091710void CPU::ARM64Recompiler::Compile_sub(CompileFlags cf)1711{1712if (g_settings.cpu_recompiler_memory_exceptions)1713Compile_dst_op(cf, &Assembler::subs, false, false, true);1714else1715Compile_dst_op(cf, &Assembler::sub, false, false, false);1716}17171718void CPU::ARM64Recompiler::Compile_subu(CompileFlags cf)1719{1720Compile_dst_op(cf, &Assembler::sub, false, false, false);1721}17221723void CPU::ARM64Recompiler::Compile_and(CompileFlags cf)1724{1725AssertRegOrConstS(cf);1726AssertRegOrConstT(cf);17271728// special cases - and with self -> self, and with 0 -> 01729const Register regd = CFGetRegD(cf);1730if (cf.MipsS() == cf.MipsT())1731{1732armAsm->mov(regd, CFGetRegS(cf));1733return;1734}1735else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))1736{1737armAsm->mov(regd, wzr);1738return;1739}17401741Compile_dst_op(cf, &Assembler::and_, true, true, false);1742}17431744void CPU::ARM64Recompiler::Compile_or(CompileFlags cf)1745{1746AssertRegOrConstS(cf);1747AssertRegOrConstT(cf);17481749// or/nor with 0 -> no effect1750const Register regd = CFGetRegD(cf);1751if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())1752{1753cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);1754return;1755}17561757Compile_dst_op(cf, &Assembler::orr, true, true, false);1758}17591760void CPU::ARM64Recompiler::Compile_xor(CompileFlags cf)1761{1762AssertRegOrConstS(cf);1763AssertRegOrConstT(cf);17641765const Register regd = CFGetRegD(cf);1766if (cf.MipsS() == cf.MipsT())1767{1768// xor with self -> zero1769armAsm->mov(regd, wzr);1770return;1771}1772else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))1773{1774// xor with zero -> no effect1775cf.const_s ? 
MoveTToReg(regd, cf) : MoveSToReg(regd, cf);1776return;1777}17781779Compile_dst_op(cf, &Assembler::eor, true, true, false);1780}17811782void CPU::ARM64Recompiler::Compile_nor(CompileFlags cf)1783{1784Compile_or(cf);1785armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));1786}17871788void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf)1789{1790Compile_slt(cf, true);1791}17921793void CPU::ARM64Recompiler::Compile_sltu(CompileFlags cf)1794{1795Compile_slt(cf, false);1796}17971798void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf, bool sign)1799{1800AssertRegOrConstS(cf);1801AssertRegOrConstT(cf);18021803// TODO: swap and reverse op for constants1804if (cf.const_s)1805{1806EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));1807armAsm->cmp(RWSCRATCH, CFGetRegT(cf));1808}1809else if (cf.const_t)1810{1811armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));1812}1813else1814{1815armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));1816}18171818armAsm->cset(CFGetRegD(cf), sign ? lt : lo);1819}18201821vixl::aarch64::Register1822CPU::ARM64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,1823const std::optional<const vixl::aarch64::Register>& reg)1824{1825const u32 imm = inst->i.imm_sext32();1826if (cf.valid_host_s && imm == 0 && !reg.has_value())1827return CFGetRegS(cf);18281829const Register dst = reg.has_value() ? reg.value() : RWARG1;1830if (address.has_value())1831{1832EmitMov(dst, address.value());1833}1834else if (imm == 0)1835{1836if (cf.valid_host_s)1837{1838if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())1839armAsm->mov(dst, CFGetRegS(cf));1840}1841else1842{1843armAsm->ldr(dst, MipsPtr(cf.MipsS()));1844}1845}1846else1847{1848if (cf.valid_host_s)1849{1850armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));1851}1852else1853{1854armAsm->ldr(dst, MipsPtr(cf.MipsS()));1855armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));1856}1857}18581859return dst;1860}18611862template<typename RegAllocFn>1863vixl::aarch64::Register CPU::ARM64Recompiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,1864MemoryAccessSize size, bool sign, bool use_fastmem,1865const RegAllocFn& dst_reg_alloc)1866{1867DebugAssert(addr_reg.IsW());1868if (use_fastmem)1869{1870m_cycles += Bus::RAM_READ_TICKS;18711872const Register dst = dst_reg_alloc();18731874if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)1875{1876DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());1877armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);1878armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));1879}18801881const MemOperand mem =1882MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());1883u8* start = armAsm->GetCursorAddress<u8*>();1884switch (size)1885{1886case MemoryAccessSize::Byte:1887sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);1888break;18891890case MemoryAccessSize::HalfWord:1891sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);1892break;18931894case MemoryAccessSize::Word:1895armAsm->ldr(dst, mem);1896break;1897}18981899AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);1900return dst;1901}19021903if (addr_reg.GetCode() != RWARG1.GetCode())1904armAsm->mov(RWARG1, addr_reg);19051906const bool checked = g_settings.cpu_recompiler_memory_exceptions;1907switch (size)1908{1909case MemoryAccessSize::Byte:1910{1911EmitCall(checked ? 
  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfBitSet(RXRET, 63);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->neg(temp.X(), RXRET);
    armAsm->lsl(temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const Register dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      if (dst_reg.GetCode() != RWRET.GetCode())
        armAsm->mov(dst_reg, RWRET);
    }
    break;
  }

  return dst_reg;
}

void CPU::ARM64Recompiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
                                         const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
                                         bool use_fastmem)
{
  DebugAssert(addr_reg.IsW() && value_reg.IsW());
  if (use_fastmem)
  {
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        armAsm->strb(value_reg, mem);
        break;

      case MemoryAccessSize::HalfWord:
        armAsm->strh(value_reg, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->str(value_reg, mem);
        break;
    }
    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
    return;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);
  if (value_reg.GetCode() != RWARG2.GetCode())
    armAsm->mov(RWARG2, value_reg);

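  // Slow path for stores mirrors GenerateLoad(): checked thunks are used when memory exceptions
  // are enabled, and appear to return zero on success or a non-zero cause value on failure, which
  // the far-code block below folds into CAUSE before raising the exception.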
  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->lsl(temp, RWRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}

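// Plain loads (lb/lbu/lh/lhu/lw). The unaligned lwl/lwr pair is handled separately by
// Compile_lwx() below.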
void CPU::ARM64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {
    if (cf.MipsT() == Reg::zero)
      return RWRET;

    return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                     EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
                                     cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    armAsm->mov(RWARG3, data);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  // Do PGXP first, it does its own load.
  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.GetCode());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but it itself is load delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush.
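  // If rt is already the target of an in-flight load delay, reuse/rename that host register as
  // the next load-delay slot so the partially-merged value replaces it; otherwise allocate a
  // fresh register and seed it with rt's current value (host register, constant, or memory).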
  const Reg rt = inst->r.rt;
  Register value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = WRegister(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        armAsm->mov(value, WRegister(rtreg.value()));
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        armAsm->ldr(value, MipsPtr(rt));
    }
    else
    {
      value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
  armAsm->and_(RWARG2, addr, 3);
  armAsm->lsl(RWARG2, RWARG2, 3); // *8
  EmitMov(RWARG3, 24);
  armAsm->sub(RWARG3, RWARG3, RWARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    EmitMov(RWSCRATCH, 0xFFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->lslv(RWRET, RWRET, RWARG3);
    armAsm->orr(value, value, RWRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    armAsm->lsrv(RWRET, RWRET, RWARG2);
    EmitMov(RWSCRATCH, 0xFFFFFF00u);
    armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->orr(value, value, RWRET);
  }

  FreeHostReg(addr.GetCode());
}

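// lwc2: load a word from memory and write it to a GTE register. The access action returned by
// GetGTERegisterPointer() decides whether the value is stored directly, sign/zero-extended,
// routed through GTE::WriteRegister(), or pushed onto the SXY FIFO.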
void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RWRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      armAsm->str(value, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      armAsm->sxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      armAsm->uxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      armAsm->mov(RWARG2, value);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
      armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
      armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
      armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, value);
    if (value.GetCode() != RWRET.GetCode())
      FreeHostReg(value.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr_reg.value().GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

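// Plain stores (sb/sh/sw); the unaligned swl/swr pair is handled by Compile_swx() below.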
void CPU::ARM64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, true, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RWARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RWARG3, cf.MipsT());
    armAsm->mov(RWARG2, addr);
    EmitMov(RWARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  armAsm->and_(RWSCRATCH, addr, 3);
  armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
  armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));

  MoveMIPSRegToReg(RWARG2, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RWARG3, 0xFFFFFF00u);
    armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
    armAsm->and_(RWRET, RWRET, RWARG3);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    armAsm->lsrv(RWARG2, RWARG2, RWARG3);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    EmitMov(RWSCRATCH, 0x00FFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(RWRET, RWRET, RWSCRATCH);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }

  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
  FreeHostReg(addr.GetCode());
}

void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RWARG1;
  const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      armAsm->ldr(data, PTR(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      armAsm->mov(data, RWRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);
  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr.GetCode() != RWARG1.GetCode())
      FreeHostReg(addr.GetCode());
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, data);
    FreeHostReg(data.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Register new_value = RWARG1;
  const Register old_value = RWARG2;
  const Register changed_bits = RWARG3;
  const Register mask_reg = RWSCRATCH;

  // Load old value
  armAsm->ldr(old_value, PTR(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    armAsm->eor(changed_bits, old_value, new_value);
  armAsm->bic(old_value, old_value, mask_reg);
  armAsm->orr(new_value, old_value, new_value);
  armAsm->str(new_value, PTR(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
    armAsm->tbz(changed_bits, 16, &caches_unchanged);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    armAsm->ldr(RWARG1, PTR(ptr)); // reload value for interrupt test below
    if (CodeCache::IsUsingFastmem())
      armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
    armAsm->bind(&caches_unchanged);

    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCodeIfRegZeroOrNonZero(RWRET, true);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

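// rfe: pop the privilege/interrupt-enable stack held in the low six bits of SR.
// bfxil(r, r, 2, 4) copies SR bits [5:2] into bits [3:0] while leaving the rest (including
// bits [5:4]) untouched; e.g. low bits 0b110100 become 0b111101.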
void CPU::ARM64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
  armAsm->bfxil(RWARG1, RWARG1, 2, 4);
  armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));

  TestInterrupts(RWARG1);
}

void CPU::ARM64Recompiler::TestInterrupts(const vixl::aarch64::Register& sr)
{
  DebugAssert(sr.IsW());

  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;
  armAsm->tbz(sr, 0, &no_interrupt);

  // sr & cause
  armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
  armAsm->and_(sr, sr, RWSCRATCH);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  armAsm->tst(sr, 0xFF00);

  SwitchToFarCode(true, ne);
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    EmitMov(RWARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      EmitMov(RWARG1, m_compiler_pc);
    armAsm->str(wzr, PTR(&g_state.downcount));
    if (m_dirty_pc)
      armAsm->str(RWARG1, PTR(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  armAsm->bind(&no_interrupt);
}

void CPU::ARM64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->ldr(WRegister(hreg), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->mov(WRegister(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, WRegister(hreg));
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      armAsm->str(CFGetRegT(cf), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
      armAsm->str(RWARG1, PTR(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
    armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
    armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
    armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
    armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::ARM64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size,
                                           bool is_signed, bool is_load)
{
  Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
  Assembler* armAsm = &arm_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);

  // TODO: use stp+ldp, vixl helper?

  if (stack_size > 0)
  {
    armAsm->sub(sp, sp, stack_size);

    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    Assert(Assembler::IsImmAddSub(cycles_to_add));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  if (address_register != static_cast<u8>(RWARG1.GetCode()))
    armAsm->mov(RWARG1, WRegister(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.GetCode()))
      armAsm->mov(RWARG2, WRegister(data_register));
  }

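  // Dispatch to the unchecked memory handler matching the original access size; the sites that
  // jump into this thunk are presumably the fastmem accesses recorded via AddLoadStoreInfo()
  // above, which fall back here when they cannot be serviced directly.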
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte),
                  false);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord),
                  false);
    }
    break;
    case MemoryAccessSize::Word:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord),
                  false);
    }
    break;
  }

  if (is_load)
  {
    const WRegister dst = WRegister(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst.GetCode() != RWRET.GetCode())
          armAsm->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    Assert(Assembler::IsImmAddSub(cycles_to_remove));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }

    armAsm->add(sp, sp, stack_size);
  }

  armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
  armAsm->FinalizeCode();

  return static_cast<u32>(armAsm->GetCursorOffset());
}

#endif // CPU_ARCH_ARM64