Path: blob/master/src/core/cpu_recompiler_arm32.cpp
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "cpu_recompiler_arm32.h"
#include "cpu_core_private.h"
#include "cpu_pgxp.h"
#include "gte.h"
#include "settings.h"
#include "timing_event.h"

#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "common/string_util.h"

#include <limits>

#ifdef CPU_ARCH_ARM32

#include "vixl/aarch32/constants-aarch32.h"
#include "vixl/aarch32/instructions-aarch32.h"

#ifdef ENABLE_HOST_DISASSEMBLY
#include "vixl/aarch32/disasm-aarch32.h"
#include <iostream>
#endif

LOG_CHANNEL(Recompiler);

#define PTR(x) vixl::aarch32::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))

#define RMEMBASE vixl::aarch32::r3

#define RRET vixl::aarch32::r0
#define RRETHI vixl::aarch32::r1
#define RARG1 vixl::aarch32::r0
#define RARG2 vixl::aarch32::r1
#define RARG3 vixl::aarch32::r2
#define RSCRATCH vixl::aarch32::r12
#define RSTATE vixl::aarch32::r4

static bool armIsCallerSavedRegister(u32 id);
static s32 armGetPCDisplacement(const void* current, const void* target);
static bool armIsPCDisplacementInImmediateRange(s32 displacement);
static void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
                                const void* addr);
static void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm);
static void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr);
static void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr);
static void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr,
                            const vixl::aarch32::Register& tempreg = RSCRATCH);
static u8* armGetJumpTrampoline(const void* target);

static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
static std::unordered_map<const void*, u32> s_trampoline_targets;
static u8* s_trampoline_start_ptr = nullptr;
static u32 s_trampoline_used = 0;

namespace CPU {

using namespace vixl::aarch32;

static ARM32Recompiler s_instance;
Recompiler* g_compiler = &s_instance;

} // namespace CPU

bool armIsCallerSavedRegister(u32 id)
{
  return ((id >= 0 && id <= 3) ||  // r0-r3
          (id == 12 || id == 14)); // ip, lr
}

s32 armGetPCDisplacement(const void* current, const void* target)
{
  Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
  Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
  return static_cast<s32>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)));
}

bool armIsPCDisplacementInImmediateRange(s32 displacement)
{
  return (displacement >= -33554432 && displacement <= 33554428);
}

void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm)
{
  if (vixl::IsUintN(16, imm))
  {
    armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
    return;
  }

  armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
  armAsm->movt(vixl::aarch32::al, rd, imm >> 16);
}

void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr)
{
  armEmitMov(armAsm, reg, static_cast<u32>(reinterpret_cast<uintptr_t>(addr)));
}
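// Note: an A32 B/BL immediate encodes a signed 24-bit word offset, giving a reach of
// [-2^25, 2^25 - 4] bytes around the current instruction; that is where the magic
// constants in armIsPCDisplacementInImmediateRange() come from. Targets outside this
// range are reached either inline via movw/movt+bx, or through a nearby trampoline.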
void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  s32 displacement = armGetPCDisplacement(cur, ptr);
  bool use_bx = !armIsPCDisplacementInImmediateRange(displacement);
  if (use_bx && !force_inline)
  {
    if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    {
      displacement = armGetPCDisplacement(cur, trampoline);
      use_bx = !armIsPCDisplacementInImmediateRange(displacement);
    }
  }

  if (use_bx)
  {
    armMoveAddressToReg(armAsm, RSCRATCH, ptr);
    armAsm->bx(RSCRATCH);
  }
  else
  {
    vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
    armAsm->b(&label);
  }
}

void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  s32 displacement = armGetPCDisplacement(cur, ptr);
  bool use_blx = !armIsPCDisplacementInImmediateRange(displacement);
  if (use_blx && !force_inline)
  {
    if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    {
      displacement = armGetPCDisplacement(cur, trampoline);
      use_blx = !armIsPCDisplacementInImmediateRange(displacement);
    }
  }

  if (use_blx)
  {
    armMoveAddressToReg(armAsm, RSCRATCH, ptr);
    armAsm->blx(RSCRATCH);
  }
  else
  {
    vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
    armAsm->bl(&label);
  }
}

void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr)
{
  const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress<const void*>(), ptr);
  if (!armIsPCDisplacementInImmediateRange(displacement))
  {
    armMoveAddressToReg(armAsm, RSCRATCH, ptr);
    armAsm->blx(cond, RSCRATCH);
  }
  else
  {
    vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
    armAsm->b(cond, &label);
  }
}

void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr)
{
  armMoveAddressToReg(armAsm, reg, addr);
  armAsm->ldr(reg, vixl::aarch32::MemOperand(reg));
}

[[maybe_unused]] void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
                                      const void* addr, const vixl::aarch32::Register& tempreg)
{
  armMoveAddressToReg(armAsm, tempreg, addr);
  armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
}

void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
  vixl::aarch32::PrintDisassembler dis(std::cout, 0);
  dis.SetCodeAddress(reinterpret_cast<uintptr_t>(start));
  dis.DisassembleA32Buffer(static_cast<const u32*>(start), size);
#else
  ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
#endif
}

u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
  return size / vixl::aarch32::kA32InstructionSizeInBytes;
}

u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
  using namespace vixl::aarch32;

  const s32 disp = armGetPCDisplacement(code, dst);
  DebugAssert(armIsPCDisplacementInImmediateRange(disp));

  // A32 jumps are silly.
  {
    Assembler emit(static_cast<vixl::byte*>(code), kA32InstructionSizeInBytes, A32);
    Label label(disp);
    emit.b(&label);
  }

  if (flush_icache)
    MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes);

  return kA32InstructionSizeInBytes;
}
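// A trampoline is an out-of-line movw/movt/bx sequence (at most 12 bytes; 20 bytes of
// headroom are required before emitting) placed in a small area past the generated code.
// Trampolines are deduplicated by target, so repeated far jumps to the same helper share
// a single stub.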
u8* armGetJumpTrampoline(const void* target)
{
  auto it = s_trampoline_targets.find(target);
  if (it != s_trampoline_targets.end())
    return s_trampoline_start_ptr + it->second;

  // align to 16 bytes?
  const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);

  // 4 movs plus a jump
  if (TRAMPOLINE_AREA_SIZE - offset < 20)
  {
    Panic("Ran out of space in constant pool");
    return nullptr;
  }

  u8* start = s_trampoline_start_ptr + offset;
  vixl::aarch32::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
  armMoveAddressToReg(&armAsm, RSCRATCH, target);
  armAsm.bx(RSCRATCH);

  const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
  DebugAssert(size < 20);
  s_trampoline_targets.emplace(target, offset);
  s_trampoline_used = offset + static_cast<u32>(size);

  MemMap::FlushInstructionCache(start, size);
  return start;
}

u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
  using namespace vixl::aarch32;

  Assembler actual_asm(static_cast<u8*>(code), code_size);
  Assembler* armAsm = &actual_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  Label dispatch;
  Label run_events_and_dispatch;

  g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
  {
    // Need the CPU state for basically everything :-)
    armMoveAddressToReg(armAsm, RSTATE, &g_state);
  }

  // check events then for frame done
  {
    Label skip_event_check;
    armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RARG2, PTR(&g_state.downcount));
    armAsm->cmp(RARG1, RARG2);
    armAsm->b(lt, &skip_event_check);

    g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
    armAsm->bind(&run_events_and_dispatch);
    armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);

    armAsm->bind(&skip_event_check);
  }

  // TODO: align?
  g_dispatcher = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->bind(&dispatch);

    // RARG2 <- g_code_lut[pc >> 16]
    armAsm->ldr(RARG1, PTR(&g_state.pc));
    armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
    armAsm->lsr(RARG2, RARG1, 16);
    armAsm->ubfx(RARG1, RARG1, 2, 14);
    armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));

    // bx(RARG2[(pc & 0xFFFF) >> 2])
    armAsm->ldr(RARG1, MemOperand(RARG2, RARG1, LSL, 2));
    armAsm->bx(RARG1);
  }

  g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->ldr(RARG1, PTR(&g_state.pc));
    armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
    armAsm->b(&dispatch);
  }

  g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->ldr(RARG1, PTR(&g_state.pc));
    armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
    armAsm->b(&dispatch);
  }

  g_interpret_block = armAsm->GetCursorAddress<const void*>();
  {
    armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
    armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RARG2, PTR(&g_state.downcount));
    armAsm->cmp(RARG1, RARG2);
    armAsm->b(ge, &run_events_and_dispatch);
    armAsm->b(&dispatch);
  }

  armAsm->FinalizeCode();

  s_trampoline_targets.clear();
  s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
  s_trampoline_used = 0;

  return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
}
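// Padding between blocks is zero-filled; a zero word decodes as "andeq r0, r0, r0" on
// A32, which is harmless, and the padding should never be executed anyway.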
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
  constexpr u8 padding_value = 0x00;
  std::memset(dst, padding_value, size);
}

CPU::ARM32Recompiler::ARM32Recompiler() : m_emitter(A32), m_far_emitter(A32)
{
}

CPU::ARM32Recompiler::~ARM32Recompiler() = default;

const void* CPU::ARM32Recompiler::GetCurrentCodePointer()
{
  return armAsm->GetCursorAddress<const void*>();
}

void CPU::ARM32Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
                                 u32 far_code_space)
{
  Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);

  // TODO: don't recreate this every time..
  DebugAssert(!armAsm);
  m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
  m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
  armAsm = &m_emitter;

#ifdef VIXL_DEBUG
  m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(m_emitter.get(), code_buffer_space,
                                                                 vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
  m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
    m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  // Need to wipe it out so it's correct when toggling fastmem.
  m_host_regs = {};

  const u32 membase_idx =
    (CodeCache::IsUsingFastmem() && block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) ?
      RMEMBASE.GetCode() :
      NUM_HOST_REGS;
  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    HostRegAlloc& ra = m_host_regs[i];

    if (i == RARG1.GetCode() || i == RARG2.GetCode() || i == RARG3.GetCode() || i == RSCRATCH.GetCode() ||
        i == RSTATE.GetCode() || i == membase_idx || i == sp.GetCode() || i == pc.GetCode())
    {
      continue;
    }

    ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
  }
}
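// Far code is a second emitter for cold paths (such as exceptions, memory faults, and
// block invalidation): keeping them out of line keeps the hot path dense. The helpers
// below jump between the two buffers, falling back to armEmitJmp() when the displacement
// does not fit in a single branch.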
void CPU::ARM32Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch32::ConditionType cond)
{
  DebugAssert(armAsm == &m_emitter);
  if (emit_jump)
  {
    const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    if (armIsPCDisplacementInImmediateRange(disp))
    {
      Label ldisp(armAsm->GetCursorOffset() + disp);
      armAsm->b(cond, &ldisp);
    }
    else if (cond != vixl::aarch32::al)
    {
      Label skip;
      armAsm->b(Condition(cond).Negate(), &skip);
      armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
      armAsm->bind(&skip);
    }
    else
    {
      armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    }
  }
  armAsm = &m_far_emitter;
}

void CPU::ARM32Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch32::Register& reg, u32 bit)
{
  armAsm->tst(reg, 1u << bit);

  const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
  if (armIsPCDisplacementInImmediateRange(disp))
  {
    Label ldisp(armAsm->GetCursorOffset() + disp);
    armAsm->b(ne, &ldisp);
  }
  else
  {
    Label skip;
    armAsm->b(eq, &skip);
    armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    armAsm->bind(&skip);
  }

  armAsm = &m_far_emitter;
}

void CPU::ARM32Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch32::Register& reg, bool nonzero)
{
  armAsm->cmp(reg, 0);

  const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
  if (armIsPCDisplacementInImmediateRange(disp))
  {
    Label ldisp(armAsm->GetCursorOffset() + disp);
    nonzero ? armAsm->b(ne, &ldisp) : armAsm->b(eq, &ldisp);
  }
  else
  {
    Label skip;
    nonzero ? armAsm->b(eq, &skip) : armAsm->b(ne, &skip);
    armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    armAsm->bind(&skip);
  }

  armAsm = &m_far_emitter;
}

void CPU::ARM32Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch32::ConditionType cond)
{
  DebugAssert(armAsm == &m_far_emitter);
  if (emit_jump)
  {
    const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
    if (armIsPCDisplacementInImmediateRange(disp))
    {
      Label ldisp(armAsm->GetCursorOffset() + disp);
      armAsm->b(cond, &ldisp);
    }
    else if (cond != vixl::aarch32::al)
    {
      Label skip;
      armAsm->b(Condition(cond).Negate(), &skip);
      armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
      armAsm->bind(&skip);
    }
    else
    {
      armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
    }
  }
  armAsm = &m_emitter;
}

void CPU::ARM32Recompiler::EmitMov(const vixl::aarch32::Register& dst, u32 val)
{
  armEmitMov(armAsm, dst, val);
}

void CPU::ARM32Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
{
  armEmitCall(armAsm, ptr, force_inline);
}
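// A32 data-processing immediates are an 8-bit value rotated right by an even amount, so
// most 32-bit constants cannot be encoded directly. These helpers return the constant as
// an Operand when it encodes, and otherwise materialize it in RSCRATCH. Note that the
// compare/logical variants currently just defer to the add/sub check.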
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckAddSubConstant(s32 val)
{
  if (ImmediateA32::IsImmediateA32(static_cast<u32>(val)))
    return vixl::aarch32::Operand(static_cast<int32_t>(val));

  EmitMov(RSCRATCH, static_cast<u32>(val));
  return vixl::aarch32::Operand(RSCRATCH);
}

vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckAddSubConstant(u32 val)
{
  return armCheckAddSubConstant(static_cast<s32>(val));
}

vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckCompareConstant(s32 val)
{
  return armCheckAddSubConstant(val);
}

vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckLogicalConstant(u32 val)
{
  return armCheckAddSubConstant(val);
}

void CPU::ARM32Recompiler::BeginBlock()
{
  Recompiler::BeginBlock();
}

void CPU::ARM32Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
  // store it first to reduce code size, because we can offset
  armMoveAddressToReg(armAsm, RARG1, ram_ptr);
  armMoveAddressToReg(armAsm, RARG2, shadow_ptr);

  u32 offset = 0;
  Label block_changed;

#if 0
  /* TODO: Vectorize
#include <arm_neon.h>
#include <stdint.h>

bool foo(const void* a, const void* b)
{
  uint8x16_t v1 = vld1q_u8((const uint8_t*)a);
  uint8x16_t v2 = vld1q_u8((const uint8_t*)b);
  uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16);
  uint8x16_t v4 = vld1q_u8((const uint8_t*)a + 16);
  uint8x16_t r = vceqq_u8(v1, v2);
  uint8x16_t r2 = vceqq_u8(v2, v3);
  uint8x16_t r3 = vandq_u8(r, r2);
  uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3)));
  if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu)
    return false;
  else
    return true;
}
*/
  bool first = true;

  while (size >= 16)
  {
    const VRegister vtmp = a32::v2.V4S();
    const VRegister dst = first ? a32::v0.V4S() : a32::v1.V4S();
    m_emit->ldr(dst, a32::MemOperand(RXARG1, offset));
    m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset));
    m_emit->cmeq(dst, dst, vtmp);
    if (!first)
      m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
    else
      first = false;

    offset += 16;
    size -= 16;
  }

  if (!first)
  {
    // TODO: make sure this doesn't choke on ffffffff
    armAsm->uminv(a32::s0, a32::v0.V4S());
    armAsm->fcmp(a32::s0, 0.0);
    armAsm->b(&block_changed, a32::eq);
  }
#endif

  while (size >= 4)
  {
    armAsm->ldr(RARG3, MemOperand(RARG1, offset));
    armAsm->ldr(RSCRATCH, MemOperand(RARG2, offset));
    armAsm->cmp(RARG3, RSCRATCH);
    armAsm->b(ne, &block_changed);
    offset += 4;
    size -= 4;
  }

  DebugAssert(size == 0);

  Label block_unchanged;
  armAsm->b(&block_unchanged);
  armAsm->bind(&block_changed);
  armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
  armAsm->bind(&block_unchanged);
}
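// The icache check below models the R3000A's instruction cache: each cache line the block
// touches has its tag compared against (and unconditionally replaced with) the block's
// fetch address, and the line fill penalty is added to pending_ticks only on a miss,
// via the conditional add.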
void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
{
  if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
  {
    if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    {
      armEmitFarLoad(armAsm, RARG2, GetFetchMemoryAccessTimePtr());
      armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
      armEmitMov(armAsm, RARG3, m_block->size);
      armAsm->mul(RARG2, RARG2, RARG3);
      armAsm->add(RARG1, RARG1, RARG2);
      armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    }
    else
    {
      armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
      armAsm->add(RARG1, RARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
      armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    }
  }
  else if (m_block->icache_line_count > 0)
  {
    VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    if (fill_ticks <= 0)
      return;

    const auto& ticks_reg = RARG1;
    const auto& current_tag_reg = RARG2;
    const auto& existing_tag_reg = RARG3;
    const auto& fill_ticks_reg = r5;

    armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
    armEmitMov(armAsm, current_tag_reg, current_pc);
    armEmitMov(armAsm, fill_ticks_reg, fill_ticks);

    for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    {
      const TickCount fill_ticks = GetICacheFillTicks(current_pc);
      if (fill_ticks <= 0)
        continue;

      const u32 line = GetICacheLine(current_pc);
      const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));

      // Offsets must be <4K on ARM.
      MemOperand line_addr = MemOperand(RSTATE, offset);
      if (offset >= 4096)
      {
        armEmitMov(armAsm, RSCRATCH, offset);
        line_addr = MemOperand(RSTATE, RSCRATCH);
      }

      Label cache_hit;
      armAsm->ldr(existing_tag_reg, line_addr);
      armAsm->str(current_tag_reg, line_addr);
      armAsm->cmp(existing_tag_reg, current_tag_reg);
      armAsm->add(ne, ticks_reg, ticks_reg, fill_ticks_reg);

      if (i != (m_block->icache_line_count - 1))
        armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
    }

    armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
  }
}

void CPU::ARM32Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
                                        s32 arg3reg /*= -1*/)
{
  if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1.GetCode()))
    armAsm->mov(RARG1, Register(arg1reg));
  if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2.GetCode()))
    armAsm->mov(RARG2, Register(arg2reg));
  if (arg3reg >= 0 && arg3reg != static_cast<s32>(RARG3.GetCode()))
    armAsm->mov(RARG3, Register(arg3reg));
  EmitCall(func);
}

void CPU::ARM32Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
{
  if (newpc.has_value())
  {
    if (m_dirty_pc || m_compiler_pc != newpc)
    {
      EmitMov(RSCRATCH, newpc.value());
      armAsm->str(RSCRATCH, PTR(&g_state.pc));
    }
  }
  m_dirty_pc = false;

  // flush regs
  Flush(FLUSH_END_BLOCK);
  EndAndLinkBlock(newpc, do_event_test, false);
}

void CPU::ARM32Recompiler::EndBlockWithException(Exception excode)
{
  // flush regs, but not pc, it's going to get overwritten
  // flush cycles because of the GTE instruction stuff...
  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // TODO: flush load delay

  EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
                                                             inst->cop.cop_n));
  EmitMov(RARG2, m_current_instruction_pc);
  if (excode != Exception::BP)
  {
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
  }
  else
  {
    EmitMov(RARG3, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
  }

  m_dirty_pc = false;

  EndAndLinkBlock(std::nullopt, true, false);
}
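// EndAndLinkBlock() emits the block epilogue piecemeal: each load/add/compare/store below
// is emitted only when it is actually needed (cycles to add, a GTE completion tick to
// update, an event test to run), so simple blocks pay for as few instructions as
// possible.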
void CPU::ARM32Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
{
  // event test
  // pc should've been flushed
  DebugAssert(!m_dirty_pc && !m_block_ended);
  m_block_ended = true;

  // TODO: try extracting this to a function

  // save cycles for event test
  const TickCount cycles = std::exchange(m_cycles, 0);

  // pending_ticks += cycles
  // if (pending_ticks >= downcount) { dispatch_event(); }
  if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
  if (do_event_test)
    armAsm->ldr(RARG2, PTR(&g_state.downcount));
  if (cycles > 0)
    armAsm->add(RARG1, RARG1, armCheckAddSubConstant(cycles));
  if (m_gte_done_cycle > cycles)
  {
    armAsm->add(RARG2, RARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
    armAsm->str(RARG2, PTR(&g_state.gte_completion_tick));
  }
  if (do_event_test)
    armAsm->cmp(RARG1, RARG2);
  if (cycles > 0)
    armAsm->str(RARG1, PTR(&g_state.pending_ticks));
  if (do_event_test)
    armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);

  // jump to dispatcher or next block
  if (force_run_events)
  {
    armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
  }
  else if (!newpc.has_value())
  {
    armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
  }
  else
  {
    const void* target = (newpc.value() == m_block->pc) ?
                           CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
                                                          armAsm->GetBuffer()->GetStartAddress<const void*>()) :
                           CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
    armEmitJmp(armAsm, target, true);
  }
}

const void* CPU::ARM32Recompiler::EndCompile(u32* code_size, u32* far_code_size)
{
#ifdef VIXL_DEBUG
  m_emitter_check.reset();
  m_far_emitter_check.reset();
#endif

  m_emitter.FinalizeCode();
  m_far_emitter.FinalizeCode();

  u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
  *code_size = static_cast<u32>(m_emitter.GetCursorOffset());
  *far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
  armAsm = nullptr;
  return code;
}

const char* CPU::ARM32Recompiler::GetHostRegName(u32 reg) const
{
  static constexpr std::array<const char*, 16> reg_names = {
    {"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc"}};
  return (reg < reg_names.size()) ? reg_names[reg] : "UNKNOWN";
}

void CPU::ARM32Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
{
  EmitMov(Register(reg), val);
}

void CPU::ARM32Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
{
  armAsm->ldr(Register(reg), PTR(ptr));
}

void CPU::ARM32Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
{
  armAsm->str(Register(reg), PTR(ptr));
}

void CPU::ARM32Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
{
  EmitMov(RSCRATCH, val);
  armAsm->str(RSCRATCH, PTR(ptr));
}

void CPU::ARM32Recompiler::CopyHostReg(u32 dst, u32 src)
{
  if (src != dst)
    armAsm->mov(Register(dst), Register(src));
}

void CPU::ARM32Recompiler::AssertRegOrConstS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s || cf.const_s);
}

void CPU::ARM32Recompiler::AssertRegOrConstT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t || cf.const_t);
}

vixl::aarch32::MemOperand CPU::ARM32Recompiler::MipsPtr(Reg r) const
{
  DebugAssert(r < Reg::count);
  return PTR(&g_state.regs.r[static_cast<u32>(r)]);
}

vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegD(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_d);
  return Register(cf.host_d);
}

vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s);
  return Register(cf.host_s);
}

vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t);
  return Register(cf.host_t);
}

vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegLO(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_lo);
  return Register(cf.host_lo);
}

vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegHI(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_hi);
  return Register(cf.host_hi);
}
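// The fastmem base lives in RMEMBASE (r3) and is loaded lazily: the register is only
// claimed from the allocator (and g_state.fastmem_base read) the first time a block
// actually needs it.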
vixl::aarch32::Register CPU::ARM32Recompiler::GetMembaseReg()
{
  const u32 code = RMEMBASE.GetCode();
  if (!IsHostRegAllocated(code))
  {
    // Leave usable unset, so we don't try to allocate it later.
    m_host_regs[code].type = HR_TYPE_MEMBASE;
    m_host_regs[code].flags = HR_ALLOCATED;
    armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
  }

  return RMEMBASE;
}

void CPU::ARM32Recompiler::MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
{
  if (cf.valid_host_s)
  {
    if (cf.host_s != dst.GetCode())
      armAsm->mov(dst, Register(cf.host_s));
  }
  else if (cf.const_s)
  {
    const u32 cv = GetConstantRegU32(cf.MipsS());
    EmitMov(dst, cv);
  }
  else
  {
    WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
    armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
  }
}

void CPU::ARM32Recompiler::MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
{
  if (cf.valid_host_t)
  {
    if (cf.host_t != dst.GetCode())
      armAsm->mov(dst, Register(cf.host_t));
  }
  else if (cf.const_t)
  {
    const u32 cv = GetConstantRegU32(cf.MipsT());
    EmitMov(dst, cv);
  }
  else
  {
    WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
    armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
  }
}

void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg)
{
  DebugAssert(reg < Reg::count);
  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
    armAsm->mov(dst, Register(hreg.value()));
  else if (HasConstantReg(reg))
    EmitMov(dst, GetConstantRegU32(reg));
  else
    armAsm->ldr(dst, MipsPtr(reg));
}

void CPU::ARM32Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
                                                        Reg arg3reg /* = Reg::count */)
{
  DebugAssert(g_settings.gpu_pgxp_enable);

  Flush(FLUSH_FOR_C_CALL);

  if (arg2reg != Reg::count)
    MoveMIPSRegToReg(RARG2, arg2reg);
  if (arg3reg != Reg::count)
    MoveMIPSRegToReg(RARG3, arg3reg);

  EmitMov(RARG1, arg1val);
  EmitCall(func);
}
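// Flush() below writes back whichever pieces of compile-time state the FLUSH_* flags
// select: the PC, the raw instruction bits for fallbacks, load-delay slots, GTE
// completion ticks, and accumulated cycles. Everything it stores mirrors a field of
// g_state, so the interpreter and recompiler observe the same machine state.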
void CPU::ARM32Recompiler::Flush(u32 flags)
{
  Recompiler::Flush(flags);

  if (flags & FLUSH_PC && m_dirty_pc)
  {
    StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
    m_dirty_pc = false;
  }

  if (flags & FLUSH_INSTRUCTION_BITS)
  {
    // This sucks, but it's only used for fallbacks.
    EmitMov(RARG1, inst->bits);
    EmitMov(RARG2, m_current_instruction_pc);
    EmitMov(RARG3, m_current_instruction_branch_delay_slot);
    armAsm->str(RARG1, PTR(&g_state.current_instruction.bits));
    armAsm->str(RARG2, PTR(&g_state.current_instruction_pc));
    armAsm->strb(RARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
  }

  if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
  {
    // This sucks :(
    // TODO: make it a function?
    armAsm->ldrb(RARG1, PTR(&g_state.load_delay_reg));
    armAsm->ldr(RARG2, PTR(&g_state.load_delay_value));
    EmitMov(RSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
    armAsm->add(RARG1, RSCRATCH, vixl::aarch32::Operand(RARG1, LSL, 2));
    armAsm->str(RARG2, MemOperand(RSTATE, RARG1));
    EmitMov(RSCRATCH, static_cast<u8>(Reg::count));
    armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
    m_load_delay_dirty = false;
  }

  if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
  {
    if (m_load_delay_value_register != NUM_HOST_REGS)
      FreeHostReg(m_load_delay_value_register);

    EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));
    armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
    m_load_delay_register = Reg::count;
    m_load_delay_dirty = true;
  }

  if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
  {
    // May as well flush cycles while we're here.
    // GTE spanning blocks is very rare, we _could_ disable this for speed.
    armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RARG2, PTR(&g_state.gte_completion_tick));
    if (m_cycles > 0)
    {
      armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
      m_cycles = 0;
    }
    armAsm->cmp(RARG2, RARG1);
    armAsm->mov(hs, RARG1, RARG2);
    armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    m_dirty_gte_done_cycle = false;
  }

  if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
  {
    armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));

    // update cycles at the same time
    if (flags & FLUSH_CYCLES && m_cycles > 0)
    {
      armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
      armAsm->str(RARG1, PTR(&g_state.pending_ticks));
      m_gte_done_cycle -= m_cycles;
      m_cycles = 0;
    }

    armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_gte_done_cycle));
    armAsm->str(RARG1, PTR(&g_state.gte_completion_tick));
    m_gte_done_cycle = 0;
    m_dirty_gte_done_cycle = true;
  }

  if (flags & FLUSH_CYCLES && m_cycles > 0)
  {
    armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
    armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    m_cycles = 0;
  }
}

void CPU::ARM32Recompiler::Compile_Fallback()
{
  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
              inst->bits);

  Flush(FLUSH_FOR_INTERPRETER);

  EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));

  // TODO: make me less garbage
  // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
  // but nothing should be going through here..
  Label no_load_delay;
  armAsm->ldrb(RARG1, PTR(&g_state.next_load_delay_reg));
  armAsm->cmp(RARG1, static_cast<u8>(Reg::count));
  armAsm->b(eq, &no_load_delay);
  armAsm->ldr(RARG2, PTR(&g_state.next_load_delay_value));
  armAsm->strb(RARG1, PTR(&g_state.load_delay_reg));
  armAsm->str(RARG2, PTR(&g_state.load_delay_value));
  EmitMov(RARG1, static_cast<u32>(Reg::count));
  armAsm->strb(RARG1, PTR(&g_state.next_load_delay_reg));
  armAsm->bind(&no_load_delay);

  m_load_delay_dirty = EMULATE_LOAD_DELAYS;
}

void CPU::ARM32Recompiler::CheckBranchTarget(const vixl::aarch32::Register& pcreg)
{
  if (!g_settings.cpu_recompiler_memory_exceptions)
    return;

  armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
  SwitchToFarCode(true, ne);

  BackupHostState();
  EndBlockWithException(Exception::AdEL);

  RestoreHostState();
  SwitchToNearCode(false);
}

void CPU::ARM32Recompiler::Compile_jr(CompileFlags cf)
{
  const Register pcreg = CFGetRegS(cf);
  CheckBranchTarget(pcreg);

  armAsm->str(pcreg, PTR(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::ARM32Recompiler::Compile_jalr(CompileFlags cf)
{
  const Register pcreg = CFGetRegS(cf);
  if (MipsD() != Reg::zero)
    SetConstantReg(MipsD(), GetBranchReturnAddress(cf));

  CheckBranchTarget(pcreg);
  armAsm->str(pcreg, PTR(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}
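// Compile_bxx() compiles both outcomes of a conditional branch: the fall-through and
// taken paths each get their own delay-slot copy (unless the slot was swapped ahead of
// the branch) and their own EndBlock(), so either side can be linked directly to its
// successor block.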
void CPU::ARM32Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
{
  AssertRegOrConstS(cf);

  const u32 taken_pc = GetConditionalBranchTarget(cf);

  Flush(FLUSH_FOR_BRANCH);

  DebugAssert(cf.valid_host_s);

  // MipsT() here should equal zero for zero branches.
  DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);

  Label taken;
  const Register rs = CFGetRegS(cf);
  switch (cond)
  {
    case BranchCondition::Equal:
    case BranchCondition::NotEqual:
    {
      AssertRegOrConstT(cf);
      if (cf.valid_host_t)
        armAsm->cmp(rs, CFGetRegT(cf));
      else if (cf.const_t)
        armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));

      armAsm->b((cond == BranchCondition::Equal) ? eq : ne, &taken);
    }
    break;

    case BranchCondition::GreaterThanZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(gt, &taken);
    }
    break;

    case BranchCondition::GreaterEqualZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(ge, &taken);
    }
    break;

    case BranchCondition::LessThanZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(lt, &taken);
    }
    break;

    case BranchCondition::LessEqualZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(le, &taken);
    }
    break;
  }

  BackupHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(m_compiler_pc, true);

  armAsm->bind(&taken);

  RestoreHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(taken_pc, true);
}

void CPU::ARM32Recompiler::Compile_addi(CompileFlags cf, bool overflow)
{
  const Register rs = CFGetRegS(cf);
  const Register rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_sext32(); imm != 0)
  {
    if (!overflow)
    {
      armAsm->add(rt, rs, armCheckAddSubConstant(imm));
    }
    else
    {
      armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
      TestOverflow(rt);
    }
  }
  else if (rt.GetCode() != rs.GetCode())
  {
    armAsm->mov(rt, rs);
  }
}

void CPU::ARM32Recompiler::Compile_addi(CompileFlags cf)
{
  Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::ARM32Recompiler::Compile_addiu(CompileFlags cf)
{
  Compile_addi(cf, false);
}

void CPU::ARM32Recompiler::Compile_slti(CompileFlags cf)
{
  Compile_slti(cf, true);
}

void CPU::ARM32Recompiler::Compile_sltiu(CompileFlags cf)
{
  Compile_slti(cf, false);
}

void CPU::ARM32Recompiler::Compile_slti(CompileFlags cf, bool sign)
{
  const Register rs = CFGetRegS(cf);
  const Register rt = CFGetRegT(cf);
  armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
  armAsm->mov(sign ? ge : hs, rt, 0);
  armAsm->mov(sign ? lt : lo, rt, 1);
}
void CPU::ARM32Recompiler::Compile_andi(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
  else
    EmitMov(rt, 0);
}

void CPU::ARM32Recompiler::Compile_ori(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  const Register rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
  else if (rt.GetCode() != rs.GetCode())
    armAsm->mov(rt, rs);
}

void CPU::ARM32Recompiler::Compile_xori(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  const Register rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
  else if (rt.GetCode() != rs.GetCode())
    armAsm->mov(rt, rs);
}

void CPU::ARM32Recompiler::Compile_shift(CompileFlags cf,
                                         void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
                                                                              vixl::aarch32::Register, const Operand&))
{
  const Register rd = CFGetRegD(cf);
  const Register rt = CFGetRegT(cf);
  if (inst->r.shamt > 0)
    (armAsm->*op)(rd, rt, inst->r.shamt.GetValue());
  else if (rd.GetCode() != rt.GetCode())
    armAsm->mov(rd, rt);
}

void CPU::ARM32Recompiler::Compile_sll(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::lsl);
}

void CPU::ARM32Recompiler::Compile_srl(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::lsr);
}

void CPU::ARM32Recompiler::Compile_sra(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::asr);
}

void CPU::ARM32Recompiler::Compile_variable_shift(CompileFlags cf,
                                                  void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
                                                                                       vixl::aarch32::Register,
                                                                                       const Operand&))
{
  const Register rd = CFGetRegD(cf);

  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  if (cf.const_s)
  {
    if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
      (armAsm->*op)(rd, rt, shift & 0x1Fu);
    else if (rd.GetCode() != rt.GetCode())
      armAsm->mov(rd, rt);
  }
  else
  {
    armAsm->and_(RSCRATCH, CFGetRegS(cf), 0x1Fu);
    (armAsm->*op)(rd, rt, RSCRATCH);
  }
}

void CPU::ARM32Recompiler::Compile_sllv(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::lsl);
}

void CPU::ARM32Recompiler::Compile_srlv(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::lsr);
}

void CPU::ARM32Recompiler::Compile_srav(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::asr);
}

void CPU::ARM32Recompiler::Compile_mult(CompileFlags cf, bool sign)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  // TODO: if lo/hi gets killed, we can use a 32-bit multiply
  const Register lo = CFGetRegLO(cf);
  const Register hi = CFGetRegHI(cf);

  (sign) ? armAsm->smull(lo, hi, rs, rt) : armAsm->umull(lo, hi, rs, rt);
}

void CPU::ARM32Recompiler::Compile_mult(CompileFlags cf)
{
  Compile_mult(cf, true);
}

void CPU::ARM32Recompiler::Compile_multu(CompileFlags cf)
{
  Compile_mult(cf, false);
}
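// MIPS DIV semantics for the two undefined cases are reproduced explicitly below:
// divide-by-zero yields lo = (rs >= 0 ? -1 : 1) with hi = rs, and the signed overflow
// case 0x80000000 / -1 yields lo = 0x80000000, hi = 0. Otherwise sdiv computes the
// quotient and mls derives the remainder (hi = rs - lo * rt).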
void CPU::ARM32Recompiler::Compile_div(CompileFlags cf)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Register rlo = CFGetRegLO(cf);
  const Register rhi = CFGetRegHI(cf);

  // TODO: This could be slightly more optimal
  Label done;
  Label not_divide_by_zero;
  armAsm->cmp(rt, 0);
  armAsm->b(ne, &not_divide_by_zero);
  armAsm->mov(rhi, rs); // hi = num
  EmitMov(rlo, 1);
  EmitMov(RSCRATCH, static_cast<u32>(-1));
  armAsm->cmp(rs, 0);
  armAsm->mov(ge, rlo, RSCRATCH); // lo = s >= 0 ? -1 : 1
  armAsm->b(&done);

  armAsm->bind(&not_divide_by_zero);
  Label not_unrepresentable;
  armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
  armAsm->b(ne, &not_unrepresentable);
  armAsm->cmp(rt, armCheckCompareConstant(-1));
  armAsm->b(ne, &not_unrepresentable);

  EmitMov(rlo, 0x80000000u);
  EmitMov(rhi, 0);
  armAsm->b(&done);

  armAsm->bind(&not_unrepresentable);

  armAsm->sdiv(rlo, rs, rt);

  // TODO: skip when hi is dead
  armAsm->mls(rhi, rlo, rt, rs);

  armAsm->bind(&done);
}

void CPU::ARM32Recompiler::Compile_divu(CompileFlags cf)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Register rlo = CFGetRegLO(cf);
  const Register rhi = CFGetRegHI(cf);

  Label done;
  Label not_divide_by_zero;
  armAsm->cmp(rt, 0);
  armAsm->b(ne, &not_divide_by_zero);
  EmitMov(rlo, static_cast<u32>(-1));
  armAsm->mov(rhi, rs);
  armAsm->b(&done);

  armAsm->bind(&not_divide_by_zero);

  armAsm->udiv(rlo, rs, rt);

  // TODO: skip when hi is dead
  armAsm->mls(rhi, rlo, rt, rs);

  armAsm->bind(&done);
}

void CPU::ARM32Recompiler::TestOverflow(const vixl::aarch32::Register& result)
{
  SwitchToFarCode(true, vs);

  BackupHostState();

  // toss the result
  ClearHostReg(result.GetCode());

  EndBlockWithException(Exception::Ov);

  RestoreHostState();

  SwitchToNearCode(false);
}
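// Compile_dst_op() centralizes the three-operand ALU forms: register/register, a folded
// constant on either side when the operation is commutative, or a constant materialized
// into RSCRATCH otherwise. A constant of zero degenerates to a plain register move, in
// which case the overflow test is skipped too.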
void CPU::ARM32Recompiler::Compile_dst_op(CompileFlags cf,
                                          void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
                                                                               vixl::aarch32::Register,
                                                                               const Operand&),
                                          bool commutative, bool logical, bool overflow)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rd = CFGetRegD(cf);
  if (cf.valid_host_s && cf.valid_host_t)
  {
    (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
  }
  else if (commutative && (cf.const_s || cf.const_t))
  {
    const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != src.GetCode())
        armAsm->mov(rd, src);
      overflow = false;
    }
  }
  else if (cf.const_s)
  {
    EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));
    (armAsm->*op)(rd, RSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    const Register rs = CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != rs.GetCode())
        armAsm->mov(rd, rs);
      overflow = false;
    }
  }

  if (overflow)
    TestOverflow(rd);
}

void CPU::ARM32Recompiler::Compile_add(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::adds, true, false, true);
  else
    Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM32Recompiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM32Recompiler::Compile_sub(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::subs, false, false, true);
  else
    Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM32Recompiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM32Recompiler::Compile_and(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // special cases - and with self -> self, and with 0 -> 0
  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    armAsm->mov(regd, CFGetRegS(cf));
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    EmitMov(regd, 0);
    return;
  }

  Compile_dst_op(cf, &Assembler::and_, true, true, false);
}

void CPU::ARM32Recompiler::Compile_or(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // or/nor with 0 -> no effect
  const Register regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::orr, true, true, false);
}
void CPU::ARM32Recompiler::Compile_xor(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    EmitMov(regd, 0);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::eor, true, true, false);
}

void CPU::ARM32Recompiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
}

void CPU::ARM32Recompiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::ARM32Recompiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::ARM32Recompiler::Compile_slt(CompileFlags cf, bool sign)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // TODO: swap and reverse op for constants
  if (cf.const_s)
  {
    EmitMov(RSCRATCH, GetConstantRegS32(cf.MipsS()));
    armAsm->cmp(RSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
  }
  else
  {
    armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
  }

  const Register rd = CFGetRegD(cf);
  armAsm->mov(sign ? ge : cs, rd, 0);
  armAsm->mov(sign ? lt : lo, rd, 1);
}

vixl::aarch32::Register
CPU::ARM32Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
                                                 const std::optional<const vixl::aarch32::Register>& reg)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const Register dst = reg.has_value() ? reg.value() : RARG1;
  if (address.has_value())
  {
    EmitMov(dst, address.value());
  }
  else if (imm == 0)
  {
    if (cf.valid_host_s)
    {
      if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
        armAsm->mov(dst, CFGetRegS(cf));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
    }
  }
  else
  {
    if (cf.valid_host_s)
    {
      armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
      armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
  }

  return dst;
}
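// Fastmem here is LUT-based: GetMembaseReg() holds a pointer to a per-page table of host
// pointers, the guest address is shifted right by Bus::FASTMEM_LUT_PAGE_SHIFT to index
// it, and the access becomes a single ldr/str of "LUT entry + guest address". Each
// fastmem access is registered with AddLoadStoreInfo() so the fault handler can find and
// backpatch it if the page turns out not to be directly mapped.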
template<typename RegAllocFn>
vixl::aarch32::Register CPU::ARM32Recompiler::GenerateLoad(const vixl::aarch32::Register& addr_reg,
                                                           MemoryAccessSize size, bool sign, bool use_fastmem,
                                                           const RegAllocFn& dst_reg_alloc)
{
  if (use_fastmem)
  {
    DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
    m_cycles += Bus::RAM_READ_TICKS;

    const Register dst = dst_reg_alloc();
    const Register membase = GetMembaseReg();
    DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
    armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
    armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));

    const MemOperand mem = MemOperand(RARG3, addr_reg);
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
        break;

      case MemoryAccessSize::HalfWord:
        sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->ldr(dst, mem);
        break;
    }

    AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
    return dst;
  }

  if (addr_reg.GetCode() != RARG1.GetCode())
    armAsm->mov(RARG1, addr_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfBitSet(RRETHI, 31);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->rsb(temp, RRETHI, 0);
    armAsm->lsl(temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    armAsm->orr(RARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const Register dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? armAsm->sxtb(dst_reg, RRET) : armAsm->uxtb(dst_reg, RRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? armAsm->sxth(dst_reg, RRET) : armAsm->uxth(dst_reg, RRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      if (dst_reg.GetCode() != RRET.GetCode())
        armAsm->mov(dst_reg, RRET);
    }
    break;
  }

  return dst_reg;
}
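// Note on the checked (non-fastmem) thunks used above and below: loads return the value
// in RRET, with a negative RRETHI signalling an exception (hence the bit-31 test and the
// rsb to recover the exception code), while stores return zero in RRET on success. The
// far-code paths rebuild the CAUSE bits from that code and raise the exception.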
void CPU::ARM32Recompiler::GenerateStore(const vixl::aarch32::Register& addr_reg,
                                         const vixl::aarch32::Register& value_reg, MemoryAccessSize size,
                                         bool use_fastmem)
{
  if (use_fastmem)
  {
    DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
    DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
    const Register membase = GetMembaseReg();
    armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
    armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));

    const MemOperand mem = MemOperand(RARG3, addr_reg);
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        armAsm->strb(value_reg, mem);
        break;

      case MemoryAccessSize::HalfWord:
        armAsm->strh(value_reg, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->str(value_reg, mem);
        break;
    }
    AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
    return;
  }

  if (addr_reg.GetCode() != RARG1.GetCode())
    armAsm->mov(RARG1, addr_reg);
  if (value_reg.GetCode() != RARG2.GetCode())
    armAsm->mov(RARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->lsl(temp, RRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    armAsm->orr(RARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}
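// With PGXP enabled, the address has to outlive the memory access so it can be handed to
// the PGXP hook afterwards, which is why it gets a callee-saved temporary here; with PGXP
// off, the address register is free to be clobbered by the load itself.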
void CPU::ARM32Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
                                             std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                             std::optional<Register>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
    if (cf.MipsT() == Reg::zero)
      return RRET;

    return Register(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                    EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
                                    cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RARG1, inst->bits);
    armAsm->mov(RARG2, addr);
    armAsm->mov(RARG3, data);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);
  armAsm->bic(RARG1, addr, 3);
  GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.GetCode());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush
  const Reg rt = inst->r.rt;
  Register value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = Register(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = Register(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        armAsm->mov(value, Register(rtreg.value()));
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        armAsm->ldr(value, MipsPtr(rt));
    }
    else
    {
      value = Register(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
  armAsm->and_(RARG2, addr, 3);
  armAsm->lsl(RARG2, RARG2, 3); // *8
  EmitMov(RARG3, 24);
  armAsm->sub(RARG3, RARG3, RARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    EmitMov(RSCRATCH, 0xFFFFFFu);
    armAsm->lsr(RSCRATCH, RSCRATCH, RARG2);
    armAsm->and_(value, value, RSCRATCH);
    armAsm->lsl(RRET, RRET, RARG3);
    armAsm->orr(value, value, RRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    armAsm->lsr(RRET, RRET, RARG2);
    EmitMov(RSCRATCH, 0xFFFFFF00u);
    armAsm->lsl(RSCRATCH, RSCRATCH, RARG3);
    armAsm->and_(value, value, RSCRATCH);
    armAsm->orr(value, value, RRET);
  }

  FreeHostReg(addr.GetCode());

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, value);
    armAsm->bic(RARG2, addr, 3);
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
  }
}
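// lwc2/swc2 transfer between memory and GTE registers. GetGTERegisterPointer() classifies
// each register: most are plain 32-bit stores, some need sign/zero extension, SXYP pushes
// the screen-XY FIFO, and a few must go through the C++ handler (GTE::WriteRegister).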
void CPU::ARM32Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
                                             std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                             std::optional<Register>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      armAsm->str(value, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      armAsm->sxth(RARG3, value);
      armAsm->str(RARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      armAsm->uxth(RARG3, value);
      armAsm->str(RARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      armAsm->mov(RARG2, value);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
      armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
      armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
      armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, value);
    if (value.GetCode() != RRET.GetCode())
      FreeHostReg(value.GetCode());
    armAsm->mov(RARG2, addr);
    FreeHostReg(addr_reg.value().GetCode());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

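// Comment-only sketch of the PushFIFO case above, as plain C over the state struct:
//
//   g_state.gte_regs.SXY0[0] = g_state.gte_regs.SXY1[0]; // SXY0 <- SXY1
//   g_state.gte_regs.SXY1[0] = g_state.gte_regs.SXY2[0]; // SXY1 <- SXY2
//   g_state.gte_regs.SXY2[0] = value;                    // SXY2 <- SXYP (the loaded word)
//
// Both loads are emitted before any store so SXY1 isn't overwritten too early.
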
void CPU::ARM32Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
                                             std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                             std::optional<Register>();
  FlushForLoadStore(address, true, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RARG3, cf.MipsT());
    armAsm->mov(RARG2, addr);
    EmitMov(RARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
  const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
  if (g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(value, inst->r.rt);

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);
  armAsm->bic(RARG1, addr, 3);
  GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });

  armAsm->and_(RSCRATCH, addr, 3);
  armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
  armAsm->bic(addr, addr, 3);

  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
  if (!g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(value, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RARG3, 0xFFFFFF00u);
    armAsm->lsl(RARG3, RARG3, RSCRATCH);
    armAsm->and_(RRET, RRET, RARG3);

    EmitMov(RARG3, 24);
    armAsm->sub(RARG3, RARG3, RSCRATCH);
    armAsm->lsr(value, value, RARG3);
    armAsm->orr(value, value, RRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    armAsm->lsl(value, value, RSCRATCH);

    EmitMov(RARG3, 24);
    armAsm->sub(RARG3, RARG3, RSCRATCH);
    EmitMov(RSCRATCH, 0x00FFFFFFu);
    armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
    armAsm->and_(RRET, RRET, RSCRATCH);
    armAsm->orr(value, value, RRET);
  }

  if (!g_settings.gpu_pgxp_enable)
  {
    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
    FreeHostReg(addr.GetCode());
  }
  else
  {
    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);

    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, value);
    FreeHostReg(value.GetCode());
    armAsm->mov(RARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
  }
}

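// Comment-only worked example of the swl merge above (formulas restated from the
// inline comments): with (addr & 3) == 1, shift = 8, so
//   mem_mask = 0xFFFFFF00 << 8      = 0xFFFF0000
//   new      = (mem & 0xFFFF0000) | (rt >> 16)
// i.e. only the two low bytes of the aligned memory word are updated. swr mirrors
// this with mem_mask = 0x00FFFFFF >> (24 - shift) and value << shift.
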
void CPU::ARM32Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RARG1;
  const Register data = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      armAsm->ldr(data, PTR(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      armAsm->mov(data, RRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);
  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr.GetCode() != RARG1.GetCode())
      FreeHostReg(addr.GetCode());
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, data);
    FreeHostReg(data.GetCode());
    armAsm->mov(RARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

void CPU::ARM32Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Register new_value = RARG1;
  const Register old_value = RARG2;
  const Register changed_bits = RARG3;
  const Register mask_reg = RSCRATCH;

  // Load old value
  armAsm->ldr(old_value, PTR(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    armAsm->eor(changed_bits, old_value, new_value);
  armAsm->bic(old_value, old_value, mask_reg);
  armAsm->orr(new_value, old_value, new_value);
  armAsm->str(new_value, PTR(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
    armAsm->tst(changed_bits, 1u << 16);
    armAsm->b(eq, &caches_unchanged);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    armAsm->ldr(RARG1, PTR(ptr)); // reload value for interrupt test below
    armAsm->bind(&caches_unchanged);

    // might need to reload fastmem base too
    if (CodeCache::IsUsingFastmem() && m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions) &&
        IsHostRegAllocated(RMEMBASE.GetCode()))
    {
      FreeHostReg(RMEMBASE.GetCode());
    }

    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

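// Comment-only C equivalent of the masked COP0 write above (`t_value` is the value
// of rt; names are illustrative):
//
//   const u32 old_bits = *ptr;
//   const u32 new_bits = (old_bits & ~mask) | (t_value & mask);
//   *ptr = new_bits;
//
// For SR, whose write mask covers bit 16 (cache isolation), the changed-bits test
// reduces to ((old_bits ^ new_bits) & (1u << 16)) != 0, which gates the
// UpdateMemoryPointers() call.
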
void CPU::ARM32Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
  armAsm->bic(RARG2, RARG1, 15);
  armAsm->ubfx(RARG1, RARG1, 2, 4);
  armAsm->orr(RARG1, RARG1, RARG2);
  armAsm->str(RARG1, PTR(&g_state.cop0_regs.sr.bits));

  TestInterrupts(RARG1);
}

void CPU::ARM32Recompiler::TestInterrupts(const vixl::aarch32::Register& sr)
{
  // if IEc == 0 then goto no_interrupt
  Label no_interrupt;
  armAsm->tst(sr, 1);
  armAsm->b(eq, &no_interrupt);

  // sr & cause
  armAsm->ldr(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
  armAsm->and_(sr, sr, RSCRATCH);

  // if ((sr & cause) & 0xff00) == 0 then goto no_interrupt
  armAsm->tst(sr, 0xFF00);

  SwitchToFarCode(true, ne);
  BackupHostState();

  // Update load delay; this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                               (inst + 1)->cop.cop_n));
    EmitMov(RARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    EmitMov(RARG1, 0);
    if (m_dirty_pc)
      EmitMov(RARG2, m_compiler_pc);
    armAsm->str(RARG1, PTR(&g_state.downcount));
    if (m_dirty_pc)
      armAsm->str(RARG2, PTR(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  armAsm->bind(&no_interrupt);
}

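// Comment-only sketch of the interrupt condition tested above:
//
//   const u32 sr = g_state.cop0_regs.sr.bits;
//   if ((sr & 1u) != 0 &&                                    // SR.IEc: interrupts enabled
//       (sr & g_state.cop0_regs.cause.bits & 0xFF00u) != 0)  // enabled & pending IP bits overlap
//     /* take Exception::INT via the far-code path */;
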
void CPU::ARM32Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->ldr(Register(hreg), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->mov(Register(hreg), RRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, inst->bits);
    armAsm->mov(RARG2, Register(hreg));
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::ARM32Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      armAsm->str(CFGetRegT(cf), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? armAsm->sxth(RARG1, CFGetRegT(cf)) : armAsm->uxth(RARG1, CFGetRegT(cf));
      armAsm->str(RARG1, PTR(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    MoveTToReg(RARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RRET.GetCode() != RARG2.GetCode() && RRET.GetCode() != RARG3.GetCode());
    armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
    armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
    armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
    armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::ARM32Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

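// Comment-only overview of CompileLoadStoreThunk below (the slow path a faulting
// fastmem access is redirected to): it pushes the live caller-saved registers,
// moves address/data into RARG1/RARG2, adds cycles_to_add to pending_ticks
// (reloaded from memory, since writes can trigger DMA), calls the unchecked memory
// thunk for the access size, sign-/zero-extends a loaded result into the original
// destination register, subtracts cycles_to_remove, pops the saved registers, and
// jumps back to code_address + code_size.
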
u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size,
                                           bool is_signed, bool is_load)
{
  Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
  Assembler* armAsm = &arm_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  // save regs
  RegisterList save_regs;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      save_regs.Combine(RegisterList(Register(i)));
  }

  if (!save_regs.IsEmpty())
    armAsm->push(save_regs);

  if (address_register != static_cast<u8>(RARG1.GetCode()))
    armAsm->mov(RARG1, Register(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RARG2.GetCode()))
      armAsm->mov(RARG2, Register(data_register));
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
    if (!ImmediateA32::IsImmediateA32(cycles_to_add))
    {
      armEmitMov(armAsm, RSCRATCH, cycles_to_add);
      armAsm->add(RARG3, RARG3, RSCRATCH);
    }
    else
    {
      armAsm->add(RARG3, RARG3, cycles_to_add);
    }

    armAsm->str(RARG3, PTR(&g_state.pending_ticks));
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte) :
                            reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte),
                  false);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord) :
                            reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord),
                  false);
    }
    break;
    case MemoryAccessSize::Word:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord) :
                            reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord),
                  false);
    }
    break;
  }

  if (is_load)
  {
    const Register dst = Register(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? armAsm->sxtb(dst, RRET) : armAsm->uxtb(dst, RRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? armAsm->sxth(dst, RRET) : armAsm->uxth(dst, RRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst.GetCode() != RRET.GetCode())
          armAsm->mov(dst, RRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
    if (!ImmediateA32::IsImmediateA32(cycles_to_remove))
    {
      armEmitMov(armAsm, RSCRATCH, cycles_to_remove);
      armAsm->sub(RARG3, RARG3, RSCRATCH);
    }
    else
    {
      armAsm->sub(RARG3, RARG3, cycles_to_remove);
    }
    armAsm->str(RARG3, PTR(&g_state.pending_ticks));
  }

  // restore regs
  if (!save_regs.IsEmpty())
    armAsm->pop(save_regs);

  armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
  armAsm->FinalizeCode();

  return static_cast<u32>(armAsm->GetCursorOffset());
}

#endif // CPU_ARCH_ARM32