// Path: blob/master/src/core/cpu_recompiler_loongarch64.cpp
// (scrape metadata: 10595 views)
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]> and contributors.1// SPDX-License-Identifier: CC-BY-NC-ND-4.023#include "cpu_recompiler_loongarch64.h"4#include "cpu_code_cache_private.h"5#include "cpu_core_private.h"6#include "cpu_pgxp.h"7#include "gte.h"8#include "settings.h"9#include "timing_event.h"1011#include "common/align.h"12#include "common/assert.h"13#include "common/log.h"14#include "common/memmap.h"15#include "common/string_util.h"1617#include <limits>1819#ifdef CPU_ARCH_LOONGARCH642021LOG_CHANNEL(Recompiler);2223#define OFFS(x) ((u32)(((u8*)(x)) - ((u8*)&g_state)))2425static constexpr u32 BLOCK_LINK_SIZE = 8; // pcaddu18i + jirl2627#define RRET LA_A028#define RARG1 LA_A029#define RARG2 LA_A130#define RARG3 LA_A231#define RSCRATCH LA_T832#define RSTATE LA_S733#define RMEMBASE LA_S83435static bool laIsCallerSavedRegister(u32 id);36static bool laIsValidSImm12(u32 imm);37static bool laIsValidUImm12(u32 imm);38static std::pair<s32, s32> laGetAddressImmediates12(const void* cur, const void* target);39static void laMoveAddressToReg(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr);40static void laEmitMov(lagoon_assembler_t* laAsm, la_gpr_t rd, u32 imm);41static void laEmitMov64(lagoon_assembler_t* laAsm, la_gpr_t rd, u64 imm);42static u32 laEmitJmp(lagoon_assembler_t* laAsm, const void* ptr, la_gpr_t link_reg = LA_ZERO);43static u32 laEmitCall(lagoon_assembler_t* laAsm, const void* ptr);44static void laEmitFarLoad(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, bool sign_extend_word = false);45static void laEmitFarStore(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, la_gpr_t tempreg = RSCRATCH);46static void laEmitSExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word47static void laEmitUExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word48static void laEmitSExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word49static void 
laEmitUExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word50static void laEmitDSExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> doubleword51static void laEmitDUExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> doubleword5253namespace CPU {5455using namespace CPU;5657LoongArch64Recompiler s_instance;58Recompiler* g_compiler = &s_instance;5960} // namespace CPU6162bool laIsCallerSavedRegister(u32 id)63{64return id == 1 || (id >= 4 && id <= 20);65}6667bool laIsValidSImm12(u32 imm)68{69const s32 simm = static_cast<s32>(imm);70return (simm >= -2048 && simm <= 2047);71}7273bool laIsValidUImm12(u32 imm)74{75return (imm <= 4095);76}7778std::pair<s32, s32> laGetAddressImmediates12(const void* cur, const void* target)79{80const s64 disp = static_cast<s64>(reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(cur));81Assert(disp >= static_cast<s64>(std::numeric_limits<s32>::min()) &&82disp <= static_cast<s64>(std::numeric_limits<s32>::max()));8384const s64 hi = disp + 0x800;85const s64 lo = disp - (hi & 0xFFFFF000);86return std::make_pair(static_cast<s32>(hi >> 12), static_cast<s32>((lo << 52) >> 52));87}8889std::pair<s32, s32> laGetAddressImmediates18(const void* cur, const void* target)90{91const s64 disp = static_cast<s64>(reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(cur));92Assert(disp >= static_cast<s64>(std::numeric_limits<s32>::min()) &&93disp <= static_cast<s64>(std::numeric_limits<s32>::max()));9495const s64 hi = disp + 0x20000;96const s64 lo = disp - (hi & 0xFFFC0000);97return std::make_pair(static_cast<s32>(hi >> 18), static_cast<s32>((lo << 46) >> 46));98}99100void laMoveAddressToReg(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr)101{102const auto [hi, lo] = laGetAddressImmediates12(laAsm->cursor, addr);103la_pcaddu12i(laAsm, reg, hi);104la_addi_d(laAsm, reg, reg, lo);105}106107void laEmitMov(lagoon_assembler_t* laAsm, la_gpr_t rd, u32 imm)108{109la_load_immediate32(laAsm, 
rd, static_cast<s32>(imm));110}111112void laEmitMov64(lagoon_assembler_t* laAsm, la_gpr_t rd, u64 imm)113{114la_load_immediate64(laAsm, rd, static_cast<s64>(imm));115}116117u32 laEmitJmp(lagoon_assembler_t* laAsm, const void* ptr, la_gpr_t link_reg)118{119const auto [hi, lo] = laGetAddressImmediates18(laAsm->cursor, ptr);120la_pcaddu18i(laAsm, RSCRATCH, hi);121la_jirl(laAsm, link_reg, RSCRATCH, lo);122return 8;123}124125u32 laEmitCall(lagoon_assembler_t* laAsm, const void* ptr)126{127return laEmitJmp(laAsm, ptr, LA_RA);128}129130void laEmitFarLoad(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, bool sign_extend_word)131{132const auto [hi, lo] = laGetAddressImmediates12(laAsm->cursor, addr);133la_pcaddu12i(laAsm, reg, hi);134if (sign_extend_word)135la_ld_w(laAsm, reg, reg, lo);136else137la_ld_wu(laAsm, reg, reg, lo);138}139140[[maybe_unused]] void laEmitFarStore(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, la_gpr_t tempreg)141{142const auto [hi, lo] = laGetAddressImmediates12(laAsm->cursor, addr);143la_pcaddu12i(laAsm, tempreg, hi);144la_st_w(laAsm, reg, tempreg, lo);145}146147void laEmitSExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)148{149la_ext_w_b(laAsm, rd, rs);150}151152void laEmitUExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)153{154la_andi(laAsm, rd, rs, 0xFF);155}156157void laEmitSExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)158{159la_ext_w_h(laAsm, rd, rs);160}161162void laEmitUExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)163{164la_bstrpick_d(laAsm, rd, rs, 15, 0);165}166167void laEmitDSExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)168{169la_addi_w(laAsm, rd, rs, 0);170}171172void laEmitDUExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)173{174la_bstrpick_d(laAsm, rd, rs, 31, 0);175}176177void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)178{179#ifdef ENABLE_HOST_DISASSEMBLY180const u32* code = static_cast<const u32*>(start);181const u32 count 
= size / 4;182char buf[256];183for (u32 i = 0; i < count; i++)184{185lagoon_insn_t insn;186la_disasm_one(*(code + i), &insn);187la_insn_to_str(&insn, buf, sizeof(buf));188INFO_LOG("\t0x{:016X}\t{}", reinterpret_cast<uintptr_t>(code + i), buf);189}190#else191ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");192#endif193}194195u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)196{197#ifdef ENABLE_HOST_DISASSEMBLY198return size / 4;199#else200ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");201return size / 4;202#endif203}204205u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)206{207lagoon_assembler_t asm_obj;208lagoon_assembler_t* laAsm = &asm_obj;209la_init_assembler(laAsm, static_cast<u8*>(code), code_size);210211lagoon_label_t dispatch = {};212lagoon_label_t run_events_and_dispatch = {};213214g_enter_recompiler = reinterpret_cast<decltype(g_enter_recompiler)>(laAsm->cursor);215{216// TODO: reserve some space for saving caller-saved registers217218// Need the CPU state for basically everything :-)219laMoveAddressToReg(laAsm, RSTATE, &g_state);220// Fastmem setup221if (IsUsingFastmem())222la_ld_d(laAsm, RMEMBASE, RSTATE, OFFS(&g_state.fastmem_base));223224// Fall through to event dispatcher225}226227// check events then for frame done228{229lagoon_label_t skip_event_check = {};230la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));231la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.downcount));232la_bltu(laAsm, RARG1, RARG2, la_label(laAsm, &skip_event_check));233234la_bind(laAsm, &run_events_and_dispatch);235g_run_events_and_dispatch = laAsm->cursor;236laEmitCall(laAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents));237238la_bind(laAsm, &skip_event_check);239la_label_free(laAsm, &skip_event_check);240}241242// TODO: align?243g_dispatcher = laAsm->cursor;244{245la_bind(laAsm, &dispatch);246247// x9 <- s_fast_map[pc >> 16]248la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));249laMoveAddressToReg(laAsm, 
RARG3, g_code_lut.data());250la_srli_w(laAsm, RARG2, RARG1, 16);251la_slli_d(laAsm, RARG2, RARG2, 3);252la_add_d(laAsm, RARG2, RARG2, RARG3);253la_ld_d(laAsm, RARG2, RARG2, 0);254la_slli_d(laAsm, RARG1, RARG1, 48); // idx = (pc & 0xFFFF) >> 2255la_srli_d(laAsm, RARG1, RARG1, 50);256la_slli_d(laAsm, RARG1, RARG1, 3);257258// blr(x9[pc * 2]) (fast_map[idx])259la_add_d(laAsm, RARG1, RARG1, RARG2);260la_ld_d(laAsm, RARG1, RARG1, 0);261la_jirl(laAsm, LA_ZERO, RARG1, 0);262}263264g_compile_or_revalidate_block = laAsm->cursor;265{266la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));267laEmitCall(laAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock));268la_b(laAsm, la_label(laAsm, &dispatch));269}270271g_discard_and_recompile_block = laAsm->cursor;272{273la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));274laEmitCall(laAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock));275la_b(laAsm, la_label(laAsm, &dispatch));276}277278g_interpret_block = laAsm->cursor;279{280laEmitCall(laAsm, CodeCache::GetInterpretUncachedBlockFunction());281la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));282la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.downcount));283la_bge(laAsm, RARG1, RARG2, la_label(laAsm, &run_events_and_dispatch));284la_b(laAsm, la_label(laAsm, &dispatch));285}286287la_label_free(laAsm, &dispatch);288la_label_free(laAsm, &run_events_and_dispatch);289290// TODO: align?291292return static_cast<u32>(laAsm->cursor - laAsm->buffer);293}294295void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)296{297constexpr u8 padding_value = 0x00;298std::memset(dst, padding_value, size);299}300301u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)302{303{304lagoon_assembler_t assembler;305la_init_assembler(&assembler, static_cast<u8*>(code), BLOCK_LINK_SIZE);306laEmitCall(&assembler, dst);307308DebugAssert(static_cast<size_t>(assembler.cursor - assembler.buffer) <= BLOCK_LINK_SIZE);309if (la_get_remaining_buffer_size(&assembler) > 
0)310la_andi(&assembler, LA_ZERO, LA_ZERO, 0); // NOP311}312313if (flush_icache)314MemMap::FlushInstructionCache(code, BLOCK_LINK_SIZE);315316return BLOCK_LINK_SIZE;317}318319CPU::LoongArch64Recompiler::LoongArch64Recompiler() = default;320321CPU::LoongArch64Recompiler::~LoongArch64Recompiler() = default;322323const void* CPU::LoongArch64Recompiler::GetCurrentCodePointer()324{325return laAsm->cursor;326}327328void CPU::LoongArch64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,329u8* far_code_buffer, u32 far_code_space)330{331Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);332333DebugAssert(!laAsm);334la_init_assembler(&m_emitter, code_buffer, code_buffer_space);335la_init_assembler(&m_far_emitter, far_code_buffer, far_code_space);336laAsm = &m_emitter;337338// Need to wipe it out so it's correct when toggling fastmem.339m_host_regs = {};340341const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE : NUM_HOST_REGS;342for (u32 i = 0; i < NUM_HOST_REGS; i++)343{344HostRegAlloc& hra = m_host_regs[i];345346// Reserved: zero(0), ra(1), tp(2), sp(3), r21(reserved)347if (i == RARG1 || i == RARG2 || i == RARG3 || i == RSCRATCH || i == RSTATE || i == membase_idx || i < 4 || i == 21)348{349continue;350}351352hra.flags = HR_USABLE | (laIsCallerSavedRegister(i) ? 
0 : HR_CALLEE_SAVED);353}354}355356void CPU::LoongArch64Recompiler::SwitchToFarCode(bool emit_jump, LaBranchCondition cond, la_gpr_t rs1, la_gpr_t rs2)357{358DebugAssert(laAsm == &m_emitter);359if (emit_jump)360{361const void* target = m_far_emitter.cursor;362if (cond != LaBranchCondition::None)363{364lagoon_label_t skip = {};365switch (cond)366{367case LaBranchCondition::EQ:368la_bne(laAsm, rs1, rs2, la_label(laAsm, &skip));369break;370case LaBranchCondition::NE:371la_beq(laAsm, rs1, rs2, la_label(laAsm, &skip));372break;373case LaBranchCondition::LT:374la_bge(laAsm, rs1, rs2, la_label(laAsm, &skip));375break;376case LaBranchCondition::GE:377la_blt(laAsm, rs1, rs2, la_label(laAsm, &skip));378break;379case LaBranchCondition::LTU:380la_bgeu(laAsm, rs1, rs2, la_label(laAsm, &skip));381break;382case LaBranchCondition::GEU:383la_bltu(laAsm, rs1, rs2, la_label(laAsm, &skip));384break;385default:386break;387}388laEmitJmp(laAsm, target);389la_bind(laAsm, &skip);390la_label_free(laAsm, &skip);391}392else393{394laEmitCall(laAsm, target);395}396}397laAsm = &m_far_emitter;398}399400void CPU::LoongArch64Recompiler::SwitchToNearCode(bool emit_jump)401{402DebugAssert(laAsm == &m_far_emitter);403if (emit_jump)404laEmitJmp(laAsm, m_emitter.cursor);405laAsm = &m_emitter;406}407408void CPU::LoongArch64Recompiler::EmitMov(la_gpr_t dst, u32 val)409{410laEmitMov(laAsm, dst, val);411}412413void CPU::LoongArch64Recompiler::EmitCall(const void* ptr)414{415laEmitCall(laAsm, ptr);416}417418void CPU::LoongArch64Recompiler::SafeImmSImm12(la_gpr_t rd, la_gpr_t rs, u32 imm, LaRRSImmOp iop, LaRRROp rop)419{420DebugAssert(rd != RSCRATCH && rs != RSCRATCH);421422if (laIsValidSImm12(imm))423{424iop(laAsm, rd, rs, imm);425return;426}427428laEmitMov(laAsm, RSCRATCH, imm);429rop(laAsm, rd, rs, RSCRATCH);430}431432void CPU::LoongArch64Recompiler::SafeImmUImm12(la_gpr_t rd, la_gpr_t rs, u32 imm, LaRRUImmOp iop, LaRRROp rop)433{434DebugAssert(rd != RSCRATCH && rs != RSCRATCH);435436if 
(laIsValidUImm12(imm))437{438iop(laAsm, rd, rs, imm);439return;440}441442laEmitMov(laAsm, RSCRATCH, imm);443rop(laAsm, rd, rs, RSCRATCH);444}445446void CPU::LoongArch64Recompiler::SafeADDI(la_gpr_t rd, la_gpr_t rs, u32 imm)447{448SafeImmSImm12(rd, rs, imm, la_addi_d, la_add_d);449}450451void CPU::LoongArch64Recompiler::SafeADDIW(la_gpr_t rd, la_gpr_t rs, u32 imm)452{453SafeImmSImm12(rd, rs, imm, la_addi_w, la_add_w);454}455456void CPU::LoongArch64Recompiler::SafeSUBIW(la_gpr_t rd, la_gpr_t rs, u32 imm)457{458const u32 nimm = static_cast<u32>(-static_cast<s32>(imm));459SafeImmSImm12(rd, rs, nimm, la_addi_w, la_add_w);460}461462void CPU::LoongArch64Recompiler::SafeANDI(la_gpr_t rd, la_gpr_t rs, u32 imm)463{464SafeImmUImm12(rd, rs, imm, la_andi, la_and);465}466467void CPU::LoongArch64Recompiler::SafeORI(la_gpr_t rd, la_gpr_t rs, u32 imm)468{469SafeImmUImm12(rd, rs, imm, la_ori, la_or);470}471472void CPU::LoongArch64Recompiler::SafeXORI(la_gpr_t rd, la_gpr_t rs, u32 imm)473{474SafeImmUImm12(rd, rs, imm, la_xori, la_xor);475}476477void CPU::LoongArch64Recompiler::SafeSLTI(la_gpr_t rd, la_gpr_t rs, u32 imm)478{479SafeImmSImm12(rd, rs, imm, la_slti, la_slt);480}481482void CPU::LoongArch64Recompiler::SafeSLTIU(la_gpr_t rd, la_gpr_t rs, u32 imm)483{484SafeImmSImm12(rd, rs, imm, la_sltui, la_sltu);485}486487void CPU::LoongArch64Recompiler::EmitSExtB(la_gpr_t rd, la_gpr_t rs)488{489laEmitSExtB(laAsm, rd, rs);490}491492void CPU::LoongArch64Recompiler::EmitUExtB(la_gpr_t rd, la_gpr_t rs)493{494laEmitUExtB(laAsm, rd, rs);495}496497void CPU::LoongArch64Recompiler::EmitSExtH(la_gpr_t rd, la_gpr_t rs)498{499laEmitSExtH(laAsm, rd, rs);500}501502void CPU::LoongArch64Recompiler::EmitUExtH(la_gpr_t rd, la_gpr_t rs)503{504laEmitUExtH(laAsm, rd, rs);505}506507void CPU::LoongArch64Recompiler::EmitDSExtW(la_gpr_t rd, la_gpr_t rs)508{509laEmitDSExtW(laAsm, rd, rs);510}511512void CPU::LoongArch64Recompiler::EmitDUExtW(la_gpr_t rd, la_gpr_t rs)513{514laEmitDUExtW(laAsm, rd, rs);515}516517void 
CPU::LoongArch64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)518{519// store it first to reduce code size, because we can offset520laEmitMov64(laAsm, RARG1, static_cast<u64>(reinterpret_cast<uintptr_t>(ram_ptr)));521laEmitMov64(laAsm, RARG2, static_cast<u64>(reinterpret_cast<uintptr_t>(shadow_ptr)));522523u32 offset = 0;524lagoon_label_t block_changed = {};525526while (size >= 8)527{528la_ld_d(laAsm, RARG3, RARG1, offset);529la_ld_d(laAsm, RSCRATCH, RARG2, offset);530la_bne(laAsm, RARG3, RSCRATCH, la_label(laAsm, &block_changed));531offset += 8;532size -= 8;533}534535while (size >= 4)536{537la_ld_w(laAsm, RARG3, RARG1, offset);538la_ld_w(laAsm, RSCRATCH, RARG2, offset);539la_bne(laAsm, RARG3, RSCRATCH, la_label(laAsm, &block_changed));540offset += 4;541size -= 4;542}543544DebugAssert(size == 0);545546lagoon_label_t block_unchanged = {};547la_b(laAsm, la_label(laAsm, &block_unchanged));548la_bind(laAsm, &block_changed);549laEmitJmp(laAsm, CodeCache::g_discard_and_recompile_block);550la_bind(laAsm, &block_unchanged);551la_label_free(laAsm, &block_changed);552la_label_free(laAsm, &block_unchanged);553}554555void CPU::LoongArch64Recompiler::GenerateICacheCheckAndUpdate()556{557if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))558{559if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))560{561laEmitFarLoad(laAsm, RARG2, GetFetchMemoryAccessTimePtr());562la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));563laEmitMov(laAsm, RARG3, m_block->size);564la_mul_w(laAsm, RARG2, RARG2, RARG3);565la_add_d(laAsm, RARG1, RARG1, RARG2);566la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));567}568else569{570la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));571SafeADDIW(RARG1, RARG1, static_cast<u32>(m_block->uncached_fetch_ticks));572la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));573}574}575else if (m_block->icache_line_count > 0)576{577const auto& ticks_reg = RARG1;578const 
auto& current_tag_reg = RARG2;579const auto& existing_tag_reg = RARG3;580581// start of block, nothing should be using this582const auto& maddr_reg = LA_T0;583DebugAssert(!IsHostRegAllocated(maddr_reg));584585VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;586la_ld_w(laAsm, ticks_reg, RSTATE, OFFS(&g_state.pending_ticks));587laEmitMov(laAsm, current_tag_reg, current_pc);588589for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)590{591const TickCount fill_ticks = GetICacheFillTicks(current_pc);592if (fill_ticks <= 0)593continue;594595const u32 line = GetICacheLine(current_pc);596const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));597598// Offsets must fit in signed 12 bits.599lagoon_label_t cache_hit = {};600if (offset >= 2048)601{602SafeADDI(maddr_reg, RSTATE, offset);603la_ld_w(laAsm, existing_tag_reg, maddr_reg, 0);604la_beq(laAsm, existing_tag_reg, current_tag_reg, la_label(laAsm, &cache_hit));605la_st_w(laAsm, current_tag_reg, maddr_reg, 0);606}607else608{609la_ld_w(laAsm, existing_tag_reg, RSTATE, offset);610la_beq(laAsm, existing_tag_reg, current_tag_reg, la_label(laAsm, &cache_hit));611la_st_w(laAsm, current_tag_reg, RSTATE, offset);612}613614SafeADDIW(ticks_reg, ticks_reg, static_cast<u32>(fill_ticks));615la_bind(laAsm, &cache_hit);616la_label_free(laAsm, &cache_hit);617618if (i != (m_block->icache_line_count - 1))619SafeADDIW(current_tag_reg, current_tag_reg, ICACHE_LINE_SIZE);620}621622la_st_w(laAsm, ticks_reg, RSTATE, OFFS(&g_state.pending_ticks));623}624}625626void CPU::LoongArch64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,627s32 arg3reg /*= -1*/)628{629if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1))630la_or(laAsm, RARG1, static_cast<la_gpr_t>(arg1reg), LA_ZERO);631if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2))632la_or(laAsm, RARG2, static_cast<la_gpr_t>(arg2reg), LA_ZERO);633if (arg3reg >= 0 && arg3reg != 
static_cast<s32>(RARG3))634la_or(laAsm, RARG3, static_cast<la_gpr_t>(arg3reg), LA_ZERO);635EmitCall(func);636}637638void CPU::LoongArch64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)639{640if (newpc.has_value())641{642if (m_dirty_pc || m_compiler_pc != newpc)643{644EmitMov(RSCRATCH, newpc.value());645la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pc));646}647}648m_dirty_pc = false;649650// flush regs651Flush(FLUSH_END_BLOCK);652EndAndLinkBlock(newpc, do_event_test, false);653}654655void CPU::LoongArch64Recompiler::EndBlockWithException(Exception excode)656{657// flush regs, but not pc, it's going to get overwritten658// flush cycles because of the GTE instruction stuff...659Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);660661// TODO: flush load delay662663EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,664inst->cop.cop_n));665EmitMov(RARG2, m_current_instruction_pc);666if (excode != Exception::BP)667{668EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));669}670else671{672EmitMov(RARG3, inst->bits);673EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));674}675m_dirty_pc = false;676677EndAndLinkBlock(std::nullopt, true, false);678}679680void CPU::LoongArch64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,681bool force_run_events)682{683// event test684// pc should've been flushed685DebugAssert(!m_dirty_pc && !m_block_ended);686m_block_ended = true;687688// TODO: try extracting this to a function689// TODO: move the cycle flush in here..690691// save cycles for event test692const TickCount cycles = std::exchange(m_cycles, 0);693694// pending_ticks += cycles695// if (pending_ticks >= downcount) { dispatch_event(); }696if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)697la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));698if 
(do_event_test)699la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.downcount));700if (cycles > 0)701{702SafeADDIW(RARG1, RARG1, cycles);703la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));704}705if (m_gte_done_cycle > cycles)706{707SafeADDIW(RARG3, RARG1, m_gte_done_cycle - cycles);708la_st_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_completion_tick));709}710711if (do_event_test)712{713// TODO: see if we can do a far jump somehow with this..714lagoon_label_t cont = {};715la_blt(laAsm, RARG1, RARG2, la_label(laAsm, &cont));716laEmitJmp(laAsm, CodeCache::g_run_events_and_dispatch);717la_bind(laAsm, &cont);718la_label_free(laAsm, &cont);719}720721// jump to dispatcher or next block722if (force_run_events)723{724laEmitJmp(laAsm, CodeCache::g_run_events_and_dispatch);725}726else if (!newpc.has_value())727{728laEmitJmp(laAsm, CodeCache::g_dispatcher);729}730else731{732const void* target = (newpc.value() == m_block->pc) ?733CodeCache::CreateSelfBlockLink(m_block, laAsm->cursor, laAsm->buffer) :734CodeCache::CreateBlockLink(m_block, laAsm->cursor, newpc.value());735laEmitJmp(laAsm, target);736}737}738739const void* CPU::LoongArch64Recompiler::EndCompile(u32* code_size, u32* far_code_size)740{741u8* const code = m_emitter.buffer;742*code_size = static_cast<u32>(m_emitter.cursor - m_emitter.buffer);743*far_code_size = static_cast<u32>(m_far_emitter.cursor - m_far_emitter.buffer);744laAsm = nullptr;745return code;746}747748const char* CPU::LoongArch64Recompiler::GetHostRegName(u32 reg) const749{750static constexpr std::array<const char*, 32> reg64_names = {751{"$zero", "$ra", "$tp", "$sp", "$a0", "$a1", "$a2", "$a3", "$a4", "$a5", "$a6",752"$a7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8", "$r21",753"$fp", "$s0", "$s1", "$s2", "$s3", "$s4", "$s5", "$s6", "$s7", "$s8"}};754return (reg < reg64_names.size()) ? 
reg64_names[reg] : "UNKNOWN";755}756757void CPU::LoongArch64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)758{759EmitMov(static_cast<la_gpr_t>(reg), val);760}761762void CPU::LoongArch64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)763{764la_ld_w(laAsm, static_cast<la_gpr_t>(reg), RSTATE, OFFS(ptr));765}766767void CPU::LoongArch64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)768{769la_st_w(laAsm, static_cast<la_gpr_t>(reg), RSTATE, OFFS(ptr));770}771772void CPU::LoongArch64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)773{774if (val == 0)775{776la_st_w(laAsm, LA_ZERO, RSTATE, OFFS(ptr));777return;778}779780EmitMov(RSCRATCH, val);781la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(ptr));782}783784void CPU::LoongArch64Recompiler::CopyHostReg(u32 dst, u32 src)785{786if (src != dst)787la_or(laAsm, static_cast<la_gpr_t>(dst), static_cast<la_gpr_t>(src), LA_ZERO);788}789790void CPU::LoongArch64Recompiler::AssertRegOrConstS(CompileFlags cf) const791{792DebugAssert(cf.valid_host_s || cf.const_s);793}794795void CPU::LoongArch64Recompiler::AssertRegOrConstT(CompileFlags cf) const796{797DebugAssert(cf.valid_host_t || cf.const_t);798}799800la_gpr_t CPU::LoongArch64Recompiler::CFGetSafeRegS(CompileFlags cf, la_gpr_t temp_reg)801{802if (cf.valid_host_s)803{804return static_cast<la_gpr_t>(cf.host_s);805}806else if (cf.const_s)807{808if (HasConstantRegValue(cf.MipsS(), 0))809return LA_ZERO;810811EmitMov(temp_reg, GetConstantRegU32(cf.MipsS()));812return temp_reg;813}814else815{816WARNING_LOG("Hit memory path in CFGetSafeRegS() for {}", GetRegName(cf.MipsS()));817la_ld_w(laAsm, temp_reg, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));818return temp_reg;819}820}821822la_gpr_t CPU::LoongArch64Recompiler::CFGetSafeRegT(CompileFlags cf, la_gpr_t temp_reg)823{824if (cf.valid_host_t)825{826return static_cast<la_gpr_t>(cf.host_t);827}828else if (cf.const_t)829{830if (HasConstantRegValue(cf.MipsT(), 0))831return LA_ZERO;832833EmitMov(temp_reg, 
GetConstantRegU32(cf.MipsT()));834return temp_reg;835}836else837{838WARNING_LOG("Hit memory path in CFGetSafeRegT() for {}", GetRegName(cf.MipsT()));839la_ld_w(laAsm, temp_reg, RSTATE, OFFS(&g_state.regs.r[cf.mips_t]));840return temp_reg;841}842}843844la_gpr_t CPU::LoongArch64Recompiler::CFGetRegD(CompileFlags cf) const845{846DebugAssert(cf.valid_host_d);847return static_cast<la_gpr_t>(cf.host_d);848}849850la_gpr_t CPU::LoongArch64Recompiler::CFGetRegS(CompileFlags cf) const851{852DebugAssert(cf.valid_host_s);853return static_cast<la_gpr_t>(cf.host_s);854}855856la_gpr_t CPU::LoongArch64Recompiler::CFGetRegT(CompileFlags cf) const857{858DebugAssert(cf.valid_host_t);859return static_cast<la_gpr_t>(cf.host_t);860}861862la_gpr_t CPU::LoongArch64Recompiler::CFGetRegLO(CompileFlags cf) const863{864DebugAssert(cf.valid_host_lo);865return static_cast<la_gpr_t>(cf.host_lo);866}867868la_gpr_t CPU::LoongArch64Recompiler::CFGetRegHI(CompileFlags cf) const869{870DebugAssert(cf.valid_host_hi);871return static_cast<la_gpr_t>(cf.host_hi);872}873874void CPU::LoongArch64Recompiler::MoveSToReg(la_gpr_t dst, CompileFlags cf)875{876if (cf.valid_host_s)877{878if (cf.host_s != dst)879la_or(laAsm, dst, static_cast<la_gpr_t>(cf.host_s), LA_ZERO);880}881else if (cf.const_s)882{883EmitMov(dst, GetConstantRegU32(cf.MipsS()));884}885else886{887WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));888la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));889}890}891892void CPU::LoongArch64Recompiler::MoveTToReg(la_gpr_t dst, CompileFlags cf)893{894if (cf.valid_host_t)895{896if (cf.host_t != dst)897la_or(laAsm, dst, static_cast<la_gpr_t>(cf.host_t), LA_ZERO);898}899else if (cf.const_t)900{901EmitMov(dst, GetConstantRegU32(cf.MipsT()));902}903else904{905WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));906la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_t]));907}908}909910void CPU::LoongArch64Recompiler::MoveMIPSRegToReg(la_gpr_t 
dst, Reg reg, bool ignore_load_delays)911{912DebugAssert(reg < Reg::count);913if (ignore_load_delays && m_load_delay_register == reg)914{915if (m_load_delay_value_register == NUM_HOST_REGS)916la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.load_delay_value));917else918la_or(laAsm, dst, static_cast<la_gpr_t>(m_load_delay_value_register), LA_ZERO);919}920else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))921{922la_or(laAsm, dst, static_cast<la_gpr_t>(hreg.value()), LA_ZERO);923}924else if (HasConstantReg(reg))925{926EmitMov(dst, GetConstantRegU32(reg));927}928else929{930la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[static_cast<u8>(reg)]));931}932}933934void CPU::LoongArch64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,935Reg arg2reg /* = Reg::count */,936Reg arg3reg /* = Reg::count */)937{938DebugAssert(g_settings.gpu_pgxp_enable);939940Flush(FLUSH_FOR_C_CALL);941942if (arg2reg != Reg::count)943MoveMIPSRegToReg(RARG2, arg2reg);944if (arg3reg != Reg::count)945MoveMIPSRegToReg(RARG3, arg3reg);946947EmitMov(RARG1, arg1val);948EmitCall(func);949}950951void CPU::LoongArch64Recompiler::Flush(u32 flags)952{953Recompiler::Flush(flags);954955if (flags & FLUSH_PC && m_dirty_pc)956{957StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);958m_dirty_pc = false;959}960961if (flags & FLUSH_INSTRUCTION_BITS)962{963// This sucks, but it's only used for fallbacks.964Panic("Not implemented");965}966967if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)968{969// This sucks :(970// TODO: make it a function?971la_ld_bu(laAsm, RARG1, RSTATE, OFFS(&g_state.load_delay_reg));972la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.load_delay_value));973la_slli_d(laAsm, RARG1, RARG1, 2); // *4974la_add_d(laAsm, RARG1, RARG1, RSTATE);975la_st_w(laAsm, RARG2, RARG1, OFFSETOF(CPU::State, regs.r[0]));976la_addi_d(laAsm, RSCRATCH, LA_ZERO, static_cast<u8>(Reg::count));977la_st_b(laAsm, RSCRATCH, RSTATE, 
OFFS(&g_state.load_delay_reg));978m_load_delay_dirty = false;979}980981if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)982{983if (m_load_delay_value_register != NUM_HOST_REGS)984FreeHostReg(m_load_delay_value_register);985986EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));987la_st_b(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.load_delay_reg));988m_load_delay_register = Reg::count;989m_load_delay_dirty = true;990}991992if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)993{994// May as well flush cycles while we're here.995// GTE spanning blocks is very rare, we _could_ disable this for speed.996la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));997la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_completion_tick));998if (m_cycles > 0)999{1000SafeADDIW(RARG1, RARG1, m_cycles);1001la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));1002m_cycles = 0;1003}10041005lagoon_label_t no_stall = {};1006la_bge(laAsm, RARG1, RARG2, la_label(laAsm, &no_stall));1007la_or(laAsm, RARG1, RARG2, LA_ZERO);1008la_bind(laAsm, &no_stall);1009la_label_free(laAsm, &no_stall);1010la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));1011m_dirty_gte_done_cycle = false;1012}10131014if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)1015{1016la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));10171018// update cycles at the same time1019if (flags & FLUSH_CYCLES && m_cycles > 0)1020{1021SafeADDIW(RARG1, RARG1, m_cycles);1022la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));1023m_gte_done_cycle -= m_cycles;1024m_cycles = 0;1025}10261027SafeADDIW(RARG1, RARG1, m_gte_done_cycle);1028la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.gte_completion_tick));1029m_gte_done_cycle = 0;1030m_dirty_gte_done_cycle = true;1031}10321033if (flags & FLUSH_CYCLES && m_cycles > 0)1034{1035la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));1036SafeADDIW(RARG1, RARG1, m_cycles);1037la_st_w(laAsm, RARG1, RSTATE, 
OFFS(&g_state.pending_ticks));1038m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);1039m_cycles = 0;1040}1041}10421043void CPU::LoongArch64Recompiler::Compile_Fallback()1044{1045WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,1046inst->bits);10471048Flush(FLUSH_FOR_INTERPRETER);10491050Panic("Fixme");1051}10521053void CPU::LoongArch64Recompiler::CheckBranchTarget(la_gpr_t pcreg)1054{1055if (!g_settings.cpu_recompiler_memory_exceptions)1056return;10571058DebugAssert(pcreg != RSCRATCH);1059la_andi(laAsm, RSCRATCH, pcreg, 0x3);1060SwitchToFarCode(true, LaBranchCondition::NE, RSCRATCH, LA_ZERO);10611062BackupHostState();1063EndBlockWithException(Exception::AdEL);10641065RestoreHostState();1066SwitchToNearCode(false);1067}10681069void CPU::LoongArch64Recompiler::Compile_jr(CompileFlags cf)1070{1071const la_gpr_t pcreg = CFGetRegS(cf);1072CheckBranchTarget(pcreg);10731074la_st_w(laAsm, pcreg, RSTATE, OFFS(&g_state.pc));10751076CompileBranchDelaySlot(false);1077EndBlock(std::nullopt, true);1078}10791080void CPU::LoongArch64Recompiler::Compile_jalr(CompileFlags cf)1081{1082const la_gpr_t pcreg = CFGetRegS(cf);1083if (MipsD() != Reg::zero)1084SetConstantReg(MipsD(), GetBranchReturnAddress(cf));10851086CheckBranchTarget(pcreg);1087la_st_w(laAsm, pcreg, RSTATE, OFFS(&g_state.pc));10881089CompileBranchDelaySlot(false);1090EndBlock(std::nullopt, true);1091}10921093void CPU::LoongArch64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)1094{1095AssertRegOrConstS(cf);10961097const u32 taken_pc = GetConditionalBranchTarget(cf);10981099Flush(FLUSH_FOR_BRANCH);11001101DebugAssert(cf.valid_host_s);11021103// MipsT() here should equal zero for zero branches.1104DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);11051106lagoon_label_t taken = {};1107const la_gpr_t rs = CFGetRegS(cf);1108switch (cond)1109{1110case 
BranchCondition::Equal:1111case BranchCondition::NotEqual:1112{1113AssertRegOrConstT(cf);1114if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))1115{1116(cond == BranchCondition::Equal) ? la_beqz(laAsm, rs, la_label(laAsm, &taken)) :1117la_bnez(laAsm, rs, la_label(laAsm, &taken));1118}1119else1120{1121const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG1;1122if (!cf.valid_host_t)1123MoveTToReg(RARG1, cf);1124if (cond == Recompiler::BranchCondition::Equal)1125la_beq(laAsm, rs, rt, la_label(laAsm, &taken));1126else1127la_bne(laAsm, rs, rt, la_label(laAsm, &taken));1128}1129}1130break;11311132case BranchCondition::GreaterThanZero:1133{1134la_blt(laAsm, LA_ZERO, rs, la_label(laAsm, &taken));1135}1136break;11371138case BranchCondition::GreaterEqualZero:1139{1140la_bge(laAsm, rs, LA_ZERO, la_label(laAsm, &taken));1141}1142break;11431144case BranchCondition::LessThanZero:1145{1146la_blt(laAsm, rs, LA_ZERO, la_label(laAsm, &taken));1147}1148break;11491150case BranchCondition::LessEqualZero:1151{1152la_bge(laAsm, LA_ZERO, rs, la_label(laAsm, &taken));1153}1154break;1155}11561157BackupHostState();1158if (!cf.delay_slot_swapped)1159CompileBranchDelaySlot();11601161EndBlock(m_compiler_pc, true);11621163la_bind(laAsm, &taken);1164la_label_free(laAsm, &taken);11651166RestoreHostState();1167if (!cf.delay_slot_swapped)1168CompileBranchDelaySlot();11691170EndBlock(taken_pc, true);1171}11721173void CPU::LoongArch64Recompiler::Compile_addi(CompileFlags cf, bool overflow)1174{1175const la_gpr_t rs = CFGetRegS(cf);1176const la_gpr_t rt = CFGetRegT(cf);1177if (const u32 imm = inst->i.imm_sext32(); imm != 0)1178{1179if (!overflow)1180{1181SafeADDIW(rt, rs, imm);1182}1183else1184{1185SafeADDI(RARG1, rs, imm);1186SafeADDIW(rt, rs, imm);1187TestOverflow(RARG1, rt, rt);1188}1189}1190else if (rt != rs)1191{1192la_or(laAsm, rt, rs, LA_ZERO);1193}1194}11951196void CPU::LoongArch64Recompiler::Compile_addi(CompileFlags cf)1197{1198Compile_addi(cf, 
g_settings.cpu_recompiler_memory_exceptions);1199}12001201void CPU::LoongArch64Recompiler::Compile_addiu(CompileFlags cf)1202{1203Compile_addi(cf, false);1204}12051206void CPU::LoongArch64Recompiler::Compile_slti(CompileFlags cf)1207{1208Compile_slti(cf, true);1209}12101211void CPU::LoongArch64Recompiler::Compile_sltiu(CompileFlags cf)1212{1213Compile_slti(cf, false);1214}12151216void CPU::LoongArch64Recompiler::Compile_slti(CompileFlags cf, bool sign)1217{1218if (sign)1219SafeSLTI(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32());1220else1221SafeSLTIU(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32());1222}12231224void CPU::LoongArch64Recompiler::Compile_andi(CompileFlags cf)1225{1226const la_gpr_t rt = CFGetRegT(cf);1227if (const u32 imm = inst->i.imm_zext32(); imm != 0)1228SafeANDI(rt, CFGetRegS(cf), imm);1229else1230EmitMov(rt, 0);1231}12321233void CPU::LoongArch64Recompiler::Compile_ori(CompileFlags cf)1234{1235const la_gpr_t rt = CFGetRegT(cf);1236const la_gpr_t rs = CFGetRegS(cf);1237if (const u32 imm = inst->i.imm_zext32(); imm != 0)1238SafeORI(rt, rs, imm);1239else if (rt != rs)1240la_or(laAsm, rt, rs, LA_ZERO);1241}12421243void CPU::LoongArch64Recompiler::Compile_xori(CompileFlags cf)1244{1245const la_gpr_t rt = CFGetRegT(cf);1246const la_gpr_t rs = CFGetRegS(cf);1247if (const u32 imm = inst->i.imm_zext32(); imm != 0)1248SafeXORI(rt, rs, imm);1249else if (rt != rs)1250la_or(laAsm, rt, rs, LA_ZERO);1251}12521253void CPU::LoongArch64Recompiler::Compile_shift(CompileFlags cf, LaRRROp op, LaRRUImmOp op_const)1254{1255const la_gpr_t rd = CFGetRegD(cf);1256const la_gpr_t rt = CFGetRegT(cf);1257if (inst->r.shamt > 0)1258op_const(laAsm, rd, rt, inst->r.shamt);1259else if (rd != rt)1260la_or(laAsm, rd, rt, LA_ZERO);1261}12621263void CPU::LoongArch64Recompiler::Compile_sll(CompileFlags cf)1264{1265Compile_shift(cf, la_sll_w, la_slli_w);1266}12671268void CPU::LoongArch64Recompiler::Compile_srl(CompileFlags cf)1269{1270Compile_shift(cf, la_srl_w, 
la_srli_w);1271}12721273void CPU::LoongArch64Recompiler::Compile_sra(CompileFlags cf)1274{1275Compile_shift(cf, la_sra_w, la_srai_w);1276}12771278void CPU::LoongArch64Recompiler::Compile_variable_shift(CompileFlags cf, LaRRROp op, LaRRUImmOp op_const)1279{1280const la_gpr_t rd = CFGetRegD(cf);12811282AssertRegOrConstS(cf);1283AssertRegOrConstT(cf);12841285const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;1286if (!cf.valid_host_t)1287MoveTToReg(rt, cf);12881289if (cf.const_s)1290{1291if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)1292op_const(laAsm, rd, rt, shift & 31u);1293else if (rd != rt)1294la_or(laAsm, rd, rt, LA_ZERO);1295}1296else1297{1298op(laAsm, rd, rt, CFGetRegS(cf));1299}1300}13011302void CPU::LoongArch64Recompiler::Compile_sllv(CompileFlags cf)1303{1304Compile_variable_shift(cf, la_sll_w, la_slli_w);1305}13061307void CPU::LoongArch64Recompiler::Compile_srlv(CompileFlags cf)1308{1309Compile_variable_shift(cf, la_srl_w, la_srli_w);1310}13111312void CPU::LoongArch64Recompiler::Compile_srav(CompileFlags cf)1313{1314Compile_variable_shift(cf, la_sra_w, la_srai_w);1315}13161317void CPU::LoongArch64Recompiler::Compile_mult(CompileFlags cf, bool sign)1318{1319const la_gpr_t rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;1320if (!cf.valid_host_s)1321MoveSToReg(rs, cf);13221323const la_gpr_t rt = cf.valid_host_t ? 
CFGetRegT(cf) : RARG2;1324if (!cf.valid_host_t)1325MoveTToReg(rt, cf);13261327// TODO: if lo/hi gets killed, we can use a 32-bit multiply1328const la_gpr_t lo = CFGetRegLO(cf);1329const la_gpr_t hi = CFGetRegHI(cf);13301331if (sign)1332{1333la_mul_d(laAsm, lo, rs, rt);1334la_srai_d(laAsm, hi, lo, 32);1335EmitDSExtW(lo, lo);1336}1337else1338{1339EmitDUExtW(RARG1, rs);1340EmitDUExtW(RARG2, rt);1341la_mul_d(laAsm, lo, RARG1, RARG2);1342la_srai_d(laAsm, hi, lo, 32);1343EmitDSExtW(lo, lo);1344}1345}13461347void CPU::LoongArch64Recompiler::Compile_mult(CompileFlags cf)1348{1349Compile_mult(cf, true);1350}13511352void CPU::LoongArch64Recompiler::Compile_multu(CompileFlags cf)1353{1354Compile_mult(cf, false);1355}13561357void CPU::LoongArch64Recompiler::Compile_div(CompileFlags cf)1358{1359const la_gpr_t rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;1360if (!cf.valid_host_s)1361MoveSToReg(rs, cf);13621363const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;1364if (!cf.valid_host_t)1365MoveTToReg(rt, cf);13661367const la_gpr_t rlo = CFGetRegLO(cf);1368const la_gpr_t rhi = CFGetRegHI(cf);13691370lagoon_label_t done = {};1371lagoon_label_t not_divide_by_zero = {};1372la_bnez(laAsm, rt, la_label(laAsm, ¬_divide_by_zero));1373la_or(laAsm, rhi, rs, LA_ZERO); // hi = num1374la_srai_d(laAsm, rlo, rs, 63);1375la_andi(laAsm, rlo, rlo, 2);1376la_addi_d(laAsm, rlo, rlo, -1); // lo = s >= 0 ? 
-1 : 11377la_b(laAsm, la_label(laAsm, &done));13781379la_bind(laAsm, ¬_divide_by_zero);1380la_label_free(laAsm, ¬_divide_by_zero);13811382lagoon_label_t not_unrepresentable = {};1383EmitMov(RSCRATCH, static_cast<u32>(-1));1384la_bne(laAsm, rt, RSCRATCH, la_label(laAsm, ¬_unrepresentable));1385EmitMov(rlo, 0x80000000u);1386la_bne(laAsm, rs, rlo, la_label(laAsm, ¬_unrepresentable));1387EmitMov(rhi, 0);1388la_b(laAsm, la_label(laAsm, &done));13891390la_bind(laAsm, ¬_unrepresentable);1391la_label_free(laAsm, ¬_unrepresentable);13921393la_div_w(laAsm, rlo, rs, rt);1394la_mod_w(laAsm, rhi, rs, rt);13951396la_bind(laAsm, &done);1397la_label_free(laAsm, &done);1398}13991400void CPU::LoongArch64Recompiler::Compile_divu(CompileFlags cf)1401{1402const la_gpr_t rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;1403if (!cf.valid_host_s)1404MoveSToReg(rs, cf);14051406const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;1407if (!cf.valid_host_t)1408MoveTToReg(rt, cf);14091410const la_gpr_t rlo = CFGetRegLO(cf);1411const la_gpr_t rhi = CFGetRegHI(cf);14121413// Semantics match? 
:-)1414la_div_wu(laAsm, rlo, rs, rt);1415la_mod_wu(laAsm, rhi, rs, rt);1416}14171418void CPU::LoongArch64Recompiler::TestOverflow(la_gpr_t long_res, la_gpr_t res, la_gpr_t reg_to_discard)1419{1420SwitchToFarCode(true, LaBranchCondition::NE, long_res, res);14211422BackupHostState();14231424// toss the result1425ClearHostReg(reg_to_discard);14261427EndBlockWithException(Exception::Ov);14281429RestoreHostState();14301431SwitchToNearCode(false);1432}14331434void CPU::LoongArch64Recompiler::Compile_dst_op(CompileFlags cf, LaRRROp op,1435void (LoongArch64Recompiler::*op_const)(la_gpr_t rd, la_gpr_t rs,1436u32 imm),1437LaRRROp op_long, bool commutative, bool overflow)1438{1439AssertRegOrConstS(cf);1440AssertRegOrConstT(cf);14411442const la_gpr_t rd = CFGetRegD(cf);14431444if (overflow)1445{1446const la_gpr_t rs = CFGetSafeRegS(cf, RARG1);1447const la_gpr_t rt = CFGetSafeRegT(cf, RARG2);1448op_long(laAsm, RARG3, rs, rt);1449op(laAsm, rd, rs, rt);1450TestOverflow(RARG3, rd, rd);1451return;1452}14531454if (cf.valid_host_s && cf.valid_host_t)1455{1456op(laAsm, rd, CFGetRegS(cf), CFGetRegT(cf));1457}1458else if (commutative && (cf.const_s || cf.const_t))1459{1460const la_gpr_t src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);1461if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)1462{1463(this->*op_const)(rd, src, cv);1464}1465else1466{1467if (rd != src)1468la_or(laAsm, rd, src, LA_ZERO);1469overflow = false;1470}1471}1472else if (cf.const_s)1473{1474if (HasConstantRegValue(cf.MipsS(), 0))1475{1476op(laAsm, rd, LA_ZERO, CFGetRegT(cf));1477}1478else1479{1480EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));1481op(laAsm, rd, RSCRATCH, CFGetRegT(cf));1482}1483}1484else if (cf.const_t)1485{1486const la_gpr_t rs = CFGetRegS(cf);1487if (const u32 cv = GetConstantRegU32(cf.const_s ? 
cf.MipsS() : cf.MipsT()); cv != 0)1488{1489(this->*op_const)(rd, rs, cv);1490}1491else1492{1493if (rd != rs)1494la_or(laAsm, rd, rs, LA_ZERO);1495overflow = false;1496}1497}1498}14991500void CPU::LoongArch64Recompiler::Compile_add(CompileFlags cf)1501{1502Compile_dst_op(cf, la_add_w, &LoongArch64Recompiler::SafeADDIW, la_add_d, true,1503g_settings.cpu_recompiler_memory_exceptions);1504}15051506void CPU::LoongArch64Recompiler::Compile_addu(CompileFlags cf)1507{1508Compile_dst_op(cf, la_add_w, &LoongArch64Recompiler::SafeADDIW, la_add_d, true, false);1509}15101511void CPU::LoongArch64Recompiler::Compile_sub(CompileFlags cf)1512{1513Compile_dst_op(cf, la_sub_w, &LoongArch64Recompiler::SafeSUBIW, la_sub_d, false,1514g_settings.cpu_recompiler_memory_exceptions);1515}15161517void CPU::LoongArch64Recompiler::Compile_subu(CompileFlags cf)1518{1519Compile_dst_op(cf, la_sub_w, &LoongArch64Recompiler::SafeSUBIW, la_sub_d, false, false);1520}15211522void CPU::LoongArch64Recompiler::Compile_and(CompileFlags cf)1523{1524AssertRegOrConstS(cf);1525AssertRegOrConstT(cf);15261527// special cases - and with self -> self, and with 0 -> 01528const la_gpr_t regd = CFGetRegD(cf);1529if (cf.MipsS() == cf.MipsT())1530{1531la_or(laAsm, regd, CFGetRegS(cf), LA_ZERO);1532return;1533}1534else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))1535{1536EmitMov(regd, 0);1537return;1538}15391540Compile_dst_op(cf, la_and, &LoongArch64Recompiler::SafeANDI, la_and, true, false);1541}15421543void CPU::LoongArch64Recompiler::Compile_or(CompileFlags cf)1544{1545AssertRegOrConstS(cf);1546AssertRegOrConstT(cf);15471548// or/nor with 0 -> no effect1549const la_gpr_t regd = CFGetRegD(cf);1550if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())1551{1552cf.const_s ? 
MoveTToReg(regd, cf) : MoveSToReg(regd, cf);1553return;1554}15551556Compile_dst_op(cf, la_or, &LoongArch64Recompiler::SafeORI, la_or, true, false);1557}15581559void CPU::LoongArch64Recompiler::Compile_xor(CompileFlags cf)1560{1561AssertRegOrConstS(cf);1562AssertRegOrConstT(cf);15631564const la_gpr_t regd = CFGetRegD(cf);1565if (cf.MipsS() == cf.MipsT())1566{1567// xor with self -> zero1568EmitMov(regd, 0);1569return;1570}1571else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))1572{1573// xor with zero -> no effect1574cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);1575return;1576}15771578Compile_dst_op(cf, la_xor, &LoongArch64Recompiler::SafeXORI, la_xor, true, false);1579}15801581void CPU::LoongArch64Recompiler::Compile_nor(CompileFlags cf)1582{1583Compile_or(cf);1584la_nor(laAsm, CFGetRegD(cf), CFGetRegD(cf), LA_ZERO);1585}15861587void CPU::LoongArch64Recompiler::Compile_slt(CompileFlags cf)1588{1589Compile_slt(cf, true);1590}15911592void CPU::LoongArch64Recompiler::Compile_sltu(CompileFlags cf)1593{1594Compile_slt(cf, false);1595}15961597void CPU::LoongArch64Recompiler::Compile_slt(CompileFlags cf, bool sign)1598{1599AssertRegOrConstS(cf);1600AssertRegOrConstT(cf);16011602const la_gpr_t rd = CFGetRegD(cf);1603const la_gpr_t rs = CFGetSafeRegS(cf, RARG1);16041605if (cf.const_t && laIsValidSImm12(GetConstantRegU32(cf.MipsT())))1606{1607if (sign)1608la_slti(laAsm, rd, rs, GetConstantRegS32(cf.MipsT()));1609else1610la_sltui(laAsm, rd, rs, GetConstantRegS32(cf.MipsT()));1611}1612else1613{1614const la_gpr_t rt = CFGetSafeRegT(cf, RARG2);1615if (sign)1616la_slt(laAsm, rd, rs, rt);1617else1618la_sltu(laAsm, rd, rs, rt);1619}1620}16211622la_gpr_t CPU::LoongArch64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf,1623const std::optional<VirtualMemoryAddress>& address,1624const std::optional<la_gpr_t>& reg)1625{1626const u32 imm = inst->i.imm_sext32();1627if (cf.valid_host_s && imm == 0 && !reg.has_value())1628return 
CFGetRegS(cf);16291630const la_gpr_t dst = reg.has_value() ? reg.value() : RARG1;1631if (address.has_value())1632{1633EmitMov(dst, address.value());1634}1635else if (imm == 0)1636{1637if (cf.valid_host_s)1638{1639if (const la_gpr_t src = CFGetRegS(cf); src != dst)1640la_or(laAsm, dst, src, LA_ZERO);1641}1642else1643{1644la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));1645}1646}1647else1648{1649if (cf.valid_host_s)1650{1651SafeADDIW(dst, CFGetRegS(cf), inst->i.imm_sext32());1652}1653else1654{1655la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));1656SafeADDIW(dst, dst, inst->i.imm_sext32());1657}1658}16591660return dst;1661}16621663template<typename RegAllocFn>1664la_gpr_t CPU::LoongArch64Recompiler::GenerateLoad(la_gpr_t addr_reg, MemoryAccessSize size, bool sign, bool use_fastmem,1665const RegAllocFn& dst_reg_alloc)1666{1667if (use_fastmem)1668{1669m_cycles += Bus::RAM_READ_TICKS;16701671// TODO: Make this better. If we're loading the address from state, we can use LD_WU instead, and skip this.1672// TODO: LUT fastmem1673const la_gpr_t dst = dst_reg_alloc();1674// Zero-extend address to 64-bit1675la_bstrpick_d(laAsm, RSCRATCH, addr_reg, 31, 0);16761677if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)1678{1679DebugAssert(addr_reg != RARG3);1680la_srli_d(laAsm, RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT);1681la_slli_d(laAsm, RARG3, RARG3, 3);1682la_add_d(laAsm, RARG3, RARG3, RMEMBASE);1683la_ld_d(laAsm, RARG3, RARG3, 0);1684la_add_d(laAsm, RSCRATCH, RSCRATCH, RARG3);1685}1686else1687{1688la_add_d(laAsm, RSCRATCH, RSCRATCH, RMEMBASE);1689}16901691u8* start = laAsm->cursor;1692switch (size)1693{1694case MemoryAccessSize::Byte:1695sign ? la_ld_b(laAsm, dst, RSCRATCH, 0) : la_ld_bu(laAsm, dst, RSCRATCH, 0);1696break;16971698case MemoryAccessSize::HalfWord:1699sign ? 
la_ld_h(laAsm, dst, RSCRATCH, 0) : la_ld_hu(laAsm, dst, RSCRATCH, 0);1700break;17011702case MemoryAccessSize::Word:1703la_ld_w(laAsm, dst, RSCRATCH, 0);1704break;1705}17061707// We need a nop, because the slowmem jump might be more than 1MB away.1708la_andi(laAsm, LA_ZERO, LA_ZERO, 0); // NOP17091710AddLoadStoreInfo(start, 8, addr_reg, dst, size, sign, true);1711return dst;1712}17131714if (addr_reg != RARG1)1715la_or(laAsm, RARG1, addr_reg, LA_ZERO);17161717const bool checked = g_settings.cpu_recompiler_memory_exceptions;1718switch (size)1719{1720case MemoryAccessSize::Byte:1721{1722EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryByte) :1723reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte));1724}1725break;1726case MemoryAccessSize::HalfWord:1727{1728EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryHalfWord) :1729reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord));1730}1731break;1732case MemoryAccessSize::Word:1733{1734EmitCall(checked ? 
reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryWord) :1735reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord));1736}1737break;1738}17391740// TODO: turn this into an asm function instead1741if (checked)1742{1743la_srli_d(laAsm, RSCRATCH, RRET, 63);1744SwitchToFarCode(true, LaBranchCondition::NE, RSCRATCH, LA_ZERO);1745BackupHostState();17461747// Need to stash this in a temp because of the flush.1748const la_gpr_t temp = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));1749la_sub_d(laAsm, temp, LA_ZERO, RRET);1750la_slli_w(laAsm, temp, temp, 2);17511752Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);17531754// cause_bits = (-result << 2) | BD | cop_n1755SafeORI(RARG1, temp,1756Cop0Registers::CAUSE::MakeValueForException(1757static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));1758EmitMov(RARG2, m_current_instruction_pc);1759EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));1760FreeHostReg(temp);1761EndBlock(std::nullopt, true);17621763RestoreHostState();1764SwitchToNearCode(false);1765}17661767const la_gpr_t dst_reg = dst_reg_alloc();1768switch (size)1769{1770case MemoryAccessSize::Byte:1771{1772sign ? EmitSExtB(dst_reg, RRET) : EmitUExtB(dst_reg, RRET);1773}1774break;1775case MemoryAccessSize::HalfWord:1776{1777sign ? 
EmitSExtH(dst_reg, RRET) : EmitUExtH(dst_reg, RRET);1778}1779break;1780case MemoryAccessSize::Word:1781{1782// Need to undo the zero-extend.1783if (checked)1784laEmitDSExtW(laAsm, dst_reg, RRET);1785else if (dst_reg != RRET)1786la_or(laAsm, dst_reg, RRET, LA_ZERO);1787}1788break;1789}17901791return dst_reg;1792}17931794void CPU::LoongArch64Recompiler::GenerateStore(la_gpr_t addr_reg, la_gpr_t value_reg, MemoryAccessSize size,1795bool use_fastmem)1796{1797if (use_fastmem)1798{1799DebugAssert(value_reg != RSCRATCH);1800// Zero-extend address to 64-bit1801la_bstrpick_d(laAsm, RSCRATCH, addr_reg, 31, 0);18021803if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)1804{1805DebugAssert(addr_reg != RARG3);1806la_srli_d(laAsm, RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT);1807la_slli_d(laAsm, RARG3, RARG3, 3);1808la_add_d(laAsm, RARG3, RARG3, RMEMBASE);1809la_ld_d(laAsm, RARG3, RARG3, 0);1810la_add_d(laAsm, RSCRATCH, RSCRATCH, RARG3);1811}1812else1813{1814la_add_d(laAsm, RSCRATCH, RSCRATCH, RMEMBASE);1815}18161817u8* start = laAsm->cursor;1818switch (size)1819{1820case MemoryAccessSize::Byte:1821la_st_b(laAsm, value_reg, RSCRATCH, 0);1822break;18231824case MemoryAccessSize::HalfWord:1825la_st_h(laAsm, value_reg, RSCRATCH, 0);1826break;18271828case MemoryAccessSize::Word:1829la_st_w(laAsm, value_reg, RSCRATCH, 0);1830break;1831}18321833// We need a nop, because the slowmem jump might be more than 1MB away.1834la_andi(laAsm, LA_ZERO, LA_ZERO, 0); // NOP18351836AddLoadStoreInfo(start, 8, addr_reg, value_reg, size, false, false);1837return;1838}18391840if (addr_reg != RARG1)1841la_or(laAsm, RARG1, addr_reg, LA_ZERO);1842if (value_reg != RARG2)1843la_or(laAsm, RARG2, value_reg, LA_ZERO);18441845const bool checked = g_settings.cpu_recompiler_memory_exceptions;1846switch (size)1847{1848case MemoryAccessSize::Byte:1849{1850EmitCall(checked ? 
reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryByte) :1851reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));1852}1853break;1854case MemoryAccessSize::HalfWord:1855{1856EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryHalfWord) :1857reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));1858}1859break;1860case MemoryAccessSize::Word:1861{1862EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryWord) :1863reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));1864}1865break;1866}18671868// TODO: turn this into an asm function instead1869if (checked)1870{1871SwitchToFarCode(true, LaBranchCondition::NE, RRET, LA_ZERO);1872BackupHostState();18731874// Need to stash this in a temp because of the flush.1875const la_gpr_t temp = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));1876la_slli_w(laAsm, temp, RRET, 2);18771878Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);18791880// cause_bits = (result << 2) | BD | cop_n1881SafeORI(RARG1, temp,1882Cop0Registers::CAUSE::MakeValueForException(1883static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));1884EmitMov(RARG2, m_current_instruction_pc);1885EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));1886FreeHostReg(temp);1887EndBlock(std::nullopt, true);18881889RestoreHostState();1890SwitchToNearCode(false);1891}1892}18931894void CPU::LoongArch64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,1895const std::optional<VirtualMemoryAddress>& address)1896{1897const std::optional<la_gpr_t> addr_reg =1898(g_settings.gpu_pgxp_enable && cf.MipsT() != Reg::zero) ?1899std::optional<la_gpr_t>(static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED))) :1900std::optional<la_gpr_t>();1901FlushForLoadStore(address, false, 
use_fastmem);1902const la_gpr_t addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);1903const la_gpr_t data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {1904if (cf.MipsT() == Reg::zero)1905return RRET;19061907return static_cast<la_gpr_t>(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),1908EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,1909cf.MipsT()));1910});19111912if (g_settings.gpu_pgxp_enable && cf.MipsT() != Reg::zero)1913{1914Flush(FLUSH_FOR_C_CALL);19151916EmitMov(RARG1, inst->bits);1917la_or(laAsm, RARG2, addr, LA_ZERO);1918la_or(laAsm, RARG3, data, LA_ZERO);1919EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);1920FreeHostReg(addr_reg.value());1921}1922}19231924void CPU::LoongArch64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,1925const std::optional<VirtualMemoryAddress>& address)1926{1927DebugAssert(size == MemoryAccessSize::Word && !sign);19281929const la_gpr_t addr = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));1930FlushForLoadStore(address, false, use_fastmem);19311932// TODO: if address is constant, this can be simplified..19331934// If we're coming from another block, just flush the load delay and hope for the best..1935if (m_load_delay_dirty)1936UpdateLoadDelay();19371938// We'd need to be careful here if we weren't overwriting it..1939ComputeLoadStoreAddressArg(cf, address, addr);19401941// Do PGXP first, it does its own load.1942if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)1943{1944Flush(FLUSH_FOR_C_CALL);1945EmitMov(RARG1, inst->bits);1946la_or(laAsm, RARG2, addr, LA_ZERO);1947MoveMIPSRegToReg(RARG3, inst->r.rt, true);1948EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));1949}19501951la_or(laAsm, RARG1, addr, LA_ZERO);1952la_bstrins_d(laAsm, RARG1, LA_ZERO, 1, 0); // addr & ~31953GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });19541955if (inst->r.rt == 
Reg::zero)1956{1957FreeHostReg(addr);1958return;1959}19601961// lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is1962// never written back. NOTE: can't trust T in cf because of the flush1963const Reg rt = inst->r.rt;1964la_gpr_t value;1965if (m_load_delay_register == rt)1966{1967const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?1968AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :1969m_load_delay_value_register;1970RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);1971value = static_cast<la_gpr_t>(existing_ld_rt);1972}1973else1974{1975if constexpr (EMULATE_LOAD_DELAYS)1976{1977value = static_cast<la_gpr_t>(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));1978if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())1979la_or(laAsm, value, static_cast<la_gpr_t>(rtreg.value()), LA_ZERO);1980else if (HasConstantReg(rt))1981EmitMov(value, GetConstantRegU32(rt));1982else1983la_ld_w(laAsm, value, RSTATE, OFFS(&g_state.regs.r[static_cast<u8>(rt)]));1984}1985else1986{1987value = static_cast<la_gpr_t>(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));1988}1989}19901991DebugAssert(value != RARG2 && value != RARG3);1992la_andi(laAsm, RARG2, addr, 3);1993la_slli_w(laAsm, RARG2, RARG2, 3); // *81994EmitMov(RARG3, 24);1995la_sub_w(laAsm, RARG3, RARG3, RARG2);19961997if (inst->op == InstructionOp::lwl)1998{1999// const u32 mask = UINT32_C(0x00FFFFFF) >> shift;2000// new_value = (value & mask) | (RWRET << (24 - shift));2001EmitMov(RSCRATCH, 0xFFFFFFu);2002la_srl_w(laAsm, RSCRATCH, RSCRATCH, RARG2);2003la_and(laAsm, value, value, RSCRATCH);2004la_sll_w(laAsm, RRET, RRET, RARG3);2005la_or(laAsm, value, value, RRET);2006}2007else2008{2009// const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);2010// new_value = (value & mask) | (RWRET >> shift);2011la_srl_w(laAsm, RRET, RRET, 
RARG2);2012EmitMov(RSCRATCH, 0xFFFFFF00u);2013la_sll_w(laAsm, RSCRATCH, RSCRATCH, RARG3);2014la_and(laAsm, value, value, RSCRATCH);2015la_or(laAsm, value, value, RRET);2016}20172018FreeHostReg(addr);2019}20202021void CPU::LoongArch64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2022const std::optional<VirtualMemoryAddress>& address)2023{2024const u32 index = static_cast<u32>(inst->r.rt.GetValue());2025const auto [ptr, action] = GetGTERegisterPointer(index, true);2026const std::optional<la_gpr_t> addr_reg =2027g_settings.gpu_pgxp_enable ? std::optional<la_gpr_t>(static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED))) :2028std::optional<la_gpr_t>();2029FlushForLoadStore(address, false, use_fastmem);2030const la_gpr_t addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);2031const la_gpr_t value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {2032return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?2033static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED)) :2034RRET;2035});20362037switch (action)2038{2039case GTERegisterAccessAction::Ignore:2040{2041break;2042}20432044case GTERegisterAccessAction::Direct:2045{2046la_st_w(laAsm, value, RSTATE, OFFS(ptr));2047break;2048}20492050case GTERegisterAccessAction::SignExtend16:2051{2052EmitSExtH(RARG3, value);2053la_st_w(laAsm, RARG3, RSTATE, OFFS(ptr));2054break;2055}20562057case GTERegisterAccessAction::ZeroExtend16:2058{2059EmitUExtH(RARG3, value);2060la_st_w(laAsm, RARG3, RSTATE, OFFS(ptr));2061break;2062}20632064case GTERegisterAccessAction::CallHandler:2065{2066Flush(FLUSH_FOR_C_CALL);2067la_or(laAsm, RARG2, value, LA_ZERO);2068EmitMov(RARG1, index);2069EmitCall(reinterpret_cast<const void*>(>E::WriteRegister));2070break;2071}20722073case GTERegisterAccessAction::PushFIFO:2074{2075// SXY0 <- SXY12076// SXY1 <- SXY22077// SXY2 <- SXYP2078DebugAssert(value != RARG2 && value != 
RARG3);2079la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));2080la_ld_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));2081la_st_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY0[0]));2082la_st_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));2083la_st_w(laAsm, value, RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));2084break;2085}20862087default:2088{2089Panic("Unknown action");2090return;2091}2092}20932094if (g_settings.gpu_pgxp_enable)2095{2096Flush(FLUSH_FOR_C_CALL);2097la_or(laAsm, RARG3, value, LA_ZERO);2098if (value != RRET)2099FreeHostReg(value);2100la_or(laAsm, RARG2, addr, LA_ZERO);2101FreeHostReg(addr_reg.value());2102EmitMov(RARG1, inst->bits);2103EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));2104}2105}21062107void CPU::LoongArch64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2108const std::optional<VirtualMemoryAddress>& address)2109{2110AssertRegOrConstS(cf);2111AssertRegOrConstT(cf);21122113const std::optional<la_gpr_t> addr_reg =2114g_settings.gpu_pgxp_enable ? std::optional<la_gpr_t>(static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED))) :2115std::optional<la_gpr_t>();2116FlushForLoadStore(address, true, use_fastmem);2117const la_gpr_t addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);2118const la_gpr_t data = cf.valid_host_t ? 
CFGetRegT(cf) : RARG2;2119if (!cf.valid_host_t)2120MoveTToReg(RARG2, cf);21212122GenerateStore(addr, data, size, use_fastmem);21232124if (g_settings.gpu_pgxp_enable)2125{2126Flush(FLUSH_FOR_C_CALL);2127MoveMIPSRegToReg(RARG3, cf.MipsT());2128la_or(laAsm, RARG2, addr, LA_ZERO);2129EmitMov(RARG1, inst->bits);2130EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);2131FreeHostReg(addr_reg.value());2132}2133}21342135void CPU::LoongArch64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2136const std::optional<VirtualMemoryAddress>& address)2137{2138DebugAssert(size == MemoryAccessSize::Word && !sign);21392140// TODO: this can take over rt's value if it's no longer needed2141// NOTE: can't trust T in cf because of the alloc2142const la_gpr_t addr = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));21432144FlushForLoadStore(address, true, use_fastmem);21452146// TODO: if address is constant, this can be simplified..2147// We'd need to be careful here if we weren't overwriting it..2148ComputeLoadStoreAddressArg(cf, address, addr);21492150if (g_settings.gpu_pgxp_enable)2151{2152Flush(FLUSH_FOR_C_CALL);2153EmitMov(RARG1, inst->bits);2154la_or(laAsm, RARG2, addr, LA_ZERO);2155MoveMIPSRegToReg(RARG3, inst->r.rt);2156EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));2157}21582159la_or(laAsm, RARG1, addr, LA_ZERO);2160la_bstrins_d(laAsm, RARG1, LA_ZERO, 1, 0); // addr & ~32161GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });21622163la_andi(laAsm, RSCRATCH, addr, 3);2164la_slli_w(laAsm, RSCRATCH, RSCRATCH, 3); // *82165la_bstrins_d(laAsm, addr, LA_ZERO, 1, 0); // addr & ~321662167// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.2168if (!g_settings.gpu_pgxp_enable)2169MoveMIPSRegToReg(RARG2, inst->r.rt);21702171if (inst->op == InstructionOp::swl)2172{2173// const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;2174// 
new_value = (RWRET & mem_mask) | (value >> (24 - shift));2175EmitMov(RARG3, 0xFFFFFF00u);2176la_sll_w(laAsm, RARG3, RARG3, RSCRATCH);2177la_and(laAsm, RRET, RRET, RARG3);21782179EmitMov(RARG3, 24);2180la_sub_w(laAsm, RARG3, RARG3, RSCRATCH);2181la_srl_w(laAsm, RARG2, RARG2, RARG3);2182la_or(laAsm, RARG2, RARG2, RRET);2183}2184else2185{2186// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);2187// new_value = (RWRET & mem_mask) | (value << shift);2188la_sll_w(laAsm, RARG2, RARG2, RSCRATCH);21892190EmitMov(RARG3, 24);2191la_sub_w(laAsm, RARG3, RARG3, RSCRATCH);2192EmitMov(RSCRATCH, 0x00FFFFFFu);2193la_srl_w(laAsm, RSCRATCH, RSCRATCH, RARG3);2194la_and(laAsm, RRET, RRET, RSCRATCH);2195la_or(laAsm, RARG2, RARG2, RRET);2196}21972198GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);2199FreeHostReg(addr);2200}22012202void CPU::LoongArch64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,2203const std::optional<VirtualMemoryAddress>& address)2204{2205const u32 index = static_cast<u32>(inst->r.rt.GetValue());2206const auto [ptr, action] = GetGTERegisterPointer(index, false);2207const la_gpr_t addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?2208static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED)) :2209RARG1;2210const la_gpr_t data =2211g_settings.gpu_pgxp_enable ? static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;2212FlushForLoadStore(address, true, use_fastmem);2213ComputeLoadStoreAddressArg(cf, address, addr);22142215switch (action)2216{2217case GTERegisterAccessAction::Direct:2218{2219la_ld_w(laAsm, data, RSTATE, OFFS(ptr));2220}2221break;22222223case GTERegisterAccessAction::CallHandler:2224{2225// should already be flushed.. 
      // except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      la_or(laAsm, data, RRET, LA_ZERO);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);

  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr != RARG1)
      FreeHostReg(addr);
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    // PGXP notification: RARG1 = instruction bits, RARG2 = address, RARG3 = stored value.
    Flush(FLUSH_FOR_C_CALL);
    la_or(laAsm, RARG3, data, LA_ZERO);
    FreeHostReg(data);
    la_or(laAsm, RARG2, addr, LA_ZERO);
    FreeHostReg(addr);
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

// Compiles MTC0: masked write to a COP0 register, with extra handling for SR
// (cache isolation / interrupt enable), CAUSE (interrupt test), and DCIC/BPCM (debug dispatch).
void CPU::LoongArch64Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    // Unknown/unhandled register: fall back to the interpreter.
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const la_gpr_t new_value = RARG1;
  const la_gpr_t old_value = RARG2;
  const la_gpr_t changed_bits = RARG3;
  const la_gpr_t mask_reg = RSCRATCH;

  // Load old value
  la_ld_w(laAsm, old_value, RSTATE, OFFS(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    la_and(laAsm, new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    la_xor(laAsm, changed_bits, old_value, new_value);

  // new = (old & ~mask) | (new & mask); nor with zero computes ~mask.
  la_nor(laAsm, mask_reg, mask_reg, LA_ZERO);
  la_and(laAsm, old_value, old_value, mask_reg);
  la_or(laAsm, new_value, old_value, new_value);
  la_st_w(laAsm, new_value, RSTATE, OFFS(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    // Only refresh memory pointers when SR bit 16 (cache isolation) changed.
    lagoon_label_t caches_unchanged = {};
    la_srli_w(laAsm, RSCRATCH, changed_bits, 16);
    la_andi(laAsm, RSCRATCH, RSCRATCH, 1);
    la_beq(laAsm, RSCRATCH, LA_ZERO, la_label(laAsm, &caches_unchanged));
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    la_ld_w(laAsm, new_value, RSTATE, OFFS(ptr)); // reload SR, the call clobbered it
    if (CodeCache::IsUsingFastmem())
      la_ld_d(laAsm, RMEMBASE, RSTATE, OFFS(&g_state.fastmem_base));
    la_bind(laAsm, &caches_unchanged);
    la_label_free(laAsm, &caches_unchanged);

    // new_value == RARG1 still holds the updated SR here.
    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    // Writing CAUSE can raise a software interrupt; re-test against current SR.
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCode(true, LaBranchCondition::NE, RRET, LA_ZERO);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

// Compiles RFE: pops the SR mode stack (bits [5:2] -> [3:0], upper bits preserved),
// then re-tests for pending interrupts since IEc may have become set.
void CPU::LoongArch64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.cop0_regs.sr.bits));
  la_srli_w(laAsm, RSCRATCH, RARG1, 2);
  la_andi(laAsm, RSCRATCH, RSCRATCH, 0xf);
  la_bstrins_d(laAsm, RARG1, LA_ZERO, 3, 0); // clear SR[3:0]
  la_or(laAsm, RARG1, RARG1, RSCRATCH);
  la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.cop0_regs.sr.bits));

  TestInterrupts(RARG1);
}

void
CPU::LoongArch64Recompiler::TestInterrupts(la_gpr_t sr)
{
  // Emits an inline pending-interrupt check against the SR value in 'sr'.
  // NOTE: clobbers the 'sr' register (it is reused as scratch for the sr&cause test).
  DebugAssert(sr != RSCRATCH);

  // if Iec == 0 then goto no_interrupt
  lagoon_label_t no_interrupt = {};
  la_andi(laAsm, RSCRATCH, sr, 1);
  la_beqz(laAsm, RSCRATCH, la_label(laAsm, &no_interrupt));

  // sr & cause
  la_ld_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.cop0_regs.cause.bits));
  la_and(laAsm, sr, sr, RSCRATCH);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  la_srli_w(laAsm, sr, sr, 8);
  la_andi(laAsm, sr, sr, 0xFF);
  SwitchToFarCode(true, LaBranchCondition::NE, sr, LA_ZERO);

  // Far-code path: an interrupt is pending; dispatch it.
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                               (inst + 1)->cop.cop_n));
    EmitMov(RARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    // Last instruction: zero the downcount so the dispatcher takes over and raises it.
    if (m_dirty_pc)
      EmitMov(RARG1, m_compiler_pc);
    la_st_w(laAsm, LA_ZERO, RSTATE, OFFS(&g_state.downcount));
    if (m_dirty_pc)
      la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  la_bind(laAsm, &no_interrupt);
  la_label_free(laAsm, &no_interrupt);
}

// Compiles MFC2: reads a GTE register into rt (load-delayed), optionally notifying PGXP.
void CPU::LoongArch64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    // Value can be read straight out of g_state into the allocated host register.
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    la_ld_w(laAsm, static_cast<la_gpr_t>(hreg), RSTATE, OFFS(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    // Allocate after the call so the register isn't clobbered by it.
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    la_or(laAsm, static_cast<la_gpr_t>(hreg), RRET, LA_ZERO);
  }
  else
  {
    Panic("Unknown action");
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, inst->bits);
    la_or(laAsm, RARG2, static_cast<la_gpr_t>(hreg), LA_ZERO);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

// Compiles MTC2: writes rt to a GTE register, handling direct stores, 16-bit
// sign/zero extension, handler calls, and the SXY push-FIFO case.
void CPU::LoongArch64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      la_st_w(laAsm, CFGetRegT(cf), RSTATE, OFFS(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? EmitSExtH(RARG1, CFGetRegT(cf)) : EmitUExtH(RARG1, CFGetRegT(cf));
      la_st_w(laAsm, RARG1, RSTATE, OFFS(ptr));
    }
    else if (cf.const_t)
    {
      // Constant value: extend at compile time instead of emitting code.
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ?
::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    MoveTToReg(RARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RRET != RARG2 && RRET != RARG3);
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));
    la_ld_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));
    la_st_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY0[0]));
    la_st_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      la_st_w(laAsm, CFGetRegT(cf), RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

// Compiles COP2 (GTE) ALU operations by calling the interpreter's implementation function.
void CPU::LoongArch64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

// Emits a slow-path thunk invoked when a fastmem access faults/backpatches: saves live
// caller-saved GPRs, adjusts pending_ticks, calls the unchecked memory thunk for 'size',
// extends/moves the result (loads), restores registers, and jumps back past the patched code.
// Returns the number of bytes of thunk code emitted.
u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                           bool is_load)
{
  lagoon_assembler_t la_asm;
  lagoon_assembler_t* laAsm = &la_asm;
  la_init_assembler(laAsm, static_cast<u8*>(thunk_code), thunk_space);

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  // For loads, the data register is excluded: it is overwritten with the result anyway.
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && laIsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  // Round slot count up to even to keep the stack 16-byte aligned.
  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);

  if (stack_size > 0)
  {
    la_addi_d(laAsm, LA_SP, LA_SP, -static_cast<s32>(stack_size));

    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && laIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        la_st_d(laAsm, static_cast<la_gpr_t>(i), LA_SP, stack_offset);
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    Assert(laIsValidSImm12(cycles_to_add));
    la_ld_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
    la_addi_w(laAsm, RSCRATCH, RSCRATCH, cycles_to_add);
    la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
  }

  // Move address/data into the C calling-convention argument registers.
  if (address_register != RARG1)
    la_or(laAsm, RARG1, static_cast<la_gpr_t>(address_register), LA_ZERO);

  if (!is_load)
  {
    if (data_register != RARG2)
      la_or(laAsm, RARG2, static_cast<la_gpr_t>(data_register), LA_ZERO);
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      laEmitCall(laAsm, is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte) :
                                  reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      laEmitCall(laAsm, is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord) :
                                  reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      laEmitCall(laAsm, is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord) :
                                  reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  if (is_load)
  {
    // Extend/move the returned value into the destination register.
    const la_gpr_t dst = static_cast<la_gpr_t>(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? laEmitSExtB(laAsm, dst, RRET) : laEmitUExtB(laAsm, dst, RRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? laEmitSExtH(laAsm, dst, RRET) : laEmitUExtH(laAsm, dst, RRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst != RRET)
          la_or(laAsm, dst, RRET, LA_ZERO);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    Assert(laIsValidSImm12(-cycles_to_remove));
    la_ld_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
    la_addi_w(laAsm, RSCRATCH, RSCRATCH, -cycles_to_remove);
    la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
  }

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && laIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        la_ld_d(laAsm, static_cast<la_gpr_t>(i), LA_SP, stack_offset);
        stack_offset += GPR_SIZE;
      }
    }

    la_addi_d(laAsm, LA_SP, LA_SP, stack_size);
  }

  // Resume execution just past the patched fastmem access.
  laEmitJmp(laAsm, static_cast<const u8*>(code_address) + code_size);

  return static_cast<u32>(laAsm->cursor - laAsm->buffer);
}

#endif // CPU_ARCH_LOONGARCH64