stenzek/duckstation: blob/master/src/core/cpu_recompiler_loongarch64.cpp
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]> and contributors.
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "cpu_recompiler_loongarch64.h"
#include "cpu_code_cache_private.h"
#include "cpu_core_private.h"
#include "cpu_pgxp.h"
#include "gte.h"
#include "settings.h"
#include "timing_event.h"

#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "common/string_util.h"

#include <limits>

#ifdef CPU_ARCH_LOONGARCH64

LOG_CHANNEL(Recompiler);

#define OFFS(x) ((u32)(((u8*)(x)) - ((u8*)&g_state)))

static constexpr u32 BLOCK_LINK_SIZE = 8; // pcaddu18i + jirl

#define RRET LA_A0
#define RARG1 LA_A0
#define RARG2 LA_A1
#define RARG3 LA_A2
#define RSCRATCH LA_T8
#define RSTATE LA_S7
#define RMEMBASE LA_S8

static bool laIsCallerSavedRegister(u32 id);
static bool laIsValidSImm12(u32 imm);
static bool laIsValidUImm12(u32 imm);
static std::pair<s32, s32> laGetAddressImmediates12(const void* cur, const void* target);
static void laMoveAddressToReg(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr);
static void laEmitMov(lagoon_assembler_t* laAsm, la_gpr_t rd, u32 imm);
static void laEmitMov64(lagoon_assembler_t* laAsm, la_gpr_t rd, u64 imm);
static u32 laEmitJmp(lagoon_assembler_t* laAsm, const void* ptr, la_gpr_t link_reg = LA_ZERO);
static u32 laEmitCall(lagoon_assembler_t* laAsm, const void* ptr);
static void laEmitFarLoad(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, bool sign_extend_word = false);
static void laEmitFarStore(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, la_gpr_t tempreg = RSCRATCH);
static void laEmitSExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word
static void laEmitUExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word
static void laEmitSExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word
static void laEmitUExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> word
static void laEmitDSExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> doubleword
static void laEmitDUExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs); // -> doubleword

namespace CPU {

using namespace CPU;

LoongArch64Recompiler s_instance;
Recompiler* g_compiler = &s_instance;

} // namespace CPU

bool laIsCallerSavedRegister(u32 id)
{
  return id == 1 || (id >= 4 && id <= 20);
}

bool laIsValidSImm12(u32 imm)
{
  const s32 simm = static_cast<s32>(imm);
  return (simm >= -2048 && simm <= 2047);
}

bool laIsValidUImm12(u32 imm)
{
  return (imm <= 4095);
}

std::pair<s32, s32> laGetAddressImmediates12(const void* cur, const void* target)
{
  const s64 disp = static_cast<s64>(reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(cur));
  Assert(disp >= static_cast<s64>(std::numeric_limits<s32>::min()) &&
         disp <= static_cast<s64>(std::numeric_limits<s32>::max()));

  const s64 hi = disp + 0x800;
  const s64 lo = disp - (hi & 0xFFFFF000);
  return std::make_pair(static_cast<s32>(hi >> 12), static_cast<s32>((lo << 52) >> 52));
}
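
// NOTE: laGetAddressImmediates12() splits a PC-relative displacement into a high part for
// pcaddu12i and a signed low part for the following addi.d/ld/st. Adding half the low range
// (0x800) before shifting rounds the high part to the nearest 4 KiB page, so the remainder
// always fits in a simm12. Worked example: disp = 0x12FFF
//   hi = (0x12FFF + 0x800) >> 12 = 0x13   (selects page 0x13000)
//   lo = 0x12FFF - 0x13000       = -1     (fits in simm12)
// laGetAddressImmediates18() below applies the same trick with an 18-bit low part.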

std::pair<s32, s32> laGetAddressImmediates18(const void* cur, const void* target)
{
  const s64 disp = static_cast<s64>(reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(cur));
  Assert(disp >= static_cast<s64>(std::numeric_limits<s32>::min()) &&
         disp <= static_cast<s64>(std::numeric_limits<s32>::max()));

  const s64 hi = disp + 0x20000;
  const s64 lo = disp - (hi & 0xFFFC0000);
  return std::make_pair(static_cast<s32>(hi >> 18), static_cast<s32>((lo << 46) >> 46));
}

void laMoveAddressToReg(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr)
{
  const auto [hi, lo] = laGetAddressImmediates12(laAsm->cursor, addr);
  la_pcaddu12i(laAsm, reg, hi);
  la_addi_d(laAsm, reg, reg, lo);
}

void laEmitMov(lagoon_assembler_t* laAsm, la_gpr_t rd, u32 imm)
{
  la_load_immediate32(laAsm, rd, static_cast<s32>(imm));
}

void laEmitMov64(lagoon_assembler_t* laAsm, la_gpr_t rd, u64 imm)
{
  la_load_immediate64(laAsm, rd, static_cast<s64>(imm));
}

u32 laEmitJmp(lagoon_assembler_t* laAsm, const void* ptr, la_gpr_t link_reg)
{
  const auto [hi, lo] = laGetAddressImmediates18(laAsm->cursor, ptr);
  la_pcaddu18i(laAsm, RSCRATCH, hi);
  la_jirl(laAsm, link_reg, RSCRATCH, lo);
  return 8;
}
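
// NOTE: the pcaddu18i + jirl pair above covers any +/-2 GiB displacement (the range asserted
// by laGetAddressImmediates18), and is always exactly BLOCK_LINK_SIZE (8) bytes, which is
// what allows block links to be rewritten in place later by CodeCache::EmitJump().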

u32 laEmitCall(lagoon_assembler_t* laAsm, const void* ptr)
{
  return laEmitJmp(laAsm, ptr, LA_RA);
}

void laEmitFarLoad(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, bool sign_extend_word)
{
  const auto [hi, lo] = laGetAddressImmediates12(laAsm->cursor, addr);
  la_pcaddu12i(laAsm, reg, hi);
  if (sign_extend_word)
    la_ld_w(laAsm, reg, reg, lo);
  else
    la_ld_wu(laAsm, reg, reg, lo);
}

[[maybe_unused]] void laEmitFarStore(lagoon_assembler_t* laAsm, la_gpr_t reg, const void* addr, la_gpr_t tempreg)
{
  const auto [hi, lo] = laGetAddressImmediates12(laAsm->cursor, addr);
  la_pcaddu12i(laAsm, tempreg, hi);
  la_st_w(laAsm, reg, tempreg, lo);
}

void laEmitSExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)
{
  la_ext_w_b(laAsm, rd, rs);
}

void laEmitUExtB(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)
{
  la_andi(laAsm, rd, rs, 0xFF);
}

void laEmitSExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)
{
  la_ext_w_h(laAsm, rd, rs);
}

void laEmitUExtH(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)
{
  la_bstrpick_d(laAsm, rd, rs, 15, 0);
}

void laEmitDSExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)
{
  la_addi_w(laAsm, rd, rs, 0);
}
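
// NOTE: addi.w with a zero immediate is the standard LoongArch64 idiom for sign-extending
// the low 32 bits of a register into all 64 bits (the counterpart of RISC-V's sext.w);
// laEmitDUExtW() below uses bstrpick.d rd, rs, 31, 0 for the zero-extending variant.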

void laEmitDUExtW(lagoon_assembler_t* laAsm, la_gpr_t rd, la_gpr_t rs)
{
  la_bstrpick_d(laAsm, rd, rs, 31, 0);
}

void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
  const u32* code = static_cast<const u32*>(start);
  const u32 count = size / 4;
  char buf[256];
  for (u32 i = 0; i < count; i++)
  {
    lagoon_insn_t insn;
    la_disasm_one(*(code + i), &insn);
    la_insn_to_str(&insn, buf, sizeof(buf));
    INFO_LOG("\t0x{:016X}\t{}", reinterpret_cast<uintptr_t>(code + i), buf);
  }
#else
  ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
#endif
}

u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
  return size / 4;
#else
  ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
  return size / 4;
#endif
}

u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
  lagoon_assembler_t asm_obj;
  lagoon_assembler_t* laAsm = &asm_obj;
  la_init_assembler(laAsm, static_cast<u8*>(code), code_size);

  lagoon_label_t dispatch = {};
  lagoon_label_t run_events_and_dispatch = {};

  g_enter_recompiler = reinterpret_cast<decltype(g_enter_recompiler)>(laAsm->cursor);
  {
    // TODO: reserve some space for saving caller-saved registers

    // Need the CPU state for basically everything :-)
    laMoveAddressToReg(laAsm, RSTATE, &g_state);
    // Fastmem setup
    if (IsUsingFastmem())
      la_ld_d(laAsm, RMEMBASE, RSTATE, OFFS(&g_state.fastmem_base));

    // Fall through to event dispatcher
  }

  // run pending events (pending_ticks >= downcount) before falling through to the dispatcher
  {
    lagoon_label_t skip_event_check = {};
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.downcount));
    la_bltu(laAsm, RARG1, RARG2, la_label(laAsm, &skip_event_check));

    la_bind(laAsm, &run_events_and_dispatch);
    g_run_events_and_dispatch = laAsm->cursor;
    laEmitCall(laAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents));

    la_bind(laAsm, &skip_event_check);
    la_label_free(laAsm, &skip_event_check);
  }

  // TODO: align?
  g_dispatcher = laAsm->cursor;
  {
    la_bind(laAsm, &dispatch);

    // RARG2 <- g_code_lut[pc >> 16]
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));
    laMoveAddressToReg(laAsm, RARG3, g_code_lut.data());
    la_srli_w(laAsm, RARG2, RARG1, 16);
    la_slli_d(laAsm, RARG2, RARG2, 3);
    la_add_d(laAsm, RARG2, RARG2, RARG3);
    la_ld_d(laAsm, RARG2, RARG2, 0);
    la_slli_d(laAsm, RARG1, RARG1, 48); // idx = (pc & 0xFFFF) >> 2
    la_srli_d(laAsm, RARG1, RARG1, 50);
    la_slli_d(laAsm, RARG1, RARG1, 3);

    // jump to fast_map[idx]
    la_add_d(laAsm, RARG1, RARG1, RARG2);
    la_ld_d(laAsm, RARG1, RARG1, 0);
    la_jirl(laAsm, LA_ZERO, RARG1, 0);
  }
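
  // NOTE: the dispatcher performs a two-level lookup: g_code_lut, indexed by the top 16 bits
  // of the PC, yields a page of host code pointers, and (pc & 0xFFFF) >> 2 (one slot per
  // 32-bit MIPS instruction, scaled by 8 for pointer size) selects the entry within it.
  // The slli.d/srli.d 48/50 pair extracts that index without loading a mask constant.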

  g_compile_or_revalidate_block = laAsm->cursor;
  {
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));
    laEmitCall(laAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock));
    la_b(laAsm, la_label(laAsm, &dispatch));
  }

  g_discard_and_recompile_block = laAsm->cursor;
  {
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));
    laEmitCall(laAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock));
    la_b(laAsm, la_label(laAsm, &dispatch));
  }

  g_interpret_block = laAsm->cursor;
  {
    laEmitCall(laAsm, CodeCache::GetInterpretUncachedBlockFunction());
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.downcount));
    la_bge(laAsm, RARG1, RARG2, la_label(laAsm, &run_events_and_dispatch));
    la_b(laAsm, la_label(laAsm, &dispatch));
  }

  la_label_free(laAsm, &dispatch);
  la_label_free(laAsm, &run_events_and_dispatch);

  // TODO: align?

  return static_cast<u32>(laAsm->cursor - laAsm->buffer);
}

void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
  constexpr u8 padding_value = 0x00;
  std::memset(dst, padding_value, size);
}

u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
  {
    lagoon_assembler_t assembler;
    la_init_assembler(&assembler, static_cast<u8*>(code), BLOCK_LINK_SIZE);
    laEmitCall(&assembler, dst);

    DebugAssert(static_cast<size_t>(assembler.cursor - assembler.buffer) <= BLOCK_LINK_SIZE);
    if (la_get_remaining_buffer_size(&assembler) > 0)
      la_andi(&assembler, LA_ZERO, LA_ZERO, 0); // NOP
  }

  if (flush_icache)
    MemMap::FlushInstructionCache(code, BLOCK_LINK_SIZE);

  return BLOCK_LINK_SIZE;
}
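
// NOTE: EmitJump() overwrites the 8-byte pcaddu18i + jirl sequence of an existing block link
// in place, padding with a NOP if the emitted sequence comes up short, and flushes the
// instruction cache so the rewritten target is visible to subsequent fetches.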

CPU::LoongArch64Recompiler::LoongArch64Recompiler() = default;

CPU::LoongArch64Recompiler::~LoongArch64Recompiler() = default;

const void* CPU::LoongArch64Recompiler::GetCurrentCodePointer()
{
  return laAsm->cursor;
}

void CPU::LoongArch64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
                                       u8* far_code_buffer, u32 far_code_space)
{
  Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);

  DebugAssert(!laAsm);
  la_init_assembler(&m_emitter, code_buffer, code_buffer_space);
  la_init_assembler(&m_far_emitter, far_code_buffer, far_code_space);
  laAsm = &m_emitter;

  // Need to wipe it out so it's correct when toggling fastmem.
  m_host_regs = {};

  const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE : NUM_HOST_REGS;
  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    HostRegAlloc& hra = m_host_regs[i];

    // Reserved: zero(0), ra(1), tp(2), sp(3), r21(reserved)
    if (i == RARG1 || i == RARG2 || i == RARG3 || i == RSCRATCH || i == RSTATE || i == membase_idx || i < 4 || i == 21)
    {
      continue;
    }

    hra.flags = HR_USABLE | (laIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
  }
}

void CPU::LoongArch64Recompiler::SwitchToFarCode(bool emit_jump, LaBranchCondition cond, la_gpr_t rs1, la_gpr_t rs2)
{
  DebugAssert(laAsm == &m_emitter);
  if (emit_jump)
  {
    const void* target = m_far_emitter.cursor;
    if (cond != LaBranchCondition::None)
    {
      lagoon_label_t skip = {};
      switch (cond)
      {
        case LaBranchCondition::EQ:
          la_bne(laAsm, rs1, rs2, la_label(laAsm, &skip));
          break;
        case LaBranchCondition::NE:
          la_beq(laAsm, rs1, rs2, la_label(laAsm, &skip));
          break;
        case LaBranchCondition::LT:
          la_bge(laAsm, rs1, rs2, la_label(laAsm, &skip));
          break;
        case LaBranchCondition::GE:
          la_blt(laAsm, rs1, rs2, la_label(laAsm, &skip));
          break;
        case LaBranchCondition::LTU:
          la_bgeu(laAsm, rs1, rs2, la_label(laAsm, &skip));
          break;
        case LaBranchCondition::GEU:
          la_bltu(laAsm, rs1, rs2, la_label(laAsm, &skip));
          break;
        default:
          break;
      }
      laEmitJmp(laAsm, target);
      la_bind(laAsm, &skip);
      la_label_free(laAsm, &skip);
    }
    else
    {
      laEmitCall(laAsm, target);
    }
  }
  laAsm = &m_far_emitter;
}

void CPU::LoongArch64Recompiler::SwitchToNearCode(bool emit_jump)
{
  DebugAssert(laAsm == &m_far_emitter);
  if (emit_jump)
    laEmitJmp(laAsm, m_emitter.cursor);
  laAsm = &m_emitter;
}

void CPU::LoongArch64Recompiler::EmitMov(la_gpr_t dst, u32 val)
{
  laEmitMov(laAsm, dst, val);
}

void CPU::LoongArch64Recompiler::EmitCall(const void* ptr)
{
  laEmitCall(laAsm, ptr);
}

void CPU::LoongArch64Recompiler::SafeImmSImm12(la_gpr_t rd, la_gpr_t rs, u32 imm, LaRRSImmOp iop, LaRRROp rop)
{
  DebugAssert(rd != RSCRATCH && rs != RSCRATCH);

  if (laIsValidSImm12(imm))
  {
    iop(laAsm, rd, rs, imm);
    return;
  }

  laEmitMov(laAsm, RSCRATCH, imm);
  rop(laAsm, rd, rs, RSCRATCH);
}

void CPU::LoongArch64Recompiler::SafeImmUImm12(la_gpr_t rd, la_gpr_t rs, u32 imm, LaRRUImmOp iop, LaRRROp rop)
{
  DebugAssert(rd != RSCRATCH && rs != RSCRATCH);

  if (laIsValidUImm12(imm))
  {
    iop(laAsm, rd, rs, imm);
    return;
  }

  laEmitMov(laAsm, RSCRATCH, imm);
  rop(laAsm, rd, rs, RSCRATCH);
}

void CPU::LoongArch64Recompiler::SafeADDI(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmSImm12(rd, rs, imm, la_addi_d, la_add_d);
}

void CPU::LoongArch64Recompiler::SafeADDIW(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmSImm12(rd, rs, imm, la_addi_w, la_add_w);
}

void CPU::LoongArch64Recompiler::SafeSUBIW(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  const u32 nimm = static_cast<u32>(-static_cast<s32>(imm));
  SafeImmSImm12(rd, rs, nimm, la_addi_w, la_add_w);
}

void CPU::LoongArch64Recompiler::SafeANDI(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmUImm12(rd, rs, imm, la_andi, la_and);
}

void CPU::LoongArch64Recompiler::SafeORI(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmUImm12(rd, rs, imm, la_ori, la_or);
}

void CPU::LoongArch64Recompiler::SafeXORI(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmUImm12(rd, rs, imm, la_xori, la_xor);
}

void CPU::LoongArch64Recompiler::SafeSLTI(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmSImm12(rd, rs, imm, la_slti, la_slt);
}

void CPU::LoongArch64Recompiler::SafeSLTIU(la_gpr_t rd, la_gpr_t rs, u32 imm)
{
  SafeImmSImm12(rd, rs, imm, la_sltui, la_sltu);
}

void CPU::LoongArch64Recompiler::EmitSExtB(la_gpr_t rd, la_gpr_t rs)
{
  laEmitSExtB(laAsm, rd, rs);
}

void CPU::LoongArch64Recompiler::EmitUExtB(la_gpr_t rd, la_gpr_t rs)
{
  laEmitUExtB(laAsm, rd, rs);
}

void CPU::LoongArch64Recompiler::EmitSExtH(la_gpr_t rd, la_gpr_t rs)
{
  laEmitSExtH(laAsm, rd, rs);
}

void CPU::LoongArch64Recompiler::EmitUExtH(la_gpr_t rd, la_gpr_t rs)
{
  laEmitUExtH(laAsm, rd, rs);
}

void CPU::LoongArch64Recompiler::EmitDSExtW(la_gpr_t rd, la_gpr_t rs)
{
  laEmitDSExtW(laAsm, rd, rs);
}

void CPU::LoongArch64Recompiler::EmitDUExtW(la_gpr_t rd, la_gpr_t rs)
{
  laEmitDUExtW(laAsm, rd, rs);
}

void CPU::LoongArch64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
  // Load both base pointers up front to reduce code size; the compares below can then
  // address the buffers with small immediate offsets.
  laEmitMov64(laAsm, RARG1, static_cast<u64>(reinterpret_cast<uintptr_t>(ram_ptr)));
  laEmitMov64(laAsm, RARG2, static_cast<u64>(reinterpret_cast<uintptr_t>(shadow_ptr)));

  u32 offset = 0;
  lagoon_label_t block_changed = {};

  while (size >= 8)
  {
    la_ld_d(laAsm, RARG3, RARG1, offset);
    la_ld_d(laAsm, RSCRATCH, RARG2, offset);
    la_bne(laAsm, RARG3, RSCRATCH, la_label(laAsm, &block_changed));
    offset += 8;
    size -= 8;
  }

  while (size >= 4)
  {
    la_ld_w(laAsm, RARG3, RARG1, offset);
    la_ld_w(laAsm, RSCRATCH, RARG2, offset);
    la_bne(laAsm, RARG3, RSCRATCH, la_label(laAsm, &block_changed));
    offset += 4;
    size -= 4;
  }

  DebugAssert(size == 0);

  lagoon_label_t block_unchanged = {};
  la_b(laAsm, la_label(laAsm, &block_unchanged));
  la_bind(laAsm, &block_changed);
  laEmitJmp(laAsm, CodeCache::g_discard_and_recompile_block);
  la_bind(laAsm, &block_unchanged);
  la_label_free(laAsm, &block_changed);
  la_label_free(laAsm, &block_unchanged);
}
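
// NOTE: the generated check is a straight-line compare of the block's source RAM against the
// shadow copy captured at compile time, using 8-byte compares with a 4-byte tail (block sizes
// are always multiples of 4). Any mismatch jumps out to g_discard_and_recompile_block.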

void CPU::LoongArch64Recompiler::GenerateICacheCheckAndUpdate()
{
  if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
  {
    if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    {
      laEmitFarLoad(laAsm, RARG2, GetFetchMemoryAccessTimePtr());
      la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
      laEmitMov(laAsm, RARG3, m_block->size);
      la_mul_w(laAsm, RARG2, RARG2, RARG3);
      la_add_d(laAsm, RARG1, RARG1, RARG2);
      la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    }
    else
    {
      la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
      SafeADDIW(RARG1, RARG1, static_cast<u32>(m_block->uncached_fetch_ticks));
      la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    }
  }
  else if (m_block->icache_line_count > 0)
  {
    const auto& ticks_reg = RARG1;
    const auto& current_tag_reg = RARG2;
    const auto& existing_tag_reg = RARG3;

    // start of block, nothing should be using this
    const auto& maddr_reg = LA_T0;
    DebugAssert(!IsHostRegAllocated(maddr_reg));

    VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    la_ld_w(laAsm, ticks_reg, RSTATE, OFFS(&g_state.pending_ticks));
    laEmitMov(laAsm, current_tag_reg, current_pc);

    for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    {
      const TickCount fill_ticks = GetICacheFillTicks(current_pc);
      if (fill_ticks <= 0)
        continue;

      const u32 line = GetICacheLine(current_pc);
      const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));

      // Offsets must fit in signed 12 bits.
      lagoon_label_t cache_hit = {};
      if (offset >= 2048)
      {
        SafeADDI(maddr_reg, RSTATE, offset);
        la_ld_w(laAsm, existing_tag_reg, maddr_reg, 0);
        la_beq(laAsm, existing_tag_reg, current_tag_reg, la_label(laAsm, &cache_hit));
        la_st_w(laAsm, current_tag_reg, maddr_reg, 0);
      }
      else
      {
        la_ld_w(laAsm, existing_tag_reg, RSTATE, offset);
        la_beq(laAsm, existing_tag_reg, current_tag_reg, la_label(laAsm, &cache_hit));
        la_st_w(laAsm, current_tag_reg, RSTATE, offset);
      }

      SafeADDIW(ticks_reg, ticks_reg, static_cast<u32>(fill_ticks));
      la_bind(laAsm, &cache_hit);
      la_label_free(laAsm, &cache_hit);

      if (i != (m_block->icache_line_count - 1))
        SafeADDIW(current_tag_reg, current_tag_reg, ICACHE_LINE_SIZE);
    }

    la_st_w(laAsm, ticks_reg, RSTATE, OFFS(&g_state.pending_ticks));
  }
}
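
// NOTE: the instruction-cache path above simulates the PSX I-cache tag per cache line: when
// the stored tag already matches the line's address the fill cost is skipped, otherwise the
// tag is updated and GetICacheFillTicks() cycles are added to pending_ticks.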

void CPU::LoongArch64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
                                              s32 arg3reg /*= -1*/)
{
  if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1))
    la_or(laAsm, RARG1, static_cast<la_gpr_t>(arg1reg), LA_ZERO);
  if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2))
    la_or(laAsm, RARG2, static_cast<la_gpr_t>(arg2reg), LA_ZERO);
  if (arg3reg >= 0 && arg3reg != static_cast<s32>(RARG3))
    la_or(laAsm, RARG3, static_cast<la_gpr_t>(arg3reg), LA_ZERO);
  EmitCall(func);
}

void CPU::LoongArch64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
{
  if (newpc.has_value())
  {
    if (m_dirty_pc || m_compiler_pc != newpc)
    {
      EmitMov(RSCRATCH, newpc.value());
      la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pc));
    }
  }
  m_dirty_pc = false;

  // flush regs
  Flush(FLUSH_END_BLOCK);
  EndAndLinkBlock(newpc, do_event_test, false);
}

void CPU::LoongArch64Recompiler::EndBlockWithException(Exception excode)
{
  // flush regs, but not pc, it's going to get overwritten
  // flush cycles because of the GTE instruction stuff...
  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // TODO: flush load delay

  EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
                                                             inst->cop.cop_n));
  EmitMov(RARG2, m_current_instruction_pc);
  if (excode != Exception::BP)
  {
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
  }
  else
  {
    EmitMov(RARG3, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
  }
  m_dirty_pc = false;

  EndAndLinkBlock(std::nullopt, true, false);
}

void CPU::LoongArch64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,
                                                 bool force_run_events)
{
  // event test
  // pc should've been flushed
  DebugAssert(!m_dirty_pc && !m_block_ended);
  m_block_ended = true;

  // TODO: try extracting this to a function
  // TODO: move the cycle flush in here..

  // save cycles for event test
  const TickCount cycles = std::exchange(m_cycles, 0);

  // pending_ticks += cycles
  // if (pending_ticks >= downcount) { dispatch_event(); }
  if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
  if (do_event_test)
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.downcount));
  if (cycles > 0)
  {
    SafeADDIW(RARG1, RARG1, cycles);
    la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
  }
  if (m_gte_done_cycle > cycles)
  {
    SafeADDIW(RARG3, RARG1, m_gte_done_cycle - cycles);
    la_st_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_completion_tick));
  }

  if (do_event_test)
  {
    // TODO: see if we can do a far jump somehow with this..
    lagoon_label_t cont = {};
    la_blt(laAsm, RARG1, RARG2, la_label(laAsm, &cont));
    laEmitJmp(laAsm, CodeCache::g_run_events_and_dispatch);
    la_bind(laAsm, &cont);
    la_label_free(laAsm, &cont);
  }

  // jump to dispatcher or next block
  if (force_run_events)
  {
    laEmitJmp(laAsm, CodeCache::g_run_events_and_dispatch);
  }
  else if (!newpc.has_value())
  {
    laEmitJmp(laAsm, CodeCache::g_dispatcher);
  }
  else
  {
    const void* target = (newpc.value() == m_block->pc) ?
                           CodeCache::CreateSelfBlockLink(m_block, laAsm->cursor, laAsm->buffer) :
                           CodeCache::CreateBlockLink(m_block, laAsm->cursor, newpc.value());
    laEmitJmp(laAsm, target);
  }
}
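
// NOTE: when the successor PC is known at compile time, EndAndLinkBlock() requests a link
// slot from the code cache (CreateBlockLink/CreateSelfBlockLink) and jumps through it; the
// cache can later backpatch that 8-byte jump to point directly at the compiled successor,
// bypassing the dispatcher.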
739
740
const void* CPU::LoongArch64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
741
{
742
u8* const code = m_emitter.buffer;
743
*code_size = static_cast<u32>(m_emitter.cursor - m_emitter.buffer);
744
*far_code_size = static_cast<u32>(m_far_emitter.cursor - m_far_emitter.buffer);
745
laAsm = nullptr;
746
return code;
747
}
748
749
const char* CPU::LoongArch64Recompiler::GetHostRegName(u32 reg) const
750
{
751
static constexpr std::array<const char*, 32> reg64_names = {
752
{"$zero", "$ra", "$tp", "$sp", "$a0", "$a1", "$a2", "$a3", "$a4", "$a5", "$a6",
753
"$a7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8", "$r21",
754
"$fp", "$s0", "$s1", "$s2", "$s3", "$s4", "$s5", "$s6", "$s7", "$s8"}};
755
return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
756
}
757
758
void CPU::LoongArch64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
759
{
760
EmitMov(static_cast<la_gpr_t>(reg), val);
761
}
762
763
void CPU::LoongArch64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
764
{
765
la_ld_w(laAsm, static_cast<la_gpr_t>(reg), RSTATE, OFFS(ptr));
766
}
767
768
void CPU::LoongArch64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
769
{
770
la_st_w(laAsm, static_cast<la_gpr_t>(reg), RSTATE, OFFS(ptr));
771
}
772
773
void CPU::LoongArch64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
774
{
775
if (val == 0)
776
{
777
la_st_w(laAsm, LA_ZERO, RSTATE, OFFS(ptr));
778
return;
779
}
780
781
EmitMov(RSCRATCH, val);
782
la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(ptr));
783
}
784
785
void CPU::LoongArch64Recompiler::CopyHostReg(u32 dst, u32 src)
786
{
787
if (src != dst)
788
la_or(laAsm, static_cast<la_gpr_t>(dst), static_cast<la_gpr_t>(src), LA_ZERO);
789
}
790
791
void CPU::LoongArch64Recompiler::AssertRegOrConstS(CompileFlags cf) const
792
{
793
DebugAssert(cf.valid_host_s || cf.const_s);
794
}
795
796
void CPU::LoongArch64Recompiler::AssertRegOrConstT(CompileFlags cf) const
797
{
798
DebugAssert(cf.valid_host_t || cf.const_t);
799
}
800
801
la_gpr_t CPU::LoongArch64Recompiler::CFGetSafeRegS(CompileFlags cf, la_gpr_t temp_reg)
802
{
803
if (cf.valid_host_s)
804
{
805
return static_cast<la_gpr_t>(cf.host_s);
806
}
807
else if (cf.const_s)
808
{
809
if (HasConstantRegValue(cf.MipsS(), 0))
810
return LA_ZERO;
811
812
EmitMov(temp_reg, GetConstantRegU32(cf.MipsS()));
813
return temp_reg;
814
}
815
else
816
{
817
WARNING_LOG("Hit memory path in CFGetSafeRegS() for {}", GetRegName(cf.MipsS()));
818
la_ld_w(laAsm, temp_reg, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));
819
return temp_reg;
820
}
821
}
822
823
la_gpr_t CPU::LoongArch64Recompiler::CFGetSafeRegT(CompileFlags cf, la_gpr_t temp_reg)
824
{
825
if (cf.valid_host_t)
826
{
827
return static_cast<la_gpr_t>(cf.host_t);
828
}
829
else if (cf.const_t)
830
{
831
if (HasConstantRegValue(cf.MipsT(), 0))
832
return LA_ZERO;
833
834
EmitMov(temp_reg, GetConstantRegU32(cf.MipsT()));
835
return temp_reg;
836
}
837
else
838
{
839
WARNING_LOG("Hit memory path in CFGetSafeRegT() for {}", GetRegName(cf.MipsT()));
840
la_ld_w(laAsm, temp_reg, RSTATE, OFFS(&g_state.regs.r[cf.mips_t]));
841
return temp_reg;
842
}
843
}
844
845
la_gpr_t CPU::LoongArch64Recompiler::CFGetRegD(CompileFlags cf) const
846
{
847
DebugAssert(cf.valid_host_d);
848
return static_cast<la_gpr_t>(cf.host_d);
849
}
850
851
la_gpr_t CPU::LoongArch64Recompiler::CFGetRegS(CompileFlags cf) const
852
{
853
DebugAssert(cf.valid_host_s);
854
return static_cast<la_gpr_t>(cf.host_s);
855
}
856
857
la_gpr_t CPU::LoongArch64Recompiler::CFGetRegT(CompileFlags cf) const
858
{
859
DebugAssert(cf.valid_host_t);
860
return static_cast<la_gpr_t>(cf.host_t);
861
}
862
863
la_gpr_t CPU::LoongArch64Recompiler::CFGetRegLO(CompileFlags cf) const
864
{
865
DebugAssert(cf.valid_host_lo);
866
return static_cast<la_gpr_t>(cf.host_lo);
867
}
868
869
la_gpr_t CPU::LoongArch64Recompiler::CFGetRegHI(CompileFlags cf) const
870
{
871
DebugAssert(cf.valid_host_hi);
872
return static_cast<la_gpr_t>(cf.host_hi);
873
}
874
875
void CPU::LoongArch64Recompiler::MoveSToReg(la_gpr_t dst, CompileFlags cf)
876
{
877
if (cf.valid_host_s)
878
{
879
if (cf.host_s != dst)
880
la_or(laAsm, dst, static_cast<la_gpr_t>(cf.host_s), LA_ZERO);
881
}
882
else if (cf.const_s)
883
{
884
EmitMov(dst, GetConstantRegU32(cf.MipsS()));
885
}
886
else
887
{
888
WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
889
la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));
890
}
891
}
892
893
void CPU::LoongArch64Recompiler::MoveTToReg(la_gpr_t dst, CompileFlags cf)
894
{
895
if (cf.valid_host_t)
896
{
897
if (cf.host_t != dst)
898
la_or(laAsm, dst, static_cast<la_gpr_t>(cf.host_t), LA_ZERO);
899
}
900
else if (cf.const_t)
901
{
902
EmitMov(dst, GetConstantRegU32(cf.MipsT()));
903
}
904
else
905
{
906
WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
907
la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_t]));
908
}
909
}
910
911
void CPU::LoongArch64Recompiler::MoveMIPSRegToReg(la_gpr_t dst, Reg reg, bool ignore_load_delays)
912
{
913
DebugAssert(reg < Reg::count);
914
if (ignore_load_delays && m_load_delay_register == reg)
915
{
916
if (m_load_delay_value_register == NUM_HOST_REGS)
917
la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.load_delay_value));
918
else
919
la_or(laAsm, dst, static_cast<la_gpr_t>(m_load_delay_value_register), LA_ZERO);
920
}
921
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
922
{
923
la_or(laAsm, dst, static_cast<la_gpr_t>(hreg.value()), LA_ZERO);
924
}
925
else if (HasConstantReg(reg))
926
{
927
EmitMov(dst, GetConstantRegU32(reg));
928
}
929
else
930
{
931
la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[static_cast<u8>(reg)]));
932
}
933
}
934
935
void CPU::LoongArch64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
936
Reg arg2reg /* = Reg::count */,
937
Reg arg3reg /* = Reg::count */)
938
{
939
DebugAssert(g_settings.gpu_pgxp_enable);
940
941
Flush(FLUSH_FOR_C_CALL);
942
943
if (arg2reg != Reg::count)
944
MoveMIPSRegToReg(RARG2, arg2reg);
945
if (arg3reg != Reg::count)
946
MoveMIPSRegToReg(RARG3, arg3reg);
947
948
EmitMov(RARG1, arg1val);
949
EmitCall(func);
950
}
951
952
void CPU::LoongArch64Recompiler::Flush(u32 flags)
953
{
954
Recompiler::Flush(flags);
955
956
if (flags & FLUSH_PC && m_dirty_pc)
957
{
958
StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
959
m_dirty_pc = false;
960
}
961
962
if (flags & FLUSH_INSTRUCTION_BITS)
963
{
964
// This sucks, but it's only used for fallbacks.
965
Panic("Not implemented");
966
}
967
968
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
969
{
970
// This sucks :(
971
// TODO: make it a function?
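    // The sequence below computes &g_state.regs.r[load_delay_reg] by scaling the register
    // index by 4 and adding it to the state pointer, stores the delayed value through it,
    // then writes Reg::count to load_delay_reg to mark the delay slot as empty.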
    la_ld_bu(laAsm, RARG1, RSTATE, OFFS(&g_state.load_delay_reg));
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.load_delay_value));
    la_slli_d(laAsm, RARG1, RARG1, 2); // *4
    la_add_d(laAsm, RARG1, RARG1, RSTATE);
    la_st_w(laAsm, RARG2, RARG1, OFFSETOF(CPU::State, regs.r[0]));
    la_addi_d(laAsm, RSCRATCH, LA_ZERO, static_cast<u8>(Reg::count));
    la_st_b(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.load_delay_reg));
    m_load_delay_dirty = false;
  }

  if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
  {
    if (m_load_delay_value_register != NUM_HOST_REGS)
      FreeHostReg(m_load_delay_value_register);

    EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));
    la_st_b(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.load_delay_reg));
    m_load_delay_register = Reg::count;
    m_load_delay_dirty = true;
  }

  if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
  {
    // May as well flush cycles while we're here.
    // GTE spanning blocks is very rare, we _could_ disable this for speed.
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_completion_tick));
    if (m_cycles > 0)
    {
      SafeADDIW(RARG1, RARG1, m_cycles);
      la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
      m_cycles = 0;
    }

    lagoon_label_t no_stall = {};
    la_bge(laAsm, RARG1, RARG2, la_label(laAsm, &no_stall));
    la_or(laAsm, RARG1, RARG2, LA_ZERO);
    la_bind(laAsm, &no_stall);
    la_label_free(laAsm, &no_stall);
    la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    m_dirty_gte_done_cycle = false;
  }

  if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
  {
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));

    // update cycles at the same time
    if (flags & FLUSH_CYCLES && m_cycles > 0)
    {
      SafeADDIW(RARG1, RARG1, m_cycles);
      la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
      m_gte_done_cycle -= m_cycles;
      m_cycles = 0;
    }

    SafeADDIW(RARG1, RARG1, m_gte_done_cycle);
    la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.gte_completion_tick));
    m_gte_done_cycle = 0;
    m_dirty_gte_done_cycle = true;
  }

  if (flags & FLUSH_CYCLES && m_cycles > 0)
  {
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    SafeADDIW(RARG1, RARG1, m_cycles);
    la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pending_ticks));
    m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    m_cycles = 0;
  }
}

void CPU::LoongArch64Recompiler::Compile_Fallback()
{
  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
              inst->bits);

  Flush(FLUSH_FOR_INTERPRETER);

  Panic("Fixme");
}

void CPU::LoongArch64Recompiler::CheckBranchTarget(la_gpr_t pcreg)
{
  if (!g_settings.cpu_recompiler_memory_exceptions)
    return;

  DebugAssert(pcreg != RSCRATCH);
  la_andi(laAsm, RSCRATCH, pcreg, 0x3);
  SwitchToFarCode(true, LaBranchCondition::NE, RSCRATCH, LA_ZERO);

  BackupHostState();
  EndBlockWithException(Exception::AdEL);

  RestoreHostState();
  SwitchToNearCode(false);
}

void CPU::LoongArch64Recompiler::Compile_jr(CompileFlags cf)
{
  const la_gpr_t pcreg = CFGetRegS(cf);
  CheckBranchTarget(pcreg);

  la_st_w(laAsm, pcreg, RSTATE, OFFS(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::LoongArch64Recompiler::Compile_jalr(CompileFlags cf)
{
  const la_gpr_t pcreg = CFGetRegS(cf);
  if (MipsD() != Reg::zero)
    SetConstantReg(MipsD(), GetBranchReturnAddress(cf));

  CheckBranchTarget(pcreg);
  la_st_w(laAsm, pcreg, RSTATE, OFFS(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::LoongArch64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
{
  AssertRegOrConstS(cf);

  const u32 taken_pc = GetConditionalBranchTarget(cf);

  Flush(FLUSH_FOR_BRANCH);

  DebugAssert(cf.valid_host_s);

  // MipsT() here should equal zero for zero branches.
  DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);

  lagoon_label_t taken = {};
  const la_gpr_t rs = CFGetRegS(cf);
  switch (cond)
  {
    case BranchCondition::Equal:
    case BranchCondition::NotEqual:
    {
      AssertRegOrConstT(cf);
      if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
      {
        (cond == BranchCondition::Equal) ? la_beqz(laAsm, rs, la_label(laAsm, &taken)) :
                                           la_bnez(laAsm, rs, la_label(laAsm, &taken));
      }
      else
      {
        const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG1;
        if (!cf.valid_host_t)
          MoveTToReg(RARG1, cf);
        if (cond == Recompiler::BranchCondition::Equal)
          la_beq(laAsm, rs, rt, la_label(laAsm, &taken));
        else
          la_bne(laAsm, rs, rt, la_label(laAsm, &taken));
      }
    }
    break;

    case BranchCondition::GreaterThanZero:
    {
      la_blt(laAsm, LA_ZERO, rs, la_label(laAsm, &taken));
    }
    break;

    case BranchCondition::GreaterEqualZero:
    {
      la_bge(laAsm, rs, LA_ZERO, la_label(laAsm, &taken));
    }
    break;

    case BranchCondition::LessThanZero:
    {
      la_blt(laAsm, rs, LA_ZERO, la_label(laAsm, &taken));
    }
    break;

    case BranchCondition::LessEqualZero:
    {
      la_bge(laAsm, LA_ZERO, rs, la_label(laAsm, &taken));
    }
    break;
  }

  BackupHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(m_compiler_pc, true);

  la_bind(laAsm, &taken);
  la_label_free(laAsm, &taken);

  RestoreHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(taken_pc, true);
}
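
// NOTE: Compile_bxx() emits both exits of the block: the fall-through (not-taken) path ends
// the block at m_compiler_pc, then the taken label restores the backed-up register-cache
// state and ends the block again at taken_pc, so each exit sees a consistent host state.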

void CPU::LoongArch64Recompiler::Compile_addi(CompileFlags cf, bool overflow)
{
  const la_gpr_t rs = CFGetRegS(cf);
  const la_gpr_t rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_sext32(); imm != 0)
  {
    if (!overflow)
    {
      SafeADDIW(rt, rs, imm);
    }
    else
    {
      SafeADDI(RARG1, rs, imm);
      SafeADDIW(rt, rs, imm);
      TestOverflow(RARG1, rt, rt);
    }
  }
  else if (rt != rs)
  {
    la_or(laAsm, rt, rs, LA_ZERO);
  }
}

void CPU::LoongArch64Recompiler::Compile_addi(CompileFlags cf)
{
  Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::LoongArch64Recompiler::Compile_addiu(CompileFlags cf)
{
  Compile_addi(cf, false);
}

void CPU::LoongArch64Recompiler::Compile_slti(CompileFlags cf)
{
  Compile_slti(cf, true);
}

void CPU::LoongArch64Recompiler::Compile_sltiu(CompileFlags cf)
{
  Compile_slti(cf, false);
}

void CPU::LoongArch64Recompiler::Compile_slti(CompileFlags cf, bool sign)
{
  if (sign)
    SafeSLTI(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32());
  else
    SafeSLTIU(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32());
}

void CPU::LoongArch64Recompiler::Compile_andi(CompileFlags cf)
{
  const la_gpr_t rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    SafeANDI(rt, CFGetRegS(cf), imm);
  else
    EmitMov(rt, 0);
}

void CPU::LoongArch64Recompiler::Compile_ori(CompileFlags cf)
{
  const la_gpr_t rt = CFGetRegT(cf);
  const la_gpr_t rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    SafeORI(rt, rs, imm);
  else if (rt != rs)
    la_or(laAsm, rt, rs, LA_ZERO);
}

void CPU::LoongArch64Recompiler::Compile_xori(CompileFlags cf)
{
  const la_gpr_t rt = CFGetRegT(cf);
  const la_gpr_t rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    SafeXORI(rt, rs, imm);
  else if (rt != rs)
    la_or(laAsm, rt, rs, LA_ZERO);
}

void CPU::LoongArch64Recompiler::Compile_shift(CompileFlags cf, LaRRROp op, LaRRUImmOp op_const)
{
  const la_gpr_t rd = CFGetRegD(cf);
  const la_gpr_t rt = CFGetRegT(cf);
  if (inst->r.shamt > 0)
    op_const(laAsm, rd, rt, inst->r.shamt);
  else if (rd != rt)
    la_or(laAsm, rd, rt, LA_ZERO);
}

void CPU::LoongArch64Recompiler::Compile_sll(CompileFlags cf)
{
  Compile_shift(cf, la_sll_w, la_slli_w);
}

void CPU::LoongArch64Recompiler::Compile_srl(CompileFlags cf)
{
  Compile_shift(cf, la_srl_w, la_srli_w);
}

void CPU::LoongArch64Recompiler::Compile_sra(CompileFlags cf)
{
  Compile_shift(cf, la_sra_w, la_srai_w);
}

void CPU::LoongArch64Recompiler::Compile_variable_shift(CompileFlags cf, LaRRROp op, LaRRUImmOp op_const)
{
  const la_gpr_t rd = CFGetRegD(cf);

  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  if (cf.const_s)
  {
    if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
      op_const(laAsm, rd, rt, shift & 31u);
    else if (rd != rt)
      la_or(laAsm, rd, rt, LA_ZERO);
  }
  else
  {
    op(laAsm, rd, rt, CFGetRegS(cf));
  }
}

void CPU::LoongArch64Recompiler::Compile_sllv(CompileFlags cf)
{
  Compile_variable_shift(cf, la_sll_w, la_slli_w);
}

void CPU::LoongArch64Recompiler::Compile_srlv(CompileFlags cf)
{
  Compile_variable_shift(cf, la_srl_w, la_srli_w);
}

void CPU::LoongArch64Recompiler::Compile_srav(CompileFlags cf)
{
  Compile_variable_shift(cf, la_sra_w, la_srai_w);
}

void CPU::LoongArch64Recompiler::Compile_mult(CompileFlags cf, bool sign)
{
  const la_gpr_t rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  // TODO: if lo/hi gets killed, we can use a 32-bit multiply
  const la_gpr_t lo = CFGetRegLO(cf);
  const la_gpr_t hi = CFGetRegHI(cf);

  if (sign)
  {
    la_mul_d(laAsm, lo, rs, rt);
    la_srai_d(laAsm, hi, lo, 32);
    EmitDSExtW(lo, lo);
  }
  else
  {
    EmitDUExtW(RARG1, rs);
    EmitDUExtW(RARG2, rt);
    la_mul_d(laAsm, lo, RARG1, RARG2);
    la_srai_d(laAsm, hi, lo, 32);
    EmitDSExtW(lo, lo);
  }
}
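
// NOTE: host registers are assumed to hold the 32-bit MIPS values sign-extended to 64 bits,
// so a single mul.d yields the full 64-bit product; srai.d by 32 extracts HI, and the
// addi.w-based sign extension canonicalizes LO. The unsigned variant zero-extends both
// operands first and then reuses the same 64-bit multiply.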

void CPU::LoongArch64Recompiler::Compile_mult(CompileFlags cf)
{
  Compile_mult(cf, true);
}

void CPU::LoongArch64Recompiler::Compile_multu(CompileFlags cf)
{
  Compile_mult(cf, false);
}

void CPU::LoongArch64Recompiler::Compile_div(CompileFlags cf)
{
  const la_gpr_t rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const la_gpr_t rlo = CFGetRegLO(cf);
  const la_gpr_t rhi = CFGetRegHI(cf);

  lagoon_label_t done = {};
  lagoon_label_t not_divide_by_zero = {};
  la_bnez(laAsm, rt, la_label(laAsm, &not_divide_by_zero));
  la_or(laAsm, rhi, rs, LA_ZERO); // hi = num
  la_srai_d(laAsm, rlo, rs, 63);
  la_andi(laAsm, rlo, rlo, 2);
  la_addi_d(laAsm, rlo, rlo, -1); // lo = s >= 0 ? -1 : 1
  la_b(laAsm, la_label(laAsm, &done));

  la_bind(laAsm, &not_divide_by_zero);
  la_label_free(laAsm, &not_divide_by_zero);

  lagoon_label_t not_unrepresentable = {};
  EmitMov(RSCRATCH, static_cast<u32>(-1));
  la_bne(laAsm, rt, RSCRATCH, la_label(laAsm, &not_unrepresentable));
  EmitMov(rlo, 0x80000000u);
  la_bne(laAsm, rs, rlo, la_label(laAsm, &not_unrepresentable));
  EmitMov(rhi, 0);
  la_b(laAsm, la_label(laAsm, &done));

  la_bind(laAsm, &not_unrepresentable);
  la_label_free(laAsm, &not_unrepresentable);

  la_div_w(laAsm, rlo, rs, rt);
  la_mod_w(laAsm, rhi, rs, rt);

  la_bind(laAsm, &done);
  la_label_free(laAsm, &done);
}
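
// NOTE: the two guarded paths above reproduce the R3000A's divide edge cases, which the
// hardware divider does not trap on: division by zero yields lo = (rs >= 0 ? -1 : 1) with
// hi = rs, and 0x80000000 / -1 yields lo = 0x80000000 with hi = 0. The srai.d/andi/addi.d
// triple computes the divide-by-zero quotient branchlessly from the sign of rs.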

void CPU::LoongArch64Recompiler::Compile_divu(CompileFlags cf)
{
  const la_gpr_t rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const la_gpr_t rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const la_gpr_t rlo = CFGetRegLO(cf);
  const la_gpr_t rhi = CFGetRegHI(cf);

  // Semantics match? :-)
  la_div_wu(laAsm, rlo, rs, rt);
  la_mod_wu(laAsm, rhi, rs, rt);
}

void CPU::LoongArch64Recompiler::TestOverflow(la_gpr_t long_res, la_gpr_t res, la_gpr_t reg_to_discard)
{
  SwitchToFarCode(true, LaBranchCondition::NE, long_res, res);

  BackupHostState();

  // toss the result
  ClearHostReg(reg_to_discard);

  EndBlockWithException(Exception::Ov);

  RestoreHostState();

  SwitchToNearCode(false);
}

void CPU::LoongArch64Recompiler::Compile_dst_op(CompileFlags cf, LaRRROp op,
                                                void (LoongArch64Recompiler::*op_const)(la_gpr_t rd, la_gpr_t rs,
                                                                                        u32 imm),
                                                LaRRROp op_long, bool commutative, bool overflow)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const la_gpr_t rd = CFGetRegD(cf);

  if (overflow)
  {
    const la_gpr_t rs = CFGetSafeRegS(cf, RARG1);
    const la_gpr_t rt = CFGetSafeRegT(cf, RARG2);
    op_long(laAsm, RARG3, rs, rt);
    op(laAsm, rd, rs, rt);
    TestOverflow(RARG3, rd, rd);
    return;
  }

  if (cf.valid_host_s && cf.valid_host_t)
  {
    op(laAsm, rd, CFGetRegS(cf), CFGetRegT(cf));
  }
  else if (commutative && (cf.const_s || cf.const_t))
  {
    const la_gpr_t src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (this->*op_const)(rd, src, cv);
    }
    else
    {
      if (rd != src)
        la_or(laAsm, rd, src, LA_ZERO);
      overflow = false;
    }
  }
  else if (cf.const_s)
  {
    if (HasConstantRegValue(cf.MipsS(), 0))
    {
      op(laAsm, rd, LA_ZERO, CFGetRegT(cf));
    }
    else
    {
      EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));
      op(laAsm, rd, RSCRATCH, CFGetRegT(cf));
    }
  }
  else if (cf.const_t)
  {
    const la_gpr_t rs = CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (this->*op_const)(rd, rs, cv);
    }
    else
    {
      if (rd != rs)
        la_or(laAsm, rd, rs, LA_ZERO);
      overflow = false;
    }
  }
}

void CPU::LoongArch64Recompiler::Compile_add(CompileFlags cf)
{
  Compile_dst_op(cf, la_add_w, &LoongArch64Recompiler::SafeADDIW, la_add_d, true,
                 g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::LoongArch64Recompiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, la_add_w, &LoongArch64Recompiler::SafeADDIW, la_add_d, true, false);
}

void CPU::LoongArch64Recompiler::Compile_sub(CompileFlags cf)
{
  Compile_dst_op(cf, la_sub_w, &LoongArch64Recompiler::SafeSUBIW, la_sub_d, false,
                 g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::LoongArch64Recompiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, la_sub_w, &LoongArch64Recompiler::SafeSUBIW, la_sub_d, false, false);
}

void CPU::LoongArch64Recompiler::Compile_and(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // special cases - and with self -> self, and with 0 -> 0
  const la_gpr_t regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    la_or(laAsm, regd, CFGetRegS(cf), LA_ZERO);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    EmitMov(regd, 0);
    return;
  }

  Compile_dst_op(cf, la_and, &LoongArch64Recompiler::SafeANDI, la_and, true, false);
}

void CPU::LoongArch64Recompiler::Compile_or(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // or/nor with 0 -> no effect
  const la_gpr_t regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, la_or, &LoongArch64Recompiler::SafeORI, la_or, true, false);
}

void CPU::LoongArch64Recompiler::Compile_xor(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const la_gpr_t regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    EmitMov(regd, 0);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, la_xor, &LoongArch64Recompiler::SafeXORI, la_xor, true, false);
}

void CPU::LoongArch64Recompiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  la_nor(laAsm, CFGetRegD(cf), CFGetRegD(cf), LA_ZERO);
}

void CPU::LoongArch64Recompiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::LoongArch64Recompiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::LoongArch64Recompiler::Compile_slt(CompileFlags cf, bool sign)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const la_gpr_t rd = CFGetRegD(cf);
  const la_gpr_t rs = CFGetSafeRegS(cf, RARG1);

  if (cf.const_t && laIsValidSImm12(GetConstantRegU32(cf.MipsT())))
  {
    if (sign)
      la_slti(laAsm, rd, rs, GetConstantRegS32(cf.MipsT()));
    else
      la_sltui(laAsm, rd, rs, GetConstantRegS32(cf.MipsT()));
  }
  else
  {
    const la_gpr_t rt = CFGetSafeRegT(cf, RARG2);
    if (sign)
      la_slt(laAsm, rd, rs, rt);
    else
      la_sltu(laAsm, rd, rs, rt);
  }
}

la_gpr_t CPU::LoongArch64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf,
                                                                const std::optional<VirtualMemoryAddress>& address,
                                                                const std::optional<la_gpr_t>& reg)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const la_gpr_t dst = reg.has_value() ? reg.value() : RARG1;
  if (address.has_value())
  {
    EmitMov(dst, address.value());
  }
  else if (imm == 0)
  {
    if (cf.valid_host_s)
    {
      if (const la_gpr_t src = CFGetRegS(cf); src != dst)
        la_or(laAsm, dst, src, LA_ZERO);
    }
    else
    {
      la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));
    }
  }
  else
  {
    if (cf.valid_host_s)
    {
      SafeADDIW(dst, CFGetRegS(cf), inst->i.imm_sext32());
    }
    else
    {
      la_ld_w(laAsm, dst, RSTATE, OFFS(&g_state.regs.r[cf.mips_s]));
      SafeADDIW(dst, dst, inst->i.imm_sext32());
    }
  }

  return dst;
}

template<typename RegAllocFn>
la_gpr_t CPU::LoongArch64Recompiler::GenerateLoad(la_gpr_t addr_reg, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                  const RegAllocFn& dst_reg_alloc)
{
  if (use_fastmem)
  {
    m_cycles += Bus::RAM_READ_TICKS;

    // TODO: Make this better. If we're loading the address from state, we can use LD_WU instead, and skip this.
    // TODO: LUT fastmem
    const la_gpr_t dst = dst_reg_alloc();
    // Zero-extend address to 64-bit
    la_bstrpick_d(laAsm, RSCRATCH, addr_reg, 31, 0);

    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg != RARG3);
      la_srli_d(laAsm, RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT);
      la_slli_d(laAsm, RARG3, RARG3, 3);
      la_add_d(laAsm, RARG3, RARG3, RMEMBASE);
      la_ld_d(laAsm, RARG3, RARG3, 0);
      la_add_d(laAsm, RSCRATCH, RSCRATCH, RARG3);
    }
    else
    {
      la_add_d(laAsm, RSCRATCH, RSCRATCH, RMEMBASE);
    }

    u8* start = laAsm->cursor;
    switch (size)
    {
      case MemoryAccessSize::Byte:
        sign ? la_ld_b(laAsm, dst, RSCRATCH, 0) : la_ld_bu(laAsm, dst, RSCRATCH, 0);
        break;

      case MemoryAccessSize::HalfWord:
        sign ? la_ld_h(laAsm, dst, RSCRATCH, 0) : la_ld_hu(laAsm, dst, RSCRATCH, 0);
        break;

      case MemoryAccessSize::Word:
        la_ld_w(laAsm, dst, RSCRATCH, 0);
        break;
    }

    // We need a nop, because the backpatched slowmem jump needs two instructions (pcaddu18i + jirl).
    la_andi(laAsm, LA_ZERO, LA_ZERO, 0); // NOP

    AddLoadStoreInfo(start, 8, addr_reg, dst, size, sign, true);
    return dst;
  }
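
  // NOTE: AddLoadStoreInfo() records the 8-byte load + NOP window above, presumably so the
  // fault handler can backpatch it with a two-instruction jump to a slowmem handler when a
  // fastmem access faults; the NOP reserves the second instruction slot for that patch.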

  if (addr_reg != RARG1)
    la_or(laAsm, RARG1, addr_reg, LA_ZERO);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    la_srli_d(laAsm, RSCRATCH, RRET, 63);
    SwitchToFarCode(true, LaBranchCondition::NE, RSCRATCH, LA_ZERO);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const la_gpr_t temp = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));
    la_sub_d(laAsm, temp, LA_ZERO, RRET);
    la_slli_w(laAsm, temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    SafeORI(RARG1, temp,
            Cop0Registers::CAUSE::MakeValueForException(
              static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
    EmitMov(RARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp);
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const la_gpr_t dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? EmitSExtB(dst_reg, RRET) : EmitUExtB(dst_reg, RRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? EmitSExtH(dst_reg, RRET) : EmitUExtH(dst_reg, RRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      // Need to undo the zero-extend.
      if (checked)
        laEmitDSExtW(laAsm, dst_reg, RRET);
      else if (dst_reg != RRET)
        la_or(laAsm, dst_reg, RRET, LA_ZERO);
    }
    break;
  }

  return dst_reg;
}

void CPU::LoongArch64Recompiler::GenerateStore(la_gpr_t addr_reg, la_gpr_t value_reg, MemoryAccessSize size,
                                               bool use_fastmem)
{
  if (use_fastmem)
  {
    DebugAssert(value_reg != RSCRATCH);
    // Zero-extend address to 64-bit
    la_bstrpick_d(laAsm, RSCRATCH, addr_reg, 31, 0);

    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg != RARG3);
      la_srli_d(laAsm, RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT);
      la_slli_d(laAsm, RARG3, RARG3, 3);
      la_add_d(laAsm, RARG3, RARG3, RMEMBASE);
      la_ld_d(laAsm, RARG3, RARG3, 0);
      la_add_d(laAsm, RSCRATCH, RSCRATCH, RARG3);
    }
    else
    {
      la_add_d(laAsm, RSCRATCH, RSCRATCH, RMEMBASE);
    }

    u8* start = laAsm->cursor;
    switch (size)
    {
      case MemoryAccessSize::Byte:
        la_st_b(laAsm, value_reg, RSCRATCH, 0);
        break;

      case MemoryAccessSize::HalfWord:
        la_st_h(laAsm, value_reg, RSCRATCH, 0);
        break;

      case MemoryAccessSize::Word:
        la_st_w(laAsm, value_reg, RSCRATCH, 0);
        break;
    }

    // We need a nop, because the backpatched slowmem jump needs two instructions (pcaddu18i + jirl).
    la_andi(laAsm, LA_ZERO, LA_ZERO, 0); // NOP
1836
1837
    AddLoadStoreInfo(start, 8, addr_reg, value_reg, size, false, false);
    return;
  }

  if (addr_reg != RARG1)
    la_or(laAsm, RARG1, addr_reg, LA_ZERO);
  if (value_reg != RARG2)
    la_or(laAsm, RARG2, value_reg, LA_ZERO);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
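  // On failure the checked write thunk returns a nonzero Exception code in RRET;
  // shift it into the Excode position of CAUSE and raise the exception in far code.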
  if (checked)
  {
    SwitchToFarCode(true, LaBranchCondition::NE, RRET, LA_ZERO);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const la_gpr_t temp = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));
    la_slli_w(laAsm, temp, RRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    SafeORI(RARG1, temp,
            Cop0Registers::CAUSE::MakeValueForException(
              static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
    EmitMov(RARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp);
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::LoongArch64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                             const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<la_gpr_t> addr_reg =
    (g_settings.gpu_pgxp_enable && cf.MipsT() != Reg::zero) ?
      std::optional<la_gpr_t>(static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED))) :
      std::optional<la_gpr_t>();
  FlushForLoadStore(address, false, use_fastmem);
  const la_gpr_t addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const la_gpr_t data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
    if (cf.MipsT() == Reg::zero)
      return RRET;

    return static_cast<la_gpr_t>(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                                 EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
                                                 cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable && cf.MipsT() != Reg::zero)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RARG1, inst->bits);
    la_or(laAsm, RARG2, addr, LA_ZERO);
    la_or(laAsm, RARG3, data, LA_ZERO);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value());
  }
}

void CPU::LoongArch64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                             const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const la_gpr_t addr = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  // Do PGXP first, since it does its own load.
  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, inst->bits);
    la_or(laAsm, RARG2, addr, LA_ZERO);
    MoveMIPSRegToReg(RARG3, inst->r.rt, true);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
  }

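  // lwl/lwr: read the aligned word containing the target address.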
  la_or(laAsm, RARG1, addr, LA_ZERO);
  la_bstrins_d(laAsm, RARG1, LA_ZERO, 1, 0); // addr & ~3
  GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr);
    return;
  }

  // lwl/lwr reading a load-delayed value takes the new value, but is itself load-delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush.
  const Reg rt = inst->r.rt;
  la_gpr_t value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = static_cast<la_gpr_t>(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = static_cast<la_gpr_t>(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        la_or(laAsm, value, static_cast<la_gpr_t>(rtreg.value()), LA_ZERO);
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        la_ld_w(laAsm, value, RSTATE, OFFS(&g_state.regs.r[static_cast<u8>(rt)]));
    }
    else
    {
      value = static_cast<la_gpr_t>(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value != RARG2 && value != RARG3);
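  // RARG2 = (addr & 3) * 8 (the byte shift in bits), RARG3 = 24 - shift.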
  la_andi(laAsm, RARG2, addr, 3);
  la_slli_w(laAsm, RARG2, RARG2, 3); // *8
  EmitMov(RARG3, 24);
  la_sub_w(laAsm, RARG3, RARG3, RARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RRET << (24 - shift));
    EmitMov(RSCRATCH, 0xFFFFFFu);
    la_srl_w(laAsm, RSCRATCH, RSCRATCH, RARG2);
    la_and(laAsm, value, value, RSCRATCH);
    la_sll_w(laAsm, RRET, RRET, RARG3);
    la_or(laAsm, value, value, RRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RRET >> shift);
    la_srl_w(laAsm, RRET, RRET, RARG2);
    EmitMov(RSCRATCH, 0xFFFFFF00u);
    la_sll_w(laAsm, RSCRATCH, RSCRATCH, RARG3);
    la_and(laAsm, value, value, RSCRATCH);
    la_or(laAsm, value, value, RRET);
  }

  FreeHostReg(addr);
}

void CPU::LoongArch64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                              const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<la_gpr_t> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<la_gpr_t>(static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<la_gpr_t>();
  FlushForLoadStore(address, false, use_fastmem);
  const la_gpr_t addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const la_gpr_t value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      la_st_w(laAsm, value, RSTATE, OFFS(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      EmitSExtH(RARG3, value);
      la_st_w(laAsm, RARG3, RSTATE, OFFS(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      EmitUExtH(RARG3, value);
      la_st_w(laAsm, RARG3, RSTATE, OFFS(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      la_or(laAsm, RARG2, value, LA_ZERO);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value != RARG2 && value != RARG3);
      la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));
      la_ld_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));
      la_st_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY0[0]));
      la_st_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));
      la_st_w(laAsm, value, RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    la_or(laAsm, RARG3, value, LA_ZERO);
    if (value != RRET)
      FreeHostReg(value);
    la_or(laAsm, RARG2, addr, LA_ZERO);
    FreeHostReg(addr_reg.value());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

void CPU::LoongArch64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                             const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<la_gpr_t> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<la_gpr_t>(static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<la_gpr_t>();
  FlushForLoadStore(address, true, use_fastmem);
  const la_gpr_t addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const la_gpr_t data = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RARG3, cf.MipsT());
    la_or(laAsm, RARG2, addr, LA_ZERO);
    EmitMov(RARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value());
  }
}

void CPU::LoongArch64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                             const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const la_gpr_t addr = static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED));

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, inst->bits);
    la_or(laAsm, RARG2, addr, LA_ZERO);
    MoveMIPSRegToReg(RARG3, inst->r.rt);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
  }

  la_or(laAsm, RARG1, addr, LA_ZERO);
  la_bstrins_d(laAsm, RARG1, LA_ZERO, 1, 0); // addr & ~3
  GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });

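  // swl/swr: merge the unaligned bytes of rt into the aligned word just read, then store it back.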
  la_andi(laAsm, RSCRATCH, addr, 3);
  la_slli_w(laAsm, RSCRATCH, RSCRATCH, 3); // *8
  la_bstrins_d(laAsm, addr, LA_ZERO, 1, 0); // addr & ~3

  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
  if (!g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(RARG2, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RARG3, 0xFFFFFF00u);
    la_sll_w(laAsm, RARG3, RARG3, RSCRATCH);
    la_and(laAsm, RRET, RRET, RARG3);

    EmitMov(RARG3, 24);
    la_sub_w(laAsm, RARG3, RARG3, RSCRATCH);
    la_srl_w(laAsm, RARG2, RARG2, RARG3);
    la_or(laAsm, RARG2, RARG2, RRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RRET & mem_mask) | (value << shift);
    la_sll_w(laAsm, RARG2, RARG2, RSCRATCH);

    EmitMov(RARG3, 24);
    la_sub_w(laAsm, RARG3, RARG3, RSCRATCH);
    EmitMov(RSCRATCH, 0x00FFFFFFu);
    la_srl_w(laAsm, RSCRATCH, RSCRATCH, RARG3);
    la_and(laAsm, RRET, RRET, RSCRATCH);
    la_or(laAsm, RARG2, RARG2, RRET);
  }

  GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
  FreeHostReg(addr);
}

void CPU::LoongArch64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                              const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const la_gpr_t addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RARG1;
  const la_gpr_t data =
    g_settings.gpu_pgxp_enable ? static_cast<la_gpr_t>(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      la_ld_w(laAsm, data, RSTATE, OFFS(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      la_or(laAsm, data, RRET, LA_ZERO);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);

  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr != RARG1)
      FreeHostReg(addr);
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    la_or(laAsm, RARG3, data, LA_ZERO);
    FreeHostReg(data);
    la_or(laAsm, RARG2, addr, LA_ZERO);
    FreeHostReg(addr);
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

void CPU::LoongArch64Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const la_gpr_t new_value = RARG1;
  const la_gpr_t old_value = RARG2;
  const la_gpr_t changed_bits = RARG3;
  const la_gpr_t mask_reg = RSCRATCH;

  // Load old value
  la_ld_w(laAsm, old_value, RSTATE, OFFS(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    la_and(laAsm, new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    la_xor(laAsm, changed_bits, old_value, new_value);
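  // mask_reg = ~mask (NOR with zero); merged value = (old_value & ~mask) | new_value.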
  la_nor(laAsm, mask_reg, mask_reg, LA_ZERO);
  la_and(laAsm, old_value, old_value, mask_reg);
  la_or(laAsm, new_value, old_value, new_value);
  la_st_w(laAsm, new_value, RSTATE, OFFS(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    lagoon_label_t caches_unchanged = {};
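    // Only the Isc (cache isolation) bit, bit 16, affects the memory map, so the
    // pointer update can be skipped when it didn't change.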
    la_srli_w(laAsm, RSCRATCH, changed_bits, 16);
    la_andi(laAsm, RSCRATCH, RSCRATCH, 1);
    la_beq(laAsm, RSCRATCH, LA_ZERO, la_label(laAsm, &caches_unchanged));
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    la_ld_w(laAsm, new_value, RSTATE, OFFS(ptr));
    if (CodeCache::IsUsingFastmem())
      la_ld_d(laAsm, RMEMBASE, RSTATE, OFFS(&g_state.fastmem_base));
    la_bind(laAsm, &caches_unchanged);
    la_label_free(laAsm, &caches_unchanged);

    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCode(true, LaBranchCondition::NE, RRET, LA_ZERO);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::LoongArch64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
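  // (this pops the KU/IE mode stack: previous -> current, old -> previous, old bits kept)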
  la_ld_w(laAsm, RARG1, RSTATE, OFFS(&g_state.cop0_regs.sr.bits));
  la_srli_w(laAsm, RSCRATCH, RARG1, 2);
  la_andi(laAsm, RSCRATCH, RSCRATCH, 0xf);
  la_bstrins_d(laAsm, RARG1, LA_ZERO, 3, 0);
  la_or(laAsm, RARG1, RARG1, RSCRATCH);
  la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.cop0_regs.sr.bits));

  TestInterrupts(RARG1);
}

void CPU::LoongArch64Recompiler::TestInterrupts(la_gpr_t sr)
{
  DebugAssert(sr != RSCRATCH);

  // if Iec == 0 then goto no_interrupt
  lagoon_label_t no_interrupt = {};
  la_andi(laAsm, RSCRATCH, sr, 1);
  la_beqz(laAsm, RSCRATCH, la_label(laAsm, &no_interrupt));

  // sr & cause
  la_ld_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.cop0_regs.cause.bits));
  la_and(laAsm, sr, sr, RSCRATCH);

  // if ((sr & cause) & 0xff00) == 0 then goto no_interrupt
  la_srli_w(laAsm, sr, sr, 8);
  la_andi(laAsm, sr, sr, 0xFF);
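  // Bits 8..15 are the interrupt pending (CAUSE.IP) and mask (SR.IM) fields; any
  // overlap means an interrupt is due.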
  SwitchToFarCode(true, LaBranchCondition::NE, sr, LA_ZERO);

  BackupHostState();

  // Update the load delay; this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                               (inst + 1)->cop.cop_n));
    EmitMov(RARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      EmitMov(RARG1, m_compiler_pc);
    la_st_w(laAsm, LA_ZERO, RSTATE, OFFS(&g_state.downcount));
    if (m_dirty_pc)
      la_st_w(laAsm, RARG1, RSTATE, OFFS(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  la_bind(laAsm, &no_interrupt);
  la_label_free(laAsm, &no_interrupt);
}

void CPU::LoongArch64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    la_ld_w(laAsm, static_cast<la_gpr_t>(hreg), RSTATE, OFFS(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    la_or(laAsm, static_cast<la_gpr_t>(hreg), RRET, LA_ZERO);
  }
  else
  {
    Panic("Unknown action");
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, inst->bits);
    la_or(laAsm, RARG2, static_cast<la_gpr_t>(hreg), LA_ZERO);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::LoongArch64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      la_st_w(laAsm, CFGetRegT(cf), RSTATE, OFFS(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? EmitSExtH(RARG1, CFGetRegT(cf)) : EmitUExtH(RARG1, CFGetRegT(cf));
      la_st_w(laAsm, RARG1, RSTATE, OFFS(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    MoveTToReg(RARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RRET != RARG2 && RRET != RARG3);
    la_ld_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));
    la_ld_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));
    la_st_w(laAsm, RARG2, RSTATE, OFFS(&g_state.gte_regs.SXY0[0]));
    la_st_w(laAsm, RARG3, RSTATE, OFFS(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      la_st_w(laAsm, CFGetRegT(cf), RSTATE, OFFS(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::LoongArch64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                           bool is_load)
{
  lagoon_assembler_t la_asm;
  lagoon_assembler_t* laAsm = &la_asm;
  la_init_assembler(laAsm, static_cast<u8*>(thunk_code), thunk_space);

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

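  // Count the caller-saved GPRs live across the access; a load's destination register
  // is skipped, since it's overwritten with the result anyway.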
  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && laIsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

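  // Round up to an even number of 8-byte slots so SP stays 16-byte aligned, as the
  // LoongArch64 ABI requires.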
  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);

  if (stack_size > 0)
  {
    la_addi_d(laAsm, LA_SP, LA_SP, -static_cast<s32>(stack_size));

    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && laIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        la_st_d(laAsm, static_cast<la_gpr_t>(i), LA_SP, stack_offset);
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    Assert(laIsValidSImm12(cycles_to_add));
    la_ld_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
    la_addi_w(laAsm, RSCRATCH, RSCRATCH, cycles_to_add);
    la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
  }

  if (address_register != RARG1)
    la_or(laAsm, RARG1, static_cast<la_gpr_t>(address_register), LA_ZERO);

  if (!is_load)
  {
    if (data_register != RARG2)
      la_or(laAsm, RARG2, static_cast<la_gpr_t>(data_register), LA_ZERO);
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      laEmitCall(laAsm, is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte) :
                                  reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      laEmitCall(laAsm, is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord) :
                                  reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      laEmitCall(laAsm, is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord) :
                                  reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  if (is_load)
  {
    const la_gpr_t dst = static_cast<la_gpr_t>(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? laEmitSExtB(laAsm, dst, RRET) : laEmitUExtB(laAsm, dst, RRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? laEmitSExtH(laAsm, dst, RRET) : laEmitUExtH(laAsm, dst, RRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst != RRET)
          la_or(laAsm, dst, RRET, LA_ZERO);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    Assert(laIsValidSImm12(-cycles_to_remove));
    la_ld_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
    la_addi_w(laAsm, RSCRATCH, RSCRATCH, -cycles_to_remove);
    la_st_w(laAsm, RSCRATCH, RSTATE, OFFS(&g_state.pending_ticks));
  }

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && laIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        la_ld_d(laAsm, static_cast<la_gpr_t>(i), LA_SP, stack_offset);
        stack_offset += GPR_SIZE;
      }
    }

    la_addi_d(laAsm, LA_SP, LA_SP, stack_size);
  }

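  // Return to the caller's code, immediately after the patched fastmem access.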
  laEmitJmp(laAsm, static_cast<const u8*>(code_address) + code_size);

  return static_cast<u32>(laAsm->cursor - laAsm->buffer);
}

#endif // CPU_ARCH_LOONGARCH64