GitHub Repository: stenzek/duckstation
Path: blob/master/src/core/cpu_recompiler_arm64.cpp
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "cpu_recompiler_arm64.h"
5
#include "cpu_core_private.h"
6
#include "cpu_pgxp.h"
7
#include "gte.h"
8
#include "settings.h"
9
#include "timing_event.h"
10
11
#include "common/align.h"
12
#include "common/assert.h"
13
#include "common/log.h"
14
#include "common/memmap.h"
15
#include "common/string_util.h"
16
17
#include <limits>
18
19
#ifdef CPU_ARCH_ARM64
20
21
#include "vixl/aarch64/constants-aarch64.h"
22
23
#ifdef ENABLE_HOST_DISASSEMBLY
24
#include "vixl/aarch64/disasm-aarch64.h"
25
#endif
26
27
LOG_CHANNEL(Recompiler);
28
29
#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
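// PTR(x) converts the address of a g_state member into an RSTATE-relative MemOperand,
// so CPU state accesses compile to a single ldr/str with an immediate offset from x19.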
30
31
#define RWRET vixl::aarch64::w0
32
#define RXRET vixl::aarch64::x0
33
#define RWARG1 vixl::aarch64::w0
34
#define RXARG1 vixl::aarch64::x0
35
#define RWARG2 vixl::aarch64::w1
36
#define RXARG2 vixl::aarch64::x1
37
#define RWARG3 vixl::aarch64::w2
38
#define RXARG3 vixl::aarch64::x2
39
#define RWSCRATCH vixl::aarch64::w16
40
#define RXSCRATCH vixl::aarch64::x16
41
#define RSTATE vixl::aarch64::x19
42
#define RMEMBASE vixl::aarch64::x20
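// The fixed assignments above follow AAPCS64: w0/x0 doubles as return value and first
// argument, x16 (IP0) serves as a scratch register, and x19/x20 are callee-saved so the
// state pointer and fastmem base survive calls out to C++ helpers.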
43
44
static bool armIsCallerSavedRegister(u32 id);
45
static s64 armGetPCDisplacement(const void* current, const void* target);
46
static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
47
static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
48
static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
49
static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
50
static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
51
static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
52
static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
53
bool sign_extend_word = false);
54
static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
55
const vixl::aarch64::Register& tempreg = RXSCRATCH);
56
static u8* armGetJumpTrampoline(const void* target);
57
static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);
58
59
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
60
static std::unordered_map<const void*, u32> s_trampoline_targets;
61
static u8* s_trampoline_start_ptr = nullptr;
62
static u32 s_trampoline_used = 0;
63
64
namespace CPU {
65
66
using namespace vixl::aarch64;
67
68
static ARM64Recompiler s_instance;
69
Recompiler* g_compiler = &s_instance;
70
71
} // namespace CPU
72
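// AAPCS64 treats x0-x17 as argument/result/temporary registers; x18 is the platform
// register and is never allocated by this recompiler (see Reset()), so a simple
// id <= 18 test is sufficient for "caller-saved".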
73
bool armIsCallerSavedRegister(u32 id)
74
{
75
// same on both linux and windows
76
return (id <= 18);
77
}
78
79
void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm)
80
{
81
// From vixl macro assembler.
82
DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());
83
DebugAssert(rd.GetCode() != vixl::aarch64::sp.GetCode());
84
85
if (imm == 0)
86
{
87
armAsm->mov(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd));
88
return;
89
}
90
91
// The worst case for size is mov 64-bit immediate to sp:
92
// * up to 4 instructions to materialise the constant
93
// * 1 instruction to move to sp
94
95
// Immediates on Aarch64 can be produced using an initial value, and zero to
96
// three move keep operations.
97
//
98
// Initial values can be generated with:
99
// 1. 64-bit move zero (movz).
100
// 2. 32-bit move inverted (movn).
101
// 3. 64-bit move inverted.
102
// 4. 32-bit orr immediate.
103
// 5. 64-bit orr immediate.
104
// Move-keep may then be used to modify each of the 16-bit half words.
105
//
106
// The code below supports all five initial value generators, and
107
// applying move-keep operations to move-zero and move-inverted initial
108
// values.
109
110
// Try to move the immediate in one instruction, and if that fails, switch to
111
// using multiple instructions.
112
const unsigned reg_size = rd.GetSizeInBits();
113
114
if (vixl::aarch64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())
115
{
116
// Immediate can be represented in a move zero instruction. Movz can't write
117
// to the stack pointer.
118
armAsm->movz(rd, imm);
119
return;
120
}
121
else if (vixl::aarch64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())
122
{
123
// Immediate can be represented in a move negative instruction. Movn can't
124
// write to the stack pointer.
125
armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & vixl::aarch64::kWRegMask));
126
return;
127
}
128
else if (vixl::aarch64::Assembler::IsImmLogical(imm, reg_size))
129
{
130
// Immediate can be represented in a logical orr instruction.
131
DebugAssert(!rd.IsZero());
132
armAsm->orr(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd), imm);
133
return;
134
}
135
136
// Generic immediate case. Imm will be represented by
137
// [imm3, imm2, imm1, imm0], where each imm is 16 bits.
138
// A move-zero or move-inverted is generated for the first non-zero or
139
// non-0xffff immX, and a move-keep for subsequent non-zero immX.
140
141
uint64_t ignored_halfword = 0;
142
bool invert_move = false;
143
// If the number of 0xffff halfwords is greater than the number of 0x0000
144
// halfwords, it's more efficient to use move-inverted.
145
if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))
146
{
147
ignored_halfword = 0xffff;
148
invert_move = true;
149
}
150
151
// Iterate through the halfwords. Use movn/movz for the first non-ignored
152
// halfword, and movk for subsequent halfwords.
153
DebugAssert((reg_size % 16) == 0);
154
bool first_mov_done = false;
155
for (unsigned i = 0; i < (reg_size / 16); i++)
156
{
157
uint64_t imm16 = (imm >> (16 * i)) & 0xffff;
158
if (imm16 != ignored_halfword)
159
{
160
if (!first_mov_done)
161
{
162
if (invert_move)
163
armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);
164
else
165
armAsm->movz(rd, imm16, 16 * i);
166
first_mov_done = true;
167
}
168
else
169
{
170
// Construct a wider constant.
171
armAsm->movk(rd, imm16, 16 * i);
172
}
173
}
174
}
175
176
DebugAssert(first_mov_done);
177
}
178
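// AArch64 branch offsets are encoded in 4-byte instruction units, so the byte distance
// between two code addresses is shifted right by two before being range-checked or
// emitted.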
179
s64 armGetPCDisplacement(const void* current, const void* target)
180
{
181
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
182
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
183
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
184
}
185
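// ADRP materialises a 4 KiB-aligned page address from a signed 21-bit page displacement
// (roughly +/-4 GiB); the low 12 bits of the target are applied separately with an add
// or orr, which is why both the page distance and the page offset are checked here.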
186
bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr)
187
{
188
const void* cur = armAsm->GetCursorAddress<const void*>();
189
const void* current_code_ptr_page =
190
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
191
const void* ptr_page =
192
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
193
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
194
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
195
196
return (vixl::IsInt21(page_displacement) && (vixl::aarch64::Assembler::IsImmAddSub(page_offset) ||
197
vixl::aarch64::Assembler::IsImmLogical(page_offset, 64)));
198
}
199
200
void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr)
201
{
202
DebugAssert(reg.IsX());
203
204
const void* cur = armAsm->GetCursorAddress<const void*>();
205
const void* current_code_ptr_page =
206
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
207
const void* ptr_page =
208
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
209
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
210
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
211
if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmAddSub(page_offset))
212
{
213
armAsm->adrp(reg, page_displacement);
214
armAsm->add(reg, reg, page_offset);
215
}
216
else if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmLogical(page_offset, 64))
217
{
218
armAsm->adrp(reg, page_displacement);
219
armAsm->orr(reg, reg, page_offset);
220
}
221
else
222
{
223
armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));
224
}
225
}
226
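// Direct B/BL instructions reach +/-128 MiB (signed 26-bit instruction offset). Targets
// beyond that are reached through a nearby trampoline (see armGetJumpTrampoline) or, as
// a last resort, by loading the full address into RXSCRATCH and using BR/BLR.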
227
void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
228
{
229
const void* cur = armAsm->GetCursorAddress<const void*>();
230
s64 displacement = armGetPCDisplacement(cur, ptr);
231
bool use_blr = !vixl::IsInt26(displacement);
232
bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
233
if (use_blr && use_trampoline && !force_inline)
234
{
235
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
236
{
237
displacement = armGetPCDisplacement(cur, trampoline);
238
use_blr = !vixl::IsInt26(displacement);
239
}
240
}
241
242
if (use_blr)
243
{
244
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
245
armAsm->br(RXSCRATCH);
246
}
247
else
248
{
249
armAsm->b(displacement);
250
}
251
}
252
253
void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
254
{
255
const void* cur = armAsm->GetCursorAddress<const void*>();
256
s64 displacement = armGetPCDisplacement(cur, ptr);
257
bool use_blr = !vixl::IsInt26(displacement);
258
bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
259
if (use_blr && use_trampoline && !force_inline)
260
{
261
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
262
{
263
displacement = armGetPCDisplacement(cur, trampoline);
264
use_blr = !vixl::IsInt26(displacement);
265
}
266
}
267
268
if (use_blr)
269
{
270
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
271
armAsm->blr(RXSCRATCH);
272
}
273
else
274
{
275
armAsm->bl(displacement);
276
}
277
}
278
279
void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr)
280
{
281
const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
282
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
283
// pxAssert(Common::IsAligned(jump_distance, 4));
284
285
if (vixl::aarch64::Instruction::IsValidImmPCOffset(vixl::aarch64::CondBranchType, jump_distance >> 2))
286
{
287
armAsm->b(jump_distance >> 2, cond);
288
}
289
else
290
{
291
vixl::aarch64::Label branch_not_taken;
292
armAsm->b(&branch_not_taken, InvertCondition(cond));
293
294
const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
295
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
296
armAsm->b(new_jump_distance >> 2);
297
armAsm->bind(&branch_not_taken);
298
}
299
}
300
301
void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
302
bool sign_extend_word)
303
{
304
const void* cur = armAsm->GetCursorAddress<const void*>();
305
const void* current_code_ptr_page =
306
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
307
const void* ptr_page =
308
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
309
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
310
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
311
vixl::aarch64::MemOperand memop;
312
313
const vixl::aarch64::Register xreg = reg.X();
314
if (vixl::IsInt21(page_displacement))
315
{
316
armAsm->adrp(xreg, page_displacement);
317
memop = vixl::aarch64::MemOperand(xreg, static_cast<int64_t>(page_offset));
318
}
319
else
320
{
321
armMoveAddressToReg(armAsm, xreg, addr);
322
memop = vixl::aarch64::MemOperand(xreg);
323
}
324
325
if (sign_extend_word)
326
armAsm->ldrsw(reg, memop);
327
else
328
armAsm->ldr(reg, memop);
329
}
330
331
[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
332
const void* addr, const vixl::aarch64::Register& tempreg)
333
{
334
DebugAssert(tempreg.IsX());
335
336
const void* cur = armAsm->GetCursorAddress<const void*>();
337
const void* current_code_ptr_page =
338
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
339
const void* ptr_page =
340
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
341
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
342
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
343
344
if (vixl::IsInt21(page_displacement))
345
{
346
armAsm->adrp(tempreg, page_displacement);
347
armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast<int64_t>(page_offset)));
348
}
349
else
350
{
351
armMoveAddressToReg(armAsm, tempreg, addr);
352
armAsm->str(reg, vixl::aarch64::MemOperand(tempreg));
353
}
354
}
355
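// Returns a stub in the trampoline area reserved after the ASM dispatcher code (see
// EmitASMFunctions). Each stub loads the target address and branches via RXSCRATCH so
// far-away targets can still be reached with a single near branch; stubs are
// de-duplicated through s_trampoline_targets.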
356
u8* armGetJumpTrampoline(const void* target)
357
{
358
auto it = s_trampoline_targets.find(target);
359
if (it != s_trampoline_targets.end())
360
return s_trampoline_start_ptr + it->second;
361
362
// align to 16 bytes?
363
const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);
364
365
// 4 movs plus a jump
366
if (TRAMPOLINE_AREA_SIZE - offset < 20)
367
{
368
Panic("Ran out of space in constant pool");
369
return nullptr;
370
}
371
372
u8* start = s_trampoline_start_ptr + offset;
373
vixl::aarch64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
374
#ifdef VIXL_DEBUG
375
vixl::CodeBufferCheckScope armAsmCheck(&armAsm, TRAMPOLINE_AREA_SIZE - offset,
376
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
377
#endif
378
armMoveAddressToReg(&armAsm, RXSCRATCH, target);
379
armAsm.br(RXSCRATCH);
380
armAsm.FinalizeCode();
381
382
const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
383
DebugAssert(size < 20);
384
s_trampoline_targets.emplace(target, offset);
385
s_trampoline_used = offset + static_cast<u32>(size);
386
387
MemMap::FlushInstructionCache(start, size);
388
return start;
389
}
390
391
void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
392
{
393
size_t addr = armAsm->GetCursorAddress<size_t>();
394
const size_t end_addr = Common::AlignUpPow2(addr, alignment);
395
while (addr != end_addr)
396
{
397
armAsm->nop();
398
addr += vixl::aarch64::kInstructionSize;
399
}
400
}
401
402
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
403
{
404
#ifdef ENABLE_HOST_DISASSEMBLY
405
class MyDisassembler : public vixl::aarch64::Disassembler
406
{
407
protected:
408
void ProcessOutput(const vixl::aarch64::Instruction* instr) override
409
{
410
DEBUG_LOG("0x{:016X} {:08X}\t\t{}", reinterpret_cast<uint64_t>(instr), instr->GetInstructionBits(), GetOutput());
411
}
412
};
413
414
vixl::aarch64::Decoder decoder;
415
MyDisassembler disas;
416
decoder.AppendVisitor(&disas);
417
decoder.Decode(static_cast<const vixl::aarch64::Instruction*>(start),
418
reinterpret_cast<const vixl::aarch64::Instruction*>(static_cast<const u8*>(start) + size));
419
#else
420
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
421
#endif
422
}
423
424
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
425
{
426
return size / vixl::aarch64::kInstructionSize;
427
}
428
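// Patches a single unconditional B at 'code' so it jumps to 'dst', optionally flushing
// the instruction cache afterwards; the displacement must fit the signed 26-bit branch
// encoding.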
429
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
430
{
431
using namespace vixl::aarch64;
432
433
const s64 disp = armGetPCDisplacement(code, dst);
434
DebugAssert(vixl::IsInt26(disp));
435
436
const u32 new_code = B | Assembler::ImmUncondBranch(disp);
437
std::memcpy(code, &new_code, sizeof(new_code));
438
if (flush_icache)
439
MemMap::FlushInstructionCache(code, kInstructionSize);
440
441
return kInstructionSize;
442
}
443
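// Emits the fixed assembly stubs used by the code cache: the recompiler entry point,
// the run-events + dispatch path, the block dispatcher, the compile/revalidate and
// discard/recompile thunks, and the uncached-interpreter fallback. The trampoline area
// is reserved immediately after the emitted code.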
444
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
445
{
446
using namespace vixl::aarch64;
447
448
Assembler actual_asm(static_cast<u8*>(code), code_size);
449
Assembler* RESTRICT armAsm = &actual_asm;
450
451
#ifdef VIXL_DEBUG
452
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
453
#endif
454
455
Label dispatch;
456
Label run_events_and_dispatch;
457
458
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
459
{
460
#ifdef _WIN32
461
// Frame pointer setup is needed on Windows
462
armAsm->stp(x29, x30, MemOperand(sp, -16, PreIndex));
463
armAsm->mov(x29, sp);
464
#endif
465
466
// Need the CPU state for basically everything :-)
467
armMoveAddressToReg(armAsm, RSTATE, &g_state);
468
469
// Fastmem setup, oldrec doesn't need it
470
if (IsUsingFastmem())
471
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
472
473
// Fall through to event dispatcher
474
}
475
476
// check events then for frame done
477
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
478
{
479
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
480
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
481
armAsm->cmp(RWARG1, RWARG2);
482
armAsm->b(&dispatch, lt);
483
484
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
485
armAsm->bind(&run_events_and_dispatch);
486
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
487
}
488
489
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
490
g_dispatcher = armAsm->GetCursorAddress<const void*>();
491
{
492
armAsm->bind(&dispatch);
493
494
// RXARG2 <- g_code_lut[pc >> 16] (pointer-table page for this PC)
495
armAsm->ldr(RWARG1, PTR(&g_state.pc));
496
armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
497
armAsm->lsr(RWARG2, RWARG1, 16);
498
armAsm->ubfx(RWARG1, RWARG1, 2, 14);
499
armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));
500
501
// Branch to page[(pc & 0xFFFC) >> 2], the compiled-code pointer for this PC.
502
armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));
503
armAsm->br(RXARG1);
504
}
505
506
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
507
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
508
{
509
armAsm->ldr(RWARG1, PTR(&g_state.pc));
510
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
511
armAsm->b(&dispatch);
512
}
513
514
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
515
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
516
{
517
armAsm->ldr(RWARG1, PTR(&g_state.pc));
518
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
519
armAsm->b(&dispatch);
520
}
521
522
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
523
g_interpret_block = armAsm->GetCursorAddress<const void*>();
524
{
525
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
526
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
527
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
528
armAsm->cmp(RWARG1, RWARG2);
529
armAsm->b(&run_events_and_dispatch, ge);
530
armAsm->b(&dispatch);
531
}
532
533
armAsm->FinalizeCode();
534
535
s_trampoline_targets.clear();
536
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
537
s_trampoline_used = 0;
538
539
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
540
}
541
542
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
543
{
544
constexpr u8 padding_value = 0x00;
545
std::memset(dst, padding_value, size);
546
}
547
548
CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
549
{
550
}
551
552
CPU::ARM64Recompiler::~ARM64Recompiler() = default;
553
554
const void* CPU::ARM64Recompiler::GetCurrentCodePointer()
555
{
556
return armAsm->GetCursorAddress<const void*>();
557
}
558
559
void CPU::ARM64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
560
u32 far_code_space)
561
{
562
Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
563
564
// TODO: don't recreate this every time..
565
DebugAssert(!armAsm);
566
m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
567
m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
568
armAsm = &m_emitter;
569
570
#ifdef VIXL_DEBUG
571
m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,
572
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
573
m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
574
&m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
575
#endif
576
577
// Need to wipe it out so it's correct when toggling fastmem.
578
m_host_regs = {};
579
580
// Frame pointer must be valid on Windows.
581
#ifdef _WIN32
582
constexpr u32 max_reg_idx = 28;
583
#else
584
constexpr u32 max_reg_idx = 29;
585
#endif
586
587
const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
588
for (u32 i = 0; i < NUM_HOST_REGS; i++)
589
{
590
HostRegAlloc& ra = m_host_regs[i];
591
592
if (i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||
593
i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i > max_reg_idx)
594
{
595
continue;
596
}
597
598
ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
599
}
600
}
601
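// Far code holds cold paths (e.g. the exception cases emitted by CheckBranchTarget and
// TestOverflow) so the hot path stays compact. These helpers branch from near to far
// code, falling back to an inverted-condition skip over an unconditional branch when
// the displacement exceeds the short encodings (19 bits for b.cond/cbz, 14 for tbz).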
602
void CPU::ARM64Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
603
{
604
DebugAssert(armAsm == &m_emitter);
605
if (emit_jump)
606
{
607
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
608
if (cond != Condition::al)
609
{
610
if (vixl::IsInt19(disp))
611
{
612
armAsm->b(disp, cond);
613
}
614
else
615
{
616
Label skip;
617
armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
618
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
619
armAsm->bind(&skip);
620
}
621
}
622
else
623
{
624
armAsm->b(disp);
625
}
626
}
627
armAsm = &m_far_emitter;
628
}
629
630
void CPU::ARM64Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
631
{
632
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
633
if (vixl::IsInt14(disp))
634
{
635
armAsm->tbnz(reg, bit, disp);
636
}
637
else
638
{
639
Label skip;
640
armAsm->tbz(reg, bit, &skip);
641
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
642
armAsm->bind(&skip);
643
}
644
645
armAsm = &m_far_emitter;
646
}
647
648
void CPU::ARM64Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
649
{
650
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
651
if (vixl::IsInt19(disp))
652
{
653
nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
654
}
655
else
656
{
657
Label skip;
658
nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
659
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
660
armAsm->bind(&skip);
661
}
662
663
armAsm = &m_far_emitter;
664
}
665
666
void CPU::ARM64Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
667
{
668
DebugAssert(armAsm == &m_far_emitter);
669
if (emit_jump)
670
{
671
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
672
(cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
673
}
674
armAsm = &m_emitter;
675
}
676
677
void CPU::ARM64Recompiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)
678
{
679
armEmitMov(armAsm, dst, val);
680
}
681
682
void CPU::ARM64Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
683
{
684
armEmitCall(armAsm, ptr, force_inline);
685
}
686
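// The armCheck*Constant helpers return the value as an immediate operand when it fits
// the relevant encoding (add/sub immediate, conditional-compare immediate, or bitmask
// logical immediate), and otherwise materialise it in RWSCRATCH first.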
687
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(s32 val)
688
{
689
if (Assembler::IsImmAddSub(val))
690
return vixl::aarch64::Operand(static_cast<int64_t>(val));
691
692
EmitMov(RWSCRATCH, static_cast<u32>(val));
693
return vixl::aarch64::Operand(RWSCRATCH);
694
}
695
696
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(u32 val)
697
{
698
return armCheckAddSubConstant(static_cast<s32>(val));
699
}
700
701
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckCompareConstant(s32 val)
702
{
703
if (Assembler::IsImmConditionalCompare(val))
704
return vixl::aarch64::Operand(static_cast<int64_t>(val));
705
706
EmitMov(RWSCRATCH, static_cast<u32>(val));
707
return vixl::aarch64::Operand(RWSCRATCH);
708
}
709
710
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckLogicalConstant(u32 val)
711
{
712
if (Assembler::IsImmLogical(val, 32))
713
return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));
714
715
EmitMov(RWSCRATCH, val);
716
return vixl::aarch64::Operand(RWSCRATCH);
717
}
718
719
void CPU::ARM64Recompiler::BeginBlock()
720
{
721
Recompiler::BeginBlock();
722
}
723
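// Verifies that the RAM backing this block still matches its shadow copy: 16 bytes at
// a time with NEON (cmeq results are AND-accumulated into v0, then uminv detects any
// zeroed lane), with 8- and 4-byte scalar compares for the tail. Any mismatch branches
// to the discard-and-recompile handler.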
724
void CPU::ARM64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
725
{
726
// Load the base addresses up front so the compares below can use register+offset addressing, keeping the code small.
727
armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
728
armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);
729
730
bool first = true;
731
u32 offset = 0;
732
Label block_changed;
733
734
while (size >= 16)
735
{
736
const VRegister vtmp = v2.V4S();
737
const VRegister dst = first ? v0.V4S() : v1.V4S();
738
armAsm->ldr(dst, MemOperand(RXARG1, offset));
739
armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
740
armAsm->cmeq(dst, dst, vtmp);
741
if (!first)
742
armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());
743
else
744
first = false;
745
746
offset += 16;
747
size -= 16;
748
}
749
750
if (!first)
751
{
752
// TODO: make sure this doesn't choke on ffffffff
753
armAsm->uminv(s0, v0.V4S());
754
armAsm->fcmp(s0, 0.0);
755
armAsm->b(&block_changed, eq);
756
}
757
758
while (size >= 8)
759
{
760
armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
761
armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
762
armAsm->cmp(RXARG3, RXSCRATCH);
763
armAsm->b(&block_changed, ne);
764
offset += 8;
765
size -= 8;
766
}
767
768
while (size >= 4)
769
{
770
armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
771
armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
772
armAsm->cmp(RWARG3, RWSCRATCH);
773
armAsm->b(&block_changed, ne);
774
offset += 4;
775
size -= 4;
776
}
777
778
DebugAssert(size == 0);
779
780
Label block_unchanged;
781
armAsm->b(&block_unchanged);
782
armAsm->bind(&block_changed);
783
armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
784
armAsm->bind(&block_unchanged);
785
}
786
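// Accounts for instruction fetch time. Blocks not running from the I-cache simply add
// their fetch ticks to pending_ticks; I-cached blocks compare and update the tag of
// every cache line they span, adding the fill penalty only for lines that miss (csel
// keeps this branch-free).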
787
void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
788
{
789
if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
790
{
791
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
792
{
793
armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());
794
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
795
armEmitMov(armAsm, RWARG3, m_block->size);
796
armAsm->mul(RWARG2, RWARG2, RWARG3);
797
armAsm->add(RWARG1, RWARG1, RWARG2);
798
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
799
}
800
else
801
{
802
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
803
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
804
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
805
}
806
}
807
else if (m_block->icache_line_count > 0)
808
{
809
const auto& ticks_reg = RWARG1;
810
const auto& current_tag_reg = RWARG2;
811
const auto& existing_tag_reg = RWARG3;
812
const auto& fill_ticks_reg = w4;
813
const auto& ticks_to_add_reg = w5;
814
815
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
816
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
817
if (fill_ticks <= 0)
818
return;
819
820
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
821
armEmitMov(armAsm, current_tag_reg, current_pc);
822
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
823
824
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
825
{
826
const u32 line = GetICacheLine(current_pc);
827
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
828
829
Label cache_hit;
830
armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
831
armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
832
armAsm->cmp(existing_tag_reg, current_tag_reg);
833
armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
834
armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);
835
836
if (i != (m_block->icache_line_count - 1))
837
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
838
}
839
840
armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
841
}
842
}
843
844
void CPU::ARM64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
845
s32 arg3reg /*= -1*/)
846
{
847
if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
848
armAsm->mov(RXARG1, XRegister(arg1reg));
849
if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
850
armAsm->mov(RXARG2, XRegister(arg2reg));
851
if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
852
armAsm->mov(RXARG3, XRegister(arg3reg));
853
EmitCall(func);
854
}
855
856
void CPU::ARM64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
857
{
858
if (newpc.has_value())
859
{
860
if (m_dirty_pc || m_compiler_pc != newpc)
861
{
862
EmitMov(RWSCRATCH, newpc.value());
863
armAsm->str(RWSCRATCH, PTR(&g_state.pc));
864
}
865
}
866
m_dirty_pc = false;
867
868
// flush regs
869
Flush(FLUSH_END_BLOCK);
870
EndAndLinkBlock(newpc, do_event_test, false);
871
}
872
873
void CPU::ARM64Recompiler::EndBlockWithException(Exception excode)
874
{
875
// flush regs, but not pc, it's going to get overwritten
876
// flush cycles because of the GTE instruction stuff...
877
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
878
879
// TODO: flush load delay
880
881
EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
882
inst->cop.cop_n));
883
EmitMov(RWARG2, m_current_instruction_pc);
884
if (excode != Exception::BP)
885
{
886
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
887
}
888
else
889
{
890
EmitMov(RWARG3, inst->bits);
891
EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
892
}
893
m_dirty_pc = false;
894
895
EndAndLinkBlock(std::nullopt, true, false);
896
}
897
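// Closes the block: adds the accumulated cycles to pending_ticks (updating
// gte_completion_tick if a GTE operation is still outstanding), optionally compares
// against downcount and branches to the run-events path, then jumps to the dispatcher,
// the run-events path, or a direct link to the next block.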
898
void CPU::ARM64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
899
{
900
// event test
901
// pc should've been flushed
902
DebugAssert(!m_dirty_pc && !m_block_ended);
903
m_block_ended = true;
904
905
// TODO: try extracting this to a function
906
907
// save cycles for event test
908
const TickCount cycles = std::exchange(m_cycles, 0);
909
910
// pending_ticks += cycles
911
// if (pending_ticks >= downcount) { dispatch_event(); }
912
if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
913
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
914
if (do_event_test)
915
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
916
if (cycles > 0)
917
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
918
if (m_gte_done_cycle > cycles)
919
{
920
armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
921
armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
922
}
923
if (do_event_test)
924
armAsm->cmp(RWARG1, RWARG2);
925
if (cycles > 0)
926
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
927
if (do_event_test)
928
armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
929
930
// jump to dispatcher or next block
931
if (force_run_events)
932
{
933
armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
934
}
935
else if (!newpc.has_value())
936
{
937
armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
938
}
939
else
940
{
941
const void* target = (newpc.value() == m_block->pc) ?
942
CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
943
armAsm->GetBuffer()->GetStartAddress<const void*>()) :
944
CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
945
armEmitJmp(armAsm, target, true);
946
}
947
}
948
949
const void* CPU::ARM64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
950
{
951
#ifdef VIXL_DEBUG
952
m_emitter_check.reset();
953
m_far_emitter_check.reset();
954
#endif
955
956
m_emitter.FinalizeCode();
957
m_far_emitter.FinalizeCode();
958
959
u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
960
*code_size = static_cast<u32>(m_emitter.GetCursorOffset());
961
*far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
962
armAsm = nullptr;
963
return code;
964
}
965
966
const char* CPU::ARM64Recompiler::GetHostRegName(u32 reg) const
967
{
968
static constexpr std::array<const char*, 32> reg64_names = {
969
{"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
970
"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
971
return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
972
}
973
974
void CPU::ARM64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
975
{
976
EmitMov(WRegister(reg), val);
977
}
978
979
void CPU::ARM64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
980
{
981
armAsm->ldr(WRegister(reg), PTR(ptr));
982
}
983
984
void CPU::ARM64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
985
{
986
armAsm->str(WRegister(reg), PTR(ptr));
987
}
988
989
void CPU::ARM64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
990
{
991
if (val == 0)
992
{
993
armAsm->str(wzr, PTR(ptr));
994
return;
995
}
996
997
EmitMov(RWSCRATCH, val);
998
armAsm->str(RWSCRATCH, PTR(ptr));
999
}
1000
1001
void CPU::ARM64Recompiler::CopyHostReg(u32 dst, u32 src)
1002
{
1003
if (src != dst)
1004
armAsm->mov(WRegister(dst), WRegister(src));
1005
}
1006
1007
void CPU::ARM64Recompiler::AssertRegOrConstS(CompileFlags cf) const
1008
{
1009
DebugAssert(cf.valid_host_s || cf.const_s);
1010
}
1011
1012
void CPU::ARM64Recompiler::AssertRegOrConstT(CompileFlags cf) const
1013
{
1014
DebugAssert(cf.valid_host_t || cf.const_t);
1015
}
1016
1017
vixl::aarch64::MemOperand CPU::ARM64Recompiler::MipsPtr(Reg r) const
1018
{
1019
DebugAssert(r < Reg::count);
1020
return PTR(&g_state.regs.r[static_cast<u32>(r)]);
1021
}
1022
1023
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegD(CompileFlags cf) const
1024
{
1025
DebugAssert(cf.valid_host_d);
1026
return WRegister(cf.host_d);
1027
}
1028
1029
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegS(CompileFlags cf) const
1030
{
1031
DebugAssert(cf.valid_host_s);
1032
return WRegister(cf.host_s);
1033
}
1034
1035
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegT(CompileFlags cf) const
1036
{
1037
DebugAssert(cf.valid_host_t);
1038
return WRegister(cf.host_t);
1039
}
1040
1041
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegLO(CompileFlags cf) const
1042
{
1043
DebugAssert(cf.valid_host_lo);
1044
return WRegister(cf.host_lo);
1045
}
1046
1047
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegHI(CompileFlags cf) const
1048
{
1049
DebugAssert(cf.valid_host_hi);
1050
return WRegister(cf.host_hi);
1051
}
1052
1053
void CPU::ARM64Recompiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
1054
{
1055
DebugAssert(dst.IsW());
1056
if (cf.valid_host_s)
1057
{
1058
if (cf.host_s != dst.GetCode())
1059
armAsm->mov(dst, WRegister(cf.host_s));
1060
}
1061
else if (cf.const_s)
1062
{
1063
const u32 cv = GetConstantRegU32(cf.MipsS());
1064
if (cv == 0)
1065
armAsm->mov(dst, wzr);
1066
else
1067
EmitMov(dst, cv);
1068
}
1069
else
1070
{
1071
WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
1072
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
1073
}
1074
}
1075
1076
void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
1077
{
1078
DebugAssert(dst.IsW());
1079
if (cf.valid_host_t)
1080
{
1081
if (cf.host_t != dst.GetCode())
1082
armAsm->mov(dst, WRegister(cf.host_t));
1083
}
1084
else if (cf.const_t)
1085
{
1086
const u32 cv = GetConstantRegU32(cf.MipsT());
1087
if (cv == 0)
1088
armAsm->mov(dst, wzr);
1089
else
1090
EmitMov(dst, cv);
1091
}
1092
else
1093
{
1094
WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
1095
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
1096
}
1097
}
1098
1099
void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays)
1100
{
1101
DebugAssert(reg < Reg::count && dst.IsW());
1102
if (ignore_load_delays && m_load_delay_register == reg)
1103
{
1104
if (m_load_delay_value_register == NUM_HOST_REGS)
1105
armAsm->ldr(dst, PTR(&g_state.load_delay_value));
1106
else
1107
armAsm->mov(dst, WRegister(m_load_delay_value_register));
1108
}
1109
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
1110
{
1111
armAsm->mov(dst, WRegister(hreg.value()));
1112
}
1113
else if (HasConstantReg(reg))
1114
{
1115
EmitMov(dst, GetConstantRegU32(reg));
1116
}
1117
else
1118
{
1119
armAsm->ldr(dst, MipsPtr(reg));
1120
}
1121
}
1122
1123
void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
1124
Reg arg3reg /* = Reg::count */)
1125
{
1126
DebugAssert(g_settings.gpu_pgxp_enable);
1127
1128
Flush(FLUSH_FOR_C_CALL);
1129
1130
if (arg2reg != Reg::count)
1131
MoveMIPSRegToReg(RWARG2, arg2reg);
1132
if (arg3reg != Reg::count)
1133
MoveMIPSRegToReg(RWARG3, arg3reg);
1134
1135
EmitMov(RWARG1, arg1val);
1136
EmitCall(func);
1137
}
1138
1139
void CPU::ARM64Recompiler::Flush(u32 flags)
1140
{
1141
Recompiler::Flush(flags);
1142
1143
if (flags & FLUSH_PC && m_dirty_pc)
1144
{
1145
StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
1146
m_dirty_pc = false;
1147
}
1148
1149
if (flags & FLUSH_INSTRUCTION_BITS)
1150
{
1151
// This sucks, but it's only used for fallbacks.
1152
EmitMov(RWARG1, inst->bits);
1153
EmitMov(RWARG2, m_current_instruction_pc);
1154
EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
1155
armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
1156
armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
1157
armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
1158
}
1159
1160
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
1161
{
1162
// This sucks :(
1163
// TODO: make it a function?
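// regs.r[load_delay_reg] = load_delay_value: compute the byte offset of regs.r[0], add
// load_delay_reg * 4, store through RSTATE, then mark the delay slot empty.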
1164
armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
1165
armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
1166
EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
1167
armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
1168
armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
1169
EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
1170
armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
1171
m_load_delay_dirty = false;
1172
}
1173
1174
if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
1175
{
1176
if (m_load_delay_value_register != NUM_HOST_REGS)
1177
FreeHostReg(m_load_delay_value_register);
1178
1179
EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
1180
armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
1181
m_load_delay_register = Reg::count;
1182
m_load_delay_dirty = true;
1183
}
1184
1185
if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
1186
{
1187
// May as well flush cycles while we're here.
1188
// GTE spanning blocks is very rare, we _could_ disable this for speed.
1189
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1190
armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
1191
if (m_cycles > 0)
1192
{
1193
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1194
m_cycles = 0;
1195
}
1196
armAsm->cmp(RWARG2, RWARG1);
1197
armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
1198
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1199
m_dirty_gte_done_cycle = false;
1200
}
1201
1202
if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
1203
{
1204
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1205
1206
// update cycles at the same time
1207
if (flags & FLUSH_CYCLES && m_cycles > 0)
1208
{
1209
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1210
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1211
m_gte_done_cycle -= m_cycles;
1212
m_cycles = 0;
1213
}
1214
1215
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
1216
armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
1217
m_gte_done_cycle = 0;
1218
m_dirty_gte_done_cycle = true;
1219
}
1220
1221
if (flags & FLUSH_CYCLES && m_cycles > 0)
1222
{
1223
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1224
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1225
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1226
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
1227
m_cycles = 0;
1228
}
1229
}
1230
1231
void CPU::ARM64Recompiler::Compile_Fallback()
1232
{
1233
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
1234
inst->bits);
1235
1236
Flush(FLUSH_FOR_INTERPRETER);
1237
1238
EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));
1239
1240
// TODO: make me less garbage
1241
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
1242
// but nothing should be going through here..
1243
Label no_load_delay;
1244
armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
1245
armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
1246
armAsm->b(&no_load_delay, eq);
1247
armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
1248
armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
1249
armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
1250
EmitMov(RWARG1, static_cast<u32>(Reg::count));
1251
armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
1252
armAsm->bind(&no_load_delay);
1253
1254
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
1255
}
1256
1257
void CPU::ARM64Recompiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)
1258
{
1259
DebugAssert(pcreg.IsW());
1260
if (!g_settings.cpu_recompiler_memory_exceptions)
1261
return;
1262
1263
armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
1264
SwitchToFarCode(true, ne);
1265
1266
BackupHostState();
1267
EndBlockWithException(Exception::AdEL);
1268
1269
RestoreHostState();
1270
SwitchToNearCode(false);
1271
}
1272
1273
void CPU::ARM64Recompiler::Compile_jr(CompileFlags cf)
1274
{
1275
const Register pcreg = CFGetRegS(cf);
1276
CheckBranchTarget(pcreg);
1277
1278
armAsm->str(pcreg, PTR(&g_state.pc));
1279
1280
CompileBranchDelaySlot(false);
1281
EndBlock(std::nullopt, true);
1282
}
1283
1284
void CPU::ARM64Recompiler::Compile_jalr(CompileFlags cf)
1285
{
1286
const Register pcreg = CFGetRegS(cf);
1287
if (MipsD() != Reg::zero)
1288
SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
1289
1290
CheckBranchTarget(pcreg);
1291
armAsm->str(pcreg, PTR(&g_state.pc));
1292
1293
CompileBranchDelaySlot(false);
1294
EndBlock(std::nullopt, true);
1295
}
1296
1297
void CPU::ARM64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
1298
{
1299
AssertRegOrConstS(cf);
1300
1301
const u32 taken_pc = GetConditionalBranchTarget(cf);
1302
1303
Flush(FLUSH_FOR_BRANCH);
1304
1305
DebugAssert(cf.valid_host_s);
1306
1307
// MipsT() here should equal zero for zero branches.
1308
DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
1309
1310
Label taken;
1311
const Register rs = CFGetRegS(cf);
1312
switch (cond)
1313
{
1314
case BranchCondition::Equal:
1315
case BranchCondition::NotEqual:
1316
{
1317
AssertRegOrConstT(cf);
1318
if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
1319
{
1320
(cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
1321
}
1322
else
1323
{
1324
if (cf.valid_host_t)
1325
armAsm->cmp(rs, CFGetRegT(cf));
1326
else if (cf.const_t)
1327
armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
1328
1329
armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
1330
}
1331
}
1332
break;
1333
1334
case BranchCondition::GreaterThanZero:
1335
{
1336
armAsm->cmp(rs, 0);
1337
armAsm->b(&taken, gt);
1338
}
1339
break;
1340
1341
case BranchCondition::GreaterEqualZero:
1342
{
1343
armAsm->cmp(rs, 0);
1344
armAsm->b(&taken, ge);
1345
}
1346
break;
1347
1348
case BranchCondition::LessThanZero:
1349
{
1350
armAsm->cmp(rs, 0);
1351
armAsm->b(&taken, lt);
1352
}
1353
break;
1354
1355
case BranchCondition::LessEqualZero:
1356
{
1357
armAsm->cmp(rs, 0);
1358
armAsm->b(&taken, le);
1359
}
1360
break;
1361
}
1362
1363
BackupHostState();
1364
if (!cf.delay_slot_swapped)
1365
CompileBranchDelaySlot();
1366
1367
EndBlock(m_compiler_pc, true);
1368
1369
armAsm->bind(&taken);
1370
1371
RestoreHostState();
1372
if (!cf.delay_slot_swapped)
1373
CompileBranchDelaySlot();
1374
1375
EndBlock(taken_pc, true);
1376
}
1377
1378
void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf, bool overflow)
1379
{
1380
const Register rs = CFGetRegS(cf);
1381
const Register rt = CFGetRegT(cf);
1382
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1383
{
1384
if (!overflow)
1385
{
1386
armAsm->add(rt, rs, armCheckAddSubConstant(imm));
1387
}
1388
else
1389
{
1390
armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
1391
TestOverflow(rt);
1392
}
1393
}
1394
else if (rt.GetCode() != rs.GetCode())
1395
{
1396
armAsm->mov(rt, rs);
1397
}
1398
}
1399
1400
void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf)
1401
{
1402
Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
1403
}
1404
1405
void CPU::ARM64Recompiler::Compile_addiu(CompileFlags cf)
1406
{
1407
Compile_addi(cf, false);
1408
}
1409
1410
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf)
1411
{
1412
Compile_slti(cf, true);
1413
}
1414
1415
void CPU::ARM64Recompiler::Compile_sltiu(CompileFlags cf)
1416
{
1417
Compile_slti(cf, false);
1418
}
1419
1420
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf, bool sign)
1421
{
1422
armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
1423
armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
1424
}
1425
1426
void CPU::ARM64Recompiler::Compile_andi(CompileFlags cf)
1427
{
1428
const Register rt = CFGetRegT(cf);
1429
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1430
armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
1431
else
1432
armAsm->mov(rt, wzr);
1433
}
1434
1435
void CPU::ARM64Recompiler::Compile_ori(CompileFlags cf)
1436
{
1437
const Register rt = CFGetRegT(cf);
1438
const Register rs = CFGetRegS(cf);
1439
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1440
armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
1441
else if (rt.GetCode() != rs.GetCode())
1442
armAsm->mov(rt, rs);
1443
}
1444
1445
void CPU::ARM64Recompiler::Compile_xori(CompileFlags cf)
1446
{
1447
const Register rt = CFGetRegT(cf);
1448
const Register rs = CFGetRegS(cf);
1449
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1450
armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
1451
else if (rt.GetCode() != rs.GetCode())
1452
armAsm->mov(rt, rs);
1453
}
1454
1455
void CPU::ARM64Recompiler::Compile_shift(CompileFlags cf,
1456
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
1457
const vixl::aarch64::Register&, unsigned))
1458
{
1459
const Register rd = CFGetRegD(cf);
1460
const Register rt = CFGetRegT(cf);
1461
if (inst->r.shamt > 0)
1462
(armAsm->*op)(rd, rt, inst->r.shamt);
1463
else if (rd.GetCode() != rt.GetCode())
1464
armAsm->mov(rd, rt);
1465
}
1466
1467
void CPU::ARM64Recompiler::Compile_sll(CompileFlags cf)
1468
{
1469
Compile_shift(cf, &Assembler::lsl);
1470
}
1471
1472
void CPU::ARM64Recompiler::Compile_srl(CompileFlags cf)
1473
{
1474
Compile_shift(cf, &Assembler::lsr);
1475
}
1476
1477
void CPU::ARM64Recompiler::Compile_sra(CompileFlags cf)
1478
{
1479
Compile_shift(cf, &Assembler::asr);
1480
}
1481
1482
void CPU::ARM64Recompiler::Compile_variable_shift(
1483
CompileFlags cf,
1484
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
1485
const vixl::aarch64::Register&),
1486
void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
1487
{
1488
const Register rd = CFGetRegD(cf);
1489
1490
AssertRegOrConstS(cf);
1491
AssertRegOrConstT(cf);
1492
1493
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1494
if (!cf.valid_host_t)
1495
MoveTToReg(rt, cf);
1496
1497
if (cf.const_s)
1498
{
1499
if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
1500
(armAsm->*op_const)(rd, rt, shift);
1501
else if (rd.GetCode() != rt.GetCode())
1502
armAsm->mov(rd, rt);
1503
}
1504
else
1505
{
1506
(armAsm->*op)(rd, rt, CFGetRegS(cf));
1507
}
1508
}
1509
1510
void CPU::ARM64Recompiler::Compile_sllv(CompileFlags cf)
1511
{
1512
Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
1513
}
1514
1515
void CPU::ARM64Recompiler::Compile_srlv(CompileFlags cf)
1516
{
1517
Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
1518
}
1519
1520
void CPU::ARM64Recompiler::Compile_srav(CompileFlags cf)
1521
{
1522
Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
1523
}
1524
1525
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf, bool sign)
1526
{
1527
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1528
if (!cf.valid_host_s)
1529
MoveSToReg(rs, cf);
1530
1531
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1532
if (!cf.valid_host_t)
1533
MoveTToReg(rt, cf);
1534
1535
// TODO: if lo/hi gets killed, we can use a 32-bit multiply
1536
const Register lo = CFGetRegLO(cf);
1537
const Register hi = CFGetRegHI(cf);
1538
1539
(sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
1540
armAsm->lsr(hi.X(), lo.X(), 32);
1541
}
1542
1543
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf)
1544
{
1545
Compile_mult(cf, true);
1546
}
1547
1548
void CPU::ARM64Recompiler::Compile_multu(CompileFlags cf)
1549
{
1550
Compile_mult(cf, false);
1551
}
1552
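// Signed division mirrors the R3000A's observed results for the special cases:
// dividing by zero yields lo = (rs >= 0 ? -1 : 1) with hi = rs, and 0x80000000 / -1
// yields lo = 0x80000000, hi = 0; everything else uses sdiv/msub for quotient and
// remainder.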
1553
void CPU::ARM64Recompiler::Compile_div(CompileFlags cf)
1554
{
1555
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1556
if (!cf.valid_host_s)
1557
MoveSToReg(rs, cf);
1558
1559
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1560
if (!cf.valid_host_t)
1561
MoveTToReg(rt, cf);
1562
1563
const Register rlo = CFGetRegLO(cf);
1564
const Register rhi = CFGetRegHI(cf);
1565
1566
// TODO: This could be slightly more optimal
1567
Label done;
1568
Label not_divide_by_zero;
1569
armAsm->cbnz(rt, &not_divide_by_zero);
1570
armAsm->mov(rhi, rs); // hi = num
1571
EmitMov(rlo, 1);
1572
EmitMov(RWSCRATCH, static_cast<u32>(-1));
1573
armAsm->cmp(rs, 0);
1574
armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
1575
armAsm->b(&done);
1576
1577
armAsm->bind(&not_divide_by_zero);
1578
Label not_unrepresentable;
1579
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
1580
armAsm->b(&not_unrepresentable, ne);
1581
armAsm->cmp(rt, armCheckCompareConstant(-1));
1582
armAsm->b(&not_unrepresentable, ne);
1583
1584
EmitMov(rlo, 0x80000000u);
1585
EmitMov(rhi, 0);
1586
armAsm->b(&done);
1587
1588
armAsm->bind(&not_unrepresentable);
1589
1590
armAsm->sdiv(rlo, rs, rt);
1591
1592
// TODO: skip when hi is dead
1593
armAsm->msub(rhi, rlo, rt, rs);
1594
1595
armAsm->bind(&done);
1596
}
1597
1598
void CPU::ARM64Recompiler::Compile_divu(CompileFlags cf)
1599
{
1600
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1601
if (!cf.valid_host_s)
1602
MoveSToReg(rs, cf);
1603
1604
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1605
if (!cf.valid_host_t)
1606
MoveTToReg(rt, cf);
1607
1608
const Register rlo = CFGetRegLO(cf);
1609
const Register rhi = CFGetRegHI(cf);
1610
1611
Label done;
1612
Label not_divide_by_zero;
1613
armAsm->cbnz(rt, &not_divide_by_zero);
1614
EmitMov(rlo, static_cast<u32>(-1));
1615
armAsm->mov(rhi, rs);
1616
armAsm->b(&done);
1617
1618
armAsm->bind(&not_divide_by_zero);
1619
1620
armAsm->udiv(rlo, rs, rt);
1621
1622
// TODO: skip when hi is dead
1623
armAsm->msub(rhi, rlo, rt, rs);
1624
1625
armAsm->bind(&done);
1626
}
1627
1628
void CPU::ARM64Recompiler::TestOverflow(const vixl::aarch64::Register& result)
1629
{
1630
DebugAssert(result.IsW());
1631
SwitchToFarCode(true, vs);
1632
1633
BackupHostState();
1634
1635
// toss the result
1636
ClearHostReg(result.GetCode());
1637
1638
EndBlockWithException(Exception::Ov);
1639
1640
RestoreHostState();
1641
1642
SwitchToNearCode(false);
1643
}
1644
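// Shared emitter for three-operand ALU ops: 'commutative' lets a constant on either
// side be folded into the immediate form, 'logical' selects logical-immediate encoding
// checks instead of add/sub ones, and 'overflow' appends the signed-overflow trap via
// TestOverflow().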
1645
void CPU::ARM64Recompiler::Compile_dst_op(CompileFlags cf,
1646
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
1647
const vixl::aarch64::Register&,
1648
const vixl::aarch64::Operand&),
1649
bool commutative, bool logical, bool overflow)
1650
{
1651
AssertRegOrConstS(cf);
1652
AssertRegOrConstT(cf);
1653
1654
const Register rd = CFGetRegD(cf);
1655
if (cf.valid_host_s && cf.valid_host_t)
1656
{
1657
(armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
1658
}
1659
else if (commutative && (cf.const_s || cf.const_t))
1660
{
1661
const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
1662
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1663
{
1664
(armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1665
}
1666
else
1667
{
1668
if (rd.GetCode() != src.GetCode())
1669
armAsm->mov(rd, src);
1670
overflow = false;
1671
}
1672
}
1673
else if (cf.const_s)
1674
{
1675
// TODO: Check where we can use wzr here
1676
EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
1677
(armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
1678
}
1679
else if (cf.const_t)
1680
{
1681
const Register rs = CFGetRegS(cf);
1682
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1683
{
1684
(armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1685
}
1686
else
1687
{
1688
if (rd.GetCode() != rs.GetCode())
1689
armAsm->mov(rd, rs);
1690
overflow = false;
1691
}
1692
}
1693
1694
if (overflow)
1695
TestOverflow(rd);
1696
}
1697
1698
void CPU::ARM64Recompiler::Compile_add(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::adds, true, false, true);
  else
    Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_sub(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::subs, false, false, true);
  else
    Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_and(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // special cases - and with self -> self, and with 0 -> 0
  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    armAsm->mov(regd, CFGetRegS(cf));
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    armAsm->mov(regd, wzr);
    return;
  }

  Compile_dst_op(cf, &Assembler::and_, true, true, false);
}

void CPU::ARM64Recompiler::Compile_or(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // or/nor with 0 -> no effect
  const Register regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::orr, true, true, false);
}

void CPU::ARM64Recompiler::Compile_xor(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    armAsm->mov(regd, wzr);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::eor, true, true, false);
}

void CPU::ARM64Recompiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::ARM64Recompiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf, bool sign)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // TODO: swap and reverse op for constants
  if (cf.const_s)
  {
    EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
    armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
  }
  else
  {
    armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
  }

  armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
}

vixl::aarch64::Register
CPU::ARM64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
                                                 const std::optional<const vixl::aarch64::Register>& reg)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const Register dst = reg.has_value() ? reg.value() : RWARG1;
  if (address.has_value())
  {
    EmitMov(dst, address.value());
  }
  else if (imm == 0)
  {
    if (cf.valid_host_s)
    {
      if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
        armAsm->mov(dst, CFGetRegS(cf));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
    }
  }
  else
  {
    if (cf.valid_host_s)
    {
      armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
      armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
  }

  return dst;
}

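// Emits the host code for a guest memory read. On the fastmem path this is a single load through
// RMEMBASE (or through a table entry fetched via the fastmem LUT), registered with
// AddLoadStoreInfo() so a faulting access can be located and backpatched. On the slow path it calls
// the C++ read thunks; when memory exceptions are enabled, a negative return value is turned into
// CAUSE bits and routed to CPU::RaiseException() in far code.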
template<typename RegAllocFn>
vixl::aarch64::Register CPU::ARM64Recompiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,
                                                           MemoryAccessSize size, bool sign, bool use_fastmem,
                                                           const RegAllocFn& dst_reg_alloc)
{
  DebugAssert(addr_reg.IsW());
  if (use_fastmem)
  {
    m_cycles += Bus::RAM_READ_TICKS;

    const Register dst = dst_reg_alloc();

    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
        break;

      case MemoryAccessSize::HalfWord:
        sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->ldr(dst, mem);
        break;
    }

    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
    return dst;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfBitSet(RXRET, 63);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->neg(temp.X(), RXRET);
    armAsm->lsl(temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const Register dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      if (dst_reg.GetCode() != RWRET.GetCode())
        armAsm->mov(dst_reg, RWRET);
    }
    break;
  }

  return dst_reg;
}

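// Store counterpart of GenerateLoad(): either a single fastmem store (also registered for
// backpatching), or a call to the write thunks. On the checked path, a non-zero return value from
// the thunk carries the exception code, which is shifted into the CAUSE layout and passed to
// CPU::RaiseException() in far code.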
void CPU::ARM64Recompiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
                                         const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
                                         bool use_fastmem)
{
  DebugAssert(addr_reg.IsW() && value_reg.IsW());
  if (use_fastmem)
  {
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        armAsm->strb(value_reg, mem);
        break;

      case MemoryAccessSize::HalfWord:
        armAsm->strh(value_reg, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->str(value_reg, mem);
        break;
    }
    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
    return;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);
  if (value_reg.GetCode() != RWARG2.GetCode())
    armAsm->mov(RWARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->lsl(temp, RWRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}

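// lb/lbu/lh/lhu/lw: computes the effective address, performs the load, and writes the result into
// the load delay slot (or straight into the target register when load delays are not emulated).
// With PGXP enabled, the address is kept in a callee-saved temporary so it can be handed to the
// PGXP load callback afterwards.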
void CPU::ARM64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {
    if (cf.MipsT() == Reg::zero)
      return RWRET;

    return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                     EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
                                     cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    armAsm->mov(RWARG3, data);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

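// lwl/lwr: loads the aligned word containing the address, then merges the selected bytes into the
// current value of rt according to the low two bits of the address, following the mask expressions
// in the comments below. The merged value goes into the (next) load delay slot so that back-to-back
// lwl/lwr pairs combine correctly.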
void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  // Do PGXP first, it does its own load.
  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.GetCode());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but is itself load delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush.
  const Reg rt = inst->r.rt;
  Register value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = WRegister(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        armAsm->mov(value, WRegister(rtreg.value()));
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        armAsm->ldr(value, MipsPtr(rt));
    }
    else
    {
      value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
  armAsm->and_(RWARG2, addr, 3);
  armAsm->lsl(RWARG2, RWARG2, 3); // *8
  EmitMov(RWARG3, 24);
  armAsm->sub(RWARG3, RWARG3, RWARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    EmitMov(RWSCRATCH, 0xFFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->lslv(RWRET, RWRET, RWARG3);
    armAsm->orr(value, value, RWRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    armAsm->lsrv(RWRET, RWRET, RWARG2);
    EmitMov(RWSCRATCH, 0xFFFFFF00u);
    armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->orr(value, value, RWRET);
  }

  FreeHostReg(addr.GetCode());
}

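// lwc2: loads a word from memory and writes it into the selected GTE register, applying the
// per-register action (direct store, 16-bit sign/zero extension, handler call, or SXY FIFO push).
// With PGXP enabled, the address and value are forwarded to PGXP::CPU_LWC2 afterwards.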
void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RWRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      armAsm->str(value, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      armAsm->sxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      armAsm->uxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      armAsm->mov(RWARG2, value);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
      armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
      armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
      armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, value);
    if (value.GetCode() != RWRET.GetCode())
      FreeHostReg(value.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr_reg.value().GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

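// sb/sh/sw: computes the effective address, moves the value into RWARG2 if it is not already in a
// host register, and emits the store. With PGXP enabled, the address/value pair is also passed to
// the matching PGXP store callback.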
void CPU::ARM64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, true, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RWARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RWARG3, cf.MipsT());
    armAsm->mov(RWARG2, addr);
    EmitMov(RWARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

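// swl/swr: loads the aligned word containing the address, merges in the bytes selected by the low
// two bits of the address from rt (see the mask expressions in the comments below), and writes the
// combined word back.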
void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  armAsm->and_(RWSCRATCH, addr, 3);
  armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
  armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));

  MoveMIPSRegToReg(RWARG2, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RWARG3, 0xFFFFFF00u);
    armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
    armAsm->and_(RWRET, RWRET, RWARG3);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    armAsm->lsrv(RWARG2, RWARG2, RWARG3);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    EmitMov(RWSCRATCH, 0x00FFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(RWRET, RWRET, RWSCRATCH);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }

  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
  FreeHostReg(addr.GetCode());
}

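// swc2: reads the selected GTE register (directly, or via GTE::ReadRegister when a handler is
// needed) and stores it to memory. With PGXP enabled, the address and data live in callee-saved
// temporaries so they can be forwarded to PGXP::CPU_SWC2 after the store.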
void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RWARG1;
  const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      armAsm->ldr(data, PTR(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      armAsm->mov(data, RWRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);
  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr.GetCode() != RWARG1.GetCode())
      FreeHostReg(addr.GetCode());
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, data);
    FreeHostReg(data.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

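// mtc0: merges the new value into the cop0 register under its write mask. SR and CAUSE writes can
// unmask a pending interrupt, so both fall through to TestInterrupts(); SR also refreshes the
// memory map and fastmem base when the cache isolation bit (bit 16) changes, and DCIC/BPCM writes
// may enable the debug dispatcher, in which case execution exits the block.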
void CPU::ARM64Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Register new_value = RWARG1;
  const Register old_value = RWARG2;
  const Register changed_bits = RWARG3;
  const Register mask_reg = RWSCRATCH;

  // Load old value
  armAsm->ldr(old_value, PTR(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    armAsm->eor(changed_bits, old_value, new_value);
  armAsm->bic(old_value, old_value, mask_reg);
  armAsm->orr(new_value, old_value, new_value);
  armAsm->str(new_value, PTR(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
    armAsm->tbz(changed_bits, 16, &caches_unchanged);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    armAsm->ldr(RWARG1, PTR(ptr)); // reload value for interrupt test below
    if (CodeCache::IsUsingFastmem())
      armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
    armAsm->bind(&caches_unchanged);

    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCodeIfRegZeroOrNonZero(RWRET, true);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::ARM64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
  armAsm->bfxil(RWARG1, RWARG1, 2, 4);
  armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));

  TestInterrupts(RWARG1);
}

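// Tests whether an interrupt is both enabled (SR.IEc) and pending (SR.Im & CAUSE.Ip), and if so
// raises Exception::INT from far code, ending the block. Note that the incoming 'sr' register is
// clobbered by the check.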
void CPU::ARM64Recompiler::TestInterrupts(const vixl::aarch64::Register& sr)
{
  DebugAssert(sr.IsW());

  // if IEc == 0 then goto no_interrupt
  Label no_interrupt;
  armAsm->tbz(sr, 0, &no_interrupt);

  // sr & cause
  armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
  armAsm->and_(sr, sr, RWSCRATCH);

  // if ((sr & cause) & 0xff00) == 0 then goto no_interrupt
  armAsm->tst(sr, 0xFF00);

  SwitchToFarCode(true, ne);
  BackupHostState();

  // Update the load delay; this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    EmitMov(RWARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      EmitMov(RWARG1, m_compiler_pc);
    armAsm->str(wzr, PTR(&g_state.downcount));
    if (m_dirty_pc)
      armAsm->str(RWARG1, PTR(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  armAsm->bind(&no_interrupt);
}

void CPU::ARM64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->ldr(WRegister(hreg), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->mov(WRegister(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, WRegister(hreg));
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      armAsm->str(CFGetRegT(cf), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
      armAsm->str(RWARG1, PTR(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
    armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
    armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
    armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
    armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::ARM64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

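// Builds the slow-path thunk that a faulting fastmem access is backpatched to call: it spills the
// live caller-saved registers listed in gpr_bitmask, applies the cycle adjustments to
// pending_ticks, invokes the unchecked memory thunk, extends/moves the result into the destination
// register for loads, restores the spilled registers, and jumps back to the code following the
// patched access.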
u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                           bool is_load)
{
  Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
  Assembler* armAsm = &arm_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);

  // TODO: use stp+ldp, vixl helper?

  if (stack_size > 0)
  {
    armAsm->sub(sp, sp, stack_size);

    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    Assert(Assembler::IsImmAddSub(cycles_to_add));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  if (address_register != static_cast<u8>(RWARG1.GetCode()))
    armAsm->mov(RWARG1, WRegister(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.GetCode()))
      armAsm->mov(RWARG2, WRegister(data_register));
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte),
                  false);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord),
                  false);
    }
    break;
    case MemoryAccessSize::Word:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord),
                  false);
    }
    break;
  }

  if (is_load)
  {
    const WRegister dst = WRegister(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst.GetCode() != RWRET.GetCode())
          armAsm->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    Assert(Assembler::IsImmAddSub(cycles_to_remove));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }

    armAsm->add(sp, sp, stack_size);
  }

  armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
  armAsm->FinalizeCode();

  return static_cast<u32>(armAsm->GetCursorOffset());
}

#endif // CPU_ARCH_ARM64
