GitHub Repository: stenzek/duckstation
Path: blob/master/src/core/cpu_recompiler_arm64.cpp
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
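// AArch64 (ARM64) backend for the CPU recompiler, emitting host code via the vixl assembler.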
3
4
#include "cpu_recompiler_arm64.h"
5
#include "cpu_core_private.h"
6
#include "cpu_pgxp.h"
7
#include "gte.h"
8
#include "settings.h"
9
#include "timing_event.h"
10
11
#include "common/align.h"
12
#include "common/assert.h"
13
#include "common/log.h"
14
#include "common/memmap.h"
15
#include "common/string_util.h"
16
17
#include <limits>
18
19
#ifdef CPU_ARCH_ARM64
20
21
#include "vixl/aarch64/constants-aarch64.h"
22
23
#ifdef ENABLE_HOST_DISASSEMBLY
24
#include "vixl/aarch64/disasm-aarch64.h"
25
#endif
26
27
LOG_CHANNEL(Recompiler);
28
29
#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
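// PTR(x) builds a MemOperand addressing field x of g_state relative to RSTATE,
// e.g. armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)).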
30
31
#define RWRET vixl::aarch64::w0
32
#define RXRET vixl::aarch64::x0
33
#define RWARG1 vixl::aarch64::w0
34
#define RXARG1 vixl::aarch64::x0
35
#define RWARG2 vixl::aarch64::w1
36
#define RXARG2 vixl::aarch64::x1
37
#define RWARG3 vixl::aarch64::w2
38
#define RXARG3 vixl::aarch64::x2
39
#define RWSCRATCH vixl::aarch64::w16
40
#define RXSCRATCH vixl::aarch64::x16
41
#define RSTATE vixl::aarch64::x19
42
#define RMEMBASE vixl::aarch64::x20
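// Register conventions for generated code: w0-w2/x0-x2 double as return value and call
// arguments, w16/x16 is scratch, x19 (RSTATE) holds &g_state for the duration of a block,
// and x20 (RMEMBASE) holds the fastmem base when fastmem is enabled.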
43
44
static bool armIsCallerSavedRegister(u32 id);
45
static s64 armGetPCDisplacement(const void* current, const void* target);
46
static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
47
static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
48
static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
49
static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
50
static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
51
static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
52
static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
53
bool sign_extend_word = false);
54
static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
55
const vixl::aarch64::Register& tempreg = RXSCRATCH);
56
static u8* armGetJumpTrampoline(const void* target);
57
static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);
58
59
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
60
static std::unordered_map<const void*, u32> s_trampoline_targets;
61
static u8* s_trampoline_start_ptr = nullptr;
62
static u32 s_trampoline_used = 0;
63
64
namespace CPU {
65
66
using namespace vixl::aarch64;
67
68
static ARM64Recompiler s_instance;
69
Recompiler* g_compiler = &s_instance;
70
71
} // namespace CPU
72
73
bool armIsCallerSavedRegister(u32 id)
74
{
75
// Same on both Linux and Windows: x0-x18 are treated as caller-saved.
76
return (id <= 18);
77
}
78
79
void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm)
80
{
81
// From vixl macro assembler.
82
DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());
83
DebugAssert(rd.GetCode() != vixl::aarch64::sp.GetCode());
84
85
if (imm == 0)
86
{
87
armAsm->mov(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd));
88
return;
89
}
90
91
// The worst case for size is mov 64-bit immediate to sp:
92
// * up to 4 instructions to materialise the constant
93
// * 1 instruction to move to sp
94
95
// Immediates on Aarch64 can be produced using an initial value, and zero to
96
// three move keep operations.
97
//
98
// Initial values can be generated with:
99
// 1. 64-bit move zero (movz).
100
// 2. 32-bit move inverted (movn).
101
// 3. 64-bit move inverted.
102
// 4. 32-bit orr immediate.
103
// 5. 64-bit orr immediate.
104
// Move-keep may then be used to modify each of the 16-bit half words.
105
//
106
// The code below supports all five initial value generators, and
107
// applying move-keep operations to move-zero and move-inverted initial
108
// values.
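// Illustrative example: the 32-bit immediate 0x00120034 is emitted as
//   movz rd, #0x0034
//   movk rd, #0x0012, lsl #16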
109
110
// Try to move the immediate in one instruction, and if that fails, switch to
111
// using multiple instructions.
112
const unsigned reg_size = rd.GetSizeInBits();
113
114
if (vixl::aarch64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())
115
{
116
// Immediate can be represented in a move zero instruction. Movz can't write
117
// to the stack pointer.
118
armAsm->movz(rd, imm);
119
return;
120
}
121
else if (vixl::aarch64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())
122
{
123
// Immediate can be represented in a move negative instruction. Movn can't
124
// write to the stack pointer.
125
armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & vixl::aarch64::kWRegMask));
126
return;
127
}
128
else if (vixl::aarch64::Assembler::IsImmLogical(imm, reg_size))
129
{
130
// Immediate can be represented in a logical orr instruction.
131
DebugAssert(!rd.IsZero());
132
armAsm->orr(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd), imm);
133
return;
134
}
135
136
// Generic immediate case. Imm will be represented by
137
// [imm3, imm2, imm1, imm0], where each imm is 16 bits.
138
// A move-zero or move-inverted is generated for the first non-zero or
139
// non-0xffff immX, and a move-keep for subsequent non-zero immX.
140
141
uint64_t ignored_halfword = 0;
142
bool invert_move = false;
143
// If the number of 0xffff halfwords is greater than the number of 0x0000
144
// halfwords, it's more efficient to use move-inverted.
145
if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))
146
{
147
ignored_halfword = 0xffff;
148
invert_move = true;
149
}
150
151
// Iterate through the halfwords. Use movn/movz for the first non-ignored
152
// halfword, and movk for subsequent halfwords.
153
DebugAssert((reg_size % 16) == 0);
154
bool first_mov_done = false;
155
for (unsigned i = 0; i < (reg_size / 16); i++)
156
{
157
uint64_t imm16 = (imm >> (16 * i)) & 0xffff;
158
if (imm16 != ignored_halfword)
159
{
160
if (!first_mov_done)
161
{
162
if (invert_move)
163
armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);
164
else
165
armAsm->movz(rd, imm16, 16 * i);
166
first_mov_done = true;
167
}
168
else
169
{
170
// Construct a wider constant.
171
armAsm->movk(rd, imm16, 16 * i);
172
}
173
}
174
}
175
176
DebugAssert(first_mov_done);
177
}
178
179
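// Returns the displacement from 'current' to 'target' in 4-byte instruction units,
// i.e. the value encoded in the immediate field of B/BL/B.cond.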
s64 armGetPCDisplacement(const void* current, const void* target)
180
{
181
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
182
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
183
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
184
}
185
186
bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr)
187
{
188
const void* cur = armAsm->GetCursorAddress<const void*>();
189
const void* current_code_ptr_page =
190
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
191
const void* ptr_page =
192
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
193
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
194
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
195
196
return (vixl::IsInt21(page_displacement) && (vixl::aarch64::Assembler::IsImmAddSub(page_offset) ||
197
vixl::aarch64::Assembler::IsImmLogical(page_offset, 64)));
198
}
199
200
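// Materializes an arbitrary address in 'reg'. Prefers a two-instruction adrp + add/orr
// sequence when the target page is within the +/-4GiB adrp range, otherwise falls back
// to a full immediate move via armEmitMov().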
void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr)
201
{
202
DebugAssert(reg.IsX());
203
204
const void* cur = armAsm->GetCursorAddress<const void*>();
205
const void* current_code_ptr_page =
206
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
207
const void* ptr_page =
208
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
209
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
210
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
211
if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmAddSub(page_offset))
212
{
213
armAsm->adrp(reg, page_displacement);
214
armAsm->add(reg, reg, page_offset);
215
}
216
else if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmLogical(page_offset, 64))
217
{
218
armAsm->adrp(reg, page_displacement);
219
armAsm->orr(reg, reg, page_offset);
220
}
221
else
222
{
223
armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));
224
}
225
}
226
227
void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
228
{
229
const void* cur = armAsm->GetCursorAddress<const void*>();
230
s64 displacement = armGetPCDisplacement(cur, ptr);
231
bool use_blr = !vixl::IsInt26(displacement);
232
bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
233
if (use_blr && use_trampoline && !force_inline)
234
{
235
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
236
{
237
displacement = armGetPCDisplacement(cur, trampoline);
238
use_blr = !vixl::IsInt26(displacement);
239
}
240
}
241
242
if (use_blr)
243
{
244
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
245
armAsm->br(RXSCRATCH);
246
}
247
else
248
{
249
armAsm->b(displacement);
250
}
251
}
252
253
void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
254
{
255
const void* cur = armAsm->GetCursorAddress<const void*>();
256
s64 displacement = armGetPCDisplacement(cur, ptr);
257
bool use_blr = !vixl::IsInt26(displacement);
258
bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
259
if (use_blr && use_trampoline && !force_inline)
260
{
261
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
262
{
263
displacement = armGetPCDisplacement(cur, trampoline);
264
use_blr = !vixl::IsInt26(displacement);
265
}
266
}
267
268
if (use_blr)
269
{
270
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
271
armAsm->blr(RXSCRATCH);
272
}
273
else
274
{
275
armAsm->bl(displacement);
276
}
277
}
278
279
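// Emits a conditional branch to 'ptr'. If the target lies outside the +/-1MiB range of
// B.cond, the condition is inverted and the branch is taken over an unconditional B instead.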
void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr)
280
{
281
const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
282
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
283
// pxAssert(Common::IsAligned(jump_distance, 4));
284
285
if (vixl::aarch64::Instruction::IsValidImmPCOffset(vixl::aarch64::CondBranchType, jump_distance >> 2))
286
{
287
armAsm->b(jump_distance >> 2, cond);
288
}
289
else
290
{
291
vixl::aarch64::Label branch_not_taken;
292
armAsm->b(&branch_not_taken, InvertCondition(cond));
293
294
const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
295
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
296
armAsm->b(new_jump_distance >> 2);
297
armAsm->bind(&branch_not_taken);
298
}
299
}
300
301
void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
302
bool sign_extend_word)
303
{
304
const void* cur = armAsm->GetCursorAddress<const void*>();
305
const void* current_code_ptr_page =
306
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
307
const void* ptr_page =
308
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
309
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
310
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
311
vixl::aarch64::MemOperand memop;
312
313
const vixl::aarch64::Register xreg = reg.X();
314
if (vixl::IsInt21(page_displacement))
315
{
316
armAsm->adrp(xreg, page_displacement);
317
memop = vixl::aarch64::MemOperand(xreg, static_cast<int64_t>(page_offset));
318
}
319
else
320
{
321
armMoveAddressToReg(armAsm, xreg, addr);
322
memop = vixl::aarch64::MemOperand(xreg);
323
}
324
325
if (sign_extend_word)
326
armAsm->ldrsw(reg, memop);
327
else
328
armAsm->ldr(reg, memop);
329
}
330
331
[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
332
const void* addr, const vixl::aarch64::Register& tempreg)
333
{
334
DebugAssert(tempreg.IsX());
335
336
const void* cur = armAsm->GetCursorAddress<const void*>();
337
const void* current_code_ptr_page =
338
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
339
const void* ptr_page =
340
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
341
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
342
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
343
344
if (vixl::IsInt21(page_displacement))
345
{
346
armAsm->adrp(tempreg, page_displacement);
347
armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast<int64_t>(page_offset)));
348
}
349
else
350
{
351
armMoveAddressToReg(armAsm, tempreg, addr);
352
armAsm->str(reg, vixl::aarch64::MemOperand(tempreg));
353
}
354
}
355
356
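// Returns (creating on first use) a trampoline stub for 'target': a far address move into
// RXSCRATCH followed by br. Used when a jump/call target is outside the +/-128MiB range of
// a direct B/BL and cannot be reached via adrp either.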
u8* armGetJumpTrampoline(const void* target)
357
{
358
auto it = s_trampoline_targets.find(target);
359
if (it != s_trampoline_targets.end())
360
return s_trampoline_start_ptr + it->second;
361
362
// align to 16 bytes?
363
const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);
364
365
// 4 movs plus a jump
366
if (TRAMPOLINE_AREA_SIZE - offset < 20)
367
{
368
Panic("Ran out of space in constant pool");
369
return nullptr;
370
}
371
372
u8* start = s_trampoline_start_ptr + offset;
373
vixl::aarch64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
374
#ifdef VIXL_DEBUG
375
vixl::CodeBufferCheckScope armAsmCheck(&armAsm, TRAMPOLINE_AREA_SIZE - offset,
376
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
377
#endif
378
armMoveAddressToReg(&armAsm, RXSCRATCH, target);
379
armAsm.br(RXSCRATCH);
380
armAsm.FinalizeCode();
381
382
const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
383
DebugAssert(size < 20);
384
s_trampoline_targets.emplace(target, offset);
385
s_trampoline_used = offset + static_cast<u32>(size);
386
387
MemMap::FlushInstructionCache(start, size);
388
return start;
389
}
390
391
void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
392
{
393
size_t addr = armAsm->GetCursorAddress<size_t>();
394
const size_t end_addr = Common::AlignUpPow2(addr, alignment);
395
while (addr != end_addr)
396
{
397
armAsm->nop();
398
addr += vixl::aarch64::kInstructionSize;
399
}
400
}
401
402
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
403
{
404
#ifdef ENABLE_HOST_DISASSEMBLY
405
class MyDisassembler : public vixl::aarch64::Disassembler
406
{
407
protected:
408
void ProcessOutput(const vixl::aarch64::Instruction* instr) override
409
{
410
DEBUG_LOG("0x{:016X} {:08X}\t\t{}", reinterpret_cast<uint64_t>(instr), instr->GetInstructionBits(), GetOutput());
411
}
412
};
413
414
vixl::aarch64::Decoder decoder;
415
MyDisassembler disas;
416
decoder.AppendVisitor(&disas);
417
decoder.Decode(static_cast<const vixl::aarch64::Instruction*>(start),
418
reinterpret_cast<const vixl::aarch64::Instruction*>(static_cast<const u8*>(start) + size));
419
#else
420
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
421
#endif
422
}
423
424
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
425
{
426
return size / vixl::aarch64::kInstructionSize;
427
}
428
429
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
430
{
431
using namespace vixl::aarch64;
432
433
const s64 disp = armGetPCDisplacement(code, dst);
434
DebugAssert(vixl::IsInt26(disp));
435
436
const u32 new_code = B | Assembler::ImmUncondBranch(disp);
437
std::memcpy(code, &new_code, sizeof(new_code));
438
if (flush_icache)
439
MemMap::FlushInstructionCache(code, kInstructionSize);
440
441
return kInstructionSize;
442
}
443
444
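// Emits the assembly entry points (enter_recompiler, event dispatch, block dispatcher,
// compile/revalidate, discard/recompile and interpreter stubs) into 'code', and reserves
// TRAMPOLINE_AREA_SIZE bytes after them for far-branch trampolines.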
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
445
{
446
using namespace vixl::aarch64;
447
448
Assembler actual_asm(static_cast<u8*>(code), code_size);
449
Assembler* RESTRICT armAsm = &actual_asm;
450
451
#ifdef VIXL_DEBUG
452
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
453
#endif
454
455
Label dispatch;
456
Label run_events_and_dispatch;
457
458
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
459
{
460
// Need the CPU state for basically everything :-)
461
armMoveAddressToReg(armAsm, RSTATE, &g_state);
462
463
// Fastmem setup, oldrec doesn't need it
464
if (IsUsingFastmem())
465
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
466
467
// Fall through to event dispatcher
468
}
469
470
// Check whether events need to run (e.g. frame done) before dispatching.
471
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
472
{
473
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
474
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
475
armAsm->cmp(RWARG1, RWARG2);
476
armAsm->b(&dispatch, lt);
477
478
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
479
armAsm->bind(&run_events_and_dispatch);
480
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
481
}
482
483
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
484
g_dispatcher = armAsm->GetCursorAddress<const void*>();
485
{
486
armAsm->bind(&dispatch);
487
488
// RXARG2 <- g_code_lut[pc >> 16]
489
armAsm->ldr(RWARG1, PTR(&g_state.pc));
490
armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
491
armAsm->lsr(RWARG2, RWARG1, 16);
492
armAsm->ubfx(RWARG1, RWARG1, 2, 14);
493
armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));
494
495
// br(RXARG2[(pc & 0xFFFF) >> 2])
496
armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));
497
armAsm->br(RXARG1);
498
}
499
500
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
501
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
502
{
503
armAsm->ldr(RWARG1, PTR(&g_state.pc));
504
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
505
armAsm->b(&dispatch);
506
}
507
508
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
509
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
510
{
511
armAsm->ldr(RWARG1, PTR(&g_state.pc));
512
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
513
armAsm->b(&dispatch);
514
}
515
516
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
517
g_interpret_block = armAsm->GetCursorAddress<const void*>();
518
{
519
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
520
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
521
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
522
armAsm->cmp(RWARG1, RWARG2);
523
armAsm->b(&run_events_and_dispatch, ge);
524
armAsm->b(&dispatch);
525
}
526
527
armAsm->FinalizeCode();
528
529
s_trampoline_targets.clear();
530
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
531
s_trampoline_used = 0;
532
533
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
534
}
535
536
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
537
{
538
constexpr u8 padding_value = 0x00;
539
std::memset(dst, padding_value, size);
540
}
541
542
CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
543
{
544
}
545
546
CPU::ARM64Recompiler::~ARM64Recompiler() = default;
547
548
const void* CPU::ARM64Recompiler::GetCurrentCodePointer()
549
{
550
return armAsm->GetCursorAddress<const void*>();
551
}
552
553
void CPU::ARM64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
554
u32 far_code_space)
555
{
556
Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
557
558
// TODO: don't recreate this every time..
559
DebugAssert(!armAsm);
560
m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
561
m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
562
armAsm = &m_emitter;
563
564
#ifdef VIXL_DEBUG
565
m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,
566
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
567
m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
568
&m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
569
#endif
570
571
// Need to wipe it out so it's correct when toggling fastmem.
572
m_host_regs = {};
573
574
const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
575
for (u32 i = 0; i < NUM_HOST_REGS; i++)
576
{
577
HostRegAlloc& ra = m_host_regs[i];
578
579
if (i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||
580
i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30)
581
{
582
continue;
583
}
584
585
ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
586
}
587
}
588
589
void CPU::ARM64Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
590
{
591
DebugAssert(armAsm == &m_emitter);
592
if (emit_jump)
593
{
594
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
595
if (cond != Condition::al)
596
{
597
if (vixl::IsInt19(disp))
598
{
599
armAsm->b(disp, cond);
600
}
601
else
602
{
603
Label skip;
604
armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
605
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
606
armAsm->bind(&skip);
607
}
608
}
609
else
610
{
611
armAsm->b(disp);
612
}
613
}
614
armAsm = &m_far_emitter;
615
}
616
617
void CPU::ARM64Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
618
{
619
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
620
if (vixl::IsInt14(disp))
621
{
622
armAsm->tbnz(reg, bit, disp);
623
}
624
else
625
{
626
Label skip;
627
armAsm->tbz(reg, bit, &skip);
628
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
629
armAsm->bind(&skip);
630
}
631
632
armAsm = &m_far_emitter;
633
}
634
635
void CPU::ARM64Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
636
{
637
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
638
if (vixl::IsInt19(disp))
639
{
640
nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
641
}
642
else
643
{
644
Label skip;
645
nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
646
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
647
armAsm->bind(&skip);
648
}
649
650
armAsm = &m_far_emitter;
651
}
652
653
void CPU::ARM64Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
654
{
655
DebugAssert(armAsm == &m_far_emitter);
656
if (emit_jump)
657
{
658
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
659
(cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
660
}
661
armAsm = &m_emitter;
662
}
663
664
void CPU::ARM64Recompiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)
665
{
666
armEmitMov(armAsm, dst, val);
667
}
668
669
void CPU::ARM64Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
670
{
671
armEmitCall(armAsm, ptr, force_inline);
672
}
673
674
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(s32 val)
675
{
676
if (Assembler::IsImmAddSub(val))
677
return vixl::aarch64::Operand(static_cast<int64_t>(val));
678
679
EmitMov(RWSCRATCH, static_cast<u32>(val));
680
return vixl::aarch64::Operand(RWSCRATCH);
681
}
682
683
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(u32 val)
684
{
685
return armCheckAddSubConstant(static_cast<s32>(val));
686
}
687
688
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckCompareConstant(s32 val)
689
{
690
if (Assembler::IsImmConditionalCompare(val))
691
return vixl::aarch64::Operand(static_cast<int64_t>(val));
692
693
EmitMov(RWSCRATCH, static_cast<u32>(val));
694
return vixl::aarch64::Operand(RWSCRATCH);
695
}
696
697
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckLogicalConstant(u32 val)
698
{
699
if (Assembler::IsImmLogical(val, 32))
700
return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));
701
702
EmitMov(RWSCRATCH, val);
703
return vixl::aarch64::Operand(RWSCRATCH);
704
}
705
706
void CPU::ARM64Recompiler::BeginBlock()
707
{
708
Recompiler::BeginBlock();
709
}
710
711
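// Verifies that the block's source RAM still matches the shadow copy taken at compile time:
// 16 bytes at a time with NEON compares, then 8/4-byte scalar compares for the tail. Any
// mismatch jumps to g_discard_and_recompile_block.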
void CPU::ARM64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
712
{
713
// Load the base pointers first to reduce code size; the loop below can then use immediate offsets.
714
armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
715
armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);
716
717
bool first = true;
718
u32 offset = 0;
719
Label block_changed;
720
721
while (size >= 16)
722
{
723
const VRegister vtmp = v2.V4S();
724
const VRegister dst = first ? v0.V4S() : v1.V4S();
725
armAsm->ldr(dst, MemOperand(RXARG1, offset));
726
armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
727
armAsm->cmeq(dst, dst, vtmp);
728
if (!first)
729
armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());
730
else
731
first = false;
732
733
offset += 16;
734
size -= 16;
735
}
736
737
if (!first)
738
{
739
// TODO: make sure this doesn't choke on ffffffff
740
armAsm->uminv(s0, v0.V4S());
741
armAsm->fcmp(s0, 0.0);
742
armAsm->b(&block_changed, eq);
743
}
744
745
while (size >= 8)
746
{
747
armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
748
armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
749
armAsm->cmp(RXARG3, RXSCRATCH);
750
armAsm->b(&block_changed, ne);
751
offset += 8;
752
size -= 8;
753
}
754
755
while (size >= 4)
756
{
757
armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
758
armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
759
armAsm->cmp(RWARG3, RWSCRATCH);
760
armAsm->b(&block_changed, ne);
761
offset += 4;
762
size -= 4;
763
}
764
765
DebugAssert(size == 0);
766
767
Label block_unchanged;
768
armAsm->b(&block_unchanged);
769
armAsm->bind(&block_changed);
770
armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
771
armAsm->bind(&block_unchanged);
772
}
773
774
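// Accounts for instruction fetch time. Uncached blocks simply add the fetch ticks; cached
// blocks compare and update the per-line tags in g_state.icache_tags, charging the line
// fill ticks for every tag that misses.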
void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
775
{
776
if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
777
{
778
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
779
{
780
armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());
781
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
782
armEmitMov(armAsm, RWARG3, m_block->size);
783
armAsm->mul(RWARG2, RWARG2, RWARG3);
784
armAsm->add(RWARG1, RWARG1, RWARG2);
785
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
786
}
787
else
788
{
789
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
790
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
791
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
792
}
793
}
794
else if (m_block->icache_line_count > 0)
795
{
796
const auto& ticks_reg = RWARG1;
797
const auto& current_tag_reg = RWARG2;
798
const auto& existing_tag_reg = RWARG3;
799
const auto& fill_ticks_reg = w4;
800
const auto& ticks_to_add_reg = w5;
801
802
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
803
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
804
if (fill_ticks <= 0)
805
return;
806
807
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
808
armEmitMov(armAsm, current_tag_reg, current_pc);
809
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
810
811
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
812
{
813
const u32 line = GetICacheLine(current_pc);
814
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
815
816
Label cache_hit;
817
armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
818
armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
819
armAsm->cmp(existing_tag_reg, current_tag_reg);
820
armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
821
armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);
822
823
if (i != (m_block->icache_line_count - 1))
824
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
825
}
826
827
armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
828
}
829
}
830
831
void CPU::ARM64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
832
s32 arg3reg /*= -1*/)
833
{
834
if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
835
armAsm->mov(RXARG1, XRegister(arg1reg));
836
if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
837
armAsm->mov(RXARG2, XRegister(arg2reg));
838
if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
839
armAsm->mov(RXARG3, XRegister(arg3reg));
840
EmitCall(func);
841
}
842
843
void CPU::ARM64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
844
{
845
if (newpc.has_value())
846
{
847
if (m_dirty_pc || m_compiler_pc != newpc)
848
{
849
EmitMov(RWSCRATCH, newpc.value());
850
armAsm->str(RWSCRATCH, PTR(&g_state.pc));
851
}
852
}
853
m_dirty_pc = false;
854
855
// flush regs
856
Flush(FLUSH_END_BLOCK);
857
EndAndLinkBlock(newpc, do_event_test, false);
858
}
859
860
void CPU::ARM64Recompiler::EndBlockWithException(Exception excode)
861
{
862
// flush regs, but not pc, it's going to get overwritten
863
// flush cycles because of the GTE instruction stuff...
864
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
865
866
// TODO: flush load delay
867
868
EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
869
inst->cop.cop_n));
870
EmitMov(RWARG2, m_current_instruction_pc);
871
if (excode != Exception::BP)
872
{
873
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
874
}
875
else
876
{
877
EmitMov(RWARG3, inst->bits);
878
EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
879
}
880
m_dirty_pc = false;
881
882
EndAndLinkBlock(std::nullopt, true, false);
883
}
884
885
void CPU::ARM64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
886
{
887
// event test
888
// pc should've been flushed
889
DebugAssert(!m_dirty_pc && !m_block_ended);
890
m_block_ended = true;
891
892
// TODO: try extracting this to a function
893
894
// save cycles for event test
895
const TickCount cycles = std::exchange(m_cycles, 0);
896
897
// pending_ticks += cycles
898
// if (pending_ticks >= downcount) { dispatch_event(); }
899
if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
900
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
901
if (do_event_test)
902
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
903
if (cycles > 0)
904
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
905
if (m_gte_done_cycle > cycles)
906
{
907
armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
908
armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
909
}
910
if (do_event_test)
911
armAsm->cmp(RWARG1, RWARG2);
912
if (cycles > 0)
913
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
914
if (do_event_test)
915
armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
916
917
// jump to dispatcher or next block
918
if (force_run_events)
919
{
920
armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
921
}
922
else if (!newpc.has_value())
923
{
924
armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
925
}
926
else
927
{
928
const void* target = (newpc.value() == m_block->pc) ?
929
CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
930
armAsm->GetBuffer()->GetStartAddress<const void*>()) :
931
CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
932
armEmitJmp(armAsm, target, true);
933
}
934
}
935
936
const void* CPU::ARM64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
937
{
938
#ifdef VIXL_DEBUG
939
m_emitter_check.reset();
940
m_far_emitter_check.reset();
941
#endif
942
943
m_emitter.FinalizeCode();
944
m_far_emitter.FinalizeCode();
945
946
u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
947
*code_size = static_cast<u32>(m_emitter.GetCursorOffset());
948
*far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
949
armAsm = nullptr;
950
return code;
951
}
952
953
const char* CPU::ARM64Recompiler::GetHostRegName(u32 reg) const
954
{
955
static constexpr std::array<const char*, 32> reg64_names = {
956
{"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
957
"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
958
return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
959
}
960
961
void CPU::ARM64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
962
{
963
EmitMov(WRegister(reg), val);
964
}
965
966
void CPU::ARM64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
967
{
968
armAsm->ldr(WRegister(reg), PTR(ptr));
969
}
970
971
void CPU::ARM64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
972
{
973
armAsm->str(WRegister(reg), PTR(ptr));
974
}
975
976
void CPU::ARM64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
977
{
978
if (val == 0)
979
{
980
armAsm->str(wzr, PTR(ptr));
981
return;
982
}
983
984
EmitMov(RWSCRATCH, val);
985
armAsm->str(RWSCRATCH, PTR(ptr));
986
}
987
988
void CPU::ARM64Recompiler::CopyHostReg(u32 dst, u32 src)
989
{
990
if (src != dst)
991
armAsm->mov(WRegister(dst), WRegister(src));
992
}
993
994
void CPU::ARM64Recompiler::AssertRegOrConstS(CompileFlags cf) const
995
{
996
DebugAssert(cf.valid_host_s || cf.const_s);
997
}
998
999
void CPU::ARM64Recompiler::AssertRegOrConstT(CompileFlags cf) const
1000
{
1001
DebugAssert(cf.valid_host_t || cf.const_t);
1002
}
1003
1004
vixl::aarch64::MemOperand CPU::ARM64Recompiler::MipsPtr(Reg r) const
1005
{
1006
DebugAssert(r < Reg::count);
1007
return PTR(&g_state.regs.r[static_cast<u32>(r)]);
1008
}
1009
1010
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegD(CompileFlags cf) const
1011
{
1012
DebugAssert(cf.valid_host_d);
1013
return WRegister(cf.host_d);
1014
}
1015
1016
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegS(CompileFlags cf) const
1017
{
1018
DebugAssert(cf.valid_host_s);
1019
return WRegister(cf.host_s);
1020
}
1021
1022
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegT(CompileFlags cf) const
1023
{
1024
DebugAssert(cf.valid_host_t);
1025
return WRegister(cf.host_t);
1026
}
1027
1028
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegLO(CompileFlags cf) const
1029
{
1030
DebugAssert(cf.valid_host_lo);
1031
return WRegister(cf.host_lo);
1032
}
1033
1034
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegHI(CompileFlags cf) const
1035
{
1036
DebugAssert(cf.valid_host_hi);
1037
return WRegister(cf.host_hi);
1038
}
1039
1040
void CPU::ARM64Recompiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
1041
{
1042
DebugAssert(dst.IsW());
1043
if (cf.valid_host_s)
1044
{
1045
if (cf.host_s != dst.GetCode())
1046
armAsm->mov(dst, WRegister(cf.host_s));
1047
}
1048
else if (cf.const_s)
1049
{
1050
const u32 cv = GetConstantRegU32(cf.MipsS());
1051
if (cv == 0)
1052
armAsm->mov(dst, wzr);
1053
else
1054
EmitMov(dst, cv);
1055
}
1056
else
1057
{
1058
WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
1059
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
1060
}
1061
}
1062
1063
void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
1064
{
1065
DebugAssert(dst.IsW());
1066
if (cf.valid_host_t)
1067
{
1068
if (cf.host_t != dst.GetCode())
1069
armAsm->mov(dst, WRegister(cf.host_t));
1070
}
1071
else if (cf.const_t)
1072
{
1073
const u32 cv = GetConstantRegU32(cf.MipsT());
1074
if (cv == 0)
1075
armAsm->mov(dst, wzr);
1076
else
1077
EmitMov(dst, cv);
1078
}
1079
else
1080
{
1081
WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
1082
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
1083
}
1084
}
1085
1086
void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg)
1087
{
1088
DebugAssert(reg < Reg::count && dst.IsW());
1089
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
1090
armAsm->mov(dst, WRegister(hreg.value()));
1091
else if (HasConstantReg(reg))
1092
EmitMov(dst, GetConstantRegU32(reg));
1093
else
1094
armAsm->ldr(dst, MipsPtr(reg));
1095
}
1096
1097
void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
1098
Reg arg3reg /* = Reg::count */)
1099
{
1100
DebugAssert(g_settings.gpu_pgxp_enable);
1101
1102
Flush(FLUSH_FOR_C_CALL);
1103
1104
if (arg2reg != Reg::count)
1105
MoveMIPSRegToReg(RWARG2, arg2reg);
1106
if (arg3reg != Reg::count)
1107
MoveMIPSRegToReg(RWARG3, arg3reg);
1108
1109
EmitMov(RWARG1, arg1val);
1110
EmitCall(func);
1111
}
1112
1113
void CPU::ARM64Recompiler::Flush(u32 flags)
1114
{
1115
Recompiler::Flush(flags);
1116
1117
if (flags & FLUSH_PC && m_dirty_pc)
1118
{
1119
StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
1120
m_dirty_pc = false;
1121
}
1122
1123
if (flags & FLUSH_INSTRUCTION_BITS)
1124
{
1125
// This sucks, but it's only used for fallbacks.
1126
EmitMov(RWARG1, inst->bits);
1127
EmitMov(RWARG2, m_current_instruction_pc);
1128
EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
1129
armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
1130
armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
1131
armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
1132
}
1133
1134
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
1135
{
1136
// This sucks :(
1137
// TODO: make it a function?
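// Applies the pending load delay held in CPU state: stores load_delay_value into
// regs.r[load_delay_reg] (addressed as an offset from RSTATE), then resets
// load_delay_reg to Reg::count to mark it consumed.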
1138
armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
1139
armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
1140
EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
1141
armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
1142
armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
1143
EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
1144
armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
1145
m_load_delay_dirty = false;
1146
}
1147
1148
if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
1149
{
1150
if (m_load_delay_value_register != NUM_HOST_REGS)
1151
FreeHostReg(m_load_delay_value_register);
1152
1153
EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
1154
armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
1155
m_load_delay_register = Reg::count;
1156
m_load_delay_dirty = true;
1157
}
1158
1159
if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
1160
{
1161
// May as well flush cycles while we're here.
1162
// GTE spanning blocks is very rare, we _could_ disable this for speed.
1163
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1164
armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
1165
if (m_cycles > 0)
1166
{
1167
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1168
m_cycles = 0;
1169
}
1170
armAsm->cmp(RWARG2, RWARG1);
1171
armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
1172
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1173
m_dirty_gte_done_cycle = false;
1174
}
1175
1176
if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
1177
{
1178
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1179
1180
// update cycles at the same time
1181
if (flags & FLUSH_CYCLES && m_cycles > 0)
1182
{
1183
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1184
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1185
m_gte_done_cycle -= m_cycles;
1186
m_cycles = 0;
1187
}
1188
1189
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
1190
armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
1191
m_gte_done_cycle = 0;
1192
m_dirty_gte_done_cycle = true;
1193
}
1194
1195
if (flags & FLUSH_CYCLES && m_cycles > 0)
1196
{
1197
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1198
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1199
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1200
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
1201
m_cycles = 0;
1202
}
1203
}
1204
1205
void CPU::ARM64Recompiler::Compile_Fallback()
1206
{
1207
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
1208
inst->bits);
1209
1210
Flush(FLUSH_FOR_INTERPRETER);
1211
1212
EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));
1213
1214
// TODO: make me less garbage
1215
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
1216
// but nothing should be going through here..
1217
Label no_load_delay;
1218
armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
1219
armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
1220
armAsm->b(&no_load_delay, eq);
1221
armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
1222
armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
1223
armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
1224
EmitMov(RWARG1, static_cast<u32>(Reg::count));
1225
armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
1226
armAsm->bind(&no_load_delay);
1227
1228
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
1229
}
1230
1231
void CPU::ARM64Recompiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)
1232
{
1233
DebugAssert(pcreg.IsW());
1234
if (!g_settings.cpu_recompiler_memory_exceptions)
1235
return;
1236
1237
armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
1238
SwitchToFarCode(true, ne);
1239
1240
BackupHostState();
1241
EndBlockWithException(Exception::AdEL);
1242
1243
RestoreHostState();
1244
SwitchToNearCode(false);
1245
}
1246
1247
void CPU::ARM64Recompiler::Compile_jr(CompileFlags cf)
1248
{
1249
const Register pcreg = CFGetRegS(cf);
1250
CheckBranchTarget(pcreg);
1251
1252
armAsm->str(pcreg, PTR(&g_state.pc));
1253
1254
CompileBranchDelaySlot(false);
1255
EndBlock(std::nullopt, true);
1256
}
1257
1258
void CPU::ARM64Recompiler::Compile_jalr(CompileFlags cf)
1259
{
1260
const Register pcreg = CFGetRegS(cf);
1261
if (MipsD() != Reg::zero)
1262
SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
1263
1264
CheckBranchTarget(pcreg);
1265
armAsm->str(pcreg, PTR(&g_state.pc));
1266
1267
CompileBranchDelaySlot(false);
1268
EndBlock(std::nullopt, true);
1269
}
1270
1271
void CPU::ARM64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
1272
{
1273
AssertRegOrConstS(cf);
1274
1275
const u32 taken_pc = GetConditionalBranchTarget(cf);
1276
1277
Flush(FLUSH_FOR_BRANCH);
1278
1279
DebugAssert(cf.valid_host_s);
1280
1281
// MipsT() here should equal zero for zero branches.
1282
DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
1283
1284
Label taken;
1285
const Register rs = CFGetRegS(cf);
1286
switch (cond)
1287
{
1288
case BranchCondition::Equal:
1289
case BranchCondition::NotEqual:
1290
{
1291
AssertRegOrConstT(cf);
1292
if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
1293
{
1294
(cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
1295
}
1296
else
1297
{
1298
if (cf.valid_host_t)
1299
armAsm->cmp(rs, CFGetRegT(cf));
1300
else if (cf.const_t)
1301
armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
1302
1303
armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
1304
}
1305
}
1306
break;
1307
1308
case BranchCondition::GreaterThanZero:
1309
{
1310
armAsm->cmp(rs, 0);
1311
armAsm->b(&taken, gt);
1312
}
1313
break;
1314
1315
case BranchCondition::GreaterEqualZero:
1316
{
1317
armAsm->cmp(rs, 0);
1318
armAsm->b(&taken, ge);
1319
}
1320
break;
1321
1322
case BranchCondition::LessThanZero:
1323
{
1324
armAsm->cmp(rs, 0);
1325
armAsm->b(&taken, lt);
1326
}
1327
break;
1328
1329
case BranchCondition::LessEqualZero:
1330
{
1331
armAsm->cmp(rs, 0);
1332
armAsm->b(&taken, le);
1333
}
1334
break;
1335
}
1336
1337
BackupHostState();
1338
if (!cf.delay_slot_swapped)
1339
CompileBranchDelaySlot();
1340
1341
EndBlock(m_compiler_pc, true);
1342
1343
armAsm->bind(&taken);
1344
1345
RestoreHostState();
1346
if (!cf.delay_slot_swapped)
1347
CompileBranchDelaySlot();
1348
1349
EndBlock(taken_pc, true);
1350
}
1351
1352
void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf, bool overflow)
1353
{
1354
const Register rs = CFGetRegS(cf);
1355
const Register rt = CFGetRegT(cf);
1356
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1357
{
1358
if (!overflow)
1359
{
1360
armAsm->add(rt, rs, armCheckAddSubConstant(imm));
1361
}
1362
else
1363
{
1364
armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
1365
TestOverflow(rt);
1366
}
1367
}
1368
else if (rt.GetCode() != rs.GetCode())
1369
{
1370
armAsm->mov(rt, rs);
1371
}
1372
}
1373
1374
void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf)
1375
{
1376
Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
1377
}
1378
1379
void CPU::ARM64Recompiler::Compile_addiu(CompileFlags cf)
1380
{
1381
Compile_addi(cf, false);
1382
}
1383
1384
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf)
1385
{
1386
Compile_slti(cf, true);
1387
}
1388
1389
void CPU::ARM64Recompiler::Compile_sltiu(CompileFlags cf)
1390
{
1391
Compile_slti(cf, false);
1392
}
1393
1394
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf, bool sign)
1395
{
1396
armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
1397
armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
1398
}
1399
1400
void CPU::ARM64Recompiler::Compile_andi(CompileFlags cf)
1401
{
1402
const Register rt = CFGetRegT(cf);
1403
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1404
armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
1405
else
1406
armAsm->mov(rt, wzr);
1407
}
1408
1409
void CPU::ARM64Recompiler::Compile_ori(CompileFlags cf)
1410
{
1411
const Register rt = CFGetRegT(cf);
1412
const Register rs = CFGetRegS(cf);
1413
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1414
armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
1415
else if (rt.GetCode() != rs.GetCode())
1416
armAsm->mov(rt, rs);
1417
}
1418
1419
void CPU::ARM64Recompiler::Compile_xori(CompileFlags cf)
1420
{
1421
const Register rt = CFGetRegT(cf);
1422
const Register rs = CFGetRegS(cf);
1423
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1424
armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
1425
else if (rt.GetCode() != rs.GetCode())
1426
armAsm->mov(rt, rs);
1427
}
1428
1429
void CPU::ARM64Recompiler::Compile_shift(CompileFlags cf,
1430
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
1431
const vixl::aarch64::Register&, unsigned))
1432
{
1433
const Register rd = CFGetRegD(cf);
1434
const Register rt = CFGetRegT(cf);
1435
if (inst->r.shamt > 0)
1436
(armAsm->*op)(rd, rt, inst->r.shamt);
1437
else if (rd.GetCode() != rt.GetCode())
1438
armAsm->mov(rd, rt);
1439
}
1440
1441
void CPU::ARM64Recompiler::Compile_sll(CompileFlags cf)
1442
{
1443
Compile_shift(cf, &Assembler::lsl);
1444
}
1445
1446
void CPU::ARM64Recompiler::Compile_srl(CompileFlags cf)
1447
{
1448
Compile_shift(cf, &Assembler::lsr);
1449
}
1450
1451
void CPU::ARM64Recompiler::Compile_sra(CompileFlags cf)
1452
{
1453
Compile_shift(cf, &Assembler::asr);
1454
}
1455
1456
void CPU::ARM64Recompiler::Compile_variable_shift(
1457
CompileFlags cf,
1458
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
1459
const vixl::aarch64::Register&),
1460
void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
1461
{
1462
const Register rd = CFGetRegD(cf);
1463
1464
AssertRegOrConstS(cf);
1465
AssertRegOrConstT(cf);
1466
1467
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1468
if (!cf.valid_host_t)
1469
MoveTToReg(rt, cf);
1470
1471
if (cf.const_s)
1472
{
1473
if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
1474
(armAsm->*op_const)(rd, rt, shift);
1475
else if (rd.GetCode() != rt.GetCode())
1476
armAsm->mov(rd, rt);
1477
}
1478
else
1479
{
1480
(armAsm->*op)(rd, rt, CFGetRegS(cf));
1481
}
1482
}
1483
1484
void CPU::ARM64Recompiler::Compile_sllv(CompileFlags cf)
1485
{
1486
Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
1487
}
1488
1489
void CPU::ARM64Recompiler::Compile_srlv(CompileFlags cf)
1490
{
1491
Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
1492
}
1493
1494
void CPU::ARM64Recompiler::Compile_srav(CompileFlags cf)
1495
{
1496
Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
1497
}
1498
1499
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf, bool sign)
1500
{
1501
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1502
if (!cf.valid_host_s)
1503
MoveSToReg(rs, cf);
1504
1505
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1506
if (!cf.valid_host_t)
1507
MoveTToReg(rt, cf);
1508
1509
// TODO: if lo/hi gets killed, we can use a 32-bit multiply
1510
const Register lo = CFGetRegLO(cf);
1511
const Register hi = CFGetRegHI(cf);
1512
1513
(sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
1514
armAsm->lsr(hi.X(), lo.X(), 32);
1515
}
1516
1517
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf)
1518
{
1519
Compile_mult(cf, true);
1520
}
1521
1522
void CPU::ARM64Recompiler::Compile_multu(CompileFlags cf)
1523
{
1524
Compile_mult(cf, false);
1525
}
1526
1527
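// Signed division with MIPS edge-case semantics: dividing by zero yields
// lo = (rs >= 0 ? -1 : 1) and hi = rs, while 0x80000000 / -1 yields lo = 0x80000000 and hi = 0.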
void CPU::ARM64Recompiler::Compile_div(CompileFlags cf)
1528
{
1529
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1530
if (!cf.valid_host_s)
1531
MoveSToReg(rs, cf);
1532
1533
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1534
if (!cf.valid_host_t)
1535
MoveTToReg(rt, cf);
1536
1537
const Register rlo = CFGetRegLO(cf);
1538
const Register rhi = CFGetRegHI(cf);
1539
1540
// TODO: This could be slightly more optimal
1541
Label done;
1542
Label not_divide_by_zero;
1543
armAsm->cbnz(rt, &not_divide_by_zero);
1544
armAsm->mov(rhi, rs); // hi = num
1545
EmitMov(rlo, 1);
1546
EmitMov(RWSCRATCH, static_cast<u32>(-1));
1547
armAsm->cmp(rs, 0);
1548
armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
1549
armAsm->b(&done);
1550
1551
armAsm->bind(&not_divide_by_zero);
1552
Label not_unrepresentable;
1553
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
1554
armAsm->b(&not_unrepresentable, ne);
1555
armAsm->cmp(rt, armCheckCompareConstant(-1));
1556
armAsm->b(&not_unrepresentable, ne);
1557
1558
EmitMov(rlo, 0x80000000u);
1559
EmitMov(rhi, 0);
1560
armAsm->b(&done);
1561
1562
armAsm->bind(&not_unrepresentable);
1563
1564
armAsm->sdiv(rlo, rs, rt);
1565
1566
// TODO: skip when hi is dead
1567
armAsm->msub(rhi, rlo, rt, rs);
1568
1569
armAsm->bind(&done);
1570
}
1571
1572
void CPU::ARM64Recompiler::Compile_divu(CompileFlags cf)
1573
{
1574
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1575
if (!cf.valid_host_s)
1576
MoveSToReg(rs, cf);
1577
1578
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1579
if (!cf.valid_host_t)
1580
MoveTToReg(rt, cf);
1581
1582
const Register rlo = CFGetRegLO(cf);
1583
const Register rhi = CFGetRegHI(cf);
1584
1585
Label done;
1586
Label not_divide_by_zero;
1587
armAsm->cbnz(rt, &not_divide_by_zero);
1588
EmitMov(rlo, static_cast<u32>(-1));
1589
armAsm->mov(rhi, rs);
1590
armAsm->b(&done);
1591
1592
armAsm->bind(&not_divide_by_zero);
1593
1594
armAsm->udiv(rlo, rs, rt);
1595
1596
// TODO: skip when hi is dead
1597
armAsm->msub(rhi, rlo, rt, rs);
1598
1599
armAsm->bind(&done);
1600
}
1601
1602
void CPU::ARM64Recompiler::TestOverflow(const vixl::aarch64::Register& result)
1603
{
1604
DebugAssert(result.IsW());
1605
SwitchToFarCode(true, vs);
1606
1607
BackupHostState();
1608
1609
// toss the result
1610
ClearHostReg(result.GetCode());
1611
1612
EndBlockWithException(Exception::Ov);
1613
1614
RestoreHostState();
1615
1616
SwitchToNearCode(false);
1617
}
1618
1619
void CPU::ARM64Recompiler::Compile_dst_op(CompileFlags cf,
1620
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
1621
const vixl::aarch64::Register&,
1622
const vixl::aarch64::Operand&),
1623
bool commutative, bool logical, bool overflow)
1624
{
1625
AssertRegOrConstS(cf);
1626
AssertRegOrConstT(cf);
1627
1628
const Register rd = CFGetRegD(cf);
1629
if (cf.valid_host_s && cf.valid_host_t)
1630
{
1631
(armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
1632
}
1633
else if (commutative && (cf.const_s || cf.const_t))
1634
{
1635
const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
1636
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1637
{
1638
(armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1639
}
1640
else
1641
{
1642
if (rd.GetCode() != src.GetCode())
1643
armAsm->mov(rd, src);
1644
overflow = false;
1645
}
1646
}
1647
else if (cf.const_s)
1648
{
1649
// TODO: Check where we can use wzr here
1650
EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
1651
(armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
1652
}
1653
else if (cf.const_t)
1654
{
1655
const Register rs = CFGetRegS(cf);
1656
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1657
{
1658
(armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1659
}
1660
else
1661
{
1662
if (rd.GetCode() != rs.GetCode())
1663
armAsm->mov(rd, rs);
1664
overflow = false;
1665
}
1666
}
1667
1668
if (overflow)
1669
TestOverflow(rd);
1670
}
1671
1672
void CPU::ARM64Recompiler::Compile_add(CompileFlags cf)
1673
{
1674
if (g_settings.cpu_recompiler_memory_exceptions)
1675
Compile_dst_op(cf, &Assembler::adds, true, false, true);
1676
else
1677
Compile_dst_op(cf, &Assembler::add, true, false, false);
1678
}
1679
1680
void CPU::ARM64Recompiler::Compile_addu(CompileFlags cf)
1681
{
1682
Compile_dst_op(cf, &Assembler::add, true, false, false);
1683
}
1684
1685
void CPU::ARM64Recompiler::Compile_sub(CompileFlags cf)
1686
{
1687
if (g_settings.cpu_recompiler_memory_exceptions)
1688
Compile_dst_op(cf, &Assembler::subs, false, false, true);
1689
else
1690
Compile_dst_op(cf, &Assembler::sub, false, false, false);
1691
}
1692
1693
void CPU::ARM64Recompiler::Compile_subu(CompileFlags cf)
1694
{
1695
Compile_dst_op(cf, &Assembler::sub, false, false, false);
1696
}
1697
1698
void CPU::ARM64Recompiler::Compile_and(CompileFlags cf)
1699
{
1700
AssertRegOrConstS(cf);
1701
AssertRegOrConstT(cf);
1702
1703
// special cases - and with self -> self, and with 0 -> 0
1704
const Register regd = CFGetRegD(cf);
1705
if (cf.MipsS() == cf.MipsT())
1706
{
1707
armAsm->mov(regd, CFGetRegS(cf));
1708
return;
1709
}
1710
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1711
{
1712
armAsm->mov(regd, wzr);
1713
return;
1714
}
1715
1716
Compile_dst_op(cf, &Assembler::and_, true, true, false);
1717
}
1718
1719
void CPU::ARM64Recompiler::Compile_or(CompileFlags cf)
1720
{
1721
AssertRegOrConstS(cf);
1722
AssertRegOrConstT(cf);
1723
1724
// or/nor with 0 -> no effect
1725
const Register regd = CFGetRegD(cf);
1726
if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
1727
{
1728
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1729
return;
1730
}
1731
1732
Compile_dst_op(cf, &Assembler::orr, true, true, false);
1733
}
1734
1735
void CPU::ARM64Recompiler::Compile_xor(CompileFlags cf)
1736
{
1737
AssertRegOrConstS(cf);
1738
AssertRegOrConstT(cf);
1739
1740
const Register regd = CFGetRegD(cf);
1741
if (cf.MipsS() == cf.MipsT())
1742
{
1743
// xor with self -> zero
1744
armAsm->mov(regd, wzr);
1745
return;
1746
}
1747
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1748
{
1749
// xor with zero -> no effect
1750
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1751
return;
1752
}
1753
1754
Compile_dst_op(cf, &Assembler::eor, true, true, false);
1755
}
1756
1757
void CPU::ARM64Recompiler::Compile_nor(CompileFlags cf)
1758
{
1759
Compile_or(cf);
1760
armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
1761
}
1762
1763
void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf)
1764
{
1765
Compile_slt(cf, true);
1766
}
1767
1768
void CPU::ARM64Recompiler::Compile_sltu(CompileFlags cf)
1769
{
1770
Compile_slt(cf, false);
1771
}
1772
1773
void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf, bool sign)
1774
{
1775
AssertRegOrConstS(cf);
1776
AssertRegOrConstT(cf);
1777
1778
// TODO: swap and reverse op for constants
1779
if (cf.const_s)
1780
{
1781
EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
1782
armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
1783
}
1784
else if (cf.const_t)
1785
{
1786
armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
1787
}
1788
else
1789
{
1790
armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
1791
}
1792
1793
armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
1794
}
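// A minimal reference sketch of the MIPS SLT/SLTU semantics the cmp/cset pair above
// reproduces: a signed (SLT) or unsigned (SLTU) compare whose boolean result lands in the
// destination register. Illustrative only; the helper name is hypothetical and the integer
// aliases are the project's own.
[[maybe_unused]] static u32 ExampleSltResult(u32 rs, u32 rt, bool sign)
{
  return sign ? static_cast<u32>(static_cast<s32>(rs) < static_cast<s32>(rt)) : static_cast<u32>(rs < rt);
}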
1795
1796
vixl::aarch64::Register
1797
CPU::ARM64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
1798
const std::optional<const vixl::aarch64::Register>& reg)
1799
{
1800
const u32 imm = inst->i.imm_sext32();
1801
if (cf.valid_host_s && imm == 0 && !reg.has_value())
1802
return CFGetRegS(cf);
1803
1804
const Register dst = reg.has_value() ? reg.value() : RWARG1;
1805
if (address.has_value())
1806
{
1807
EmitMov(dst, address.value());
1808
}
1809
else if (imm == 0)
1810
{
1811
if (cf.valid_host_s)
1812
{
1813
if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
1814
armAsm->mov(dst, CFGetRegS(cf));
1815
}
1816
else
1817
{
1818
armAsm->ldr(dst, MipsPtr(cf.MipsS()));
1819
}
1820
}
1821
else
1822
{
1823
if (cf.valid_host_s)
1824
{
1825
armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
1826
}
1827
else
1828
{
1829
armAsm->ldr(dst, MipsPtr(cf.MipsS()));
1830
armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
1831
}
1832
}
1833
1834
return dst;
1835
}
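// A minimal sketch of the effective address the function above materializes: the base
// register plus the sign-extended 16-bit immediate from the instruction. Illustrative only;
// the helper name is hypothetical.
[[maybe_unused]] static u32 ExampleLoadStoreAddress(u32 rs, u16 imm)
{
  return rs + static_cast<u32>(static_cast<s32>(static_cast<s16>(imm)));
}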
1836
1837
template<typename RegAllocFn>
1838
vixl::aarch64::Register CPU::ARM64Recompiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,
1839
MemoryAccessSize size, bool sign, bool use_fastmem,
1840
const RegAllocFn& dst_reg_alloc)
1841
{
1842
DebugAssert(addr_reg.IsW());
1843
if (use_fastmem)
1844
{
1845
m_cycles += Bus::RAM_READ_TICKS;
1846
1847
const Register dst = dst_reg_alloc();
1848
1849
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
1850
{
1851
DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
1852
armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
1853
armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
1854
}
1855
1856
const MemOperand mem =
1857
MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
1858
u8* start = armAsm->GetCursorAddress<u8*>();
1859
switch (size)
1860
{
1861
case MemoryAccessSize::Byte:
1862
sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
1863
break;
1864
1865
case MemoryAccessSize::HalfWord:
1866
sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
1867
break;
1868
1869
case MemoryAccessSize::Word:
1870
armAsm->ldr(dst, mem);
1871
break;
1872
}
1873
1874
AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
1875
return dst;
1876
}
1877
1878
if (addr_reg.GetCode() != RWARG1.GetCode())
1879
armAsm->mov(RWARG1, addr_reg);
1880
1881
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1882
switch (size)
1883
{
1884
case MemoryAccessSize::Byte:
1885
{
1886
EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryByte) :
1887
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte));
1888
}
1889
break;
1890
case MemoryAccessSize::HalfWord:
1891
{
1892
EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryHalfWord) :
1893
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord));
1894
}
1895
break;
1896
case MemoryAccessSize::Word:
1897
{
1898
EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryWord) :
1899
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord));
1900
}
1901
break;
1902
}
1903
1904
// TODO: turn this into an asm function instead
1905
if (checked)
1906
{
1907
SwitchToFarCodeIfBitSet(RXRET, 63);
1908
BackupHostState();
1909
1910
// Need to stash this in a temp because of the flush.
1911
const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
1912
armAsm->neg(temp.X(), RXRET);
1913
armAsm->lsl(temp, temp, 2);
1914
1915
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
1916
1917
// cause_bits = (-result << 2) | BD | cop_n
1918
armAsm->orr(RWARG1, temp,
1919
armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
1920
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
1921
EmitMov(RWARG2, m_current_instruction_pc);
1922
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
1923
FreeHostReg(temp.GetCode());
1924
EndBlock(std::nullopt, true);
1925
1926
RestoreHostState();
1927
SwitchToNearCode(false);
1928
}
1929
1930
const Register dst_reg = dst_reg_alloc();
1931
switch (size)
1932
{
1933
case MemoryAccessSize::Byte:
1934
{
1935
sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
1936
}
1937
break;
1938
case MemoryAccessSize::HalfWord:
1939
{
1940
sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
1941
}
1942
break;
1943
case MemoryAccessSize::Word:
1944
{
1945
if (dst_reg.GetCode() != RWRET.GetCode())
1946
armAsm->mov(dst_reg, RWRET);
1947
}
1948
break;
1949
}
1950
1951
return dst_reg;
1952
}
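// A minimal sketch of the CAUSE packing performed in the checked slow path above, following
// its "cause_bits = (-result << 2) | BD | cop_n" comment: the thunk signals failure with a
// negative result whose magnitude is the exception code, which is shifted into the Excode
// field and merged with the precomputed BD/cop_n bits. Illustrative only; the helper name is
// hypothetical.
[[maybe_unused]] static u32 ExampleLoadFaultCauseBits(s64 thunk_result, u32 bd_and_copn_bits)
{
  return (static_cast<u32>(-thunk_result) << 2) | bd_and_copn_bits;
}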
1953
1954
void CPU::ARM64Recompiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
1955
const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
1956
bool use_fastmem)
1957
{
1958
DebugAssert(addr_reg.IsW() && value_reg.IsW());
1959
if (use_fastmem)
1960
{
1961
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
1962
{
1963
DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
1964
armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
1965
armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
1966
}
1967
1968
const MemOperand mem =
1969
MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
1970
u8* start = armAsm->GetCursorAddress<u8*>();
1971
switch (size)
1972
{
1973
case MemoryAccessSize::Byte:
1974
armAsm->strb(value_reg, mem);
1975
break;
1976
1977
case MemoryAccessSize::HalfWord:
1978
armAsm->strh(value_reg, mem);
1979
break;
1980
1981
case MemoryAccessSize::Word:
1982
armAsm->str(value_reg, mem);
1983
break;
1984
}
1985
AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
1986
return;
1987
}
1988
1989
if (addr_reg.GetCode() != RWARG1.GetCode())
1990
armAsm->mov(RWARG1, addr_reg);
1991
if (value_reg.GetCode() != RWARG2.GetCode())
1992
armAsm->mov(RWARG2, value_reg);
1993
1994
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1995
switch (size)
1996
{
1997
case MemoryAccessSize::Byte:
1998
{
1999
EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryByte) :
2000
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte));
2001
}
2002
break;
2003
case MemoryAccessSize::HalfWord:
2004
{
2005
EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryHalfWord) :
2006
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord));
2007
}
2008
break;
2009
case MemoryAccessSize::Word:
2010
{
2011
EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryWord) :
2012
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord));
2013
}
2014
break;
2015
}
2016
2017
// TODO: turn this into an asm function instead
2018
if (checked)
2019
{
2020
SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
2021
BackupHostState();
2022
2023
// Need to stash this in a temp because of the flush.
2024
const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
2025
armAsm->lsl(temp, RWRET, 2);
2026
2027
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
2028
2029
// cause_bits = (result << 2) | BD | cop_n
2030
armAsm->orr(RWARG1, temp,
2031
armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
2032
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
2033
EmitMov(RWARG2, m_current_instruction_pc);
2034
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
2035
FreeHostReg(temp.GetCode());
2036
EndBlock(std::nullopt, true);
2037
2038
RestoreHostState();
2039
SwitchToNearCode(false);
2040
}
2041
}
2042
2043
void CPU::ARM64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2044
const std::optional<VirtualMemoryAddress>& address)
2045
{
2046
const std::optional<WRegister> addr_reg =
2047
g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
2048
std::optional<WRegister>();
2049
FlushForLoadStore(address, false, use_fastmem);
2050
const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
2051
const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {
2052
if (cf.MipsT() == Reg::zero)
2053
return RWRET;
2054
2055
return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
2056
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
2057
cf.MipsT()));
2058
});
2059
2060
if (g_settings.gpu_pgxp_enable)
2061
{
2062
Flush(FLUSH_FOR_C_CALL);
2063
2064
EmitMov(RWARG1, inst->bits);
2065
armAsm->mov(RWARG2, addr);
2066
armAsm->mov(RWARG3, data);
2067
EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
2068
FreeHostReg(addr_reg.value().GetCode());
2069
}
2070
}
2071
2072
void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2073
const std::optional<VirtualMemoryAddress>& address)
2074
{
2075
DebugAssert(size == MemoryAccessSize::Word && !sign);
2076
2077
const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
2078
FlushForLoadStore(address, false, use_fastmem);
2079
2080
// TODO: if address is constant, this can be simplified..
2081
2082
// If we're coming from another block, just flush the load delay and hope for the best..
2083
if (m_load_delay_dirty)
2084
UpdateLoadDelay();
2085
2086
// We'd need to be careful here if we weren't overwriting it..
2087
ComputeLoadStoreAddressArg(cf, address, addr);
2088
armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
2089
GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
2090
2091
if (inst->r.rt == Reg::zero)
2092
{
2093
FreeHostReg(addr.GetCode());
2094
return;
2095
}
2096
2097
// lwl/lwr from a load-delayed value takes the new value, but is itself load delayed, so the original value is
2098
// never written back. NOTE: can't trust T in cf because of the flush
2099
const Reg rt = inst->r.rt;
2100
Register value;
2101
if (m_load_delay_register == rt)
2102
{
2103
const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
2104
AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
2105
m_load_delay_value_register;
2106
RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
2107
value = WRegister(existing_ld_rt);
2108
}
2109
else
2110
{
2111
if constexpr (EMULATE_LOAD_DELAYS)
2112
{
2113
value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
2114
if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
2115
armAsm->mov(value, WRegister(rtreg.value()));
2116
else if (HasConstantReg(rt))
2117
EmitMov(value, GetConstantRegU32(rt));
2118
else
2119
armAsm->ldr(value, MipsPtr(rt));
2120
}
2121
else
2122
{
2123
value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
2124
}
2125
}
2126
2127
DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
2128
armAsm->and_(RWARG2, addr, 3);
2129
armAsm->lsl(RWARG2, RWARG2, 3); // *8
2130
EmitMov(RWARG3, 24);
2131
armAsm->sub(RWARG3, RWARG3, RWARG2);
2132
2133
if (inst->op == InstructionOp::lwl)
2134
{
2135
// const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
2136
// new_value = (value & mask) | (RWRET << (24 - shift));
2137
EmitMov(RWSCRATCH, 0xFFFFFFu);
2138
armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
2139
armAsm->and_(value, value, RWSCRATCH);
2140
armAsm->lslv(RWRET, RWRET, RWARG3);
2141
armAsm->orr(value, value, RWRET);
2142
}
2143
else
2144
{
2145
// const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
2146
// new_value = (value & mask) | (RWRET >> shift);
2147
armAsm->lsrv(RWRET, RWRET, RWARG2);
2148
EmitMov(RWSCRATCH, 0xFFFFFF00u);
2149
armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
2150
armAsm->and_(value, value, RWSCRATCH);
2151
armAsm->orr(value, value, RWRET);
2152
}
2153
2154
FreeHostReg(addr.GetCode());
2155
2156
if (g_settings.gpu_pgxp_enable)
2157
{
2158
Flush(FLUSH_FOR_C_CALL);
2159
armAsm->mov(RWARG3, value);
2160
armAsm->and_(RWARG2, addr, armCheckLogicalConstant(~0x3u));
2161
EmitMov(RWARG1, inst->bits);
2162
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
2163
}
2164
}
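// A minimal sketch of the LWL/LWR merge the block above emits, following its mask comments:
// shift is (addr & 3) * 8, mem is the aligned word that was loaded, and reg is the current
// (possibly load-delayed) register value. Illustrative only; the helper name is hypothetical.
[[maybe_unused]] static u32 ExampleLwlLwrMerge(bool is_lwl, u32 addr, u32 mem, u32 reg)
{
  const u32 shift = (addr & 3u) * 8u;
  if (is_lwl)
    return (reg & (UINT32_C(0x00FFFFFF) >> shift)) | (mem << (24 - shift));
  else
    return (reg & (UINT32_C(0xFFFFFF00) << (24 - shift))) | (mem >> shift);
}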
2165
2166
void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2167
const std::optional<VirtualMemoryAddress>& address)
2168
{
2169
const u32 index = static_cast<u32>(inst->r.rt.GetValue());
2170
const auto [ptr, action] = GetGTERegisterPointer(index, true);
2171
const std::optional<WRegister> addr_reg =
2172
g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
2173
std::optional<WRegister>();
2174
FlushForLoadStore(address, false, use_fastmem);
2175
const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
2176
const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
2177
return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
2178
WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
2179
RWRET;
2180
});
2181
2182
switch (action)
2183
{
2184
case GTERegisterAccessAction::Ignore:
2185
{
2186
break;
2187
}
2188
2189
case GTERegisterAccessAction::Direct:
2190
{
2191
armAsm->str(value, PTR(ptr));
2192
break;
2193
}
2194
2195
case GTERegisterAccessAction::SignExtend16:
2196
{
2197
armAsm->sxth(RWARG3, value);
2198
armAsm->str(RWARG3, PTR(ptr));
2199
break;
2200
}
2201
2202
case GTERegisterAccessAction::ZeroExtend16:
2203
{
2204
armAsm->uxth(RWARG3, value);
2205
armAsm->str(RWARG3, PTR(ptr));
2206
break;
2207
}
2208
2209
case GTERegisterAccessAction::CallHandler:
2210
{
2211
Flush(FLUSH_FOR_C_CALL);
2212
armAsm->mov(RWARG2, value);
2213
EmitMov(RWARG1, index);
2214
EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
2215
break;
2216
}
2217
2218
case GTERegisterAccessAction::PushFIFO:
2219
{
2220
// SXY0 <- SXY1
2221
// SXY1 <- SXY2
2222
// SXY2 <- SXYP
2223
DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
2224
armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
2225
armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
2226
armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
2227
armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
2228
armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
2229
break;
2230
}
2231
2232
default:
2233
{
2234
Panic("Unknown action");
2235
return;
2236
}
2237
}
2238
2239
if (g_settings.gpu_pgxp_enable)
2240
{
2241
Flush(FLUSH_FOR_C_CALL);
2242
armAsm->mov(RWARG3, value);
2243
if (value.GetCode() != RWRET.GetCode())
2244
FreeHostReg(value.GetCode());
2245
armAsm->mov(RWARG2, addr);
2246
FreeHostReg(addr_reg.value().GetCode());
2247
EmitMov(RWARG1, inst->bits);
2248
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
2249
}
2250
}
2251
2252
void CPU::ARM64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2253
const std::optional<VirtualMemoryAddress>& address)
2254
{
2255
AssertRegOrConstS(cf);
2256
AssertRegOrConstT(cf);
2257
2258
const std::optional<WRegister> addr_reg =
2259
g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
2260
std::optional<WRegister>();
2261
FlushForLoadStore(address, true, use_fastmem);
2262
const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
2263
const Register data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
2264
if (!cf.valid_host_t)
2265
MoveTToReg(RWARG2, cf);
2266
2267
GenerateStore(addr, data, size, use_fastmem);
2268
2269
if (g_settings.gpu_pgxp_enable)
2270
{
2271
Flush(FLUSH_FOR_C_CALL);
2272
MoveMIPSRegToReg(RWARG3, cf.MipsT());
2273
armAsm->mov(RWARG2, addr);
2274
EmitMov(RWARG1, inst->bits);
2275
EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
2276
FreeHostReg(addr_reg.value().GetCode());
2277
}
2278
}
2279
2280
void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2281
const std::optional<VirtualMemoryAddress>& address)
2282
{
2283
DebugAssert(size == MemoryAccessSize::Word && !sign);
2284
2285
// TODO: this can take over rt's value if it's no longer needed
2286
// NOTE: can't trust T in cf because of the alloc
2287
const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
2288
const Register value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
2289
if (g_settings.gpu_pgxp_enable)
2290
MoveMIPSRegToReg(value, inst->r.rt);
2291
2292
FlushForLoadStore(address, true, use_fastmem);
2293
2294
// TODO: if address is constant, this can be simplified..
2295
// We'd need to be careful here if we weren't overwriting it..
2296
ComputeLoadStoreAddressArg(cf, address, addr);
2297
armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
2298
GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
2299
2300
armAsm->and_(RWSCRATCH, addr, 3);
2301
armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
2302
armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));
2303
2304
// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
2305
if (!g_settings.gpu_pgxp_enable)
2306
MoveMIPSRegToReg(value, inst->r.rt);
2307
2308
if (inst->op == InstructionOp::swl)
2309
{
2310
// const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
2311
// new_value = (RWRET & mem_mask) | (value >> (24 - shift));
2312
EmitMov(RWARG3, 0xFFFFFF00u);
2313
armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
2314
armAsm->and_(RWRET, RWRET, RWARG3);
2315
2316
EmitMov(RWARG3, 24);
2317
armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
2318
armAsm->lsrv(value, value, RWARG3);
2319
armAsm->orr(value, value, RWRET);
2320
}
2321
else
2322
{
2323
// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
2324
// new_value = (RWRET & mem_mask) | (value << shift);
2325
armAsm->lslv(value, value, RWSCRATCH);
2326
2327
EmitMov(RWARG3, 24);
2328
armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
2329
EmitMov(RWSCRATCH, 0x00FFFFFFu);
2330
armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
2331
armAsm->and_(RWRET, RWRET, RWSCRATCH);
2332
armAsm->orr(value, value, RWRET);
2333
}
2334
2335
if (!g_settings.gpu_pgxp_enable)
2336
{
2337
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
2338
FreeHostReg(addr.GetCode());
2339
}
2340
else
2341
{
2342
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
2343
2344
Flush(FLUSH_FOR_C_CALL);
2345
armAsm->mov(RWARG3, value);
2346
FreeHostReg(value.GetCode());
2347
armAsm->mov(RWARG2, addr);
2348
FreeHostReg(addr.GetCode());
2349
EmitMov(RWARG1, inst->bits);
2350
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
2351
}
2352
}
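// A minimal sketch of the SWL/SWR merge the block above emits, following its mask comments:
// shift is (addr & 3) * 8, mem is the aligned word read back from memory, and reg is the
// register being stored. Illustrative only; the helper name is hypothetical.
[[maybe_unused]] static u32 ExampleSwlSwrMerge(bool is_swl, u32 addr, u32 mem, u32 reg)
{
  const u32 shift = (addr & 3u) * 8u;
  if (is_swl)
    return (mem & (UINT32_C(0xFFFFFF00) << shift)) | (reg >> (24 - shift));
  else
    return (mem & (UINT32_C(0x00FFFFFF) >> (24 - shift))) | (reg << shift);
}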
2353
2354
void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2355
const std::optional<VirtualMemoryAddress>& address)
2356
{
2357
const u32 index = static_cast<u32>(inst->r.rt.GetValue());
2358
const auto [ptr, action] = GetGTERegisterPointer(index, false);
2359
const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
2360
WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
2361
RWARG1;
2362
const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
2363
FlushForLoadStore(address, true, use_fastmem);
2364
ComputeLoadStoreAddressArg(cf, address, addr);
2365
2366
switch (action)
2367
{
2368
case GTERegisterAccessAction::Direct:
2369
{
2370
armAsm->ldr(data, PTR(ptr));
2371
}
2372
break;
2373
2374
case GTERegisterAccessAction::CallHandler:
2375
{
2376
// should already be flushed.. except in fastmem case
2377
Flush(FLUSH_FOR_C_CALL);
2378
EmitMov(RWARG1, index);
2379
EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
2380
armAsm->mov(data, RWRET);
2381
}
2382
break;
2383
2384
default:
2385
{
2386
Panic("Unknown action");
2387
}
2388
break;
2389
}
2390
2391
GenerateStore(addr, data, size, use_fastmem);
2392
if (!g_settings.gpu_pgxp_enable)
2393
{
2394
if (addr.GetCode() != RWARG1.GetCode())
2395
FreeHostReg(addr.GetCode());
2396
}
2397
else
2398
{
2399
// TODO: This can be simplified because we don't need to validate in PGXP..
2400
Flush(FLUSH_FOR_C_CALL);
2401
armAsm->mov(RWARG3, data);
2402
FreeHostReg(data.GetCode());
2403
armAsm->mov(RWARG2, addr);
2404
FreeHostReg(addr.GetCode());
2405
EmitMov(RWARG1, inst->bits);
2406
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
2407
}
2408
}
2409
2410
void CPU::ARM64Recompiler::Compile_mtc0(CompileFlags cf)
2411
{
2412
// TODO: we need better constant setting here.. which will need backprop
2413
AssertRegOrConstT(cf);
2414
2415
const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
2416
const u32* ptr = GetCop0RegPtr(reg);
2417
const u32 mask = GetCop0RegWriteMask(reg);
2418
if (!ptr)
2419
{
2420
Compile_Fallback();
2421
return;
2422
}
2423
2424
if (mask == 0)
2425
{
2426
// if it's a read-only register, ignore
2427
DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
2428
return;
2429
}
2430
2431
// for some registers, we need to test certain bits
2432
const bool needs_bit_test = (reg == Cop0Reg::SR);
2433
const Register new_value = RWARG1;
2434
const Register old_value = RWARG2;
2435
const Register changed_bits = RWARG3;
2436
const Register mask_reg = RWSCRATCH;
2437
2438
// Load old value
2439
armAsm->ldr(old_value, PTR(ptr));
2440
2441
// No way we fit this in an immediate..
2442
EmitMov(mask_reg, mask);
2443
2444
// update value
2445
if (cf.valid_host_t)
2446
armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
2447
else
2448
EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);
2449
2450
if (needs_bit_test)
2451
armAsm->eor(changed_bits, old_value, new_value);
2452
armAsm->bic(old_value, old_value, mask_reg);
2453
armAsm->orr(new_value, old_value, new_value);
2454
armAsm->str(new_value, PTR(ptr));
2455
2456
if (reg == Cop0Reg::SR)
2457
{
2458
// TODO: replace with register backup
2459
// We could just inline the whole thing..
2460
Flush(FLUSH_FOR_C_CALL);
2461
2462
Label caches_unchanged;
2463
armAsm->tbz(changed_bits, 16, &caches_unchanged);
2464
EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
2465
armAsm->ldr(RWARG1, PTR(ptr)); // reload value for interrupt test below
2466
if (CodeCache::IsUsingFastmem())
2467
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
2468
armAsm->bind(&caches_unchanged);
2469
2470
TestInterrupts(RWARG1);
2471
}
2472
else if (reg == Cop0Reg::CAUSE)
2473
{
2474
armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
2475
TestInterrupts(RWARG1);
2476
}
2477
else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
2478
{
2479
// need to check whether we're switching to debug mode
2480
Flush(FLUSH_FOR_C_CALL);
2481
EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
2482
SwitchToFarCodeIfRegZeroOrNonZero(RWRET, true);
2483
BackupHostState();
2484
Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
2485
EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
2486
RestoreHostState();
2487
SwitchToNearCode(false);
2488
}
2489
}
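// A minimal sketch of the masked COP0 update emitted above: only the bits covered by the
// register's write mask take the new value, everything else keeps the old value, and the
// changed-bits test is the XOR of the old value and the masked new value. Illustrative only;
// the helper name is hypothetical.
[[maybe_unused]] static u32 ExampleMaskedCop0Write(u32 old_value, u32 t_value, u32 write_mask)
{
  const u32 masked_new = t_value & write_mask;
  return (old_value & ~write_mask) | masked_new;
}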
2490
2491
void CPU::ARM64Recompiler::Compile_rfe(CompileFlags cf)
2492
{
2493
// shift mode bits right two, preserving upper bits
2494
armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
2495
armAsm->bfxil(RWARG1, RWARG1, 2, 4);
2496
armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
2497
2498
TestInterrupts(RWARG1);
2499
}
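// A minimal sketch of what the single BFXIL above computes for RFE: SR bits [5:2] (the
// previous/old mode pairs) are copied down into bits [3:0] while everything above bit 3 is
// left untouched. Illustrative only; the helper name is hypothetical.
[[maybe_unused]] static u32 ExampleRfeStatus(u32 sr)
{
  return (sr & ~0xFu) | ((sr >> 2) & 0xFu);
}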
2500
2501
void CPU::ARM64Recompiler::TestInterrupts(const vixl::aarch64::Register& sr)
2502
{
2503
DebugAssert(sr.IsW());
2504
2505
// if Iec == 0 then goto no_interrupt
2506
Label no_interrupt;
2507
armAsm->tbz(sr, 0, &no_interrupt);
2508
2509
// sr & cause
2510
armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
2511
armAsm->and_(sr, sr, RWSCRATCH);
2512
2513
// if ((sr & cause) & 0xff00) == 0 then goto no_interrupt
2514
armAsm->tst(sr, 0xFF00);
2515
2516
SwitchToFarCode(true, ne);
2517
BackupHostState();
2518
2519
// Update the load delay; this normally happens at the end of an instruction, but we're finishing it early.
2520
UpdateLoadDelay();
2521
2522
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
2523
2524
// Can't use EndBlockWithException() here, because it'll use the wrong PC.
2525
// Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
2526
if (!iinfo->is_last_instruction)
2527
{
2528
EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
2529
(inst + 1)->cop.cop_n));
2530
EmitMov(RWARG2, m_compiler_pc);
2531
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
2532
m_dirty_pc = false;
2533
EndAndLinkBlock(std::nullopt, true, false);
2534
}
2535
else
2536
{
2537
if (m_dirty_pc)
2538
EmitMov(RWARG1, m_compiler_pc);
2539
armAsm->str(wzr, PTR(&g_state.downcount));
2540
if (m_dirty_pc)
2541
armAsm->str(RWARG1, PTR(&g_state.pc));
2542
m_dirty_pc = false;
2543
EndAndLinkBlock(std::nullopt, false, true);
2544
}
2545
2546
RestoreHostState();
2547
SwitchToNearCode(false);
2548
2549
armAsm->bind(&no_interrupt);
2550
}
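// A minimal sketch of the interrupt condition tested above: an interrupt is dispatched only
// when SR.IEc (bit 0) is set and at least one interrupt bit in 0xFF00 is set in both SR and
// CAUSE. Illustrative only; the helper name is hypothetical.
[[maybe_unused]] static bool ExampleInterruptPending(u32 sr, u32 cause)
{
  return (sr & 1u) != 0u && (sr & cause & 0xFF00u) != 0u;
}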
2551
2552
void CPU::ARM64Recompiler::Compile_mfc2(CompileFlags cf)
2553
{
2554
const u32 index = inst->cop.Cop2Index();
2555
const Reg rt = inst->r.rt;
2556
2557
const auto [ptr, action] = GetGTERegisterPointer(index, false);
2558
if (action == GTERegisterAccessAction::Ignore)
2559
return;
2560
2561
u32 hreg;
2562
if (action == GTERegisterAccessAction::Direct)
2563
{
2564
hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
2565
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
2566
armAsm->ldr(WRegister(hreg), PTR(ptr));
2567
}
2568
else if (action == GTERegisterAccessAction::CallHandler)
2569
{
2570
Flush(FLUSH_FOR_C_CALL);
2571
EmitMov(RWARG1, index);
2572
EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
2573
2574
hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
2575
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
2576
armAsm->mov(WRegister(hreg), RWRET);
2577
}
2578
else
2579
{
2580
Panic("Unknown action");
2581
return;
2582
}
2583
2584
if (g_settings.gpu_pgxp_enable)
2585
{
2586
Flush(FLUSH_FOR_C_CALL);
2587
EmitMov(RWARG1, inst->bits);
2588
armAsm->mov(RWARG2, WRegister(hreg));
2589
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
2590
}
2591
}
2592
2593
void CPU::ARM64Recompiler::Compile_mtc2(CompileFlags cf)
2594
{
2595
const u32 index = inst->cop.Cop2Index();
2596
const auto [ptr, action] = GetGTERegisterPointer(index, true);
2597
if (action == GTERegisterAccessAction::Ignore)
2598
return;
2599
2600
if (action == GTERegisterAccessAction::Direct)
2601
{
2602
if (cf.const_t)
2603
StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
2604
else
2605
armAsm->str(CFGetRegT(cf), PTR(ptr));
2606
}
2607
else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
2608
{
2609
const bool sign = (action == GTERegisterAccessAction::SignExtend16);
2610
if (cf.valid_host_t)
2611
{
2612
sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
2613
armAsm->str(RWARG1, PTR(ptr));
2614
}
2615
else if (cf.const_t)
2616
{
2617
const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
2618
StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
2619
}
2620
else
2621
{
2622
Panic("Unsupported setup");
2623
}
2624
}
2625
else if (action == GTERegisterAccessAction::CallHandler)
2626
{
2627
Flush(FLUSH_FOR_C_CALL);
2628
EmitMov(RWARG1, index);
2629
MoveTToReg(RWARG2, cf);
2630
EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
2631
}
2632
else if (action == GTERegisterAccessAction::PushFIFO)
2633
{
2634
// SXY0 <- SXY1
2635
// SXY1 <- SXY2
2636
// SXY2 <- SXYP
2637
DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
2638
armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
2639
armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
2640
armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
2641
armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
2642
if (cf.valid_host_t)
2643
armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
2644
else if (cf.const_t)
2645
StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
2646
else
2647
Panic("Unsupported setup");
2648
}
2649
else
2650
{
2651
Panic("Unknown action");
2652
}
2653
}
2654
2655
void CPU::ARM64Recompiler::Compile_cop2(CompileFlags cf)
2656
{
2657
TickCount func_ticks;
2658
GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
2659
2660
Flush(FLUSH_FOR_C_CALL);
2661
EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
2662
EmitCall(reinterpret_cast<const void*>(func));
2663
2664
AddGTETicks(func_ticks);
2665
}
2666
2667
u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
2668
TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
2669
u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
2670
bool is_load)
2671
{
2672
Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
2673
Assembler* armAsm = &arm_asm;
2674
2675
#ifdef VIXL_DEBUG
2676
vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
2677
#endif
2678
2679
static constexpr u32 GPR_SIZE = 8;
2680
2681
// save regs
2682
u32 num_gprs = 0;
2683
2684
for (u32 i = 0; i < NUM_HOST_REGS; i++)
2685
{
2686
if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
2687
num_gprs++;
2688
}
2689
2690
const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);
2691
2692
// TODO: use stp+ldp, vixl helper?
2693
2694
if (stack_size > 0)
2695
{
2696
armAsm->sub(sp, sp, stack_size);
2697
2698
u32 stack_offset = 0;
2699
for (u32 i = 0; i < NUM_HOST_REGS; i++)
2700
{
2701
if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
2702
{
2703
armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
2704
stack_offset += GPR_SIZE;
2705
}
2706
}
2707
}
2708
2709
if (cycles_to_add != 0)
2710
{
2711
// NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
2712
Assert(Assembler::IsImmAddSub(cycles_to_add));
2713
armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
2714
armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
2715
armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
2716
}
2717
2718
if (address_register != static_cast<u8>(RWARG1.GetCode()))
2719
armAsm->mov(RWARG1, WRegister(address_register));
2720
2721
if (!is_load)
2722
{
2723
if (data_register != static_cast<u8>(RWARG2.GetCode()))
2724
armAsm->mov(RWARG2, WRegister(data_register));
2725
}
2726
2727
switch (size)
2728
{
2729
case MemoryAccessSize::Byte:
2730
{
2731
armEmitCall(armAsm,
2732
is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte) :
2733
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte),
2734
false);
2735
}
2736
break;
2737
case MemoryAccessSize::HalfWord:
2738
{
2739
armEmitCall(armAsm,
2740
is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord) :
2741
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord),
2742
false);
2743
}
2744
break;
2745
case MemoryAccessSize::Word:
2746
{
2747
armEmitCall(armAsm,
2748
is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord) :
2749
reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord),
2750
false);
2751
}
2752
break;
2753
}
2754
2755
if (is_load)
2756
{
2757
const WRegister dst = WRegister(data_register);
2758
switch (size)
2759
{
2760
case MemoryAccessSize::Byte:
2761
{
2762
is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);
2763
}
2764
break;
2765
case MemoryAccessSize::HalfWord:
2766
{
2767
is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);
2768
}
2769
break;
2770
case MemoryAccessSize::Word:
2771
{
2772
if (dst.GetCode() != RWRET.GetCode())
2773
armAsm->mov(dst, RWRET);
2774
}
2775
break;
2776
}
2777
}
2778
2779
if (cycles_to_remove != 0)
2780
{
2781
Assert(Assembler::IsImmAddSub(cycles_to_remove));
2782
armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
2783
armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);
2784
armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
2785
}
2786
2787
// restore regs
2788
if (stack_size > 0)
2789
{
2790
u32 stack_offset = 0;
2791
for (u32 i = 0; i < NUM_HOST_REGS; i++)
2792
{
2793
if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
2794
{
2795
armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));
2796
stack_offset += GPR_SIZE;
2797
}
2798
}
2799
2800
armAsm->add(sp, sp, stack_size);
2801
}
2802
2803
armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
2804
armAsm->FinalizeCode();
2805
2806
return static_cast<u32>(armAsm->GetCursorOffset());
2807
}
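// A minimal sketch of the stack reservation used by the thunk above: the count of saved
// caller-saved GPRs is rounded up to an even number of 8-byte slots, which keeps SP 16-byte
// aligned as the AArch64 ABI requires. Illustrative only; the helper name is hypothetical.
[[maybe_unused]] static u32 ExampleThunkStackSize(u32 num_saved_gprs)
{
  constexpr u32 GPR_SIZE = 8;
  return ((num_saved_gprs + 1) & ~1u) * GPR_SIZE;
}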
2808
2809
#endif // CPU_ARCH_ARM64
2810
2811