GitHub Repository: stenzek/duckstation
Path: blob/master/src/core/cpu_recompiler_arm32.cpp
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "cpu_recompiler_arm32.h"
5
#include "cpu_core_private.h"
6
#include "cpu_pgxp.h"
7
#include "gte.h"
8
#include "settings.h"
9
#include "timing_event.h"
10
11
#include "common/align.h"
12
#include "common/assert.h"
13
#include "common/log.h"
14
#include "common/memmap.h"
15
#include "common/string_util.h"
16
17
#include <limits>
18
19
#ifdef CPU_ARCH_ARM32
20
21
#include "vixl/aarch32/constants-aarch32.h"
22
#include "vixl/aarch32/instructions-aarch32.h"
23
24
#ifdef ENABLE_HOST_DISASSEMBLY
25
#include "vixl/aarch32/disasm-aarch32.h"
26
#include <iostream>
27
#endif
28
29
LOG_CHANNEL(Recompiler);
30
31
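// PTR(x) forms a MemOperand addressing x relative to RSTATE (r4), which holds &g_state for the
// lifetime of generated code, so most CPU state is reachable with a small immediate offset.
// Fixed register assignments: r0/r1 carry the return value and first two arguments, r2 the third
// argument, r12 (ip) is the scratch register, r3 holds the fastmem base when loaded, r4 the state pointer.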
#define PTR(x) vixl::aarch32::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
32
#define RMEMBASE vixl::aarch32::r3
33
34
#define RRET vixl::aarch32::r0
35
#define RRETHI vixl::aarch32::r1
36
#define RARG1 vixl::aarch32::r0
37
#define RARG2 vixl::aarch32::r1
38
#define RARG3 vixl::aarch32::r2
39
#define RSCRATCH vixl::aarch32::r12
40
#define RSTATE vixl::aarch32::r4
41
42
static bool armIsCallerSavedRegister(u32 id);
43
static s32 armGetPCDisplacement(const void* current, const void* target);
44
static bool armIsPCDisplacementInImmediateRange(s32 displacement);
45
static void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr);
46
static void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm);
47
static void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
48
static void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
49
static void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr);
50
static void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr);
51
static void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr,
52
const vixl::aarch32::Register& tempreg = RSCRATCH);
53
static u8* armGetJumpTrampoline(const void* target);
54
55
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
56
static std::unordered_map<const void*, u32> s_trampoline_targets;
57
static u8* s_trampoline_start_ptr = nullptr;
58
static u32 s_trampoline_used = 0;
59
60
namespace CPU {
61
62
using namespace vixl::aarch32;
63
64
static ARM32Recompiler s_instance;
65
Recompiler* g_compiler = &s_instance;
66
67
} // namespace CPU
68
69
bool armIsCallerSavedRegister(u32 id)
70
{
71
return ((id >= 0 && id <= 3) || // r0-r3
72
(id == 12 || id == 14)); // ip (r12), lr (r14)
73
}
74
75
s32 armGetPCDisplacement(const void* current, const void* target)
76
{
77
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
78
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
79
return static_cast<s32>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)));
80
}
81
82
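// A32 B/BL encode a signed 24-bit word offset, so a direct branch can only reach targets within
// roughly +/-32MB (-33554432..+33554428 bytes) of the branch instruction.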
bool armIsPCDisplacementInImmediateRange(s32 displacement)
83
{
84
return (displacement >= -33554432 && displacement <= 33554428);
85
}
86
87
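// Materialize a 32-bit immediate: a single MOV (MOVW) suffices for values that fit in 16 bits,
// otherwise a MOVW/MOVT pair writes the low and high halves.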
void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm)
88
{
89
if (vixl::IsUintN(16, imm))
90
{
91
armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
92
return;
93
}
94
95
armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
96
armAsm->movt(vixl::aarch32::al, rd, imm >> 16);
97
}
98
99
void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr)
100
{
101
armEmitMov(armAsm, reg, static_cast<u32>(reinterpret_cast<uintptr_t>(addr)));
102
}
103
104
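// Emit a jump to an arbitrary host address: use a direct B when the target is within branch range,
// otherwise try routing through a trampoline stub, and as a last resort load the address into
// RSCRATCH and jump via BX.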
void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
105
{
106
const void* cur = armAsm->GetCursorAddress<const void*>();
107
s32 displacement = armGetPCDisplacement(cur, ptr);
108
bool use_bx = !armIsPCDisplacementInImmediateRange(displacement);
109
if (use_bx && !force_inline)
110
{
111
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
112
{
113
displacement = armGetPCDisplacement(cur, trampoline);
114
use_bx = !armIsPCDisplacementInImmediateRange(displacement);
115
}
116
}
117
118
if (use_bx)
119
{
120
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
121
armAsm->bx(RSCRATCH);
122
}
123
else
124
{
125
vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
126
armAsm->b(&label);
127
}
128
}
129
130
void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
131
{
132
const void* cur = armAsm->GetCursorAddress<const void*>();
133
s32 displacement = armGetPCDisplacement(cur, ptr);
134
bool use_blx = !armIsPCDisplacementInImmediateRange(displacement);
135
if (use_blx && !force_inline)
136
{
137
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
138
{
139
displacement = armGetPCDisplacement(cur, trampoline);
140
use_blx = !armIsPCDisplacementInImmediateRange(displacement);
141
}
142
}
143
144
if (use_blx)
145
{
146
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
147
armAsm->blx(RSCRATCH);
148
}
149
else
150
{
151
vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
152
armAsm->bl(&label);
153
}
154
}
155
156
void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr)
157
{
158
const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress<const void*>(), ptr);
159
if (!armIsPCDisplacementInImmediateRange(displacement))
160
{
161
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
162
armAsm->blx(cond, RSCRATCH);
163
}
164
else
165
{
166
vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
167
armAsm->b(cond, &label);
168
}
169
}
170
171
void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr)
172
{
173
armMoveAddressToReg(armAsm, reg, addr);
174
armAsm->ldr(reg, vixl::aarch32::MemOperand(reg));
175
}
176
177
[[maybe_unused]] void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
178
const void* addr, const vixl::aarch32::Register& tempreg)
179
{
180
armMoveAddressToReg(armAsm, tempreg, addr);
181
armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
182
}
183
184
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
185
{
186
#ifdef ENABLE_HOST_DISASSEMBLY
187
vixl::aarch32::PrintDisassembler dis(std::cout, 0);
188
dis.SetCodeAddress(reinterpret_cast<uintptr_t>(start));
189
dis.DisassembleA32Buffer(static_cast<const u32*>(start), size);
190
#else
191
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
192
#endif
193
}
194
195
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
196
{
197
return size / vixl::aarch32::kA32InstructionSizeInBytes;
198
}
199
200
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
201
{
202
using namespace vixl::aarch32;
203
204
const s32 disp = armGetPCDisplacement(code, dst);
205
DebugAssert(armIsPCDisplacementInImmediateRange(disp));
206
207
// A32 jumps are silly.
208
{
209
Assembler emit(static_cast<vixl::byte*>(code), kA32InstructionSizeInBytes, A32);
210
Label label(disp);
211
emit.b(&label);
212
}
213
214
if (flush_icache)
215
MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes);
216
217
return kA32InstructionSizeInBytes;
218
}
219
220
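// Returns (and lazily emits) a small trampoline stub for targets that are out of direct branch
// range. The stub loads the target into RSCRATCH and jumps via BX; stubs live in a shared area
// placed immediately after the ASM dispatcher functions so near code can reach them with a B.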
u8* armGetJumpTrampoline(const void* target)
221
{
222
auto it = s_trampoline_targets.find(target);
223
if (it != s_trampoline_targets.end())
224
return s_trampoline_start_ptr + it->second;
225
226
// align to 16 bytes?
227
const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);
228
229
// 4 movs plus a jump
230
if (TRAMPOLINE_AREA_SIZE - offset < 20)
231
{
232
Panic("Ran out of space in constant pool");
233
return nullptr;
234
}
235
236
u8* start = s_trampoline_start_ptr + offset;
237
vixl::aarch32::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
238
armMoveAddressToReg(&armAsm, RSCRATCH, target);
239
armAsm.bx(RSCRATCH);
240
241
const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
242
DebugAssert(size < 20);
243
s_trampoline_targets.emplace(target, offset);
244
s_trampoline_used = offset + static_cast<u32>(size);
245
246
MemMap::FlushInstructionCache(start, size);
247
return start;
248
}
249
250
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
251
{
252
using namespace vixl::aarch32;
253
254
Assembler actual_asm(static_cast<u8*>(code), code_size);
255
Assembler* armAsm = &actual_asm;
256
257
#ifdef VIXL_DEBUG
258
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
259
#endif
260
261
Label dispatch;
262
Label run_events_and_dispatch;
263
264
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
265
{
266
// Need the CPU state for basically everything :-)
267
armMoveAddressToReg(armAsm, RSTATE, &g_state);
268
}
269
270
// Check whether an event is due (pending_ticks >= downcount) before dispatching.
271
{
272
Label skip_event_check;
273
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
274
armAsm->ldr(RARG2, PTR(&g_state.downcount));
275
armAsm->cmp(RARG1, RARG2);
276
armAsm->b(lt, &skip_event_check);
277
278
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
279
armAsm->bind(&run_events_and_dispatch);
280
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
281
282
armAsm->bind(&skip_event_check);
283
}
284
285
// TODO: align?
286
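// Dispatcher: look up the host code pointer for the current guest PC in the two-level code LUT
// (page = pc >> 16, entry = (pc >> 2) & 0x3fff) and jump to it. Compiled blocks eventually jump
// back here, or to the event handler above once the downcount has expired.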
g_dispatcher = armAsm->GetCursorAddress<const void*>();
287
{
288
armAsm->bind(&dispatch);
289
290
// RARG2 <- g_code_lut[pc >> 16]
291
armAsm->ldr(RARG1, PTR(&g_state.pc));
292
armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
293
armAsm->lsr(RARG2, RARG1, 16);
294
armAsm->ubfx(RARG1, RARG1, 2, 14);
295
armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));
296
297
// bx(page[(pc >> 2) & 0x3fff])
298
armAsm->ldr(RARG1, MemOperand(RARG2, RARG1, LSL, 2));
299
armAsm->bx(RARG1);
300
}
301
302
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
303
{
304
armAsm->ldr(RARG1, PTR(&g_state.pc));
305
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
306
armAsm->b(&dispatch);
307
}
308
309
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
310
{
311
armAsm->ldr(RARG1, PTR(&g_state.pc));
312
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
313
armAsm->b(&dispatch);
314
}
315
316
g_interpret_block = armAsm->GetCursorAddress<const void*>();
317
{
318
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
319
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
320
armAsm->ldr(RARG2, PTR(&g_state.downcount));
321
armAsm->cmp(RARG1, RARG2);
322
armAsm->b(ge, &run_events_and_dispatch);
323
armAsm->b(&dispatch);
324
}
325
326
armAsm->FinalizeCode();
327
328
s_trampoline_targets.clear();
329
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
330
s_trampoline_used = 0;
331
332
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
333
}
334
335
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
336
{
337
constexpr u8 padding_value = 0x00;
338
std::memset(dst, padding_value, size);
339
}
340
341
CPU::ARM32Recompiler::ARM32Recompiler() : m_emitter(A32), m_far_emitter(A32)
342
{
343
}
344
345
CPU::ARM32Recompiler::~ARM32Recompiler() = default;
346
347
const void* CPU::ARM32Recompiler::GetCurrentCodePointer()
348
{
349
return armAsm->GetCursorAddress<const void*>();
350
}
351
352
void CPU::ARM32Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
353
u32 far_code_space)
354
{
355
Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
356
357
// TODO: don't recreate this every time..
358
DebugAssert(!armAsm);
359
m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
360
m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
361
armAsm = &m_emitter;
362
363
#ifdef VIXL_DEBUG
364
m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(m_emitter.get(), code_buffer_space,
365
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
366
m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
367
m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
368
#endif
369
370
// Need to wipe it out so it's correct when toggling fastmem.
371
m_host_regs = {};
372
373
const u32 membase_idx =
374
(CodeCache::IsUsingFastmem() && block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) ?
375
RMEMBASE.GetCode() :
376
NUM_HOST_REGS;
377
for (u32 i = 0; i < NUM_HOST_REGS; i++)
378
{
379
HostRegAlloc& ra = m_host_regs[i];
380
381
if (i == RARG1.GetCode() || i == RARG2.GetCode() || i == RARG3.GetCode() || i == RSCRATCH.GetCode() ||
382
i == RSTATE.GetCode() || i == membase_idx || i == sp.GetCode() || i == pc.GetCode())
383
{
384
continue;
385
}
386
387
ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
388
}
389
}
390
391
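// The far emitter holds cold paths (exception raising, slow-memory handlers) out of line, so the
// hot block body in the near emitter stays compact. These helpers emit the branch between the two
// buffers and swap the active assembler.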
void CPU::ARM32Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch32::ConditionType cond)
392
{
393
DebugAssert(armAsm == &m_emitter);
394
if (emit_jump)
395
{
396
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
397
if (armIsPCDisplacementInImmediateRange(disp))
398
{
399
Label ldisp(armAsm->GetCursorOffset() + disp);
400
armAsm->b(cond, &ldisp);
401
}
402
else if (cond != vixl::aarch32::al)
403
{
404
Label skip;
405
armAsm->b(Condition(cond).Negate(), &skip);
406
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
407
armAsm->bind(&skip);
408
}
409
else
410
{
411
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
412
}
413
}
414
armAsm = &m_far_emitter;
415
}
416
417
void CPU::ARM32Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch32::Register& reg, u32 bit)
418
{
419
armAsm->tst(reg, 1u << bit);
420
421
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
422
if (armIsPCDisplacementInImmediateRange(disp))
423
{
424
Label ldisp(armAsm->GetCursorOffset() + disp);
425
armAsm->b(ne, &ldisp);
426
}
427
else
428
{
429
Label skip;
430
armAsm->b(eq, &skip);
431
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
432
armAsm->bind(&skip);
433
}
434
435
armAsm = &m_far_emitter;
436
}
437
438
void CPU::ARM32Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch32::Register& reg, bool nonzero)
439
{
440
armAsm->cmp(reg, 0);
441
442
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
443
if (armIsPCDisplacementInImmediateRange(disp))
444
{
445
Label ldisp(armAsm->GetCursorOffset() + disp);
446
nonzero ? armAsm->b(ne, &ldisp) : armAsm->b(eq, &ldisp);
447
}
448
else
449
{
450
Label skip;
451
nonzero ? armAsm->b(eq, &skip) : armAsm->b(ne, &skip);
452
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
453
armAsm->bind(&skip);
454
}
455
456
armAsm = &m_far_emitter;
457
}
458
459
void CPU::ARM32Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch32::ConditionType cond)
460
{
461
DebugAssert(armAsm == &m_far_emitter);
462
if (emit_jump)
463
{
464
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
465
if (armIsPCDisplacementInImmediateRange(disp))
466
{
467
Label ldisp(armAsm->GetCursorOffset() + disp);
468
armAsm->b(cond, &ldisp);
469
}
470
else if (cond != vixl::aarch32::al)
471
{
472
Label skip;
473
armAsm->b(Condition(cond).Negate(), &skip);
474
armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
475
armAsm->bind(&skip);
476
}
477
else
478
{
479
armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
480
}
481
}
482
armAsm = &m_emitter;
483
}
484
485
void CPU::ARM32Recompiler::EmitMov(const vixl::aarch32::Register& dst, u32 val)
486
{
487
armEmitMov(armAsm, dst, val);
488
}
489
490
void CPU::ARM32Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
491
{
492
armEmitCall(armAsm, ptr, force_inline);
493
}
494
495
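// Returns the value as an immediate operand if it is encodable as an A32 modified immediate,
// otherwise materializes it in RSCRATCH and returns a register operand.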
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckAddSubConstant(s32 val)
496
{
497
if (ImmediateA32::IsImmediateA32(static_cast<u32>(val)))
498
return vixl::aarch32::Operand(static_cast<int32_t>(val));
499
500
EmitMov(RSCRATCH, static_cast<u32>(val));
501
return vixl::aarch32::Operand(RSCRATCH);
502
}
503
504
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckAddSubConstant(u32 val)
505
{
506
return armCheckAddSubConstant(static_cast<s32>(val));
507
}
508
509
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckCompareConstant(s32 val)
510
{
511
return armCheckAddSubConstant(val);
512
}
513
514
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckLogicalConstant(u32 val)
515
{
516
return armCheckAddSubConstant(val);
517
}
518
519
void CPU::ARM32Recompiler::BeginBlock()
520
{
521
Recompiler::BeginBlock();
522
}
523
524
void CPU::ARM32Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
525
{
526
// Load both base pointers up front so the compares below can use immediate offsets, keeping code size down.
527
armMoveAddressToReg(armAsm, RARG1, ram_ptr);
528
armMoveAddressToReg(armAsm, RARG2, shadow_ptr);
529
530
u32 offset = 0;
531
Label block_changed;
532
533
#if 0
534
/* TODO: Vectorize
535
#include <arm_neon.h>
536
#include <stdint.h>
537
538
bool foo(const void* a, const void* b)
539
{
540
uint8x16_t v1 = vld1q_u8((const uint8_t*)a);
541
uint8x16_t v2 = vld1q_u8((const uint8_t*)b);
542
uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16);
543
uint8x16_t v4 = vld1q_u8((const uint8_t*)b + 16);
544
uint8x16_t r = vceqq_u8(v1, v2);
545
uint8x16_t r2 = vceqq_u8(v3, v4);
546
uint8x16_t r3 = vandq_u8(r, r2);
547
uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3)));
548
if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu)
549
return false;
550
else
551
return true;
552
}
553
*/
554
bool first = true;
555
556
while (size >= 16)
557
{
558
const VRegister vtmp = a32::v2.V4S();
559
const VRegister dst = first ? a32::v0.V4S() : a32::v1.V4S();
560
m_emit->ldr(dst, a32::MemOperand(RXARG1, offset));
561
m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset));
562
m_emit->cmeq(dst, dst, vtmp);
563
if (!first)
564
m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
565
else
566
first = false;
567
568
offset += 16;
569
size -= 16;
570
}
571
572
if (!first)
573
{
574
// TODO: make sure this doesn't choke on ffffffff
575
armAsm->uminv(a32::s0, a32::v0.V4S());
576
armAsm->fcmp(a32::s0, 0.0);
577
armAsm->b(&block_changed, a32::eq);
578
}
579
#endif
580
581
while (size >= 4)
582
{
583
armAsm->ldr(RARG3, MemOperand(RARG1, offset));
584
armAsm->ldr(RSCRATCH, MemOperand(RARG2, offset));
585
armAsm->cmp(RARG3, RSCRATCH);
586
armAsm->b(ne, &block_changed);
587
offset += 4;
588
size -= 4;
589
}
590
591
DebugAssert(size == 0);
592
593
Label block_unchanged;
594
armAsm->b(&block_unchanged);
595
armAsm->bind(&block_changed);
596
armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
597
armAsm->bind(&block_unchanged);
598
}
599
600
void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
601
{
602
if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
603
{
604
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
605
{
606
armEmitFarLoad(armAsm, RARG2, GetFetchMemoryAccessTimePtr());
607
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
608
armEmitMov(armAsm, RARG3, m_block->size);
609
armAsm->mul(RARG2, RARG2, RARG3);
610
armAsm->add(RARG1, RARG1, RARG2);
611
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
612
}
613
else
614
{
615
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
616
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
617
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
618
}
619
}
620
else if (m_block->icache_line_count > 0)
621
{
622
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
623
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
624
if (fill_ticks <= 0)
625
return;
626
627
const auto& ticks_reg = RARG1;
628
const auto& current_tag_reg = RARG2;
629
const auto& existing_tag_reg = RARG3;
630
const auto& fill_ticks_reg = r5;
631
632
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
633
armEmitMov(armAsm, current_tag_reg, current_pc);
634
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
635
636
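// For each icache line covered by the block, compare the stored tag with the block's tag and, on a
// miss, add the line fill penalty to pending_ticks; the tag itself is refreshed unconditionally.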
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
637
{
638
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
639
if (fill_ticks <= 0)
640
continue;
641
642
const u32 line = GetICacheLine(current_pc);
643
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
644
645
// Offsets must be <4K on ARM.
646
MemOperand line_addr = MemOperand(RSTATE, offset);
647
if (offset >= 4096)
648
{
649
armEmitMov(armAsm, RSCRATCH, offset);
650
line_addr = MemOperand(RSTATE, RSCRATCH);
651
}
652
653
Label cache_hit;
654
armAsm->ldr(existing_tag_reg, line_addr);
655
armAsm->str(current_tag_reg, line_addr);
656
armAsm->cmp(existing_tag_reg, current_tag_reg);
657
armAsm->add(ne, ticks_reg, ticks_reg, fill_ticks_reg);
658
659
if (i != (m_block->icache_line_count - 1))
660
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
661
}
662
663
armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
664
}
665
}
666
667
void CPU::ARM32Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
668
s32 arg3reg /*= -1*/)
669
{
670
if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1.GetCode()))
671
armAsm->mov(RARG1, Register(arg1reg));
672
if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2.GetCode()))
673
armAsm->mov(RARG2, Register(arg2reg));
674
if (arg3reg >= 0 && arg3reg != static_cast<s32>(RARG3.GetCode()))
675
armAsm->mov(RARG3, Register(arg3reg));
676
EmitCall(func);
677
}
678
679
void CPU::ARM32Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
680
{
681
if (newpc.has_value())
682
{
683
if (m_dirty_pc || m_compiler_pc != newpc)
684
{
685
EmitMov(RSCRATCH, newpc.value());
686
armAsm->str(RSCRATCH, PTR(&g_state.pc));
687
}
688
}
689
m_dirty_pc = false;
690
691
// flush regs
692
Flush(FLUSH_END_BLOCK);
693
EndAndLinkBlock(newpc, do_event_test, false);
694
}
695
696
void CPU::ARM32Recompiler::EndBlockWithException(Exception excode)
697
{
698
// flush regs, but not pc, it's going to get overwritten
699
// flush cycles because of the GTE instruction stuff...
700
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
701
702
// TODO: flush load delay
703
704
EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
705
inst->cop.cop_n));
706
EmitMov(RARG2, m_current_instruction_pc);
707
if (excode != Exception::BP)
708
{
709
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
710
}
711
else
712
{
713
EmitMov(RARG3, inst->bits);
714
EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
715
}
716
717
m_dirty_pc = false;
718
719
EndAndLinkBlock(std::nullopt, true, false);
720
}
721
722
void CPU::ARM32Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
723
{
724
// event test
725
// pc should've been flushed
726
DebugAssert(!m_dirty_pc && !m_block_ended);
727
m_block_ended = true;
728
729
// TODO: try extracting this to a function
730
731
// save cycles for event test
732
const TickCount cycles = std::exchange(m_cycles, 0);
733
734
// pending_ticks += cycles
735
// if (pending_ticks >= downcount) { dispatch_event(); }
736
if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
737
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
738
if (do_event_test)
739
armAsm->ldr(RARG2, PTR(&g_state.downcount));
740
if (cycles > 0)
741
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(cycles));
742
if (m_gte_done_cycle > cycles)
743
{
744
armAsm->add(RARG2, RARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
745
armAsm->str(RARG2, PTR(&g_state.gte_completion_tick));
746
}
747
if (do_event_test)
748
armAsm->cmp(RARG1, RARG2);
749
if (cycles > 0)
750
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
751
if (do_event_test)
752
armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
753
754
// jump to dispatcher or next block
755
if (force_run_events)
756
{
757
armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
758
}
759
else if (!newpc.has_value())
760
{
761
armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
762
}
763
else
764
{
765
const void* target = (newpc.value() == m_block->pc) ?
766
CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
767
armAsm->GetBuffer()->GetStartAddress<const void*>()) :
768
CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
769
armEmitJmp(armAsm, target, true);
770
}
771
}
772
773
const void* CPU::ARM32Recompiler::EndCompile(u32* code_size, u32* far_code_size)
774
{
775
#ifdef VIXL_DEBUG
776
m_emitter_check.reset();
777
m_far_emitter_check.reset();
778
#endif
779
780
m_emitter.FinalizeCode();
781
m_far_emitter.FinalizeCode();
782
783
u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
784
*code_size = static_cast<u32>(m_emitter.GetCursorOffset());
785
*far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
786
armAsm = nullptr;
787
return code;
788
}
789
790
const char* CPU::ARM32Recompiler::GetHostRegName(u32 reg) const
791
{
792
static constexpr std::array<const char*, 32> reg64_names = {
793
{"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
794
"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
795
return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
796
}
797
798
void CPU::ARM32Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
799
{
800
EmitMov(Register(reg), val);
801
}
802
803
void CPU::ARM32Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
804
{
805
armAsm->ldr(Register(reg), PTR(ptr));
806
}
807
808
void CPU::ARM32Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
809
{
810
armAsm->str(Register(reg), PTR(ptr));
811
}
812
813
void CPU::ARM32Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
814
{
815
EmitMov(RSCRATCH, val);
816
armAsm->str(RSCRATCH, PTR(ptr));
817
}
818
819
void CPU::ARM32Recompiler::CopyHostReg(u32 dst, u32 src)
820
{
821
if (src != dst)
822
armAsm->mov(Register(dst), Register(src));
823
}
824
825
void CPU::ARM32Recompiler::AssertRegOrConstS(CompileFlags cf) const
826
{
827
DebugAssert(cf.valid_host_s || cf.const_s);
828
}
829
830
void CPU::ARM32Recompiler::AssertRegOrConstT(CompileFlags cf) const
831
{
832
DebugAssert(cf.valid_host_t || cf.const_t);
833
}
834
835
vixl::aarch32::MemOperand CPU::ARM32Recompiler::MipsPtr(Reg r) const
836
{
837
DebugAssert(r < Reg::count);
838
return PTR(&g_state.regs.r[static_cast<u32>(r)]);
839
}
840
841
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegD(CompileFlags cf) const
842
{
843
DebugAssert(cf.valid_host_d);
844
return Register(cf.host_d);
845
}
846
847
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegS(CompileFlags cf) const
848
{
849
DebugAssert(cf.valid_host_s);
850
return Register(cf.host_s);
851
}
852
853
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegT(CompileFlags cf) const
854
{
855
DebugAssert(cf.valid_host_t);
856
return Register(cf.host_t);
857
}
858
859
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegLO(CompileFlags cf) const
860
{
861
DebugAssert(cf.valid_host_lo);
862
return Register(cf.host_lo);
863
}
864
865
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegHI(CompileFlags cf) const
866
{
867
DebugAssert(cf.valid_host_hi);
868
return Register(cf.host_hi);
869
}
870
871
vixl::aarch32::Register CPU::ARM32Recompiler::GetMembaseReg()
872
{
873
const u32 code = RMEMBASE.GetCode();
874
if (!IsHostRegAllocated(code))
875
{
876
// Leave usable unset, so we don't try to allocate it later.
877
m_host_regs[code].type = HR_TYPE_MEMBASE;
878
m_host_regs[code].flags = HR_ALLOCATED;
879
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
880
}
881
882
return RMEMBASE;
883
}
884
885
void CPU::ARM32Recompiler::MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
886
{
887
if (cf.valid_host_s)
888
{
889
if (cf.host_s != dst.GetCode())
890
armAsm->mov(dst, Register(cf.host_s));
891
}
892
else if (cf.const_s)
893
{
894
const u32 cv = GetConstantRegU32(cf.MipsS());
895
EmitMov(dst, cv);
896
}
897
else
898
{
899
WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
900
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
901
}
902
}
903
904
void CPU::ARM32Recompiler::MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
905
{
906
if (cf.valid_host_t)
907
{
908
if (cf.host_t != dst.GetCode())
909
armAsm->mov(dst, Register(cf.host_t));
910
}
911
else if (cf.const_t)
912
{
913
const u32 cv = GetConstantRegU32(cf.MipsT());
914
EmitMov(dst, cv);
915
}
916
else
917
{
918
WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
919
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
920
}
921
}
922
923
void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg)
924
{
925
DebugAssert(reg < Reg::count);
926
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
927
armAsm->mov(dst, Register(hreg.value()));
928
else if (HasConstantReg(reg))
929
EmitMov(dst, GetConstantRegU32(reg));
930
else
931
armAsm->ldr(dst, MipsPtr(reg));
932
}
933
934
void CPU::ARM32Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
935
Reg arg3reg /* = Reg::count */)
936
{
937
DebugAssert(g_settings.gpu_pgxp_enable);
938
939
Flush(FLUSH_FOR_C_CALL);
940
941
if (arg2reg != Reg::count)
942
MoveMIPSRegToReg(RARG2, arg2reg);
943
if (arg3reg != Reg::count)
944
MoveMIPSRegToReg(RARG3, arg3reg);
945
946
EmitMov(RARG1, arg1val);
947
EmitCall(func);
948
}
949
950
void CPU::ARM32Recompiler::Flush(u32 flags)
951
{
952
Recompiler::Flush(flags);
953
954
if (flags & FLUSH_PC && m_dirty_pc)
955
{
956
StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
957
m_dirty_pc = false;
958
}
959
960
if (flags & FLUSH_INSTRUCTION_BITS)
961
{
962
// This sucks, but it's only used for fallbacks.
963
EmitMov(RARG1, inst->bits);
964
EmitMov(RARG2, m_current_instruction_pc);
965
EmitMov(RARG3, m_current_instruction_branch_delay_slot);
966
armAsm->str(RARG1, PTR(&g_state.current_instruction.bits));
967
armAsm->str(RARG2, PTR(&g_state.current_instruction_pc));
968
armAsm->strb(RARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
969
}
970
971
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
972
{
973
// This sucks :(
974
// TODO: make it a function?
975
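// Apply a pending load delay recorded in CPU state: write load_delay_value into
// regs.r[load_delay_reg] (address computed as base + reg * 4), then set load_delay_reg back to
// Reg::count to mark the delay slot as empty.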
armAsm->ldrb(RARG1, PTR(&g_state.load_delay_reg));
976
armAsm->ldr(RARG2, PTR(&g_state.load_delay_value));
977
EmitMov(RSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
978
armAsm->add(RARG1, RSCRATCH, vixl::aarch32::Operand(RARG1, LSL, 2));
979
armAsm->str(RARG2, MemOperand(RSTATE, RARG1));
980
EmitMov(RSCRATCH, static_cast<u8>(Reg::count));
981
armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
982
m_load_delay_dirty = false;
983
}
984
985
if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
986
{
987
if (m_load_delay_value_register != NUM_HOST_REGS)
988
FreeHostReg(m_load_delay_value_register);
989
990
EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));
991
armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
992
m_load_delay_register = Reg::count;
993
m_load_delay_dirty = true;
994
}
995
996
if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
997
{
998
// May as well flush cycles while we're here.
999
// GTE spanning blocks is very rare, we _could_ disable this for speed.
1000
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
1001
armAsm->ldr(RARG2, PTR(&g_state.gte_completion_tick));
1002
if (m_cycles > 0)
1003
{
1004
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
1005
m_cycles = 0;
1006
}
1007
armAsm->cmp(RARG2, RARG1);
1008
armAsm->mov(hs, RARG1, RARG2);
1009
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
1010
m_dirty_gte_done_cycle = false;
1011
}
1012
1013
if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
1014
{
1015
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
1016
1017
// update cycles at the same time
1018
if (flags & FLUSH_CYCLES && m_cycles > 0)
1019
{
1020
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
1021
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
1022
m_gte_done_cycle -= m_cycles;
1023
m_cycles = 0;
1024
}
1025
1026
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_gte_done_cycle));
1027
armAsm->str(RARG1, PTR(&g_state.gte_completion_tick));
1028
m_gte_done_cycle = 0;
1029
m_dirty_gte_done_cycle = true;
1030
}
1031
1032
if (flags & FLUSH_CYCLES && m_cycles > 0)
1033
{
1034
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
1035
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
1036
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
1037
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
1038
m_cycles = 0;
1039
}
1040
}
1041
1042
void CPU::ARM32Recompiler::Compile_Fallback()
1043
{
1044
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
1045
inst->bits);
1046
1047
Flush(FLUSH_FOR_INTERPRETER);
1048
1049
EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));
1050
1051
// TODO: make me less garbage
1052
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
1053
// but nothing should be going through here..
1054
Label no_load_delay;
1055
armAsm->ldrb(RARG1, PTR(&g_state.next_load_delay_reg));
1056
armAsm->cmp(RARG1, static_cast<u8>(Reg::count));
1057
armAsm->b(eq, &no_load_delay);
1058
armAsm->ldr(RARG2, PTR(&g_state.next_load_delay_value));
1059
armAsm->strb(RARG1, PTR(&g_state.load_delay_reg));
1060
armAsm->str(RARG2, PTR(&g_state.load_delay_value));
1061
EmitMov(RARG1, static_cast<u32>(Reg::count));
1062
armAsm->strb(RARG1, PTR(&g_state.next_load_delay_reg));
1063
armAsm->bind(&no_load_delay);
1064
1065
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
1066
}
1067
1068
void CPU::ARM32Recompiler::CheckBranchTarget(const vixl::aarch32::Register& pcreg)
1069
{
1070
if (!g_settings.cpu_recompiler_memory_exceptions)
1071
return;
1072
1073
armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
1074
SwitchToFarCode(true, ne);
1075
1076
BackupHostState();
1077
EndBlockWithException(Exception::AdEL);
1078
1079
RestoreHostState();
1080
SwitchToNearCode(false);
1081
}
1082
1083
void CPU::ARM32Recompiler::Compile_jr(CompileFlags cf)
1084
{
1085
const Register pcreg = CFGetRegS(cf);
1086
CheckBranchTarget(pcreg);
1087
1088
armAsm->str(pcreg, PTR(&g_state.pc));
1089
1090
CompileBranchDelaySlot(false);
1091
EndBlock(std::nullopt, true);
1092
}
1093
1094
void CPU::ARM32Recompiler::Compile_jalr(CompileFlags cf)
1095
{
1096
const Register pcreg = CFGetRegS(cf);
1097
if (MipsD() != Reg::zero)
1098
SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
1099
1100
CheckBranchTarget(pcreg);
1101
armAsm->str(pcreg, PTR(&g_state.pc));
1102
1103
CompileBranchDelaySlot(false);
1104
EndBlock(std::nullopt, true);
1105
}
1106
1107
void CPU::ARM32Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
1108
{
1109
AssertRegOrConstS(cf);
1110
1111
const u32 taken_pc = GetConditionalBranchTarget(cf);
1112
1113
Flush(FLUSH_FOR_BRANCH);
1114
1115
DebugAssert(cf.valid_host_s);
1116
1117
// MipsT() here should equal zero for zero branches.
1118
DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
1119
1120
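// Emit the compare and a conditional branch to the taken path. The not-taken path falls through,
// compiles the delay slot (unless it was already swapped in), and ends the block at the next PC;
// the taken path then restores the backed-up register state, re-compiles the delay slot, and ends
// the block at the branch target.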
Label taken;
1121
const Register rs = CFGetRegS(cf);
1122
switch (cond)
1123
{
1124
case BranchCondition::Equal:
1125
case BranchCondition::NotEqual:
1126
{
1127
AssertRegOrConstT(cf);
1128
if (cf.valid_host_t)
1129
armAsm->cmp(rs, CFGetRegT(cf));
1130
else if (cf.const_t)
1131
armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
1132
1133
armAsm->b((cond == BranchCondition::Equal) ? eq : ne, &taken);
1134
}
1135
break;
1136
1137
case BranchCondition::GreaterThanZero:
1138
{
1139
armAsm->cmp(rs, 0);
1140
armAsm->b(gt, &taken);
1141
}
1142
break;
1143
1144
case BranchCondition::GreaterEqualZero:
1145
{
1146
armAsm->cmp(rs, 0);
1147
armAsm->b(ge, &taken);
1148
}
1149
break;
1150
1151
case BranchCondition::LessThanZero:
1152
{
1153
armAsm->cmp(rs, 0);
1154
armAsm->b(lt, &taken);
1155
}
1156
break;
1157
1158
case BranchCondition::LessEqualZero:
1159
{
1160
armAsm->cmp(rs, 0);
1161
armAsm->b(le, &taken);
1162
}
1163
break;
1164
}
1165
1166
BackupHostState();
1167
if (!cf.delay_slot_swapped)
1168
CompileBranchDelaySlot();
1169
1170
EndBlock(m_compiler_pc, true);
1171
1172
armAsm->bind(&taken);
1173
1174
RestoreHostState();
1175
if (!cf.delay_slot_swapped)
1176
CompileBranchDelaySlot();
1177
1178
EndBlock(taken_pc, true);
1179
}
1180
1181
void CPU::ARM32Recompiler::Compile_addi(CompileFlags cf, bool overflow)
1182
{
1183
const Register rs = CFGetRegS(cf);
1184
const Register rt = CFGetRegT(cf);
1185
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1186
{
1187
if (!overflow)
1188
{
1189
armAsm->add(rt, rs, armCheckAddSubConstant(imm));
1190
}
1191
else
1192
{
1193
armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
1194
TestOverflow(rt);
1195
}
1196
}
1197
else if (rt.GetCode() != rs.GetCode())
1198
{
1199
armAsm->mov(rt, rs);
1200
}
1201
}
1202
1203
void CPU::ARM32Recompiler::Compile_addi(CompileFlags cf)
1204
{
1205
Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
1206
}
1207
1208
void CPU::ARM32Recompiler::Compile_addiu(CompileFlags cf)
1209
{
1210
Compile_addi(cf, false);
1211
}
1212
1213
void CPU::ARM32Recompiler::Compile_slti(CompileFlags cf)
1214
{
1215
Compile_slti(cf, true);
1216
}
1217
1218
void CPU::ARM32Recompiler::Compile_sltiu(CompileFlags cf)
1219
{
1220
Compile_slti(cf, false);
1221
}
1222
1223
void CPU::ARM32Recompiler::Compile_slti(CompileFlags cf, bool sign)
1224
{
1225
const Register rs = CFGetRegS(cf);
1226
const Register rt = CFGetRegT(cf);
1227
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
1228
armAsm->mov(sign ? ge : hs, rt, 0);
1229
armAsm->mov(sign ? lt : lo, rt, 1);
1230
}
1231
1232
void CPU::ARM32Recompiler::Compile_andi(CompileFlags cf)
1233
{
1234
const Register rt = CFGetRegT(cf);
1235
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1236
armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
1237
else
1238
EmitMov(rt, 0);
1239
}
1240
1241
void CPU::ARM32Recompiler::Compile_ori(CompileFlags cf)
1242
{
1243
const Register rt = CFGetRegT(cf);
1244
const Register rs = CFGetRegS(cf);
1245
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1246
armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
1247
else if (rt.GetCode() != rs.GetCode())
1248
armAsm->mov(rt, rs);
1249
}
1250
1251
void CPU::ARM32Recompiler::Compile_xori(CompileFlags cf)
1252
{
1253
const Register rt = CFGetRegT(cf);
1254
const Register rs = CFGetRegS(cf);
1255
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1256
armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
1257
else if (rt.GetCode() != rs.GetCode())
1258
armAsm->mov(rt, rs);
1259
}
1260
1261
void CPU::ARM32Recompiler::Compile_shift(CompileFlags cf,
1262
void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
1263
vixl::aarch32::Register, const Operand&))
1264
{
1265
const Register rd = CFGetRegD(cf);
1266
const Register rt = CFGetRegT(cf);
1267
if (inst->r.shamt > 0)
1268
(armAsm->*op)(rd, rt, inst->r.shamt.GetValue());
1269
else if (rd.GetCode() != rt.GetCode())
1270
armAsm->mov(rd, rt);
1271
}
1272
1273
void CPU::ARM32Recompiler::Compile_sll(CompileFlags cf)
1274
{
1275
Compile_shift(cf, &Assembler::lsl);
1276
}
1277
1278
void CPU::ARM32Recompiler::Compile_srl(CompileFlags cf)
1279
{
1280
Compile_shift(cf, &Assembler::lsr);
1281
}
1282
1283
void CPU::ARM32Recompiler::Compile_sra(CompileFlags cf)
1284
{
1285
Compile_shift(cf, &Assembler::asr);
1286
}
1287
1288
void CPU::ARM32Recompiler::Compile_variable_shift(CompileFlags cf,
1289
void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
1290
vixl::aarch32::Register,
1291
const Operand&))
1292
{
1293
const Register rd = CFGetRegD(cf);
1294
1295
AssertRegOrConstS(cf);
1296
AssertRegOrConstT(cf);
1297
1298
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1299
if (!cf.valid_host_t)
1300
MoveTToReg(rt, cf);
1301
1302
if (cf.const_s)
1303
{
1304
if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
1305
(armAsm->*op)(rd, rt, shift & 0x1Fu);
1306
else if (rd.GetCode() != rt.GetCode())
1307
armAsm->mov(rd, rt);
1308
}
1309
else
1310
{
1311
armAsm->and_(RSCRATCH, CFGetRegS(cf), 0x1Fu);
1312
(armAsm->*op)(rd, rt, RSCRATCH);
1313
}
1314
}
1315
1316
void CPU::ARM32Recompiler::Compile_sllv(CompileFlags cf)
1317
{
1318
Compile_variable_shift(cf, &Assembler::lsl);
1319
}
1320
1321
void CPU::ARM32Recompiler::Compile_srlv(CompileFlags cf)
1322
{
1323
Compile_variable_shift(cf, &Assembler::lsr);
1324
}
1325
1326
void CPU::ARM32Recompiler::Compile_srav(CompileFlags cf)
1327
{
1328
Compile_variable_shift(cf, &Assembler::asr);
1329
}
1330
1331
void CPU::ARM32Recompiler::Compile_mult(CompileFlags cf, bool sign)
1332
{
1333
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
1334
if (!cf.valid_host_s)
1335
MoveSToReg(rs, cf);
1336
1337
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1338
if (!cf.valid_host_t)
1339
MoveTToReg(rt, cf);
1340
1341
// TODO: if lo/hi gets killed, we can use a 32-bit multiply
1342
const Register lo = CFGetRegLO(cf);
1343
const Register hi = CFGetRegHI(cf);
1344
1345
(sign) ? armAsm->smull(lo, hi, rs, rt) : armAsm->umull(lo, hi, rs, rt);
1346
}
1347
1348
void CPU::ARM32Recompiler::Compile_mult(CompileFlags cf)
1349
{
1350
Compile_mult(cf, true);
1351
}
1352
1353
void CPU::ARM32Recompiler::Compile_multu(CompileFlags cf)
1354
{
1355
Compile_mult(cf, false);
1356
}
1357
1358
void CPU::ARM32Recompiler::Compile_div(CompileFlags cf)
1359
{
1360
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
1361
if (!cf.valid_host_s)
1362
MoveSToReg(rs, cf);
1363
1364
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1365
if (!cf.valid_host_t)
1366
MoveTToReg(rt, cf);
1367
1368
const Register rlo = CFGetRegLO(cf);
1369
const Register rhi = CFGetRegHI(cf);
1370
1371
// TODO: This could be slightly more optimal
1372
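// Match MIPS DIV edge cases: dividing by zero yields lo = (rs >= 0 ? -1 : 1) and hi = rs, and
// 0x80000000 / -1 yields lo = 0x80000000 and hi = 0; otherwise use SDIV plus MLS for the remainder.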
Label done;
1373
Label not_divide_by_zero;
1374
armAsm->cmp(rt, 0);
1375
armAsm->b(ne, &not_divide_by_zero);
1376
armAsm->mov(rhi, rs); // hi = num
1377
EmitMov(rlo, 1);
1378
EmitMov(RSCRATCH, static_cast<u32>(-1));
1379
armAsm->cmp(rs, 0);
1380
armAsm->mov(ge, rlo, RSCRATCH); // lo = s >= 0 ? -1 : 1
1381
armAsm->b(&done);
1382
1383
armAsm->bind(&not_divide_by_zero);
1384
Label not_unrepresentable;
1385
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
1386
armAsm->b(ne, &not_unrepresentable);
1387
armAsm->cmp(rt, armCheckCompareConstant(-1));
1388
armAsm->b(ne, &not_unrepresentable);
1389
1390
EmitMov(rlo, 0x80000000u);
1391
EmitMov(rhi, 0);
1392
armAsm->b(&done);
1393
1394
armAsm->bind(&not_unrepresentable);
1395
1396
armAsm->sdiv(rlo, rs, rt);
1397
1398
// TODO: skip when hi is dead
1399
armAsm->mls(rhi, rlo, rt, rs);
1400
1401
armAsm->bind(&done);
1402
}
1403
1404
void CPU::ARM32Recompiler::Compile_divu(CompileFlags cf)
1405
{
1406
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
1407
if (!cf.valid_host_s)
1408
MoveSToReg(rs, cf);
1409
1410
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1411
if (!cf.valid_host_t)
1412
MoveTToReg(rt, cf);
1413
1414
const Register rlo = CFGetRegLO(cf);
1415
const Register rhi = CFGetRegHI(cf);
1416
1417
Label done;
1418
Label not_divide_by_zero;
1419
armAsm->cmp(rt, 0);
1420
armAsm->b(ne, &not_divide_by_zero);
1421
EmitMov(rlo, static_cast<u32>(-1));
1422
armAsm->mov(rhi, rs);
1423
armAsm->b(&done);
1424
1425
armAsm->bind(&not_divide_by_zero);
1426
1427
armAsm->udiv(rlo, rs, rt);
1428
1429
// TODO: skip when hi is dead
1430
armAsm->mls(rhi, rlo, rt, rs);
1431
1432
armAsm->bind(&done);
1433
}
1434
1435
void CPU::ARM32Recompiler::TestOverflow(const vixl::aarch32::Register& result)
1436
{
1437
SwitchToFarCode(true, vs);
1438
1439
BackupHostState();
1440
1441
// toss the result
1442
ClearHostReg(result.GetCode());
1443
1444
EndBlockWithException(Exception::Ov);
1445
1446
RestoreHostState();
1447
1448
SwitchToNearCode(false);
1449
}
1450
1451
void CPU::ARM32Recompiler::Compile_dst_op(CompileFlags cf,
1452
void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
1453
vixl::aarch32::Register, const Operand&),
1454
bool commutative, bool logical, bool overflow)
1455
{
1456
AssertRegOrConstS(cf);
1457
AssertRegOrConstT(cf);
1458
1459
const Register rd = CFGetRegD(cf);
1460
if (cf.valid_host_s && cf.valid_host_t)
1461
{
1462
(armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
1463
}
1464
else if (commutative && (cf.const_s || cf.const_t))
1465
{
1466
const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
1467
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1468
{
1469
(armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1470
}
1471
else
1472
{
1473
if (rd.GetCode() != src.GetCode())
1474
armAsm->mov(rd, src);
1475
overflow = false;
1476
}
1477
}
1478
else if (cf.const_s)
1479
{
1480
EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));
1481
(armAsm->*op)(rd, RSCRATCH, CFGetRegT(cf));
1482
}
1483
else if (cf.const_t)
1484
{
1485
const Register rs = CFGetRegS(cf);
1486
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1487
{
1488
(armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1489
}
1490
else
1491
{
1492
if (rd.GetCode() != rs.GetCode())
1493
armAsm->mov(rd, rs);
1494
overflow = false;
1495
}
1496
}
1497
1498
if (overflow)
1499
TestOverflow(rd);
1500
}
1501
1502
void CPU::ARM32Recompiler::Compile_add(CompileFlags cf)
1503
{
1504
if (g_settings.cpu_recompiler_memory_exceptions)
1505
Compile_dst_op(cf, &Assembler::adds, true, false, true);
1506
else
1507
Compile_dst_op(cf, &Assembler::add, true, false, false);
1508
}
1509
1510
void CPU::ARM32Recompiler::Compile_addu(CompileFlags cf)
1511
{
1512
Compile_dst_op(cf, &Assembler::add, true, false, false);
1513
}
1514
1515
void CPU::ARM32Recompiler::Compile_sub(CompileFlags cf)
1516
{
1517
if (g_settings.cpu_recompiler_memory_exceptions)
1518
Compile_dst_op(cf, &Assembler::subs, false, false, true);
1519
else
1520
Compile_dst_op(cf, &Assembler::sub, false, false, false);
1521
}
1522
1523
void CPU::ARM32Recompiler::Compile_subu(CompileFlags cf)
1524
{
1525
Compile_dst_op(cf, &Assembler::sub, false, false, false);
1526
}
1527
1528
void CPU::ARM32Recompiler::Compile_and(CompileFlags cf)
1529
{
1530
AssertRegOrConstS(cf);
1531
AssertRegOrConstT(cf);
1532
1533
// special cases - and with self -> self, and with 0 -> 0
1534
const Register regd = CFGetRegD(cf);
1535
if (cf.MipsS() == cf.MipsT())
1536
{
1537
armAsm->mov(regd, CFGetRegS(cf));
1538
return;
1539
}
1540
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1541
{
1542
EmitMov(regd, 0);
1543
return;
1544
}
1545
1546
Compile_dst_op(cf, &Assembler::and_, true, true, false);
1547
}
1548
1549
void CPU::ARM32Recompiler::Compile_or(CompileFlags cf)
1550
{
1551
AssertRegOrConstS(cf);
1552
AssertRegOrConstT(cf);
1553
1554
// or/nor with 0 -> no effect
1555
const Register regd = CFGetRegD(cf);
1556
if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
1557
{
1558
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1559
return;
1560
}
1561
1562
Compile_dst_op(cf, &Assembler::orr, true, true, false);
1563
}
1564
1565
void CPU::ARM32Recompiler::Compile_xor(CompileFlags cf)
1566
{
1567
AssertRegOrConstS(cf);
1568
AssertRegOrConstT(cf);
1569
1570
const Register regd = CFGetRegD(cf);
1571
if (cf.MipsS() == cf.MipsT())
1572
{
1573
// xor with self -> zero
1574
EmitMov(regd, 0);
1575
return;
1576
}
1577
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1578
{
1579
// xor with zero -> no effect
1580
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1581
return;
1582
}
1583
1584
Compile_dst_op(cf, &Assembler::eor, true, true, false);
1585
}
1586
1587
void CPU::ARM32Recompiler::Compile_nor(CompileFlags cf)
1588
{
1589
Compile_or(cf);
1590
armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
1591
}
1592
1593
void CPU::ARM32Recompiler::Compile_slt(CompileFlags cf)
1594
{
1595
Compile_slt(cf, true);
1596
}
1597
1598
void CPU::ARM32Recompiler::Compile_sltu(CompileFlags cf)
1599
{
1600
Compile_slt(cf, false);
1601
}
1602
1603
void CPU::ARM32Recompiler::Compile_slt(CompileFlags cf, bool sign)
1604
{
1605
AssertRegOrConstS(cf);
1606
AssertRegOrConstT(cf);
1607
1608
// TODO: swap and reverse op for constants
1609
if (cf.const_s)
1610
{
1611
EmitMov(RSCRATCH, GetConstantRegS32(cf.MipsS()));
1612
armAsm->cmp(RSCRATCH, CFGetRegT(cf));
1613
}
1614
else if (cf.const_t)
1615
{
1616
armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
1617
}
1618
else
1619
{
1620
armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
1621
}
1622
1623
const Register rd = CFGetRegD(cf);
1624
armAsm->mov(sign ? ge : cs, rd, 0);
1625
armAsm->mov(sign ? lt : lo, rd, 1);
1626
}
1627
1628
vixl::aarch32::Register
1629
CPU::ARM32Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
1630
const std::optional<const vixl::aarch32::Register>& reg)
1631
{
1632
const u32 imm = inst->i.imm_sext32();
1633
if (cf.valid_host_s && imm == 0 && !reg.has_value())
1634
return CFGetRegS(cf);
1635
1636
const Register dst = reg.has_value() ? reg.value() : RARG1;
1637
if (address.has_value())
1638
{
1639
EmitMov(dst, address.value());
1640
}
1641
else if (imm == 0)
1642
{
1643
if (cf.valid_host_s)
1644
{
1645
if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
1646
armAsm->mov(dst, CFGetRegS(cf));
1647
}
1648
else
1649
{
1650
armAsm->ldr(dst, MipsPtr(cf.MipsS()));
1651
}
1652
}
1653
else
1654
{
1655
if (cf.valid_host_s)
1656
{
1657
armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
1658
}
1659
else
1660
{
1661
armAsm->ldr(dst, MipsPtr(cf.MipsS()));
1662
armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
1663
}
1664
}
1665
1666
return dst;
1667
}
1668
1669
template<typename RegAllocFn>
1670
vixl::aarch32::Register CPU::ARM32Recompiler::GenerateLoad(const vixl::aarch32::Register& addr_reg,
1671
MemoryAccessSize size, bool sign, bool use_fastmem,
1672
const RegAllocFn& dst_reg_alloc)
1673
{
1674
if (use_fastmem)
1675
{
1676
DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
1677
m_cycles += Bus::RAM_READ_TICKS;
1678
1679
const Register dst = dst_reg_alloc();
1680
const Register membase = GetMembaseReg();
1681
DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
1682
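// LUT fastmem: fetch the host page entry for this address (addr >> FASTMEM_LUT_PAGE_SHIFT), then
// access memory at [entry + addr]. The emitted instruction is recorded via AddLoadStoreInfo so a
// faulting access can be backpatched to the slow-memory path.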
armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
1683
armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));
1684
1685
const MemOperand mem = MemOperand(RARG3, addr_reg);
1686
u8* start = armAsm->GetCursorAddress<u8*>();
1687
switch (size)
1688
{
1689
case MemoryAccessSize::Byte:
1690
sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
1691
break;
1692
1693
case MemoryAccessSize::HalfWord:
1694
sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
1695
break;
1696
1697
case MemoryAccessSize::Word:
1698
armAsm->ldr(dst, mem);
1699
break;
1700
}
1701
1702
AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
1703
return dst;
1704
}
1705
1706
if (addr_reg.GetCode() != RARG1.GetCode())
1707
armAsm->mov(RARG1, addr_reg);
1708
1709
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1710
switch (size)
1711
{
1712
case MemoryAccessSize::Byte:
1713
{
1714
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryByte) :
1715
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte));
1716
}
1717
break;
1718
case MemoryAccessSize::HalfWord:
1719
{
1720
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryHalfWord) :
1721
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord));
1722
}
1723
break;
1724
case MemoryAccessSize::Word:
1725
{
1726
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryWord) :
1727
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord));
1728
}
1729
break;
1730
}
1731
1732
// TODO: turn this into an asm function instead
1733
if (checked)
1734
{
1735
SwitchToFarCodeIfBitSet(RRETHI, 31);
1736
BackupHostState();
1737
1738
// Need to stash this in a temp because of the flush.
1739
const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
1740
armAsm->rsb(temp, RRETHI, 0);
1741
armAsm->lsl(temp, temp, 2);
1742
1743
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
1744
1745
// cause_bits = (-result << 2) | BD | cop_n
1746
armAsm->orr(RARG1, temp,
1747
armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
1748
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
1749
EmitMov(RARG2, m_current_instruction_pc);
1750
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
1751
FreeHostReg(temp.GetCode());
1752
EndBlock(std::nullopt, true);
1753
1754
RestoreHostState();
1755
SwitchToNearCode(false);
1756
}
1757
1758
const Register dst_reg = dst_reg_alloc();
1759
switch (size)
1760
{
1761
case MemoryAccessSize::Byte:
1762
{
1763
sign ? armAsm->sxtb(dst_reg, RRET) : armAsm->uxtb(dst_reg, RRET);
1764
}
1765
break;
1766
case MemoryAccessSize::HalfWord:
1767
{
1768
sign ? armAsm->sxth(dst_reg, RRET) : armAsm->uxth(dst_reg, RRET);
1769
}
1770
break;
1771
case MemoryAccessSize::Word:
1772
{
1773
if (dst_reg.GetCode() != RRET.GetCode())
1774
armAsm->mov(dst_reg, RRET);
1775
}
1776
break;
1777
}
1778
1779
return dst_reg;
1780
}
1781
1782
void CPU::ARM32Recompiler::GenerateStore(const vixl::aarch32::Register& addr_reg,
1783
const vixl::aarch32::Register& value_reg, MemoryAccessSize size,
1784
bool use_fastmem)
1785
{
1786
if (use_fastmem)
1787
{
1788
DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
1789
DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
1790
const Register membase = GetMembaseReg();
1791
armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
1792
armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));
1793
1794
const MemOperand mem = MemOperand(RARG3, addr_reg);
1795
u8* start = armAsm->GetCursorAddress<u8*>();
1796
switch (size)
1797
{
1798
case MemoryAccessSize::Byte:
1799
armAsm->strb(value_reg, mem);
1800
break;
1801
1802
case MemoryAccessSize::HalfWord:
1803
armAsm->strh(value_reg, mem);
1804
break;
1805
1806
case MemoryAccessSize::Word:
1807
armAsm->str(value_reg, mem);
1808
break;
1809
}
1810
AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
1811
return;
1812
}

  if (addr_reg.GetCode() != RARG1.GetCode())
    armAsm->mov(RARG1, addr_reg);
  if (value_reg.GetCode() != RARG2.GetCode())
    armAsm->mov(RARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->lsl(temp, RRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    armAsm->orr(RARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::ARM32Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
                                             std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                             std::optional<Register>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
    if (cf.MipsT() == Reg::zero)
      return RRET;

    return Register(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                    EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
  });
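
  // When rt is $zero the loaded value stays in RRET and is never bound to a guest register: the memory access (and
  // any exception it may raise) still happens, only the result is discarded.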

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RARG1, inst->bits);
    armAsm->mov(RARG2, addr);
    armAsm->mov(RARG3, data);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);
  armAsm->bic(RARG1, addr, 3);
  GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.GetCode());
    return;
  }

  // lwl/lwr of a register that is currently load-delayed takes the new value, but the result is itself load-delayed,
  // so the original value is never written back. NOTE: can't trust T in cf because of the flush
  const Reg rt = inst->r.rt;
  Register value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = Register(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = Register(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        armAsm->mov(value, Register(rtreg.value()));
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        armAsm->ldr(value, MipsPtr(rt));
    }
    else
    {
      value = Register(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
  armAsm->and_(RARG2, addr, 3);
  armAsm->lsl(RARG2, RARG2, 3); // *8
  EmitMov(RARG3, 24);
  armAsm->sub(RARG3, RARG3, RARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    EmitMov(RSCRATCH, 0xFFFFFFu);
    armAsm->lsr(RSCRATCH, RSCRATCH, RARG2);
    armAsm->and_(value, value, RSCRATCH);
    armAsm->lsl(RRET, RRET, RARG3);
    armAsm->orr(value, value, RRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    armAsm->lsr(RRET, RRET, RARG2);
    EmitMov(RSCRATCH, 0xFFFFFF00u);
    armAsm->lsl(RSCRATCH, RSCRATCH, RARG3);
    armAsm->and_(value, value, RSCRATCH);
    armAsm->orr(value, value, RRET);
  }
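
  // Worked example using the shift convention from the comments above: for (addr & 3) == 1, shift = 8, so lwl keeps
  // the low 16 bits of the old register (mask 0x0000FFFF) and merges the loaded word shifted left by 16, while lwr
  // keeps only the top byte of the old register (mask 0xFF000000) and merges the loaded word shifted right by 8.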

  FreeHostReg(addr.GetCode());

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, value);
    armAsm->bic(RARG2, addr, 3);
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
  }
}

void CPU::ARM32Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
                                             std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                             std::optional<Register>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RRET;
  });
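
  // When the register needs a handler call and PGXP is enabled, the loaded value is parked in a callee-saved temp so
  // it survives the GTE::WriteRegister and PGXP::CPU_LWC2 calls below; otherwise it can stay in the volatile RRET.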

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      armAsm->str(value, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      armAsm->sxth(RARG3, value);
      armAsm->str(RARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      armAsm->uxth(RARG3, value);
      armAsm->str(RARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      armAsm->mov(RARG2, value);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
      armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
      armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
      armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, value);
    if (value.GetCode() != RRET.GetCode())
      FreeHostReg(value.GetCode());
    armAsm->mov(RARG2, addr);
    FreeHostReg(addr_reg.value().GetCode());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

void CPU::ARM32Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
                                             std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                             std::optional<Register>();
  FlushForLoadStore(address, true, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RARG3, cf.MipsT());
    armAsm->mov(RARG2, addr);
    EmitMov(RARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
  const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
  if (g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(value, inst->r.rt);

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);
  armAsm->bic(RARG1, addr, 3);
  GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });

  armAsm->and_(RSCRATCH, addr, 3);
  armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
  armAsm->bic(addr, addr, 3);

  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
  if (!g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(value, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RARG3, 0xFFFFFF00u);
    armAsm->lsl(RARG3, RARG3, RSCRATCH);
    armAsm->and_(RRET, RRET, RARG3);

    EmitMov(RARG3, 24);
    armAsm->sub(RARG3, RARG3, RSCRATCH);
    armAsm->lsr(value, value, RARG3);
    armAsm->orr(value, value, RRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    armAsm->lsl(value, value, RSCRATCH);

    EmitMov(RARG3, 24);
    armAsm->sub(RARG3, RARG3, RSCRATCH);
    EmitMov(RSCRATCH, 0x00FFFFFFu);
    armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
    armAsm->and_(RRET, RRET, RSCRATCH);
    armAsm->orr(value, value, RRET);
  }
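
  // Worked example using the same shift convention: for (addr & 3) == 1, shift = 8, so swl keeps the upper 16 bits of
  // the memory word (mask 0xFFFF0000) and merges the register shifted right by 16, while swr keeps only the low byte
  // of the memory word (mask 0x000000FF) and merges the register shifted left by 8.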

  if (!g_settings.gpu_pgxp_enable)
  {
    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
    FreeHostReg(addr.GetCode());
  }
  else
  {
    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);

    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, value);
    FreeHostReg(value.GetCode());
    armAsm->mov(RARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
  }
}

void CPU::ARM32Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RARG1;
  const Register data = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      armAsm->ldr(data, PTR(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      armAsm->mov(data, RRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);
  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr.GetCode() != RARG1.GetCode())
      FreeHostReg(addr.GetCode());
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RARG3, data);
    FreeHostReg(data.GetCode());
    armAsm->mov(RARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

void CPU::ARM32Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Register new_value = RARG1;
  const Register old_value = RARG2;
  const Register changed_bits = RARG3;
  const Register mask_reg = RSCRATCH;

  // Load old value
  armAsm->ldr(old_value, PTR(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    armAsm->eor(changed_bits, old_value, new_value);
  armAsm->bic(old_value, old_value, mask_reg);
  armAsm->orr(new_value, old_value, new_value);
  armAsm->str(new_value, PTR(ptr));
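
  // The sequence above is a masked read-modify-write: new = (old & ~write_mask) | (t & write_mask). changed_bits,
  // computed only for SR, is old ^ (t & write_mask), i.e. which writable bits flipped; the SR handling below keys
  // off bit 16 of it.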

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
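    // Bit 16 of SR is the cache isolation flag (IsC); when it flips, memory accesses are routed differently, so the
    // memory pointers get refreshed and (below) the cached fastmem base is dropped so it is reloaded on next use.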
    armAsm->tst(changed_bits, 1u << 16);
    armAsm->b(eq, &caches_unchanged);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    armAsm->ldr(RARG1, PTR(ptr)); // reload value for interrupt test below
    armAsm->bind(&caches_unchanged);

    // might need to reload fastmem base too
    if (CodeCache::IsUsingFastmem() && m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions) &&
        IsHostRegAllocated(RMEMBASE.GetCode()))
    {
      FreeHostReg(RMEMBASE.GetCode());
    }

    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::ARM32Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
  armAsm->bic(RARG2, RARG1, 15);
  armAsm->ubfx(RARG1, RARG1, 2, 4);
  armAsm->orr(RARG1, RARG1, RARG2);
  armAsm->str(RARG1, PTR(&g_state.cop0_regs.sr.bits));
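  // The bic/ubfx/orr sequence pops the two-level (KU, IE) mode stack held in the low six bits of SR:
  // new = (old & ~0xF) | ((old >> 2) & 0xF). E.g. SR[5:0] = 0b101110 becomes 0b101011 after RFE.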

  TestInterrupts(RARG1);
}
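
// Emits the inline interrupt check used after SR/CAUSE writes and RFE: execution falls through unless IEc (SR bit 0)
// is set and (SR & CAUSE) has one of the pending/mask bits 8-15 set. The far path flushes state and either raises
// Exception::INT, or, for the last instruction in a block, zeroes the downcount (and updates the PC if needed) so
// the dispatcher presumably services the interrupt on the next loop.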
void CPU::ARM32Recompiler::TestInterrupts(const vixl::aarch32::Register& sr)
{
  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;
  armAsm->tst(sr, 1);
  armAsm->b(eq, &no_interrupt);

  // sr & cause
  armAsm->ldr(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
  armAsm->and_(sr, sr, RSCRATCH);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  armAsm->tst(sr, 0xFF00);

  SwitchToFarCode(true, ne);
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                               (inst + 1)->cop.cop_n));
    EmitMov(RARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    EmitMov(RARG1, 0);
    if (m_dirty_pc)
      EmitMov(RARG2, m_compiler_pc);
    armAsm->str(RARG1, PTR(&g_state.downcount));
    if (m_dirty_pc)
      armAsm->str(RARG2, PTR(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  armAsm->bind(&no_interrupt);
}

void CPU::ARM32Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->ldr(Register(hreg), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->mov(Register(hreg), RRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, inst->bits);
    armAsm->mov(RARG2, Register(hreg));
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::ARM32Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      armAsm->str(CFGetRegT(cf), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? armAsm->sxth(RARG1, CFGetRegT(cf)) : armAsm->uxth(RARG1, CFGetRegT(cf));
      armAsm->str(RARG1, PTR(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RARG1, index);
    MoveTToReg(RARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RRET.GetCode() != RARG2.GetCode() && RRET.GetCode() != RARG3.GetCode());
    armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
    armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
    armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
    armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::ARM32Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                           bool is_load)
{
  Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
  Assembler* armAsm = &arm_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  // save regs
  RegisterList save_regs;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      save_regs.Combine(RegisterList(Register(i)));
  }
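
  // Only caller-saved registers that the block actually has live (per gpr_bitmask) are preserved, and for loads the
  // destination register is skipped since it is about to be overwritten anyway.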

  if (!save_regs.IsEmpty())
    armAsm->push(save_regs);

  if (address_register != static_cast<u8>(RARG1.GetCode()))
    armAsm->mov(RARG1, Register(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RARG2.GetCode()))
      armAsm->mov(RARG2, Register(data_register));
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
    if (!ImmediateA32::IsImmediateA32(cycles_to_add))
    {
      armEmitMov(armAsm, RSCRATCH, cycles_to_add);
      armAsm->add(RARG3, RARG3, RSCRATCH);
    }
    else
    {
      armAsm->add(RARG3, RARG3, cycles_to_add);
    }

    armAsm->str(RARG3, PTR(&g_state.pending_ticks));
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte) :
                            reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte),
                  false);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord) :
                            reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord),
                  false);
    }
    break;
    case MemoryAccessSize::Word:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord) :
                            reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord),
                  false);
    }
    break;
  }

  if (is_load)
  {
    const Register dst = Register(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? armAsm->sxtb(dst, RRET) : armAsm->uxtb(dst, RRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? armAsm->sxth(dst, RRET) : armAsm->uxth(dst, RRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst.GetCode() != RRET.GetCode())
          armAsm->mov(dst, RRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
    if (!ImmediateA32::IsImmediateA32(cycles_to_remove))
    {
      armEmitMov(armAsm, RSCRATCH, cycles_to_remove);
      armAsm->sub(RARG3, RARG3, RSCRATCH);
    }
    else
    {
      armAsm->sub(RARG3, RARG3, cycles_to_remove);
    }
    armAsm->str(RARG3, PTR(&g_state.pending_ticks));
  }

  // restore regs
  if (!save_regs.IsEmpty())
    armAsm->pop(save_regs);

  armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
  armAsm->FinalizeCode();

  return static_cast<u32>(armAsm->GetCursorOffset());
}
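
// Presumably the backpatch target for faulting fastmem accesses: the thunk saves the live caller-saved registers,
// performs the access through the unchecked C++ handlers, fixes up pending_ticks, and jumps back to just past the
// patched instruction (code_address + code_size).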

#endif // CPU_ARCH_ARM32