GitHub Repository: stenzek/duckstation
Path: blob/master/src/core/cpu_recompiler_x64.cpp
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "cpu_recompiler_x64.h"
5
#include "cpu_code_cache_private.h"
6
#include "cpu_core_private.h"
7
#include "cpu_pgxp.h"
8
#include "gte.h"
9
#include "settings.h"
10
#include "timing_event.h"
11
12
#include "common/align.h"
13
#include "common/assert.h"
14
#include "common/log.h"
15
#include "common/small_string.h"
16
#include "common/string_util.h"
17
18
#include <limits>
19
20
#ifdef CPU_ARCH_X64
21
22
#ifdef ENABLE_HOST_DISASSEMBLY
23
#include "Zycore/Format.h"
24
#include "Zycore/Status.h"
25
#include "Zydis/Zydis.h"
26
#endif
27
28
LOG_CHANNEL(Recompiler);
29
30
#define RMEMBASE cg->rbx
31
#define RSTATE cg->rbp
32
33
// #define PTR(x) (cg->rip + (x))
34
#define PTR(x) (RSTATE + (((u8*)(x)) - ((u8*)&g_state)))
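// PTR(x) rewrites an absolute pointer into g_state as an RSTATE (rbp)-relative address, e.g.
// PTR(&g_state.pc) is effectively [rbp + offsetof(State, pc)], so state accesses use a compact
// base+disp32 addressing form instead of 64-bit absolute addresses.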
35
36
// PGXP TODO: LWL etc, MFC0
37
// PGXP TODO: Spyro 1 level gates have issues.
38
39
static constexpr u32 FUNCTION_ALIGNMENT = 16;
40
static constexpr u32 BACKPATCH_JMP_SIZE = 5;
41
42
static bool IsCallerSavedRegister(u32 id);
43
44
// ABI selection
45
#if defined(_WIN32)
46
47
#define RWRET Xbyak::Reg32(Xbyak::Operand::EAX)
48
#define RWARG1 Xbyak::Reg32(Xbyak::Operand::RCX)
49
#define RWARG2 Xbyak::Reg32(Xbyak::Operand::RDX)
50
#define RWARG3 Xbyak::Reg32(Xbyak::Operand::R8D)
51
#define RWARG4 Xbyak::Reg32(Xbyak::Operand::R9D)
52
#define RXRET Xbyak::Reg64(Xbyak::Operand::RAX)
53
#define RXARG1 Xbyak::Reg64(Xbyak::Operand::RCX)
54
#define RXARG2 Xbyak::Reg64(Xbyak::Operand::RDX)
55
#define RXARG3 Xbyak::Reg64(Xbyak::Operand::R8)
56
#define RXARG4 Xbyak::Reg64(Xbyak::Operand::R9)
57
58
// on win32, we need to reserve an additional 32 bytes shadow space when calling out to C
59
static constexpr u32 STACK_SHADOW_SIZE = 32;
60
61
#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__)
62
63
#define RWRET Xbyak::Reg32(Xbyak::Operand::EAX)
64
#define RWARG1 Xbyak::Reg32(Xbyak::Operand::EDI)
65
#define RWARG2 Xbyak::Reg32(Xbyak::Operand::ESI)
66
#define RWARG3 Xbyak::Reg32(Xbyak::Operand::EDX)
67
#define RWARG4 Xbyak::Reg32(Xbyak::Operand::ECX)
68
#define RXRET Xbyak::Reg64(Xbyak::Operand::RAX)
69
#define RXARG1 Xbyak::Reg64(Xbyak::Operand::RDI)
70
#define RXARG2 Xbyak::Reg64(Xbyak::Operand::RSI)
71
#define RXARG3 Xbyak::Reg64(Xbyak::Operand::RDX)
72
#define RXARG4 Xbyak::Reg64(Xbyak::Operand::RCX)
73
74
static constexpr u32 STACK_SHADOW_SIZE = 0;
75
76
#else
77
78
#error Unknown ABI.
79
80
#endif
81
82
namespace CPU {
83
84
using namespace Xbyak;
85
86
static X64Recompiler s_instance;
87
Recompiler* g_compiler = &s_instance;
88
89
} // namespace CPU
90
91
bool IsCallerSavedRegister(u32 id)
92
{
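// Register ids here follow Xbyak's Operand::Code numbering: 0 = rax, 1 = rcx, 2 = rdx, 6 = rsi,
// 7 = rdi, 8..11 = r8..r11, which is what the ranges below test against.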
93
#ifdef _WIN32
94
// The x64 ABI considers the registers RAX, RCX, RDX, R8, R9, R10, R11, and XMM0-XMM5 volatile.
95
return (id <= 2 || (id >= 8 && id <= 11));
96
#else
97
// rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 are scratch registers.
98
return (id <= 2 || id == 6 || id == 7 || (id >= 8 && id <= 11));
99
#endif
100
}
101
102
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
103
{
104
using namespace Xbyak;
105
106
#ifdef _WIN32
107
// Shadow space for Win32
108
constexpr u32 stack_size = 32 + 8;
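// 32 bytes of callee "shadow" space for outgoing calls, plus 8 bytes so RSP stays 16-byte aligned
// after the return address pushed by the call into g_enter_recompiler.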
109
#else
110
// Stack still needs to be aligned
111
constexpr u32 stack_size = 8;
112
#endif
113
114
DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler);
115
116
CodeGenerator acg(code_size, static_cast<u8*>(code));
117
CodeGenerator* cg = &acg;
118
119
Label dispatch;
120
Label exit_recompiler;
121
Label run_events_and_dispatch;
122
123
g_enter_recompiler = reinterpret_cast<decltype(g_enter_recompiler)>(const_cast<u8*>(cg->getCurr()));
124
{
125
// Don't need to save registers, because we fastjmp out when execution is interrupted.
126
cg->sub(cg->rsp, stack_size);
127
128
// CPU state pointer
129
cg->lea(cg->rbp, cg->qword[cg->rip + &g_state]);
130
131
// newrec preloads fastmem base
132
if (CodeCache::IsUsingFastmem())
133
cg->mov(cg->rbx, cg->qword[PTR(&g_state.fastmem_base)]);
134
135
// Fall through to event dispatcher
136
}
137
138
// check events then for frame done
139
cg->align(FUNCTION_ALIGNMENT);
140
{
141
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
142
cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
143
cg->jl(dispatch);
144
145
g_run_events_and_dispatch = cg->getCurr();
146
cg->L(run_events_and_dispatch);
147
cg->call(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
148
}
149
150
cg->align(FUNCTION_ALIGNMENT);
151
g_dispatcher = cg->getCurr();
152
{
153
cg->L(dispatch);
154
155
// rcx <- s_fast_map[pc >> 16]
156
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
157
cg->lea(RXARG2, cg->dword[PTR(g_code_lut.data())]);
158
cg->mov(RWARG3, RWARG1);
159
cg->shr(RWARG3, LUT_TABLE_SHIFT);
160
cg->mov(RXARG2, cg->qword[RXARG2 + RXARG3 * 8]);
161
cg->and_(RWARG1, (LUT_TABLE_SIZE - 1) << 2); // 0xFFFC
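// The masked PC is a byte offset over 4-byte instructions; scaling it by 2 turns it into an index
// over 8-byte pointers, i.e. (pc & 0xFFFC) * 2 == (pc >> 2) * 8.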
162
163
// call(rcx[pc * 2]) (fast_map[pc >> 2])
164
cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]);
165
}
166
167
cg->align(FUNCTION_ALIGNMENT);
168
g_compile_or_revalidate_block = cg->getCurr();
169
{
170
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
171
cg->call(&CompileOrRevalidateBlock);
172
cg->jmp(dispatch);
173
}
174
175
cg->align(FUNCTION_ALIGNMENT);
176
g_discard_and_recompile_block = cg->getCurr();
177
{
178
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
179
cg->call(&DiscardAndRecompileBlock);
180
cg->jmp(dispatch);
181
}
182
183
cg->align(FUNCTION_ALIGNMENT);
184
g_interpret_block = cg->getCurr();
185
{
186
cg->call(CodeCache::GetInterpretUncachedBlockFunction());
187
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
188
cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
189
cg->jge(run_events_and_dispatch);
190
cg->jmp(dispatch);
191
}
192
193
return static_cast<u32>(cg->getSize());
194
}
195
196
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
197
{
198
u8* ptr = static_cast<u8*>(code);
199
*(ptr++) = 0xE9; // jmp
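// The rel32 displacement is measured from the end of the 5-byte jmp (1 opcode byte plus 4
// displacement bytes), hence the -5 below.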
200
201
const ptrdiff_t disp = (reinterpret_cast<uintptr_t>(dst) - reinterpret_cast<uintptr_t>(code)) - 5;
202
DebugAssert(disp >= static_cast<ptrdiff_t>(std::numeric_limits<s32>::min()) &&
203
disp <= static_cast<ptrdiff_t>(std::numeric_limits<s32>::max()));
204
205
const s32 disp32 = static_cast<s32>(disp);
206
std::memcpy(ptr, &disp32, sizeof(disp32));
207
return 5;
208
}
209
210
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
211
{
212
// Copied from Xbyak nop(), to avoid constructing a CodeGenerator.
213
static const uint8_t nopTbl[9][9] = {
214
{0x90},
215
{0x66, 0x90},
216
{0x0F, 0x1F, 0x00},
217
{0x0F, 0x1F, 0x40, 0x00},
218
{0x0F, 0x1F, 0x44, 0x00, 0x00},
219
{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
220
{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
221
{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
222
{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
223
};
224
const size_t n = sizeof(nopTbl) / sizeof(nopTbl[0]);
225
u8* dst_ptr = static_cast<u8*>(dst);
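// Emit the largest available multi-byte NOP (up to 9 bytes) each iteration so the padding decodes
// as few instructions as possible.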
226
while (size > 0)
227
{
228
size_t len = (std::min)(n, size);
229
const uint8_t* seq = nopTbl[len - 1];
230
std::memcpy(dst_ptr, seq, len);
231
dst_ptr += len;
232
size -= len;
233
}
234
}
235
236
#ifdef ENABLE_HOST_DISASSEMBLY
237
238
static ZydisFormatterFunc s_old_print_address;
239
240
static ZyanStatus ZydisFormatterPrintAddressAbsolute(const ZydisFormatter* formatter, ZydisFormatterBuffer* buffer,
241
ZydisFormatterContext* context)
242
{
243
using namespace CPU;
244
245
ZyanU64 address;
246
ZYAN_CHECK(ZydisCalcAbsoluteAddress(context->instruction, context->operand, context->runtime_address, &address));
247
248
char buf[128];
249
u32 len = 0;
250
251
#define A(x) static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(x))
252
253
if (address >= A(Bus::g_ram) && address < A(Bus::g_ram + Bus::g_ram_size))
254
{
255
len = snprintf(buf, sizeof(buf), "g_ram+0x%08X", static_cast<u32>(address - A(Bus::g_ram)));
256
}
257
else if (address >= A(&g_state.regs) &&
258
address < A(reinterpret_cast<const u8*>(&g_state.regs) + sizeof(CPU::Registers)))
259
{
260
len = snprintf(buf, sizeof(buf), "g_state.regs.%s",
261
GetRegName(static_cast<CPU::Reg>(((address - A(&g_state.regs.r[0])) / 4u))));
262
}
263
else if (address >= A(&g_state.cop0_regs) &&
264
address < A(reinterpret_cast<const u8*>(&g_state.cop0_regs) + sizeof(CPU::Cop0Registers)))
265
{
266
for (const DebuggerRegisterListEntry& rle : g_debugger_register_list)
267
{
268
if (address == static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(rle.value_ptr)))
269
{
270
len = snprintf(buf, sizeof(buf), "g_state.cop0_regs.%s", rle.name);
271
break;
272
}
273
}
274
}
275
else if (address >= A(&g_state.gte_regs) &&
276
address < A(reinterpret_cast<const u8*>(&g_state.gte_regs) + sizeof(GTE::Regs)))
277
{
278
for (const DebuggerRegisterListEntry& rle : g_debugger_register_list)
279
{
280
if (address == static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(rle.value_ptr)))
281
{
282
len = snprintf(buf, sizeof(buf), "g_state.gte_regs.%s", rle.name);
283
break;
284
}
285
}
286
}
287
else if (address == A(&g_state.load_delay_reg))
288
{
289
len = snprintf(buf, sizeof(buf), "g_state.load_delay_reg");
290
}
291
else if (address == A(&g_state.next_load_delay_reg))
292
{
293
len = snprintf(buf, sizeof(buf), "g_state.next_load_delay_reg");
294
}
295
else if (address == A(&g_state.load_delay_value))
296
{
297
len = snprintf(buf, sizeof(buf), "g_state.load_delay_value");
298
}
299
else if (address == A(&g_state.next_load_delay_value))
300
{
301
len = snprintf(buf, sizeof(buf), "g_state.next_load_delay_value");
302
}
303
else if (address == A(&g_state.pending_ticks))
304
{
305
len = snprintf(buf, sizeof(buf), "g_state.pending_ticks");
306
}
307
else if (address == A(&g_state.downcount))
308
{
309
len = snprintf(buf, sizeof(buf), "g_state.downcount");
310
}
311
312
#undef A
313
314
if (len > 0)
315
{
316
ZYAN_CHECK(ZydisFormatterBufferAppend(buffer, ZYDIS_TOKEN_SYMBOL));
317
ZyanString* string;
318
ZYAN_CHECK(ZydisFormatterBufferGetString(buffer, &string));
319
return ZyanStringAppendFormat(string, "&%s", buf);
320
}
321
322
return s_old_print_address(formatter, buffer, context);
323
}
324
325
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
326
{
327
ZydisDecoder disas_decoder;
328
ZydisFormatter disas_formatter;
329
ZydisDecodedInstruction disas_instruction;
330
ZydisDecodedOperand disas_operands[ZYDIS_MAX_OPERAND_COUNT];
331
ZydisDecoderInit(&disas_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
332
ZydisFormatterInit(&disas_formatter, ZYDIS_FORMATTER_STYLE_INTEL);
333
s_old_print_address = (ZydisFormatterFunc)&ZydisFormatterPrintAddressAbsolute;
334
ZydisFormatterSetHook(&disas_formatter, ZYDIS_FORMATTER_FUNC_PRINT_ADDRESS_ABS, (const void**)&s_old_print_address);
335
336
const u8* ptr = static_cast<const u8*>(start);
337
TinyString hex;
338
ZyanUSize remaining = size;
339
while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&disas_decoder, ptr, remaining, &disas_instruction, disas_operands)))
340
{
341
char buffer[256];
342
if (ZYAN_SUCCESS(ZydisFormatterFormatInstruction(&disas_formatter, &disas_instruction, disas_operands,
343
ZYDIS_MAX_OPERAND_COUNT, buffer, sizeof(buffer),
344
static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(ptr)), nullptr)))
345
{
346
hex.clear();
347
for (u32 i = 0; i < 10; i++)
348
{
349
if (i < disas_instruction.length)
350
hex.append_format(" {:02X}", ptr[i]);
351
else
352
hex.append(" ");
353
}
354
DEBUG_LOG(" {:016X} {} {}", static_cast<u64>(reinterpret_cast<uintptr_t>(ptr)), hex, buffer);
355
}
356
357
ptr += disas_instruction.length;
358
remaining -= disas_instruction.length;
359
}
360
}
361
362
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
363
{
364
ZydisDecoder disas_decoder;
365
ZydisDecodedInstruction disas_instruction;
366
ZydisDecoderContext disas_context;
367
ZydisDecoderInit(&disas_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
368
369
const u8* ptr = static_cast<const u8*>(start);
370
ZyanUSize remaining = size;
371
u32 inst_count = 0;
372
while (
373
ZYAN_SUCCESS(ZydisDecoderDecodeInstruction(&disas_decoder, &disas_context, ptr, remaining, &disas_instruction)))
374
{
375
ptr += disas_instruction.length;
376
remaining -= disas_instruction.length;
377
inst_count++;
378
}
379
380
return inst_count;
381
}
382
383
#else
384
385
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
386
{
387
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
388
}
389
390
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
391
{
392
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
393
return 0;
394
}
395
396
#endif // ENABLE_HOST_DISASSEMBLY
397
398
CPU::X64Recompiler::X64Recompiler() = default;
399
400
CPU::X64Recompiler::~X64Recompiler() = default;
401
402
void CPU::X64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
403
u32 far_code_space)
404
{
405
Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
406
407
// TODO: don't recreate this every time..
408
DebugAssert(!m_emitter && !m_far_emitter && !cg);
409
m_emitter = std::make_unique<Xbyak::CodeGenerator>(code_buffer_space, code_buffer);
410
m_far_emitter = std::make_unique<Xbyak::CodeGenerator>(far_code_space, far_code_buffer);
411
cg = m_emitter.get();
412
413
// Need to wipe it out so it's correct when toggling fastmem.
414
m_host_regs = {};
415
416
const u32 membase_idx = CodeCache::IsUsingFastmem() ? static_cast<u32>(RMEMBASE.getIdx()) : NUM_HOST_REGS;
417
const u32 cpu_idx = static_cast<u32>(RSTATE.getIdx());
418
for (u32 i = 0; i < NUM_HOST_REGS; i++)
419
{
420
HostRegAlloc& ra = m_host_regs[i];
421
422
if (i == static_cast<u32>(RWRET.getIdx()) || i == static_cast<u32>(RWARG1.getIdx()) ||
423
i == static_cast<u32>(RWARG2.getIdx()) || i == static_cast<u32>(RWARG3.getIdx()) ||
424
i == static_cast<u32>(cg->rsp.getIdx()) || i == cpu_idx || i == membase_idx ||
425
i == static_cast<u32>(cg->ecx.getIdx()) /* keep ecx free for shifts, maybe use BMI? */)
426
{
427
continue;
428
}
429
430
ra.flags = HR_USABLE | (IsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
431
}
432
}
433
434
void CPU::X64Recompiler::SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*))
435
{
436
DebugAssert(cg == m_emitter.get());
437
if (emit_jump)
438
{
439
const void* fcptr = m_far_emitter->getCurr<const void*>();
440
(jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr);
441
}
442
cg = m_far_emitter.get();
443
}
444
445
void CPU::X64Recompiler::SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*))
446
{
447
DebugAssert(cg == m_far_emitter.get());
448
if (emit_jump)
449
{
450
const void* fcptr = m_emitter->getCurr<const void*>();
451
(jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr);
452
}
453
cg = m_emitter.get();
454
}
455
456
void CPU::X64Recompiler::BeginBlock()
457
{
458
Recompiler::BeginBlock();
459
460
#if 0
461
if (m_block->pc == 0xBFC06F0C)
462
{
463
//__debugbreak();
464
cg->db(0xcc);
465
}
466
#endif
467
468
#if 0
469
cg->nop();
470
cg->mov(RWARG1, m_block->pc);
471
cg->nop();
472
#endif
473
}
474
475
void CPU::X64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
476
{
477
// store it first to reduce code size, because we can offset
478
cg->mov(RXARG1, static_cast<size_t>(reinterpret_cast<uintptr_t>(ram_ptr)));
479
cg->mov(RXARG2, static_cast<size_t>(reinterpret_cast<uintptr_t>(shadow_ptr)));
480
481
bool first = true;
482
u32 offset = 0;
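// Compare in 16-byte chunks: pcmpeqd produces all-ones lanes for matching dwords, the per-chunk
// results are ANDed into xmm0, and the single movmskps/cmp 0xf below catches any mismatched dword.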
483
while (size >= 16)
484
{
485
const Xbyak::Xmm& dst = first ? cg->xmm0 : cg->xmm1;
486
cg->movups(dst, cg->xword[RXARG1 + offset]);
487
cg->pcmpeqd(dst, cg->xword[RXARG2 + offset]);
488
if (!first)
489
cg->pand(cg->xmm0, dst);
490
else
491
first = false;
492
493
offset += 16;
494
size -= 16;
495
}
496
497
// TODO: better codegen for 16 byte aligned blocks
498
if (!first)
499
{
500
cg->movmskps(cg->eax, cg->xmm0);
501
cg->cmp(cg->eax, 0xf);
502
cg->jne(CodeCache::g_discard_and_recompile_block);
503
}
504
505
while (size >= 8)
506
{
507
cg->mov(RXARG3, cg->qword[RXARG1 + offset]);
508
cg->cmp(RXARG3, cg->qword[RXARG2 + offset]);
509
cg->jne(CodeCache::g_discard_and_recompile_block);
510
offset += 8;
511
size -= 8;
512
}
513
514
while (size >= 4)
515
{
516
cg->mov(RWARG3, cg->dword[RXARG1 + offset]);
517
cg->cmp(RWARG3, cg->dword[RXARG2 + offset]);
518
cg->jne(CodeCache::g_discard_and_recompile_block);
519
offset += 4;
520
size -= 4;
521
}
522
523
DebugAssert(size == 0);
524
}
525
526
void CPU::X64Recompiler::GenerateICacheCheckAndUpdate()
527
{
528
if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
529
{
530
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
531
{
532
cg->mov(cg->eax, m_block->size);
533
cg->mul(cg->dword[cg->rip + GetFetchMemoryAccessTimePtr()]);
534
cg->add(cg->dword[PTR(&g_state.pending_ticks)], cg->eax);
535
}
536
else
537
{
538
cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(m_block->uncached_fetch_ticks));
539
}
540
}
541
else if (m_block->icache_line_count > 0)
542
{
543
// RAM to ROM is not contiguous, therefore the cost will be the same across the entire block.
544
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
545
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
546
if (fill_ticks <= 0)
547
return;
548
549
cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]);
550
cg->xor_(RWARG2, RWARG2);
551
cg->mov(RWARG4, fill_ticks);
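// Branchless miss accounting: for each line, cmovne selects fill_ticks (RWARG4) when the stored
// tag differs from the expected one, RWARG2 accumulates the total penalty, and the tag is
// rewritten unconditionally.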
552
553
// TODO: Vectorize this...
554
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
555
{
556
const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc);
557
558
const u32 line = GetICacheLine(current_pc);
559
const u32 offset = (line * sizeof(u32));
560
561
cg->xor_(RWARG3, RWARG3);
562
cg->cmp(cg->dword[RXARG1 + offset], tag);
563
cg->mov(cg->dword[RXARG1 + offset], tag);
564
cg->cmovne(RWARG3, RWARG4);
565
cg->add(RWARG2, RWARG3);
566
}
567
568
cg->add(cg->dword[PTR(&g_state.pending_ticks)], RWARG2);
569
}
570
}
571
572
void CPU::X64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
573
s32 arg3reg /*= -1*/)
574
{
575
if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.getIdx()))
576
cg->mov(RXARG1, Reg64(arg1reg));
577
if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.getIdx()))
578
cg->mov(RXARG2, Reg64(arg2reg));
579
if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.getIdx()))
580
cg->mov(RXARG3, Reg64(arg3reg));
581
cg->call(func);
582
}
583
584
void CPU::X64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
585
{
586
if (newpc.has_value())
587
{
588
if (m_dirty_pc || m_compiler_pc != newpc)
589
cg->mov(cg->dword[PTR(&g_state.pc)], newpc.value());
590
}
591
m_dirty_pc = false;
592
593
// flush regs
594
Flush(FLUSH_END_BLOCK);
595
EndAndLinkBlock(newpc, do_event_test, false);
596
}
597
598
void CPU::X64Recompiler::EndBlockWithException(Exception excode)
599
{
600
// flush regs, but not pc, it's going to get overwritten
601
// flush cycles because of the GTE instruction stuff...
602
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
603
604
// TODO: flush load delay
605
606
cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
607
inst->cop.cop_n));
608
cg->mov(RWARG2, m_current_instruction_pc);
609
610
if (excode != Exception::BP)
611
{
612
cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
613
}
614
else
615
{
616
cg->mov(RWARG3, inst->bits);
617
cg->call(&CPU::RaiseBreakException);
618
}
619
620
m_dirty_pc = false;
621
622
EndAndLinkBlock(std::nullopt, true, false);
623
}
624
625
void CPU::X64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
626
{
627
// event test
628
// pc should've been flushed
629
DebugAssert(!m_dirty_pc && !m_block_ended);
630
m_block_ended = true;
631
632
// TODO: try extracting this to a function
633
634
// save cycles for event test
635
const TickCount cycles = std::exchange(m_cycles, 0);
636
637
// fast path when not doing an event test
638
if (!do_event_test && m_gte_done_cycle <= cycles)
639
{
640
if (cycles == 1)
641
cg->inc(cg->dword[PTR(&g_state.pending_ticks)]);
642
else if (cycles > 0)
643
cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles);
644
645
if (force_run_events)
646
{
647
cg->jmp(CodeCache::g_run_events_and_dispatch);
648
return;
649
}
650
}
651
else
652
{
653
// pending_ticks += cycles
654
// if (pending_ticks >= downcount) { dispatch_event(); }
655
if (do_event_test || cycles > 0 || m_gte_done_cycle > cycles)
656
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
657
if (cycles > 0)
658
cg->add(RWARG1, cycles);
659
if (m_gte_done_cycle > cycles)
660
{
661
cg->mov(RWARG2, RWARG1);
662
((m_gte_done_cycle - cycles) == 1) ? cg->inc(RWARG2) : cg->add(RWARG2, m_gte_done_cycle - cycles);
663
cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG2);
664
}
665
if (do_event_test)
666
cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
667
if (cycles > 0)
668
cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
669
if (do_event_test)
670
cg->jge(CodeCache::g_run_events_and_dispatch);
671
}
672
673
// jump to dispatcher or next block
674
if (!newpc.has_value())
675
{
676
cg->jmp(CodeCache::g_dispatcher);
677
}
678
else
679
{
680
const void* target = (newpc.value() == m_block->pc) ?
681
CodeCache::CreateSelfBlockLink(m_block, cg->getCurr<void*>(), cg->getCode()) :
682
CodeCache::CreateBlockLink(m_block, cg->getCurr<void*>(), newpc.value());
683
cg->jmp(target, CodeGenerator::T_NEAR);
684
}
685
}
686
687
const void* CPU::X64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
688
{
689
const void* code = m_emitter->getCode();
690
*code_size = static_cast<u32>(m_emitter->getSize());
691
*far_code_size = static_cast<u32>(m_far_emitter->getSize());
692
cg = nullptr;
693
m_far_emitter.reset();
694
m_emitter.reset();
695
return code;
696
}
697
698
const void* CPU::X64Recompiler::GetCurrentCodePointer()
699
{
700
return cg->getCurr();
701
}
702
703
const char* CPU::X64Recompiler::GetHostRegName(u32 reg) const
704
{
705
static constexpr std::array<const char*, 16> reg64_names = {
706
{"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}};
707
return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
708
}
709
710
void CPU::X64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
711
{
712
cg->mov(Reg32(reg), val);
713
}
714
715
void CPU::X64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
716
{
717
cg->mov(Reg32(reg), cg->dword[PTR(ptr)]);
718
}
719
720
void CPU::X64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
721
{
722
cg->mov(cg->dword[PTR(ptr)], Reg32(reg));
723
}
724
725
void CPU::X64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
726
{
727
cg->mov(cg->dword[PTR(ptr)], val);
728
}
729
730
void CPU::X64Recompiler::CopyHostReg(u32 dst, u32 src)
731
{
732
if (src != dst)
733
cg->mov(Reg32(dst), Reg32(src));
734
}
735
736
Xbyak::Address CPU::X64Recompiler::MipsPtr(Reg r) const
737
{
738
DebugAssert(r < Reg::count);
739
return cg->dword[PTR(&g_state.regs.r[static_cast<u32>(r)])];
740
}
741
742
Xbyak::Reg32 CPU::X64Recompiler::CFGetRegD(CompileFlags cf) const
743
{
744
DebugAssert(cf.valid_host_d);
745
return Reg32(cf.host_d);
746
}
747
748
Xbyak::Reg32 CPU::X64Recompiler::CFGetRegS(CompileFlags cf) const
749
{
750
DebugAssert(cf.valid_host_s);
751
return Reg32(cf.host_s);
752
}
753
754
Xbyak::Reg32 CPU::X64Recompiler::CFGetRegT(CompileFlags cf) const
755
{
756
DebugAssert(cf.valid_host_t);
757
return Reg32(cf.host_t);
758
}
759
760
Xbyak::Reg32 CPU::X64Recompiler::CFGetRegLO(CompileFlags cf) const
761
{
762
DebugAssert(cf.valid_host_lo);
763
return Reg32(cf.host_lo);
764
}
765
766
Xbyak::Reg32 CPU::X64Recompiler::CFGetRegHI(CompileFlags cf) const
767
{
768
DebugAssert(cf.valid_host_hi);
769
return Reg32(cf.host_hi);
770
}
771
772
Xbyak::Reg32 CPU::X64Recompiler::MoveSToD(CompileFlags cf)
773
{
774
DebugAssert(cf.valid_host_d);
775
DebugAssert(!cf.valid_host_t || cf.host_t != cf.host_d);
776
777
const Reg32 rd = CFGetRegD(cf);
778
MoveSToReg(rd, cf);
779
780
return rd;
781
}
782
783
Xbyak::Reg32 CPU::X64Recompiler::MoveSToT(CompileFlags cf)
784
{
785
DebugAssert(cf.valid_host_t);
786
787
const Reg32 rt = CFGetRegT(cf);
788
if (cf.valid_host_s)
789
{
790
const Reg32 rs = CFGetRegS(cf);
791
if (rt != rs)
792
cg->mov(rt, rs);
793
}
794
else if (cf.const_s)
795
{
796
if (const u32 cv = GetConstantRegU32(cf.MipsS()); cv != 0)
797
cg->mov(rt, cv);
798
else
799
cg->xor_(rt, rt);
800
}
801
else
802
{
803
cg->mov(rt, MipsPtr(cf.MipsS()));
804
}
805
806
return rt;
807
}
808
809
Xbyak::Reg32 CPU::X64Recompiler::MoveTToD(CompileFlags cf)
810
{
811
DebugAssert(cf.valid_host_d);
812
DebugAssert(!cf.valid_host_s || cf.host_s != cf.host_d);
813
814
const Reg32 rd = CFGetRegD(cf);
815
MoveTToReg(rd, cf);
816
return rd;
817
}
818
819
void CPU::X64Recompiler::MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf)
820
{
821
if (cf.valid_host_s)
822
{
823
if (cf.host_s != static_cast<u32>(dst.getIdx()))
824
cg->mov(dst, Reg32(cf.host_s));
825
}
826
else if (cf.const_s)
827
{
828
const u32 cv = GetConstantRegU32(cf.MipsS());
829
if (cv == 0)
830
cg->xor_(dst, dst);
831
else
832
cg->mov(dst, cv);
833
}
834
else
835
{
836
cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_s])]);
837
}
838
}
839
840
void CPU::X64Recompiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf)
841
{
842
if (cf.valid_host_t)
843
{
844
if (cf.host_t != static_cast<u32>(dst.getIdx()))
845
cg->mov(dst, Reg32(cf.host_t));
846
}
847
else if (cf.const_t)
848
{
849
const u32 cv = GetConstantRegU32(cf.MipsT());
850
if (cv == 0)
851
cg->xor_(dst, dst);
852
else
853
cg->mov(dst, cv);
854
}
855
else
856
{
857
cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_t])]);
858
}
859
}
860
861
void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg)
862
{
863
DebugAssert(reg < Reg::count);
864
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
865
cg->mov(dst, Reg32(hreg.value()));
866
else if (HasConstantReg(reg))
867
cg->mov(dst, GetConstantRegU32(reg));
868
else
869
cg->mov(dst, MipsPtr(reg));
870
}
871
872
void CPU::X64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
873
Reg arg3reg /* = Reg::count */)
874
{
875
DebugAssert(g_settings.gpu_pgxp_enable);
876
877
Flush(FLUSH_FOR_C_CALL);
878
879
if (arg2reg != Reg::count)
880
MoveMIPSRegToReg(RWARG2, arg2reg);
881
if (arg3reg != Reg::count)
882
MoveMIPSRegToReg(RWARG3, arg3reg);
883
884
cg->mov(RWARG1, arg1val);
885
cg->call(func);
886
}
887
888
void CPU::X64Recompiler::Flush(u32 flags)
889
{
890
Recompiler::Flush(flags);
891
892
if (flags & FLUSH_PC && m_dirty_pc)
893
{
894
cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
895
m_dirty_pc = false;
896
}
897
898
if (flags & FLUSH_INSTRUCTION_BITS)
899
{
900
cg->mov(cg->dword[PTR(&g_state.current_instruction.bits)], inst->bits);
901
cg->mov(cg->dword[PTR(&g_state.current_instruction_pc)], m_current_instruction_pc);
902
cg->mov(cg->byte[PTR(&g_state.current_instruction_in_branch_delay_slot)], m_current_instruction_branch_delay_slot);
903
}
904
905
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
906
{
907
// This sucks :(
908
// TODO: make it a function?
909
cg->movzx(RWARG1, cg->byte[PTR(&g_state.load_delay_reg)]);
910
cg->mov(RWARG2, cg->dword[PTR(&g_state.load_delay_value)]);
911
cg->mov(cg->dword[PTR(&g_state.regs.r[0]) + RXARG1 * 4], RWARG2);
912
cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(Reg::count));
913
m_load_delay_dirty = false;
914
}
915
916
if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
917
{
918
if (m_load_delay_value_register != NUM_HOST_REGS)
919
FreeHostReg(m_load_delay_value_register);
920
921
cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(m_load_delay_register));
922
m_load_delay_register = Reg::count;
923
m_load_delay_dirty = true;
924
}
925
926
if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
927
{
928
// May as well flush cycles while we're here.
929
// GTE spanning blocks is very rare, we _could_ disable this for speed.
930
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
931
cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_completion_tick)]);
932
if (m_cycles > 0)
933
{
934
(m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles);
935
m_cycles = 0;
936
}
937
cg->cmp(RWARG2, RWARG1);
938
cg->cmova(RWARG1, RWARG2);
939
cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
940
m_dirty_gte_done_cycle = false;
941
}
942
943
if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
944
{
945
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
946
947
// update cycles at the same time
948
if (flags & FLUSH_CYCLES && m_cycles > 0)
949
{
950
(m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles);
951
cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
952
m_gte_done_cycle -= m_cycles;
953
m_cycles = 0;
954
}
955
956
(m_gte_done_cycle == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_gte_done_cycle);
957
cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG1);
958
m_gte_done_cycle = 0;
959
m_dirty_gte_done_cycle = true;
960
}
961
962
if (flags & FLUSH_CYCLES && m_cycles > 0)
963
{
964
(m_cycles == 1) ? cg->inc(cg->dword[PTR(&g_state.pending_ticks)]) :
965
cg->add(cg->dword[PTR(&g_state.pending_ticks)], m_cycles);
966
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
967
m_cycles = 0;
968
}
969
}
970
971
void CPU::X64Recompiler::Compile_Fallback()
972
{
973
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
974
inst->bits);
975
976
Flush(FLUSH_FOR_INTERPRETER);
977
978
cg->call(&CPU::RecompilerThunks::InterpretInstruction);
979
980
// TODO: make me less garbage
981
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
982
// but nothing should be going through here..
983
Label no_load_delay;
984
cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
985
cg->cmp(RWARG1, static_cast<u8>(Reg::count));
986
cg->je(no_load_delay, CodeGenerator::T_SHORT);
987
cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
988
cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1);
989
cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
990
cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u32>(Reg::count));
991
cg->L(no_load_delay);
992
993
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
994
}
995
996
void CPU::X64Recompiler::CheckBranchTarget(const Xbyak::Reg32& pcreg)
997
{
998
if (!g_settings.cpu_recompiler_memory_exceptions)
999
return;
1000
1001
cg->test(pcreg, 0x3);
1002
SwitchToFarCode(true, &CodeGenerator::jnz);
1003
1004
BackupHostState();
1005
EndBlockWithException(Exception::AdEL);
1006
1007
RestoreHostState();
1008
SwitchToNearCode(false);
1009
}
1010
1011
void CPU::X64Recompiler::Compile_jr(CompileFlags cf)
1012
{
1013
if (!cf.valid_host_s)
1014
cg->mov(RWARG1, MipsPtr(cf.MipsS()));
1015
1016
const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1017
CheckBranchTarget(pcreg);
1018
1019
cg->mov(cg->dword[PTR(&g_state.pc)], pcreg);
1020
1021
CompileBranchDelaySlot(false);
1022
EndBlock(std::nullopt, true);
1023
}
1024
1025
void CPU::X64Recompiler::Compile_jalr(CompileFlags cf)
1026
{
1027
if (!cf.valid_host_s)
1028
cg->mov(RWARG1, MipsPtr(cf.MipsS()));
1029
1030
const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1031
1032
if (MipsD() != Reg::zero)
1033
SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
1034
1035
CheckBranchTarget(pcreg);
1036
cg->mov(cg->dword[PTR(&g_state.pc)], pcreg);
1037
1038
CompileBranchDelaySlot(false);
1039
EndBlock(std::nullopt, true);
1040
}
1041
1042
void CPU::X64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
1043
{
1044
const u32 taken_pc = GetConditionalBranchTarget(cf);
1045
1046
Flush(FLUSH_FOR_BRANCH);
1047
1048
DebugAssert(cf.valid_host_s);
1049
1050
// MipsT() here should equal zero for zero branches.
1051
DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
1052
1053
// TODO: Swap this back to near once instructions don't blow up
1054
constexpr CodeGenerator::LabelType type = CodeGenerator::T_NEAR;
1055
Label taken;
1056
switch (cond)
1057
{
1058
case BranchCondition::Equal:
1059
case BranchCondition::NotEqual:
1060
{
1061
// we should always have S, maybe not T
1062
// TODO: if it's zero, we can just do test rs, rs
1063
if (cf.valid_host_t)
1064
cg->cmp(CFGetRegS(cf), CFGetRegT(cf));
1065
else if (cf.const_t)
1066
cg->cmp(CFGetRegS(cf), GetConstantRegU32(cf.MipsT()));
1067
else
1068
cg->cmp(CFGetRegS(cf), MipsPtr(cf.MipsT()));
1069
1070
(cond == BranchCondition::Equal) ? cg->je(taken, type) : cg->jne(taken, type);
1071
}
1072
break;
1073
1074
case BranchCondition::GreaterThanZero:
1075
{
1076
cg->cmp(CFGetRegS(cf), 0);
1077
cg->jg(taken, type);
1078
}
1079
break;
1080
1081
case BranchCondition::GreaterEqualZero:
1082
{
1083
cg->test(CFGetRegS(cf), CFGetRegS(cf));
1084
cg->jns(taken, type);
1085
}
1086
break;
1087
1088
case BranchCondition::LessThanZero:
1089
{
1090
cg->test(CFGetRegS(cf), CFGetRegS(cf));
1091
cg->js(taken, type);
1092
}
1093
break;
1094
1095
case BranchCondition::LessEqualZero:
1096
{
1097
cg->cmp(CFGetRegS(cf), 0);
1098
cg->jle(taken, type);
1099
}
1100
break;
1101
}
1102
1103
BackupHostState();
1104
if (!cf.delay_slot_swapped)
1105
CompileBranchDelaySlot();
1106
1107
EndBlock(m_compiler_pc, true);
1108
1109
cg->L(taken);
1110
1111
RestoreHostState();
1112
if (!cf.delay_slot_swapped)
1113
CompileBranchDelaySlot();
1114
1115
EndBlock(taken_pc, true);
1116
}
1117
1118
void CPU::X64Recompiler::Compile_addi(CompileFlags cf)
1119
{
1120
const Reg32 rt = MoveSToT(cf);
1121
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1122
{
1123
cg->add(rt, imm);
1124
if (g_settings.cpu_recompiler_memory_exceptions)
1125
{
1126
DebugAssert(cf.valid_host_t);
1127
TestOverflow(rt);
1128
}
1129
}
1130
}
1131
1132
void CPU::X64Recompiler::Compile_addiu(CompileFlags cf)
1133
{
1134
const Reg32 rt = MoveSToT(cf);
1135
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1136
cg->add(rt, imm);
1137
}
1138
1139
void CPU::X64Recompiler::Compile_slti(CompileFlags cf)
1140
{
1141
Compile_slti(cf, true);
1142
}
1143
1144
void CPU::X64Recompiler::Compile_sltiu(CompileFlags cf)
1145
{
1146
Compile_slti(cf, false);
1147
}
1148
1149
void CPU::X64Recompiler::Compile_slti(CompileFlags cf, bool sign)
1150
{
1151
const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
1152
1153
// Case where T == S, can't use xor because it changes flags
1154
if (!cf.valid_host_t || !cf.valid_host_s || cf.host_t != cf.host_s)
1155
cg->xor_(rt, rt);
1156
1157
if (cf.valid_host_s)
1158
cg->cmp(CFGetRegS(cf), inst->i.imm_sext32());
1159
else
1160
cg->cmp(MipsPtr(cf.MipsS()), inst->i.imm_sext32());
1161
1162
if (cf.valid_host_t && cf.valid_host_s && cf.host_t == cf.host_s)
1163
cg->mov(rt, 0);
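// setl consumes the signed comparison for slti, setb the unsigned one for sltiu; the mov above
// (rather than xor) is used when rt aliases rs so the flags from the cmp are preserved.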
1164
1165
sign ? cg->setl(rt.cvt8()) : cg->setb(rt.cvt8());
1166
1167
if (!cf.valid_host_t)
1168
cg->mov(MipsPtr(cf.MipsT()), rt);
1169
}
1170
1171
void CPU::X64Recompiler::Compile_andi(CompileFlags cf)
1172
{
1173
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1174
{
1175
const Reg32 rt = MoveSToT(cf);
1176
cg->and_(rt, imm);
1177
}
1178
else
1179
{
1180
const Reg32 rt = CFGetRegT(cf);
1181
cg->xor_(rt, rt);
1182
}
1183
}
1184
1185
void CPU::X64Recompiler::Compile_ori(CompileFlags cf)
1186
{
1187
const Reg32 rt = MoveSToT(cf);
1188
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1189
cg->or_(rt, imm);
1190
}
1191
1192
void CPU::X64Recompiler::Compile_xori(CompileFlags cf)
1193
{
1194
const Reg32 rt = MoveSToT(cf);
1195
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1196
cg->xor_(rt, imm);
1197
}
1198
1199
void CPU::X64Recompiler::Compile_sll(CompileFlags cf)
1200
{
1201
const Reg32 rd = MoveTToD(cf);
1202
if (inst->r.shamt > 0)
1203
cg->shl(rd, inst->r.shamt);
1204
}
1205
1206
void CPU::X64Recompiler::Compile_srl(CompileFlags cf)
1207
{
1208
const Reg32 rd = MoveTToD(cf);
1209
if (inst->r.shamt > 0)
1210
cg->shr(rd, inst->r.shamt);
1211
}
1212
1213
void CPU::X64Recompiler::Compile_sra(CompileFlags cf)
1214
{
1215
const Reg32 rd = MoveTToD(cf);
1216
if (inst->r.shamt > 0)
1217
cg->sar(rd, inst->r.shamt);
1218
}
1219
1220
void CPU::X64Recompiler::Compile_variable_shift(CompileFlags cf,
1221
void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&,
1222
const Xbyak::Reg8&),
1223
void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int))
1224
{
1225
const Reg32 rd = CFGetRegD(cf);
1226
if (!cf.const_s)
1227
{
1228
MoveSToReg(cg->ecx, cf);
1229
MoveTToReg(rd, cf);
1230
(cg->*op)(rd, cg->cl);
1231
}
1232
else
1233
{
1234
MoveTToReg(rd, cf);
1235
(cg->*op_const)(rd, GetConstantRegU32(cf.MipsS()));
1236
}
1237
}
1238
1239
void CPU::X64Recompiler::Compile_sllv(CompileFlags cf)
1240
{
1241
Compile_variable_shift(cf, &CodeGenerator::shl, &CodeGenerator::shl);
1242
}
1243
1244
void CPU::X64Recompiler::Compile_srlv(CompileFlags cf)
1245
{
1246
Compile_variable_shift(cf, &CodeGenerator::shr, &CodeGenerator::shr);
1247
}
1248
1249
void CPU::X64Recompiler::Compile_srav(CompileFlags cf)
1250
{
1251
Compile_variable_shift(cf, &CodeGenerator::sar, &CodeGenerator::sar);
1252
}
1253
1254
void CPU::X64Recompiler::Compile_mult(CompileFlags cf, bool sign)
1255
{
1256
// RAX/RDX shouldn't be allocatable..
1257
DebugAssert(!(m_host_regs[Xbyak::Operand::RAX].flags & HR_USABLE) &&
1258
!(m_host_regs[Xbyak::Operand::RDX].flags & HR_USABLE));
1259
1260
MoveSToReg(cg->eax, cf);
1261
if (cf.valid_host_t)
1262
{
1263
sign ? cg->imul(CFGetRegT(cf)) : cg->mul(CFGetRegT(cf));
1264
}
1265
else if (cf.const_t)
1266
{
1267
cg->mov(cg->edx, GetConstantRegU32(cf.MipsT()));
1268
sign ? cg->imul(cg->edx) : cg->mul(cg->edx);
1269
}
1270
else
1271
{
1272
sign ? cg->imul(MipsPtr(cf.MipsT())) : cg->mul(MipsPtr(cf.MipsT()));
1273
}
1274
1275
// TODO: skip writeback if it's not needed
1276
if (cf.valid_host_lo)
1277
cg->mov(CFGetRegLO(cf), cg->eax);
1278
else
1279
cg->mov(MipsPtr(Reg::lo), cg->eax);
1280
if (cf.valid_host_hi)
1281
cg->mov(CFGetRegHI(cf), cg->edx);
1282
else
1283
cg->mov(MipsPtr(Reg::hi), cg->edx);
1284
}
1285
1286
void CPU::X64Recompiler::Compile_mult(CompileFlags cf)
1287
{
1288
Compile_mult(cf, true);
1289
}
1290
1291
void CPU::X64Recompiler::Compile_multu(CompileFlags cf)
1292
{
1293
Compile_mult(cf, false);
1294
}
1295
1296
void CPU::X64Recompiler::Compile_div(CompileFlags cf)
1297
{
1298
// not supported without registers for now..
1299
DebugAssert(cf.valid_host_lo && cf.valid_host_hi);
1300
1301
const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx;
1302
if (!cf.valid_host_t)
1303
MoveTToReg(rt, cf);
1304
1305
const Reg32 rlo = CFGetRegLO(cf);
1306
const Reg32 rhi = CFGetRegHI(cf);
1307
1308
MoveSToReg(cg->eax, cf);
1309
cg->cdq();
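// MIPS DIV never traps: divide-by-zero yields lo = (rs >= 0 ? -1 : 1) and hi = rs, while
// 0x80000000 / -1 yields lo = 0x80000000 and hi = 0. Both cases are special-cased below because
// idiv would raise #DE on them.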
1310
1311
Label done;
1312
Label not_divide_by_zero;
1313
cg->test(rt, rt);
1314
cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT);
1315
cg->test(cg->eax, cg->eax);
1316
cg->mov(rhi, cg->eax); // hi = num
1317
cg->mov(rlo, 1);
1318
cg->mov(cg->eax, static_cast<u32>(-1));
1319
cg->cmovns(rlo, cg->eax); // lo = s >= 0 ? -1 : 1
1320
cg->jmp(done, CodeGenerator::T_SHORT);
1321
1322
cg->L(not_divide_by_zero);
1323
Label not_unrepresentable;
1324
cg->cmp(cg->eax, 0x80000000u);
1325
cg->jne(not_unrepresentable, CodeGenerator::T_SHORT);
1326
cg->cmp(rt, static_cast<u32>(-1));
1327
cg->jne(not_unrepresentable, CodeGenerator::T_SHORT);
1328
1329
cg->mov(rlo, 0x80000000u);
1330
cg->xor_(rhi, rhi);
1331
cg->jmp(done, CodeGenerator::T_SHORT);
1332
1333
cg->L(not_unrepresentable);
1334
1335
cg->idiv(rt);
1336
cg->mov(rlo, cg->eax);
1337
cg->mov(rhi, cg->edx);
1338
1339
cg->L(done);
1340
}
1341
1342
void CPU::X64Recompiler::Compile_divu(CompileFlags cf)
1343
{
1344
// not supported without registers for now..
1345
DebugAssert(cf.valid_host_lo && cf.valid_host_hi);
1346
1347
const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx;
1348
if (!cf.valid_host_t)
1349
MoveTToReg(rt, cf);
1350
1351
const Reg32 rlo = CFGetRegLO(cf);
1352
const Reg32 rhi = CFGetRegHI(cf);
1353
1354
MoveSToReg(cg->eax, cf);
1355
cg->xor_(cg->edx, cg->edx);
1356
1357
Label done;
1358
Label not_divide_by_zero;
1359
cg->test(rt, rt);
1360
cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT);
1361
cg->mov(rlo, static_cast<u32>(-1));
1362
cg->mov(rhi, cg->eax);
1363
cg->jmp(done, CodeGenerator::T_SHORT);
1364
1365
cg->L(not_divide_by_zero);
1366
cg->div(rt);
1367
cg->mov(rlo, cg->eax);
1368
cg->mov(rhi, cg->edx);
1369
1370
cg->L(done);
1371
}
1372
1373
void CPU::X64Recompiler::TestOverflow(const Xbyak::Reg32& result)
1374
{
1375
SwitchToFarCode(true, &Xbyak::CodeGenerator::jo);
1376
1377
BackupHostState();
1378
1379
// toss the result
1380
ClearHostReg(result.getIdx());
1381
1382
EndBlockWithException(Exception::Ov);
1383
1384
RestoreHostState();
1385
1386
SwitchToNearCode(false);
1387
}
1388
1389
void CPU::X64Recompiler::Compile_dst_op(CompileFlags cf,
1390
void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&),
1391
void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32),
1392
bool commutative, bool overflow)
1393
{
1394
if (cf.valid_host_s && cf.valid_host_t)
1395
{
1396
if (cf.host_d == cf.host_s)
1397
{
1398
(cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
1399
}
1400
else if (cf.host_d == cf.host_t)
1401
{
1402
if (commutative)
1403
{
1404
(cg->*op)(CFGetRegD(cf), CFGetRegS(cf));
1405
}
1406
else
1407
{
1408
cg->mov(RWARG1, CFGetRegT(cf));
1409
cg->mov(CFGetRegD(cf), CFGetRegS(cf));
1410
(cg->*op)(CFGetRegD(cf), RWARG1);
1411
}
1412
}
1413
else
1414
{
1415
cg->mov(CFGetRegD(cf), CFGetRegS(cf));
1416
(cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
1417
}
1418
}
1419
else if (commutative && (cf.const_s || cf.const_t))
1420
{
1421
const Reg32 rd = CFGetRegD(cf);
1422
(cf.const_s) ? MoveTToReg(rd, cf) : MoveSToReg(rd, cf);
1423
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1424
(cg->*op_const)(CFGetRegD(cf), cv);
1425
else
1426
overflow = false;
1427
}
1428
else if (cf.const_s)
1429
{
1430
// need to backup T?
1431
if (cf.valid_host_d && cf.valid_host_t && cf.host_d == cf.host_t)
1432
{
1433
cg->mov(RWARG1, CFGetRegT(cf));
1434
MoveSToReg(CFGetRegD(cf), cf);
1435
(cg->*op)(CFGetRegD(cf), RWARG1);
1436
}
1437
else
1438
{
1439
MoveSToReg(CFGetRegD(cf), cf);
1440
(cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
1441
}
1442
}
1443
else if (cf.const_t)
1444
{
1445
MoveSToReg(CFGetRegD(cf), cf);
1446
if (const u32 cv = GetConstantRegU32(cf.MipsT()); cv != 0)
1447
(cg->*op_const)(CFGetRegD(cf), cv);
1448
else
1449
overflow = false;
1450
}
1451
else if (cf.valid_host_s)
1452
{
1453
if (cf.host_d != cf.host_s)
1454
cg->mov(CFGetRegD(cf), CFGetRegS(cf));
1455
(cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT()));
1456
}
1457
else if (cf.valid_host_t)
1458
{
1459
if (cf.host_d != cf.host_t)
1460
cg->mov(CFGetRegD(cf), CFGetRegT(cf));
1461
(cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsS()));
1462
}
1463
else
1464
{
1465
cg->mov(CFGetRegD(cf), MipsPtr(cf.MipsS()));
1466
(cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT()));
1467
}
1468
1469
if (overflow)
1470
{
1471
DebugAssert(cf.valid_host_d);
1472
TestOverflow(CFGetRegD(cf));
1473
}
1474
}
1475
1476
void CPU::X64Recompiler::Compile_add(CompileFlags cf)
1477
{
1478
Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, g_settings.cpu_recompiler_memory_exceptions);
1479
}
1480
1481
void CPU::X64Recompiler::Compile_addu(CompileFlags cf)
1482
{
1483
Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, false);
1484
}
1485
1486
void CPU::X64Recompiler::Compile_sub(CompileFlags cf)
1487
{
1488
Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, g_settings.cpu_recompiler_memory_exceptions);
1489
}
1490
1491
void CPU::X64Recompiler::Compile_subu(CompileFlags cf)
1492
{
1493
Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, false);
1494
}
1495
1496
void CPU::X64Recompiler::Compile_and(CompileFlags cf)
1497
{
1498
// special cases - and with self -> self, and with 0 -> 0
1499
const Reg32 regd = CFGetRegD(cf);
1500
if (cf.MipsS() == cf.MipsT())
1501
{
1502
MoveSToReg(regd, cf);
1503
return;
1504
}
1505
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1506
{
1507
cg->xor_(regd, regd);
1508
return;
1509
}
1510
1511
Compile_dst_op(cf, &CodeGenerator::and_, &CodeGenerator::and_, true, false);
1512
}
1513
1514
void CPU::X64Recompiler::Compile_or(CompileFlags cf)
1515
{
1516
// or/nor with 0 -> no effect
1517
const Reg32 regd = CFGetRegD(cf);
1518
if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
1519
{
1520
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1521
return;
1522
}
1523
1524
Compile_dst_op(cf, &CodeGenerator::or_, &CodeGenerator::or_, true, false);
1525
}
1526
1527
void CPU::X64Recompiler::Compile_xor(CompileFlags cf)
1528
{
1529
const Reg32 regd = CFGetRegD(cf);
1530
if (cf.MipsS() == cf.MipsT())
1531
{
1532
// xor with self -> zero
1533
cg->xor_(regd, regd);
1534
return;
1535
}
1536
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1537
{
1538
// xor with zero -> no effect
1539
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1540
return;
1541
}
1542
1543
Compile_dst_op(cf, &CodeGenerator::xor_, &CodeGenerator::xor_, true, false);
1544
}
1545
1546
void CPU::X64Recompiler::Compile_nor(CompileFlags cf)
1547
{
1548
Compile_or(cf);
1549
cg->not_(CFGetRegD(cf));
1550
}
1551
1552
void CPU::X64Recompiler::Compile_slt(CompileFlags cf)
1553
{
1554
Compile_slt(cf, true);
1555
}
1556
1557
void CPU::X64Recompiler::Compile_sltu(CompileFlags cf)
1558
{
1559
Compile_slt(cf, false);
1560
}
1561
1562
void CPU::X64Recompiler::Compile_slt(CompileFlags cf, bool sign)
1563
{
1564
const Reg32 rd = CFGetRegD(cf);
1565
const Reg32 rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1566
const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
1567
if (!cf.valid_host_s)
1568
MoveSToReg(rs, cf);
1569
1570
// Case where D == S, can't use xor because it changes flags
1571
// TODO: swap and reverse op for constants
1572
if (rd != rs && rd != rt)
1573
cg->xor_(rd, rd);
1574
1575
if (cf.valid_host_t)
1576
cg->cmp(rs, CFGetRegT(cf));
1577
else if (cf.const_t)
1578
cg->cmp(rs, GetConstantRegU32(cf.MipsT()));
1579
else
1580
cg->cmp(rs, MipsPtr(cf.MipsT()));
1581
1582
if (rd == rs || rd == rt)
1583
cg->mov(rd, 0);
1584
1585
sign ? cg->setl(rd.cvt8()) : cg->setb(rd.cvt8());
1586
}
1587
1588
Xbyak::Reg32
1589
CPU::X64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
1590
const std::optional<const Xbyak::Reg32>& reg /* = std::nullopt */)
1591
{
1592
const u32 imm = inst->i.imm_sext32();
1593
if (cf.valid_host_s && imm == 0 && !reg.has_value())
1594
return CFGetRegS(cf);
1595
1596
const Reg32 dst = reg.has_value() ? reg.value() : RWARG1;
1597
if (address.has_value())
1598
{
1599
cg->mov(dst, address.value());
1600
}
1601
else
1602
{
1603
if (cf.valid_host_s)
1604
{
1605
if (const Reg32 src = CFGetRegS(cf); src != dst)
1606
cg->mov(dst, CFGetRegS(cf));
1607
}
1608
else
1609
{
1610
cg->mov(dst, MipsPtr(cf.MipsS()));
1611
}
1612
1613
if (imm != 0)
1614
cg->add(dst, inst->i.imm_sext32());
1615
}
1616
1617
return dst;
1618
}
1619
1620
template<typename RegAllocFn>
1621
Xbyak::Reg32 CPU::X64Recompiler::GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign,
1622
bool use_fastmem, const RegAllocFn& dst_reg_alloc)
1623
{
1624
if (use_fastmem)
1625
{
1626
m_cycles += Bus::RAM_READ_TICKS;
1627
1628
const Reg32 dst = dst_reg_alloc();
1629
1630
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
1631
{
1632
DebugAssert(addr_reg != RWARG3);
1633
cg->mov(RWARG3, addr_reg.cvt32());
1634
cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT);
1635
cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]);
1636
}
1637
1638
const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE;
1639
u8* start = cg->getCurr<u8*>();
1640
switch (size)
1641
{
1642
case MemoryAccessSize::Byte:
1643
{
1644
sign ? cg->movsx(dst, cg->byte[membase + addr_reg.cvt64()]) :
1645
cg->movzx(dst, cg->byte[membase + addr_reg.cvt64()]);
1646
}
1647
break;
1648
1649
case MemoryAccessSize::HalfWord:
1650
{
1651
sign ? cg->movsx(dst, cg->word[membase + addr_reg.cvt64()]) :
1652
cg->movzx(dst, cg->word[membase + addr_reg.cvt64()]);
1653
}
1654
break;
1655
1656
case MemoryAccessSize::Word:
1657
{
1658
cg->mov(dst, cg->dword[membase + addr_reg.cvt64()]);
1659
}
1660
break;
1661
}
1662
1663
u8* end = cg->getCurr<u8*>();
1664
while ((end - start) < BACKPATCH_JMP_SIZE)
1665
{
1666
cg->nop();
1667
end = cg->getCurr<u8*>();
1668
}
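// Padding the access up to BACKPATCH_JMP_SIZE means that, if it faults, it can later be
// overwritten in place with a 5-byte jmp to a slower handler (presumably why BACKPATCH_JMP_SIZE
// matches EmitJump's size); AddLoadStoreInfo() below records the registers, size and signedness
// such a backpatch needs.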
1669
1670
AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()),
1671
static_cast<u32>(dst.getIdx()), size, sign, true);
1672
return dst;
1673
}
1674
1675
if (addr_reg != RWARG1)
1676
cg->mov(RWARG1, addr_reg);
1677
1678
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1679
switch (size)
1680
{
1681
case MemoryAccessSize::Byte:
1682
{
1683
cg->call(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryByte) :
1684
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte));
1685
}
1686
break;
1687
case MemoryAccessSize::HalfWord:
1688
{
1689
cg->call(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryHalfWord) :
1690
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord));
1691
}
1692
break;
1693
case MemoryAccessSize::Word:
1694
{
1695
cg->call(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryWord) :
1696
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord));
1697
}
1698
break;
1699
}
1700
1701
// TODO: turn this into an asm function instead
1702
if (checked)
1703
{
1704
cg->test(RXRET, RXRET);
1705
1706
BackupHostState();
1707
SwitchToFarCode(true, &CodeGenerator::js);
1708
1709
// flush regs, but not pc, it's going to get overwritten
1710
// flush cycles because of the GTE instruction stuff...
1711
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
1712
1713
// cause_bits = (-result << 2) | BD | cop_n
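// The checked read thunks appear to return a negative value on failure whose magnitude is the
// Exception code (hence the sign test above); negating and shifting it left by 2 places it in the
// Excode field of CAUSE, which is then OR'd with the BD/cop_n bits.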
1714
cg->mov(RWARG1, RWRET);
1715
cg->neg(RWARG1);
1716
cg->shl(RWARG1, 2);
1717
cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException(
1718
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
1719
cg->mov(RWARG2, m_current_instruction_pc);
1720
cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
1721
m_dirty_pc = false;
1722
EndAndLinkBlock(std::nullopt, true, false);
1723
1724
SwitchToNearCode(false);
1725
RestoreHostState();
1726
}
1727
1728
const Xbyak::Reg32 dst_reg = dst_reg_alloc();
1729
switch (size)
1730
{
1731
case MemoryAccessSize::Byte:
1732
{
1733
sign ? cg->movsx(dst_reg, RWRET.cvt8()) : cg->movzx(dst_reg, RWRET.cvt8());
1734
}
1735
break;
1736
case MemoryAccessSize::HalfWord:
1737
{
1738
sign ? cg->movsx(dst_reg, RWRET.cvt16()) : cg->movzx(dst_reg, RWRET.cvt16());
1739
}
1740
break;
1741
case MemoryAccessSize::Word:
1742
{
1743
if (dst_reg != RWRET)
1744
cg->mov(dst_reg, RWRET);
1745
}
1746
break;
1747
}
1748
1749
return dst_reg;
1750
}
1751
1752
void CPU::X64Recompiler::GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg,
1753
MemoryAccessSize size, bool use_fastmem)
1754
{
1755
if (use_fastmem)
1756
{
1757
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
1758
{
1759
DebugAssert(addr_reg != RWARG3 && value_reg != RWARG3);
1760
cg->mov(RWARG3, addr_reg.cvt32());
1761
cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT);
1762
cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]);
1763
}
1764
1765
const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE;
1766
u8* start = cg->getCurr<u8*>();
1767
switch (size)
1768
{
1769
case MemoryAccessSize::Byte:
1770
cg->mov(cg->byte[membase + addr_reg.cvt64()], value_reg.cvt8());
1771
break;
1772
1773
case MemoryAccessSize::HalfWord:
1774
cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt16());
1775
break;
1776
1777
case MemoryAccessSize::Word:
1778
cg->mov(cg->dword[membase + addr_reg.cvt64()], value_reg.cvt32());
1779
break;
1780
}
1781
1782
u8* end = cg->getCurr<u8*>();
1783
while ((end - start) < BACKPATCH_JMP_SIZE)
1784
{
1785
cg->nop();
1786
end = cg->getCurr<u8*>();
1787
}
1788
1789
AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()),
1790
static_cast<u32>(value_reg.getIdx()), size, false, false);
1791
return;
1792
}
1793
1794
if (addr_reg != RWARG1)
1795
cg->mov(RWARG1, addr_reg);
1796
if (value_reg != RWARG2)
1797
cg->mov(RWARG2, value_reg);
1798
1799
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1800
switch (size)
1801
{
1802
case MemoryAccessSize::Byte:
1803
{
1804
cg->call(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryByte) :
1805
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
1806
}
1807
break;
1808
case MemoryAccessSize::HalfWord:
1809
{
1810
cg->call(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryHalfWord) :
1811
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
1812
}
1813
break;
1814
case MemoryAccessSize::Word:
1815
{
1816
cg->call(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryWord) :
1817
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
1818
}
1819
break;
1820
}
1821
1822
// TODO: turn this into an asm function instead
1823
if (checked)
1824
{
1825
cg->test(RWRET, RWRET);
1826
1827
BackupHostState();
1828
SwitchToFarCode(true, &CodeGenerator::jnz);
1829
1830
// flush regs, but not pc, it's going to get overwritten
1831
// flush cycles because of the GTE instruction stuff...
1832
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
1833
1834
// cause_bits = (result << 2) | BD | cop_n
1835
cg->mov(RWARG1, RWRET);
1836
cg->shl(RWARG1, 2);
1837
cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException(
1838
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
1839
cg->mov(RWARG2, m_current_instruction_pc);
1840
cg->call(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
1841
m_dirty_pc = false;
1842
EndAndLinkBlock(std::nullopt, true, false);
1843
1844
SwitchToNearCode(false);
1845
RestoreHostState();
1846
}
1847
}
1848
1849
void CPU::X64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
1850
const std::optional<VirtualMemoryAddress>& address)
1851
{
1852
const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
1853
std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
1854
std::optional<Reg32>();
1855
FlushForLoadStore(address, false, use_fastmem);
1856
const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
1857
1858
const Reg32 data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
1859
if (cf.MipsT() == Reg::zero)
1860
return RWRET;
1861
1862
return Reg32(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
1863
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
1864
});
1865
1866
if (g_settings.gpu_pgxp_enable)
1867
{
1868
Flush(FLUSH_FOR_C_CALL);
1869
1870
cg->mov(RWARG1, inst->bits);
1871
cg->mov(RWARG2, addr);
1872
cg->mov(RWARG3, data);
1873
cg->call(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
1874
FreeHostReg(addr_reg.value().getIdx());
1875
}
1876
}
1877
1878
void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                     const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);
  cg->mov(RWARG1, addr);
  cg->and_(RWARG1, ~0x3u);
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.getIdx());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush
  const Reg rt = inst->r.rt;
  Reg32 value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = Reg32(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = Reg32(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (HasConstantReg(rt))
        cg->mov(value, GetConstantRegU32(rt));
      else if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        cg->mov(value, Reg32(rtreg.value()));
      else
        cg->mov(value, MipsPtr(rt));
    }
    else
    {
      value = Reg32(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value != cg->ecx);
  cg->mov(cg->ecx, addr);
  cg->and_(cg->ecx, 3);
  cg->shl(cg->ecx, 3); // *8

  // TODO for other arch: reverse subtract
  DebugAssert(RWARG2 != cg->ecx);
  cg->mov(RWARG2, 24);
  cg->sub(RWARG2, cg->ecx);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    cg->mov(RWARG3, 0xFFFFFFu);
    cg->shr(RWARG3, cg->cl);
    cg->and_(value, RWARG3);
    cg->mov(cg->ecx, RWARG2);
    cg->shl(RWRET, cg->cl);
    cg->or_(value, RWRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    cg->shr(RWRET, cg->cl);
    cg->mov(RWARG3, 0xFFFFFF00u);
    cg->mov(cg->ecx, RWARG2);
    cg->shl(RWARG3, cg->cl);
    cg->and_(value, RWARG3);
    cg->or_(value, RWRET);
  }

  FreeHostReg(addr.getIdx());

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    DebugAssert(value != RWARG3);
    cg->mov(RWARG3, value);
    cg->mov(RWARG2, addr);
    cg->and_(RWARG2, ~0x3u);
    cg->mov(RWARG1, inst->bits);
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LW));
  }
}

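// Plain C++ reference for the LWL/LWR merge the generated code above performs. These helpers are
// illustrative only (not used by the recompiler) and mirror the commented pseudocode in
// Compile_lwx(): "mem_word" is the aligned word read from memory and "reg_value" is the current
// (possibly load-delayed) value of rt.
[[maybe_unused]] static u32 IllustrateLWLMerge(u32 reg_value, u32 mem_word, u32 addr)
{
  const u32 shift = (addr & 3u) * 8u;                     // byte offset within the word, in bits
  const u32 mask = UINT32_C(0x00FFFFFF) >> shift;         // low bytes of rt that are preserved
  return (reg_value & mask) | (mem_word << (24 - shift)); // memory bytes fill the upper part
}

[[maybe_unused]] static u32 IllustrateLWRMerge(u32 reg_value, u32 mem_word, u32 addr)
{
  const u32 shift = (addr & 3u) * 8u;
  const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);  // high bytes of rt that are preserved
  return (reg_value & mask) | (mem_word >> shift);        // memory bytes fill the lower part
}
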
void CPU::X64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                      const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
                                          std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                          std::optional<Reg32>();
  FlushForLoadStore(address, false, use_fastmem);
  const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Reg32 value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RWRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      cg->mov(cg->dword[PTR(ptr)], value);
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      cg->movsx(RWARG3, value.cvt16());
      cg->mov(cg->dword[PTR(ptr)], RWARG3);
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      cg->movzx(RWARG3, value.cvt16());
      cg->mov(cg->dword[PTR(ptr)], RWARG3);
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      cg->mov(RWARG2, value);
      cg->mov(RWARG1, index);
      cg->call(&GTE::WriteRegister);
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value != RWARG1 && value != RWARG2);
      cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
      cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], value);
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG3, value);
    if (value != RWRET)
      FreeHostReg(value.getIdx());
    cg->mov(RWARG2, addr);
    FreeHostReg(addr_reg.value().getIdx());
    cg->mov(RWARG1, inst->bits);
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

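// For reference, the PushFIFO case above is the generated-code form of the screen-XY FIFO push
// (comment-form sketch of the equivalent state update):
//
//   g_state.gte_regs.SXY0[0] = g_state.gte_regs.SXY1[0];
//   g_state.gte_regs.SXY1[0] = g_state.gte_regs.SXY2[0];
//   g_state.gte_regs.SXY2[0] = loaded_value;  // writing SXYP pushes onto the FIFO
//
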
void CPU::X64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                     const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
                                          std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                          std::optional<Reg32>();
  FlushForLoadStore(address, true, use_fastmem);
  const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Reg32 data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RWARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RWARG3, cf.MipsT());
    cg->mov(RWARG2, addr);
    cg->mov(RWARG1, inst->bits);
    cg->call(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().getIdx());
  }
}

void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                     const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
  const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
  if (g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(value, inst->r.rt);

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);
  cg->mov(RWARG1, addr);
  cg->and_(RWARG1, ~0x3u);
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  DebugAssert(value != cg->ecx);
  cg->mov(cg->ecx, addr);
  cg->and_(cg->ecx, 3);
  cg->shl(cg->ecx, 3); // *8
  cg->and_(addr, ~0x3u);

  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
  if (!g_settings.gpu_pgxp_enable)
    MoveMIPSRegToReg(value, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    cg->mov(RWARG3, 0xFFFFFF00u);
    cg->shl(RWARG3, cg->cl);
    cg->and_(RWRET, RWARG3);

    cg->mov(RWARG3, 24);
    cg->sub(RWARG3, cg->ecx);
    cg->mov(cg->ecx, RWARG3);
    cg->shr(value, cg->cl);
    cg->or_(value, RWRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    cg->shl(value, cg->cl);

    DebugAssert(RWARG3 != cg->ecx);
    cg->mov(RWARG3, 24);
    cg->sub(RWARG3, cg->ecx);
    cg->mov(cg->ecx, RWARG3);
    cg->mov(RWARG3, 0x00FFFFFFu);
    cg->shr(RWARG3, cg->cl);
    cg->and_(RWRET, RWARG3);
    cg->or_(value, RWRET);
  }

  if (!g_settings.gpu_pgxp_enable)
  {
    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
    FreeHostReg(addr.getIdx());
  }
  else
  {
    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);

    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG3, value);
    FreeHostReg(value.getIdx());
    cg->mov(RWARG2, addr);
    FreeHostReg(addr.getIdx());
    cg->mov(RWARG1, inst->bits);
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW));
  }
}

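// Matching reference for the SWL/SWR merge above: the aligned memory word keeps the bytes the
// store does not touch, and the register supplies the rest. Illustrative helpers only, mirroring
// the commented pseudocode in Compile_swx().
[[maybe_unused]] static u32 IllustrateSWLMerge(u32 mem_word, u32 reg_value, u32 addr)
{
  const u32 shift = (addr & 3u) * 8u;
  const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;      // memory bytes that are preserved
  return (mem_word & mem_mask) | (reg_value >> (24 - shift));
}

[[maybe_unused]] static u32 IllustrateSWRMerge(u32 mem_word, u32 reg_value, u32 addr)
{
  const u32 shift = (addr & 3u) * 8u;
  const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
  return (mem_word & mem_mask) | (reg_value << shift);
}
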
void CPU::X64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                      const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      cg->mov(RWARG2, cg->dword[PTR(ptr)]);
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      cg->mov(RWARG1, index);
      cg->call(&GTE::ReadRegister);
      cg->mov(RWARG2, RWRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  // PGXP makes this a giant pain.
  if (!g_settings.gpu_pgxp_enable)
  {
    FlushForLoadStore(address, true, use_fastmem);
    const Reg32 addr = ComputeLoadStoreAddressArg(cf, address);
    GenerateStore(addr, RWARG2, size, use_fastmem);
    return;
  }

  // TODO: This can be simplified because we don't need to validate in PGXP..
  const Reg32 addr_reg = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
  const Reg32 data_backup = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr_reg);
  cg->mov(data_backup, RWARG2);
  GenerateStore(addr_reg, RWARG2, size, use_fastmem);

  Flush(FLUSH_FOR_C_CALL);
  cg->mov(RWARG3, data_backup);
  cg->mov(RWARG2, addr_reg);
  cg->mov(RWARG1, inst->bits);
  cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  FreeHostReg(addr_reg.getIdx());
  FreeHostReg(data_backup.getIdx());
}

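// In the PGXP path of Compile_swc2() above, the GTE value has to survive the store call, so it is
// parked in a callee-saved temporary first; conceptually the emitted sequence is (comment-form
// sketch only):
//
//   <store gte_value to addr>;                      // via GenerateStore()
//   PGXP::CPU_SWC2(inst_bits, addr, gte_value);
//
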
void CPU::X64Recompiler::Compile_mtc0(CompileFlags cf)
{
  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  // TODO: const apply mask
  const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
  const u32 constant_value = cf.const_t ? GetConstantRegU32(cf.MipsT()) : 0;
  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Reg32 changed_bits = RWARG3;

  // update value
  if (cf.valid_host_t)
  {
    cg->mov(RWARG1, rt);
    cg->mov(RWARG2, cg->dword[PTR(ptr)]);
    cg->and_(RWARG1, mask);
    if (needs_bit_test)
    {
      cg->mov(changed_bits, RWARG2);
      cg->xor_(changed_bits, RWARG1);
    }
    cg->and_(RWARG2, ~mask);
    cg->or_(RWARG2, RWARG1);
    cg->mov(cg->dword[PTR(ptr)], RWARG2);
  }
  else
  {
    cg->mov(RWARG2, cg->dword[PTR(ptr)]);
    if (needs_bit_test)
    {
      cg->mov(changed_bits, RWARG2);
      cg->xor_(changed_bits, constant_value & mask);
    }
    cg->and_(RWARG2, ~mask);
    cg->or_(RWARG2, constant_value & mask);
    cg->mov(cg->dword[PTR(ptr)], RWARG2);
  }

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
    cg->test(changed_bits, 1u << 16);
    cg->jz(caches_unchanged);
    cg->call(&CPU::UpdateMemoryPointers);
    cg->mov(RWARG2, cg->dword[PTR(ptr)]); // reload value for interrupt test below
    if (CodeCache::IsUsingFastmem())
      cg->mov(RMEMBASE, cg->qword[PTR(&g_state.fastmem_base)]);

    cg->L(caches_unchanged);

    TestInterrupts(RWARG2);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    cg->call(&CPU::UpdateDebugDispatcherFlag);
    cg->test(cg->al, cg->al);
    SwitchToFarCode(true, &Xbyak::CodeGenerator::jnz);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    cg->call(&CPU::ExitExecution); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

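// The masked COP0 update above corresponds to this C++ (illustrative sketch; "rt_value" stands for
// the source register or constant and "mask" for the per-register write mask):
//
//   const u32 old_value = *ptr;
//   const u32 changed_bits = old_value ^ (rt_value & mask);  // only computed when reg == SR
//   *ptr = (old_value & ~mask) | (rt_value & mask);
//
// For SR, a change of the cache-isolation bit (bit 16) additionally triggers UpdateMemoryPointers()
// and a fastmem base reload before interrupts are re-tested.
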
void CPU::X64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  static constexpr u32 mode_bits_mask = UINT32_C(0b1111);
  cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
  cg->mov(RWARG2, RWARG1);
  cg->shr(RWARG2, 2);
  cg->and_(RWARG1, ~mode_bits_mask);
  cg->and_(RWARG2, mode_bits_mask);
  cg->or_(RWARG1, RWARG2);
  cg->mov(cg->dword[PTR(&g_state.cop0_regs.sr.bits)], RWARG1);

  TestInterrupts(RWARG1);
}

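// RFE pops the interrupt/kernel-mode stack held in the low bits of SR; the generated code above is
// the recompiled form of this helper (illustrative only, not used by the recompiler):
[[maybe_unused]] static u32 IllustrateRFE(u32 sr)
{
  constexpr u32 mode_bits_mask = UINT32_C(0b1111);
  return (sr & ~mode_bits_mask) | ((sr >> 2) & mode_bits_mask);
}
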
void CPU::X64Recompiler::TestInterrupts(const Xbyak::Reg32& sr)
{
  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;

  cg->test(sr, 1);
  cg->jz(no_interrupt, CodeGenerator::T_NEAR);

  // sr & cause
  cg->and_(sr, cg->dword[PTR(&g_state.cop0_regs.cause.bits)]);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  cg->test(sr, 0xFF00);

  SwitchToFarCode(true, &CodeGenerator::jnz);
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    cg->mov(RWARG2, m_compiler_pc);
    cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
    m_dirty_pc = false;
    cg->mov(cg->dword[PTR(&g_state.downcount)], 0);
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  cg->L(no_interrupt);
}

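// The interrupt check above implements the usual COP0 condition (comment-form sketch):
//
//   const bool pending = (sr & 1u) && ((sr & cause & 0xFF00u) != 0u);  // IEc set and an unmasked IRQ pending
//
// When it holds, the far-code path raises Exception::INT (passing m_compiler_pc) if this is not the
// last instruction of the block, otherwise it stores the PC and zeroes the downcount so the
// dispatcher services the interrupt on block exit.
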
void CPU::X64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    cg->mov(Reg32(hreg), cg->dword[PTR(ptr)]);
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, index);
    cg->call(&GTE::ReadRegister);

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    cg->mov(Reg32(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, inst->bits);
    cg->mov(RWARG2, Reg32(hreg));
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::X64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
    {
      cg->mov(cg->dword[PTR(ptr)], GetConstantRegU32(cf.MipsT()));
    }
    else if (cf.valid_host_t)
    {
      cg->mov(cg->dword[PTR(ptr)], CFGetRegT(cf));
    }
    else
    {
      cg->mov(RWARG1, MipsPtr(cf.MipsT()));
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      cg->mov(cg->dword[PTR(ptr)], sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv));
    }
    else if (cf.valid_host_t)
    {
      sign ? cg->movsx(RWARG1, Reg16(cf.host_t)) : cg->movzx(RWARG1, Reg16(cf.host_t));
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
    else
    {
      sign ? cg->movsx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]) :
             cg->movzx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]);
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    cg->call(&GTE::WriteRegister);
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
    cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
    if (!cf.const_t && !cf.valid_host_t)
      cg->mov(RWARG3, MipsPtr(cf.MipsT()));
    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
    if (cf.const_t)
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], GetConstantRegU32(cf.MipsT()));
    else if (cf.valid_host_t)
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], CFGetRegT(cf));
    else
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWARG3);
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::X64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  cg->mov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  cg->call(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                           bool is_load)
{
  CodeGenerator acg(thunk_space, thunk_code);
  CodeGenerator* cg = &acg;

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  // Include the shadow space in the allocation, since the register saves below are placed above it.
  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + STACK_SHADOW_SIZE;

  if (stack_size > 0)
  {
    cg->sub(cg->rsp, stack_size);

    u32 stack_offset = STACK_SHADOW_SIZE;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        cg->mov(cg->qword[cg->rsp + stack_offset], Reg64(i));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
    cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_add);

  if (address_register != static_cast<u8>(RWARG1.getIdx()))
    cg->mov(RWARG1, Reg32(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.getIdx()))
      cg->mov(RWARG2, Reg32(data_register));
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord) :
                         reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  if (is_load)
  {
    const Reg32 dst = Reg32(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? cg->movsx(dst, RWRET.cvt8()) : cg->movzx(dst, RWRET.cvt8());
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? cg->movsx(dst, RWRET.cvt16()) : cg->movzx(dst, RWRET.cvt16());
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst != RWRET)
          cg->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
    cg->sub(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_remove);

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = STACK_SHADOW_SIZE;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        cg->mov(Reg64(i), cg->qword[cg->rsp + stack_offset]);
        stack_offset += GPR_SIZE;
      }
    }

    cg->add(cg->rsp, stack_size);
  }

  cg->jmp(static_cast<const u8*>(code_address) + code_size);

  // fill the rest of it with nops, if any
  DebugAssert(code_size >= BACKPATCH_JMP_SIZE);
  if (code_size > BACKPATCH_JMP_SIZE)
    std::memset(static_cast<u8*>(code_address) + BACKPATCH_JMP_SIZE, 0x90, code_size - BACKPATCH_JMP_SIZE);

  return static_cast<u32>(cg->getSize());
}

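// Worked example for the register save/restore above (assuming the Win64 ABI constants used in
// this file): with three caller-saved GPRs live across the call, the count is rounded up to an
// even number so the rsp adjustment stays a multiple of 16, and the registers are stored at
// rsp + 32, rsp + 40 and rsp + 48, i.e. above the 32-byte shadow space that the called C++
// thunk is free to clobber.
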
#endif // CPU_ARCH_X64
