CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/ARM64/Arm64RegCacheFPU.cpp
Views: 1401
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
#if PPSSPP_ARCH(ARM64)
20
21
#include <cstring>
22
23
#include "Common/CPUDetect.h"
24
#include "Common/Log.h"
25
#include "Core/Reporting.h"
26
#include "Core/MIPS/MIPS.h"
27
#include "Core/MIPS/ARM64/Arm64RegCacheFPU.h"
28
#include "Core/MIPS/ARM64/Arm64Jit.h"
29
#include "Core/MIPS/MIPSTables.h"
30
31
using namespace Arm64Gen;
32
using namespace Arm64JitConstants;
33
34
// Constructs the FPU register cache. vr is set up as a view into mr starting
// at entry 32, so VFPU registers share the same mapping table as the FPU regs.
Arm64RegCacheFPU::Arm64RegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), vr(mr + 32), js_(js), jo_(jo) {
	// All 32 ARM64 FPU/SIMD registers are available for allocation.
	numARMFpuReg_ = 32;
}
37
38
// Supplies the code emitters this cache uses to generate loads/stores
// when mapping and flushing registers.
void Arm64RegCacheFPU::Init(Arm64Gen::ARM64XEmitter *emit, Arm64Gen::ARM64FloatEmitter *fp) {
	fp_ = fp;
	emit_ = emit;
}
42
43
// Resets the cache to the pristine "everything in memory" state at the
// start of a block. The pristine tables are built lazily, exactly once.
void Arm64RegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
	if (!initialReady) {
		initialReady = true;
		SetupInitialRegs();
	}
	memcpy(mr, mrInitial, sizeof(mr));
	memcpy(ar, arInitial, sizeof(ar));
	pendingFlush = false;
}
53
54
void Arm64RegCacheFPU::SetupInitialRegs() {
55
for (int i = 0; i < numARMFpuReg_; i++) {
56
arInitial[i].mipsReg = -1;
57
arInitial[i].isDirty = false;
58
}
59
for (int i = 0; i < NUM_MIPSFPUREG; i++) {
60
mrInitial[i].loc = ML_MEM;
61
mrInitial[i].reg = INVALID_REG;
62
mrInitial[i].spillLock = false;
63
mrInitial[i].tempLock = false;
64
}
65
}
66
67
// Returns the preferred ARM register allocation order and writes the number
// of entries into count. The order depends on whether ASIMD/NEON VFPU mode
// is enabled in the jit options.
const ARM64Reg *Arm64RegCacheFPU::GetMIPSAllocationOrder(int &count) {
	// On ARM64, all 32 registers are fully 128-bit and fully interchangable so we don't
	// have to care about upper or lower registers. However, only S8-S15 are callee-save, and
	// only the bottom 64 bits of those. So we should allocate into these when we call
	// C functions, although we don't currently do so...

	static const ARM64Reg allocationOrder[] = {
		// Reserve four full 128-bit temp registers, should be plenty.
		S4, S5, S6, S7,
		S8, S9, S10, S11, // Partially callee-save (bottom 64 bits)
		S12, S13, S14, S15, // Partially callee-save (bottom 64 bits)
		S16, S17, S18, S19,
		S20, S21, S22, S23,
		S24, S25, S26, S27,
		S28, S29, S30, S31,
	};

	static const ARM64Reg allocationOrderNEONVFPU[] = {
		// Reserve four full 128-bit temp registers, should be plenty.

		// Then let's use 12 register as singles
		S4, S5, S6, S7,
		S8, S9, S10, S11,
		S12, S13, S14, S15,

		// And do quads in the rest? Or use a strategy more similar to what we do on x86?
	};

	// Pick the table matching the current VFPU mode.
	if (jo_->useASIMDVFPU) {
		count = (int)(sizeof(allocationOrderNEONVFPU) / sizeof(allocationOrderNEONVFPU[0]));
		return allocationOrderNEONVFPU;
	}
	count = (int)(sizeof(allocationOrder) / sizeof(allocationOrder[0]));
	return allocationOrder;
}
103
104
// True when the MIPS register currently lives in an ARM register.
bool Arm64RegCacheFPU::IsMapped(MIPSReg r) {
	return ML_ARMREG == mr[r].loc;
}
107
108
// True when the MIPS register's current home is the in-memory context.
bool Arm64RegCacheFPU::IsInRAM(MIPSReg r) {
	return ML_MEM == mr[r].loc;
}
111
112
// Maps a MIPS FPU/VFPU register into an ARM register, allocating (and, if
// necessary, spilling) a slot from the allocation order.
//   mipsReg  - index into mr[]; 0-31 are FPRs, 32+ are VFPU regs and temps.
//   mapFlags - MAP_DIRTY marks the ARM reg as needing write-back;
//              MAP_NOINIT additionally skips loading the old value.
// Returns the mapped ARM register, or INVALID_REG if every candidate is
// spill- or temp-locked.
ARM64Reg Arm64RegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) {
	// INFO_LOG(Log::JIT, "FPR MapReg: %i flags=%i", mipsReg, mapFlags);
	if (jo_->useASIMDVFPU && mipsReg >= 32) {
		ERROR_LOG(Log::JIT, "Cannot map VFPU registers to ARM VFP registers in NEON mode. PC=%08x", js_->compilerPC);
		return S0;
	}

	pendingFlush = true;
	// Let's see if it's already mapped. If so we just need to update the dirty flag.
	// We don't need to check for ML_NOINIT because we assume that anyone who maps
	// with that flag immediately writes a "known" value to the register.
	if (mr[mipsReg].loc == ML_ARMREG) {
		// Consistency check: the ARM side must point back at this MIPS reg.
		if (ar[mr[mipsReg].reg].mipsReg != mipsReg) {
			ERROR_LOG(Log::JIT, "Reg mapping out of sync! MR %i", mipsReg);
		}
		if (mapFlags & MAP_DIRTY) {
			ar[mr[mipsReg].reg].isDirty = true;
		}
		//INFO_LOG(Log::JIT, "Already mapped %i to %i", mipsReg, mr[mipsReg].reg);
		return (ARM64Reg)(mr[mipsReg].reg + S0);
	}

	// Okay, not mapped, so we need to allocate an ARM register.

	int allocCount;
	const ARM64Reg *allocOrder = GetMIPSAllocationOrder(allocCount);

allocate:
	// First pass: look for a completely free ARM register.
	for (int i = 0; i < allocCount; i++) {
		int reg = DecodeReg(allocOrder[i]);

		if (ar[reg].mipsReg == -1) {
			// That means it's free. Grab it, and load the value into it (if requested).
			ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false;
			if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {
				// Temps (>= TEMP0) never have a memory home, so only load real regs.
				if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) {
					fp_->LDR(32, INDEX_UNSIGNED, (ARM64Reg)(reg + S0), CTXREG, GetMipsRegOffset(mipsReg));
				}
			}
			ar[reg].mipsReg = mipsReg;
			mr[mipsReg].loc = ML_ARMREG;
			mr[mipsReg].reg = reg;
			//INFO_LOG(Log::JIT, "Mapped %i to %i", mipsReg, mr[mipsReg].reg);
			return (ARM64Reg)(reg + S0);
		}
	}


	// Still nothing. Let's spill a reg and goto 10.
	// TODO: Use age or something to choose which register to spill?
	// TODO: Spill dirty regs first? or opposite?
	// Second pass: pick the first candidate whose MIPS reg isn't locked.
	int bestToSpill = -1;
	for (int i = 0; i < allocCount; i++) {
		int reg = allocOrder[i] - S0;
		if (ar[reg].mipsReg != -1 && (mr[ar[reg].mipsReg].spillLock || mr[ar[reg].mipsReg].tempLock))
			continue;
		bestToSpill = reg;
		break;
	}

	if (bestToSpill != -1) {
		// Write the victim back (if dirty), then retry the free-register scan.
		FlushArmReg((ARM64Reg)(S0 + bestToSpill));
		goto allocate;
	}

	// Uh oh, we have all them spilllocked....
	ERROR_LOG(Log::JIT, "Out of spillable registers at PC %08x!!!", js_->compilerPC);
	return INVALID_REG;
}
181
182
// Maps two source registers, holding spill locks on both so mapping the
// second cannot evict the first.
void Arm64RegCacheFPU::MapInIn(MIPSReg rd, MIPSReg rs) {
	SpillLock(rd, rs);
	MapReg(rd);
	MapReg(rs);
	ReleaseSpillLock(rs);
	ReleaseSpillLock(rd);
}
189
190
// Maps a destination (dirty) and a source register under spill locks.
// The destination's old value is only skipped (MAP_NOINIT) when the caller
// allows it AND the destination doesn't alias the source.
void Arm64RegCacheFPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) {
	SpillLock(rd, rs);
	const bool overwritten = avoidLoad && rd != rs;
	MapReg(rd, overwritten ? MAP_NOINIT : MAP_DIRTY);
	MapReg(rs);
	ReleaseSpillLock(rs);
	ReleaseSpillLock(rd);
}
198
199
// Maps a destination (dirty) and two source registers under spill locks.
// The destination's old value must still be loaded when it aliases either
// source operand.
void Arm64RegCacheFPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) {
	SpillLock(rd, rs, rt);
	const bool overwritten = avoidLoad && rd != rs && rd != rt;
	MapReg(rd, overwritten ? MAP_NOINIT : MAP_DIRTY);
	MapReg(rt);
	MapReg(rs);
	ReleaseSpillLock(rt);
	ReleaseSpillLock(rs);
	ReleaseSpillLock(rd);
}
209
210
void Arm64RegCacheFPU::SpillLockV(const u8 *v, VectorSize sz) {
211
for (int i = 0; i < GetNumVectorElements(sz); i++) {
212
vr[v[i]].spillLock = true;
213
}
214
}
215
216
void Arm64RegCacheFPU::SpillLockV(int vec, VectorSize sz) {
217
u8 v[4];
218
GetVectorRegs(v, sz, vec);
219
SpillLockV(v, sz);
220
}
221
222
void Arm64RegCacheFPU::MapRegV(int vreg, int flags) {
223
MapReg(vreg + 32, flags);
224
}
225
226
// Copies the current value of VFPU register vreg into armReg, mapping
// vreg into an ARM register first if it isn't already.
void Arm64RegCacheFPU::LoadToRegV(ARM64Reg armReg, int vreg) {
	if (vr[vreg].loc != ML_ARMREG) {
		MapRegV(vreg);
		fp_->FMOV(armReg, V(vreg));
	} else {
		fp_->FMOV(armReg, (ARM64Reg)(S0 + vr[vreg].reg));
	}
}
234
235
void Arm64RegCacheFPU::MapRegsAndSpillLockV(int vec, VectorSize sz, int flags) {
236
u8 v[4];
237
GetVectorRegs(v, sz, vec);
238
SpillLockV(v, sz);
239
for (int i = 0; i < GetNumVectorElements(sz); i++) {
240
MapRegV(v[i], flags);
241
}
242
}
243
244
void Arm64RegCacheFPU::MapRegsAndSpillLockV(const u8 *v, VectorSize sz, int flags) {
245
SpillLockV(v, sz);
246
for (int i = 0; i < GetNumVectorElements(sz); i++) {
247
MapRegV(v[i], flags);
248
}
249
}
250
251
void Arm64RegCacheFPU::MapInInV(int vs, int vt) {
252
SpillLockV(vs);
253
SpillLockV(vt);
254
MapRegV(vs);
255
MapRegV(vt);
256
ReleaseSpillLockV(vs);
257
ReleaseSpillLockV(vt);
258
}
259
260
void Arm64RegCacheFPU::MapDirtyInV(int vd, int vs, bool avoidLoad) {
261
bool load = !avoidLoad || (vd == vs);
262
SpillLockV(vd);
263
SpillLockV(vs);
264
MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);
265
MapRegV(vs);
266
ReleaseSpillLockV(vd);
267
ReleaseSpillLockV(vs);
268
}
269
270
void Arm64RegCacheFPU::MapDirtyInInV(int vd, int vs, int vt, bool avoidLoad) {
271
bool load = !avoidLoad || (vd == vs || vd == vt);
272
SpillLockV(vd);
273
SpillLockV(vs);
274
SpillLockV(vt);
275
MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);
276
MapRegV(vs);
277
MapRegV(vt);
278
ReleaseSpillLockV(vd);
279
ReleaseSpillLockV(vs);
280
ReleaseSpillLockV(vt);
281
}
282
283
// Writes a dirty ARM register's value back into the MIPSState context and
// unmaps it. No-op for registers outside S0..S31 or registers that aren't
// currently mapped.
//
// Note: the original code re-tested ar[reg].mipsReg != -1 after already
// returning early on == -1, leaving a dead "Dirty but no mipsreg?" branch;
// that unreachable check has been removed.
void Arm64RegCacheFPU::FlushArmReg(ARM64Reg r) {
	if (r < S0 || r > S31) {
		return;
	}
	int reg = r - S0;
	if (ar[reg].mipsReg == -1) {
		// Nothing to do, reg not mapped.
		return;
	}
	// Only emit a store when the value actually differs from memory and the
	// mapping is in sync. IMMs won't be in an ARM reg.
	if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_ARMREG) {
		//INFO_LOG(Log::JIT, "Flushing ARM reg %i", reg);
		fp_->STR(32, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));
	}
	mr[ar[reg].mipsReg].loc = ML_MEM;
	mr[ar[reg].mipsReg].reg = INVALID_REG;
	ar[reg].mipsReg = -1;
	ar[reg].isDirty = false;
}
305
306
// Flushes a VFPU register; VFPU regs sit 32 entries into the MIPS-side table.
void Arm64RegCacheFPU::FlushV(MIPSReg r) {
	FlushR(32 + r);
}
309
310
// Flushes one MIPS register: writes it back (via FlushArmReg) if it lives in
// an ARM register, complains about impossible states, and does nothing if it
// is already in memory.
void Arm64RegCacheFPU::FlushR(MIPSReg r) {
	if (mr[r].loc == ML_IMM) {
		// IMM is always "dirty", but immediates are not allowed for FP (yet).
		ERROR_LOG(Log::JIT, "Imm in FP register?");
	} else if (mr[r].loc == ML_ARMREG) {
		if (mr[r].reg == INVALID_REG) {
			ERROR_LOG(Log::JIT, "FlushR: MipsReg had bad ArmReg");
		}
		FlushArmReg((ARM64Reg)(S0 + mr[r].reg));
	} else if (mr[r].loc == ML_MEM) {
		// Already there, nothing to do.
	} else {
		// BAD - unknown location, silently ignored like before.
	}
}
334
335
// Returns the ARM register that would need to be stored to flush MIPS reg r,
// or INVALID_REG when no store is required (clean, already in memory, or the
// mapping is inconsistent).
Arm64Gen::ARM64Reg Arm64RegCacheFPU::ARM64RegForFlush(int r) {
	if (mr[r].loc == ML_ARMREG) {
		if (mr[r].reg == INVALID_REG) {
			ERROR_LOG_REPORT(Log::JIT, "ARM64RegForFlush: MipsReg %d had bad ArmReg", r);
			return INVALID_REG;
		}
		// No need to flush if it's not dirty.
		if (!ar[mr[r].reg].isDirty) {
			return INVALID_REG;
		}
		return (ARM64Reg)(S0 + mr[r].reg);
	}
	if (mr[r].loc == ML_MEM) {
		return INVALID_REG;
	}
	if (mr[r].loc == ML_IMM) {
		// IMM is always "dirty", but immediates are not allowed for FP (yet).
		ERROR_LOG(Log::JIT, "Imm in FP register?");
		return INVALID_REG;
	}
	ERROR_LOG_REPORT(Log::JIT, "ARM64RegForFlush: MipsReg %d with invalid location %d", r, mr[r].loc);
	return INVALID_REG;
}
362
363
// Writes every dirty register back to the MIPSState context and unmaps
// everything. Temps are discarded (never stored). Tries to pair adjacent
// FPR stores into STP instructions before falling back to single STRs.
void Arm64RegCacheFPU::FlushAll() {
	if (!pendingFlush) {
		// Nothing allocated. FPU regs are not nearly as common as GPR.
		return;
	}

	// Discard temps!
	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) {
		DiscardR(i);
	}

	int numArmRegs = 0;

	const ARM64Reg *order = GetMIPSAllocationOrder(numArmRegs);

	// Flush pairs first when possible. Note that STP's offset can't reach more than 256 bytes so
	// most VFPU registers cannot be flushed this way, unless we are willing to generate another offset pointer
	// (which we could actually do right here, point right in the middle of the VFPU stuff and would reach it all)... TODO
	for (int i = 0; i < 31; i++) {
		int mr1 = i;
		int mr2 = i + 1;
		// ARM64RegForFlush returns INVALID_REG for clean/unmapped regs, so a
		// pair already flushed (and discarded) below won't be stored twice.
		ARM64Reg ar1 = ARM64RegForFlush(mr1);
		ARM64Reg ar2 = ARM64RegForFlush(mr2);

		if (ar1 != INVALID_REG && ar2 != INVALID_REG) {
			fp_->STP(32, INDEX_SIGNED, ar1, ar2, CTXREG, GetMipsRegOffset(mr1));
			DiscardR(mr1);
			DiscardR(mr2);
		}
	}

	// Then flush one by one.

	for (int i = 0; i < numArmRegs; i++) {
		int a = DecodeReg(order[i]);
		int m = ar[a].mipsReg;

		if (ar[a].isDirty) {
			if (m == -1) {
				INFO_LOG(Log::JIT, "ARM reg %d is dirty but has no mipsreg", a);
				continue;
			}

			fp_->STR(32, INDEX_UNSIGNED, (ARM64Reg)(a + S0), CTXREG, GetMipsRegOffset(m));

			mr[m].loc = ML_MEM;
			mr[m].reg = (int)INVALID_REG;
			ar[a].mipsReg = -1;
			ar[a].isDirty = false;
		} else {
			// Clean: just drop the mapping, no store needed.
			if (m != -1) {
				mr[m].loc = ML_MEM;
				mr[m].reg = (int)INVALID_REG;
			}
			ar[a].mipsReg = -1;
			// already not dirty
		}
	}

	// Sanity check
	for (int i = 0; i < numARMFpuReg_; i++) {
		if (ar[i].mipsReg != -1) {
			ERROR_LOG(Log::JIT, "Flush fail: ar[%d].mipsReg=%d", i, ar[i].mipsReg);
		}
	}
	pendingFlush = false;
}
430
431
// Drops any mapping for MIPS register r WITHOUT writing its value back, and
// clears both locks. After this, r is considered to live in memory.
void Arm64RegCacheFPU::DiscardR(MIPSReg r) {
	if (mr[r].loc == ML_IMM) {
		// IMM is always "dirty", but immediates are not allowed for FP (yet).
		ERROR_LOG(Log::JIT, "Imm in FP register?");
	} else if (mr[r].loc == ML_ARMREG) {
		if (mr[r].reg == INVALID_REG) {
			ERROR_LOG(Log::JIT, "DiscardR: MipsReg had bad ArmReg");
		} else {
			// Note that we DO NOT write it back here. That's the whole point of Discard.
			ar[mr[r].reg].isDirty = false;
			ar[mr[r].reg].mipsReg = -1;
		}
	}
	// ML_MEM (and anything else): nothing extra to undo.
	mr[r].loc = ML_MEM;
	mr[r].reg = (int)INVALID_REG;
	mr[r].tempLock = false;
	mr[r].spillLock = false;
}
462
463
// True when the ARM register currently holds a temp slot (TEMP0 and up).
bool Arm64RegCacheFPU::IsTempX(ARM64Reg r) const {
	const int mapped = ar[r - S0].mipsReg;
	return mapped >= TEMP0;
}
466
467
int Arm64RegCacheFPU::GetTempR() {
468
if (jo_->useASIMDVFPU) {
469
ERROR_LOG(Log::JIT, "VFP temps not allowed in NEON mode");
470
return 0;
471
}
472
pendingFlush = true;
473
for (int r = TEMP0; r < TEMP0 + NUM_TEMPS; ++r) {
474
if (mr[r].loc == ML_MEM && !mr[r].tempLock) {
475
mr[r].tempLock = true;
476
return r;
477
}
478
}
479
480
ERROR_LOG(Log::CPU, "Out of temp regs! Might need to DiscardR() some");
481
_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
482
return -1;
483
}
484
485
// Byte offset of a MIPS FPU/VFPU/temp register inside MIPSState.
// Layout: 32 GPRs first, then the FPRs, then the "VFPURs", then VFPU ctrls.
// FPRs (0-31) and temps index directly; VFPU regs (32-159) go through the
// voffset[] permutation table.
int Arm64RegCacheFPU::GetMipsRegOffset(MIPSReg r) {
	if (r < 0 || r > 32 + 128 + NUM_TEMPS) {
		ERROR_LOG(Log::JIT, "bad mips register %i, out of range", r);
		return 0; // or what?
	}

	const bool isVfpu = r >= 32 && r < 32 + 128;
	if (isVfpu) {
		// r is between 32 and 128 + 32
		return (32 + 32 + voffset[r - 32]) << 2;
	}
	return (32 + r) << 2;
}
499
500
// Spill-locks up to four registers; -1 marks an unused optional slot.
// r1 is required and locked unconditionally.
void Arm64RegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {
	mr[r1].spillLock = true;
	const MIPSReg optional[3] = { r2, r3, r4 };
	for (MIPSReg r : optional) {
		if (r != -1) {
			mr[r].spillLock = true;
		}
	}
}
506
507
// This is actually pretty slow with all the 160 regs...
508
void Arm64RegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
509
for (int i = 0; i < NUM_MIPSFPUREG; i++) {
510
mr[i].spillLock = false;
511
}
512
for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
513
DiscardR(i);
514
}
515
}
516
517
// Returns the ARM register currently holding mipsReg. The register must
// already be mapped; otherwise this logs an error (classifying the register
// as FPR, VFPU reg, or temp) and returns INVALID_REG.
ARM64Reg Arm64RegCacheFPU::R(int mipsReg) {
	if (mr[mipsReg].loc != ML_ARMREG) {
		if (mipsReg < 32) {
			ERROR_LOG(Log::JIT, "FReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());
		} else if (mipsReg < 32 + 128) {
			ERROR_LOG(Log::JIT, "VReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());
		} else {
			ERROR_LOG(Log::JIT, "Tempreg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 128 - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());
		}
		return INVALID_REG; // BAAAD
	}
	return (ARM64Reg)(mr[mipsReg].reg + S0);
}
531
532
#endif // PPSSPP_ARCH(ARM64)
533
534