GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/ARM/ArmRegCacheFPU.cpp

// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <cstring>

#include "Common/CPUDetect.h"
#include "Common/Log.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/ARM/ArmRegCacheFPU.h"
#include "Core/MIPS/ARM/ArmJit.h"
#include "Core/MIPS/MIPSTables.h"

using namespace ArmGen;
using namespace ArmJitConstants;

ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {}

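// Start() runs when we begin compiling a block. Rather than clearing the cache
// state field by field, it lazily builds a pristine copy once in SetupInitialRegs()
// and then just memcpy's that over the live arrays, which is cheaper per block.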
void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
	if (!initialReady) {
		SetupInitialRegs();
		initialReady = true;
	}

	memcpy(ar, arInitial, sizeof(ar));
	memcpy(mr, mrInitial, sizeof(mr));
	pendingFlush = false;
}

void ArmRegCacheFPU::SetupInitialRegs() {
	for (int i = 0; i < NUM_ARMFPUREG; i++) {
		arInitial[i].mipsReg = -1;
		arInitial[i].isDirty = false;
	}
	for (int i = 0; i < NUM_MIPSFPUREG; i++) {
		mrInitial[i].loc = ML_MEM;
		mrInitial[i].reg = INVALID_REG;
		mrInitial[i].spillLock = false;
		mrInitial[i].tempLock = false;
	}
	for (int i = 0; i < NUM_ARMQUADS; i++) {
		qr[i].isDirty = false;
		qr[i].mipsVec = -1;
		qr[i].sz = V_Invalid;
		qr[i].spillLock = false;
		qr[i].isTemp = false;
		memset(qr[i].vregs, 0xff, 4);
	}
}

const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
	// VFP mapping
	// VFPU registers and regular FP registers are mapped interchangeably on top of the standard
	// 16 FPU registers.

	// NEON mapping
	// We map FPU and VFPU registers entirely separately. FPU is mapped to 12 of the bottom 16 S registers.
	// VFPU is mapped to the upper 48 regs, 32 of which can only be reached through NEON
	// (or D16-D31 as doubles, but that's not relevant here).
	// Might consider shifting the split in the future, giving more regs to NEON and letting it map more quads.

	// We should attempt to map scalars to low Q registers and wider things to high registers,
	// since the NEON instructions are all 2-vector or 4-vector and don't do scalar, while we
	// still want to be able to use regular VFP instructions too.
	static const ARMReg allocationOrderNEON[] = {
		// Reserve four temp registers. Useful when building quads until we really figure out
		// how to do that best.
		S4,  S5,  S6,  S7,   // Q1
		S8,  S9,  S10, S11,  // Q2
		S12, S13, S14, S15,  // Q3
		S16, S17, S18, S19,  // Q4
		S20, S21, S22, S23,  // Q5
		S24, S25, S26, S27,  // Q6
		S28, S29, S30, S31,  // Q7
		// Q8-Q15 free for NEON tricks
	};

	count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);
	return allocationOrderNEON;
}

bool ArmRegCacheFPU::IsMapped(MIPSReg r) {
	return mr[r].loc == ML_ARMREG;
}

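// Note on the flag tests below: "(mapFlags & MAP_NOINIT) != MAP_NOINIT" implies that
// MAP_NOINIT is a multi-bit mask (presumably MAP_DIRTY plus a no-init bit), so a
// no-init mapping is also treated as dirty and merely skips the initial VLDR.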
ARMReg ArmRegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) {
	pendingFlush = true;
	// Let's see if it's already mapped. If so we just need to update the dirty flag.
	// We don't need to check for ML_NOINIT because we assume that anyone who maps
	// with that flag immediately writes a "known" value to the register.
	if (mr[mipsReg].loc == ML_ARMREG) {
		if (ar[mr[mipsReg].reg].mipsReg != mipsReg) {
			ERROR_LOG(Log::JIT, "Reg mapping out of sync! MR %i", mipsReg);
		}
		if (mapFlags & MAP_DIRTY) {
			ar[mr[mipsReg].reg].isDirty = true;
		}
		//INFO_LOG(Log::JIT, "Already mapped %i to %i", mipsReg, mr[mipsReg].reg);
		return (ARMReg)(mr[mipsReg].reg + S0);
	}

	// Okay, not mapped, so we need to allocate an ARM register.

	int allocCount;
	const ARMReg *allocOrder = GetMIPSAllocationOrder(allocCount);

allocate:
	for (int i = 0; i < allocCount; i++) {
		int reg = allocOrder[i] - S0;

		if (ar[reg].mipsReg == -1) {
			// That means it's free. Grab it, and load the value into it (if requested).
			ar[reg].isDirty = (mapFlags & MAP_DIRTY) != 0;
			if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {
				if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) {
					emit_->VLDR((ARMReg)(reg + S0), CTXREG, GetMipsRegOffset(mipsReg));
				}
			}
			ar[reg].mipsReg = mipsReg;
			mr[mipsReg].loc = ML_ARMREG;
			mr[mipsReg].reg = reg;
			//INFO_LOG(Log::JIT, "Mapped %i to %i", mipsReg, mr[mipsReg].reg);
			return (ARMReg)(reg + S0);
		}
	}

	// Still nothing. Let's spill a reg and goto 10.
	// TODO: Use age or something to choose which register to spill?
	// TODO: Spill dirty regs first? Or the opposite?
	int bestToSpill = -1;
	for (int i = 0; i < allocCount; i++) {
		int reg = allocOrder[i] - S0;
		if (ar[reg].mipsReg != -1 && (mr[ar[reg].mipsReg].spillLock || mr[ar[reg].mipsReg].tempLock))
			continue;
		bestToSpill = reg;
		break;
	}

	if (bestToSpill != -1) {
		FlushArmReg((ARMReg)(S0 + bestToSpill));
		goto allocate;
	}

	// Uh oh, they're all spill-locked...
	ERROR_LOG(Log::JIT, "Out of spillable registers at PC %08x!!!", js_->compilerPC);
	return INVALID_REG;
}

void ArmRegCacheFPU::MapInIn(MIPSReg rd, MIPSReg rs) {
	SpillLock(rd, rs);
	MapReg(rd);
	MapReg(rs);
	ReleaseSpillLock(rd);
	ReleaseSpillLock(rs);
}

void ArmRegCacheFPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) {
	SpillLock(rd, rs);
	bool load = !avoidLoad || rd == rs;
	MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT);
	MapReg(rs);
	ReleaseSpillLock(rd);
	ReleaseSpillLock(rs);
}

void ArmRegCacheFPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) {
	SpillLock(rd, rs, rt);
	bool load = !avoidLoad || (rd == rs || rd == rt);
	MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT);
	MapReg(rt);
	MapReg(rs);
	ReleaseSpillLock(rd);
	ReleaseSpillLock(rs);
	ReleaseSpillLock(rt);
}

void ArmRegCacheFPU::SpillLockV(const u8 *v, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vr[v[i]].spillLock = true;
	}
}

void ArmRegCacheFPU::SpillLockV(int vec, VectorSize sz) {
	u8 v[4];
	GetVectorRegs(v, sz, vec);
	SpillLockV(v, sz);
}

void ArmRegCacheFPU::MapRegV(int vreg, int flags) {
	MapReg(vreg + 32, flags);
}

void ArmRegCacheFPU::LoadToRegV(ARMReg armReg, int vreg) {
	if (vr[vreg].loc == ML_ARMREG) {
		emit_->VMOV(armReg, (ARMReg)(S0 + vr[vreg].reg));
	} else {
		MapRegV(vreg);
		emit_->VMOV(armReg, V(vreg));
	}
}

void ArmRegCacheFPU::MapRegsAndSpillLockV(int vec, VectorSize sz, int flags) {
	u8 v[4];
	GetVectorRegs(v, sz, vec);
	SpillLockV(v, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapRegV(v[i], flags);
	}
}

void ArmRegCacheFPU::MapRegsAndSpillLockV(const u8 *v, VectorSize sz, int flags) {
	SpillLockV(v, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapRegV(v[i], flags);
	}
}

void ArmRegCacheFPU::MapInInV(int vs, int vt) {
	SpillLockV(vs);
	SpillLockV(vt);
	MapRegV(vs);
	MapRegV(vt);
	ReleaseSpillLockV(vs);
	ReleaseSpillLockV(vt);
}

void ArmRegCacheFPU::MapDirtyInV(int vd, int vs, bool avoidLoad) {
	bool load = !avoidLoad || (vd == vs);
	SpillLockV(vd);
	SpillLockV(vs);
	MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);
	MapRegV(vs);
	ReleaseSpillLockV(vd);
	ReleaseSpillLockV(vs);
}

void ArmRegCacheFPU::MapDirtyInInV(int vd, int vs, int vt, bool avoidLoad) {
	bool load = !avoidLoad || (vd == vs || vd == vt);
	SpillLockV(vd);
	SpillLockV(vs);
	SpillLockV(vt);
	MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);
	MapRegV(vs);
	MapRegV(vt);
	ReleaseSpillLockV(vd);
	ReleaseSpillLockV(vs);
	ReleaseSpillLockV(vt);
}

void ArmRegCacheFPU::FlushArmReg(ARMReg r) {
	if (r >= S0 && r <= S31) {
		int reg = r - S0;
		if (ar[reg].mipsReg == -1) {
			// Nothing to do, reg not mapped.
			return;
		}
		// The early return above guarantees a valid mipsReg from here on.
		if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_ARMREG) {
			//INFO_LOG(Log::JIT, "Flushing ARM reg %i", reg);
			emit_->VSTR(r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));
		}
		// IMMs won't be in an ARM reg.
		mr[ar[reg].mipsReg].loc = ML_MEM;
		mr[ar[reg].mipsReg].reg = INVALID_REG;
		ar[reg].isDirty = false;
		ar[reg].mipsReg = -1;
	} else if (r >= D0 && r <= D31) {
		// TODO: Convert to S regs and flush them individually.
	} else if (r >= Q0 && r <= Q15) {
		QFlush(r);
	}
}

void ArmRegCacheFPU::FlushV(MIPSReg r) {
	FlushR(r + 32);
}

/*
void ArmRegCacheFPU::FlushQWithV(MIPSReg r) {
	// Look for it in all the quads. If it's in any, flush that quad clean.
	int flushCount = 0;
	for (int i = 0; i < MAX_ARMQUADS; i++) {
		if (qr[i].sz == V_Invalid)
			continue;

		int n = qr[i].sz;
		bool flushThis = false;
		for (int j = 0; j < n; j++) {
			if (qr[i].vregs[j] == r) {
				flushThis = true;
			}
		}

		if (flushThis) {
			QFlush(i);
			flushCount++;
		}
	}

	if (flushCount > 1) {
		WARN_LOG(Log::JIT, "ERROR: More than one quad was flushed to flush reg %i", r);
	}
}
*/

void ArmRegCacheFPU::FlushR(MIPSReg r) {
	switch (mr[r].loc) {
	case ML_IMM:
		// IMM is always "dirty".
		// IMM is not allowed for FP (yet).
		ERROR_LOG(Log::JIT, "Imm in FP register?");
		break;

	case ML_ARMREG:
		if (mr[r].reg == INVALID_REG) {
			ERROR_LOG(Log::JIT, "FlushR: MipsReg had bad ArmReg");
		}

		if (mr[r].reg >= Q0 && mr[r].reg <= Q15) {
			// This should happen rarely, but occasionally we need to flush a single stray
			// mipsreg that's been part of a quad.
			int quad = mr[r].reg - Q0;
			if (qr[quad].isDirty) {
				WARN_LOG(Log::JIT, "FlushR found quad register %i - PC=%08x", quad, js_->compilerPC);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffset(r), R1);
				emit_->VST1_lane(F_32, (ARMReg)mr[r].reg, R0, mr[r].lane, true);
			}
		} else {
			if (ar[mr[r].reg].isDirty) {
				//INFO_LOG(Log::JIT, "Flushing dirty reg %i", mr[r].reg);
				emit_->VSTR((ARMReg)(mr[r].reg + S0), CTXREG, GetMipsRegOffset(r));
				ar[mr[r].reg].isDirty = false;
			}
			ar[mr[r].reg].mipsReg = -1;
		}
		break;

	case ML_MEM:
		// Already there, nothing to do.
		break;

	default:
		//BAD
		break;
	}
	mr[r].loc = ML_MEM;
	mr[r].reg = (int)INVALID_REG;
}

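// Worked example: if ar[a], ar[a+1], ar[a+2], ar[a+3] are all dirty and their MIPS
// backing offsets are 4 bytes apart each, FlushGetSequential(a) returns 4 and
// FlushAll can emit a single VSTMIA instead of four separate VSTRs.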
// Scalar only. Need a similar one for sequential Q vectors.
int ArmRegCacheFPU::FlushGetSequential(int a) {
	int c = 1;
	int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);
	a++;
	while (a < 32) {
		if (!ar[a].isDirty || ar[a].mipsReg == -1)
			break;
		int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);
		if (mipsOffset != lastMipsOffset + 4) {
			break;
		}

		lastMipsOffset = mipsOffset;
		a++;
		c++;
	}
	return c;
}

void ArmRegCacheFPU::FlushAll() {
	if (!pendingFlush) {
		// Nothing allocated. FPU regs are not nearly as common as GPR.
		return;
	}

	// Discard temps!
	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) {
		DiscardR(i);
	}

	// Flush quads!
	// These could also use sequential detection.
	for (int i = 4; i < NUM_ARMQUADS; i++) {
		QFlush(i);
	}

	// Loop through the ARM registers, then use GetMipsRegOffset to determine if MIPS registers are
	// sequential. This is necessary because we store VFPU registers in a staggered order to get
	// columns sequential (most VFPU math in nearly all games is in columns, not rows).

	int numArmRegs;
	// We rely on the allocation order being sequential.
	const ARMReg baseReg = GetMIPSAllocationOrder(numArmRegs)[0];

	for (int i = 0; i < numArmRegs; i++) {
		int a = (baseReg - S0) + i;
		int m = ar[a].mipsReg;

		if (ar[a].isDirty) {
			if (m == -1) {
				INFO_LOG(Log::JIT, "ARM reg %i is dirty but has no mipsreg", a);
				continue;
			}

			int c = FlushGetSequential(a);
			if (c == 1) {
				// INFO_LOG(Log::JIT, "Got single register: %i (%i)", a, m);
				emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));
			} else if (c == 2) {
				// Probably not worth using VSTMIA for two.
				int offset = GetMipsRegOffset(m);
				emit_->VSTR((ARMReg)(a + S0), CTXREG, offset);
				emit_->VSTR((ARMReg)(a + 1 + S0), CTXREG, offset + 4);
			} else {
				// INFO_LOG(Log::JIT, "Got sequence: %i at %i (%i)", c, a, m);
				emit_->ADDI2R(SCRATCHREG1, CTXREG, GetMipsRegOffset(m), SCRATCHREG2);
				// INFO_LOG(Log::JIT, "VSTMIA R0, %i, %i", a, c);
				emit_->VSTMIA(SCRATCHREG1, false, (ARMReg)(S0 + a), c);
			}

			// Skip past, and mark as non-dirty.
			for (int j = 0; j < c; j++) {
				int b = a + j;
				mr[ar[b].mipsReg].loc = ML_MEM;
				mr[ar[b].mipsReg].reg = (int)INVALID_REG;
				ar[a + j].mipsReg = -1;
				ar[a + j].isDirty = false;
			}
			i += c - 1;
		} else {
			if (m != -1) {
				mr[m].loc = ML_MEM;
				mr[m].reg = (int)INVALID_REG;
			}
			ar[a].mipsReg = -1;
			// already not dirty
		}
	}

	// Sanity check
	for (int i = 0; i < NUM_ARMFPUREG; i++) {
		if (ar[i].mipsReg != -1) {
			ERROR_LOG(Log::JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);
		}
	}
	pendingFlush = false;
}

void ArmRegCacheFPU::DiscardR(MIPSReg r) {
	switch (mr[r].loc) {
	case ML_IMM:
		// IMM is always "dirty".
		// IMM is not allowed for FP (yet).
		ERROR_LOG(Log::JIT, "Imm in FP register?");
		break;

	case ML_ARMREG:
		if (mr[r].reg == INVALID_REG) {
			ERROR_LOG(Log::JIT, "DiscardR: MipsReg had bad ArmReg");
		} else {
			// Note that we DO NOT write it back here. That's the whole point of Discard.
			ar[mr[r].reg].isDirty = false;
			ar[mr[r].reg].mipsReg = -1;
		}
		break;

	case ML_MEM:
		// Already there, nothing to do.
		break;

	default:
		//BAD
		break;
	}
	mr[r].loc = ML_MEM;
	mr[r].reg = (int)INVALID_REG;
	mr[r].tempLock = false;
	mr[r].spillLock = false;
}

bool ArmRegCacheFPU::IsTempX(ARMReg r) const {
	return ar[r - S0].mipsReg >= TEMP0;
}

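// Temporaries occupy MIPS register indices TEMP0 .. TEMP0 + NUM_TEMPS - 1, past the
// 32 FPU and 128 VFPU registers, so they go through the same bookkeeping as real
// registers. MapReg() never loads them from memory (the mipsReg < TEMP0 check).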
int ArmRegCacheFPU::GetTempR() {
	pendingFlush = true;
	for (int r = TEMP0; r < TEMP0 + NUM_TEMPS; ++r) {
		if (mr[r].loc == ML_MEM && !mr[r].tempLock) {
			mr[r].tempLock = true;
			return r;
		}
	}

	ERROR_LOG(Log::CPU, "Out of temp regs! Might need to DiscardR() some");
	_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
	return -1;
}

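// Worked example: FPU reg f5 lives at byte offset (32 + 5) << 2 = 148, right after
// the 32 GPRs. A VFPU reg v is instead looked up through voffset[], which stores
// columns contiguously: offset = (32 + 32 + voffset[v]) << 2.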
int ArmRegCacheFPU::GetMipsRegOffset(MIPSReg r) {
	// These are offsets within the MIPSState structure. First come the GPRs, then the FPRs,
	// then the VFPU registers, and then the VFPU control registers.
	if (r < 0 || r >= 32 + 128 + NUM_TEMPS) {
		ERROR_LOG(Log::JIT, "Bad MIPS register %i, out of range", r);
		return 0; // or what?
	}

	if (r < 32 || r >= 32 + 128) {
		return (32 + r) << 2;
	} else {
		// r is between 32 and 32 + 128.
		return (32 + 32 + voffset[r - 32]) << 2;
	}
}

void ArmRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {
	mr[r1].spillLock = true;
	if (r2 != -1) mr[r2].spillLock = true;
	if (r3 != -1) mr[r3].spillLock = true;
	if (r4 != -1) mr[r4].spillLock = true;
}

// This is actually pretty slow with all 160 regs...
void ArmRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
	for (int i = 0; i < NUM_MIPSFPUREG; i++) {
		mr[i].spillLock = false;
	}
	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
		DiscardR(i);
	}
	for (int i = 0; i < NUM_ARMQUADS; i++) {
		qr[i].spillLock = false;
		if (qr[i].isTemp) {
			qr[i].isTemp = false;
			qr[i].sz = V_Invalid;
		}
	}
}

ARMReg ArmRegCacheFPU::R(int mipsReg) {
	if (mr[mipsReg].loc == ML_ARMREG) {
		return (ARMReg)(mr[mipsReg].reg + S0);
	} else {
		if (mipsReg < 32) {
			ERROR_LOG(Log::JIT, "FReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());
		} else if (mipsReg < 32 + 128) {
			ERROR_LOG(Log::JIT, "VReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());
		} else {
			ERROR_LOG(Log::JIT, "Tempreg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 128 - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());
		}
		return INVALID_REG; // BAAAD
	}
}

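// Register aliasing on 32-bit ARM: Qn = D(2n):D(2n+1), and for n < 8, Qn also
// aliases S(4n)..S(4n+3). Q8-Q15 (D16-D31) have no S-register view, which is why
// the upper VFPU registers can only be reached through NEON, as noted above.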
inline ARMReg QuadAsD(int quad) {
	return (ARMReg)(D0 + quad * 2);
}

inline ARMReg QuadAsQ(int quad) {
	return (ARMReg)(Q0 + quad);
}

bool MappableQ(int quad) {
	return quad >= 4;
}

void ArmRegCacheFPU::QLoad4x4(MIPSGPReg regPtr, int vquads[4]) {
	ERROR_LOG(Log::JIT, "QLoad4x4 not implemented");
	// TODO
}

void ArmRegCacheFPU::QFlush(int quad) {
	if (!MappableQ(quad)) {
		ERROR_LOG(Log::JIT, "Cannot flush non-mappable quad %i", quad);
		return;
	}

	if (qr[quad].isDirty && !qr[quad].isTemp) {
		INFO_LOG(Log::JIT, "Flushing Q%i (%s)", quad, GetVectorNotation(qr[quad].mipsVec, qr[quad].sz).c_str());

		ARMReg q = QuadAsQ(quad);
		// Unlike reads, when writing to the register file we need to be careful to write the correct
		// number of floats.

		switch (qr[quad].sz) {
		case V_Single:
			emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
			emit_->VST1_lane(F_32, q, R0, 0, true);
			// WARN_LOG(Log::JIT, "S: Falling back to individual flush: pc=%08x", js_->compilerPC);
			break;
		case V_Pair:
			if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1])) {
				// Can combine, it's a column!
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
				emit_->VST1(F_32, q, R0, 1, ALIGN_NONE);  // TODO: Allow ALIGN_64 when applicable
			} else {
				// WARN_LOG(Log::JIT, "P: Falling back to individual flush: pc=%08x", js_->compilerPC);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
				emit_->VST1_lane(F_32, q, R0, 0, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);
				emit_->VST1_lane(F_32, q, R0, 1, true);
			}
			break;
		case V_Triple:
			if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1], qr[quad].vregs[2])) {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
				emit_->VST1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE, REG_UPDATE);  // TODO: Allow ALIGN_64 when applicable
				emit_->VST1_lane(F_32, q, R0, 2, true);
			} else {
				// WARN_LOG(Log::JIT, "T: Falling back to individual flush: pc=%08x", js_->compilerPC);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
				emit_->VST1_lane(F_32, q, R0, 0, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);
				emit_->VST1_lane(F_32, q, R0, 1, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[2]), R1);
				emit_->VST1_lane(F_32, q, R0, 2, true);
			}
			break;
		case V_Quad:
			if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1], qr[quad].vregs[2], qr[quad].vregs[3])) {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
				emit_->VST1(F_32, QuadAsD(quad), R0, 2, ALIGN_NONE);  // TODO: Allow ALIGN_64 when applicable
			} else {
				// WARN_LOG(Log::JIT, "Q: Falling back to individual flush: pc=%08x", js_->compilerPC);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);
				emit_->VST1_lane(F_32, q, R0, 0, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);
				emit_->VST1_lane(F_32, q, R0, 1, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[2]), R1);
				emit_->VST1_lane(F_32, q, R0, 2, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[3]), R1);
				emit_->VST1_lane(F_32, q, R0, 3, true);
			}
			break;
		default:
			ERROR_LOG(Log::JIT, "Unknown quad size %i", qr[quad].sz);
			break;
		}

		qr[quad].isDirty = false;

		int n = GetNumVectorElements(qr[quad].sz);
		for (int i = 0; i < n; i++) {
			int vr = qr[quad].vregs[i];
			if (vr < 0 || vr >= 128) {  // Valid VFPU regs are 0-127.
				ERROR_LOG(Log::JIT, "Bad vr %i", vr);
			}
			FPURegMIPS &m = mr[32 + vr];
			m.loc = ML_MEM;
			m.lane = -1;
			m.reg = -1;
		}
	} else {
		if (qr[quad].isTemp) {
			WARN_LOG(Log::JIT, "Not flushing quad %i; dirty = %i, isTemp = %i", quad, qr[quad].isDirty, qr[quad].isTemp);
		}
	}

	qr[quad].isTemp = false;
	qr[quad].mipsVec = -1;
	qr[quad].sz = V_Invalid;
	memset(qr[quad].vregs, 0xFF, 4);
}

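// Victim selection below is deliberately simple for now: a clean quad scores 5 and a
// dirty one 0, so clean quads (which need no store) get evicted first. Per the
// comment inside, an age component may be added to the score later.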
int ArmRegCacheFPU::QGetFreeQuad(int start, int count, const char *reason) {
	// Search for a free quad. A quad is free if the first register in it is free.
	for (int i = 0; i < count; i++) {
		int q = (i + start) & 15;

		if (!MappableQ(q))
			continue;

		// Don't steal temp quads!
		if (qr[q].mipsVec == (int)INVALID_REG && !qr[q].isTemp) {
			// INFO_LOG(Log::JIT, "Free quad: %i", q);
			// Oh yeah! Free quad!
			return q;
		}
	}

	// Okay, find the "best scoring" reg to replace. Scoring algorithm TBD but may include some
	// sort of age.
	int bestQuad = -1;
	int bestScore = -1;
	for (int i = 0; i < count; i++) {
		int q = (i + start) & 15;

		if (!MappableQ(q))
			continue;
		if (qr[q].spillLock)
			continue;
		if (qr[q].isTemp)
			continue;

		int score = 0;
		if (!qr[q].isDirty) {
			score += 5;
		}

		if (score > bestScore) {
			bestQuad = q;
			bestScore = score;
		}
	}

	if (bestQuad == -1) {
		ERROR_LOG(Log::JIT, "Failed finding a free quad. Things will now go haywire!");
		return -1;
	} else {
		INFO_LOG(Log::JIT, "No register found in %i and the next %i, kicked out #%i (%s)", start, count, bestQuad, reason ? reason : "no reason");
		QFlush(bestQuad);
		return bestQuad;
	}
}

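// Temp quads are spill-locked and flagged isTemp on allocation; both flags are
// cleared again by ReleaseSpillLocksAndDiscardTemps(), and QFlush()/QGetFreeQuad()
// know never to write back or steal a temp.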
ARMReg ArmRegCacheFPU::QAllocTemp(VectorSize sz) {
	int q = QGetFreeQuad(8, 16, "allocating temporary");  // Prefer high quads as temps
	if (q < 0) {
		ERROR_LOG(Log::JIT, "Failed to allocate temp quad");
		q = 0;
	}
	qr[q].spillLock = true;
	qr[q].isTemp = true;
	qr[q].sz = sz;
	qr[q].isDirty = false;  // Doesn't matter, temps are never written back.

	INFO_LOG(Log::JIT, "Allocated temp quad %i", q);

	if (sz == V_Single || sz == V_Pair) {
		return D_0(ARMReg(Q0 + q));
	} else {
		return ARMReg(Q0 + q);
	}
}

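// "Consecutive" here means adjacent in the emulated register file's memory layout
// (voffset), not in VFPU register numbering. Thanks to the column-major reordering,
// a matrix column is consecutive and can be moved with a single VLD1/VST1.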
bool ArmRegCacheFPU::Consecutive(int v1, int v2) const {
	return (voffset[v1] + 1) == voffset[v2];
}

bool ArmRegCacheFPU::Consecutive(int v1, int v2, int v3) const {
	return Consecutive(v1, v2) && Consecutive(v2, v3);
}

bool ArmRegCacheFPU::Consecutive(int v1, int v2, int v3, int v4) const {
	return Consecutive(v1, v2) && Consecutive(v2, v3) && Consecutive(v3, v4);
}

void ArmRegCacheFPU::QMapMatrix(ARMReg *regs, int matrix, MatrixSize mz, int flags) {
	u8 vregs[4];
	if (flags & MAP_MTX_TRANSPOSED) {
		GetMatrixRows(matrix, mz, vregs);
	} else {
		GetMatrixColumns(matrix, mz, vregs);
	}

	// TODO: Zap existing mappings, reserve 4 consecutive regs, then do a fast load.
	int n = GetMatrixSide(mz);
	VectorSize vsz = GetVectorSize(mz);
	for (int i = 0; i < n; i++) {
		regs[i] = QMapReg(vregs[i], vsz, flags);
	}
}

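// QMapReg in outline: (1) scan the existing quad mappings and collect any that
// overlap the requested vector or sit outside the allowed quad range, (2) flush
// those, (3) grab a free quad via QGetFreeQuad, (4) if mapping dirty, flush stale
// scalar copies of the same vregs, (5) unless MAP_NOINIT, load the vector, in one
// VLD1 when the registers are consecutive in memory, else lane by lane, and
// (6) record the new mapping and spill-lock the quad.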
ARMReg ArmRegCacheFPU::QMapReg(int vreg, VectorSize sz, int flags) {
	qTime_++;

	int n = GetNumVectorElements(sz);
	u8 vregs[4];
	GetVectorRegs(vregs, sz, vreg);

	// Range of quads to consider.
	int start = 0;
	int count = 16;

	if (flags & MAP_PREFER_HIGH) {
		start = 8;
	} else if (flags & MAP_PREFER_LOW) {
		start = 4;
	} else if (flags & MAP_FORCE_LOW) {
		start = 4;
		count = 4;
	} else if (flags & MAP_FORCE_HIGH) {
		start = 8;
		count = 8;
	}

	// Let's check if they are all mapped in a quad somewhere.
	// At the same time, check for the quad already being mapped.
	// Later we can check for possible transposes as well.

	// First just loop over all registers. If it's here and not in range, or overlapped, kick.
	std::vector<int> quadsToFlush;
	for (int i = 0; i < 16; i++) {
		int q = (i + start) & 15;
		if (!MappableQ(q))
			continue;

		// Skip unmapped quads.
		if (qr[q].sz == V_Invalid)
			continue;

		// Check if completely there already. If so, set spill-lock, transfer dirty flag and exit.
		if (vreg == qr[q].mipsVec && sz == qr[q].sz) {
			if (i < count) {
				INFO_LOG(Log::JIT, "Quad already mapped: %i : %i (size %i)", q, vreg, sz);
				qr[q].isDirty = qr[q].isDirty || (flags & MAP_DIRTY);
				qr[q].spillLock = true;

				// Sanity check the vregs.
				for (int j = 0; j < n; j++) {
					if (vregs[j] != qr[q].vregs[j]) {
						ERROR_LOG(Log::JIT, "Sanity check failed: %i vs %i", vregs[j], qr[q].vregs[j]);
					}
				}

				return (ARMReg)(Q0 + q);
			} else {
				INFO_LOG(Log::JIT, "Quad already mapped at %i which is out of requested range [%i-%i) (count = %i), needs moving. For now we flush.", q, start, start + count, count);
				quadsToFlush.push_back(q);
				continue;
			}
		}

		// Check for any overlap. Overlap == flush.
		int origN = GetNumVectorElements(qr[q].sz);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < origN; b++) {
				if (vregs[a] == qr[q].vregs[b]) {
					quadsToFlush.push_back(q);
					goto doubleBreak;
				}
			}
		}
doubleBreak:
		;
	}

	// We didn't find the register in a reusable quad, but we got a list of quads to flush. Flush 'em.
	// Here we can check for opportunities to do a "transpose-flush" of row vectors, etc.
	if (!quadsToFlush.empty()) {
		INFO_LOG(Log::JIT, "New mapping %s collided with %d quads, flushing them.", GetVectorNotation(vreg, sz).c_str(), (int)quadsToFlush.size());
	}
	for (size_t i = 0; i < quadsToFlush.size(); i++) {
		QFlush(quadsToFlush[i]);
	}

	// Find where we want to map it, obeying the constraints we gave.
	int quad = QGetFreeQuad(start, count, "mapping");
	if (quad < 0)
		return INVALID_REG;

	// If parts of our register are elsewhere, and we are dirty, we need to flush them
	// before we reload in a new location.
	// This may be problematic if inputs overlap irregularly with output, say:
	//   vdot S700, R000, C000
	// It might still work by accident...
	if (flags & MAP_DIRTY) {
		for (int i = 0; i < n; i++) {
			FlushV(vregs[i]);
		}
	}

	qr[quad].sz = sz;
	qr[quad].mipsVec = vreg;

	if ((flags & MAP_NOINIT) != MAP_NOINIT) {
		// Okay, now we will try to load the whole thing in one go. This is possible
		// if it's a row and easy if it's a single.
		// Rows are rare, columns are common - but thanks to our register reordering,
		// columns are actually in-order in memory.
		switch (sz) {
		case V_Single:
			emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
			emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);
			break;
		case V_Pair:
			if (Consecutive(vregs[0], vregs[1])) {
				// Can combine, it's a column!
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
				emit_->VLD1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE);  // TODO: Allow ALIGN_64 when applicable
			} else {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);
			}
			break;
		case V_Triple:
			if (Consecutive(vregs[0], vregs[1], vregs[2])) {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
				emit_->VLD1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE, REG_UPDATE);  // TODO: Allow ALIGN_64 when applicable
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);
			} else {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[2]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);
			}
			break;
		case V_Quad:
			if (Consecutive(vregs[0], vregs[1], vregs[2], vregs[3])) {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
				emit_->VLD1(F_32, QuadAsD(quad), R0, 2, ALIGN_NONE);  // TODO: Allow ALIGN_64 when applicable
			} else {
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[2]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);
				emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[3]), R1);
				emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 3, true);
			}
			break;
		default:
			break;
		}
	}

	// OK, let's fill out the arrays to confirm that we have grabbed these registers.
	for (int i = 0; i < n; i++) {
		int mipsReg = 32 + vregs[i];
		mr[mipsReg].loc = ML_ARMREG;
		mr[mipsReg].reg = QuadAsQ(quad);
		mr[mipsReg].lane = i;
		qr[quad].vregs[i] = vregs[i];
	}
	qr[quad].isDirty = (flags & MAP_DIRTY) != 0;
	qr[quad].spillLock = true;

	INFO_LOG(Log::JIT, "Mapped Q%i to vfpu %i (%s), sz=%i, dirty=%i", quad, vreg, GetVectorNotation(vreg, sz).c_str(), (int)sz, qr[quad].isDirty);
	if (sz == V_Single || sz == V_Pair) {
		return D_0(QuadAsQ(quad));
	} else {
		return QuadAsQ(quad);
	}
}