CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM/ArmRegCacheFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cstring>1819#include "Common/CPUDetect.h"20#include "Common/Log.h"21#include "Core/MIPS/MIPS.h"22#include "Core/MIPS/ARM/ArmRegCacheFPU.h"23#include "Core/MIPS/ARM/ArmJit.h"24#include "Core/MIPS/MIPSTables.h"2526using namespace ArmGen;27using namespace ArmJitConstants;2829ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {}3031void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {32if (!initialReady) {33SetupInitialRegs();34initialReady = true;35}3637memcpy(ar, arInitial, sizeof(ar));38memcpy(mr, mrInitial, sizeof(mr));39pendingFlush = false;40}4142void ArmRegCacheFPU::SetupInitialRegs() {43for (int i = 0; i < NUM_ARMFPUREG; i++) {44arInitial[i].mipsReg = -1;45arInitial[i].isDirty = false;46}47for (int i = 0; i < NUM_MIPSFPUREG; i++) {48mrInitial[i].loc = ML_MEM;49mrInitial[i].reg = INVALID_REG;50mrInitial[i].spillLock = false;51mrInitial[i].tempLock = false;52}53for (int i = 0; i < NUM_ARMQUADS; i++) {54qr[i].isDirty = false;55qr[i].mipsVec = -1;56qr[i].sz = V_Invalid;57qr[i].spillLock = false;58qr[i].isTemp = false;59memset(qr[i].vregs, 0xff, 4);60}61}6263const ARMReg 
*ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {64// VFP mapping65// VFPU registers and regular FP registers are mapped interchangably on top of the standard66// 16 FPU registers.6768// NEON mapping69// We map FPU and VFPU registers entirely separately. FPU is mapped to 12 of the bottom 16 S registers.70// VFPU is mapped to the upper 48 regs, 32 of which can only be reached through NEON71// (or D16-D31 as doubles, but not relevant).72// Might consider shifting the split in the future, giving more regs to NEON allowing it to map more quads.7374// We should attempt to map scalars to low Q registers and wider things to high registers,75// as the NEON instructions are all 2-vector or 4-vector, they don't do scalar, we want to be76// able to use regular VFP instructions too.77static const ARMReg allocationOrderNEON[] = {78// Reserve four temp registers. Useful when building quads until we really figure out79// how to do that best.80S4, S5, S6, S7, // Q181S8, S9, S10, S11, // Q282S12, S13, S14, S15, // Q383S16, S17, S18, S19, // Q484S20, S21, S22, S23, // Q585S24, S25, S26, S27, // Q686S28, S29, S30, S31, // Q787// Q8-Q15 free for NEON tricks88};8990static const ARMReg allocationOrderNEONVFPU[] = {91// Reserve four temp registers. 
Useful when building quads until we really figure out92// how to do that best.93S4, S5, S6, S7, // Q194S8, S9, S10, S11, // Q295S12, S13, S14, S15, // Q396// Q4-Q15 free for VFPU97};9899// NOTE: It's important that S2/S3 are not allocated with bNEON, even if !useNEONVFPU.100// They are used by a few instructions, like vh2f.101if (jo_->useNEONVFPU) {102count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg);103return allocationOrderNEONVFPU;104} else {105count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);106return allocationOrderNEON;107}108}109110bool ArmRegCacheFPU::IsMapped(MIPSReg r) {111return mr[r].loc == ML_ARMREG;112}113114ARMReg ArmRegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) {115// INFO_LOG(Log::JIT, "FPR MapReg: %i flags=%i", mipsReg, mapFlags);116if (jo_->useNEONVFPU && mipsReg >= 32) {117ERROR_LOG(Log::JIT, "Cannot map VFPU registers to ARM VFP registers in NEON mode. PC=%08x", js_->compilerPC);118return S0;119}120121pendingFlush = true;122// Let's see if it's already mapped. If so we just need to update the dirty flag.123// We don't need to check for ML_NOINIT because we assume that anyone who maps124// with that flag immediately writes a "known" value to the register.125if (mr[mipsReg].loc == ML_ARMREG) {126if (ar[mr[mipsReg].reg].mipsReg != mipsReg) {127ERROR_LOG(Log::JIT, "Reg mapping out of sync! MR %i", mipsReg);128}129if (mapFlags & MAP_DIRTY) {130ar[mr[mipsReg].reg].isDirty = true;131}132//INFO_LOG(Log::JIT, "Already mapped %i to %i", mipsReg, mr[mipsReg].reg);133return (ARMReg)(mr[mipsReg].reg + S0);134}135136// Okay, not mapped, so we need to allocate an ARM register.137138int allocCount;139const ARMReg *allocOrder = GetMIPSAllocationOrder(allocCount);140141allocate:142for (int i = 0; i < allocCount; i++) {143int reg = allocOrder[i] - S0;144145if (ar[reg].mipsReg == -1) {146// That means it's free. Grab it, and load the value into it (if requested).147ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? 
true : false;148if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {149if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) {150emit_->VLDR((ARMReg)(reg + S0), CTXREG, GetMipsRegOffset(mipsReg));151}152}153ar[reg].mipsReg = mipsReg;154mr[mipsReg].loc = ML_ARMREG;155mr[mipsReg].reg = reg;156//INFO_LOG(Log::JIT, "Mapped %i to %i", mipsReg, mr[mipsReg].reg);157return (ARMReg)(reg + S0);158}159}160161162// Still nothing. Let's spill a reg and goto 10.163// TODO: Use age or something to choose which register to spill?164// TODO: Spill dirty regs first? or opposite?165int bestToSpill = -1;166for (int i = 0; i < allocCount; i++) {167int reg = allocOrder[i] - S0;168if (ar[reg].mipsReg != -1 && (mr[ar[reg].mipsReg].spillLock || mr[ar[reg].mipsReg].tempLock))169continue;170bestToSpill = reg;171break;172}173174if (bestToSpill != -1) {175FlushArmReg((ARMReg)(S0 + bestToSpill));176goto allocate;177}178179// Uh oh, we have all them spilllocked....180ERROR_LOG(Log::JIT, "Out of spillable registers at PC %08x!!!", js_->compilerPC);181return INVALID_REG;182}183184void ArmRegCacheFPU::MapInIn(MIPSReg rd, MIPSReg rs) {185SpillLock(rd, rs);186MapReg(rd);187MapReg(rs);188ReleaseSpillLock(rd);189ReleaseSpillLock(rs);190}191192void ArmRegCacheFPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) {193SpillLock(rd, rs);194bool load = !avoidLoad || rd == rs;195MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT);196MapReg(rs);197ReleaseSpillLock(rd);198ReleaseSpillLock(rs);199}200201void ArmRegCacheFPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) {202SpillLock(rd, rs, rt);203bool load = !avoidLoad || (rd == rs || rd == rt);204MapReg(rd, load ? 
MAP_DIRTY : MAP_NOINIT);205MapReg(rt);206MapReg(rs);207ReleaseSpillLock(rd);208ReleaseSpillLock(rs);209ReleaseSpillLock(rt);210}211212void ArmRegCacheFPU::SpillLockV(const u8 *v, VectorSize sz) {213for (int i = 0; i < GetNumVectorElements(sz); i++) {214vr[v[i]].spillLock = true;215}216}217218void ArmRegCacheFPU::SpillLockV(int vec, VectorSize sz) {219u8 v[4];220GetVectorRegs(v, sz, vec);221SpillLockV(v, sz);222}223224void ArmRegCacheFPU::MapRegV(int vreg, int flags) {225MapReg(vreg + 32, flags);226}227228void ArmRegCacheFPU::LoadToRegV(ARMReg armReg, int vreg) {229if (vr[vreg].loc == ML_ARMREG) {230emit_->VMOV(armReg, (ARMReg)(S0 + vr[vreg].reg));231} else {232MapRegV(vreg);233emit_->VMOV(armReg, V(vreg));234}235}236237void ArmRegCacheFPU::MapRegsAndSpillLockV(int vec, VectorSize sz, int flags) {238u8 v[4];239GetVectorRegs(v, sz, vec);240SpillLockV(v, sz);241for (int i = 0; i < GetNumVectorElements(sz); i++) {242MapRegV(v[i], flags);243}244}245246void ArmRegCacheFPU::MapRegsAndSpillLockV(const u8 *v, VectorSize sz, int flags) {247SpillLockV(v, sz);248for (int i = 0; i < GetNumVectorElements(sz); i++) {249MapRegV(v[i], flags);250}251}252253void ArmRegCacheFPU::MapInInV(int vs, int vt) {254SpillLockV(vs);255SpillLockV(vt);256MapRegV(vs);257MapRegV(vt);258ReleaseSpillLockV(vs);259ReleaseSpillLockV(vt);260}261262void ArmRegCacheFPU::MapDirtyInV(int vd, int vs, bool avoidLoad) {263bool load = !avoidLoad || (vd == vs);264SpillLockV(vd);265SpillLockV(vs);266MapRegV(vd, load ? MAP_DIRTY : MAP_NOINIT);267MapRegV(vs);268ReleaseSpillLockV(vd);269ReleaseSpillLockV(vs);270}271272void ArmRegCacheFPU::MapDirtyInInV(int vd, int vs, int vt, bool avoidLoad) {273bool load = !avoidLoad || (vd == vs || vd == vt);274SpillLockV(vd);275SpillLockV(vs);276SpillLockV(vt);277MapRegV(vd, load ? 
MAP_DIRTY : MAP_NOINIT);278MapRegV(vs);279MapRegV(vt);280ReleaseSpillLockV(vd);281ReleaseSpillLockV(vs);282ReleaseSpillLockV(vt);283}284285void ArmRegCacheFPU::FlushArmReg(ARMReg r) {286if (r >= S0 && r <= S31) {287int reg = r - S0;288if (ar[reg].mipsReg == -1) {289// Nothing to do, reg not mapped.290return;291}292if (ar[reg].mipsReg != -1) {293if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_ARMREG)294{295//INFO_LOG(Log::JIT, "Flushing ARM reg %i", reg);296emit_->VSTR(r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));297}298// IMMs won't be in an ARM reg.299mr[ar[reg].mipsReg].loc = ML_MEM;300mr[ar[reg].mipsReg].reg = INVALID_REG;301} else {302ERROR_LOG(Log::JIT, "Dirty but no mipsreg?");303}304ar[reg].isDirty = false;305ar[reg].mipsReg = -1;306} else if (r >= D0 && r <= D31) {307// TODO: Convert to S regs and flush them individually.308} else if (r >= Q0 && r <= Q15) {309QFlush(r);310}311}312313void ArmRegCacheFPU::FlushV(MIPSReg r) {314FlushR(r + 32);315}316317/*318void ArmRegCacheFPU::FlushQWithV(MIPSReg r) {319// Look for it in all the quads. 
If it's in any, flush that quad clean.320int flushCount = 0;321for (int i = 0; i < MAX_ARMQUADS; i++) {322if (qr[i].sz == V_Invalid)323continue;324325int n = qr[i].sz;326bool flushThis = false;327for (int j = 0; j < n; j++) {328if (qr[i].vregs[j] == r) {329flushThis = true;330}331}332333if (flushThis) {334QFlush(i);335flushCount++;336}337}338339if (flushCount > 1) {340WARN_LOG(Log::JIT, "ERROR: More than one quad was flushed to flush reg %i", r);341}342}343*/344345void ArmRegCacheFPU::FlushR(MIPSReg r) {346switch (mr[r].loc) {347case ML_IMM:348// IMM is always "dirty".349// IMM is not allowed for FP (yet).350ERROR_LOG(Log::JIT, "Imm in FP register?");351break;352353case ML_ARMREG:354if (mr[r].reg == INVALID_REG) {355ERROR_LOG(Log::JIT, "FlushR: MipsReg had bad ArmReg");356}357358if (mr[r].reg >= Q0 && mr[r].reg <= Q15) {359// This should happen rarely, but occasionally we need to flush a single stray360// mipsreg that's been part of a quad.361int quad = mr[r].reg - Q0;362if (qr[quad].isDirty) {363WARN_LOG(Log::JIT, "FlushR found quad register %i - PC=%08x", quad, js_->compilerPC);364emit_->ADDI2R(R0, CTXREG, GetMipsRegOffset(r), R1);365emit_->VST1_lane(F_32, (ARMReg)mr[r].reg, R0, mr[r].lane, true);366}367} else {368if (ar[mr[r].reg].isDirty) {369//INFO_LOG(Log::JIT, "Flushing dirty reg %i", mr[r].reg);370emit_->VSTR((ARMReg)(mr[r].reg + S0), CTXREG, GetMipsRegOffset(r));371ar[mr[r].reg].isDirty = false;372}373ar[mr[r].reg].mipsReg = -1;374}375break;376377case ML_MEM:378// Already there, nothing to do.379break;380381default:382//BAD383break;384}385mr[r].loc = ML_MEM;386mr[r].reg = (int)INVALID_REG;387}388389// Scalar only. 
Need a similar one for sequential Q vectors.390int ArmRegCacheFPU::FlushGetSequential(int a) {391int c = 1;392int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);393a++;394while (a < 32) {395if (!ar[a].isDirty || ar[a].mipsReg == -1)396break;397int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);398if (mipsOffset != lastMipsOffset + 4) {399break;400}401402lastMipsOffset = mipsOffset;403a++;404c++;405}406return c;407}408409void ArmRegCacheFPU::FlushAll() {410if (!pendingFlush) {411// Nothing allocated. FPU regs are not nearly as common as GPR.412return;413}414415// Discard temps!416for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) {417DiscardR(i);418}419420// Flush quads!421// These could also use sequential detection.422for (int i = 4; i < NUM_ARMQUADS; i++) {423QFlush(i);424}425426// Loop through the ARM registers, then use GetMipsRegOffset to determine if MIPS registers are427// sequential. This is necessary because we store VFPU registers in a staggered order to get428// columns sequential (most VFPU math in nearly all games is in columns, not rows).429430int numArmRegs;431// We rely on the allocation order being sequential.432const ARMReg baseReg = GetMIPSAllocationOrder(numArmRegs)[0];433434for (int i = 0; i < numArmRegs; i++) {435int a = (baseReg - S0) + i;436int m = ar[a].mipsReg;437438if (ar[a].isDirty) {439if (m == -1) {440INFO_LOG(Log::JIT, "ARM reg %i is dirty but has no mipsreg", a);441continue;442}443444int c = FlushGetSequential(a);445if (c == 1) {446// INFO_LOG(Log::JIT, "Got single register: %i (%i)", a, m);447emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));448} else if (c == 2) {449// Probably not worth using VSTMIA for two.450int offset = GetMipsRegOffset(m);451emit_->VSTR((ARMReg)(a + S0), CTXREG, offset);452emit_->VSTR((ARMReg)(a + 1 + S0), CTXREG, offset + 4);453} else {454// INFO_LOG(Log::JIT, "Got sequence: %i at %i (%i)", c, a, m);455emit_->ADDI2R(SCRATCHREG1, CTXREG, GetMipsRegOffset(m), SCRATCHREG2);456// INFO_LOG(Log::JIT, "VSTMIA 
R0, %i, %i", a, c);457emit_->VSTMIA(SCRATCHREG1, false, (ARMReg)(S0 + a), c);458}459460// Skip past, and mark as non-dirty.461for (int j = 0; j < c; j++) {462int b = a + j;463mr[ar[b].mipsReg].loc = ML_MEM;464mr[ar[b].mipsReg].reg = (int)INVALID_REG;465ar[a + j].mipsReg = -1;466ar[a + j].isDirty = false;467}468i += c - 1;469} else {470if (m != -1) {471mr[m].loc = ML_MEM;472mr[m].reg = (int)INVALID_REG;473}474ar[a].mipsReg = -1;475// already not dirty476}477}478479// Sanity check480for (int i = 0; i < NUM_ARMFPUREG; i++) {481if (ar[i].mipsReg != -1) {482ERROR_LOG(Log::JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);483}484}485pendingFlush = false;486}487488void ArmRegCacheFPU::DiscardR(MIPSReg r) {489switch (mr[r].loc) {490case ML_IMM:491// IMM is always "dirty".492// IMM is not allowed for FP (yet).493ERROR_LOG(Log::JIT, "Imm in FP register?");494break;495496case ML_ARMREG:497if (mr[r].reg == INVALID_REG) {498ERROR_LOG(Log::JIT, "DiscardR: MipsReg had bad ArmReg");499} else {500// Note that we DO NOT write it back here. That's the whole point of Discard.501ar[mr[r].reg].isDirty = false;502ar[mr[r].reg].mipsReg = -1;503}504break;505506case ML_MEM:507// Already there, nothing to do.508break;509510default:511//BAD512break;513}514mr[r].loc = ML_MEM;515mr[r].reg = (int)INVALID_REG;516mr[r].tempLock = false;517mr[r].spillLock = false;518}519520bool ArmRegCacheFPU::IsTempX(ARMReg r) const {521return ar[r - S0].mipsReg >= TEMP0;522}523524int ArmRegCacheFPU::GetTempR() {525if (jo_->useNEONVFPU) {526ERROR_LOG(Log::JIT, "VFP temps not allowed in NEON mode");527return 0;528}529pendingFlush = true;530for (int r = TEMP0; r < TEMP0 + NUM_TEMPS; ++r) {531if (mr[r].loc == ML_MEM && !mr[r].tempLock) {532mr[r].tempLock = true;533return r;534}535}536537ERROR_LOG(Log::CPU, "Out of temp regs! 
Might need to DiscardR() some");538_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");539return -1;540}541542int ArmRegCacheFPU::GetMipsRegOffset(MIPSReg r) {543// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.544if (r < 0 || r > 32 + 128 + NUM_TEMPS) {545ERROR_LOG(Log::JIT, "bad mips register %i, out of range", r);546return 0; // or what?547}548549if (r < 32 || r >= 32 + 128) {550return (32 + r) << 2;551} else {552// r is between 32 and 128 + 32553return (32 + 32 + voffset[r - 32]) << 2;554}555}556557void ArmRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {558mr[r1].spillLock = true;559if (r2 != -1) mr[r2].spillLock = true;560if (r3 != -1) mr[r3].spillLock = true;561if (r4 != -1) mr[r4].spillLock = true;562}563564// This is actually pretty slow with all the 160 regs...565void ArmRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {566for (int i = 0; i < NUM_MIPSFPUREG; i++) {567mr[i].spillLock = false;568}569for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {570DiscardR(i);571}572for (int i = 0; i < NUM_ARMQUADS; i++) {573qr[i].spillLock = false;574if (qr[i].isTemp) {575qr[i].isTemp = false;576qr[i].sz = V_Invalid;577}578}579}580581ARMReg ArmRegCacheFPU::R(int mipsReg) {582if (mr[mipsReg].loc == ML_ARMREG) {583return (ARMReg)(mr[mipsReg].reg + S0);584} else {585if (mipsReg < 32) {586ERROR_LOG(Log::JIT, "FReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());587} else if (mipsReg < 32 + 128) {588ERROR_LOG(Log::JIT, "VReg %i not in ARM reg. compilerPC = %08x : %s", mipsReg - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());589} else {590ERROR_LOG(Log::JIT, "Tempreg %i not in ARM reg. 
compilerPC = %08x : %s", mipsReg - 128 - 32, js_->compilerPC, MIPSDisasmAt(js_->compilerPC).c_str());591}592return INVALID_REG; // BAAAD593}594}595596inline ARMReg QuadAsD(int quad) {597return (ARMReg)(D0 + quad * 2);598}599600inline ARMReg QuadAsQ(int quad) {601return (ARMReg)(Q0 + quad);602}603604bool MappableQ(int quad) {605return quad >= 4;606}607608void ArmRegCacheFPU::QLoad4x4(MIPSGPReg regPtr, int vquads[4]) {609ERROR_LOG(Log::JIT, "QLoad4x4 not implemented");610// TODO611}612613void ArmRegCacheFPU::QFlush(int quad) {614if (!MappableQ(quad)) {615ERROR_LOG(Log::JIT, "Cannot flush non-mappable quad %i", quad);616return;617}618619if (qr[quad].isDirty && !qr[quad].isTemp) {620INFO_LOG(Log::JIT, "Flushing Q%i (%s)", quad, GetVectorNotation(qr[quad].mipsVec, qr[quad].sz).c_str());621622ARMReg q = QuadAsQ(quad);623// Unlike reads, when writing to the register file we need to be careful to write the correct624// number of floats.625626switch (qr[quad].sz) {627case V_Single:628emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);629emit_->VST1_lane(F_32, q, R0, 0, true);630// WARN_LOG(Log::JIT, "S: Falling back to individual flush: pc=%08x", js_->compilerPC);631break;632case V_Pair:633if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1])) {634// Can combine, it's a column!635emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);636emit_->VST1(F_32, q, R0, 1, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable637} else {638// WARN_LOG(Log::JIT, "P: Falling back to individual flush: pc=%08x", js_->compilerPC);639emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);640emit_->VST1_lane(F_32, q, R0, 0, true);641emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);642emit_->VST1_lane(F_32, q, R0, 1, true);643}644break;645case V_Triple:646if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1], qr[quad].vregs[2])) {647emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);648emit_->VST1(F_32, 
QuadAsD(quad), R0, 1, ALIGN_NONE, REG_UPDATE); // TODO: Allow ALIGN_64 when applicable649emit_->VST1_lane(F_32, q, R0, 2, true);650} else {651// WARN_LOG(Log::JIT, "T: Falling back to individual flush: pc=%08x", js_->compilerPC);652emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);653emit_->VST1_lane(F_32, q, R0, 0, true);654emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);655emit_->VST1_lane(F_32, q, R0, 1, true);656emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[2]), R1);657emit_->VST1_lane(F_32, q, R0, 2, true);658}659break;660case V_Quad:661if (Consecutive(qr[quad].vregs[0], qr[quad].vregs[1], qr[quad].vregs[2], qr[quad].vregs[3])) {662emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);663emit_->VST1(F_32, QuadAsD(quad), R0, 2, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable664} else {665// WARN_LOG(Log::JIT, "Q: Falling back to individual flush: pc=%08x", js_->compilerPC);666emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[0]), R1);667emit_->VST1_lane(F_32, q, R0, 0, true);668emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[1]), R1);669emit_->VST1_lane(F_32, q, R0, 1, true);670emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[2]), R1);671emit_->VST1_lane(F_32, q, R0, 2, true);672emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(qr[quad].vregs[3]), R1);673emit_->VST1_lane(F_32, q, R0, 3, true);674}675break;676default:677ERROR_LOG(Log::JIT, "Unknown quad size %i", qr[quad].sz);678break;679}680681qr[quad].isDirty = false;682683int n = GetNumVectorElements(qr[quad].sz);684for (int i = 0; i < n; i++) {685int vr = qr[quad].vregs[i];686if (vr < 0 || vr > 128) {687ERROR_LOG(Log::JIT, "Bad vr %i", vr);688}689FPURegMIPS &m = mr[32 + vr];690m.loc = ML_MEM;691m.lane = -1;692m.reg = -1;693}694695} else {696if (qr[quad].isTemp) {697WARN_LOG(Log::JIT, "Not flushing quad %i; dirty = %i, isTemp = %i", quad, qr[quad].isDirty, qr[quad].isTemp);698}699}700701qr[quad].isTemp = 
false;702qr[quad].mipsVec = -1;703qr[quad].sz = V_Invalid;704memset(qr[quad].vregs, 0xFF, 4);705}706707int ArmRegCacheFPU::QGetFreeQuad(int start, int count, const char *reason) {708// Search for a free quad. A quad is free if the first register in it is free.709for (int i = 0; i < count; i++) {710int q = (i + start) & 15;711712if (!MappableQ(q))713continue;714715// Don't steal temp quads!716if (qr[q].mipsVec == (int)INVALID_REG && !qr[q].isTemp) {717// INFO_LOG(Log::JIT, "Free quad: %i", q);718// Oh yeah! Free quad!719return q;720}721}722723// Okay, find the "best scoring" reg to replace. Scoring algorithm TBD but may include some724// sort of age.725int bestQuad = -1;726int bestScore = -1;727for (int i = 0; i < count; i++) {728int q = (i + start) & 15;729730if (!MappableQ(q))731continue;732if (qr[q].spillLock)733continue;734if (qr[q].isTemp)735continue;736737int score = 0;738if (!qr[q].isDirty) {739score += 5;740}741742if (score > bestScore) {743bestQuad = q;744bestScore = score;745}746}747748if (bestQuad == -1) {749ERROR_LOG(Log::JIT, "Failed finding a free quad. Things will now go haywire!");750return -1;751} else {752INFO_LOG(Log::JIT, "No register found in %i and the next %i, kicked out #%i (%s)", start, count, bestQuad, reason ? 
reason : "no reason");753QFlush(bestQuad);754return bestQuad;755}756}757758ARMReg ArmRegCacheFPU::QAllocTemp(VectorSize sz) {759int q = QGetFreeQuad(8, 16, "allocating temporary"); // Prefer high quads as temps760if (q < 0) {761ERROR_LOG(Log::JIT, "Failed to allocate temp quad");762q = 0;763}764qr[q].spillLock = true;765qr[q].isTemp = true;766qr[q].sz = sz;767qr[q].isDirty = false; // doesn't matter768769INFO_LOG(Log::JIT, "Allocated temp quad %i", q);770771if (sz == V_Single || sz == V_Pair) {772return D_0(ARMReg(Q0 + q));773} else {774return ARMReg(Q0 + q);775}776}777778bool ArmRegCacheFPU::Consecutive(int v1, int v2) const {779return (voffset[v1] + 1) == voffset[v2];780}781782bool ArmRegCacheFPU::Consecutive(int v1, int v2, int v3) const {783return Consecutive(v1, v2) && Consecutive(v2, v3);784}785786bool ArmRegCacheFPU::Consecutive(int v1, int v2, int v3, int v4) const {787return Consecutive(v1, v2) && Consecutive(v2, v3) && Consecutive(v3, v4);788}789790void ArmRegCacheFPU::QMapMatrix(ARMReg *regs, int matrix, MatrixSize mz, int flags) {791u8 vregs[4];792if (flags & MAP_MTX_TRANSPOSED) {793GetMatrixRows(matrix, mz, vregs);794} else {795GetMatrixColumns(matrix, mz, vregs);796}797798// TODO: Zap existing mappings, reserve 4 consecutive regs, then do a fast load.799int n = GetMatrixSide(mz);800VectorSize vsz = GetVectorSize(mz);801for (int i = 0; i < n; i++) {802regs[i] = QMapReg(vregs[i], vsz, flags);803}804}805806ARMReg ArmRegCacheFPU::QMapReg(int vreg, VectorSize sz, int flags) {807qTime_++;808809int n = GetNumVectorElements(sz);810u8 vregs[4];811GetVectorRegs(vregs, sz, vreg);812813// Range of registers to consider814int start = 0;815int count = 16;816817if (flags & MAP_PREFER_HIGH) {818start = 8;819} else if (flags & MAP_PREFER_LOW) {820start = 4;821} else if (flags & MAP_FORCE_LOW) {822start = 4;823count = 4;824} else if (flags & MAP_FORCE_HIGH) {825start = 8;826count = 8;827}828829// Let's check if they are all mapped in a quad somewhere.830// At the same 
time, check for the quad already being mapped.831// Later we can check for possible transposes as well.832833// First just loop over all registers. If it's here and not in range, or overlapped, kick.834std::vector<int> quadsToFlush;835for (int i = 0; i < 16; i++) {836int q = (i + start) & 15;837if (!MappableQ(q))838continue;839840// Skip unmapped quads.841if (qr[q].sz == V_Invalid)842continue;843844// Check if completely there already. If so, set spill-lock, transfer dirty flag and exit.845if (vreg == qr[q].mipsVec && sz == qr[q].sz) {846if (i < count) {847INFO_LOG(Log::JIT, "Quad already mapped: %i : %i (size %i)", q, vreg, sz);848qr[q].isDirty = qr[q].isDirty || (flags & MAP_DIRTY);849qr[q].spillLock = true;850851// Sanity check vregs852for (int i = 0; i < n; i++) {853if (vregs[i] != qr[q].vregs[i]) {854ERROR_LOG(Log::JIT, "Sanity check failed: %i vs %i", vregs[i], qr[q].vregs[i]);855}856}857858return (ARMReg)(Q0 + q);859} else {860INFO_LOG(Log::JIT, "Quad already mapped at %i which is out of requested range [%i-%i) (count = %i), needs moving. For now we flush.", q, start, start+count, count);861quadsToFlush.push_back(q);862continue;863}864}865866// Check for any overlap. Overlap == flush.867int origN = GetNumVectorElements(qr[q].sz);868for (int a = 0; a < n; a++) {869for (int b = 0; b < origN; b++) {870if (vregs[a] == qr[q].vregs[b]) {871quadsToFlush.push_back(q);872goto doubleBreak;873}874}875}876doubleBreak:877;878}879880// We didn't find the extra register, but we got a list of regs to flush. 
Flush 'em.881// Here we can check for opportunities to do a "transpose-flush" of row vectors, etc.882if (!quadsToFlush.empty()) {883INFO_LOG(Log::JIT, "New mapping %s collided with %d quads, flushing them.", GetVectorNotation(vreg, sz).c_str(), (int)quadsToFlush.size());884}885for (size_t i = 0; i < quadsToFlush.size(); i++) {886QFlush(quadsToFlush[i]);887}888889// Find where we want to map it, obeying the constraints we gave.890int quad = QGetFreeQuad(start, count, "mapping");891if (quad < 0)892return INVALID_REG;893894// If parts of our register are elsewhere, and we are dirty, we need to flush them895// before we reload in a new location.896// This may be problematic if inputs overlap irregularly with output, say:897// vdot S700, R000, C000898// It might still work by accident...899if (flags & MAP_DIRTY) {900for (int i = 0; i < n; i++) {901FlushV(vregs[i]);902}903}904905qr[quad].sz = sz;906qr[quad].mipsVec = vreg;907908if ((flags & MAP_NOINIT) != MAP_NOINIT) {909// Okay, now we will try to load the whole thing in one go. 
This is possible910// if it's a row and easy if it's a single.911// Rows are rare, columns are common - but thanks to our register reordering,912// columns are actually in-order in memory.913switch (sz) {914case V_Single:915emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);916emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);917break;918case V_Pair:919if (Consecutive(vregs[0], vregs[1])) {920// Can combine, it's a column!921emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);922emit_->VLD1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable923} else {924emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);925emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);926emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);927emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);928}929break;930case V_Triple:931if (Consecutive(vregs[0], vregs[1], vregs[2])) {932emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);933emit_->VLD1(F_32, QuadAsD(quad), R0, 1, ALIGN_NONE, REG_UPDATE); // TODO: Allow ALIGN_64 when applicable934emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);935} else {936emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);937emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);938emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);939emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, true);940emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[2]), R1);941emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);942}943break;944case V_Quad:945if (Consecutive(vregs[0], vregs[1], vregs[2], vregs[3])) {946emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);947emit_->VLD1(F_32, QuadAsD(quad), R0, 2, ALIGN_NONE); // TODO: Allow ALIGN_64 when applicable948} else {949emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[0]), R1);950emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 0, true);951emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[1]), R1);952emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 1, 
true);953emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[2]), R1);954emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 2, true);955emit_->ADDI2R(R0, CTXREG, GetMipsRegOffsetV(vregs[3]), R1);956emit_->VLD1_lane(F_32, QuadAsQ(quad), R0, 3, true);957}958break;959default:960;961}962}963964// OK, let's fill out the arrays to confirm that we have grabbed these registers.965for (int i = 0; i < n; i++) {966int mipsReg = 32 + vregs[i];967mr[mipsReg].loc = ML_ARMREG;968mr[mipsReg].reg = QuadAsQ(quad);969mr[mipsReg].lane = i;970qr[quad].vregs[i] = vregs[i];971}972qr[quad].isDirty = (flags & MAP_DIRTY) != 0;973qr[quad].spillLock = true;974975INFO_LOG(Log::JIT, "Mapped Q%i to vfpu %i (%s), sz=%i, dirty=%i", quad, vreg, GetVectorNotation(vreg, sz).c_str(), (int)sz, qr[quad].isDirty);976if (sz == V_Single || sz == V_Pair) {977return D_0(QuadAsQ(quad));978} else {979return QuadAsQ(quad);980}981}982983984985