CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/x86/RegCacheFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <cstring>
#include <emmintrin.h>

#include "Common/Log.h"
#include "Common/x64Emitter.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/x86/Jit.h"
#include "Core/MIPS/x86/RegCache.h"
#include "Core/MIPS/x86/RegCacheFPU.h"

using namespace Gen;
using namespace X64JitConstants;

FPURegCache::FPURegCache() {
	// vregs aliases into regs: MIPS FPRs occupy indices [0, 32), VFPU regs
	// (and temps) follow at index 32 and up, so vregs[v] == regs[v + 32].
	vregs = regs + 32;
}

// Reset the cache for a new block: restore the precomputed "everything in
// memory, nothing dirty" initial state (built lazily on first call).
void FPURegCache::Start(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo, MIPSAnalyst::AnalysisResults &stats, bool useRip) {
	mips_ = mipsState;
	useRip_ = useRip;
	if (!initialReady) {
		SetupInitialRegs();
		initialReady = true;
	}

	memcpy(xregs, xregsInitial, sizeof(xregs));
	memcpy(regs, regsInitial, sizeof(regs));
	pendingFlush = false;

	js_ = js;
	jo_ = jo;
}

// Build the template initial state copied in by Start(): every x86 reg free
// and clean, every MIPS reg located at its home slot in MIPSState memory.
void FPURegCache::SetupInitialRegs() {
	for (int i = 0; i < NUM_X_FPREGS; i++) {
		memset(xregsInitial[i].mipsRegs, -1, sizeof(xregsInitial[i].mipsRegs));
		xregsInitial[i].dirty = false;
	}
	memset(regsInitial, 0, sizeof(regsInitial));
	// FPRs [0, 32) are contiguous floats, so compute one base OpArg and step
	// it instead of calling GetDefaultLocation 32 times.
	OpArg base = GetDefaultLocation(0);
	for (int i = 0; i < 32; i++) {
		regsInitial[i].location = base;
		base.IncreaseOffset(sizeof(float));
	}
	// VFPU regs [32, 32+128) are scattered per voffset[], so each needs its
	// own location lookup.
	for (int i = 32; i < 32 + 128; i++) {
		regsInitial[i].location = GetDefaultLocation(i);
	}
	// Temps after the VFPU regs are contiguous again.
	base = GetDefaultLocation(32 + 128);
	for (int i = 32 + 128; i < NUM_MIPS_FPRS; i++) {
		regsInitial[i].location = base;
		base.IncreaseOffset(sizeof(float));
	}
}

// Lock up to four regs against spilling; 0xFF marks an unused slot.
void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
	regs[p1].locked++;
	if (p2 != 0xFF) regs[p2].locked++;
	if (p3 != 0xFF) regs[p3].locked++;
	if (p4 != 0xFF) regs[p4].locked++;
}

// Spill-lock each element of an already-expanded vector reg list.
void FPURegCache::SpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked++;
	}
}

// Spill-lock a vector given by its base reg; expands to elements first.
void FPURegCache::SpillLockV(int vec, VectorSize sz) {
	u8 r[4];
	GetVectorRegs(r, sz, vec);
	SpillLockV(r, sz);
}

// Fully release (not just decrement) the spill locks on a vector.
void FPURegCache::ReleaseSpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked = 0;
	}
}

// Decrement one reg's lock count (undo a single SpillLock).
void FPURegCache::ReduceSpillLock(int mipsreg) {
	regs[mipsreg].locked--;
}

// Decrement each element's lock count (undo a single SpillLockV).
void FPURegCache::ReduceSpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked--;
	}
}

// Store oldreg to memory, then remap its x86 register to hold newreg instead
// (marked dirty). Used when a value is computed in place of another.
void FPURegCache::FlushRemap(int oldreg, int newreg) {
	OpArg oldLocation = regs[oldreg].location;
	_assert_msg_(oldLocation.IsSimpleReg(), "FlushRemap: Must already be in an x86 SSE register");
	_assert_msg_(regs[oldreg].lane == 0, "FlushRemap only supports FPR registers");

	X64Reg xr = oldLocation.GetSimpleReg();
	if (oldreg == newreg) {
		xregs[xr].dirty = true;
		return;
	}

	StoreFromRegister(oldreg);

	// Now, if newreg already was mapped somewhere, get rid of that.
	DiscardR(newreg);

	// Now, take over the old register.
	regs[newreg].location = oldLocation;
	regs[newreg].away = true;
	regs[newreg].locked = true;
	regs[newreg].lane = 0;
	xregs[xr].mipsReg = newreg;
	xregs[xr].dirty = true;
}

// Map a single VFPU reg (vregs index) into an x86 reg.
// Note: MAP_NOINIT is a multi-bit mask, hence the != MAP_NOINIT comparison —
// loading is skipped only when ALL noinit bits are set.
void FPURegCache::MapRegV(int vreg, int flags) {
	MapReg(vreg + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
}

// Map every element of a vector (by base reg) into individual x86 regs.
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
	u8 r[4];
	GetVectorRegs(r, sz, vec);
	SpillLockV(r, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
	}
	if ((flags & MAP_NOLOCK) != 0) {
		// We have to lock so the sz won't spill, so we unlock after.
		// If they were already locked, we only reduce the lock we added above.
		ReduceSpillLockV(r, sz);
	}
}

// Same as above, but for an already-expanded element list.
void FPURegCache::MapRegsV(const u8 *r, VectorSize sz, int flags) {
	SpillLockV(r, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
	}
	if ((flags & MAP_NOLOCK) != 0) {
		// We have to lock so the sz won't spill, so we unlock after.
		// If they were already locked, we only reduce the lock we added above.
		ReduceSpillLockV(r, sz);
	}
}

// True if the whole vector is already mapped as ONE SIMD x86 reg, with each
// element in the expected lane (lane is 1-based; 0 means "not SIMD").
bool FPURegCache::IsMappedVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	// Make sure the first reg is at least mapped in the right place.
	if (!IsMappedVS(v[0]))
		return false;
	if (vregs[v[0]].lane != 1)
		return false;

	// And make sure the rest are mapped to the same reg in the right positions.
	X64Reg xr = VSX(v);
	for (int i = 1; i < n; ++i) {
		u8 vi = v[i];
		if (!IsMappedVS(vi) || VSX(&vi) != xr)
			return false;
		if (vregs[vi].lane != i + 1)
			return false;
	}
	// TODO: Optimize this case? It happens.
	// Reject if extra regs are packed in beyond the vector's n elements.
	for (int i = n; i < 4; ++i) {
		if (xregs[xr].mipsRegs[i] != -1) {
			return false;
		}
	}
	return true;
}

// Map a vector as a single SIMD reg, storing conflicting singles first if the
// initial attempt fails. Asserts if it still can't be done after that.
void FPURegCache::MapRegsVS(const u8 *r, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);

	_dbg_assert_msg_(jo_->enableVFPUSIMD, "Should not map simd regs when option is off.");

	if (!TryMapRegsVS(r, vsz, flags)) {
		// TODO: Could be more optimal.
		for (int i = 0; i < n; ++i) {
			StoreFromRegisterV(r[i]);
		}
		if (!TryMapRegsVS(r, vsz, flags)) {
			_dbg_assert_msg_(false, "MapRegsVS() failed on second try.");
		}
	}
}

// Check whether a vector CAN be mapped as SIMD without disturbing locked or
// already-SIMD-mapped registers. Does not modify any state.
bool FPURegCache::CanMapVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	if (!jo_->enableVFPUSIMD) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		return true;
	} else if (vregs[v[0]].lane != 0) {
		const MIPSCachedFPReg &v0 = vregs[v[0]];
		_dbg_assert_msg_(v0.away, "Must be away when lane != 0");
		_dbg_assert_msg_(v0.location.IsSimpleReg(), "Must be is register when lane != 0");

		// Already in a different simd set.
		return false;
	}

	if (vregs[v[0]].locked) {
		// If it's locked, we can't mess with it.
		return false;
	}

	// Next, fail if any of the other regs are in simd currently.
	// TODO: Only if locked? Not sure if it will be worth breaking them anyway.
	for (int i = 1; i < n; ++i) {
		if (vregs[v[i]].lane != 0) {
			return false;
		}
		// If it's locked, in simd or not, we can't use it.
		if (vregs[v[i]].locked) {
			return false;
		}
		_assert_msg_(!vregs[v[i]].location.IsImm(), "Cannot handle imms in fp cache.");
	}

	return true;
}

// Try to map a vector into one SIMD x86 reg. Returns false if CanMapVS says
// no; otherwise loads/assembles the elements and updates all bookkeeping.
bool FPURegCache::TryMapRegsVS(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);

	if (!CanMapVS(v, vsz)) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		// Already mapped then, perfect. Just mark dirty.
		if ((flags & MAP_DIRTY) != 0)
			xregs[VSX(v)].dirty = true;
		if ((flags & MAP_NOLOCK) == 0)
			SpillLockV(v, vsz);
		return true;
	}

	// At this point, some or all are in single regs or memory, and they're not locked there.

	if (n == 1) {
		// Single is easy, just map normally but track as a SIMD reg.
		// This way V/VS can warn about improper usage properly.
		MapRegV(v[0], flags);
		X64Reg vx = VX(v[0]);
		if (vx == INVALID_REG)
			return false;

		vregs[v[0]].lane = 1;
		if ((flags & MAP_DIRTY) != 0)
			xregs[vx].dirty = true;
		if ((flags & MAP_NOLOCK) == 0)
			SpillLockV(v, vsz);
		Invariant();
		return true;
	}

	X64Reg xr;
	if ((flags & MAP_NOINIT) != MAP_NOINIT) {
		xr = LoadRegsVS(v, n);
	} else {
		xr = GetFreeXReg();
	}

	// Victory, now let's clean up everything.
	OpArg newloc = Gen::R(xr);
	bool dirty = (flags & MAP_DIRTY) != 0;
	for (int i = 0; i < n; ++i) {
		MIPSCachedFPReg &vr = vregs[v[i]];
		if (vr.away) {
			// Clear the xreg it was in before.
			X64Reg oldXReg = vr.location.GetSimpleReg();
			if (oldXReg != xr) {
				xregs[oldXReg].mipsReg = -1;
			}
			if (xregs[oldXReg].dirty) {
				// Inherit the "dirtiness" (ultimately set below for all regs.)
				dirty = true;
				xregs[oldXReg].dirty = false;
			}
		}
		xregs[xr].mipsRegs[i] = v[i] + 32;
		vr.location = newloc;
		vr.lane = i + 1;
		vr.away = true;
	}
	xregs[xr].dirty = dirty;

	if ((flags & MAP_NOLOCK) == 0) {
		SpillLockV(v, vsz);
	}

	Invariant();
	return true;
}

// Gather n (2..4) vector elements into one SIMD reg, choosing between a
// single aligned/unaligned vector load (when the elements are sequential in
// memory) and assembling lanes with MOVSS/UNPCKLPS/SHUFPS. Returns the reg
// holding the result; bookkeeping is left to the caller (TryMapRegsVS).
X64Reg FPURegCache::LoadRegsVS(const u8 *v, int n) {
	int regsAvail = 0;
	int regsLoaded = 0;
	X64Reg xrs[4] = {INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG};
	bool xrsLoaded[4] = {false, false, false, false};

	_dbg_assert_msg_(n >= 2 && n <= 4, "LoadRegsVS is only implemented for simd loads.");

	// First pass: note which elements are already in x86 regs we can reuse.
	for (int i = 0; i < n; ++i) {
		const MIPSCachedFPReg &mr = vregs[v[i]];
		if (mr.away) {
			X64Reg mrx = mr.location.GetSimpleReg();
			// If it's not simd, or lanes 1+ are clear, we can use it.
			if (mr.lane == 0 || xregs[mrx].mipsRegs[1] == -1) {
				// Okay, there's nothing else in this reg, so we can use it.
				xrsLoaded[i] = true;
				xrs[i] = mrx;
				++regsLoaded;
				++regsAvail;
			} else if (mr.lane != 0) {
				_dbg_assert_msg_(false, "LoadRegsVS is not able to handle simd remapping yet, store first.");
			}
		}
	}

	if (regsAvail < n) {
		// Try to grab some without spilling.
		X64Reg xrFree[4];
		int obtained = GetFreeXRegs(xrFree, n - regsAvail, false);
		int pos = 0;
		for (int i = 0; i < n && pos < obtained; ++i) {
			if (xrs[i] == INVALID_REG) {
				// Okay, it's not loaded but we have a reg for this slot.
				xrs[i] = xrFree[pos++];
				++regsAvail;
			}
		}
	}

	// Let's also check if the memory addresses are sequential.
	int sequential = 1;
	for (int i = 1; i < n; ++i) {
		if (v[i] < 128 && v[i - 1] < 128) {
			if (voffset[v[i]] != voffset[v[i - 1]] + 1) {
				break;
			}
		} else if (v[i] >= 128 && v[i - 1] >= 128) {
			if (v[i] != v[i - 1] + 1) {
				break;
			}
		} else {
			// Temps can't be sequential with non-temps.
			break;
		}
		++sequential;
	}

	// Did we end up with enough regs?
	// TODO: Not handling the case of some regs avail and some loaded right now.
	if (regsAvail < n && (sequential != n || regsLoaded == n || regsAvail == 0)) {
		// Force-spill to get two regs; the assembly paths below need at most two.
		regsAvail = GetFreeXRegs(xrs, 2, true);
		_dbg_assert_msg_(regsAvail >= 2, "Ran out of fp regs for loading simd regs with.");
		_dbg_assert_msg_(xrs[0] != xrs[1], "Regs for simd load are the same, bad things await.");
		// We spilled, so we assume that all our regs are screwed up now anyway.
		for (int i = 0; i < 4; ++i) {
			xrsLoaded[i] = false;
		}
		for (int i = 2; i < n; ++i){
			xrs[i] = INVALID_REG;
		}
		regsLoaded = 0;
	}

	// If they're sequential, and we wouldn't need to store them all, use a single load.
	// But if they're already loaded, we'd have to store, not worth it.
	X64Reg res = INVALID_REG;
	if (sequential == n && regsLoaded < n) {
		// TODO: What should we do if some are in regs? Better to assemble?
		for (int i = 0; i < n; ++i) {
			StoreFromRegisterV(v[i]);
		}

		// Grab any available reg.
		for (int i = 0; i < n; ++i) {
			if (xrs[i] != INVALID_REG) {
				res = xrs[i];
				break;
			}
		}
		const float *f = v[0] < 128 ? &mips_->v[voffset[v[0]]] : &mips_->tempValues[v[0] - 128];
		if (((intptr_t)f & 0x7) == 0 && n == 2) {
			emit->MOVQ_xmm(res, vregs[v[0]].location);
		} else if (((intptr_t)f & 0xf) == 0) {
			// On modern processors, MOVUPS on aligned is fast, but maybe not on older ones.
			emit->MOVAPS(res, vregs[v[0]].location);
		} else {
			emit->MOVUPS(res, vregs[v[0]].location);
		}
	} else if (regsAvail >= n) {
		// Have enough regs, potentially all in regs.
		auto loadXR = [&](int l) {
			if (!xrsLoaded[l] && n >= l + 1) {
				emit->MOVSS(xrs[l], vregs[v[l]].location);
			}
		};
		// The order here is intentional.
		loadXR(3);
		loadXR(1);
		loadXR(2);
		loadXR(0);
		if (n == 4) {
			// This gives us [w, y] in the y reg.
			emit->UNPCKLPS(xrs[1], Gen::R(xrs[3]));
		}
		if (n >= 3) {
			// This gives us [z, x]. Then we combine with y.
			emit->UNPCKLPS(xrs[0], Gen::R(xrs[2]));
		}
		if (n >= 2) {
			emit->UNPCKLPS(xrs[0], Gen::R(xrs[1]));
		}
		res = xrs[0];
	} else {
		_dbg_assert_msg_(n > 2, "2 should not be possible here.");

		// Available regs are less than n, and some may be loaded.
		// Let's grab the most optimal unloaded ones.
		X64Reg xr1 = n == 3 ? xrs[1] : xrs[3];
		X64Reg xr2 = xrs[2];
		if (xr1 == INVALID_REG) {
			// Not one of the available ones. Grab another.
			for (int i = n - 1; i >= 0; --i) {
				if (xrs[i] != INVALID_REG && xrs[i] != xr2) {
					StoreFromRegisterV(v[i]);
					xr1 = xrs[i];
					break;
				}
			}
		}
		if (xr2 == INVALID_REG) {
			// Not one of the available ones. Grab another.
			for (int i = n - 1; i >= 0; --i) {
				if (xrs[i] != INVALID_REG && xrs[i] != xr1) {
					StoreFromRegisterV(v[i]);
					xr2 = xrs[i];
					break;
				}
			}
		}

		if (n == 3) {
			if (!vregs[v[2]].location.IsSimpleReg(xr2))
				emit->MOVSS(xr2, vregs[v[2]].location);
			if (!vregs[v[1]].location.IsSimpleReg(xr1))
				emit->MOVSS(xr1, vregs[v[1]].location);
			emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(3, 0, 0, 0));
			emit->MOVSS(xr2, vregs[v[0]].location);
			emit->MOVSS(xr1, Gen::R(xr2));
		} else if (n == 4) {
			if (!vregs[v[2]].location.IsSimpleReg(xr2))
				emit->MOVSS(xr2, vregs[v[2]].location);
			if (!vregs[v[3]].location.IsSimpleReg(xr1))
				emit->MOVSS(xr1, vregs[v[3]].location);
			emit->UNPCKLPS(xr2, Gen::R(xr1));
			emit->MOVSS(xr1, vregs[v[1]].location);
			emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(1, 0, 0, 3));
			emit->MOVSS(xr2, vregs[v[0]].location);
			emit->MOVSS(xr1, Gen::R(xr2));
		}
		res = xr1;
	}

	return res;
}

// Map a destination and a source vector as SIMD in one shot; vd is mapped
// dirty (or noinit when avoidLoad). Releases the spill locks either way.
bool FPURegCache::TryMapDirtyInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, bool avoidLoad) {
	// Don't waste time mapping if some will for sure fail.
	if (!CanMapVS(vd, vdsz) || !CanMapVS(vs, vssz)) {
		return false;
	}
	// But, they could still fail based on overlap. Hopefully not common...
	bool success = TryMapRegsVS(vs, vssz, 0);
	if (success) {
		success = TryMapRegsVS(vd, vdsz, avoidLoad ? MAP_NOINIT : MAP_DIRTY);
	}
	ReleaseSpillLockV(vs, vssz);
	ReleaseSpillLockV(vd, vdsz);

	_dbg_assert_msg_(!success || IsMappedVS(vd, vdsz), "vd should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vs, vssz), "vs should be mapped now");

	return success;
}

// Like TryMapDirtyInVS, but with two source vectors (vs and vt).
bool FPURegCache::TryMapDirtyInInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, const u8 *vt, VectorSize vtsz, bool avoidLoad) {
	// Don't waste time mapping if some will for sure fail.
	if (!CanMapVS(vd, vdsz) || !CanMapVS(vs, vssz) || !CanMapVS(vt, vtsz)) {
		return false;
	}

	// But, they could still fail based on overlap. Hopefully not common...
	bool success = TryMapRegsVS(vs, vssz, 0);
	if (success) {
		success = TryMapRegsVS(vt, vtsz, 0);
	}
	if (success) {
		success = TryMapRegsVS(vd, vdsz, avoidLoad ? MAP_NOINIT : MAP_DIRTY);
	}
	ReleaseSpillLockV(vd, vdsz);
	ReleaseSpillLockV(vs, vssz);
	ReleaseSpillLockV(vt, vtsz);

	_dbg_assert_msg_(!success || IsMappedVS(vd, vdsz), "vd should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vs, vssz), "vs should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vt, vtsz), "vt should be mapped now");

	return success;
}

// Demote each element of a vector from SIMD to "simple" (single-lane) state.
void FPURegCache::SimpleRegsV(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);
	// TODO: Could be more optimal (in case of Discard or etc.)
	for (int i = 0; i < n; ++i) {
		SimpleRegV(v[i], flags);
	}
}

// Matrix variant: demote all n*n elements (column-major stride of 4).
void FPURegCache::SimpleRegsV(const u8 *v, MatrixSize msz, int flags) {
	const int n = GetMatrixSide(msz);
	// TODO: Could be more optimal (in case of Discard or etc.)
	for (int i = 0; i < n; ++i) {
		for (int j = 0; j < n; ++j) {
			SimpleRegV(v[j * 4 + i], flags);
		}
	}
}

// Ensure a single VFPU reg is NOT in a multi-lane SIMD reg, storing or
// discarding as needed so per-lane code can treat it as a scalar.
void FPURegCache::SimpleRegV(const u8 v, int flags) {
	MIPSCachedFPReg &vr = vregs[v];
	// Special optimization: if it's in a single simd, we can keep it there.
	if (vr.lane == 1 && xregs[VSX(&v)].mipsRegs[1] == -1) {
		if (flags & MAP_DIRTY) {
			xregs[VSX(&v)].dirty = true;
		}
		// Just change the lane to 0.
		vr.lane = 0;
	} else if (vr.lane != 0) {
		// This will never end up in a register this way, so ignore dirty.
		if ((flags & MAP_NOINIT) == MAP_NOINIT) {
			// This will discard only this reg, and store the others.
			DiscardV(v);
		} else {
			StoreFromRegisterV(v);
		}
	} else if (vr.away) {
		// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
		if (flags & MAP_DIRTY) {
			xregs[VX(v)].dirty = true;
		}
		_assert_msg_(vr.location.IsSimpleReg(), "not loaded and not simple.");
	}
	Invariant();
}

// Clear the spill lock on one reg entirely.
void FPURegCache::ReleaseSpillLock(int mipsreg) {
	regs[mipsreg].locked = 0;
}

// Clear all spill locks and discard the JIT temp regs.
void FPURegCache::ReleaseSpillLocks() {
	for (int i = 0; i < NUM_MIPS_FPRS; i++)
		regs[i].locked = 0;
	for (int i = TEMP0; i < TEMP0 + NUM_X86_FPU_TEMPS; ++i)
		DiscardR(i);
}

// Map MIPS reg i into an x86 reg, optionally loading its current value and
// optionally marking it dirty. Flushes a SIMD mapping first if needed.
void FPURegCache::MapReg(const int i, bool doLoad, bool makeDirty) {
	pendingFlush = true;
	_assert_msg_(!regs[i].location.IsImm(), "WTF - FPURegCache::MapReg - imm");
	_assert_msg_(i >= 0 && i < NUM_MIPS_FPRS, "WTF - FPURegCache::MapReg - invalid mips reg %d", i);

	if (!regs[i].away) {
		// Reg is at home in the memory register file. Let's pull it out.
		X64Reg xr = GetFreeXReg();
		_assert_msg_(xr < NUM_X_FPREGS, "WTF - FPURegCache::MapReg - invalid reg %d", (int)xr);
		xregs[xr].mipsReg = i;
		xregs[xr].dirty = makeDirty;
		OpArg newloc = ::Gen::R(xr);
		if (doLoad) {
			emit->MOVSS(xr, regs[i].location);
		}
		regs[i].location = newloc;
		regs[i].lane = 0;
		regs[i].away = true;
	} else if (regs[i].lane != 0) {
		// Well, darn. This means we need to flush it.
		// TODO: This could be more optimal. Also check flags.
		StoreFromRegister(i);
		MapReg(i, doLoad, makeDirty);
	} else {
		// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
		xregs[RX(i)].dirty |= makeDirty;
		_assert_msg_(regs[i].location.IsSimpleReg(), "not loaded and not simple.");
	}
	Invariant();
}

// Build a SHUFPS immediate that swaps the given lane into lane 0 (and back,
// since the swap is symmetric), leaving other lanes rearranged accordingly.
static int MMShuffleSwapTo0(int lane) {
	if (lane == 0) {
		return _MM_SHUFFLE(3, 2, 1, 0);
	} else if (lane == 1) {
		return _MM_SHUFFLE(3, 2, 0, 1);
	} else if (lane == 2) {
		return _MM_SHUFFLE(3, 0, 1, 2);
	} else if (lane == 3) {
		return _MM_SHUFFLE(0, 2, 1, 3);
	} else {
		_assert_msg_(false, "MMShuffleSwapTo0: Invalid lane %d", lane);
		return 0;
	}
}

// Write reg i (and, for SIMD mappings, its lane-mates) back to memory and
// free the x86 register. Uses MOVAPS/MOVUPS/MOVQ multistores when the lane
// members are sequential in memory; otherwise rotates lanes out with SHUFPS.
void FPURegCache::StoreFromRegister(int i) {
	_assert_msg_(!regs[i].location.IsImm(), "WTF - FPURegCache::StoreFromRegister - it's an imm");
	_assert_msg_(i >= 0 && i < NUM_MIPS_FPRS, "WTF - FPURegCache::StoreFromRegister - invalid mipsreg %i PC=%08x", i, js_->compilerPC);

	if (regs[i].away) {
		X64Reg xr = regs[i].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "WTF - FPURegCache::StoreFromRegister - invalid reg: x %i (mr: %i). PC=%08x", (int)xr, i, js_->compilerPC);
		if (regs[i].lane != 0) {
			// Count how many lane members are sequential in memory (temps and
			// VFPU regs can't mix; VFPU adjacency goes through voffset[]).
			const int *mri = xregs[xr].mipsRegs;
			int seq = 1;
			for (int j = 1; j < 4; ++j) {
				if (mri[j] == -1) {
					break;
				}
				if (mri[j] - 32 >= 128 && mri[j - 1] - 32 >= 128 && mri[j] == mri[j - 1] + 1) {
					seq++;
				} else if (mri[j] - 32 < 128 && mri[j - 1] - 32 < 128 && voffset[mri[j] - 32] == voffset[mri[j - 1] - 32] + 1) {
					seq++;
				} else {
					break;
				}
			}

			const float *f = mri[0] - 32 < 128 ? &mips_->v[voffset[mri[0] - 32]] : &mips_->tempValues[mri[0] - 32 - 128];
			int align = (intptr_t)f & 0xf;

			// If we can do a multistore...
			if ((seq == 2 && (align & 0x7) == 0) || seq == 4) {
				OpArg newLoc = GetDefaultLocation(mri[0]);
				if (xregs[xr].dirty) {
					if (seq == 4 && align == 0)
						emit->MOVAPS(newLoc, xr);
					else if (seq == 4)
						emit->MOVUPS(newLoc, xr);
					else
						emit->MOVQ_xmm(newLoc, xr);
				}
				for (int j = 0; j < seq; ++j) {
					int mr = xregs[xr].mipsRegs[j];
					if (mr == -1) {
						continue;
					}
					OpArg newLoc = GetDefaultLocation(mr);
					regs[mr].location = newLoc;
					regs[mr].away = false;
					regs[mr].lane = 0;
					xregs[xr].mipsRegs[j] = -1;
				}
			} else {
				seq = 0;
			}
			// Store the rest one lane at a time, swapping each into lane 0 first.
			for (int j = seq; j < 4; ++j) {
				int mr = xregs[xr].mipsRegs[j];
				if (mr == -1) {
					continue;
				}
				if (j != 0 && xregs[xr].dirty) {
					emit->SHUFPS(xr, Gen::R(xr), MMShuffleSwapTo0(j));
				}
				OpArg newLoc = GetDefaultLocation(mr);
				if (xregs[xr].dirty) {
					emit->MOVSS(newLoc, xr);
				}
				regs[mr].location = newLoc;
				regs[mr].away = false;
				regs[mr].lane = 0;
				xregs[xr].mipsRegs[j] = -1;
			}
		} else {
			OpArg newLoc = GetDefaultLocation(i);
			xregs[xr].mipsReg = -1;
			if (xregs[xr].dirty) {
				emit->MOVSS(newLoc, xr);
			}
			regs[i].location = newLoc;
		}
		xregs[xr].dirty = false;
		regs[i].away = false;
	} else {
		//	_assert_msg_(false,"already stored");
	}
	Invariant();
}

// Drop reg i's cached value WITHOUT writing it back. For SIMD mappings, only
// this reg's lane is discarded; the other lanes are stored normally.
void FPURegCache::DiscardR(int i) {
	_assert_msg_(!regs[i].location.IsImm(), "FPU can't handle imm yet.");
	if (regs[i].away) {
		X64Reg xr = regs[i].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
		// Note that we DO NOT write it back here. That's the whole point of Discard.
		if (regs[i].lane != 0) {
			// But we can't just discard all of them in SIMD, just the one lane.
			// TODO: Potentially this could be more optimal (MOVQ or etc.)
			xregs[xr].mipsRegs[regs[i].lane - 1] = -1;
			regs[i].lane = 0;
			for (int j = 0; j < 4; ++j) {
				int mr = xregs[xr].mipsRegs[j];
				if (mr == -1) {
					continue;
				}
				if (j != 0 && xregs[xr].dirty) {
					emit->SHUFPS(xr, Gen::R(xr), MMShuffleSwapTo0(j));
				}

				OpArg newLoc = GetDefaultLocation(mr);
				if (xregs[xr].dirty) {
					emit->MOVSS(newLoc, xr);
				}
				regs[mr].location = newLoc;
				regs[mr].away = false;
				regs[mr].lane = 0;
				xregs[xr].mipsRegs[j] = -1;
			}
		} else {
			xregs[xr].mipsReg = -1;
		}
		xregs[xr].dirty = false;
		regs[i].location = GetDefaultLocation(i);
		regs[i].away = false;
		regs[i].tempLocked = false;
	} else {
		//	_assert_msg_(false,"already stored");
		regs[i].tempLocked = false;
	}
	Invariant();
}

// Discard an entire SIMD set by any member vreg — all lanes are dropped
// without any writeback (unlike DiscardR, which stores the other lanes).
void FPURegCache::DiscardVS(int vreg) {
	_assert_msg_(!vregs[vreg].location.IsImm(), "FPU can't handle imm yet.");

	if (vregs[vreg].away) {
		_assert_msg_(vregs[vreg].lane != 0, "VS expects a SIMD reg.");
		X64Reg xr = vregs[vreg].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
		// Note that we DO NOT write it back here. That's the whole point of Discard.
		for (int i = 0; i < 4; ++i) {
			int mr = xregs[xr].mipsRegs[i];
			if (mr != -1) {
				regs[mr].location = GetDefaultLocation(mr);
				regs[mr].away = false;
				regs[mr].tempLocked = false;
				regs[mr].lane = 0;
			}
			xregs[xr].mipsRegs[i] = -1;
		}
		xregs[xr].dirty = false;
	} else {
		vregs[vreg].tempLocked = false;
	}
	Invariant();
}

// True if this x86 reg currently holds one of the JIT temp regs.
bool FPURegCache::IsTempX(X64Reg xr) {
	return xregs[xr].mipsReg >= TEMP0;
}

// Reserve one free temp reg slot; asserts and returns -1 if none remain.
int FPURegCache::GetTempR() {
	pendingFlush = true;
	for (int r = TEMP0; r < TEMP0 + NUM_X86_FPU_TEMPS; ++r) {
		if (!regs[r].away && !regs[r].tempLocked) {
			regs[r].tempLocked = true;
			return r;
		}
	}

	_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
	return -1;
}

// Reserve n temp reg slots for a vector, preferring n consecutive slots
// (so they can later be stored/loaded as a sequence). Fills v with vreg
// indices (temp index minus 32). Returns 0 on success, -1 on failure.
int FPURegCache::GetTempVS(u8 *v, VectorSize vsz) {
	pendingFlush = true;
	const int n = GetNumVectorElements(vsz);

	// Let's collect regs as we go, but try for n free in a row.
	int found = 0;
	for (int r = TEMP0; r <= TEMP0 + NUM_X86_FPU_TEMPS - n; ++r) {
		if (regs[r].away || regs[r].tempLocked) {
			continue;
		}

		// How many free siblings does this have?
		int seq = 1;
		for (int i = 1; i < n; ++i) {
			if (regs[r + i].away || regs[r + i].tempLocked) {
				break;
			}
			++seq;
		}

		if (seq == n) {
			// Got 'em. Exactly as many as we need.
			for (int i = 0; i < n; ++i) {
				v[i] = r + i - 32;
			}
			found = n;
			break;
		}

		if (found < n) {
			v[found++] = r - 32;
		}
	}

	if (found != n) {
		_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
		return -1;
	}

	for (int i = 0; i < n; ++i) {
		regs[v[i] + 32].tempLocked = true;
	}

	return 0; // ??
}

// Write every dirty cached reg back to memory and free all x86 regs.
// No-op if nothing was mapped since the last flush (pendingFlush).
void FPURegCache::Flush() {
	if (!pendingFlush) {
		return;
	}
	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
		_assert_msg_(!regs[i].locked, "Somebody forgot to unlock MIPS reg %d.", i);
		if (regs[i].away) {
			if (regs[i].location.IsSimpleReg()) {
				X64Reg xr = RX(i);
				StoreFromRegister(i);
				xregs[xr].dirty = false;
			} else if (regs[i].location.IsImm()) {
				StoreFromRegister(i);
			} else {
				_assert_msg_(false, "Jit64 - Flush unhandled case, reg %i PC: %08x", i, mips_->pc);
			}
		}
	}
	pendingFlush = false;
	Invariant();
}

// Compute the in-memory home of a reg index: FPRs [0,32) via a small context
// displacement, VFPU regs via voffset[], temps at the end. Uses RIP-relative
// addressing when available (useRip_), which saves a register on x64.
OpArg FPURegCache::GetDefaultLocation(int reg) const {
	if (reg < 32) {
		// Smaller than RIP addressing since we can use a byte offset.
		return MDisp(CTXREG, reg * 4);
	} else if (reg < 32 + 128) {
		// Here, RIP has the advantage so let's use it when possible
		if (useRip_) {
			return M(&mips_->v[voffset[reg - 32]]);  // rip accessible
		} else {
			return MIPSSTATE_VAR_ELEM32(v[0], voffset[reg - 32]);
		}
	} else {
		if (useRip_) {
			return M(&mips_->tempValues[reg - 32 - 128]);  // rip accessible
		} else {
			return MIPSSTATE_VAR_ELEM32(tempValues[0], reg - 32 - 128);
		}
	}
}

// Debug hook: run SanityCheck after every state mutation when enabled.
void FPURegCache::Invariant() const {
#if 0
	_assert_msg_(SanityCheck() == 0, "Sanity check failed: %d", SanityCheck());
#endif
}

// Decode a VFPU matrix index from a cache reg index (-1 for non-VFPU regs).
static int GetMRMtx(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 2) & 7;
}

// Decode a VFPU row index from a cache reg index (-1 for non-VFPU regs).
static int GetMRRow(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 0) & 3;
}

// Decode a VFPU column index from a cache reg index (-1 for non-VFPU regs).
static int GetMRCol(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 5) & 3;
}

// True if the cache reg index refers to a JIT temp (past the VFPU range).
static bool IsMRTemp(int mr) {
	return mr >= 128 + 32;
}

// Cross-check regs[] against xregs[]; returns 0 when consistent, otherwise a
// small code identifying which invariant failed (useful in Invariant()).
int FPURegCache::SanityCheck() const {
	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
		const MIPSCachedFPReg &mr = regs[i];

		// FPR can never have imms.
		if (mr.location.IsImm())
			return 1;

		bool reallyAway = mr.location.IsSimpleReg();
		if (reallyAway != mr.away)
			return 2;

		if (mr.lane < 0 || mr.lane > 4)
			return 3;
		if (mr.lane != 0 && !reallyAway)
			return 4;

		if (mr.away) {
			Gen::X64Reg simple = mr.location.GetSimpleReg();
			if (mr.lane == 0) {
				if (xregs[simple].mipsReg != i)
					return 5;
				for (int j = 1; j < 4; ++j) {
					if (xregs[simple].mipsRegs[j] != -1)
						return 6;
				}
			} else {
				if (xregs[simple].mipsRegs[mr.lane - 1] != i)
					return 7;
			}
		}
	}

	for (int i = 0; i < NUM_X_FPREGS; ++i) {
		const X64CachedFPReg &xr = xregs[i];
		bool hasReg = xr.mipsReg != -1;
		if (!hasReg && xr.dirty)
			return 8;

		bool hasMoreRegs = hasReg;
		int mtx = -2;
		int row = -2;
		int col = -2;
		bool rowMatched = true;
		bool colMatched = true;
		for (int j = 0; j < 4; ++j) {
			if (xr.mipsRegs[j] == -1) {
				hasMoreRegs = false;
				continue;
			}
			if (xr.mipsRegs[j] >= NUM_MIPS_FPRS) {
				return 13;
			}
			// We can't have a hole in the middle / front.
			if (!hasMoreRegs)
				return 9;

			const MIPSCachedFPReg &mr = regs[xr.mipsRegs[j]];
			if (!mr.location.IsSimpleReg(X64Reg(i)))
				return 10;

			if (!IsMRTemp(xr.mipsRegs[j])) {
				if (mtx == -2)
					mtx = GetMRMtx(xr.mipsRegs[j]);
				else if (mtx != GetMRMtx(xr.mipsRegs[j]))
					return 11;

				if (row == -2)
					row = GetMRRow(xr.mipsRegs[j]);
				else if (row != GetMRRow(xr.mipsRegs[j]))
					rowMatched = false;

				if (col == -2)
					col = GetMRCol(xr.mipsRegs[j]);
				else if (col != GetMRCol(xr.mipsRegs[j]))
					colMatched = false;
			}
		}
		// A SIMD set must stay within one row OR one column of a matrix.
		if (!rowMatched && !colMatched) {
			return 12;
		}
	}

	return 0;
}

// Allocation order for x86 SSE regs. On x64, caller-saved high regs first;
// XMM0/XMM1 are reserved for scratch in both cases.
const int *FPURegCache::GetAllocationOrder(int &count) {
	static const int allocationOrder[] = {
#if PPSSPP_ARCH(AMD64)
		XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5
#elif PPSSPP_ARCH(X86)
		XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
#endif
	};
	count = sizeof(allocationOrder) / sizeof(int);
	return allocationOrder;
}

// Get a single free x86 reg, spilling if necessary; asserts on failure.
X64Reg FPURegCache::GetFreeXReg() {
	X64Reg res;
	int obtained = GetFreeXRegs(&res, 1);

	_assert_msg_(obtained == 1, "Regcache ran out of regs");
	return res;
}

// Collect up to n free x86 regs into res, optionally spilling unlocked
// mapped regs to make room. Unfilled slots are set to INVALID_REG; returns
// how many were actually obtained.
int FPURegCache::GetFreeXRegs(X64Reg *res, int n, bool spill) {
	pendingFlush = true;
	int aCount;
	const int *aOrder = GetAllocationOrder(aCount);

	_dbg_assert_msg_(n <= NUM_X_FPREGS - 2, "Cannot obtain that many regs.");

	int r = 0;

	for (int i = 0; i < aCount; i++) {
		X64Reg xr = (X64Reg)aOrder[i];
		if (xregs[xr].mipsReg == -1) {
			res[r++] = (X64Reg)xr;
			if (r >= n) {
				break;
			}
		}
	}

	if (r < n && spill) {
		// Okay, not found :(... Force grab one.
		// TODO - add a pass to grab xregs whose mipsreg is not used in the next 3 instructions.
		for (int i = 0; i < aCount; i++) {
			X64Reg xr = (X64Reg)aOrder[i];
			int preg = xregs[xr].mipsReg;
			_assert_msg_(preg >= -1 && preg < NUM_MIPS_FPRS, "WTF - FPURegCache::GetFreeXRegs - invalid mips reg %d in xr %d", preg, (int)xr);

			// We're only spilling here, so don't overlap.
			if (preg != -1 && !regs[preg].locked) {
				StoreFromRegister(preg);
				res[r++] = xr;
				if (r >= n) {
					break;
				}
			}
		}
	}

	for (int i = r; i < n; ++i) {
		res[i] = INVALID_REG;
	}
	return r;
}

// Store whatever MIPS reg occupies the given x86 reg back to memory.
void FPURegCache::FlushX(X64Reg reg) {
	if (reg >= NUM_X_FPREGS) {
		_assert_msg_(false, "Flushing non existent reg");
	} else if (xregs[reg].mipsReg != -1) {
		StoreFromRegister(xregs[reg].mipsReg);
	}
}

// Snapshot the full cache state (used around conditional code paths).
void FPURegCache::GetState(FPURegCacheState &state) const {
	memcpy(state.regs, regs, sizeof(regs));
	memcpy(state.xregs, xregs, sizeof(xregs));
}

// Restore a snapshot taken by GetState; forces a flush on next Flush().
void FPURegCache::RestoreState(const FPURegCacheState& state) {
	memcpy(regs, state.regs, sizeof(regs));
	memcpy(xregs, state.xregs, sizeof(xregs));
	pendingFlush = true;
}

#endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)