CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM/ArmCompVFPUNEONUtil.cpp
Views: 1401
// Copyright (c) 2013- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617// NEON VFPU18// This is where we will create an alternate implementation of the VFPU emulation19// that uses NEON Q registers to cache pairs/tris/quads, and so on.20// Will require major extensions to the reg cache and other things.2122// ARM NEON can only do pairs and quads, not tris and scalars.23// We can do scalars, though, for many operations if all the operands24// are below Q8 (D16, S32) using regular VFP instructions but really not sure25// if it's worth it.2627#include "ppsspp_config.h"28#if PPSSPP_ARCH(ARM)2930#include <cmath>3132#include "Common/Math/math_util.h"3334#include "Common/CPUDetect.h"35#include "Core/MemMap.h"36#include "Core/MIPS/MIPS.h"37#include "Core/MIPS/MIPSAnalyst.h"38#include "Core/MIPS/MIPSCodeUtils.h"39#include "Core/MIPS/MIPSVFPUUtils.h"40#include "Core/Config.h"41#include "Core/Reporting.h"4243#include "Core/MIPS/ARM/ArmJit.h"44#include "Core/MIPS/ARM/ArmRegCache.h"45#include "Core/MIPS/ARM/ArmCompVFPUNEONUtil.h"4647// TODO: Somehow #ifdef away on ARMv5eabi, without breaking the linker.4849#define _RS MIPS_GET_RS(op)50#define _RT MIPS_GET_RT(op)51#define _RD MIPS_GET_RD(op)52#define _FS MIPS_GET_FS(op)53#define _FT MIPS_GET_FT(op)54#define _FD MIPS_GET_FD(op)55#define _SA 
MIPS_GET_SA(op)56#define _POS ((op>> 6) & 0x1F)57#define _SIZE ((op>>11) & 0x1F)58#define _IMM16 (signed short)(op & 0xFFFF)59#define _IMM26 (op & 0x03FFFFFF)6061namespace MIPSComp {6263using namespace ArmGen;64using namespace ArmJitConstants;6566static const float minus_one = -1.0f;67static const float one = 1.0f;68static const float zero = 0.0f;6970// On NEON, we map triples to Q registers and singles to D registers.71// Sometimes, as when doing dot products, it matters what's in that unused reg. This zeroes it.72void ArmJit::NEONMaskToSize(ARMReg vs, VectorSize sz) {73// TODO74}7576ARMReg ArmJit::NEONMapPrefixST(int mipsReg, VectorSize sz, u32 prefix, int mapFlags) {77static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };78static const float constantArrayNegated[8] = { -0.f, -1.f, -2.f, -0.5f, -3.f, -1.f / 3.f, -0.25f, -1.f / 6.f };7980// Applying prefixes in SIMD fashion will actually be a lot easier than the old style.81if (prefix == 0xE4) {82return fpr.QMapReg(mipsReg, sz, mapFlags);83}8485int n = GetNumVectorElements(sz);8687int regnum[4] = { -1, -1, -1, -1 };88int abs[4] = { 0 };89int negate[4] = { 0 };90int constants[4] = { 0 };91int constNum[4] = { 0 };9293int full_mask = (1 << n) - 1;9495int abs_mask = (prefix >> 8) & full_mask;96int negate_mask = (prefix >> 16) & full_mask;97int constants_mask = (prefix >> 12) & full_mask;9899// Decode prefix to keep the rest readable100int permuteMask = 0;101for (int i = 0; i < n; i++) {102permuteMask |= 3 << (i * 2);103regnum[i] = (prefix >> (i * 2)) & 3;104abs[i] = (prefix >> (8 + i)) & 1;105negate[i] = (prefix >> (16 + i)) & 1;106constants[i] = (prefix >> (12 + i)) & 1;107108if (constants[i]) {109constNum[i] = regnum[i] + (abs[i] << 2);110abs[i] = 0;111}112}113abs_mask &= ~constants_mask;114115bool anyPermute = (prefix & permuteMask) != (0xE4 & permuteMask);116117if (constants_mask == full_mask) {118// It's all constants! 
Don't even bother mapping the input register,119// just allocate a temp one.120// If a single, this can sometimes be done cheaper. But meh.121ARMReg ar = fpr.QAllocTemp(sz);122for (int i = 0; i < n; i++) {123if ((i & 1) == 0) {124if (constNum[i] == constNum[i + 1]) {125// Replace two loads with a single immediate when easily possible.126ARMReg dest = i & 2 ? D_1(ar) : D_0(ar);127switch (constNum[i]) {128case 0:129case 1:130{131float c = constantArray[constNum[i]];132VMOV_immf(dest, negate[i] ? -c : c);133}134break;135// TODO: There are a few more that are doable.136default:137goto skip;138}139140i++;141continue;142skip:143;144}145}146MOVP2R(R0, (negate[i] ? constantArrayNegated : constantArray) + constNum[i]);147VLD1_lane(F_32, ar, R0, i, true);148}149return ar;150}151152// 1. Permute.153// 2. Abs154// If any constants:155// 3. Replace values with constants156// 4. Negate157158ARMReg inputAR = fpr.QMapReg(mipsReg, sz, mapFlags);159ARMReg ar = fpr.QAllocTemp(sz);160161if (!anyPermute) {162VMOV(ar, inputAR);163// No permutations!164} else {165bool allSame = false;166for (int i = 1; i < n; i++) {167if (regnum[0] == regnum[i])168allSame = true;169}170171if (allSame) {172// Easy, someone is duplicating one value onto all the reg parts.173// If this is happening and QMapReg must load, we can combine these two actions174// into a VLD1_lane. TODO175VDUP(F_32, ar, inputAR, regnum[0]);176} else {177// Do some special cases178if (regnum[0] == 1 && regnum[1] == 0) {179INFO_LOG(Log::HLE, "PREFIXST: Bottom swap!");180VREV64(I_32, ar, inputAR);181regnum[0] = 0;182regnum[1] = 1;183}184185// TODO: Make a generic fallback using another temp register186187bool match = true;188for (int i = 0; i < n; i++) {189if (regnum[i] != i)190match = false;191}192193// TODO: Cannot do this permutation yet!194if (!match) {195ERROR_LOG(Log::HLE, "PREFIXST: Unsupported permute! 
%i %i %i %i / %i", regnum[0], regnum[1], regnum[2], regnum[3], n);196VMOV(ar, inputAR);197}198}199}200201// ABS202// Two methods: If all lanes are "absoluted", it's easy.203if (abs_mask == full_mask) {204// TODO: elide the above VMOV (in !anyPermute) when possible205VABS(F_32, ar, ar);206} else if (abs_mask != 0) {207// Partial ABS!208if (abs_mask == 3) {209VABS(F_32, D_0(ar), D_0(ar));210} else {211// Horrifying fallback: Mov to Q0, abs, move back.212// TODO: Optimize for lower quads where we don't need to move.213VMOV(MatchSize(Q0, ar), ar);214for (int i = 0; i < n; i++) {215if (abs_mask & (1 << i)) {216VABS((ARMReg)(S0 + i), (ARMReg)(S0 + i));217}218}219VMOV(ar, MatchSize(Q0, ar));220INFO_LOG(Log::HLE, "PREFIXST: Partial ABS %i/%i! Slow fallback generated.", abs_mask, full_mask);221}222}223224if (negate_mask == full_mask) {225// TODO: elide the above VMOV when possible226VNEG(F_32, ar, ar);227} else if (negate_mask != 0) {228// Partial negate! I guess we build sign bits in another register229// and simply XOR.230if (negate_mask == 3) {231VNEG(F_32, D_0(ar), D_0(ar));232} else {233// Horrifying fallback: Mov to Q0, negate, move back.234// TODO: Optimize for lower quads where we don't need to move.235VMOV(MatchSize(Q0, ar), ar);236for (int i = 0; i < n; i++) {237if (negate_mask & (1 << i)) {238VNEG((ARMReg)(S0 + i), (ARMReg)(S0 + i));239}240}241VMOV(ar, MatchSize(Q0, ar));242INFO_LOG(Log::HLE, "PREFIXST: Partial Negate %i/%i! Slow fallback generated.", negate_mask, full_mask);243}244}245246// Insert constants where requested, and check negate!247for (int i = 0; i < n; i++) {248if (constants[i]) {249MOVP2R(R0, (negate[i] ? 
constantArrayNegated : constantArray) + constNum[i]);250VLD1_lane(F_32, ar, R0, i, true);251}252}253254return ar;255}256257ArmJit::DestARMReg ArmJit::NEONMapPrefixD(int vreg, VectorSize sz, int mapFlags) {258// Inverted from the actual bits, easier to reason about 1 == write259int writeMask = (~(js.prefixD >> 8)) & 0xF;260int n = GetNumVectorElements(sz);261int full_mask = (1 << n) - 1;262263DestARMReg dest;264dest.sz = sz;265if ((writeMask & full_mask) == full_mask) {266// No need to apply a write mask.267// Let's not make things complicated.268dest.rd = fpr.QMapReg(vreg, sz, mapFlags);269dest.backingRd = dest.rd;270} else {271// Allocate a temporary register.272ERROR_LOG(Log::JIT, "PREFIXD: Write mask allocated! %i/%i", writeMask, full_mask);273dest.rd = fpr.QAllocTemp(sz);274dest.backingRd = fpr.QMapReg(vreg, sz, mapFlags & ~MAP_NOINIT); // Force initialization of the backing reg.275}276return dest;277}278279void ArmJit::NEONApplyPrefixD(DestARMReg dest) {280// Apply clamps to dest.rd281int n = GetNumVectorElements(dest.sz);282283int sat1_mask = 0;284int sat3_mask = 0;285int full_mask = (1 << n) - 1;286for (int i = 0; i < n; i++) {287int sat = (js.prefixD >> (i * 2)) & 3;288if (sat == 1)289sat1_mask |= 1 << i;290if (sat == 3)291sat3_mask |= 1 << i;292}293294if (sat1_mask && sat3_mask) {295// Why would anyone do this?296ERROR_LOG(Log::JIT, "PREFIXD: Can't have both sat[0-1] and sat[-1-1] at the same time yet");297}298299if (sat1_mask) {300if (sat1_mask != full_mask) {301ERROR_LOG(Log::JIT, "PREFIXD: Can't have partial sat1 mask yet (%i vs %i)", sat1_mask, full_mask);302}303if (IsD(dest.rd)) {304VMOV_immf(D0, 0.0);305VMOV_immf(D1, 1.0);306VMAX(F_32, dest.rd, dest.rd, D0);307VMIN(F_32, dest.rd, dest.rd, D1);308} else {309VMOV_immf(Q0, 1.0);310VMIN(F_32, dest.rd, dest.rd, Q0);311VMOV_immf(Q0, 0.0);312VMAX(F_32, dest.rd, dest.rd, Q0);313}314}315316if (sat3_mask && sat1_mask != full_mask) {317if (sat3_mask != full_mask) {318ERROR_LOG(Log::JIT, "PREFIXD: Can't have 
partial sat3 mask yet (%i vs %i)", sat3_mask, full_mask);319}320if (IsD(dest.rd)) {321VMOV_immf(D0, 0.0);322VMOV_immf(D1, 1.0);323VMAX(F_32, dest.rd, dest.rd, D0);324VMIN(F_32, dest.rd, dest.rd, D1);325} else {326VMOV_immf(Q0, 1.0);327VMIN(F_32, dest.rd, dest.rd, Q0);328VMOV_immf(Q0, -1.0);329VMAX(F_32, dest.rd, dest.rd, Q0);330}331}332333// Check for actual mask operation (unrelated to the "masks" above).334if (dest.backingRd != dest.rd) {335// This means that we need to apply the write mask, from rd to backingRd.336// What a pain. We can at least shortcut easy cases like half the register.337// And we can generate the masks easily with some of the crazy vector imm modes. (bits2bytes for example).338// So no need to load them from RAM.339int writeMask = (~(js.prefixD >> 8)) & 0xF;340341if (writeMask == 3) {342INFO_LOG(Log::JIT, "Doing writemask = 3");343VMOV(D_0(dest.rd), D_0(dest.backingRd));344} else {345// TODO346ERROR_LOG(Log::JIT, "PREFIXD: Arbitrary write masks not supported (%i / %i)", writeMask, full_mask);347VMOV(dest.backingRd, dest.rd);348}349}350}351352ArmJit::MappedRegs ArmJit::NEONMapDirtyInIn(MIPSOpcode op, VectorSize dsize, VectorSize ssize, VectorSize tsize, bool applyPrefixes) {353MappedRegs regs;354if (applyPrefixes) {355regs.vs = NEONMapPrefixS(_VS, ssize, 0);356regs.vt = NEONMapPrefixT(_VT, tsize, 0);357} else {358regs.vs = fpr.QMapReg(_VS, ssize, 0);359regs.vt = fpr.QMapReg(_VT, ssize, 0);360}361362regs.overlap = GetVectorOverlap(_VD, dsize, _VS, ssize) > 0 || GetVectorOverlap(_VD, dsize, _VT, ssize);363if (applyPrefixes) {364regs.vd = NEONMapPrefixD(_VD, dsize, MAP_DIRTY | (regs.overlap ? 0 : MAP_NOINIT));365} else {366regs.vd.rd = fpr.QMapReg(_VD, dsize, MAP_DIRTY | (regs.overlap ? 
0 : MAP_NOINIT));367regs.vd.backingRd = regs.vd.rd;368regs.vd.sz = dsize;369}370return regs;371}372373ArmJit::MappedRegs ArmJit::NEONMapInIn(MIPSOpcode op, VectorSize ssize, VectorSize tsize, bool applyPrefixes) {374MappedRegs regs;375if (applyPrefixes) {376regs.vs = NEONMapPrefixS(_VS, ssize, 0);377regs.vt = NEONMapPrefixT(_VT, tsize, 0);378} else {379regs.vs = fpr.QMapReg(_VS, ssize, 0);380regs.vt = fpr.QMapReg(_VT, ssize, 0);381}382regs.vd.rd = INVALID_REG;383regs.vd.sz = V_Invalid;384return regs;385}386387ArmJit::MappedRegs ArmJit::NEONMapDirtyIn(MIPSOpcode op, VectorSize dsize, VectorSize ssize, bool applyPrefixes) {388MappedRegs regs;389regs.vs = NEONMapPrefixS(_VS, ssize, 0);390regs.overlap = GetVectorOverlap(_VD, dsize, _VS, ssize) > 0;391regs.vd = NEONMapPrefixD(_VD, dsize, MAP_DIRTY | (regs.overlap ? 0 : MAP_NOINIT));392return regs;393}394395// Requires quad registers.396void ArmJit::NEONTranspose4x4(ARMReg cols[4]) {397// 0123 _\ 0426398// 4567 / 1537399VTRN(F_32, cols[0], cols[1]);400401// 89ab _\ 8cae402// cdef / 9dbf403VTRN(F_32, cols[2], cols[3]);404405// 04[26] 048c406// 15 37 -> 1537407// [8c]ae 26ae408// 9d bf 9dbf409VSWP(D_1(cols[0]), D_0(cols[2]));410411// 04 8c 048c412// 15[37] -> 159d413// 26 ae 26ae414// [9d]bf 37bf415VSWP(D_1(cols[1]), D_0(cols[3]));416}417418} // namespace MIPSComp419420#endif // PPSSPP_ARCH(ARM)421422423