CoCalc -- X64IRCompVec.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/X64IRCompVec.cpp
Views: 1401
1
// Copyright (c) 2023- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "ppsspp_config.h"
19
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
20

21
#include <algorithm>
22
#include "Common/CPUDetect.h"
23
#include "Core/MemMap.h"
24
#include "Core/MIPS/x86/X64IRJit.h"
25
#include "Core/MIPS/x86/X64IRRegCache.h"
26

27
// This file contains compilation for vector instructions.
28
//
29
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
30
// Currently known non-working ones should have DISABLE.  No flags because that's in IR already.
31

32
// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }
33
// Currently expands to an empty block, so no function in this file is disabled.
// Swap in the commented-out definition above to send every op through CompIR_Generic.
#define CONDITIONAL_DISABLE {}
34
// Hands the current instruction (`inst` in the calling scope) to CompIR_Generic
// and returns from the calling function.
#define DISABLE { CompIR_Generic(inst); return; }
35
// For ops that should never reach the calling function: asserts with the op number,
// then still hands the instruction to CompIR_Generic and returns from the caller.
#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }
36

37
namespace MIPSComp {
38

39
using namespace Gen;
40
using namespace X64IRJitConstants;
41

42
// Returns true when the register range [r1, r1 + l1) intersects [r2, r2 + l2).
// l1 and l2 are how many consecutive registers each operand spans.
static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
	const auto end1 = r1 + l1;
	const auto end2 = r2 + l2;
	// Two half-open ranges intersect when each one starts before the other ends.
	return r2 < end1 && r1 < end2;
}
45

46
void X64JitBackend::EmitVecConstants() {
47
	static const float vec4InitData[8][4] = {
48
		{ 0.0f, 0.0f, 0.0f, 0.0f },
49
		{ 1.0f, 1.0f, 1.0f, 1.0f },
50
		{ -1.0f, -1.0f, -1.0f, -1.0f },
51
		{ 1.0f, 0.0f, 0.0f, 0.0f },
52
		{ 0.0f, 1.0f, 0.0f, 0.0f },
53
		{ 0.0f, 0.0f, 1.0f, 0.0f },
54
		{ 0.0f, 0.0f, 0.0f, 1.0f },
55
	};
56

57
	constants.vec4InitValues = (const Float4Constant *)GetCodePointer();
58
	for (size_t type = 0; type < ARRAY_SIZE(vec4InitData); ++type) {
59
		for (int i = 0; i < 4; ++i) {
60
			uint32_t val;
61
			memcpy(&val, &vec4InitData[type][i], sizeof(val));
62
			Write32(val);
63
		}
64
	}
65
}
66

67
// Compiles the Vec4 arithmetic IR ops (Add, Sub, Mul, Div, Scale, Neg, Abs).
// Uses the three-operand AVX encodings when cpu_info.bAVX is set, otherwise
// two-operand SSE with an extra MOVAPS where dest doesn't already hold a source.
void X64JitBackend::CompIR_VecArith(IRInst inst) {
	CONDITIONAL_DISABLE;

	// The SSE forms overwrite their first operand, so most cases depend on
	// whether dest is already the same register as one of the sources.
	const bool destIsSrc1 = inst.dest == inst.src1;
	const bool destIsSrc2 = inst.dest == inst.src2;

	switch (inst.op) {
	case IROp::Vec4Add:
		regs_.Map(inst);
		if (destIsSrc1 || destIsSrc2) {
			// Commutative: accumulate whichever source isn't dest already.
			const auto other = destIsSrc1 ? inst.src2 : inst.src1;
			ADDPS(regs_.FX(inst.dest), regs_.F(other));
		} else if (cpu_info.bAVX) {
			VADDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			ADDPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Sub:
		if (!destIsSrc1 && !cpu_info.bAVX && destIsSrc2) {
			// Not commutative, and copying src1 into dest would destroy src2
			// (same register), so park src2 in a temp before the copy.
			const X64Reg subtrahend = regs_.MapWithFPRTemp(inst);
			MOVAPS(subtrahend, regs_.F(inst.src2));
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SUBPS(regs_.FX(inst.dest), R(subtrahend));
			break;
		}

		regs_.Map(inst);
		if (destIsSrc1) {
			SUBPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (cpu_info.bAVX) {
			VSUBPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SUBPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Mul:
		regs_.Map(inst);
		if (destIsSrc1 || destIsSrc2) {
			// Commutative: multiply by whichever source isn't dest already.
			const auto other = destIsSrc1 ? inst.src2 : inst.src1;
			MULPS(regs_.FX(inst.dest), regs_.F(other));
		} else if (cpu_info.bAVX) {
			VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Div:
		if (!destIsSrc1 && !cpu_info.bAVX && destIsSrc2) {
			// Same situation as Vec4Sub: keep the divisor alive in a temp
			// while dest is overwritten with the dividend.
			const X64Reg divisor = regs_.MapWithFPRTemp(inst);
			MOVAPS(divisor, regs_.F(inst.src2));
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			DIVPS(regs_.FX(inst.dest), R(divisor));
			break;
		}

		regs_.Map(inst);
		if (destIsSrc1) {
			DIVPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (cpu_info.bAVX) {
			VDIVPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			DIVPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Scale:
		// TODO: Handle "aliasing" of sizes.
		// Bail out to the generic path when the scalar src2 lies inside
		// either 4-register operand.
		if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1))
			DISABLE;

		regs_.Map(inst);
		// Shuffle immediate 0 replicates lane 0 of src2 across its register,
		// in place, so the multiply below can be a plain lane-wise MULPS.
		SHUFPS(regs_.FX(inst.src2), regs_.F(inst.src2), 0);
		if (destIsSrc1 || destIsSrc2) {
			const auto other = destIsSrc1 ? inst.src2 : inst.src1;
			MULPS(regs_.FX(inst.dest), regs_.F(other));
		} else if (cpu_info.bAVX) {
			VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Neg:
		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VXORPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.signBitAll));  // rip accessible
			break;
		}
		// SSE: get src1 into dest first (if it isn't already), then XOR with the sign-bit constant.
		if (!destIsSrc1)
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		XORPS(regs_.FX(inst.dest), M(constants.signBitAll));  // rip accessible
		break;

	case IROp::Vec4Abs:
		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VANDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.noSignMask));  // rip accessible
			break;
		}
		// SSE: get src1 into dest first (if it isn't already), then AND with the no-sign mask.
		if (!destIsSrc1)
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		ANDPS(regs_.FX(inst.dest), M(constants.noSignMask));  // rip accessible
		break;

	default:
		INVALIDOP;
		break;
	}
}
183

184
// Compiles the Vec4 assignment-style IR ops: Init (load a constant vector),
// Shuffle (permute lanes of src1), Blend (per-lane select between src1 and
// src2) and Mov (copy).
void X64JitBackend::CompIR_VecAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Init:
		regs_.Map(inst);
		if (inst.src1 != (int)Vec4Init::AllZERO) {
			// src1 picks a row of the table written by EmitVecConstants().
			MOVAPS(regs_.FX(inst.dest), M(&constants.vec4InitValues[inst.src1]));  // rip accessible
		} else {
			// All-zero is produced by XORing the register with itself; no load needed.
			XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
		}
		break;

	case IROp::Vec4Shuffle:
	{
		const bool broadcastsScalar = regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0;
		if (!broadcastsScalar) {
			regs_.Map(inst);
		} else {
			// This is a broadcast.  If dest == src1, this won't clear it.
			regs_.SpillLockFPR(inst.src1);
			regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
		}

		if (cpu_info.bAVX) {
			VPERMILPS(128, regs_.FX(inst.dest), regs_.F(inst.src1), inst.src2);
			break;
		}
		// SSE: SHUFPS permutes dest in place, so src1 has to be copied in first.
		if (inst.dest != inst.src1)
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		SHUFPS(regs_.FX(inst.dest), regs_.F(inst.dest), inst.src2);
		break;
	}

	case IROp::Vec4Blend:
		if (!cpu_info.bAVX && !cpu_info.bSSE4_1) {
			// Could use some shuffles...
			DISABLE;
		}

		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VBLENDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2), (uint8_t)inst.constant);
		} else if (inst.dest == inst.src1) {
			BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src2), (uint8_t)inst.constant);
		} else if (inst.dest == inst.src2) {
			// dest already holds src2, so blend src1 in with the inverted mask.
			BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src1), (uint8_t)~inst.constant);
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src2), (uint8_t)inst.constant);
		}
		break;

	case IROp::Vec4Mov:
		regs_.Map(inst);
		MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		break;

	default:
		INVALIDOP;
		break;
	}
}
244

245
// Handles the clamp-to-zero IR ops.  Neither has a native code path here;
// both are forwarded to CompIR_Generic.  Any other op is reported via INVALIDOP.
void X64JitBackend::CompIR_VecClamp(IRInst inst) {
	CONDITIONAL_DISABLE;

	const bool isClampToZero = inst.op == IROp::Vec2ClampToZero || inst.op == IROp::Vec4ClampToZero;
	if (!isClampToZero)
		INVALIDOP;

	CompIR_Generic(inst);
}
259

260
// Compiles horizontal vector IR ops.  Only Vec4Dot is handled: a lane-wise
// multiply of src1 and src2 followed by a horizontal add that leaves the sum
// in lane 0 of dest.  Any other op is reported via INVALIDOP.
void X64JitBackend::CompIR_VecHoriz(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Dot:
	{
		// TODO: Handle "aliasing" of sizes.  In theory it should be fine if not dirty...
		// Fall back to the generic path when the single dest register lies
		// inside either 4-register source.
		if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4))
			DISABLE;

		X64Reg tempReg = regs_.MapWithFPRTemp(inst);

		// Step 1: lane-wise product into dest.
		// Note: with the overlap bail-out above, dest can't equal src1 or src2
		// here, so the first two branches only matter if that check is relaxed.
		if (inst.dest == inst.src1) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			// Baseline SSE fallback.  This must be unconditional: MOVAPS/MULPS
			// don't need SSE4.1, and skipping them would leave dest without the
			// products that the horizontal add below sums up.
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}

		// Step 2: horizontal add of the four products.
		// This shuffle can be done in one op for SSE3/AVX, but it's not always faster.
		MOVAPS(tempReg, regs_.F(inst.dest));
		// Presumably VFPU_SWIZZLE(1, 0, 3, 2) swaps adjacent lanes, so after
		// the ADDPS each pair of lanes holds its own pair sum.
		SHUFPS(tempReg, regs_.F(inst.dest), VFPU_SWIZZLE(1, 0, 3, 2));
		ADDPS(regs_.FX(inst.dest), R(tempReg));
		// Bring the upper pair's sum down to lane 0 of temp and add it to lane 0 of dest.
		MOVHLPS(tempReg, regs_.FX(inst.dest));
		ADDSS(regs_.FX(inst.dest), R(tempReg));
		break;
	}

	default:
		INVALIDOP;
		break;
	}
}
297

298
// Handles the vector pack/unpack IR ops.  None has a native code path here;
// all are forwarded to CompIR_Generic.  Any other op is reported via INVALIDOP.
void X64JitBackend::CompIR_VecPack(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	// Unpack-style ops.
	case IROp::Vec2Unpack16To31:
	case IROp::Vec2Unpack16To32:
	case IROp::Vec4Unpack8To32:
	case IROp::Vec4DuplicateUpperBitsAndShift1:
	// Pack-style ops.
	case IROp::Vec2Pack31To16:
	case IROp::Vec2Pack32To16:
	case IROp::Vec4Pack31To8:
	case IROp::Vec4Pack32To8:
		CompIR_Generic(inst);
		return;

	default:
		INVALIDOP;
		break;
	}
}
318

319
} // namespace MIPSComp
320

321
#endif
322

323
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company