GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/X64IRCompVec.cpp

// Copyright (c) 2023- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <algorithm>
#include "Common/CPUDetect.h"
#include "Core/MemMap.h"
#include "Core/MIPS/x86/X64IRJit.h"
#include "Core/MIPS/x86/X64IRRegCache.h"

// This file contains compilation for vector instructions.
//
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non-working ones should have DISABLE. No flags because that's in IR already.

// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }
#define CONDITIONAL_DISABLE {}
#define DISABLE { CompIR_Generic(inst); return; }
#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }

namespace MIPSComp {

using namespace Gen;
using namespace X64IRJitConstants;

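// Returns true if the IR register ranges [r1, r1 + l1) and [r2, r2 + l2) intersect.
// Used below to bail out (DISABLE) when a destination would clobber a source that is
// still needed, e.g. in Vec4Scale and Vec4Dot.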
static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
	return r1 < r2 + l2 && r1 + l1 > r2;
}

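// Emits the Vec4Init lookup table (all zeros, all ones, all minus ones, and the four unit
// vectors) into the JIT's constant area as raw 32-bit words, so that CompIR_VecAssign below
// can load an entry with a single MOVAPS from memory (marked "rip accessible" at the use site).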
void X64JitBackend::EmitVecConstants() {
	static const float vec4InitData[8][4] = {
		{ 0.0f, 0.0f, 0.0f, 0.0f },
		{ 1.0f, 1.0f, 1.0f, 1.0f },
		{ -1.0f, -1.0f, -1.0f, -1.0f },
		{ 1.0f, 0.0f, 0.0f, 0.0f },
		{ 0.0f, 1.0f, 0.0f, 0.0f },
		{ 0.0f, 0.0f, 1.0f, 0.0f },
		{ 0.0f, 0.0f, 0.0f, 1.0f },
	};

	constants.vec4InitValues = (const Float4Constant *)GetCodePointer();
	for (size_t type = 0; type < ARRAY_SIZE(vec4InitData); ++type) {
		for (int i = 0; i < 4; ++i) {
			uint32_t val;
			memcpy(&val, &vec4InitData[type][i], sizeof(val));
			Write32(val);
		}
	}
}

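// Vector arithmetic. The common pattern below: if dest aliases one of the sources, operate
// in place; otherwise prefer the non-destructive three-operand AVX form, and fall back to a
// MOVAPS copy followed by the two-operand SSE op.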
void X64JitBackend::CompIR_VecArith(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Add:
		regs_.Map(inst);
		if (inst.dest == inst.src1) {
			ADDPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			ADDPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VADDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			ADDPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Sub:
		if (inst.dest == inst.src1) {
			regs_.Map(inst);
			SUBPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (cpu_info.bAVX) {
			regs_.Map(inst);
			VSUBPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			X64Reg tempReg = regs_.MapWithFPRTemp(inst);
			MOVAPS(tempReg, regs_.F(inst.src2));
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SUBPS(regs_.FX(inst.dest), R(tempReg));
		} else {
			regs_.Map(inst);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SUBPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Mul:
		regs_.Map(inst);
		if (inst.dest == inst.src1) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Div:
		if (inst.dest == inst.src1) {
			regs_.Map(inst);
			DIVPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (cpu_info.bAVX) {
			regs_.Map(inst);
			VDIVPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			X64Reg tempReg = regs_.MapWithFPRTemp(inst);
			MOVAPS(tempReg, regs_.F(inst.src2));
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			DIVPS(regs_.FX(inst.dest), R(tempReg));
		} else {
			regs_.Map(inst);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			DIVPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

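	// Vec4Scale multiplies a vec4 by a single scalar lane. The scalar in src2 is first
	// broadcast to all four lanes with SHUFPS (modifying src2 in place), which is why the
	// Overlap() check below bails out (DISABLE) when src2 aliases dest or src1.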
	case IROp::Vec4Scale:
		// TODO: Handle "aliasing" of sizes.
		if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1))
			DISABLE;

		regs_.Map(inst);
		SHUFPS(regs_.FX(inst.src2), regs_.F(inst.src2), 0);
		if (inst.dest == inst.src1) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

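	// Vec4Neg flips the sign bit of every lane by XORing with constants.signBitAll; Vec4Abs
	// below clears it by ANDing with constants.noSignMask. Both masks are emitted elsewhere
	// in this backend's constant area.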
	case IROp::Vec4Neg:
		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VXORPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.signBitAll));  // rip accessible
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			XORPS(regs_.FX(inst.dest), M(constants.signBitAll));  // rip accessible
		}
		break;

	case IROp::Vec4Abs:
		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VANDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.noSignMask));  // rip accessible
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			ANDPS(regs_.FX(inst.dest), M(constants.noSignMask));  // rip accessible
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

void X64JitBackend::CompIR_VecAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Init:
		regs_.Map(inst);
		if (inst.src1 == (int)Vec4Init::AllZERO) {
			XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
		} else {
			MOVAPS(regs_.FX(inst.dest), M(&constants.vec4InitValues[inst.src1]));  // rip accessible
		}
		break;

	case IROp::Vec4Shuffle:
		if (regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0) {
			// This is a broadcast. If dest == src1, this won't clear it.
			regs_.SpillLockFPR(inst.src1);
			regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
		} else {
			regs_.Map(inst);
		}
		if (cpu_info.bAVX) {
			VPERMILPS(128, regs_.FX(inst.dest), regs_.F(inst.src1), inst.src2);
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SHUFPS(regs_.FX(inst.dest), regs_.F(inst.dest), inst.src2);
		}
		break;

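	// Vec4Blend selects lanes from src1/src2 according to the bits of inst.constant (bit set =
	// take the lane from src2). BLENDPS pulls lanes from its second operand where an immediate
	// bit is set, so when dest aliases src2 the immediate is inverted and src1 becomes the
	// second operand instead.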
	case IROp::Vec4Blend:
		if (cpu_info.bAVX) {
			regs_.Map(inst);
			VBLENDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2), (uint8_t)inst.constant);
		} else if (cpu_info.bSSE4_1) {
			regs_.Map(inst);
			if (inst.dest == inst.src1) {
				BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src2), (uint8_t)inst.constant);
			} else if (inst.dest == inst.src2) {
				BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src1), (uint8_t)~inst.constant);
			} else {
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
				BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src2), (uint8_t)inst.constant);
			}
		} else {
			// Could use some shuffles...
			DISABLE;
		}
		break;

	case IROp::Vec4Mov:
		regs_.Map(inst);
		MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		break;

	default:
		INVALIDOP;
		break;
	}
}

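// Clamp ops have no dedicated x86 sequence here yet; they go through the generic IR fallback.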
void X64JitBackend::CompIR_VecClamp(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4ClampToZero:
	case IROp::Vec2ClampToZero:
		CompIR_Generic(inst);
		break;

	default:
		INVALIDOP;
		break;
	}
}

void X64JitBackend::CompIR_VecHoriz(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Dot:
	{
		// TODO: Handle "aliasing" of sizes. In theory it should be fine if not dirty...
		if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4))
			DISABLE;

		X64Reg tempReg = regs_.MapWithFPRTemp(inst);

		if (inst.dest == inst.src1) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}

		// This shuffle can be done in one op for SSE3/AVX, but it's not always faster.
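		// Horizontal sum of the four products: swap pairs and add to get (x+y, x+y, z+w, z+w)
		// in dest, then add the low lane of the upper half (z+w) into lane 0 so the dot product
		// ends up in the low lane.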
		MOVAPS(tempReg, regs_.F(inst.dest));
		SHUFPS(tempReg, regs_.F(inst.dest), VFPU_SWIZZLE(1, 0, 3, 2));
		ADDPS(regs_.FX(inst.dest), R(tempReg));
		MOVHLPS(tempReg, regs_.FX(inst.dest));
		ADDSS(regs_.FX(inst.dest), R(tempReg));
		break;
	}

	default:
		INVALIDOP;
		break;
	}
}

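// Pack/unpack conversions are not SIMD-optimized here yet; they all take the generic IR
// fallback path.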
void X64JitBackend::CompIR_VecPack(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec2Unpack16To31:
	case IROp::Vec4Pack32To8:
	case IROp::Vec2Pack31To16:
	case IROp::Vec4Unpack8To32:
	case IROp::Vec2Unpack16To32:
	case IROp::Vec4DuplicateUpperBitsAndShift1:
	case IROp::Vec4Pack31To8:
	case IROp::Vec2Pack32To16:
		CompIR_Generic(inst);
		break;

	default:
		INVALIDOP;
		break;
	}
}

} // namespace MIPSComp

#endif