CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/RiscV/RiscVCompVec.cpp
Views: 1401
1
// Copyright (c) 2023- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include <algorithm>
19
#include "Core/MemMap.h"
20
#include "Core/MIPS/RiscV/RiscVJit.h"
21
#include "Core/MIPS/RiscV/RiscVRegCache.h"
22
23
// This file contains compilation for vector instructions.
//
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE. No flags because that's in IR already.

// CONDITIONAL_DISABLE is a no-op normally; swap in the commented-out definition
// below to route every op in this file through the generic fallback when
// bisecting JIT bugs.
// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }
#define CONDITIONAL_DISABLE {}
// DISABLE: bail out of the native implementation and use the generic path.
#define DISABLE { CompIR_Generic(inst); return; }
// INVALIDOP: assert in debug, then fall back to generic handling in release.
#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }
namespace MIPSComp {
34
35
using namespace RiscVGen;
36
using namespace RiscVJitConstants;
37
38
// Returns true when the register ranges [r1, r1 + l1) and [r2, r2 + l2)
// share at least one register.
static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
	// Half-open intervals intersect unless one ends at or before the other begins.
	return !(r1 + l1 <= r2 || r2 + l2 <= r1);
}
// Compiles IR vector assignment ops (Vec4Init/Shuffle/Blend/Mov).
// Vec4 values live in four consecutive single-precision FPRs, one per lane.
void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Init:
		regs_.Map(inst);

		// TODO: Check if FCVT/FMV/FL is better.
		switch ((Vec4Init)inst.src1) {
		case Vec4Init::AllZERO:
			// Converting the zero GPR yields +0.0f in every lane.
			for (int i = 0; i < 4; ++i)
				FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
			break;

		case Vec4Init::AllONE:
			// Use an FP load-immediate when the CPU supports one for this
			// constant; otherwise materialize 1.0f once and copy it to the rest.
			if (CanFLI(32, 1.0f)) {
				for (int i = 0; i < 4; ++i)
					FLI(32, regs_.F(inst.dest + i), 1.0f);
			} else {
				LI(SCRATCH1, 1.0f);
				FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
				for (int i = 1; i < 4; ++i)
					FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));
			}
			break;

		case Vec4Init::AllMinusONE:
			// Same strategy as AllONE, with -1.0f.
			if (CanFLI(32, -1.0f)) {
				for (int i = 0; i < 4; ++i)
					FLI(32, regs_.F(inst.dest + i), -1.0f);
			} else {
				LI(SCRATCH1, -1.0f);
				FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
				for (int i = 1; i < 4; ++i)
					FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));
			}
			break;

		case Vec4Init::Set_1000:
			// One lane gets 1.0f, the others 0.0f.  The Set_0100/Set_0010/
			// Set_0001 cases below are identical except for which lane is set.
			if (!CanFLI(32, 1.0f))
				LI(SCRATCH1, 1.0f);
			for (int i = 0; i < 4; ++i) {
				if (i == 0) {
					if (CanFLI(32, 1.0f))
						FLI(32, regs_.F(inst.dest + i), 1.0f);
					else
						FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
				} else {
					FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
				}
			}
			break;

		case Vec4Init::Set_0100:
			if (!CanFLI(32, 1.0f))
				LI(SCRATCH1, 1.0f);
			for (int i = 0; i < 4; ++i) {
				if (i == 1) {
					if (CanFLI(32, 1.0f))
						FLI(32, regs_.F(inst.dest + i), 1.0f);
					else
						FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
				} else {
					FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
				}
			}
			break;

		case Vec4Init::Set_0010:
			if (!CanFLI(32, 1.0f))
				LI(SCRATCH1, 1.0f);
			for (int i = 0; i < 4; ++i) {
				if (i == 2) {
					if (CanFLI(32, 1.0f))
						FLI(32, regs_.F(inst.dest + i), 1.0f);
					else
						FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
				} else {
					FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
				}
			}
			break;

		case Vec4Init::Set_0001:
			if (!CanFLI(32, 1.0f))
				LI(SCRATCH1, 1.0f);
			for (int i = 0; i < 4; ++i) {
				if (i == 3) {
					if (CanFLI(32, 1.0f))
						FLI(32, regs_.F(inst.dest + i), 1.0f);
					else
						FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
				} else {
					FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
				}
			}
			break;
		}
		break;

	case IROp::Vec4Shuffle:
		if (inst.dest == inst.src1) {
			// In-place shuffle: grab an FPR temp so lanes can rotate through it.
			RiscVReg tempReg = regs_.MapWithFPRTemp(inst);

			// Try to find the least swaps needed to move in place, never worse than 6 FMVs.
			// Would be better with a vmerge and vector regs.
			// state[] tracks which original lane currently occupies each slot;
			// goal[] is the source lane each slot must end up holding
			// (two bits per lane in src2).
			int state[4]{ 0, 1, 2, 3 };
			int goal[4]{ (inst.src2 >> 0) & 3, (inst.src2 >> 2) & 3, (inst.src2 >> 4) & 3, (inst.src2 >> 6) & 3 };

			static constexpr int NOT_FOUND = 4;
			// Index of val within arr[start..3], or NOT_FOUND (4) if absent
			// (std::find returns the end pointer, arr + 4, on a miss).
			auto findIndex = [](int *arr, int val, int start = 0) {
				return (int)(std::find(arr + start, arr + 4, val) - arr);
			};
			// Copies values along the chain (lanes[0] <- lanes[1] <- ...).
			// When rotate is set, the first lane's old value wraps around to
			// the last lane via tempReg, making it a rotation, not a shift.
			auto moveChained = [&](const std::vector<int> &lanes, bool rotate) {
				int firstState = state[lanes.front()];
				if (rotate)
					FMV(32, tempReg, regs_.F(inst.dest + lanes.front()));
				for (size_t i = 1; i < lanes.size(); ++i) {
					FMV(32, regs_.F(inst.dest + lanes[i - 1]), regs_.F(inst.dest + lanes[i]));
					state[lanes[i - 1]] = state[lanes[i]];
				}
				if (rotate) {
					FMV(32, regs_.F(inst.dest + lanes.back()), tempReg);
					state[lanes.back()] = firstState;
				}
			};

			for (int i = 0; i < 4; ++i) {
				// Overlap, so if they match, nothing to do.
				if (goal[i] == state[i])
					continue;

				// neededBy: a later slot that still wants the value currently in slot i.
				int neededBy = findIndex(goal, state[i], i + 1);
				// foundIn: the slot currently holding the value slot i wants.
				int foundIn = findIndex(state, goal[i], 0);
				_assert_(foundIn != NOT_FOUND);

				// If nothing else needs our current value (or the needer IS the
				// source slot), a simple move/swap finishes this slot.
				if (neededBy == NOT_FOUND || neededBy == foundIn) {
					moveChained({ i, foundIn }, neededBy == foundIn);
					continue;
				}

				// Maybe we can avoid a swap and move the next thing into place.
				int neededByDepth2 = findIndex(goal, state[neededBy], i + 1);
				if (neededByDepth2 == NOT_FOUND || neededByDepth2 == foundIn) {
					moveChained({ neededBy, i, foundIn }, neededByDepth2 == foundIn);
					continue;
				}

				// Since we only have 4 items, this is as deep as the chain could go.
				int neededByDepth3 = findIndex(goal, state[neededByDepth2], i + 1);
				moveChained({ neededByDepth2, neededBy, i, foundIn }, neededByDepth3 == foundIn);
			}
		} else {
			// Distinct dest: straight per-lane copies from the selected src lanes.
			regs_.Map(inst);
			for (int i = 0; i < 4; ++i) {
				int lane = (inst.src2 >> (i * 2)) & 3;
				FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + lane));
			}
		}
		break;

	case IROp::Vec4Blend:
		regs_.Map(inst);
		// constant holds a 4-bit lane mask: bit i set -> lane i from src2, else src1.
		for (int i = 0; i < 4; ++i) {
			int which = (inst.constant >> i) & 1;
			IRReg srcReg = which ? inst.src2 : inst.src1;
			// Skip the copy when the chosen source already is the destination.
			if (inst.dest != srcReg)
				FMV(32, regs_.F(inst.dest + i), regs_.F(srcReg + i));
		}
		break;

	case IROp::Vec4Mov:
		// Nothing to emit when source and destination are the same vector.
		if (inst.dest != inst.src1) {
			regs_.Map(inst);
			for (int i = 0; i < 4; ++i)
				FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}
// Compiles IR vector arithmetic ops (add/sub/mul/div/scale/neg/abs), emitting
// one scalar single-precision FP instruction per lane.
void RiscVJitBackend::CompIR_VecArith(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Add:
		regs_.Map(inst);
		for (int lane = 0; lane < 4; ++lane)
			FADD(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2 + lane));
		break;

	case IROp::Vec4Sub:
		regs_.Map(inst);
		for (int lane = 0; lane < 4; ++lane)
			FSUB(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2 + lane));
		break;

	case IROp::Vec4Mul:
		regs_.Map(inst);
		for (int lane = 0; lane < 4; ++lane)
			FMUL(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2 + lane));
		break;

	case IROp::Vec4Div:
		regs_.Map(inst);
		for (int lane = 0; lane < 4; ++lane)
			FDIV(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2 + lane));
		break;

	case IROp::Vec4Scale:
		// Multiplies each lane of src1 by the single scalar in src2.
		regs_.Map(inst);
		if (Overlap(inst.src2, 1, inst.dest, 3)) {
			// The scalar sits inside dest's first three lanes, so write the
			// aliased lane last to keep the factor alive for the other lanes.
			// (If it aliased dest + 3, simple order would already read it
			// before overwriting it, so only length 3 is checked.)
			for (int lane = 0; lane < 4; ++lane) {
				if (inst.src2 != inst.dest + lane)
					FMUL(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2));
			}
			for (int lane = 0; lane < 4; ++lane) {
				if (inst.src2 == inst.dest + lane)
					FMUL(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2));
			}
		} else {
			// No aliasing hazard, plain lane order is fine.
			for (int lane = 0; lane < 4; ++lane)
				FMUL(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane), regs_.F(inst.src2));
		}
		break;

	case IROp::Vec4Neg:
		regs_.Map(inst);
		for (int lane = 0; lane < 4; ++lane)
			FNEG(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane));
		break;

	case IROp::Vec4Abs:
		regs_.Map(inst);
		for (int lane = 0; lane < 4; ++lane)
			FABS(32, regs_.F(inst.dest + lane), regs_.F(inst.src1 + lane));
		break;

	default:
		INVALIDOP;
		break;
	}
}
// Compiles horizontal vector ops; currently just Vec4Dot (4-lane dot product
// accumulated into a single destination register).
void RiscVJitBackend::CompIR_VecHoriz(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Dot:
		regs_.Map(inst);
		if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {
			// This means inst.dest overlaps one of src1 or src2. We have to do that one first.
			// Technically this may impact -0.0 and such, but dots accurately need to be aligned anyway.
			// NOTE(review): if dest aliased a lane of src1 AND a *different* lane
			// of src2, this loop would emit two FMULs and lose the first product
			// — presumably the IR never produces that combination; verify.
			for (int i = 0; i < 4; ++i) {
				if (inst.dest == inst.src1 + i || inst.dest == inst.src2 + i)
					FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
			}
			// Accumulate the remaining (non-aliased) lanes with fused multiply-add.
			for (int i = 0; i < 4; ++i) {
				if (inst.dest != inst.src1 + i && inst.dest != inst.src2 + i)
					FMADD(32, regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));
			}
		} else {
			// No aliasing: multiply lane 0, then FMADD the other three lanes in.
			FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));
			for (int i = 1; i < 4; ++i)
				FMADD(32, regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}
// Compiles vector pack/unpack ops by shuffling raw lane bit patterns through
// the integer scratch registers (FMV.X.W / FMV.W.X round trips).
void RiscVJitBackend::CompIR_VecPack(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	// These three have no native implementation yet; use the generic fallback.
	case IROp::Vec2Unpack16To31:
	case IROp::Vec4Pack32To8:
	case IROp::Vec2Pack31To16:
		CompIR_Generic(inst);
		break;

	case IROp::Vec4Unpack8To32:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		regs_.Map(inst);
		FMV(FMv::X, FMv::W, SCRATCH2, regs_.F(inst.src1));
		// Byte i of src1 becomes bits 24-31 of dest lane i, low bits zero.
		for (int i = 0; i < 4; ++i) {
			// Mask using walls.
			if (i != 0) {
				// Bring byte i down, then shift it up to the top of a word;
				// FMV.W.X only consumes the low 32 bits, discarding the rest.
				SRLI(SCRATCH1, SCRATCH2, i * 8);
				SLLI(SCRATCH1, SCRATCH1, 24);
			} else {
				SLLI(SCRATCH1, SCRATCH2, 24);
			}
			FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
		}
		break;

	case IROp::Vec2Unpack16To32:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		regs_.Map(inst);
		FMV(FMv::X, FMv::W, SCRATCH2, regs_.F(inst.src1));
		// Low halfword -> bits 16-31 of lane 0 (low 16 bits zero).
		SLLI(SCRATCH1, SCRATCH2, 16);
		FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
		// High halfword -> bits 16-31 of lane 1, via shift right then left.
		SRLI(SCRATCH1, SCRATCH2, 16);
		SLLI(SCRATCH1, SCRATCH1, 16);
		FMV(FMv::W, FMv::X, regs_.F(inst.dest + 1), SCRATCH1);
		break;

	case IROp::Vec4DuplicateUpperBitsAndShift1:
		regs_.Map(inst);
		for (int i = 0; i < 4; i++) {
			FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1 + i));
			// Replicate the top byte down through the word (8 bits, then the
			// resulting 16 at once), then shift the whole thing right by one.
			SRLIW(SCRATCH2, SCRATCH1, 8);
			OR(SCRATCH1, SCRATCH1, SCRATCH2);
			SRLIW(SCRATCH2, SCRATCH1, 16);
			OR(SCRATCH1, SCRATCH1, SCRATCH2);
			SRLIW(SCRATCH1, SCRATCH1, 1);
			FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
		}
		break;

	case IROp::Vec4Pack31To8:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		regs_.Map(inst);
		// Take bits 23-30 of each lane (top byte of a 31-bit value) and pack
		// the four bytes into SCRATCH2, lane i at byte position i.
		for (int i = 0; i < 4; ++i) {
			FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1 + i));
			SRLI(SCRATCH1, SCRATCH1, 23);
			if (i == 0) {
				ANDI(SCRATCH2, SCRATCH1, 0xFF);
			} else {
				ANDI(SCRATCH1, SCRATCH1, 0xFF);
				SLLI(SCRATCH1, SCRATCH1, 8 * i);
				OR(SCRATCH2, SCRATCH2, SCRATCH1);
			}
		}

		FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH2);
		break;

	case IROp::Vec2Pack32To16:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		// Result: (lane1 & 0xFFFF0000) | (lane0 >> 16) — the top halves of
		// both source lanes packed into one 32-bit value.
		regs_.Map(inst);
		FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1));
		FMV(FMv::X, FMv::W, SCRATCH2, regs_.F(inst.src1 + 1));
		// Keep in mind, this was sign-extended, so we have to zero the upper.
		SLLI(SCRATCH1, SCRATCH1, XLEN - 32);
		// Now we just set (SCRATCH2 & 0xFFFF0000) | SCRATCH1.
		SRLI(SCRATCH1, SCRATCH1, XLEN - 16);
		// Use a wall to mask. We can ignore the upper 32 here.
		SRLI(SCRATCH2, SCRATCH2, 16);
		SLLI(SCRATCH2, SCRATCH2, 16);
		OR(SCRATCH1, SCRATCH1, SCRATCH2);
		// Okay, to the floating point register.
		FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
		break;

	default:
		INVALIDOP;
		break;
	}
}
// Compiles vector clamp ops.
void RiscVJitBackend::CompIR_VecClamp(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4ClampToZero:
		regs_.Map(inst);
		for (int i = 0; i < 4; i++) {
			FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1 + i));
			// SCRATCH2 = all-ones when the lane's sign bit is set, else zero.
			SRAIW(SCRATCH2, SCRATCH1, 31);
			// lane & ~mask: negative lanes become 0, others pass through.
			// Zbb's ANDN does it in one instruction; otherwise NOT + AND.
			if (cpu_info.RiscV_Zbb) {
				ANDN(SCRATCH1, SCRATCH1, SCRATCH2);
			} else {
				NOT(SCRATCH2, SCRATCH2);
				AND(SCRATCH1, SCRATCH1, SCRATCH2);
			}
			FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
		}
		break;

	case IROp::Vec2ClampToZero:
		// No native implementation yet; use the generic fallback.
		CompIR_Generic(inst);
		break;

	default:
		INVALIDOP;
		break;
	}
}
} // namespace MIPSComp
442
443