GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/X64IRCompFPU.cpp
// Copyright (c) 2023- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#ifndef offsetof
#include <cstddef>
#endif

#include "Core/MIPS/x86/X64IRJit.h"
#include "Core/MIPS/x86/X64IRRegCache.h"

// This file contains compilation for floating point related instructions.
//
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE. No flags because that's in IR already.

// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }
#define CONDITIONAL_DISABLE {}
#define DISABLE { CompIR_Generic(inst); return; }
#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }

namespace MIPSComp {

using namespace Gen;
using namespace X64IRJitConstants;

void X64JitBackend::EmitFPUConstants() {
	EmitConst4x32(&constants.noSignMask, 0x7FFFFFFF);
	EmitConst4x32(&constants.signBitAll, 0x80000000);
	EmitConst4x32(&constants.positiveZeroes, 0x00000000);
	EmitConst4x32(&constants.positiveInfinity, 0x7F800000);
	EmitConst4x32(&constants.qNAN, 0x7FC00000);
	EmitConst4x32(&constants.positiveOnes, 0x3F800000);
	EmitConst4x32(&constants.negativeOnes, 0xBF800000);
	EmitConst4x32(&constants.maxIntBelowAsFloat, 0x4EFFFFFF);

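	// Note on the tables emitted below: mulTableVi2f[i] holds 2^-i and mulTableVf2i[i]
	// holds 2^i, written as 32-entry float arrays straight into the constant area.
	// They provide the per-bit scale factors used by the scaled int<->float
	// conversions (FCvtScaledSW / FCvtScaledWS) further down in this file.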
	constants.mulTableVi2f = (const float *)GetCodePointer();
	for (uint8_t i = 0; i < 32; ++i) {
		float fval = 1.0f / (1UL << i);
		uint32_t val;
		memcpy(&val, &fval, sizeof(val));

		Write32(val);
	}

	constants.mulTableVf2i = (const float *)GetCodePointer();
	for (uint8_t i = 0; i < 32; ++i) {
		float fval = (float)(1ULL << i);
		uint32_t val;
		memcpy(&val, &fval, sizeof(val));

		Write32(val);
	}
}

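// Copies lane `lane` of the vec4 in src into lane 0 of dest, picking the cheapest
// shuffle the host CPU offers: a plain move for lane 0, MOVSHDUP/MOVHLPS for lanes 1
// and 2 where available, and VPERMILPS or a MOVAPS+SHUFPS broadcast as the fallback.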
void X64JitBackend::CopyVec4ToFPRLane0(Gen::X64Reg dest, Gen::X64Reg src, int lane) {
	// TODO: Move to regcache or emitter maybe?
	if (lane == 0) {
		if (dest != src)
			MOVAPS(dest, R(src));
	} else if (lane == 1 && cpu_info.bSSE3) {
		MOVSHDUP(dest, R(src));
	} else if (lane == 2) {
		MOVHLPS(dest, src);
	} else if (cpu_info.bAVX) {
		VPERMILPS(128, dest, R(src), VFPU_SWIZZLE(lane, lane, lane, lane));
	} else {
		if (dest != src)
			MOVAPS(dest, R(src));
		SHUFPS(dest, R(dest), VFPU_SWIZZLE(lane, lane, lane, lane));
	}
}

void X64JitBackend::CompIR_FArith(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::FAdd:
		regs_.Map(inst);
		if (inst.dest == inst.src1) {
			ADDSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			ADDSS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VADDSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			ADDSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::FSub:
		if (inst.dest == inst.src1) {
			regs_.Map(inst);
			SUBSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (cpu_info.bAVX) {
			regs_.Map(inst);
			VSUBSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			X64Reg tempReg = regs_.MapWithFPRTemp(inst);
			MOVAPS(tempReg, regs_.F(inst.src2));
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SUBSS(regs_.FX(inst.dest), R(tempReg));
		} else {
			regs_.Map(inst);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SUBSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

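	// FMul needs extra care with NaNs: a NaN produced by the multiply itself
	// (e.g. 0 * inf) is canonicalized to the qNAN constant, while an input that was
	// already NaN is passed through unchanged - presumably to match PSP float
	// semantics. The UCOMISS/SETcc pair below records whether either input was NaN
	// so the fixup after the multiply only fires for newly created NaNs.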
	case IROp::FMul:
	{
		regs_.Map(inst);

		UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src2));
		SETcc(CC_P, R(SCRATCH1));

		if (inst.dest == inst.src1) {
			MULSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			MULSS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else if (cpu_info.bAVX) {
			VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MULSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}

		UCOMISS(regs_.FX(inst.dest), regs_.F(inst.dest));
		FixupBranch handleNAN = J_CC(CC_P);
		FixupBranch finish = J();

		SetJumpTarget(handleNAN);
		TEST(8, R(SCRATCH1), R(SCRATCH1));
		FixupBranch keepNAN = J_CC(CC_NZ);

		MOVSS(regs_.FX(inst.dest), M(constants.qNAN)); // rip accessible

		SetJumpTarget(keepNAN);
		SetJumpTarget(finish);
		break;
	}

	case IROp::FDiv:
		if (inst.dest == inst.src1) {
			regs_.Map(inst);
			DIVSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		} else if (cpu_info.bAVX) {
			regs_.Map(inst);
			VDIVSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else if (inst.dest == inst.src2) {
			X64Reg tempReg = regs_.MapWithFPRTemp(inst);
			MOVAPS(tempReg, regs_.F(inst.src2));
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			DIVSS(regs_.FX(inst.dest), R(tempReg));
		} else {
			regs_.Map(inst);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			DIVSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		break;

	case IROp::FSqrt:
		regs_.Map(inst);
		SQRTSS(regs_.FX(inst.dest), regs_.F(inst.src1));
		break;

	case IROp::FNeg:
		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VXORPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			XORPS(regs_.FX(inst.dest), M(constants.signBitAll)); // rip accessible
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

void X64JitBackend::CompIR_FAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::FMov:
		// Just to make sure we don't generate bad code.
		if (inst.dest == inst.src1)
			break;
		if (regs_.IsFPRMapped(inst.src1 & 3) && regs_.GetFPRLaneCount(inst.src1) == 4 && (inst.dest & ~3) != (inst.src1 & ~3)) {
			// Okay, this is an extract. Avoid unvec4ing src1.
			regs_.SpillLockFPR(inst.src1 & ~3);
			regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
			CopyVec4ToFPRLane0(regs_.FX(inst.dest), regs_.FX(inst.src1 & ~3), inst.src1 & 3);
		} else {
			regs_.Map(inst);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
		}
		break;

	case IROp::FAbs:
		regs_.Map(inst);
		if (cpu_info.bAVX) {
			VANDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.noSignMask)); // rip accessible
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			ANDPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
		}
		break;

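	// FSign is computed branchlessly: tempReg gets +/-1.0 by OR-ing the input's sign
	// bit onto positiveOnes, dest becomes an all-ones mask where the input equals
	// +/-0.0, and the final ANDNPS keeps zero for zero inputs and +/-1.0 otherwise.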
	case IROp::FSign:
	{
		X64Reg tempReg = regs_.MapWithFPRTemp(inst);

		// Set tempReg to +1.0 or -1.0 per sign bit.
		if (cpu_info.bAVX) {
			VANDPS(128, tempReg, regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible
		} else {
			MOVAPS(tempReg, regs_.F(inst.src1));
			ANDPS(tempReg, M(constants.signBitAll)); // rip accessible
		}
		ORPS(tempReg, M(constants.positiveOnes)); // rip accessible

		// Set dest = 0xFFFFFFFF if +0.0 or -0.0.
		if (inst.dest != inst.src1) {
			XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
			CMPPS(regs_.FX(inst.dest), regs_.F(inst.src1), CMP_EQ);
		} else {
			CMPPS(regs_.FX(inst.dest), M(constants.positiveZeroes), CMP_EQ); // rip accessible
		}

		// AND-NOT the zero mask with the +/-1.0 value, so a zero input stays zero.
		ANDNPS(regs_.FX(inst.dest), R(tempReg));
		break;
	}

	default:
		INVALIDOP;
		break;
	}
}

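// Float compares land either in IRREG_FPCOND (the FPU condition flag) or in the VFPU
// condition-code register. The ccToFpcond helper turns an x86 condition code from a
// UCOMISS into a 0/1 value in IRREG_FPCOND, using XOR+SETcc when that register has a
// byte-addressable low half and SETcc+MOVZX through SCRATCH1 otherwise.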
void X64JitBackend::CompIR_FCompare(IRInst inst) {
	CONDITIONAL_DISABLE;

	constexpr IRReg IRREG_VFPU_CC = IRREG_VFPU_CTRL_BASE + VFPU_CTRL_CC;

	auto ccToFpcond = [&](IRReg lhs, IRReg rhs, CCFlags cc) {
		if (regs_.HasLowSubregister(regs_.RX(IRREG_FPCOND))) {
			XOR(32, regs_.R(IRREG_FPCOND), regs_.R(IRREG_FPCOND));
			UCOMISS(regs_.FX(lhs), regs_.F(rhs));
			SETcc(cc, regs_.R(IRREG_FPCOND));
		} else {
			UCOMISS(regs_.FX(lhs), regs_.F(rhs));
			SETcc(cc, R(SCRATCH1));
			MOVZX(32, 8, regs_.RX(IRREG_FPCOND), R(SCRATCH1));
		}
	};

	switch (inst.op) {
	case IROp::FCmp:
		switch (inst.dest) {
		case IRFpCompareMode::False:
			regs_.SetGPRImm(IRREG_FPCOND, 0);
			break;

		case IRFpCompareMode::EitherUnordered:
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
			// PF = UNORDERED.
			ccToFpcond(inst.src1, inst.src2, CC_P);
			break;

		case IRFpCompareMode::EqualOrdered:
		{
			// Since UCOMISS doesn't give us ordered == directly, CMPSS is better.
			regs_.SpillLockFPR(inst.src1, inst.src2);
			X64Reg tempReg = regs_.GetAndLockTempFPR();
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });

			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);
			}
			MOVD_xmm(regs_.R(IRREG_FPCOND), tempReg);
			AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
			break;
		}

		case IRFpCompareMode::EqualUnordered:
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
			// E/ZF = EQUAL or UNORDERED.
			ccToFpcond(inst.src1, inst.src2, CC_E);
			break;

		case IRFpCompareMode::LessEqualOrdered:
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
			// AE/!CF = GREATER or EQUAL (src2/src1 reversed.)
			ccToFpcond(inst.src2, inst.src1, CC_AE);
			break;

		case IRFpCompareMode::LessEqualUnordered:
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
			// BE/CF||ZF = LESS THAN or EQUAL or UNORDERED.
			ccToFpcond(inst.src1, inst.src2, CC_BE);
			break;

		case IRFpCompareMode::LessOrdered:
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
			// A/!CF&&!ZF = GREATER (src2/src1 reversed.)
			ccToFpcond(inst.src2, inst.src1, CC_A);
			break;

		case IRFpCompareMode::LessUnordered:
			regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
			// B/CF = LESS THAN or UNORDERED.
			ccToFpcond(inst.src1, inst.src2, CC_B);
			break;

		default:
			_assert_msg_(false, "Unexpected IRFpCompareMode %d", inst.dest);
		}
		break;

	case IROp::FCmovVfpuCC:
		regs_.MapWithExtra(inst, { { 'G', IRREG_VFPU_CC, 1, MIPSMap::INIT } });
		if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC))) {
			TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(1 << (inst.src2 & 7)));
		} else {
			TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(1 << (inst.src2 & 7)));
		}

		if ((inst.src2 >> 7) & 1) {
			FixupBranch skip = J_CC(CC_Z);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SetJumpTarget(skip);
		} else {
			FixupBranch skip = J_CC(CC_NZ);
			MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			SetJumpTarget(skip);
		}
		break;

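	// FCmpVfpuBit updates one bit of the VFPU condition-code register: the low nibble
	// of inst.dest selects the VCondition (bit 2 of it flips the sense for the paired
	// conditions), and the high nibble selects which CC bit to write. Most conditions
	// produce an all-ones/zero mask in tempReg via CMPSS, which is then merged into
	// the affected bit of IRREG_VFPU_CC at the bottom.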
	case IROp::FCmpVfpuBit:
	{
		regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
		X64Reg tempReg = regs_.MapWithFPRTemp(inst);
		uint8_t affectedBit = 1 << (inst.dest >> 4);
		bool condNegated = (inst.dest & 4) != 0;

		bool takeBitFromTempReg = true;
		switch (VCondition(inst.dest & 0xF)) {
		case VC_EQ:
			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);
			}
			break;
		case VC_NE:
			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_NEQ);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				CMPSS(tempReg, regs_.F(inst.src2), CMP_NEQ);
			}
			break;
		case VC_LT:
			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_LT);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				CMPSS(tempReg, regs_.F(inst.src2), CMP_LT);
			}
			break;
		case VC_LE:
			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_LE);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				CMPSS(tempReg, regs_.F(inst.src2), CMP_LE);
			}
			break;
		case VC_GT:
			// This is just LT with src1/src2 swapped.
			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src2), regs_.F(inst.src1), CMP_LT);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src2));
				CMPSS(tempReg, regs_.F(inst.src1), CMP_LT);
			}
			break;
		case VC_GE:
			// This is just LE with src1/src2 swapped.
			if (cpu_info.bAVX) {
				VCMPSS(tempReg, regs_.FX(inst.src2), regs_.F(inst.src1), CMP_LE);
			} else {
				MOVAPS(tempReg, regs_.F(inst.src2));
				CMPSS(tempReg, regs_.F(inst.src1), CMP_LE);
			}
			break;
		case VC_EZ:
		case VC_NZ:
			XORPS(tempReg, R(tempReg));
			CMPSS(tempReg, regs_.F(inst.src1), !condNegated ? CMP_EQ : CMP_NEQ);
			break;
		case VC_EN:
		case VC_NN:
			CMPSS(tempReg, regs_.F(inst.src1), !condNegated ? CMP_UNORD : CMP_ORD);
			break;
		case VC_EI:
		case VC_NI:
			regs_.MapFPR(inst.src1);
			if (cpu_info.bAVX) {
				VANDPS(128, tempReg, regs_.FX(inst.src1), M(constants.noSignMask)); // rip accessible
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				ANDPS(tempReg, M(constants.noSignMask)); // rip accessible
			}
			CMPSS(tempReg, M(constants.positiveInfinity), !condNegated ? CMP_EQ : CMP_LT); // rip accessible
			break;
		case VC_ES:
		case VC_NS:
			// NAN - NAN is NAN, and Infinity - Infinity is also NAN.
			if (cpu_info.bAVX) {
				VSUBSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src1));
			} else {
				MOVAPS(tempReg, regs_.F(inst.src1));
				SUBSS(tempReg, regs_.F(inst.src1));
			}
			CMPSS(tempReg, regs_.F(inst.src1), !condNegated ? CMP_UNORD : CMP_ORD);
			break;
		case VC_TR:
			OR(32, regs_.R(IRREG_VFPU_CC), Imm8(affectedBit));
			takeBitFromTempReg = false;
			break;
		case VC_FL:
			AND(32, regs_.R(IRREG_VFPU_CC), Imm8(~affectedBit));
			takeBitFromTempReg = false;
			break;
		}

		if (takeBitFromTempReg) {
			MOVD_xmm(R(SCRATCH1), tempReg);
			AND(32, R(SCRATCH1), Imm8(affectedBit));
			AND(32, regs_.R(IRREG_VFPU_CC), Imm8(~affectedBit));
			OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
		}
		break;
	}

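	// FCmpVfpuAggregate recomputes the "any" (0x10) and "all" (0x20) summary bits of
	// the VFPU CC register from the component bits selected by inst.dest, with
	// shortcut paths for the common masks 1, 3, and 0xF.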
	case IROp::FCmpVfpuAggregate:
		regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
		if (inst.dest == 1) {
			// Special case 1, which is not uncommon.
			AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
			BT(32, regs_.R(IRREG_VFPU_CC), Imm8(0));
			FixupBranch skip = J_CC(CC_NC);
			OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x30));
			SetJumpTarget(skip);
		} else if (inst.dest == 3) {
			AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
			MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
			AND(32, R(SCRATCH1), Imm8(3));
			// 0, 1, and 3 are already correct for the any and all bits.
			CMP(32, R(SCRATCH1), Imm8(2));

			FixupBranch skip = J_CC(CC_NE);
			SUB(32, R(SCRATCH1), Imm8(1));
			SetJumpTarget(skip);

			SHL(32, R(SCRATCH1), Imm8(4));
			OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
		} else if (inst.dest == 0xF) {
			XOR(32, R(SCRATCH1), R(SCRATCH1));

			// Clear out the bits we're aggregating.
			// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
			AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));

			// Set the any bit, just using the AND above.
			FixupBranch noneSet = J_CC(CC_Z);
			OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));

			// Next up, the "all" bit.
			CMP(32, regs_.R(IRREG_VFPU_CC), Imm8(0x1F));
			SETcc(CC_E, R(SCRATCH1));
			SHL(32, R(SCRATCH1), Imm8(5));
			OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));

			SetJumpTarget(noneSet);
		} else {
			XOR(32, R(SCRATCH1), R(SCRATCH1));

			// Clear out the bits we're aggregating.
			// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
			AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));

			// Set the any bit.
			if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC)))
				TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(inst.dest));
			else
				TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
			FixupBranch noneSet = J_CC(CC_Z);
			OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));

			// Next up, the "all" bit. A bit annoying...
			MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
			AND(32, R(SCRATCH1), Imm8(inst.dest));
			CMP(32, R(SCRATCH1), Imm8(inst.dest));
			SETcc(CC_E, R(SCRATCH1));
			SHL(32, R(SCRATCH1), Imm8(5));
			OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));

			SetJumpTarget(noneSet);
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

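// FMin/FMax take a slow path when src1 is NaN, since MINSS/MAXSS simply return the
// second operand for unordered inputs. The slow path compares the raw bits as integers
// instead, flipping the comparison when both values are negative so the sign-magnitude
// ordering comes out right - presumably to mirror the PSP's min/max behavior on NaNs.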
void X64JitBackend::CompIR_FCondAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	FixupBranch skipNAN;
	FixupBranch finishNAN;
	FixupBranch negativeSigns;
	FixupBranch finishNANSigns;
	X64Reg tempReg = INVALID_REG;
	switch (inst.op) {
	case IROp::FMin:
		tempReg = regs_.GetAndLockTempGPR();
		regs_.Map(inst);
		UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src1));
		skipNAN = J_CC(CC_NP, true);

		// Slow path: NAN case. Check if both are negative.
		MOVD_xmm(R(tempReg), regs_.FX(inst.src1));
		MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src2));
		TEST(32, R(SCRATCH1), R(tempReg));
		negativeSigns = J_CC(CC_S);

		// Okay, one or the other positive.
		CMP(32, R(tempReg), R(SCRATCH1));
		CMOVcc(32, tempReg, R(SCRATCH1), CC_G);
		MOVD_xmm(regs_.FX(inst.dest), R(tempReg));
		finishNAN = J();

		// Okay, both negative.
		SetJumpTarget(negativeSigns);
		CMP(32, R(tempReg), R(SCRATCH1));
		CMOVcc(32, tempReg, R(SCRATCH1), CC_L);
		MOVD_xmm(regs_.FX(inst.dest), R(tempReg));
		finishNANSigns = J();

		SetJumpTarget(skipNAN);
		if (cpu_info.bAVX) {
			VMINSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MINSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		SetJumpTarget(finishNAN);
		SetJumpTarget(finishNANSigns);
		break;

	case IROp::FMax:
		tempReg = regs_.GetAndLockTempGPR();
		regs_.Map(inst);
		UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src1));
		skipNAN = J_CC(CC_NP, true);

		// Slow path: NAN case. Check if both are negative.
		MOVD_xmm(R(tempReg), regs_.FX(inst.src1));
		MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src2));
		TEST(32, R(SCRATCH1), R(tempReg));
		negativeSigns = J_CC(CC_S);

		// Okay, one or the other positive.
		CMP(32, R(tempReg), R(SCRATCH1));
		CMOVcc(32, tempReg, R(SCRATCH1), CC_L);
		MOVD_xmm(regs_.FX(inst.dest), R(tempReg));
		finishNAN = J();

		// Okay, both negative.
		SetJumpTarget(negativeSigns);
		CMP(32, R(tempReg), R(SCRATCH1));
		CMOVcc(32, tempReg, R(SCRATCH1), CC_G);
		MOVD_xmm(regs_.FX(inst.dest), R(tempReg));
		finishNANSigns = J();

		SetJumpTarget(skipNAN);
		if (cpu_info.bAVX) {
			VMAXSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));
		} else {
			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			MAXSS(regs_.FX(inst.dest), regs_.F(inst.src2));
		}
		SetJumpTarget(finishNAN);
		SetJumpTarget(finishNANSigns);
		break;

	default:
		INVALIDOP;
		break;
	}
}

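// Float-to-int conversions clamp through maxIntBelowAsFloat (0x4EFFFFFF, the largest
// float strictly below 2^31): a UCOMISS before the convert records whether the source
// was above that bound or NaN (the vector converts don't touch EFLAGS), and in that
// case the result is overwritten with noSignMask (0x7FFFFFFF), i.e. a saturated
// positive result.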
void X64JitBackend::CompIR_FCvt(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::FCvtWS:
	{
		regs_.Map(inst);
		UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible

		CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
		// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
		// We want noSignMask otherwise, GREATER or UNORDERED.
		FixupBranch isNAN = J_CC(CC_P);
		FixupBranch skip = J_CC(CC_BE);
		SetJumpTarget(isNAN);
		MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible

		SetJumpTarget(skip);
		break;
	}

	case IROp::FCvtSW:
		regs_.Map(inst);
		CVTDQ2PS(regs_.FX(inst.dest), regs_.F(inst.src1));
		break;

	case IROp::FCvtScaledWS:
		regs_.Map(inst);
		if (cpu_info.bSSE4_1) {
			int scale = inst.src2 & 0x1F;
			IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);

			if (scale != 0 && cpu_info.bAVX) {
				VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), M(&constants.mulTableVf2i[scale])); // rip accessible
			} else {
				if (inst.dest != inst.src1)
					MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
				if (scale != 0)
					MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
			}

			UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible

			switch (rmode) {
			case IRRoundMode::RINT_0:
				ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.dest));
				CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
				break;

			case IRRoundMode::CAST_1:
				CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
				break;

			case IRRoundMode::CEIL_2:
				ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.dest));
				CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
				break;

			case IRRoundMode::FLOOR_3:
				ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
				CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
				break;
			}

			// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
			// We want noSignMask otherwise, GREATER or UNORDERED.
			FixupBranch isNAN = J_CC(CC_P);
			FixupBranch skip = J_CC(CC_BE);
			SetJumpTarget(isNAN);
			MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
			SetJumpTarget(skip);
		} else {
			int scale = inst.src2 & 0x1F;
			IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);

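			// Without SSE4.1's ROUNDPS, the rounding direction comes from MXCSR's RC
			// field (bits 13-14: 0 = nearest, 1 = toward -inf, 2 = toward +inf,
			// 3 = toward zero). setMXCSR picks the RC value to install temporarily;
			// -1 means leave MXCSR alone, and truncation uses CVTTPS2DQ instead.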
			int setMXCSR = -1;
			bool useTrunc = false;
			switch (rmode) {
			case IRRoundMode::RINT_0:
				// TODO: Could skip if hasSetRounding, but we don't have the flag.
				setMXCSR = 0;
				break;
			case IRRoundMode::CAST_1:
				useTrunc = true;
				break;
			case IRRoundMode::CEIL_2:
				setMXCSR = 2;
				break;
			case IRRoundMode::FLOOR_3:
				setMXCSR = 1;
				break;
			}

			// Except for truncate, we need to update MXCSR to our preferred rounding mode.
			// TODO: Might be possible to cache this and update between instructions?
			// Probably kinda expensive to switch each time...
			if (setMXCSR != -1) {
				STMXCSR(MDisp(CTXREG, mxcsrTempOffset));
				MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));
				AND(32, R(SCRATCH1), Imm32(~(3 << 13)));
				if (setMXCSR != 0) {
					OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));
				}
				MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));
				LDMXCSR(MDisp(CTXREG, tempOffset));
			}

			if (inst.dest != inst.src1)
				MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
			if (scale != 0)
				MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible

			UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible

			if (useTrunc) {
				CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
			} else {
				CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
			}

			// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
			// We want noSignMask otherwise, GREATER or UNORDERED.
			FixupBranch isNAN = J_CC(CC_P);
			FixupBranch skip = J_CC(CC_BE);
			SetJumpTarget(isNAN);
			MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
			SetJumpTarget(skip);

			// Return MXCSR to its previous value.
			if (setMXCSR != -1) {
				LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));
			}
		}
		break;

	case IROp::FCvtScaledSW:
		regs_.Map(inst);
		CVTDQ2PS(regs_.FX(inst.dest), regs_.F(inst.src1));
		MULSS(regs_.FX(inst.dest), M(&constants.mulTableVi2f[inst.src2 & 0x1F])); // rip accessible
		break;

	default:
		INVALIDOP;
		break;
	}
}

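// FRound/FCeil/FFloor convert to int with an explicit rounding direction: with SSE4.1,
// the ROUNDPS family rounds before a truncating convert; without it, the MXCSR
// rounding-control field is switched temporarily, as in CompIR_FCvt above. Both paths
// reuse the maxIntBelowAsFloat / noSignMask saturation fixup.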
void X64JitBackend::CompIR_FRound(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::FCeil:
	case IROp::FFloor:
	case IROp::FRound:
		if (cpu_info.bSSE4_1) {
			regs_.Map(inst);
			UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible

			switch (inst.op) {
			case IROp::FCeil:
				ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.src1));
				break;

			case IROp::FFloor:
				ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.src1));
				break;

			case IROp::FRound:
				ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.src1));
				break;

			default:
				INVALIDOP;
			}
			CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
			// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
			// We want noSignMask otherwise, GREATER or UNORDERED.
			FixupBranch isNAN = J_CC(CC_P);
			FixupBranch skip = J_CC(CC_BE);
			SetJumpTarget(isNAN);
			MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible

			SetJumpTarget(skip);
		} else {
			regs_.Map(inst);

			int setMXCSR = -1;
			switch (inst.op) {
			case IROp::FRound:
				// TODO: Could skip if hasSetRounding, but we don't have the flag.
				setMXCSR = 0;
				break;
			case IROp::FCeil:
				setMXCSR = 2;
				break;
			case IROp::FFloor:
				setMXCSR = 1;
				break;
			default:
				INVALIDOP;
			}

			// TODO: Might be possible to cache this and update between instructions?
			// Probably kinda expensive to switch each time...
			if (setMXCSR != -1) {
				STMXCSR(MDisp(CTXREG, mxcsrTempOffset));
				MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));
				AND(32, R(SCRATCH1), Imm32(~(3 << 13)));
				if (setMXCSR != 0) {
					OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));
				}
				MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));
				LDMXCSR(MDisp(CTXREG, tempOffset));
			}

			UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible

			CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
			// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
			// We want noSignMask otherwise, GREATER or UNORDERED.
			FixupBranch isNAN = J_CC(CC_P);
			FixupBranch skip = J_CC(CC_BE);
			SetJumpTarget(isNAN);
			MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible

			SetJumpTarget(skip);

			// Return MXCSR to its previous value.
			if (setMXCSR != -1) {
				LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));
			}
		}
		break;

	case IROp::FTrunc:
	{
		regs_.Map(inst);
		UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible

		CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
		// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
		// We want noSignMask otherwise, GREATER or UNORDERED.
		FixupBranch isNAN = J_CC(CC_P);
		FixupBranch skip = J_CC(CC_BE);
		SetJumpTarget(isNAN);
		MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible

		SetJumpTarget(skip);
		break;
	}

	default:
		INVALIDOP;
		break;
	}
}

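// FSat0_1 and FSatMinus1_1 clamp to [0, 1] and [-1, 1]. The operand order of the
// MINSS/MAXSS pairs below matters because those instructions return the second source
// operand when the comparison is unordered, which is what the comments about putting
// the "known" value first rely on.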
void X64JitBackend::CompIR_FSat(IRInst inst) {
	CONDITIONAL_DISABLE;

	X64Reg tempReg = INVALID_REG;
	switch (inst.op) {
	case IROp::FSat0_1:
		tempReg = regs_.MapWithFPRTemp(inst);

		// The second argument's NAN is taken if either is NAN, so put known first.
		MOVSS(tempReg, M(constants.positiveOnes));
		MINSS(tempReg, regs_.F(inst.src1));

		// Now for NAN, we want known first again.
		// Unfortunately, this will retain -0.0, which we'll fix next.
		XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
		MAXSS(tempReg, regs_.F(inst.dest));

		// Important: this should clamp -0.0 to +0.0.
		ADDSS(regs_.FX(inst.dest), R(tempReg));
		break;

	case IROp::FSatMinus1_1:
		tempReg = regs_.MapWithFPRTemp(inst);

		// The second argument's NAN is taken if either is NAN, so put known first.
		MOVSS(tempReg, M(constants.negativeOnes));
		MAXSS(tempReg, regs_.F(inst.src1));

		// Again, stick with the first argument being known.
		MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes));
		MINSS(regs_.FX(inst.dest), R(tempReg));
		break;

	default:
		INVALIDOP;
		break;
	}
}

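// When X64JIT_USE_XMM_CALL is set, the ABI passes and returns floats in XMM registers,
// so the sin/cos/asin helpers can take and return float directly; otherwise the value
// is bit-cast through uint32_t so it can travel through a GPR.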
#if X64JIT_USE_XMM_CALL
static float X64JIT_XMM_CALL x64_sin(float f) {
	return vfpu_sin(f);
}

static float X64JIT_XMM_CALL x64_cos(float f) {
	return vfpu_cos(f);
}

static float X64JIT_XMM_CALL x64_asin(float f) {
	return vfpu_asin(f);
}
#else
static uint32_t x64_sin(uint32_t v) {
	float f;
	memcpy(&f, &v, sizeof(v));
	f = vfpu_sin(f);
	memcpy(&v, &f, sizeof(v));
	return v;
}

static uint32_t x64_cos(uint32_t v) {
	float f;
	memcpy(&f, &v, sizeof(v));
	f = vfpu_cos(f);
	memcpy(&v, &f, sizeof(v));
	return v;
}

static uint32_t x64_asin(uint32_t v) {
	float f;
	memcpy(&f, &v, sizeof(v));
	f = vfpu_asin(f);
	memcpy(&v, &f, sizeof(v));
	return v;
}
#endif

void X64JitBackend::CompIR_FSpecial(IRInst inst) {
	CONDITIONAL_DISABLE;

	auto callFuncF_F = [&](const void *func) {
		regs_.FlushBeforeCall();
		WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);

#if X64JIT_USE_XMM_CALL
		if (regs_.IsFPRMapped(inst.src1)) {
			int lane = regs_.GetFPRLane(inst.src1);
			CopyVec4ToFPRLane0(XMM0, regs_.FX(inst.src1), lane);
		} else {
			// Account for CTXREG being increased by 128 to reduce imm sizes.
			int offset = offsetof(MIPSState, f) + inst.src1 * 4 - 128;
			MOVSS(XMM0, MDisp(CTXREG, offset));
		}
		ABI_CallFunction((const void *)func);

		// It's already in place, NOINIT won't modify.
		regs_.MapFPR(inst.dest, MIPSMap::NOINIT | X64Map::XMM0);
#else
		if (regs_.IsFPRMapped(inst.src1)) {
			int lane = regs_.GetFPRLane(inst.src1);
			if (lane == 0) {
				MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src1));
			} else {
				CopyVec4ToFPRLane0(XMM0, regs_.FX(inst.src1), lane);
				MOVD_xmm(R(SCRATCH1), XMM0);
			}
		} else {
			int offset = offsetof(MIPSState, f) + inst.src1 * 4;
			MOV(32, R(SCRATCH1), MDisp(CTXREG, offset));
		}
		ABI_CallFunctionR((const void *)func, SCRATCH1);

		regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
		MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
#endif

		WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
	};

	switch (inst.op) {
	case IROp::FSin:
		callFuncF_F((const void *)&x64_sin);
		break;

	case IROp::FCos:
		callFuncF_F((const void *)&x64_cos);
		break;

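	// FRSqrt and FRecip use a real SQRTSS/DIVSS against 1.0f rather than the
	// approximate RSQRTSS/RCPSS instructions, presumably to keep full precision.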
	case IROp::FRSqrt:
	{
		X64Reg tempReg = regs_.MapWithFPRTemp(inst);
		SQRTSS(tempReg, regs_.F(inst.src1));

		MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible
		DIVSS(regs_.FX(inst.dest), R(tempReg));
		break;
	}

	case IROp::FRecip:
		if (inst.dest != inst.src1) {
			regs_.Map(inst);
			MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible
			DIVSS(regs_.FX(inst.dest), regs_.F(inst.src1));
		} else {
			X64Reg tempReg = regs_.MapWithFPRTemp(inst);
			MOVSS(tempReg, M(constants.positiveOnes)); // rip accessible
			if (cpu_info.bAVX) {
				VDIVSS(regs_.FX(inst.dest), tempReg, regs_.F(inst.src1));
			} else {
				DIVSS(tempReg, regs_.F(inst.src1));
				MOVSS(regs_.FX(inst.dest), R(tempReg));
			}
		}
		break;

	case IROp::FAsin:
		callFuncF_F((const void *)&x64_asin);
		break;

	default:
		INVALIDOP;
		break;
	}
}

void X64JitBackend::CompIR_RoundingMode(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::RestoreRoundingMode:
		RestoreRoundingMode();
		break;

	case IROp::ApplyRoundingMode:
		ApplyRoundingMode();
		break;

	case IROp::UpdateRoundingMode:
		// TODO: We might want to do something here?
		break;

	default:
		INVALIDOP;
		break;
	}
}

} // namespace MIPSComp

#endif