CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/x86/X64IRCompFPU.cpp
Views: 1401
// Copyright (c) 2023- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)1920#ifndef offsetof21#include <cstddef>22#endif2324#include "Core/MIPS/x86/X64IRJit.h"25#include "Core/MIPS/x86/X64IRRegCache.h"2627// This file contains compilation for floating point related instructions.28//29// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.30// Currently known non working ones should have DISABLE. No flags because that's in IR already.3132// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }33#define CONDITIONAL_DISABLE {}34#define DISABLE { CompIR_Generic(inst); return; }35#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }3637namespace MIPSComp {3839using namespace Gen;40using namespace X64IRJitConstants;4142void X64JitBackend::EmitFPUConstants() {43EmitConst4x32(&constants.noSignMask, 0x7FFFFFFF);44EmitConst4x32(&constants.signBitAll, 0x80000000);45EmitConst4x32(&constants.positiveZeroes, 0x00000000);46EmitConst4x32(&constants.positiveInfinity, 0x7F800000);47EmitConst4x32(&constants.qNAN, 0x7FC00000);48EmitConst4x32(&constants.positiveOnes, 0x3F800000);49EmitConst4x32(&constants.negativeOnes, 0xBF800000);50EmitConst4x32(&constants.maxIntBelowAsFloat, 0x4EFFFFFF);5152constants.mulTableVi2f = (const float *)GetCodePointer();53for (uint8_t i = 0; i < 32; ++i) {54float fval = 1.0f / (1UL << i);55uint32_t val;56memcpy(&val, &fval, sizeof(val));5758Write32(val);59}6061constants.mulTableVf2i = (const float *)GetCodePointer();62for (uint8_t i = 0; i < 32; ++i) {63float fval = (float)(1ULL << i);64uint32_t val;65memcpy(&val, &fval, sizeof(val));6667Write32(val);68}69}7071void X64JitBackend::CopyVec4ToFPRLane0(Gen::X64Reg dest, Gen::X64Reg src, int lane) {72// TODO: Move to regcache or emitter maybe?73if (lane == 0) {74if (dest != src)75MOVAPS(dest, R(src));76} else if (lane == 1 && cpu_info.bSSE3) {77MOVSHDUP(dest, R(src));78} else if (lane == 2) {79MOVHLPS(dest, src);80} else if (cpu_info.bAVX) {81VPERMILPS(128, dest, R(src), VFPU_SWIZZLE(lane, lane, lane, lane));82} else {83if (dest != src)84MOVAPS(dest, R(src));85SHUFPS(dest, R(dest), VFPU_SWIZZLE(lane, lane, lane, lane));86}87}8889void X64JitBackend::CompIR_FArith(IRInst inst) {90CONDITIONAL_DISABLE;9192switch (inst.op) {93case IROp::FAdd:94regs_.Map(inst);95if (inst.dest == inst.src1) {96ADDSS(regs_.FX(inst.dest), regs_.F(inst.src2));97} else if (inst.dest == inst.src2) {98ADDSS(regs_.FX(inst.dest), regs_.F(inst.src1));99} else if (cpu_info.bAVX) {100VADDSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));101} else {102MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));103ADDSS(regs_.FX(inst.dest), regs_.F(inst.src2));104}105break;106107case IROp::FSub:108if (inst.dest == inst.src1) {109regs_.Map(inst);110SUBSS(regs_.FX(inst.dest), regs_.F(inst.src2));111} else if (cpu_info.bAVX) {112regs_.Map(inst);113VSUBSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));114} else if (inst.dest == inst.src2) {115X64Reg tempReg = regs_.MapWithFPRTemp(inst);116MOVAPS(tempReg, regs_.F(inst.src2));117MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));118SUBSS(regs_.FX(inst.dest), R(tempReg));119} else {120regs_.Map(inst);121MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));122SUBSS(regs_.FX(inst.dest), regs_.F(inst.src2));123}124break;125126case IROp::FMul:127{128regs_.Map(inst);129130UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src2));131SETcc(CC_P, R(SCRATCH1));132133if (inst.dest == inst.src1) {134MULSS(regs_.FX(inst.dest), regs_.F(inst.src2));135} else if (inst.dest == inst.src2) {136MULSS(regs_.FX(inst.dest), regs_.F(inst.src1));137} else if (cpu_info.bAVX) {138VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));139} else {140MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));141MULSS(regs_.FX(inst.dest), regs_.F(inst.src2));142}143144UCOMISS(regs_.FX(inst.dest), regs_.F(inst.dest));145FixupBranch handleNAN = J_CC(CC_P);146FixupBranch finish = J();147148SetJumpTarget(handleNAN);149TEST(8, R(SCRATCH1), R(SCRATCH1));150FixupBranch keepNAN = J_CC(CC_NZ);151152MOVSS(regs_.FX(inst.dest), M(constants.qNAN)); // rip accessible153154SetJumpTarget(keepNAN);155SetJumpTarget(finish);156break;157}158159case IROp::FDiv:160if (inst.dest == inst.src1) {161regs_.Map(inst);162DIVSS(regs_.FX(inst.dest), regs_.F(inst.src2));163} else if (cpu_info.bAVX) {164regs_.Map(inst);165VDIVSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));166} else if (inst.dest == inst.src2) {167X64Reg tempReg = regs_.MapWithFPRTemp(inst);168MOVAPS(tempReg, regs_.F(inst.src2));169MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));170DIVSS(regs_.FX(inst.dest), R(tempReg));171} else {172regs_.Map(inst);173MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));174DIVSS(regs_.FX(inst.dest), regs_.F(inst.src2));175}176break;177178case IROp::FSqrt:179regs_.Map(inst);180SQRTSS(regs_.FX(inst.dest), regs_.F(inst.src1));181break;182183case IROp::FNeg:184regs_.Map(inst);185if (cpu_info.bAVX) {186VXORPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible187} else {188if (inst.dest != inst.src1)189MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));190XORPS(regs_.FX(inst.dest), M(constants.signBitAll)); // rip accessible191}192break;193194default:195INVALIDOP;196break;197}198}199200void X64JitBackend::CompIR_FAssign(IRInst inst) {201CONDITIONAL_DISABLE;202203switch (inst.op) {204case IROp::FMov:205// Just to make sure we don't generate bad code.206if (inst.dest == inst.src1)207break;208if (regs_.IsFPRMapped(inst.src1 & 3) && regs_.GetFPRLaneCount(inst.src1) == 4 && (inst.dest & ~3) != (inst.src1 & ~3)) {209// Okay, this is an extract. Avoid unvec4ing src1.210regs_.SpillLockFPR(inst.src1 & ~3);211regs_.MapFPR(inst.dest, MIPSMap::NOINIT);212CopyVec4ToFPRLane0(regs_.FX(inst.dest), regs_.FX(inst.src1 & ~3), inst.src1 & 3);213} else {214regs_.Map(inst);215MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));216}217break;218219case IROp::FAbs:220regs_.Map(inst);221if (cpu_info.bAVX) {222VANDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.noSignMask)); // rip accessible223} else {224if (inst.dest != inst.src1)225MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));226ANDPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible227}228break;229230case IROp::FSign:231{232X64Reg tempReg = regs_.MapWithFPRTemp(inst);233234// Set tempReg to +1.0 or -1.0 per sign bit.235if (cpu_info.bAVX) {236VANDPS(128, tempReg, regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible237} else {238MOVAPS(tempReg, regs_.F(inst.src1));239ANDPS(tempReg, M(constants.signBitAll)); // rip accessible240}241ORPS(tempReg, M(constants.positiveOnes)); // rip accessible242243// Set dest = 0xFFFFFFFF if +0.0 or -0.0.244if (inst.dest != inst.src1) {245XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));246CMPPS(regs_.FX(inst.dest), regs_.F(inst.src1), CMP_EQ);247} else {248CMPPS(regs_.FX(inst.dest), M(constants.positiveZeroes), CMP_EQ); // rip accessible249}250251// Now not the mask to keep zero if it was zero.252ANDNPS(regs_.FX(inst.dest), R(tempReg));253break;254}255256default:257INVALIDOP;258break;259}260}261262void X64JitBackend::CompIR_FCompare(IRInst inst) {263CONDITIONAL_DISABLE;264265constexpr IRReg IRREG_VFPU_CC = IRREG_VFPU_CTRL_BASE + VFPU_CTRL_CC;266267auto ccToFpcond = [&](IRReg lhs, IRReg rhs, CCFlags cc) {268if (regs_.HasLowSubregister(regs_.RX(IRREG_FPCOND))) {269XOR(32, regs_.R(IRREG_FPCOND), regs_.R(IRREG_FPCOND));270UCOMISS(regs_.FX(lhs), regs_.F(rhs));271SETcc(cc, regs_.R(IRREG_FPCOND));272} else {273UCOMISS(regs_.FX(lhs), regs_.F(rhs));274SETcc(cc, R(SCRATCH1));275MOVZX(32, 8, regs_.RX(IRREG_FPCOND), R(SCRATCH1));276}277};278279switch (inst.op) {280case IROp::FCmp:281switch (inst.dest) {282case IRFpCompareMode::False:283regs_.SetGPRImm(IRREG_FPCOND, 0);284break;285286case IRFpCompareMode::EitherUnordered:287regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });288// PF = UNORDERED.289ccToFpcond(inst.src1, inst.src2, CC_P);290break;291292case IRFpCompareMode::EqualOrdered:293{294// Since UCOMISS doesn't give us ordered == directly, CMPSS is better.295regs_.SpillLockFPR(inst.src1, inst.src2);296X64Reg tempReg = regs_.GetAndLockTempFPR();297regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });298299if (cpu_info.bAVX) {300VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);301} else {302MOVAPS(tempReg, regs_.F(inst.src1));303CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);304}305MOVD_xmm(regs_.R(IRREG_FPCOND), tempReg);306AND(32, regs_.R(IRREG_FPCOND), Imm32(1));307break;308}309310case IRFpCompareMode::EqualUnordered:311regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });312// E/ZF = EQUAL or UNORDERED.313ccToFpcond(inst.src1, inst.src2, CC_E);314break;315316case IRFpCompareMode::LessEqualOrdered:317regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });318// AE/!CF = GREATER or EQUAL (src2/src1 reversed.)319ccToFpcond(inst.src2, inst.src1, CC_AE);320break;321322case IRFpCompareMode::LessEqualUnordered:323regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });324// BE/CF||ZF = LESS THAN or EQUAL or UNORDERED.325ccToFpcond(inst.src1, inst.src2, CC_BE);326break;327328case IRFpCompareMode::LessOrdered:329regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });330// A/!CF&&!ZF = GREATER (src2/src1 reversed.)331ccToFpcond(inst.src2, inst.src1, CC_A);332break;333334case IRFpCompareMode::LessUnordered:335regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });336// B/CF = LESS THAN or UNORDERED.337ccToFpcond(inst.src1, inst.src2, CC_B);338break;339340default:341_assert_msg_(false, "Unexpected IRFpCompareMode %d", inst.dest);342}343break;344345case IROp::FCmovVfpuCC:346regs_.MapWithExtra(inst, { { 'G', IRREG_VFPU_CC, 1, MIPSMap::INIT } });347if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC))) {348TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(1 << (inst.src2 & 7)));349} else {350TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(1 << (inst.src2 & 7)));351}352353if ((inst.src2 >> 7) & 1) {354FixupBranch skip = J_CC(CC_Z);355MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));356SetJumpTarget(skip);357} else {358FixupBranch skip = J_CC(CC_NZ);359MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));360SetJumpTarget(skip);361}362break;363364case IROp::FCmpVfpuBit:365{366regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);367X64Reg tempReg = regs_.MapWithFPRTemp(inst);368uint8_t affectedBit = 1 << (inst.dest >> 4);369bool condNegated = (inst.dest & 4) != 0;370371bool takeBitFromTempReg = true;372switch (VCondition(inst.dest & 0xF)) {373case VC_EQ:374if (cpu_info.bAVX) {375VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);376} else {377MOVAPS(tempReg, regs_.F(inst.src1));378CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);379}380break;381case VC_NE:382if (cpu_info.bAVX) {383VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_NEQ);384} else {385MOVAPS(tempReg, regs_.F(inst.src1));386CMPSS(tempReg, regs_.F(inst.src2), CMP_NEQ);387}388break;389case VC_LT:390if (cpu_info.bAVX) {391VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_LT);392} else {393MOVAPS(tempReg, regs_.F(inst.src1));394CMPSS(tempReg, regs_.F(inst.src2), CMP_LT);395}396break;397case VC_LE:398if (cpu_info.bAVX) {399VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_LE);400} else {401MOVAPS(tempReg, regs_.F(inst.src1));402CMPSS(tempReg, regs_.F(inst.src2), CMP_LE);403}404break;405case VC_GT:406// This is just LT with src1/src2 swapped.407if (cpu_info.bAVX) {408VCMPSS(tempReg, regs_.FX(inst.src2), regs_.F(inst.src1), CMP_LT);409} else {410MOVAPS(tempReg, regs_.F(inst.src2));411CMPSS(tempReg, regs_.F(inst.src1), CMP_LT);412}413break;414case VC_GE:415// This is just LE with src1/src2 swapped.416if (cpu_info.bAVX) {417VCMPSS(tempReg, regs_.FX(inst.src2), regs_.F(inst.src1), CMP_LE);418} else {419MOVAPS(tempReg, regs_.F(inst.src2));420CMPSS(tempReg, regs_.F(inst.src1), CMP_LE);421}422break;423case VC_EZ:424case VC_NZ:425XORPS(tempReg, R(tempReg));426CMPSS(tempReg, regs_.F(inst.src1), !condNegated ? CMP_EQ : CMP_NEQ);427break;428case VC_EN:429case VC_NN:430CMPSS(tempReg, regs_.F(inst.src1), !condNegated ? CMP_UNORD : CMP_ORD);431break;432case VC_EI:433case VC_NI:434regs_.MapFPR(inst.src1);435if (cpu_info.bAVX) {436VANDPS(128, tempReg, regs_.FX(inst.src1), M(constants.noSignMask)); // rip accessible437} else {438MOVAPS(tempReg, regs_.F(inst.src1));439ANDPS(tempReg, M(constants.noSignMask)); // rip accessible440}441CMPSS(tempReg, M(constants.positiveInfinity), !condNegated ? CMP_EQ : CMP_LT); // rip accessible442break;443case VC_ES:444case VC_NS:445// NAN - NAN is NAN, and Infinity - Infinity is also NAN.446if (cpu_info.bAVX) {447VSUBSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src1));448} else {449MOVAPS(tempReg, regs_.F(inst.src1));450SUBSS(tempReg, regs_.F(inst.src1));451}452CMPSS(tempReg, regs_.F(inst.src1), !condNegated ? CMP_UNORD : CMP_ORD);453break;454case VC_TR:455OR(32, regs_.R(IRREG_VFPU_CC), Imm8(affectedBit));456takeBitFromTempReg = true;457break;458case VC_FL:459AND(32, regs_.R(IRREG_VFPU_CC), Imm8(~affectedBit));460takeBitFromTempReg = false;461break;462}463464if (takeBitFromTempReg) {465MOVD_xmm(R(SCRATCH1), tempReg);466AND(32, R(SCRATCH1), Imm8(affectedBit));467AND(32, regs_.R(IRREG_VFPU_CC), Imm8(~affectedBit));468OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));469}470break;471}472473case IROp::FCmpVfpuAggregate:474regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);475if (inst.dest == 1) {476// Special case 1, which is not uncommon.477AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));478BT(32, regs_.R(IRREG_VFPU_CC), Imm8(0));479FixupBranch skip = J_CC(CC_NC);480OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x30));481SetJumpTarget(skip);482} else if (inst.dest == 3) {483AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));484MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));485AND(32, R(SCRATCH1), Imm8(3));486// 0, 1, and 3 are already correct for the any and all bits.487CMP(32, R(SCRATCH1), Imm8(2));488489FixupBranch skip = J_CC(CC_NE);490SUB(32, R(SCRATCH1), Imm8(1));491SetJumpTarget(skip);492493SHL(32, R(SCRATCH1), Imm8(4));494OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));495} else if (inst.dest == 0xF) {496XOR(32, R(SCRATCH1), R(SCRATCH1));497498// Clear out the bits we're aggregating.499// The register refuses writes to bits outside 0x3F, and we're setting 0x30.500AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));501502// Set the any bit, just using the AND above.503FixupBranch noneSet = J_CC(CC_Z);504OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));505506// Next up, the "all" bit.507CMP(32, regs_.R(IRREG_VFPU_CC), Imm8(0x1F));508SETcc(CC_E, R(SCRATCH1));509SHL(32, R(SCRATCH1), Imm8(5));510OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));511512SetJumpTarget(noneSet);513} else {514XOR(32, R(SCRATCH1), R(SCRATCH1));515516// Clear out the bits we're aggregating.517// The register refuses writes to bits outside 0x3F, and we're setting 0x30.518AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));519520// Set the any bit.521if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC)))522TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(inst.dest));523else524TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));525FixupBranch noneSet = J_CC(CC_Z);526OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));527528// Next up, the "all" bit. A bit annoying...529MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));530AND(32, R(SCRATCH1), Imm8(inst.dest));531CMP(32, R(SCRATCH1), Imm8(inst.dest));532SETcc(CC_E, R(SCRATCH1));533SHL(32, R(SCRATCH1), Imm8(5));534OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));535536SetJumpTarget(noneSet);537}538break;539540default:541INVALIDOP;542break;543}544}545546void X64JitBackend::CompIR_FCondAssign(IRInst inst) {547CONDITIONAL_DISABLE;548549FixupBranch skipNAN;550FixupBranch finishNAN;551FixupBranch negativeSigns;552FixupBranch finishNANSigns;553X64Reg tempReg = INVALID_REG;554switch (inst.op) {555case IROp::FMin:556tempReg = regs_.GetAndLockTempGPR();557regs_.Map(inst);558UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src1));559skipNAN = J_CC(CC_NP, true);560561// Slow path: NAN case. Check if both are negative.562MOVD_xmm(R(tempReg), regs_.FX(inst.src1));563MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src2));564TEST(32, R(SCRATCH1), R(tempReg));565negativeSigns = J_CC(CC_S);566567// Okay, one or the other positive.568CMP(32, R(tempReg), R(SCRATCH1));569CMOVcc(32, tempReg, R(SCRATCH1), CC_G);570MOVD_xmm(regs_.FX(inst.dest), R(tempReg));571finishNAN = J();572573// Okay, both negative.574SetJumpTarget(negativeSigns);575CMP(32, R(tempReg), R(SCRATCH1));576CMOVcc(32, tempReg, R(SCRATCH1), CC_L);577MOVD_xmm(regs_.FX(inst.dest), R(tempReg));578finishNANSigns = J();579580SetJumpTarget(skipNAN);581if (cpu_info.bAVX) {582VMINSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));583} else {584if (inst.dest != inst.src1)585MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));586MINSS(regs_.FX(inst.dest), regs_.F(inst.src2));587}588SetJumpTarget(finishNAN);589SetJumpTarget(finishNANSigns);590break;591592case IROp::FMax:593tempReg = regs_.GetAndLockTempGPR();594regs_.Map(inst);595UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src1));596skipNAN = J_CC(CC_NP, true);597598// Slow path: NAN case. Check if both are negative.599MOVD_xmm(R(tempReg), regs_.FX(inst.src1));600MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src2));601TEST(32, R(SCRATCH1), R(tempReg));602negativeSigns = J_CC(CC_S);603604// Okay, one or the other positive.605CMP(32, R(tempReg), R(SCRATCH1));606CMOVcc(32, tempReg, R(SCRATCH1), CC_L);607MOVD_xmm(regs_.FX(inst.dest), R(tempReg));608finishNAN = J();609610// Okay, both negative.611SetJumpTarget(negativeSigns);612CMP(32, R(tempReg), R(SCRATCH1));613CMOVcc(32, tempReg, R(SCRATCH1), CC_G);614MOVD_xmm(regs_.FX(inst.dest), R(tempReg));615finishNANSigns = J();616617SetJumpTarget(skipNAN);618if (cpu_info.bAVX) {619VMAXSS(regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));620} else {621if (inst.dest != inst.src1)622MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));623MAXSS(regs_.FX(inst.dest), regs_.F(inst.src2));624}625SetJumpTarget(finishNAN);626SetJumpTarget(finishNANSigns);627break;628629default:630INVALIDOP;631break;632}633}634635void X64JitBackend::CompIR_FCvt(IRInst inst) {636CONDITIONAL_DISABLE;637638switch (inst.op) {639case IROp::FCvtWS:640{641regs_.Map(inst);642UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible643644CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));645// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.646// We want noSignMask otherwise, GREATER or UNORDERED.647FixupBranch isNAN = J_CC(CC_P);648FixupBranch skip = J_CC(CC_BE);649SetJumpTarget(isNAN);650MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible651652SetJumpTarget(skip);653break;654}655656case IROp::FCvtSW:657regs_.Map(inst);658CVTDQ2PS(regs_.FX(inst.dest), regs_.F(inst.src1));659break;660661case IROp::FCvtScaledWS:662regs_.Map(inst);663if (cpu_info.bSSE4_1) {664int scale = inst.src2 & 0x1F;665IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);666667if (scale != 0 && cpu_info.bAVX) {668VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), M(&constants.mulTableVf2i[scale])); // rip accessible669} else {670if (inst.dest != inst.src1)671MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));672if (scale != 0)673MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible674}675676UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible677678switch (rmode) {679case IRRoundMode::RINT_0:680ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.dest));681CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));682break;683684case IRRoundMode::CAST_1:685CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));686break;687688case IRRoundMode::CEIL_2:689ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.dest));690CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));691break;692693case IRRoundMode::FLOOR_3:694ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.dest));695CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));696break;697}698699// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.700// We want noSignMask otherwise, GREATER or UNORDERED.701FixupBranch isNAN = J_CC(CC_P);702FixupBranch skip = J_CC(CC_BE);703SetJumpTarget(isNAN);704MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible705SetJumpTarget(skip);706} else {707int scale = inst.src2 & 0x1F;708IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);709710int setMXCSR = -1;711bool useTrunc = false;712switch (rmode) {713case IRRoundMode::RINT_0:714// TODO: Could skip if hasSetRounding, but we don't have the flag.715setMXCSR = 0;716break;717case IRRoundMode::CAST_1:718useTrunc = true;719break;720case IRRoundMode::CEIL_2:721setMXCSR = 2;722break;723case IRRoundMode::FLOOR_3:724setMXCSR = 1;725break;726}727728// Except for truncate, we need to update MXCSR to our preferred rounding mode.729// TODO: Might be possible to cache this and update between instructions?730// Probably kinda expensive to switch each time...731if (setMXCSR != -1) {732STMXCSR(MDisp(CTXREG, mxcsrTempOffset));733MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));734AND(32, R(SCRATCH1), Imm32(~(3 << 13)));735if (setMXCSR != 0) {736OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));737}738MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));739LDMXCSR(MDisp(CTXREG, tempOffset));740}741742if (inst.dest != inst.src1)743MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));744if (scale != 0)745MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible746747UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible748749if (useTrunc) {750CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));751} else {752CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));753}754755// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.756// We want noSignMask otherwise, GREATER or UNORDERED.757FixupBranch isNAN = J_CC(CC_P);758FixupBranch skip = J_CC(CC_BE);759SetJumpTarget(isNAN);760MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible761SetJumpTarget(skip);762763// Return MXCSR to its previous value.764if (setMXCSR != -1) {765LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));766}767}768break;769770case IROp::FCvtScaledSW:771regs_.Map(inst);772CVTDQ2PS(regs_.FX(inst.dest), regs_.F(inst.src1));773MULSS(regs_.FX(inst.dest), M(&constants.mulTableVi2f[inst.src2 & 0x1F])); // rip accessible774break;775776default:777INVALIDOP;778break;779}780}781782void X64JitBackend::CompIR_FRound(IRInst inst) {783CONDITIONAL_DISABLE;784785switch (inst.op) {786case IROp::FCeil:787case IROp::FFloor:788case IROp::FRound:789if (cpu_info.bSSE4_1) {790regs_.Map(inst);791UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible792793switch (inst.op) {794case IROp::FCeil:795ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.src1));796break;797798case IROp::FFloor:799ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.src1));800break;801802case IROp::FRound:803ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.src1));804break;805806default:807INVALIDOP;808}809CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));810// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.811// We want noSignMask otherwise, GREATER or UNORDERED.812FixupBranch isNAN = J_CC(CC_P);813FixupBranch skip = J_CC(CC_BE);814SetJumpTarget(isNAN);815MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible816817SetJumpTarget(skip);818} else {819regs_.Map(inst);820821int setMXCSR = -1;822switch (inst.op) {823case IROp::FRound:824// TODO: Could skip if hasSetRounding, but we don't have the flag.825setMXCSR = 0;826break;827case IROp::FCeil:828setMXCSR = 2;829break;830case IROp::FFloor:831setMXCSR = 1;832break;833default:834INVALIDOP;835}836837// TODO: Might be possible to cache this and update between instructions?838// Probably kinda expensive to switch each time...839if (setMXCSR != -1) {840STMXCSR(MDisp(CTXREG, mxcsrTempOffset));841MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));842AND(32, R(SCRATCH1), Imm32(~(3 << 13)));843if (setMXCSR != 0) {844OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));845}846MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));847LDMXCSR(MDisp(CTXREG, tempOffset));848}849850UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible851852CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));853// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.854// We want noSignMask otherwise, GREATER or UNORDERED.855FixupBranch isNAN = J_CC(CC_P);856FixupBranch skip = J_CC(CC_BE);857SetJumpTarget(isNAN);858MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible859860SetJumpTarget(skip);861862// Return MXCSR to its previous value.863if (setMXCSR != -1) {864LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));865}866}867break;868869case IROp::FTrunc:870{871regs_.Map(inst);872UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible873874CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));875// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.876// We want noSignMask otherwise, GREATER or UNORDERED.877FixupBranch isNAN = J_CC(CC_P);878FixupBranch skip = J_CC(CC_BE);879SetJumpTarget(isNAN);880MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible881882SetJumpTarget(skip);883break;884}885886default:887INVALIDOP;888break;889}890}891892void X64JitBackend::CompIR_FSat(IRInst inst) {893CONDITIONAL_DISABLE;894895X64Reg tempReg = INVALID_REG;896switch (inst.op) {897case IROp::FSat0_1:898tempReg = regs_.MapWithFPRTemp(inst);899900// The second argument's NAN is taken if either is NAN, so put known first.901MOVSS(tempReg, M(constants.positiveOnes));902MINSS(tempReg, regs_.F(inst.src1));903904// Now for NAN, we want known first again.905// Unfortunately, this will retain -0.0, which we'll fix next.906XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));907MAXSS(tempReg, regs_.F(inst.dest));908909// Important: this should clamp -0.0 to +0.0.910ADDSS(regs_.FX(inst.dest), R(tempReg));911break;912913case IROp::FSatMinus1_1:914tempReg = regs_.MapWithFPRTemp(inst);915916// The second argument's NAN is taken if either is NAN, so put known first.917MOVSS(tempReg, M(constants.negativeOnes));918MAXSS(tempReg, regs_.F(inst.src1));919920// Again, stick with the first argument being known.921MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes));922MINSS(regs_.FX(inst.dest), R(tempReg));923break;924925default:926INVALIDOP;927break;928}929}930931#if X64JIT_USE_XMM_CALL932static float X64JIT_XMM_CALL x64_sin(float f) {933return vfpu_sin(f);934}935936static float X64JIT_XMM_CALL x64_cos(float f) {937return vfpu_cos(f);938}939940static float X64JIT_XMM_CALL x64_asin(float f) {941return vfpu_asin(f);942}943#else944static uint32_t x64_sin(uint32_t v) {945float f;946memcpy(&f, &v, sizeof(v));947f = vfpu_sin(f);948memcpy(&v, &f, sizeof(v));949return v;950}951952static uint32_t x64_cos(uint32_t v) {953float f;954memcpy(&f, &v, sizeof(v));955f = vfpu_cos(f);956memcpy(&v, &f, sizeof(v));957return v;958}959960static uint32_t x64_asin(uint32_t v) {961float f;962memcpy(&f, &v, sizeof(v));963f = vfpu_asin(f);964memcpy(&v, &f, sizeof(v));965return v;966}967#endif968969void X64JitBackend::CompIR_FSpecial(IRInst inst) {970CONDITIONAL_DISABLE;971972auto callFuncF_F = [&](const void *func) {973regs_.FlushBeforeCall();974WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);975976#if X64JIT_USE_XMM_CALL977if (regs_.IsFPRMapped(inst.src1)) {978int lane = regs_.GetFPRLane(inst.src1);979CopyVec4ToFPRLane0(XMM0, regs_.FX(inst.src1), lane);980} else {981// Account for CTXREG being increased by 128 to reduce imm sizes.982int offset = offsetof(MIPSState, f) + inst.src1 * 4 - 128;983MOVSS(XMM0, MDisp(CTXREG, offset));984}985ABI_CallFunction((const void *)func);986987// It's already in place, NOINIT won't modify.988regs_.MapFPR(inst.dest, MIPSMap::NOINIT | X64Map::XMM0);989#else990if (regs_.IsFPRMapped(inst.src1)) {991int lane = regs_.GetFPRLane(inst.src1);992if (lane == 0) {993MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src1));994} else {995CopyVec4ToFPRLane0(XMM0, regs_.FX(inst.src1), lane);996MOVD_xmm(R(SCRATCH1), XMM0);997}998} else {999int offset = offsetof(MIPSState, f) + inst.src1 * 4;1000MOV(32, R(SCRATCH1), MDisp(CTXREG, offset));1001}1002ABI_CallFunctionR((const void *)func, SCRATCH1);10031004regs_.MapFPR(inst.dest, MIPSMap::NOINIT);1005MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));1006#endif10071008WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);1009};10101011switch (inst.op) {1012case IROp::FSin:1013callFuncF_F((const void *)&x64_sin);1014break;10151016case IROp::FCos:1017callFuncF_F((const void *)&x64_cos);1018break;10191020case IROp::FRSqrt:1021{1022X64Reg tempReg = regs_.MapWithFPRTemp(inst);1023SQRTSS(tempReg, regs_.F(inst.src1));10241025MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible1026DIVSS(regs_.FX(inst.dest), R(tempReg));1027break;1028}10291030case IROp::FRecip:1031if (inst.dest != inst.src1) {1032regs_.Map(inst);1033MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible1034DIVSS(regs_.FX(inst.dest), regs_.F(inst.src1));1035} else {1036X64Reg tempReg = regs_.MapWithFPRTemp(inst);1037MOVSS(tempReg, M(constants.positiveOnes)); // rip accessible1038if (cpu_info.bAVX) {1039VDIVSS(regs_.FX(inst.dest), tempReg, regs_.F(inst.src1));1040} else {1041DIVSS(tempReg, regs_.F(inst.src1));1042MOVSS(regs_.FX(inst.dest), R(tempReg));1043}1044}1045break;10461047case IROp::FAsin:1048callFuncF_F((const void *)&x64_asin);1049break;10501051default:1052INVALIDOP;1053break;1054}1055}10561057void X64JitBackend::CompIR_RoundingMode(IRInst inst) {1058CONDITIONAL_DISABLE;10591060switch (inst.op) {1061case IROp::RestoreRoundingMode:1062RestoreRoundingMode();1063break;10641065case IROp::ApplyRoundingMode:1066ApplyRoundingMode();1067break;10681069case IROp::UpdateRoundingMode:1070// TODO: We might want to do something here?1071break;10721073default:1074INVALIDOP;1075break;1076}1077}10781079} // namespace MIPSComp10801081#endif108210831084