CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM64/Arm64IRCompFPU.cpp
Views: 1401
// Copyright (c) 2023- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18// In other words, PPSSPP_ARCH(ARM64) || DISASM_ALL.19#if PPSSPP_ARCH(ARM64) || (PPSSPP_PLATFORM(WINDOWS) && !defined(__LIBRETRO__))2021#ifndef offsetof22#include <cstddef>23#endif2425#include "Core/MIPS/ARM64/Arm64IRJit.h"26#include "Core/MIPS/ARM64/Arm64IRRegCache.h"2728// This file contains compilation for floating point related instructions.29//30// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.31// Currently known non working ones should have DISABLE. 
No flags because that's in IR already.3233// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }34#define CONDITIONAL_DISABLE {}35#define DISABLE { CompIR_Generic(inst); return; }36#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }3738namespace MIPSComp {3940using namespace Arm64Gen;41using namespace Arm64IRJitConstants;4243void Arm64JitBackend::CompIR_FArith(IRInst inst) {44CONDITIONAL_DISABLE;4546switch (inst.op) {47case IROp::FAdd:48regs_.Map(inst);49fp_.FADD(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));50break;5152case IROp::FSub:53regs_.Map(inst);54fp_.FSUB(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));55break;5657case IROp::FMul:58regs_.Map(inst);59fp_.FMUL(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));60break;6162case IROp::FDiv:63regs_.Map(inst);64fp_.FDIV(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));65break;6667case IROp::FSqrt:68regs_.Map(inst);69fp_.FSQRT(regs_.F(inst.dest), regs_.F(inst.src1));70break;7172case IROp::FNeg:73regs_.Map(inst);74fp_.FNEG(regs_.F(inst.dest), regs_.F(inst.src1));75break;7677default:78INVALIDOP;79break;80}81}8283void Arm64JitBackend::CompIR_FAssign(IRInst inst) {84CONDITIONAL_DISABLE;8586switch (inst.op) {87case IROp::FMov:88if (inst.dest != inst.src1) {89regs_.Map(inst);90fp_.FMOV(regs_.F(inst.dest), regs_.F(inst.src1));91}92break;9394case IROp::FAbs:95regs_.Map(inst);96fp_.FABS(regs_.F(inst.dest), regs_.F(inst.src1));97break;9899case IROp::FSign:100regs_.Map(inst);101// We'll need this flag later. 
Vector could use a temp and FCMEQ.102fp_.FCMP(regs_.F(inst.src1));103104fp_.MOVI2FDUP(EncodeRegToDouble(SCRATCHF1), 1.0f);105// Invert 0x80000000 -> 0x7FFFFFFF as a mask for sign.106fp_.MVNI(32, EncodeRegToDouble(SCRATCHF2), 0x80, 24);107// Keep the sign bit in dest, replace all other bits from 1.0f.108if (inst.dest != inst.src1)109fp_.FMOV(regs_.FD(inst.dest), regs_.FD(inst.src1));110fp_.BIT(regs_.FD(inst.dest), EncodeRegToDouble(SCRATCHF1), EncodeRegToDouble(SCRATCHF2));111112// It's later now, let's replace with zero if that FCmp was EQ to zero.113fp_.MOVI2FDUP(EncodeRegToDouble(SCRATCHF1), 0.0f);114fp_.FCSEL(regs_.F(inst.dest), SCRATCHF1, regs_.F(inst.dest), CC_EQ);115break;116117default:118INVALIDOP;119break;120}121}122123void Arm64JitBackend::CompIR_FCompare(IRInst inst) {124CONDITIONAL_DISABLE;125126constexpr IRReg IRREG_VFPU_CC = IRREG_VFPU_CTRL_BASE + VFPU_CTRL_CC;127128switch (inst.op) {129case IROp::FCmp:130switch (inst.dest) {131case IRFpCompareMode::False:132regs_.SetGPRImm(IRREG_FPCOND, 0);133break;134135case IRFpCompareMode::EitherUnordered:136regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });137fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));138CSET(regs_.R(IRREG_FPCOND), CC_VS);139break;140141case IRFpCompareMode::EqualOrdered:142regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });143fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));144CSET(regs_.R(IRREG_FPCOND), CC_EQ);145break;146147case IRFpCompareMode::EqualUnordered:148regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });149fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));150CSET(regs_.R(IRREG_FPCOND), CC_EQ);151// If ordered, use the above result. 
If unordered, use ZR+1 (being 1.)152CSINC(regs_.R(IRREG_FPCOND), regs_.R(IRREG_FPCOND), WZR, CC_VC);153break;154155case IRFpCompareMode::LessEqualOrdered:156regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });157fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));158CSET(regs_.R(IRREG_FPCOND), CC_LS);159break;160161case IRFpCompareMode::LessEqualUnordered:162regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });163fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));164CSET(regs_.R(IRREG_FPCOND), CC_LE);165break;166167case IRFpCompareMode::LessOrdered:168regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });169fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));170CSET(regs_.R(IRREG_FPCOND), CC_LO);171break;172173case IRFpCompareMode::LessUnordered:174regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });175fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));176CSET(regs_.R(IRREG_FPCOND), CC_LT);177break;178179default:180_assert_msg_(false, "Unexpected IRFpCompareMode %d", inst.dest);181}182break;183184case IROp::FCmovVfpuCC:185regs_.MapWithExtra(inst, { { 'G', IRREG_VFPU_CC, 1, MIPSMap::INIT } });186TSTI2R(regs_.R(IRREG_VFPU_CC), 1ULL << (inst.src2 & 0xF));187if ((inst.src2 >> 7) & 1) {188fp_.FCSEL(regs_.F(inst.dest), regs_.F(inst.dest), regs_.F(inst.src1), CC_EQ);189} else {190fp_.FCSEL(regs_.F(inst.dest), regs_.F(inst.dest), regs_.F(inst.src1), CC_NEQ);191}192break;193194case IROp::FCmpVfpuBit:195regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);196197switch (VCondition(inst.dest & 0xF)) {198case VC_EQ:199regs_.Map(inst);200fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));201CSET(SCRATCH1, CC_EQ);202break;203case VC_NE:204regs_.Map(inst);205fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));206CSET(SCRATCH1, CC_NEQ);207break;208case VC_LT:209regs_.Map(inst);210fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));211CSET(SCRATCH1, CC_LO);212break;213case VC_LE:214regs_.Map(inst);215fp_.FCMP(regs_.F(inst.src1), 
regs_.F(inst.src2));216CSET(SCRATCH1, CC_LS);217break;218case VC_GT:219regs_.Map(inst);220fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));221CSET(SCRATCH1, CC_GT);222break;223case VC_GE:224regs_.Map(inst);225fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));226CSET(SCRATCH1, CC_GE);227break;228case VC_EZ:229regs_.MapFPR(inst.src1);230fp_.FCMP(regs_.F(inst.src1));231CSET(SCRATCH1, CC_EQ);232break;233case VC_NZ:234regs_.MapFPR(inst.src1);235fp_.FCMP(regs_.F(inst.src1));236CSET(SCRATCH1, CC_NEQ);237break;238case VC_EN:239regs_.MapFPR(inst.src1);240fp_.FCMP(regs_.F(inst.src1));241CSET(SCRATCH1, CC_VS);242break;243case VC_NN:244regs_.MapFPR(inst.src1);245fp_.FCMP(regs_.F(inst.src1));246CSET(SCRATCH1, CC_VC);247break;248case VC_EI:249regs_.MapFPR(inst.src1);250// Compare abs(f) >= Infinity. Could use FACGE for vector.251MOVI2R(SCRATCH1, 0x7F800000);252fp_.FMOV(SCRATCHF2, SCRATCH1);253fp_.FABS(SCRATCHF1, regs_.F(inst.src1));254fp_.FCMP(SCRATCHF1, SCRATCHF2);255CSET(SCRATCH1, CC_GE);256break;257case VC_NI:258regs_.MapFPR(inst.src1);259// Compare abs(f) < Infinity.260MOVI2R(SCRATCH1, 0x7F800000);261fp_.FMOV(SCRATCHF2, SCRATCH1);262fp_.FABS(SCRATCHF1, regs_.F(inst.src1));263fp_.FCMP(SCRATCHF1, SCRATCHF2);264// Less than or NAN.265CSET(SCRATCH1, CC_LT);266break;267case VC_ES:268regs_.MapFPR(inst.src1);269// Compare abs(f) < Infinity.270MOVI2R(SCRATCH1, 0x7F800000);271fp_.FMOV(SCRATCHF2, SCRATCH1);272fp_.FABS(SCRATCHF1, regs_.F(inst.src1));273fp_.FCMP(SCRATCHF1, SCRATCHF2);274// Greater than or equal to Infinity, or NAN.275CSET(SCRATCH1, CC_HS);276break;277case VC_NS:278regs_.MapFPR(inst.src1);279// Compare abs(f) < Infinity.280MOVI2R(SCRATCH1, 0x7F800000);281fp_.FMOV(SCRATCHF2, SCRATCH1);282fp_.FABS(SCRATCHF1, regs_.F(inst.src1));283fp_.FCMP(SCRATCHF1, SCRATCHF2);284// Less than Infinity, but not NAN.285CSET(SCRATCH1, CC_LO);286break;287case VC_TR:288MOVI2R(SCRATCH1, 1);289break;290case VC_FL:291MOVI2R(SCRATCH1, 0);292break;293}294295BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 
inst.dest >> 4, 1);296break;297298case IROp::FCmpVfpuAggregate:299regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);300if (inst.dest == 1) {301// Just replicate the lowest bit to the others.302BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 4, 1);303BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 5, 1);304} else {305MOVI2R(SCRATCH1, inst.dest);306// Grab the any bit.307TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);308CSET(SCRATCH2, CC_NEQ);309// Now the all bit, by clearing our mask to zero.310BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));311CSET(SCRATCH1, CC_EQ);312313// Insert the bits into place.314BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);315BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);316}317break;318319default:320INVALIDOP;321break;322}323}324325void Arm64JitBackend::CompIR_FCondAssign(IRInst inst) {326CONDITIONAL_DISABLE;327328// For Vec4, we could basically just ORR FCMPGE/FCMPLE together, but overlap is trickier.329regs_.Map(inst);330fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));331FixupBranch unordered = B(CC_VS);332333switch (inst.op) {334case IROp::FMin:335fp_.FMIN(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));336break;337338case IROp::FMax:339fp_.FMAX(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));340break;341342default:343INVALIDOP;344break;345}346347FixupBranch orderedDone = B();348349// Not sure if this path is fast, trying to optimize it to be small but correct.350// Probably an uncommon path.351SetJumpTarget(unordered);352fp_.AND(EncodeRegToDouble(SCRATCHF1), regs_.FD(inst.src1), regs_.FD(inst.src2));353// SCRATCHF1 = 0xFFFFFFFF if sign bit set on both, 0x00000000 otherwise.354fp_.CMLT(32, EncodeRegToDouble(SCRATCHF1), EncodeRegToDouble(SCRATCHF1));355356switch (inst.op) {357case IROp::FMin:358fp_.SMAX(32, EncodeRegToDouble(SCRATCHF2), regs_.FD(inst.src1), regs_.FD(inst.src2));359fp_.SMIN(32, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src2));360break;361362case IROp::FMax:363fp_.SMIN(32, 
EncodeRegToDouble(SCRATCHF2), regs_.FD(inst.src1), regs_.FD(inst.src2));364fp_.SMAX(32, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src2));365break;366367default:368INVALIDOP;369break;370}371// Replace dest with SCRATCHF2 if both were less than zero.372fp_.BIT(regs_.FD(inst.dest), EncodeRegToDouble(SCRATCHF2), EncodeRegToDouble(SCRATCHF1));373374SetJumpTarget(orderedDone);375}376377void Arm64JitBackend::CompIR_FCvt(IRInst inst) {378CONDITIONAL_DISABLE;379380switch (inst.op) {381case IROp::FCvtWS:382// TODO: Unfortunately, we don't currently have the hasSetRounding flag, could skip lookup.383regs_.Map(inst);384fp_.FMOV(S0, regs_.F(inst.src1));385386MOVP2R(SCRATCH1_64, ¤tRoundingFunc_);387LDR(INDEX_UNSIGNED, SCRATCH1_64, SCRATCH1_64, 0);388BLR(SCRATCH1_64);389390fp_.FMOV(regs_.F(inst.dest), S0);391break;392393case IROp::FCvtSW:394regs_.Map(inst);395fp_.SCVTF(regs_.F(inst.dest), regs_.F(inst.src1));396break;397398case IROp::FCvtScaledWS:399if (IRRoundMode(inst.src2 >> 6) == IRRoundMode::CAST_1) {400regs_.Map(inst);401// NAN would convert to zero, so detect it specifically and replace with 0x7FFFFFFF.402fp_.MVNI(32, EncodeRegToDouble(SCRATCHF2), 0x80, 24);403fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src1));404fp_.FCVTZS(regs_.F(inst.dest), regs_.F(inst.src1), inst.src2 & 0x1F);405fp_.FCSEL(regs_.F(inst.dest), regs_.F(inst.dest), SCRATCHF2, CC_VC);406} else {407RoundingMode rm;408switch (IRRoundMode(inst.src2 >> 6)) {409case IRRoundMode::RINT_0: rm = RoundingMode::ROUND_N; break;410case IRRoundMode::CEIL_2: rm = RoundingMode::ROUND_P; break;411case IRRoundMode::FLOOR_3: rm = RoundingMode::ROUND_M; break;412default:413_assert_msg_(false, "Invalid rounding mode for FCvtScaledWS");414}415416// Unfortunately, only Z has a direct scaled instruction.417// We'll have to multiply.418regs_.Map(inst);419fp_.MOVI2F(SCRATCHF1, (float)(1UL << (inst.src2 & 0x1F)), SCRATCH1);420// This is for the NAN result.421fp_.MVNI(32, EncodeRegToDouble(SCRATCHF2), 0x80, 
24);422fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src1));423fp_.FMUL(regs_.F(inst.dest), regs_.F(inst.src1), SCRATCHF1);424fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.dest), rm);425fp_.FCSEL(regs_.F(inst.dest), regs_.F(inst.dest), SCRATCHF2, CC_VC);426}427break;428429case IROp::FCvtScaledSW:430// TODO: This is probably proceeded by a GPR transfer, might be ideal to combine.431regs_.Map(inst);432fp_.SCVTF(regs_.F(inst.dest), regs_.F(inst.src1), inst.src2 & 0x1F);433break;434435default:436INVALIDOP;437break;438}439}440441void Arm64JitBackend::CompIR_FRound(IRInst inst) {442CONDITIONAL_DISABLE;443444regs_.Map(inst);445// Invert 0x80000000 -> 0x7FFFFFFF for the NAN result.446fp_.MVNI(32, EncodeRegToDouble(SCRATCHF1), 0x80, 24);447fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src1));448449// Luckily, these already saturate.450switch (inst.op) {451case IROp::FRound:452fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_N);453break;454455case IROp::FTrunc:456fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_Z);457break;458459case IROp::FCeil:460fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_P);461break;462463case IROp::FFloor:464fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_M);465break;466467default:468INVALIDOP;469break;470}471472// Switch to INT_MAX if it was NAN.473fp_.FCSEL(regs_.F(inst.dest), regs_.F(inst.dest), SCRATCHF1, CC_VC);474}475476void Arm64JitBackend::CompIR_FSat(IRInst inst) {477CONDITIONAL_DISABLE;478479switch (inst.op) {480case IROp::FSat0_1:481regs_.Map(inst);482fp_.MOVI2F(SCRATCHF1, 1.0f);483// Note that FMAX takes the larger of the two zeros, which is what we want.484fp_.MOVI2F(SCRATCHF2, 0.0f);485486fp_.FMIN(regs_.F(inst.dest), regs_.F(inst.src1), SCRATCHF1);487fp_.FMAX(regs_.F(inst.dest), regs_.F(inst.dest), SCRATCHF2);488break;489490case IROp::FSatMinus1_1:491regs_.Map(inst);492fp_.MOVI2F(SCRATCHF1, 1.0f);493fp_.FNEG(SCRATCHF2, SCRATCHF1);494495fp_.FMIN(regs_.F(inst.dest), regs_.F(inst.src1), 
SCRATCHF1);496fp_.FMAX(regs_.F(inst.dest), regs_.F(inst.dest), SCRATCHF2);497break;498499default:500INVALIDOP;501break;502}503}504505void Arm64JitBackend::CompIR_FSpecial(IRInst inst) {506CONDITIONAL_DISABLE;507508auto callFuncF_F = [&](float (*func)(float)) {509regs_.FlushBeforeCall();510WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);511512// It might be in a non-volatile register.513// TODO: May have to handle a transfer if SIMD here.514if (regs_.IsFPRMapped(inst.src1)) {515int lane = regs_.GetFPRLane(inst.src1);516if (lane == 0)517fp_.FMOV(S0, regs_.F(inst.src1));518else519fp_.DUP(32, Q0, regs_.F(inst.src1), lane);520} else {521int offset = offsetof(MIPSState, f) + inst.src1 * 4;522fp_.LDR(32, INDEX_UNSIGNED, S0, CTXREG, offset);523}524QuickCallFunction(SCRATCH2_64, func);525526regs_.MapFPR(inst.dest, MIPSMap::NOINIT);527// If it's already F10, we're done - MapReg doesn't actually overwrite the reg in that case.528if (regs_.F(inst.dest) != S0) {529fp_.FMOV(regs_.F(inst.dest), S0);530}531532WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);533};534535switch (inst.op) {536case IROp::FSin:537callFuncF_F(&vfpu_sin);538break;539540case IROp::FCos:541callFuncF_F(&vfpu_cos);542break;543544case IROp::FRSqrt:545regs_.Map(inst);546fp_.MOVI2F(SCRATCHF1, 1.0f);547fp_.FSQRT(regs_.F(inst.dest), regs_.F(inst.src1));548fp_.FDIV(regs_.F(inst.dest), SCRATCHF1, regs_.F(inst.dest));549break;550551case IROp::FRecip:552regs_.Map(inst);553fp_.MOVI2F(SCRATCHF1, 1.0f);554fp_.FDIV(regs_.F(inst.dest), SCRATCHF1, regs_.F(inst.src1));555break;556557case IROp::FAsin:558callFuncF_F(&vfpu_asin);559break;560561default:562INVALIDOP;563break;564}565}566567void Arm64JitBackend::CompIR_RoundingMode(IRInst inst) {568CONDITIONAL_DISABLE;569570switch (inst.op) {571case IROp::RestoreRoundingMode:572RestoreRoundingMode();573break;574575case IROp::ApplyRoundingMode:576ApplyRoundingMode();577break;578579case 
IROp::UpdateRoundingMode:580UpdateRoundingMode();581break;582583default:584INVALIDOP;585break;586}587}588589} // namespace MIPSComp590591#endif592593594