CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM64/Arm64CompVFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(ARM64)

#include <cmath>
#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Math/math_util.h"

#include "Core/Compatibility.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "Core/Reporting.h"
#include "Core/System.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/MIPSCodeUtils.h"
#include "Core/MIPS/ARM64/Arm64Jit.h"
#include "Core/MIPS/ARM64/Arm64RegCache.h"

// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE.

// #define CONDITIONAL_DISABLE(flag) { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }
#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }
#define DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }

// Bit-field extraction helpers for the MIPS opcode being compiled (`op`).
#define _RS MIPS_GET_RS(op)
#define _RT MIPS_GET_RT(op)
#define _RD MIPS_GET_RD(op)
#define _FS MIPS_GET_FS(op)
#define _FT MIPS_GET_FT(op)
#define _FD MIPS_GET_FD(op)
#define _SA MIPS_GET_SA(op)
#define _POS ((op>> 6) & 0x1F)
#define _SIZE ((op>>11) & 0x1F)
#define _IMM16 (signed short)(op & 0xFFFF)
#define _IMM26 (op & 0x03FFFFFF)

namespace MIPSComp {
using namespace Arm64Gen;
using namespace Arm64JitConstants;

// Vector regs can overlap in all sorts of swizzled ways.
// This does allow a single overlap in sregs[i].
// Returns true when it's safe to write element `di` of the destination (dreg)
// before the remaining source elements have been read.
static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)
{
	for (int i = 0; i < sn; ++i) {
		if (sregs[i] == dreg && i != di)
			return false;
	}
	for (int i = 0; i < tn; ++i) {
		if (tregs[i] == dreg)
			return false;
	}

	// Hurray, no overlap, we can write directly.
	return true;
}

// Stricter variant: the destination may not alias any source element at all,
// not even sregs[di].
static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)
{
	return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;
}

// Compiles vpfxs/vpfxt/vpfxd: records the prefix word in JitState so the next
// VFPU op can apply it at compile time. Emits no machine code itself.
void Arm64Jit::Comp_VPFX(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	int data = op & 0xFFFFF;
	int regnum = (op >> 24) & 3;
	switch (regnum) {
	case 0:  // S
		js.prefixS = data;
		js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 1:  // T
		js.prefixT = data;
		js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 2:  // D
		// Only the low 12 bits of the D prefix are meaningful (write mask + saturation).
		js.prefixD = data & 0x00000FFF;
		js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	default:
		ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);
		break;
	}
}

// Applies an S or T prefix (swizzle / abs / negate / constant selection) by
// redirecting entries of vregs[] to temp registers holding the transformed values.
// The original registers are never written, only read.
void Arm64Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
	// 0xE4 is the identity prefix (x, y, z, w with no modifiers) - nothing to do.
	if (prefix == 0xE4)
		return;

	int n = GetNumVectorElements(sz);
	u8 origV[4];
	// Constant table indexed by regnum + (abs << 2), per the VFPU prefix encoding.
	static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };

	// Snapshot the original register list; the loop below rewrites vregs[] in place.
	for (int i = 0; i < n; i++)
		origV[i] = vregs[i];

	for (int i = 0; i < n; i++) {
		int regnum = (prefix >> (i * 2)) & 3;
		int abs = (prefix >> (8 + i)) & 1;
		int negate = (prefix >> (16 + i)) & 1;
		int constants = (prefix >> (12 + i)) & 1;

		// Unchanged, hurray.
		if (!constants && regnum == i && !abs && 
!negate)
			continue;

		// This puts the value into a temp reg, so we won't write the modified value back.
		vregs[i] = fpr.GetTempV();
		if (!constants) {
			fpr.MapDirtyInV(vregs[i], origV[regnum]);
			fpr.SpillLockV(vregs[i]);

			// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
			// TODO: But some ops seem to use const 0 instead?
			if (regnum >= n) {
				WARN_LOG(Log::CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, GetCompilerPC(), MIPSDisasmAt(GetCompilerPC()).c_str());
				regnum = 0;
			}

			if (abs) {
				// abs applies before negate, per the prefix semantics implemented here.
				fp.FABS(fpr.V(vregs[i]), fpr.V(origV[regnum]));
				if (negate)
					fp.FNEG(fpr.V(vregs[i]), fpr.V(vregs[i]));
			} else {
				if (negate)
					fp.FNEG(fpr.V(vregs[i]), fpr.V(origV[regnum]));
				else
					fp.FMOV(fpr.V(vregs[i]), fpr.V(origV[regnum]));
			}
		} else {
			// Constant element: materialize the (possibly negated) constant directly.
			fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT);
			fpr.SpillLockV(vregs[i]);
			fp.MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs << 2)], SCRATCH1, (bool)negate);
		}
	}
}

// Fetches the destination register list for vectorReg, redirecting any
// write-masked elements to throwaway temp registers.
void Arm64Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);

	GetVectorRegs(regs, sz, vectorReg);
	if (js.prefixD == 0)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		// Hopefully this is rare, we'll just write it into a reg we drop.
		if (js.VfpuWriteMask(i))
			regs[i] = fpr.GetTempV();
	}
}

// Applies the D prefix's saturation modes in place: sat==1 clamps to [0, 1],
// sat==3 clamps to [-1, 1]. Write-masked elements are skipped (they went to temps).
void Arm64Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
	_assert_msg_(js.prefixDFlag & JitState::PREFIX_KNOWN, "Unexpected unknown prefix!");
	if (!js.prefixD)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		if (js.VfpuWriteMask(i))
			continue;

		int sat = (js.prefixD >> (i * 2)) & 3;
		if (sat == 1) {
			// clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);

			fp.MOVI2F(S0, 0.0f, SCRATCH1);
			fp.MOVI2F(S1, 1.0f, SCRATCH1);
			fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1);
			fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
		} else if (sat == 3) {
			// clamped = x < -1 ? (x > 1 ? 1 : x) : x [-1, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);

			fp.MOVI2F(S0, -1.0f, SCRATCH1);
			fp.MOVI2F(S1, 1.0f, SCRATCH1);
			fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1);
			fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
		}
	}
}

// Compiles lv.s / sv.s (single 32-bit VFPU load/store).
// Fast path: cached pointer base + unsigned immediate offset.
// Slow path: compute the effective address into SCRATCH1 (with range checks
// when bFastMemory is off) and use a base+index addressing mode.
void Arm64Jit::Comp_SV(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	CheckMemoryBreakpoint();

	s32 offset = (signed short)(op & 0xFFFC);
	int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
	MIPSGPReg rs = _RS;

	std::vector<FixupBranch> skips;
	switch (op >> 26) {
	case 50: //lv.s  // VI(vt) = Memory::Read_U32(addr);
	{
		if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset >= 0 && offset < 16384) {
			gpr.MapRegAsPointer(rs);
			fpr.MapRegV(vt, MAP_NOINIT | MAP_DIRTY);
			fp.LDR(32, INDEX_UNSIGNED, fpr.V(vt), gpr.RPtr(rs), offset);
			break;
		}

		// CC might be set by slow path below, so load regs first.
		fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
		if (gpr.IsImm(rs)) {
#ifdef MASKED_PSP_MEMORY
			u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
#else
			u32 addr = offset + gpr.GetImm(rs);
#endif
			gpr.SetRegImm(SCRATCH1, addr);
		} else {
			gpr.MapReg(rs);
			if (g_Config.bFastMemory) {
				SetScratch1ToEffectiveAddress(rs, offset);
			} else {
				skips = SetScratch1ForSafeAddress(rs, offset, SCRATCH2);
			}
		}
		fp.LDR(32, fpr.V(vt), SCRATCH1_64, ArithOption(MEMBASEREG));
		for (auto skip : skips) {
			SetJumpTarget(skip);
		}
	}
	break;

	case 58: //sv.s  // Memory::Write_U32(VI(vt), addr);
	{
		if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset >= 0 && offset < 16384) {
			gpr.MapRegAsPointer(rs);
			fpr.MapRegV(vt, 0);
			fp.STR(32, INDEX_UNSIGNED, 
fpr.V(vt), gpr.RPtr(rs), offset);
			break;
		}

		// CC might be set by slow path below, so load regs first.
		fpr.MapRegV(vt);
		if (gpr.IsImm(rs)) {
#ifdef MASKED_PSP_MEMORY
			u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
#else
			u32 addr = offset + gpr.GetImm(rs);
#endif
			gpr.SetRegImm(SCRATCH1, addr);
		} else {
			gpr.MapReg(rs);
			if (g_Config.bFastMemory) {
				SetScratch1ToEffectiveAddress(rs, offset);
			} else {
				skips = SetScratch1ForSafeAddress(rs, offset, SCRATCH2);
			}
		}
		fp.STR(32, fpr.V(vt), SCRATCH1_64, ArithOption(MEMBASEREG));
		for (auto skip : skips) {
			SetJumpTarget(skip);
		}
	}
	break;


	default:
		DISABLE;
	}
}

// Compiles lv.q / sv.q (quad 128-bit VFPU load/store) using LDP/STP pairs.
// The effective host address is built fully in SCRATCH1_64 (including the
// emulated-memory base) before the paired accesses.
void Arm64Jit::Comp_SVQ(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	CheckMemoryBreakpoint();

	int imm = (signed short)(op&0xFFFC);
	int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
	MIPSGPReg rs = _RS;

	std::vector<FixupBranch> skips;
	switch (op >> 26)
	{
	case 54: //lv.q
	{
		// CC might be set by slow path below, so load regs first.
		u8 vregs[4];
		GetVectorRegs(vregs, V_Quad, vt);
		fpr.MapRegsAndSpillLockV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);

		if (gpr.IsImm(rs)) {
#ifdef MASKED_PSP_MEMORY
			u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;
#else
			u32 addr = imm + gpr.GetImm(rs);
#endif
			gpr.SetRegImm(SCRATCH1_64, addr + (uintptr_t)Memory::base);
		} else {
			gpr.MapReg(rs);
			if (g_Config.bFastMemory) {
				SetScratch1ToEffectiveAddress(rs, imm);
			} else {
				skips = SetScratch1ForSafeAddress(rs, imm, SCRATCH2);
			}
			if (jo.enablePointerify) {
				// Fold the memory base's high half into the address with a MOVK.
				MOVK(SCRATCH1_64, ((uint64_t)Memory::base) >> 32, SHIFT_32);
			} else {
				ADD(SCRATCH1_64, SCRATCH1_64, MEMBASEREG);
			}
		}

		fp.LDP(32, INDEX_SIGNED, fpr.V(vregs[0]), fpr.V(vregs[1]), SCRATCH1_64, 0);
		fp.LDP(32, INDEX_SIGNED, fpr.V(vregs[2]), fpr.V(vregs[3]), SCRATCH1_64, 8);

		for (auto skip : skips) {
			SetJumpTarget(skip);
		}
	}
	break;

	case 62: //sv.q
	{
		// CC might be set by slow path below, so load regs first.
		u8 vregs[4];
		GetVectorRegs(vregs, V_Quad, vt);
		fpr.MapRegsAndSpillLockV(vregs, V_Quad, 0);

		if (gpr.IsImm(rs)) {
#ifdef MASKED_PSP_MEMORY
			u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;
#else
			u32 addr = imm + gpr.GetImm(rs);
#endif
			gpr.SetRegImm(SCRATCH1_64, addr + (uintptr_t)Memory::base);
		} else {
			gpr.MapReg(rs);
			if (g_Config.bFastMemory) {
				SetScratch1ToEffectiveAddress(rs, imm);
			} else {
				skips = SetScratch1ForSafeAddress(rs, imm, SCRATCH2);
			}
			if (jo.enablePointerify) {
				MOVK(SCRATCH1_64, ((uint64_t)Memory::base) >> 32, SHIFT_32);
			} else {
				ADD(SCRATCH1_64, SCRATCH1_64, MEMBASEREG);
			}
		}
		fp.STP(32, INDEX_SIGNED, fpr.V(vregs[0]), fpr.V(vregs[1]), SCRATCH1_64, 0);
		fp.STP(32, INDEX_SIGNED, fpr.V(vregs[2]), fpr.V(vregs[3]), SCRATCH1_64, 8);

		for (auto skip : skips) {
			SetJumpTarget(skip);
		}
	}
	break;

	default:
		DISABLE;
		break;
	}
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vzero/vone: fills the destination vector with 0.0f or 1.0f.
// WARNING: No prefix support!
void Arm64Jit::Comp_VVectorInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	// WARNING: No prefix support!
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	switch ((op >> 16) & 0xF) {
	case 6: // v=zeros; break;  //vzero
		fp.MOVI2F(S0, 0.0f, SCRATCH1);
		break;
	case 7: // v=ones; break;  //vone
		fp.MOVI2F(S0, 1.0f, SCRATCH1);
		break;
	default:
		DISABLE;
		break;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Broadcast the constant staged in S0 to every destination element.
	for (int i = 0; i < n; ++i)
		fp.FMOV(fpr.V(dregs[i]), S0);

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vidt: writes a row of the identity matrix, selected by the low
// bits of the destination register number.
void Arm64Jit::Comp_VIdt(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	fp.MOVI2F(S0, 0.0f, SCRATCH1);
	fp.MOVI2F(S1, 1.0f, SCRATCH1);
	u8 
dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
	switch (sz) {
	case V_Pair:
		// The "1" lands in the element matching the register's position within the pair.
		fp.FMOV(fpr.V(dregs[0]), (vd & 1) == 0 ? S1 : S0);
		fp.FMOV(fpr.V(dregs[1]), (vd & 1) == 1 ? S1 : S0);
		break;
	case V_Quad:
		fp.FMOV(fpr.V(dregs[0]), (vd & 3) == 0 ? S1 : S0);
		fp.FMOV(fpr.V(dregs[1]), (vd & 3) == 1 ? S1 : S0);
		fp.FMOV(fpr.V(dregs[2]), (vd & 3) == 2 ? S1 : S0);
		fp.FMOV(fpr.V(dregs[3]), (vd & 3) == 3 ? S1 : S0);
		break;
	default:
		_dbg_assert_msg_( 0, "Trying to interpret instruction that can't be interpreted");
		break;
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vmidt/vmzero/vmone: fills a whole matrix with identity, zeros, or ones.
void Arm64Jit::Comp_VMatrixInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		// Don't think matrix init ops care about prefixes.
		// DISABLE;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 dregs[16];
	GetMatrixRegs(dregs, sz, _VD);

	switch ((op >> 16) & 0xF) {
	case 3: // vmidt
		fp.MOVI2F(S0, 0.0f, SCRATCH1);
		fp.MOVI2F(S1, 1.0f, SCRATCH1);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				fp.FMOV(fpr.V(dregs[a * 4 + b]), a == b ? S1 : S0);
			}
		}
		break;
	case 6: // vmzero
		fp.MOVI2F(S0, 0.0f, SCRATCH1);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				fp.FMOV(fpr.V(dregs[a * 4 + b]), S0);
			}
		}
		break;
	case 7: // vmone
		fp.MOVI2F(S1, 1.0f, SCRATCH1);
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				fp.FMOV(fpr.V(dregs[a * 4 + b]), S1);
			}
		}
		break;
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vhdp (homogeneous dot product): like vdot, but the last element of
// s is treated as 1.0, so the final step adds t[n-1] instead of s[n-1]*t[n-1].
void Arm64Jit::Comp_VHdp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	fpr.MapRegsAndSpillLockV(tregs, sz, 0);
	fp.FMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));

	int n = GetNumVectorElements(sz);
	for (int i = 1; i < n; i++) {
		// sum += s[i]*t[i];
		if (i == n - 1) {
			// Homogeneous term: s[n-1] acts as 1.0.
			fp.FADD(S0, S0, fpr.V(tregs[i]));
		} else {
			fp.FMADD(S0, fpr.V(sregs[i]), fpr.V(tregs[i]), S0);
		}
	}
	fpr.ReleaseSpillLocksAndDiscardTemps();

	fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);

	fp.FMOV(fpr.V(dregs[0]), S0);
	ApplyPrefixD(dregs, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Reciprocal factors for vavg: 1/n for n = 1..4.
alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };

// Compiles vfad/vavg (horizontal add / average of a vector into a single reg).
void Arm64Jit::Comp_Vhoriz(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? 
// probably not. (continuation of the TODO comment above)
	u8 sregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);

	int n = GetNumVectorElements(sz);

	bool is_vavg = ((op >> 16) & 0x1f) == 7;
	if (is_vavg) {
		// S1 = 1/n so the sum can be scaled into an average at the end.
		fp.MOVI2F(S1, vavg_table[n - 1], SCRATCH1);
	}
	// Have to start at +0.000 for the correct sign.
	fp.MOVI2F(S0, 0.0f, SCRATCH1);
	for (int i = 0; i < n; i++) {
		// sum += s[i];
		fp.FADD(S0, S0, fpr.V(sregs[i]));
	}

	fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);
	if (is_vavg) {
		fp.FMUL(fpr.V(dregs[0]), S0, S1);
	} else {
		fp.FMOV(fpr.V(dregs[0]), S0);
	}
	ApplyPrefixD(dregs, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vdot: dot product of two vectors into a single destination reg,
// accumulated in S0 with FMADD.
void Arm64Jit::Comp_VDot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	fpr.MapRegsAndSpillLockV(tregs, sz, 0);
	fp.FMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));

	int n = GetNumVectorElements(sz);
	for (int i = 1; i < n; i++) {
		// sum += s[i]*t[i];
		fp.FMADD(S0, fpr.V(sregs[i]), fpr.V(tregs[i]), S0);
	}
	fpr.ReleaseSpillLocksAndDiscardTemps();

	fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);

	fp.FMOV(fpr.V(dregs[0]), S0);
	ApplyPrefixD(dregs, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles element-wise three-operand vector ops (vadd/vsub/vdiv/vmul/vmin/vmax).
// Results go through temp regs when the destination overlaps a source.
void Arm64Jit::Comp_VecDo3(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; i++) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Map first, then work. 
// This will allow us to use VLDMIA more often
	// (when we add the appropriate map function) and the instruction ordering
	// will improve.
	// Note that mapping like this (instead of first all sregs, first all tregs etc)
	// reduces the amount of continuous registers a lot :(
	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
		fpr.SpillLockV(tempregs[i]);
		fpr.SpillLockV(sregs[i]);
		fpr.SpillLockV(tregs[i]);
	}

	for (int i = 0; i < n; i++) {
		switch (op >> 26) {
		case 24: //VFPU0
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] + t[i]; break; //vadd
				fp.FADD(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			case 1: // d[i] = s[i] - t[i]; break; //vsub
				fp.FSUB(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			case 7: // d[i] = s[i] / t[i]; break; //vdiv
				fp.FDIV(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			default:
				DISABLE;
			}
			break;
		case 25: //VFPU1
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] * t[i]; break; //vmul
				fp.FMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				break;
			default:
				DISABLE;
			}
			break;
		// Fortunately there is FMIN/FMAX on ARM64!
		case 27: //VFPU3
			switch ((op >> 23) & 7) {
			case 2:  // vmin
			{
				// FMIN follows IEEE NaN rules; the PSP compares NaNs by bit
				// pattern, so the unordered case is handled with integer compares.
				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
				FixupBranch unordered = B(CC_VS);
				fp.FMIN(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				FixupBranch skip = B();

				SetJumpTarget(unordered);
				// Move to integer registers, it'll be easier. Or maybe there's a simd way?
				fp.FMOV(SCRATCH1, fpr.V(sregs[i]));
				fp.FMOV(SCRATCH2, fpr.V(tregs[i]));
				// And together to find if both have negative set.
				TST(SCRATCH1, SCRATCH2);
				FixupBranch cmpPositive = B(CC_PL);
				// If both are negative, "min" is the greater of the two, since it has the largest mantissa.
				CMP(SCRATCH1, SCRATCH2);
				CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_GE);
				FixupBranch skipPositive = B();
				// If either one is positive, we just want the lowest one.
				SetJumpTarget(cmpPositive);
				CMP(SCRATCH1, SCRATCH2);
				CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_LE);
				SetJumpTarget(skipPositive);
				// Now, whether negative or positive, move to the result.
				fp.FMOV(fpr.V(tempregs[i]), SCRATCH1);
				SetJumpTarget(skip);
				break;
			}
			case 3:  // vmax
			{
				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
				FixupBranch unordered = B(CC_VS);
				fp.FMAX(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
				FixupBranch skip = B();

				SetJumpTarget(unordered);
				// Move to integer registers, it'll be easier. Or maybe there's a simd way?
				fp.FMOV(SCRATCH1, fpr.V(sregs[i]));
				fp.FMOV(SCRATCH2, fpr.V(tregs[i]));
				// And together to find if both have negative set.
				TST(SCRATCH1, SCRATCH2);
				FixupBranch cmpPositive = B(CC_PL);
				// If both are negative, "max" is the least of the two, since it has the lowest mantissa.
				CMP(SCRATCH1, SCRATCH2);
				CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_LE);
				FixupBranch skipPositive = B();
				// If either one is positive, we just want the highest one.
				SetJumpTarget(cmpPositive);
				CMP(SCRATCH1, SCRATCH2);
				CSEL(SCRATCH1, SCRATCH1, SCRATCH2, CC_GE);
				SetJumpTarget(skipPositive);
				// Now, whether negative or positive, move to the result.
				fp.FMOV(fpr.V(tempregs[i]), SCRATCH1);
				SetJumpTarget(skip);
				break;
			}
			case 6:  // vsge
				DISABLE;  // pending testing
				break;
			case 7:  // vslt
				DISABLE;  // pending testing
				break;
			}
			break;

		default:
			DISABLE;
		}
	}

	// Copy any temp results into the real destination registers.
	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}
	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles single-source element-wise ops (vmov/vabs/vneg/vsat/vrcp/vrsq/vsqrt/...).
void Arm64Jit::Comp_VV2Op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
	if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
		return;
	}

	// Catch the disabled operations immediately so we don't map registers unnecessarily later.
	// Move these down to the big switch below as they are implemented.
	switch ((op >> 16) & 0x1f) {
	case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
		DISABLE;
		break;
	case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
		DISABLE;
		break;
	case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
		DISABLE;
		break;
	case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
		DISABLE;
		break;
	case 26: // d[i] = -sinf((float)M_PI_2 * 
// s[i]); break; // vnsin (continuation of the case comment above)
		DISABLE;
		break;
	case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
		DISABLE;
		break;
	default:
		;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Pre map the registers to get better instruction ordering.
	// Note that mapping like this (instead of first all sregs, first all tempregs etc)
	// reduces the amount of continuous registers a lot :(
	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		fpr.SpillLockV(tempregs[i]);
		fpr.SpillLockV(sregs[i]);
	}

	// Warning: sregs[i] and tempxregs[i] may be the same reg.
	// Helps for vmov, hurts for vrcp, etc.
	for (int i = 0; i < n; i++) {
		switch ((op >> 16) & 0x1f) {
		case 0: // d[i] = s[i]; break; //vmov
			// Probably for swizzle.
			fp.FMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			break;
		case 1: // d[i] = fabsf(s[i]); break; //vabs
			fp.FABS(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			break;
		case 2: // d[i] = -s[i]; break; //vneg
			fp.FNEG(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			break;
		case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;    // vsat0
			if (i == 0) {
				fp.MOVI2F(S0, 0.0f, SCRATCH1);
				fp.MOVI2F(S1, 1.0f, SCRATCH1);
			}
			// NOTE(review): the result of this FCMP doesn't appear to be consumed
			// by the FMAX/FMIN clamp below - confirm whether it's needed.
			fp.FCMP(fpr.V(sregs[i]), S0);
			fp.FMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			fp.FMAX(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
			fp.FMIN(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
			break;
		case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;  // vsat1
			if (i == 0) {
				fp.MOVI2F(S0, -1.0f, SCRATCH1);
				fp.MOVI2F(S1, 1.0f, SCRATCH1);
			}
			fp.FCMP(fpr.V(sregs[i]), S0);
			fp.FMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			fp.FMAX(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
			fp.FMIN(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
			break;
		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
			if (i == 0) {
				fp.MOVI2F(S0, 1.0f, SCRATCH1);
			}
			fp.FDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
			break;
		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
			if (i == 0) {
				fp.MOVI2F(S0, 1.0f, SCRATCH1);
			}
			fp.FSQRT(S1, fpr.V(sregs[i]));
			fp.FDIV(fpr.V(tempregs[i]), S0, S1);
			break;
		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
			fp.FSQRT(fpr.V(tempregs[i]), fpr.V(sregs[i]));
			// Force a positive result (clears the sign bit, e.g. for sqrt(-0.0)).
			fp.FABS(fpr.V(tempregs[i]), fpr.V(tempregs[i]));
			break;
		case 23: // d[i] = asinf(s[i] * (float)M_2_PI); break; //vasin
			DISABLE;
			break;
		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
			if (i == 0) {
				fp.MOVI2F(S0, -1.0f, SCRATCH1);
			}
			fp.FDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
			break;
		default:
			ERROR_LOG(Log::JIT, "case missing in vfpu vv2op");
			DISABLE;
			break;
		}
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vi2f: converts integers to floats, optionally scaled by 2^-imm.
void Arm64Jit::Comp_Vi2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	const float mult = 1.0f / (float)(1UL << imm);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	if (mult != 1.0f)
		fp.MOVI2F(S0, mult, SCRATCH1);

	// TODO: Use the SCVTF with builtin scaling where possible.
	for (int i = 0; i < n; i++) 
{
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		fp.SCVTF(fpr.V(tempregs[i]), fpr.V(sregs[i]));
		if (mult != 1.0f)
			fp.FMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vh2f (half -> float). Currently disabled: FCVTL's handling of
// half-precision specials doesn't match PSP hardware (see unit test output below).
void Arm64Jit::Comp_Vh2f(MIPSOpcode op) {
	// TODO: Fix by porting the general SSE solution to NEON
	// FCVTL doesn't provide identical results to the PSP hardware, according to the unit test:
	// O vh2f: 00000000,400c0000,00000000,7ff00000
	// E vh2f: 00000000,400c0000,00000000,7f800380
	DISABLE;

	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	u8 sregs[4], dregs[4];
	VectorSize sz = GetVecSize(op);
	VectorSize outSz;

	switch (sz) {
	case V_Single:
		outSz = V_Pair;
		break;
	case V_Pair:
		outSz = V_Quad;
		break;
	default:
		DISABLE;
	}

	int n = GetNumVectorElements(sz);
	int nOut = n * 2;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outSz, _VD);

	// Take the single registers and combine them to a D register.
	for (int i = 0; i < n; i++) {
		// NOTE(review): `sz` is passed where MapRegV elsewhere takes map flags - confirm before enabling.
		fpr.MapRegV(sregs[i], sz);
		fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);
	}
	// Convert four 16-bit floats in D0 to four 32-bit floats in Q0 (even if we only have two...)
	fp.FCVTL(32, Q0, D0);
	// Split apart again.
	for (int i = 0; i < nOut; i++) {
		fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
		fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);
	}

	// NOTE(review): dregs was fetched with outSz, but `sz` is used here - confirm before enabling.
	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// vf2i (float -> int with rounding modes): not implemented, falls back to interpreter.
void Arm64Jit::Comp_Vf2i(MIPSOpcode op) {
	DISABLE;
}

// Compiles mfv/mfvc (VFPU reg/ctrl -> GPR) and mtv/mtvc (GPR -> VFPU reg/ctrl).
void Arm64Jit::Comp_Mftv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	int imm = op & 0xFF;
	MIPSGPReg rt = _RT;
	switch ((op >> 21) & 0x1f) {
	case 3: //mfv / mfvc
		// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
		if (rt != 0) {
			if (imm < 128) {  //R(rt) = VI(imm);
				if (!fpr.IsInRAMV(imm)) {
					fpr.MapRegV(imm, 0);
					gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
					fp.FMOV(gpr.R(rt), fpr.V(imm));
				} else {
					// Already spilled to RAM: load straight from the context, skipping a map.
					gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
					LDR(INDEX_UNSIGNED, gpr.R(rt), CTXREG, fpr.GetMipsRegOffsetV(imm));
				}
			} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
				if (imm - 128 == VFPU_CTRL_CC) {
					if (gpr.IsImm(MIPS_REG_VFPUCC)) {
						gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
					} else {
						gpr.MapDirtyIn(rt, MIPS_REG_VFPUCC);
						MOV(gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
					}
				} else {
					// In case we have a saved prefix.
					FlushPrefixV();
					gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
					LDR(INDEX_UNSIGNED, gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
				}
			} else {
				//ERROR - maybe need to make this value too an "interlock" value?
				ERROR_LOG(Log::CPU, "mfv - invalid register %i", imm);
			}
		}
		break;

	case 7: // mtv
		if (imm < 128) {
			if (rt == MIPS_REG_ZERO) {
				fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
				fp.MOVI2F(fpr.V(imm), 0.0f, SCRATCH1);
			} else if (!gpr.IsInRAM(rt)) {
				gpr.MapReg(rt);
				fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
				fp.FMOV(fpr.V(imm), gpr.R(rt));
			} else {
				// Already in RAM: load the GPR's memory slot directly into the FP reg.
				fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
				fp.LDR(32, INDEX_UNSIGNED, fpr.V(imm), CTXREG, gpr.GetMipsRegOffset(rt));
			}
		} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
			if (imm - 128 == VFPU_CTRL_CC) {
				if (gpr.IsImm(rt)) {
					gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
				} else {
					gpr.MapDirtyIn(MIPS_REG_VFPUCC, rt);
					MOV(gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
				}
			} else {
				gpr.MapReg(rt);
				STR(INDEX_UNSIGNED, gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
			}

			// TODO: Optimization if rt is Imm?
			// Set these BEFORE disable!
			if (imm - 128 == VFPU_CTRL_SPREFIX) {
				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			}
		} else {
			//ERROR
			_dbg_assert_msg_( 0, "mtv - invalid register");
		}
		break;

	default:
		DISABLE;
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vmfvc: copies a VFPU control register into a VFPU register.
// Out-of-range control indices read as 0.0f.
void Arm64Jit::Comp_Vmfvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	int vd = _VD;
	int imm = (op >> 8) & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vd);
		if (imm == VFPU_CTRL_CC) {
			// CC is cached in a GPR, so move from there instead of memory.
			gpr.MapReg(MIPS_REG_VFPUCC, 0);
			fp.FMOV(fpr.V(vd), gpr.R(MIPS_REG_VFPUCC));
		} else {
			ADDI2R(SCRATCH1_64, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCH2);
			fp.LDR(32, INDEX_UNSIGNED, fpr.V(vd), SCRATCH1_64, 0);
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	} else {
		fpr.MapRegV(vd);
		fp.MOVI2F(fpr.V(vd), 0.0f, SCRATCH1);
	}
}

// Compiles vmtvc: copies a VFPU register into a VFPU control register,
// invalidating any compile-time-known prefix it may have overwritten.
void Arm64Jit::Comp_Vmtvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	int vs = _VS;
	int imm = op & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vs);
		if (imm == VFPU_CTRL_CC) {
			gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY | MAP_NOINIT);
			fp.FMOV(gpr.R(MIPS_REG_VFPUCC), fpr.V(vs));
		} else {
			ADDI2R(SCRATCH1_64, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCH2);
			fp.STR(32, INDEX_UNSIGNED, fpr.V(vs), SCRATCH1_64, 0);
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();

		if (imm == VFPU_CTRL_SPREFIX) {
			js.prefixSFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_TPREFIX) {
			js.prefixTFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_DPREFIX) {
			js.prefixDFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		}
	}
}

void 
Arm64Jit::Comp_Vmmov(MIPSOpcode op) {1142CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);1143if (!js.HasNoPrefix()) {1144DISABLE;1145}11461147if (_VS == _VD) {1148// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.1149return;1150}11511152MatrixSize sz = GetMtxSize(op);1153int n = GetMatrixSide(sz);11541155u8 sregs[16], dregs[16];1156GetMatrixRegs(sregs, sz, _VS);1157GetMatrixRegs(dregs, sz, _VD);11581159switch (GetMatrixOverlap(_VS, _VD, sz)) {1160case OVERLAP_EQUAL:1161// In-place transpose1162DISABLE;1163case OVERLAP_PARTIAL:1164DISABLE;1165case OVERLAP_NONE:1166default:1167break;1168}11691170for (int a = 0; a < n; a++) {1171for (int b = 0; b < n; b++) {1172fpr.MapDirtyInV(dregs[a * 4 + b], sregs[a * 4 + b]);1173fp.FMOV(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[a * 4 + b]));1174}1175}1176fpr.ReleaseSpillLocksAndDiscardTemps();1177}11781179void Arm64Jit::Comp_VScl(MIPSOpcode op) {1180CONDITIONAL_DISABLE(VFPU_VEC);1181if (js.HasUnknownPrefix()) {1182DISABLE;1183}11841185VectorSize sz = GetVecSize(op);1186int n = GetNumVectorElements(sz);11871188u8 sregs[4], dregs[4], treg;1189GetVectorRegsPrefixS(sregs, sz, _VS);1190// TODO: Prefixes seem strange...1191GetVectorRegsPrefixT(&treg, V_Single, _VT);1192GetVectorRegsPrefixD(dregs, sz, _VD);11931194// Move to S0 early, so we don't have to worry about overlap with scale.1195fpr.LoadToRegV(S0, treg);11961197// For prefixes to work, we just have to ensure that none of the output registers spill1198// and that there's no overlap.1199MIPSReg tempregs[4];1200for (int i = 0; i < n; ++i) {1201if (!IsOverlapSafe(dregs[i], i, n, sregs)) {1202// Need to use temp regs1203tempregs[i] = fpr.GetTempV();1204} else {1205tempregs[i] = dregs[i];1206}1207}12081209// The meat of the function!1210for (int i = 0; i < n; i++) {1211fpr.MapDirtyInV(tempregs[i], sregs[i]);1212fp.FMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), S0);1213}12141215for (int i = 0; i < n; i++) {1216// All must be mapped for prefixes to work.1217if (dregs[i] 
!= tempregs[i]) {1218fpr.MapDirtyInV(dregs[i], tempregs[i]);1219fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));1220}1221}12221223ApplyPrefixD(dregs, sz);12241225fpr.ReleaseSpillLocksAndDiscardTemps();1226}12271228void Arm64Jit::Comp_Vmmul(MIPSOpcode op) {1229CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);1230if (!js.HasNoPrefix()) {1231DISABLE;1232}12331234if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {1235// Fall back to interpreter, which has the accurate implementation.1236// Later we might do something more optimized here.1237DISABLE;1238}12391240MatrixSize sz = GetMtxSize(op);1241int n = GetMatrixSide(sz);12421243u8 sregs[16], tregs[16], dregs[16];1244GetMatrixRegs(sregs, sz, _VS);1245GetMatrixRegs(tregs, sz, _VT);1246GetMatrixRegs(dregs, sz, _VD);12471248MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz);1249MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz);12501251if (soverlap || toverlap) {1252DISABLE;1253} else {1254for (int a = 0; a < n; a++) {1255for (int b = 0; b < n; b++) {1256fpr.MapDirtyInInV(dregs[a * 4 + b], sregs[b * 4], tregs[a * 4], true);1257fp.FMUL(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[b * 4]), fpr.V(tregs[a * 4]));1258for (int c = 1; c < n; c++) {1259fpr.MapDirtyInInV(dregs[a * 4 + b], sregs[b * 4 + c], tregs[a * 4 + c], false);1260fp.FMUL(S0, fpr.V(sregs[b * 4 + c]), fpr.V(tregs[a * 4 + c]));1261fp.FADD(fpr.V(dregs[a * 4 + b]), fpr.V(dregs[a * 4 + b]), S0);1262}1263}1264}1265fpr.ReleaseSpillLocksAndDiscardTemps();1266}1267}12681269void Arm64Jit::Comp_Vmscl(MIPSOpcode op) {1270DISABLE;1271}12721273void Arm64Jit::Comp_Vtfm(MIPSOpcode op) {1274CONDITIONAL_DISABLE(VFPU_MTX_VTFM);1275if (!js.HasNoPrefix()) {1276DISABLE;1277}12781279VectorSize sz = GetVecSize(op);1280MatrixSize msz = GetMtxSize(op);1281int n = GetNumVectorElements(sz);1282int ins = (op >> 23) & 7;12831284bool homogenous = false;1285if (n == ins) {1286n++;1287sz = (VectorSize)((int)(sz)+1);1288msz = (MatrixSize)((int)(msz)+1);1289homogenous = true;1290}1291// 
Otherwise, n should already be ins + 1.1292else if (n != ins + 1) {1293DISABLE;1294}12951296u8 sregs[16], dregs[4], tregs[4];1297GetMatrixRegs(sregs, msz, _VS);1298GetVectorRegs(tregs, sz, _VT);1299GetVectorRegs(dregs, sz, _VD);13001301MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, msz);1302MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, msz);13031304int tempregs[4];1305for (int i = 0; i < n; i++) {1306if (soverlap || toverlap) {1307tempregs[i] = fpr.GetTempV();1308} else {1309tempregs[i] = dregs[i];1310}1311fpr.SpillLockV(tempregs[i]);1312}1313for (int i = 0; i < n; i++) {1314fpr.MapRegV(tempregs[i], MAP_NOINIT);1315fpr.MapInInV(sregs[i * 4], tregs[0]);1316fp.FMUL(fpr.V(tempregs[i]), fpr.V(sregs[i * 4]), fpr.V(tregs[0]));1317for (int k = 1; k < n; k++) {1318if (!homogenous || k != n - 1) {1319fpr.MapInInV(sregs[i * 4 + k], tregs[k]);1320fp.FMADD(fpr.V(tempregs[i]), fpr.V(sregs[i * 4 + k]), fpr.V(tregs[k]), fpr.V(tempregs[i]));1321} else {1322fpr.MapRegV(sregs[i * 4 + k]);1323fp.FADD(fpr.V(tempregs[i]), fpr.V(tempregs[i]), fpr.V(sregs[i * 4 + k]));1324}1325}1326}1327for (int i = 0; i < n; i++) {1328u8 temp = tempregs[i];1329if (temp != dregs[i]) {1330fpr.MapDirtyInV(dregs[i], temp, true);1331fp.FMOV(fpr.V(dregs[i]), fpr.V(temp));1332}1333}13341335fpr.ReleaseSpillLocksAndDiscardTemps();1336}13371338void Arm64Jit::Comp_VCrs(MIPSOpcode op) {1339DISABLE;1340}13411342void Arm64Jit::Comp_VDet(MIPSOpcode op) {1343DISABLE;1344}13451346void Arm64Jit::Comp_Vi2x(MIPSOpcode op) {1347CONDITIONAL_DISABLE(VFPU_VEC);1348if (js.HasUnknownPrefix())1349DISABLE;13501351int bits = ((op >> 16) & 2) == 0 ? 
8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)1352bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)13531354// These instructions pack pairs or quads of integers into 32 bits.1355// The unsigned (u) versions skip the sign bit when packing.1356VectorSize sz = GetVecSize(op);1357VectorSize outsize;1358if (bits == 8) {1359outsize = V_Single;1360if (sz != V_Quad) {1361DISABLE;1362}1363} else {1364switch (sz) {1365case V_Pair:1366outsize = V_Single;1367break;1368case V_Quad:1369outsize = V_Pair;1370break;1371default:1372DISABLE;1373}1374}13751376u8 sregs[4], dregs[4];1377GetVectorRegsPrefixS(sregs, sz, _VS);1378GetVectorRegsPrefixD(dregs, outsize, _VD);13791380int n = GetNumVectorElements(sz);1381int nOut = GetNumVectorElements(outsize);13821383// Take the single registers and combine them to a D or Q register.1384for (int i = 0; i < n; i++) {1385fpr.MapRegV(sregs[i], sz);1386fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);1387}13881389if (unsignedOp) {1390// What's the best way to zero a Q reg?1391fp.EOR(Q1, Q1, Q1);1392fp.SMAX(32, Q0, Q0, Q1);1393}13941395// At this point, we simply need to collect the high bits of each 32-bit lane into one register.1396if (bits == 8) {1397// Really want to do a SHRN(..., 23/24) but that can't be encoded. So we synthesize it.1398fp.USHR(32, Q0, Q0, 16);1399fp.SHRN(16, D0, Q0, unsignedOp ? 7 : 8);1400fp.XTN(8, D0, Q0);1401} else {1402fp.SHRN(16, D0, Q0, unsignedOp ? 15 : 16);1403}14041405// Split apart again.1406for (int i = 0; i < nOut; i++) {1407fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);1408fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);1409}14101411ApplyPrefixD(dregs, outsize);1412fpr.ReleaseSpillLocksAndDiscardTemps();1413}14141415void Arm64Jit::Comp_Vx2i(MIPSOpcode op) {1416CONDITIONAL_DISABLE(VFPU_VEC);1417if (js.HasUnknownPrefix())1418DISABLE;14191420int bits = ((op >> 16) & 2) == 0 ? 
8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)1421bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)14221423// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values1424// at the top. vus2i shifts it an extra bit right afterward.1425// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values1426// at the top too. vuc2i is a bit special (see below.)1427// Let's do this similarly as h2f - we do a solution that works for both singles and pairs1428// then use it for both.14291430VectorSize sz = GetVecSize(op);1431VectorSize outsize;1432if (bits == 8) {1433outsize = V_Quad;1434} else {1435switch (sz) {1436case V_Single:1437outsize = V_Pair;1438break;1439case V_Pair:1440outsize = V_Quad;1441break;1442default:1443DISABLE;1444}1445}14461447u8 sregs[4], dregs[4];1448GetVectorRegsPrefixS(sregs, sz, _VS);1449GetVectorRegsPrefixD(dregs, outsize, _VD);14501451fpr.MapRegsAndSpillLockV(sregs, sz, 0);1452int n = 1;1453if (sz == V_Single) {1454n = 1;1455} else if (sz == V_Pair) {1456n = 2;1457} else if (bits == 8) {1458n = 1;1459}14601461// Take the single registers and combine them to a D or Q register.1462for (int i = 0; i < n; i++) {1463fpr.MapRegV(sregs[i], sz);1464fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);1465}14661467if (bits == 16) {1468// Simply expand, to upper bits.1469// Hm, can't find a USHLL equivalent that works with shift == size?1470fp.UXTL(16, Q0, D0);1471fp.SHL(32, Q0, Q0, 16);1472} else if (bits == 8) {1473fp.UXTL(8, Q0, D0);1474fp.UXTL(16, Q0, D0);1475fp.SHL(32, Q0, D0, 24);1476if (unsignedOp) {1477// vuc2i is a bit special. 
It spreads out the bits like this:1478// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.1479fp.USHR(32, Q1, Q0, 8);1480fp.ORR(Q0, Q0, Q1);1481fp.USHR(32, Q1, Q0, 16);1482fp.ORR(Q0, Q0, Q1);1483}1484}14851486// At this point we have the regs in the 4 lanes.1487// In the "u" mode, we need to shift it out of the sign bit.1488if (unsignedOp) {1489Arm64Gen::ARM64Reg reg = (outsize == V_Quad) ? Q0 : D0;1490fp.USHR(32, reg, reg, 1);1491}14921493fpr.MapRegsAndSpillLockV(dregs, outsize, MAP_NOINIT);14941495int nOut = 2;1496if (outsize == V_Quad)1497nOut = 4;14981499// Split apart again.1500for (int i = 0; i < nOut; i++) {1501fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);1502fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);1503}15041505ApplyPrefixD(dregs, outsize);1506fpr.ReleaseSpillLocksAndDiscardTemps();1507}15081509void Arm64Jit::Comp_VCrossQuat(MIPSOpcode op) {1510// This op does not support prefixes anyway.1511CONDITIONAL_DISABLE(VFPU_VEC);1512if (!js.HasNoPrefix())1513DISABLE;15141515VectorSize sz = GetVecSize(op);1516int n = GetNumVectorElements(sz);15171518u8 sregs[4], tregs[4], dregs[4];1519GetVectorRegs(sregs, sz, _VS);1520GetVectorRegs(tregs, sz, _VT);1521GetVectorRegs(dregs, sz, _VD);15221523// Map everything into registers.1524fpr.MapRegsAndSpillLockV(sregs, sz, 0);1525fpr.MapRegsAndSpillLockV(tregs, sz, 0);15261527if (sz == V_Triple) {1528MIPSReg temp3 = fpr.GetTempV();1529MIPSReg temp4 = fpr.GetTempV();1530fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);1531fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);1532// Cross product vcrsp.t15331534// Note: using FMSUB here causes accuracy issues, see #18203.1535// Compute X: s[1] * t[2] - s[2] * t[1]1536fp.FMUL(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[2]));1537fp.FMUL(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[1]));1538fp.FSUB(S0, fpr.V(temp3), fpr.V(temp4));15391540// Compute Y: s[2] * t[0] - s[0] * t[2]1541fp.FMUL(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[0]));1542fp.FMUL(fpr.V(temp4), fpr.V(sregs[0]), 
fpr.V(tregs[2]));1543fp.FSUB(S1, fpr.V(temp3), fpr.V(temp4));15441545// Compute Z: s[0] * t[1] - s[1] * t[0]1546fp.FMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));1547fp.FMUL(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[0]));1548fp.FSUB(fpr.V(temp3), fpr.V(temp3), fpr.V(temp4));15491550fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);1551fp.FMOV(fpr.V(dregs[0]), S0);1552fp.FMOV(fpr.V(dregs[1]), S1);1553fp.FMOV(fpr.V(dregs[2]), fpr.V(temp3));1554} else if (sz == V_Quad) {1555MIPSReg temp3 = fpr.GetTempV();1556MIPSReg temp4 = fpr.GetTempV();1557fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);1558fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);15591560// Quaternion product vqmul.q untested1561// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];1562fp.FMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[3]));1563fp.FMADD(S0, fpr.V(sregs[1]), fpr.V(tregs[2]), S0);1564fp.FMSUB(S0, fpr.V(sregs[2]), fpr.V(tregs[1]), S0);1565fp.FMADD(S0, fpr.V(sregs[3]), fpr.V(tregs[0]), S0);15661567//d[1] = -s[0] * t[2] + s[1] * t[3] + s[2] * t[0] + s[3] * t[1];1568fp.FNMUL(S1, fpr.V(sregs[0]), fpr.V(tregs[2]));1569fp.FMADD(S1, fpr.V(sregs[1]), fpr.V(tregs[3]), S1);1570fp.FMADD(S1, fpr.V(sregs[2]), fpr.V(tregs[0]), S1);1571fp.FMADD(S1, fpr.V(sregs[3]), fpr.V(tregs[1]), S1);15721573//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];1574fp.FMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));1575fp.FMSUB(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]), fpr.V(temp3));1576fp.FMADD(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[3]), fpr.V(temp3));1577fp.FMADD(fpr.V(temp3), fpr.V(sregs[3]), fpr.V(tregs[2]), fpr.V(temp3));15781579//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];1580fp.FNMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[0]));1581fp.FMSUB(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[1]), fpr.V(temp4));1582fp.FMSUB(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[2]), fpr.V(temp4));1583fp.FMADD(fpr.V(temp4), fpr.V(sregs[3]), fpr.V(tregs[3]), 
fpr.V(temp4));15841585fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);1586fp.FMOV(fpr.V(dregs[0]), S0);1587fp.FMOV(fpr.V(dregs[1]), S1);1588fp.FMOV(fpr.V(dregs[2]), fpr.V(temp3));1589fp.FMOV(fpr.V(dregs[3]), fpr.V(temp4));1590}15911592fpr.ReleaseSpillLocksAndDiscardTemps();1593}15941595void Arm64Jit::Comp_Vcmp(MIPSOpcode op) {1596CONDITIONAL_DISABLE(VFPU_COMP);1597if (js.HasUnknownPrefix())1598DISABLE;15991600VectorSize sz = GetVecSize(op);1601int n = GetNumVectorElements(sz);16021603VCondition cond = (VCondition)(op & 0xF);16041605u8 sregs[4], tregs[4];1606GetVectorRegsPrefixS(sregs, sz, _VS);1607GetVectorRegsPrefixT(tregs, sz, _VT);16081609// Some, we just fall back to the interpreter.1610// ES is just really equivalent to (value & 0x7F800000) == 0x7F800000.16111612switch (cond) {1613case VC_EI: // c = my_isinf(s[i]); break;1614case VC_NI: // c = !my_isinf(s[i]); break;1615DISABLE;1616case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection1617case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;1618case VC_EN: // c = my_isnan(s[i]); break;1619case VC_NN: // c = !my_isnan(s[i]); break;1620if (_VS != _VT)1621DISABLE;1622break;16231624case VC_EZ:1625case VC_NZ:1626break;1627default:1628;1629}16301631// First, let's get the trivial ones.1632int affected_bits = (1 << 4) | (1 << 5); // 4 and 516331634MOVI2R(SCRATCH1, 0);1635for (int i = 0; i < n; ++i) {1636// Let's only handle the easy ones, and fall back on the interpreter for the rest.1637CCFlags flag = CC_AL;1638switch (cond) {1639case VC_FL: // c = 0;1640break;16411642case VC_TR: // c = 11643if (i == 0) {1644if (n == 1) {1645MOVI2R(SCRATCH1, 0x31);1646} else {1647MOVI2R(SCRATCH1, 1ULL << i);1648}1649} else {1650ORRI2R(SCRATCH1, SCRATCH1, 1ULL << i);1651}1652break;16531654case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection1655case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;1656// For these, we use the integer ALU as there is no 
support on ARM for testing for INF.1657// Testing for nan or inf is the same as testing for &= 0x7F800000 == 0x7F800000.1658// We need an extra temporary register so we store away SCRATCH1.1659STR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));1660fpr.MapRegV(sregs[i], 0);1661MOVI2R(SCRATCH1, 0x7F800000);1662fp.FMOV(SCRATCH2, fpr.V(sregs[i]));1663AND(SCRATCH2, SCRATCH2, SCRATCH1);1664CMP(SCRATCH2, SCRATCH1); // (SCRATCH2 & 0x7F800000) == 0x7F8000001665flag = cond == VC_ES ? CC_EQ : CC_NEQ;1666LDR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));1667break;16681669case VC_EN: // c = my_isnan(s[i]); break; // Tekken 61670// Should we involve T? Where I found this used, it compared a register with itself so should be fine.1671fpr.MapInInV(sregs[i], tregs[i]);1672fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1673flag = CC_VS; // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html1674break;16751676case VC_NN: // c = !my_isnan(s[i]); break;1677// Should we involve T? 
Where I found this used, it compared a register with itself so should be fine.1678fpr.MapInInV(sregs[i], tregs[i]);1679fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1680flag = CC_VC; // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html1681break;16821683case VC_EQ: // c = s[i] == t[i]1684fpr.MapInInV(sregs[i], tregs[i]);1685fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1686flag = CC_EQ;1687break;16881689case VC_LT: // c = s[i] < t[i]1690fpr.MapInInV(sregs[i], tregs[i]);1691fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1692flag = CC_LO;1693break;16941695case VC_LE: // c = s[i] <= t[i];1696fpr.MapInInV(sregs[i], tregs[i]);1697fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1698flag = CC_LS;1699break;17001701case VC_NE: // c = s[i] != t[i]1702fpr.MapInInV(sregs[i], tregs[i]);1703fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1704flag = CC_NEQ;1705break;17061707case VC_GE: // c = s[i] >= t[i]1708fpr.MapInInV(sregs[i], tregs[i]);1709fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1710flag = CC_GE;1711break;17121713case VC_GT: // c = s[i] > t[i]1714fpr.MapInInV(sregs[i], tregs[i]);1715fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));1716flag = CC_GT;1717break;17181719case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f1720fpr.MapRegV(sregs[i]);1721fp.FCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)1722flag = CC_EQ;1723break;17241725case VC_NZ: // c = s[i] != 01726fpr.MapRegV(sregs[i]);1727fp.FCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)1728flag = CC_NEQ;1729break;17301731default:1732DISABLE;1733}1734if (flag != CC_AL) {1735FixupBranch b = B(InvertCond(flag));1736if (i == 0) {1737if (n == 1) {1738MOVI2R(SCRATCH1, 0x31);1739} else {1740MOVI2R(SCRATCH1, 1); // 1 << i, but i == 01741}1742} else {1743ORRI2R(SCRATCH1, SCRATCH1, 1ULL << i);1744}1745SetJumpTarget(b);1746}17471748affected_bits |= 1 << i;1749}17501751// Aggregate the bits. Urgh, expensive. 
Can optimize for the case of one comparison, which is the most common1752// after all.1753if (n > 1) {1754CMP(SCRATCH1, affected_bits & 0xF);1755FixupBranch skip1 = B(CC_NEQ);1756ORRI2R(SCRATCH1, SCRATCH1, 1 << 5);1757SetJumpTarget(skip1);17581759CMP(SCRATCH1, 0);1760FixupBranch skip2 = B(CC_EQ);1761ORRI2R(SCRATCH1, SCRATCH1, 1 << 4);1762SetJumpTarget(skip2);1763}17641765gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);1766ANDI2R(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), ~affected_bits, SCRATCH2);1767ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCH1);17681769fpr.ReleaseSpillLocksAndDiscardTemps();1770}17711772void Arm64Jit::Comp_Vcmov(MIPSOpcode op) {1773CONDITIONAL_DISABLE(VFPU_COMP);1774if (js.HasUnknownPrefix()) {1775DISABLE;1776}17771778VectorSize sz = GetVecSize(op);1779int n = GetNumVectorElements(sz);17801781u8 sregs[4], dregs[4];1782GetVectorRegsPrefixS(sregs, sz, _VS);1783GetVectorRegsPrefixD(dregs, sz, _VD);1784int tf = (op >> 19) & 1;1785int imm3 = (op >> 16) & 7;17861787for (int i = 0; i < n; ++i) {1788// Simplification: Disable if overlap unsafe1789if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {1790DISABLE;1791}1792}17931794if (imm3 < 6) {1795// Test one bit of CC. This bit decides whether none or all subregisters are copied.1796fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);1797fpr.MapRegsAndSpillLockV(sregs, sz, 0);1798gpr.MapReg(MIPS_REG_VFPUCC);1799TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1ULL << imm3);1800// TODO: Use fsel?1801FixupBranch b = B(tf ? CC_NEQ : CC_EQ);1802for (int i = 0; i < n; i++) {1803fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));1804}1805SetJumpTarget(b);1806} else {1807// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.1808fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);1809fpr.MapRegsAndSpillLockV(sregs, sz, 0);1810gpr.MapReg(MIPS_REG_VFPUCC);1811for (int i = 0; i < n; i++) {1812TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1ULL << i);1813FixupBranch b = B(tf ? 
CC_NEQ : CC_EQ);1814fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));1815SetJumpTarget(b);1816}1817}18181819ApplyPrefixD(dregs, sz);1820fpr.ReleaseSpillLocksAndDiscardTemps();1821}18221823void Arm64Jit::Comp_Viim(MIPSOpcode op) {1824CONDITIONAL_DISABLE(VFPU_XFER);1825if (js.HasUnknownPrefix()) {1826DISABLE;1827}18281829u8 dreg;1830GetVectorRegs(&dreg, V_Single, _VT);18311832s32 imm = SignExtend16ToS32(op);1833fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);1834fp.MOVI2F(fpr.V(dreg), (float)imm, SCRATCH1);18351836ApplyPrefixD(&dreg, V_Single);1837fpr.ReleaseSpillLocksAndDiscardTemps();1838}18391840void Arm64Jit::Comp_Vfim(MIPSOpcode op) {1841CONDITIONAL_DISABLE(VFPU_XFER);1842if (js.HasUnknownPrefix()) {1843DISABLE;1844}18451846u8 dreg;1847GetVectorRegs(&dreg, V_Single, _VT);18481849FP16 half;1850half.u = op & 0xFFFF;1851FP32 fval = half_to_float_fast5(half);1852fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);1853fp.MOVI2F(fpr.V(dreg), fval.f, SCRATCH1);18541855ApplyPrefixD(&dreg, V_Single);1856fpr.ReleaseSpillLocksAndDiscardTemps();1857}18581859void Arm64Jit::Comp_Vcst(MIPSOpcode op) {1860CONDITIONAL_DISABLE(VFPU_XFER);1861if (js.HasUnknownPrefix()) {1862DISABLE;1863}18641865int conNum = (op >> 16) & 0x1f;1866int vd = _VD;18671868VectorSize sz = GetVecSize(op);1869int n = GetNumVectorElements(sz);18701871u8 dregs[4];1872GetVectorRegsPrefixD(dregs, sz, _VD);1873fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);18741875MOVP2R(SCRATCH1_64, (void *)&cst_constants[conNum]);1876fp.LDR(32, INDEX_UNSIGNED, S0, SCRATCH1_64, 0);1877for (int i = 0; i < n; ++i)1878fp.FMOV(fpr.V(dregs[i]), S0);18791880ApplyPrefixD(dregs, sz);1881fpr.ReleaseSpillLocksAndDiscardTemps();1882}18831884static double SinCos(float angle) {1885union { struct { float sin; float cos; }; double out; } sincos;1886vfpu_sincos(angle, sincos.sin, sincos.cos);1887return sincos.out;1888}18891890static double SinCosNegSin(float angle) {1891union { struct { float sin; float cos; }; double out; } 
sincos;1892vfpu_sincos(angle, sincos.sin, sincos.cos);1893sincos.sin = -sincos.sin;1894return sincos.out;1895}18961897void Arm64Jit::CompVrotShuffle(u8 *dregs, int imm, VectorSize sz, bool negSin) {1898int n = GetNumVectorElements(sz);1899char what[4] = { '0', '0', '0', '0' };1900if (((imm >> 2) & 3) == (imm & 3)) {1901for (int i = 0; i < 4; i++)1902what[i] = 'S';1903}1904what[(imm >> 2) & 3] = 'S';1905what[imm & 3] = 'C';19061907fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY | MAP_NOINIT);1908for (int i = 0; i < n; i++) {1909switch (what[i]) {1910case 'C': fp.FMOV(fpr.V(dregs[i]), S1); break;1911case 'S': if (negSin) fp.FNEG(fpr.V(dregs[i]), S0); else fp.FMOV(fpr.V(dregs[i]), S0); break;1912case '0':1913{1914fp.MOVI2F(fpr.V(dregs[i]), 0.0f);1915break;1916}1917default:1918ERROR_LOG(Log::JIT, "Bad what in vrot");1919break;1920}1921}1922}19231924// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of1925// calling the math library.1926void Arm64Jit::Comp_VRot(MIPSOpcode op) {1927// VRot probably doesn't accept prefixes anyway.1928CONDITIONAL_DISABLE(VFPU_VEC);1929if (js.HasUnknownPrefix()) {1930DISABLE;1931}19321933int vd = _VD;1934int vs = _VS;19351936VectorSize sz = GetVecSize(op);1937int n = GetNumVectorElements(sz);19381939u8 dregs[4];1940u8 dregs2[4];19411942MIPSOpcode nextOp = GetOffsetInstruction(1);1943int vd2 = -1;1944int imm2 = -1;1945if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {1946// Pair of vrot. 
Let's join them.1947vd2 = MIPS_GET_VD(nextOp);1948imm2 = (nextOp >> 16) & 0x1f;1949// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);1950}1951u8 sreg;1952GetVectorRegs(dregs, sz, vd);1953if (vd2 >= 0)1954GetVectorRegs(dregs2, sz, vd2);1955GetVectorRegs(&sreg, V_Single, vs);19561957int imm = (op >> 16) & 0x1f;19581959gpr.FlushBeforeCall();1960fpr.FlushAll();19611962// Don't need to SaveStaticRegs here as long as they are all in callee-save regs - this callee won't read them.19631964bool negSin1 = (imm & 0x10) ? true : false;19651966fpr.MapRegV(sreg);1967fp.FMOV(S0, fpr.V(sreg));1968QuickCallFunction(SCRATCH2_64, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);1969// Here, sin and cos are stored together in Q0.d. On ARM32 we could use it directly1970// but with ARM64's register organization, we need to split it up.1971fp.INS(32, Q1, 0, Q0, 1);19721973CompVrotShuffle(dregs, imm, sz, false);1974if (vd2 != -1) {1975// If the negsin setting differs between the two joint invocations, we need to flip the second one.1976bool negSin2 = (imm2 & 0x10) ? true : false;1977CompVrotShuffle(dregs2, imm2, sz, negSin1 != negSin2);1978EatInstruction(nextOp);1979}19801981fpr.ReleaseSpillLocksAndDiscardTemps();1982}19831984void Arm64Jit::Comp_Vsgn(MIPSOpcode op) {1985DISABLE;1986}19871988void Arm64Jit::Comp_Vocp(MIPSOpcode op) {1989CONDITIONAL_DISABLE(VFPU_VEC);1990if (js.HasUnknownPrefix()) {1991DISABLE;1992}19931994VectorSize sz = GetVecSize(op);1995int n = GetNumVectorElements(sz);19961997// This is a hack that modifies prefixes. 
We eat them later, so just overwrite.1998// S prefix forces the negate flags.1999js.prefixS |= 0x000F0000;2000// T prefix forces constants on and regnum to 1.2001// That means negate still works, and abs activates a different constant.2002js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;20032004u8 sregs[4], tregs[4], dregs[4];2005GetVectorRegsPrefixS(sregs, sz, _VS);2006GetVectorRegsPrefixT(tregs, sz, _VS);2007GetVectorRegsPrefixD(dregs, sz, _VD);20082009MIPSReg tempregs[4];2010for (int i = 0; i < n; ++i) {2011if (!IsOverlapSafe(dregs[i], i, n, sregs)) {2012tempregs[i] = fpr.GetTempV();2013} else {2014tempregs[i] = dregs[i];2015}2016}20172018fp.MOVI2F(S0, 1.0f, SCRATCH1);2019for (int i = 0; i < n; ++i) {2020fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);2021fp.FADD(fpr.V(tempregs[i]), fpr.V(tregs[i]), fpr.V(sregs[i]));2022}20232024for (int i = 0; i < n; ++i) {2025if (dregs[i] != tempregs[i]) {2026fpr.MapDirtyInV(dregs[i], tempregs[i]);2027fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));2028}2029}20302031ApplyPrefixD(dregs, sz);20322033fpr.ReleaseSpillLocksAndDiscardTemps();2034}20352036void Arm64Jit::Comp_ColorConv(MIPSOpcode op) {2037DISABLE;2038}20392040void Arm64Jit::Comp_Vbfy(MIPSOpcode op) {2041DISABLE;2042}2043}20442045#endif // PPSSPP_ARCH(ARM64)204620472048