CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM/ArmCompVFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#if PPSSPP_ARCH(ARM)1920#include <cmath>21#include "Common/CPUDetect.h"22#include "Common/Data/Convert/SmallDataConvert.h"23#include "Common/Math/math_util.h"2425#include "Core/Compatibility.h"26#include "Core/Config.h"27#include "Core/MemMap.h"28#include "Core/Reporting.h"29#include "Core/System.h"30#include "Core/MIPS/MIPS.h"31#include "Core/MIPS/MIPSTables.h"32#include "Core/MIPS/MIPSAnalyst.h"33#include "Core/MIPS/MIPSCodeUtils.h"3435#include "Core/MIPS/ARM/ArmJit.h"36#include "Core/MIPS/ARM/ArmRegCache.h"3738// Cool NEON references:39// http://www.delmarnorth.com/microwave/requirements/neon-test-tutorial.pdf4041// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.42// Currently known non working ones should have DISABLE.4344// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }45#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }46#define DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; }4748#define NEON_IF_AVAILABLE(func) { if (jo.useNEONVFPU) { func(op); return; } }49#define _RS MIPS_GET_RS(op)50#define _RT 
MIPS_GET_RT(op)51#define _RD MIPS_GET_RD(op)52#define _FS MIPS_GET_FS(op)53#define _FT MIPS_GET_FT(op)54#define _FD MIPS_GET_FD(op)55#define _SA MIPS_GET_SA(op)56#define _POS ((op>> 6) & 0x1F)57#define _SIZE ((op>>11) & 0x1F)58#define _IMM16 (signed short)(op & 0xFFFF)59#define _IMM26 (op & 0x03FFFFFF)6061namespace MIPSComp62{63using namespace ArmGen;64using namespace ArmJitConstants;6566// Vector regs can overlap in all sorts of swizzled ways.67// This does allow a single overlap in sregs[i].68static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)69{70for (int i = 0; i < sn; ++i)71{72if (sregs[i] == dreg && i != di)73return false;74}75for (int i = 0; i < tn; ++i)76{77if (tregs[i] == dreg)78return false;79}8081// Hurray, no overlap, we can write directly.82return true;83}8485static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL)86{87return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;88}8990void ArmJit::Comp_VPFX(MIPSOpcode op)91{92CONDITIONAL_DISABLE(VFPU_XFER);93int data = op & 0xFFFFF;94int regnum = (op >> 24) & 3;95switch (regnum) {96case 0: // S97js.prefixS = data;98js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;99break;100case 1: // T101js.prefixT = data;102js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;103break;104case 2: // D105js.prefixD = data & 0x00000FFF;106js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;107break;108default:109ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);110break;111}112}113114void ArmJit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {115if (prefix == 0xE4)116return;117118int n = GetNumVectorElements(sz);119u8 origV[4];120static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};121122for (int i = 0; i < n; i++)123origV[i] = vregs[i];124125for (int i = 0; i < n; i++) {126int regnum = (prefix >> (i*2)) & 3;127int abs = (prefix >> (8+i)) & 1;128int negate = (prefix >> 
(16+i)) & 1;129int constants = (prefix >> (12+i)) & 1;130131// Unchanged, hurray.132if (!constants && regnum == i && !abs && !negate)133continue;134135// This puts the value into a temp reg, so we won't write the modified value back.136vregs[i] = fpr.GetTempV();137if (!constants) {138fpr.MapDirtyInV(vregs[i], origV[regnum]);139fpr.SpillLockV(vregs[i]);140141// Prefix may say "z, z, z, z" but if this is a pair, we force to x.142// TODO: But some ops seem to use const 0 instead?143if (regnum >= n) {144WARN_LOG(Log::CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, GetCompilerPC(), MIPSDisasmAt(GetCompilerPC()).c_str());145regnum = 0;146}147148if (abs) {149VABS(fpr.V(vregs[i]), fpr.V(origV[regnum]));150if (negate)151VNEG(fpr.V(vregs[i]), fpr.V(vregs[i]));152} else {153if (negate)154VNEG(fpr.V(vregs[i]), fpr.V(origV[regnum]));155else156VMOV(fpr.V(vregs[i]), fpr.V(origV[regnum]));157}158} else {159fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT);160fpr.SpillLockV(vregs[i]);161MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs<<2)], SCRATCHREG1, negate != 0);162}163}164}165166void ArmJit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {167_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);168169GetVectorRegs(regs, sz, vectorReg);170if (js.prefixD == 0)171return;172173int n = GetNumVectorElements(sz);174for (int i = 0; i < n; i++) {175// Hopefully this is rare, we'll just write it into a reg we drop.176if (js.VfpuWriteMask(i))177regs[i] = fpr.GetTempV();178}179}180181void ArmJit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {182_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);183if (!js.prefixD)184return;185186int n = GetNumVectorElements(sz);187for (int i = 0; i < n; i++) {188if (js.VfpuWriteMask(i))189continue;190191int sat = (js.prefixD >> (i * 2)) & 3;192if (sat == 1) {193// clamped = x < 0 ? (x > 1 ? 
1 : x) : x [0, 1]194fpr.MapRegV(vregs[i], MAP_DIRTY);195196MOVI2F(S0, 0.0f, SCRATCHREG1);197MOVI2F(S1, 1.0f, SCRATCHREG1);198VCMP(fpr.V(vregs[i]), S0);199VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).200SetCC(CC_LS);201VMOV(fpr.V(vregs[i]), S0);202SetCC(CC_AL);203VCMP(fpr.V(vregs[i]), S1);204VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).205SetCC(CC_GT);206VMOV(fpr.V(vregs[i]), S1);207SetCC(CC_AL);208} else if (sat == 3) {209// clamped = x < -1 ? (x > 1 ? 1 : x) : x [-1, 1]210fpr.MapRegV(vregs[i], MAP_DIRTY);211212MOVI2F(S0, -1.0f, SCRATCHREG1);213MOVI2F(S1, 1.0f, SCRATCHREG1);214VCMP(fpr.V(vregs[i]), S0);215VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).216SetCC(CC_LO);217VMOV(fpr.V(vregs[i]), S0);218SetCC(CC_AL);219VCMP(fpr.V(vregs[i]), S1);220VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).221SetCC(CC_GT);222VMOV(fpr.V(vregs[i]), S1);223SetCC(CC_AL);224}225}226}227228void ArmJit::Comp_SV(MIPSOpcode op) {229NEON_IF_AVAILABLE(CompNEON_SV);230CONDITIONAL_DISABLE(LSU_VFPU);231CheckMemoryBreakpoint();232233s32 offset = (signed short)(op & 0xFFFC);234int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);235MIPSGPReg rs = _RS;236237bool doCheck = false;238switch (op >> 26)239{240case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);241{242if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset < 0x400 && offset > -0x400) {243gpr.MapRegAsPointer(rs);244fpr.MapRegV(vt, MAP_NOINIT | MAP_DIRTY);245VLDR(fpr.V(vt), gpr.RPtr(rs), offset);246break;247}248249// CC might be set by slow path below, so load regs first.250fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);251if (gpr.IsImm(rs)) {252u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;253gpr.SetRegImm(R0, addr + (u32)Memory::base);254} else {255gpr.MapReg(rs);256if (g_Config.bFastMemory) {257SetR0ToEffectiveAddress(rs, offset);258} else {259SetCCAndR0ForSafeAddress(rs, offset, SCRATCHREG2);260doCheck = true;261}262ADD(R0, R0, 
MEMBASEREG);263}264#ifdef __ARM_ARCH_7S__265FixupBranch skip;266if (doCheck) {267skip = B_CC(CC_EQ);268}269VLDR(fpr.V(vt), R0, 0);270if (doCheck) {271SetJumpTarget(skip);272SetCC(CC_AL);273}274#else275VLDR(fpr.V(vt), R0, 0);276if (doCheck) {277SetCC(CC_EQ);278MOVI2F(fpr.V(vt), 0.0f, SCRATCHREG1);279SetCC(CC_AL);280}281#endif282}283break;284285case 58: //sv.s // Memory::Write_U32(VI(vt), addr);286{287if (!gpr.IsImm(rs) && jo.cachePointers && g_Config.bFastMemory && (offset & 3) == 0 && offset < 0x400 && offset > -0x400) {288gpr.MapRegAsPointer(rs);289fpr.MapRegV(vt, 0);290VSTR(fpr.V(vt), gpr.RPtr(rs), offset);291break;292}293294// CC might be set by slow path below, so load regs first.295fpr.MapRegV(vt);296if (gpr.IsImm(rs)) {297u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;298gpr.SetRegImm(R0, addr + (u32)Memory::base);299} else {300gpr.MapReg(rs);301if (g_Config.bFastMemory) {302SetR0ToEffectiveAddress(rs, offset);303} else {304SetCCAndR0ForSafeAddress(rs, offset, SCRATCHREG2);305doCheck = true;306}307ADD(R0, R0, MEMBASEREG);308}309#ifdef __ARM_ARCH_7S__310FixupBranch skip;311if (doCheck) {312skip = B_CC(CC_EQ);313}314VSTR(fpr.V(vt), R0, 0);315if (doCheck) {316SetJumpTarget(skip);317SetCC(CC_AL);318}319#else320VSTR(fpr.V(vt), R0, 0);321if (doCheck) {322SetCC(CC_AL);323}324#endif325}326break;327328329default:330DISABLE;331}332}333334void ArmJit::Comp_SVQ(MIPSOpcode op)335{336NEON_IF_AVAILABLE(CompNEON_SVQ);337CONDITIONAL_DISABLE(LSU_VFPU);338CheckMemoryBreakpoint();339340int imm = (signed short)(op&0xFFFC);341int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);342MIPSGPReg rs = _RS;343344bool doCheck = false;345switch (op >> 26)346{347case 54: //lv.q348{349// CC might be set by slow path below, so load regs first.350u8 vregs[4];351GetVectorRegs(vregs, V_Quad, vt);352fpr.MapRegsAndSpillLockV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);353354if (gpr.IsImm(rs)) {355u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;356gpr.SetRegImm(R0, addr + (u32)Memory::base);357} else 
{358gpr.MapReg(rs);359if (g_Config.bFastMemory) {360SetR0ToEffectiveAddress(rs, imm);361} else {362SetCCAndR0ForSafeAddress(rs, imm, SCRATCHREG2);363doCheck = true;364}365ADD(R0, R0, MEMBASEREG);366}367368#ifdef __ARM_ARCH_7S__369FixupBranch skip;370if (doCheck) {371skip = B_CC(CC_EQ);372}373374bool consecutive = true;375for (int i = 0; i < 3 && consecutive; i++)376if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))377consecutive = false;378if (consecutive) {379VLDMIA(R0, false, fpr.V(vregs[0]), 4);380} else {381for (int i = 0; i < 4; i++)382VLDR(fpr.V(vregs[i]), R0, i * 4);383}384385if (doCheck) {386SetJumpTarget(skip);387SetCC(CC_AL);388}389#else390bool consecutive = true;391for (int i = 0; i < 3 && consecutive; i++)392if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))393consecutive = false;394if (consecutive) {395VLDMIA(R0, false, fpr.V(vregs[0]), 4);396} else {397for (int i = 0; i < 4; i++)398VLDR(fpr.V(vregs[i]), R0, i * 4);399}400401if (doCheck) {402SetCC(CC_EQ);403MOVI2R(SCRATCHREG1, 0);404for (int i = 0; i < 4; i++)405VMOV(fpr.V(vregs[i]), SCRATCHREG1);406SetCC(CC_AL);407}408#endif409}410break;411412case 62: //sv.q413{414// CC might be set by slow path below, so load regs first.415u8 vregs[4];416GetVectorRegs(vregs, V_Quad, vt);417fpr.MapRegsAndSpillLockV(vregs, V_Quad, 0);418419if (gpr.IsImm(rs)) {420u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF;421gpr.SetRegImm(R0, addr + (u32)Memory::base);422} else {423gpr.MapReg(rs);424if (g_Config.bFastMemory) {425SetR0ToEffectiveAddress(rs, imm);426} else {427SetCCAndR0ForSafeAddress(rs, imm, SCRATCHREG2);428doCheck = true;429}430ADD(R0, R0, MEMBASEREG);431}432433#ifdef __ARM_ARCH_7S__434FixupBranch skip;435if (doCheck) {436skip = B_CC(CC_EQ);437}438439bool consecutive = true;440for (int i = 0; i < 3 && consecutive; i++)441if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))442consecutive = false;443if (consecutive) {444VSTMIA(R0, false, fpr.V(vregs[0]), 4);445} else {446for (int i = 0; i < 4; i++)447VSTR(fpr.V(vregs[i]), R0, i 
* 4);448}449450if (doCheck) {451SetJumpTarget(skip);452SetCC(CC_AL);453}454#else455bool consecutive = true;456for (int i = 0; i < 3 && consecutive; i++)457if ((fpr.V(vregs[i]) + 1) != fpr.V(vregs[i+1]))458consecutive = false;459if (consecutive) {460VSTMIA(R0, false, fpr.V(vregs[0]), 4);461} else {462for (int i = 0; i < 4; i++)463VSTR(fpr.V(vregs[i]), R0, i * 4);464}465466if (doCheck) {467SetCC(CC_AL);468}469#endif470}471break;472473default:474DISABLE;475break;476}477fpr.ReleaseSpillLocksAndDiscardTemps();478}479480void ArmJit::Comp_VVectorInit(MIPSOpcode op)481{482NEON_IF_AVAILABLE(CompNEON_VVectorInit);483CONDITIONAL_DISABLE(VFPU_XFER);484// WARNING: No prefix support!485if (js.HasUnknownPrefix()) {486DISABLE;487}488489switch ((op >> 16) & 0xF)490{491case 6: // v=zeros; break; //vzero492MOVI2F(S0, 0.0f, SCRATCHREG1);493break;494case 7: // v=ones; break; //vone495MOVI2F(S0, 1.0f, SCRATCHREG1);496break;497default:498DISABLE;499break;500}501502VectorSize sz = GetVecSize(op);503int n = GetNumVectorElements(sz);504505u8 dregs[4];506GetVectorRegsPrefixD(dregs, sz, _VD);507fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);508509for (int i = 0; i < n; ++i)510VMOV(fpr.V(dregs[i]), S0);511512ApplyPrefixD(dregs, sz);513514fpr.ReleaseSpillLocksAndDiscardTemps();515}516517void ArmJit::Comp_VIdt(MIPSOpcode op) {518NEON_IF_AVAILABLE(CompNEON_VIdt);519520CONDITIONAL_DISABLE(VFPU_XFER);521if (js.HasUnknownPrefix()) {522DISABLE;523}524525int vd = _VD;526VectorSize sz = GetVecSize(op);527int n = GetNumVectorElements(sz);528MOVI2F(S0, 0.0f, SCRATCHREG1);529MOVI2F(S1, 1.0f, SCRATCHREG1);530u8 dregs[4];531GetVectorRegsPrefixD(dregs, sz, _VD);532fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);533switch (sz)534{535case V_Pair:536VMOV(fpr.V(dregs[0]), (vd&1)==0 ? S1 : S0);537VMOV(fpr.V(dregs[1]), (vd&1)==1 ? S1 : S0);538break;539case V_Quad:540VMOV(fpr.V(dregs[0]), (vd&3)==0 ? S1 : S0);541VMOV(fpr.V(dregs[1]), (vd&3)==1 ? S1 : S0);542VMOV(fpr.V(dregs[2]), (vd&3)==2 ? 
S1 : S0);543VMOV(fpr.V(dregs[3]), (vd&3)==3 ? S1 : S0);544break;545default:546_dbg_assert_msg_(false,"Trying to interpret instruction that can't be interpreted");547break;548}549550ApplyPrefixD(dregs, sz);551552fpr.ReleaseSpillLocksAndDiscardTemps();553}554555void ArmJit::Comp_VMatrixInit(MIPSOpcode op)556{557NEON_IF_AVAILABLE(CompNEON_VMatrixInit);558CONDITIONAL_DISABLE(VFPU_XFER);559if (js.HasUnknownPrefix()) {560// Don't think matrix init ops care about prefixes.561// DISABLE;562}563564MatrixSize sz = GetMtxSize(op);565int n = GetMatrixSide(sz);566567u8 dregs[16];568GetMatrixRegs(dregs, sz, _VD);569570switch ((op >> 16) & 0xF) {571case 3: // vmidt572MOVI2F(S0, 0.0f, SCRATCHREG1);573MOVI2F(S1, 1.0f, SCRATCHREG1);574for (int a = 0; a < n; a++) {575for (int b = 0; b < n; b++) {576fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);577VMOV(fpr.V(dregs[a * 4 + b]), a == b ? S1 : S0);578}579}580break;581case 6: // vmzero582MOVI2F(S0, 0.0f, SCRATCHREG1);583for (int a = 0; a < n; a++) {584for (int b = 0; b < n; b++) {585fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);586VMOV(fpr.V(dregs[a * 4 + b]), S0);587}588}589break;590case 7: // vmone591MOVI2F(S1, 1.0f, SCRATCHREG1);592for (int a = 0; a < n; a++) {593for (int b = 0; b < n; b++) {594fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);595VMOV(fpr.V(dregs[a * 4 + b]), S1);596}597}598break;599}600601fpr.ReleaseSpillLocksAndDiscardTemps();602}603604void ArmJit::Comp_VHdp(MIPSOpcode op) {605NEON_IF_AVAILABLE(CompNEON_VHdp);606CONDITIONAL_DISABLE(VFPU_VEC);607if (js.HasUnknownPrefix()) {608DISABLE;609}610611int vd = _VD;612int vs = _VS;613int vt = _VT;614VectorSize sz = GetVecSize(op);615616// TODO: Force read one of them into regs? 
probably not.617u8 sregs[4], tregs[4], dregs[1];618GetVectorRegsPrefixS(sregs, sz, vs);619GetVectorRegsPrefixT(tregs, sz, vt);620GetVectorRegsPrefixD(dregs, V_Single, vd);621622// TODO: applyprefixST here somehow (shuffle, etc...)623fpr.MapRegsAndSpillLockV(sregs, sz, 0);624fpr.MapRegsAndSpillLockV(tregs, sz, 0);625VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));626627int n = GetNumVectorElements(sz);628for (int i = 1; i < n; i++) {629// sum += s[i]*t[i];630if (i == n - 1) {631VADD(S0, S0, fpr.V(tregs[i]));632} else {633VMLA(S0, fpr.V(sregs[i]), fpr.V(tregs[i]));634}635}636fpr.ReleaseSpillLocksAndDiscardTemps();637638fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);639640VMOV(fpr.V(dregs[0]), S0);641ApplyPrefixD(dregs, V_Single);642fpr.ReleaseSpillLocksAndDiscardTemps();643}644645alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };646647void ArmJit::Comp_Vhoriz(MIPSOpcode op) {648NEON_IF_AVAILABLE(CompNEON_Vhoriz);649CONDITIONAL_DISABLE(VFPU_VEC);650if (js.HasUnknownPrefix()) {651DISABLE;652}653654int vd = _VD;655int vs = _VS;656int vt = _VT;657VectorSize sz = GetVecSize(op);658659// TODO: Force read one of them into regs? 
probably not.660u8 sregs[4], dregs[1];661GetVectorRegsPrefixS(sregs, sz, vs);662GetVectorRegsPrefixD(dregs, V_Single, vd);663664// TODO: applyprefixST here somehow (shuffle, etc...)665fpr.MapRegsAndSpillLockV(sregs, sz, 0);666667int n = GetNumVectorElements(sz);668669bool is_vavg = ((op >> 16) & 0x1f) == 7;670if (is_vavg) {671MOVI2F(S1, vavg_table[n - 1], R0);672}673// Have to start at +0.000 for the correct sign.674MOVI2F(S0, 0.0f, SCRATCHREG1);675for (int i = 0; i < n; i++) {676// sum += s[i];677VADD(S0, S0, fpr.V(sregs[i]));678}679680fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);681if (is_vavg) {682VMUL(fpr.V(dregs[0]), S0, S1);683} else {684VMOV(fpr.V(dregs[0]), S0);685}686ApplyPrefixD(dregs, V_Single);687fpr.ReleaseSpillLocksAndDiscardTemps();688}689690void ArmJit::Comp_VDot(MIPSOpcode op) {691NEON_IF_AVAILABLE(CompNEON_VDot);692CONDITIONAL_DISABLE(VFPU_VEC);693if (js.HasUnknownPrefix()) {694DISABLE;695}696697int vd = _VD;698int vs = _VS;699int vt = _VT;700VectorSize sz = GetVecSize(op);701702// TODO: Force read one of them into regs? 
probably not.703u8 sregs[4], tregs[4], dregs[1];704GetVectorRegsPrefixS(sregs, sz, vs);705GetVectorRegsPrefixT(tregs, sz, vt);706GetVectorRegsPrefixD(dregs, V_Single, vd);707708// TODO: applyprefixST here somehow (shuffle, etc...)709fpr.MapRegsAndSpillLockV(sregs, sz, 0);710fpr.MapRegsAndSpillLockV(tregs, sz, 0);711VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));712713int n = GetNumVectorElements(sz);714for (int i = 1; i < n; i++) {715// sum += s[i]*t[i];716VMLA(S0, fpr.V(sregs[i]), fpr.V(tregs[i]));717}718fpr.ReleaseSpillLocksAndDiscardTemps();719720fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY);721722VMOV(fpr.V(dregs[0]), S0);723ApplyPrefixD(dregs, V_Single);724fpr.ReleaseSpillLocksAndDiscardTemps();725}726727void ArmJit::Comp_VecDo3(MIPSOpcode op) {728NEON_IF_AVAILABLE(CompNEON_VecDo3);729CONDITIONAL_DISABLE(VFPU_VEC);730if (js.HasUnknownPrefix()) {731DISABLE;732}733734int vd = _VD;735int vs = _VS;736int vt = _VT;737738VectorSize sz = GetVecSize(op);739int n = GetNumVectorElements(sz);740741u8 sregs[4], tregs[4], dregs[4];742GetVectorRegsPrefixS(sregs, sz, _VS);743GetVectorRegsPrefixT(tregs, sz, _VT);744GetVectorRegsPrefixD(dregs, sz, _VD);745746MIPSReg tempregs[4];747for (int i = 0; i < n; i++) {748if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) {749tempregs[i] = fpr.GetTempV();750} else {751tempregs[i] = dregs[i];752}753}754755// Map first, then work. 
This will allow us to use VLDMIA more often756// (when we add the appropriate map function) and the instruction ordering757// will improve.758// Note that mapping like this (instead of first all sregs, first all tregs etc)759// reduces the amount of continuous registers a lot :(760for (int i = 0; i < n; i++) {761fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);762fpr.SpillLockV(tempregs[i]);763fpr.SpillLockV(sregs[i]);764fpr.SpillLockV(tregs[i]);765}766767for (int i = 0; i < n; i++) {768switch (op >> 26) {769case 24: //VFPU0770switch ((op >> 23)&7) {771case 0: // d[i] = s[i] + t[i]; break; //vadd772VADD(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));773break;774case 1: // d[i] = s[i] - t[i]; break; //vsub775VSUB(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));776break;777case 7: // d[i] = s[i] / t[i]; break; //vdiv778VDIV(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));779break;780default:781DISABLE;782}783break;784case 25: //VFPU1785switch ((op >> 23) & 7) {786case 0: // d[i] = s[i] * t[i]; break; //vmul787VMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));788break;789default:790DISABLE;791}792break;793// Unfortunately there is no VMIN/VMAX on ARM without NEON.794case 27: //VFPU3795switch ((op >> 23) & 7) {796case 2: // vmin797{798VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));799VMRS_APSR();800FixupBranch skipNAN = B_CC(CC_VC);801VMOV(SCRATCHREG1, fpr.V(sregs[i]));802VMOV(SCRATCHREG2, fpr.V(tregs[i]));803// If both are negative, we reverse the comparison. 
We want the highest mantissa then.804// Also, between -NAN and -5.0, we want -NAN to be less.805TST(SCRATCHREG1, SCRATCHREG2);806FixupBranch cmpPositive = B_CC(CC_PL);807CMP(SCRATCHREG2, SCRATCHREG1);808FixupBranch skipPositive = B();809SetJumpTarget(cmpPositive);810CMP(SCRATCHREG1, SCRATCHREG2);811SetJumpTarget(skipPositive);812SetCC(CC_AL);813SetJumpTarget(skipNAN);814SetCC(CC_LT);815VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));816SetCC(CC_GE);817VMOV(fpr.V(tempregs[i]), fpr.V(tregs[i]));818SetCC(CC_AL);819break;820}821case 3: // vmax822{823VCMP(fpr.V(tregs[i]), fpr.V(sregs[i]));824VMRS_APSR();825FixupBranch skipNAN = B_CC(CC_VC);826VMOV(SCRATCHREG1, fpr.V(sregs[i]));827VMOV(SCRATCHREG2, fpr.V(tregs[i]));828// If both are negative, we reverse the comparison. We want the lowest mantissa then.829// Also, between -NAN and -5.0, we want -5.0 to be greater.830TST(SCRATCHREG2, SCRATCHREG1);831FixupBranch cmpPositive = B_CC(CC_PL);832CMP(SCRATCHREG1, SCRATCHREG2);833FixupBranch skipPositive = B();834SetJumpTarget(cmpPositive);835CMP(SCRATCHREG2, SCRATCHREG1);836SetJumpTarget(skipPositive);837SetCC(CC_AL);838SetJumpTarget(skipNAN);839SetCC(CC_LT);840VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));841SetCC(CC_GE);842VMOV(fpr.V(tempregs[i]), fpr.V(tregs[i]));843SetCC(CC_AL);844break;845}846case 6: // vsge847DISABLE; // pending testing848VCMP(fpr.V(tregs[i]), fpr.V(sregs[i]));849VMRS_APSR();850// Unordered is always 0.851SetCC(CC_GE);852MOVI2F(fpr.V(tempregs[i]), 1.0f, SCRATCHREG1);853SetCC(CC_LT);854MOVI2F(fpr.V(tempregs[i]), 0.0f, SCRATCHREG1);855SetCC(CC_AL);856break;857case 7: // vslt858DISABLE; // pending testing859VCMP(fpr.V(tregs[i]), fpr.V(sregs[i]));860VMRS_APSR();861// Unordered is always 0.862SetCC(CC_LO);863MOVI2F(fpr.V(tempregs[i]), 1.0f, SCRATCHREG1);864SetCC(CC_HS);865MOVI2F(fpr.V(tempregs[i]), 0.0f, SCRATCHREG1);866SetCC(CC_AL);867break;868}869break;870871default:872DISABLE;873}874}875876for (int i = 0; i < n; i++) {877if (dregs[i] != tempregs[i]) 
{878fpr.MapDirtyInV(dregs[i], tempregs[i]);879VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));880}881}882ApplyPrefixD(dregs, sz);883884fpr.ReleaseSpillLocksAndDiscardTemps();885}886887void ArmJit::Comp_VV2Op(MIPSOpcode op) {888NEON_IF_AVAILABLE(CompNEON_VV2Op);889CONDITIONAL_DISABLE(VFPU_VEC);890if (js.HasUnknownPrefix()) {891DISABLE;892}893894// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure895if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {896return;897}898899// Catch the disabled operations immediately so we don't map registers unnecessarily later.900// Move these down to the big switch below as they are implemented.901switch ((op >> 16) & 0x1f) {902case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin903DISABLE;904break;905case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos906DISABLE;907break;908case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2909DISABLE;910break;911case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2912DISABLE;913break;914case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin915DISABLE;916break;917case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2918DISABLE;919break;920default:921;922}923924VectorSize sz = GetVecSize(op);925int n = GetNumVectorElements(sz);926927u8 sregs[4], dregs[4];928GetVectorRegsPrefixS(sregs, sz, _VS);929GetVectorRegsPrefixD(dregs, sz, _VD);930931MIPSReg tempregs[4];932for (int i = 0; i < n; ++i) {933if (!IsOverlapSafe(dregs[i], i, n, sregs)) {934tempregs[i] = fpr.GetTempV();935} else {936tempregs[i] = dregs[i];937}938}939940// Get some extra temps, used by vasin only.941ARMReg t2 = INVALID_REG, t3 = INVALID_REG, t4 = INVALID_REG;942if (((op >> 16) & 0x1f) == 23) {943// Only get here on vasin.944int t[3] = { fpr.GetTempV(), fpr.GetTempV(), fpr.GetTempV() };945fpr.MapRegV(t[0], MAP_NOINIT);946fpr.MapRegV(t[1], MAP_NOINIT);947fpr.MapRegV(t[2], MAP_NOINIT);948t2 = fpr.V(t[0]);949t3 = fpr.V(t[1]);950t4 = fpr.V(t[2]);951}952953// Pre map the 
registers to get better instruction ordering.954// Note that mapping like this (instead of first all sregs, first all tempregs etc)955// reduces the amount of continuous registers a lot :(956for (int i = 0; i < n; i++) {957fpr.MapDirtyInV(tempregs[i], sregs[i]);958fpr.SpillLockV(tempregs[i]);959fpr.SpillLockV(sregs[i]);960}961962// Warning: sregs[i] and tempxregs[i] may be the same reg.963// Helps for vmov, hurts for vrcp, etc.964for (int i = 0; i < n; i++) {965switch ((op >> 16) & 0x1f) {966case 0: // d[i] = s[i]; break; //vmov967// Probably for swizzle.968VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));969break;970case 1: // d[i] = fabsf(s[i]); break; //vabs971VABS(fpr.V(tempregs[i]), fpr.V(sregs[i]));972break;973case 2: // d[i] = -s[i]; break; //vneg974VNEG(fpr.V(tempregs[i]), fpr.V(sregs[i]));975break;976case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0977if (i == 0) {978MOVI2F(S0, 0.0f, SCRATCHREG1);979MOVI2F(S1, 1.0f, SCRATCHREG1);980}981VCMP(fpr.V(sregs[i]), S0);982VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).983VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));984SetCC(CC_LS);985VMOV(fpr.V(tempregs[i]), S0);986SetCC(CC_AL);987VCMP(fpr.V(sregs[i]), S1);988VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).989SetCC(CC_GT);990VMOV(fpr.V(tempregs[i]), S1);991SetCC(CC_AL);992break;993case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1994if (i == 0) {995MOVI2F(S0, -1.0f, SCRATCHREG1);996MOVI2F(S1, 1.0f, SCRATCHREG1);997}998VCMP(fpr.V(sregs[i]), S0);999VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).1000VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));1001SetCC(CC_LO);1002VMOV(fpr.V(tempregs[i]), S0);1003SetCC(CC_AL);1004VCMP(fpr.V(sregs[i]), S1);1005VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).1006SetCC(CC_GT);1007VMOV(fpr.V(tempregs[i]), S1);1008SetCC(CC_AL);1009break;1010case 16: // d[i] = 1.0f / 
s[i]; break; //vrcp1011if (i == 0) {1012MOVI2F(S0, 1.0f, SCRATCHREG1);1013}1014VDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));1015break;1016case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq1017if (i == 0) {1018MOVI2F(S0, 1.0f, SCRATCHREG1);1019}1020VSQRT(S1, fpr.V(sregs[i]));1021VDIV(fpr.V(tempregs[i]), S0, S1);1022break;1023case 22: // d[i] = sqrtf(s[i]); break; //vsqrt1024VSQRT(fpr.V(tempregs[i]), fpr.V(sregs[i]));1025VABS(fpr.V(tempregs[i]), fpr.V(tempregs[i]));1026break;1027case 23: // d[i] = asinf(s[i] * (float)M_2_PI); break; //vasin1028// Seems to work well enough but can disable if it becomes a problem.1029// Should be easy enough to translate to NEON. There we can load all the constants1030// in one go of course.1031VCMP(fpr.V(sregs[i])); // flags = sign(sregs[i])1032VMRS_APSR();1033MOVI2F(S0, 1.0f, SCRATCHREG1);1034VABS(t4, fpr.V(sregs[i])); // t4 = |sregs[i]|1035VSUB(t3, S0, t4);1036VSQRT(t3, t3); // t3 = sqrt(1 - |sregs[i]|)1037MOVI2F(S1, -0.0187293f, SCRATCHREG1);1038MOVI2F(t2, 0.0742610f, SCRATCHREG1);1039VMLA(t2, t4, S1);1040MOVI2F(S1, -0.2121144f, SCRATCHREG1);1041VMLA(S1, t4, t2);1042MOVI2F(t2, 1.5707288f, SCRATCHREG1);1043VMLA(t2, t4, S1);1044MOVI2F(fpr.V(tempregs[i]), M_PI / 2, SCRATCHREG1);1045VMLS(fpr.V(tempregs[i]), t2, t3); // tr[i] = M_PI / 2 - t2 * t31046{1047FixupBranch br = B_CC(CC_GE);1048VNEG(fpr.V(tempregs[i]), fpr.V(tempregs[i]));1049SetJumpTarget(br);1050}1051// Correction factor for PSP range. 
		// Could be baked into the calculation above?
		MOVI2F(S1, 1.0f / (M_PI / 2), SCRATCHREG1);
		VMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
		break;
	case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
		if (i == 0) {
			// The constant only needs to be loaded once for the whole vector.
			MOVI2F(S0, -1.0f, SCRATCHREG1);
		}
		VDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
		break;
	default:
		ERROR_LOG(Log::JIT, "case missing in vfpu vv2op");
		DISABLE;
		break;
	}
	}

	// Copy results out of any temporaries that were used to avoid src/dst overlap.
	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vi2f: converts signed 32-bit integer lanes to float, scaled by 1/2^imm
// (VCVT with IS_SIGNED, then a VMUL by the precomputed multiplier when imm != 0).
void ArmJit::Comp_Vi2f(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vi2f);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	const float mult = 1.0f / (float)(1UL << imm);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Use temporaries where a destination lane would clobber a source lane
	// that is still needed by a later iteration.
	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Load the scale factor once; skipped entirely when imm == 0 (mult == 1.0).
	if (mult != 1.0f)
		MOVI2F(S0, mult, SCRATCHREG1);

	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		VCVT(fpr.V(tempregs[i]), fpr.V(sregs[i]), TO_FLOAT | IS_SIGNED);
		if (mult != 1.0f)
			VMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vh2f: expands packed 16-bit half floats into 32-bit floats,
// doubling the vector size (single -> pair, pair -> quad). Requires VFPv4
// half-precision support; otherwise falls back to the interpreter.
void ArmJit::Comp_Vh2f(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vh2f);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	// This multi-VCVT.F32.F16 is only available in the VFPv4 extension.
	// The VFPv3 one is VCVTB, VCVTT which we don't yet have support for.
	if (!(cpu_info.bHalf && cpu_info.bVFPv4)) {
		// No hardware support for half-to-float, fallback to interpreter
		// TODO: Translate the fast SSE solution to standard integer/VFP stuff
		// for the weaker CPUs.
		DISABLE;
	}

	u8 sregs[4], dregs[4];
	VectorSize sz = GetVecSize(op);
	VectorSize outSz;

	switch (sz) {
	case V_Single:
		outSz = V_Pair;
		break;
	case V_Pair:
		outSz = V_Quad;
		break;
	default:
		DISABLE;
	}

	int n = GetNumVectorElements(sz);
	int nOut = n * 2;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outSz, _VD);

	// S0..S3 overlay D0/D1 (= Q0), which is what VCVTF32F16 operates on below.
	static const ARMReg tmp[4] = { S0, S1, S2, S3 };

	for (int i = 0; i < n; i++) {
		fpr.MapRegV(sregs[i], sz);
		VMOV(tmp[i], fpr.V(sregs[i]));
	}

	// This always converts four 16-bit floats in D0 to four 32-bit floats
	// in Q0. If we are dealing with a pair here, we just ignore the upper two outputs.
	// There are also a couple of other instructions that do it one at a time but doesn't
	// seem worth the trouble.
	VCVTF32F16(Q0, D0);

	for (int i = 0; i < nOut; i++) {
		fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
		VMOV(fpr.V(dregs[i]), tmp[i]);
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vf2i variants (float -> scaled integer). Currently fully disabled
// (unconditional DISABLE below the prefix check) — only the truncating "z"
// variant (case 17) was ever implemented; n/u/d rounding modes are TODO.
void ArmJit::Comp_Vf2i(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vf2i);
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix()) {
		DISABLE;
	}
	DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	float mult = (float)(1ULL << imm);

	switch ((op >> 21) & 0x1f)
	{
	case 17:
		break; //z - truncate. Easy to support.
	case 16:
	case 18:
	case 19:
		DISABLE;
		break;
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	if (mult != 1.0f)
		MOVI2F(S1, mult, SCRATCHREG1);

	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		switch ((op >> 21) & 0x1f) {
		case 16: /* TODO */ break; //n
		case 17:
			if (mult != 1.0f) {
				VMUL(S0, fpr.V(sregs[i]), S1);
				VCVT(fpr.V(tempregs[i]), S0, TO_INT | ROUND_TO_ZERO);
			} else {
				VCVT(fpr.V(tempregs[i]), fpr.V(sregs[i]), TO_INT | ROUND_TO_ZERO);
			}
			break;
		case 18: /* TODO */ break; //u
		case 19: /* TODO */ break; //d
		}
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles mfv/mfvc (VFPU reg/control -> GPR, case 3) and mtv/mtvc
// (GPR -> VFPU reg/control, case 7). imm < 128 addresses a VFPU register,
// 128..128+VFPU_CTRL_MAX a control register; VFPU_CTRL_CC is kept in a
// dedicated GPR (MIPS_REG_VFPUCC) instead of the context block.
void ArmJit::Comp_Mftv(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Mftv);
	CONDITIONAL_DISABLE(VFPU_XFER);

	int imm = op & 0xFF;
	MIPSGPReg rt = _RT;
	switch ((op >> 21) & 0x1f) {
	case 3: //mfv / mfvc
		// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
		if (rt != 0) {
			if (imm < 128) {  //R(rt) = VI(imm);
				fpr.MapRegV(imm, 0);
				gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
				VMOV(gpr.R(rt), fpr.V(imm));
			} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
				if (imm - 128 == VFPU_CTRL_CC) {
					if (gpr.IsImm(MIPS_REG_VFPUCC)) {
						gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
					} else {
						gpr.MapDirtyIn(rt, MIPS_REG_VFPUCC);
						MOV(gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
					}
				} else {
					// In case we have a saved prefix.
					FlushPrefixV();
					gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
					LDR(gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
				}
			} else {
				//ERROR - maybe need to make this value too an "interlock" value?
				ERROR_LOG(Log::CPU, "mfv - invalid register %i", imm);
			}
		}
		break;

	case 7: // mtv
		if (imm < 128) {
			gpr.MapReg(rt);
			fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
			VMOV(fpr.V(imm), gpr.R(rt));
		} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
			if (imm - 128 == VFPU_CTRL_CC) {
				if (gpr.IsImm(rt)) {
					gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
				} else {
					gpr.MapDirtyIn(MIPS_REG_VFPUCC, rt);
					MOV(gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
				}
			} else {
				gpr.MapReg(rt);
				STR(gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128));
			}

			// TODO: Optimization if rt is Imm?
			// Set these BEFORE disable!
			// Writing a prefix control register invalidates the statically-tracked
			// prefix state for the rest of this block.
			if (imm - 128 == VFPU_CTRL_SPREFIX) {
				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			}
		} else {
			//ERROR
			_dbg_assert_msg_(false,"mtv - invalid register");
		}
		break;

	default:
		DISABLE;
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vmfvc: copies a VFPU control register into VFPU register vd.
// Out-of-range control indices yield 0.0f.
void ArmJit::Comp_Vmfvc(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vmtvc);
	CONDITIONAL_DISABLE(VFPU_XFER);

	int vd = _VD;
	int imm = (op >> 8) & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vd);
		if (imm == VFPU_CTRL_CC) {
			// CC lives in a dedicated GPR, not in the vfpuCtrl array.
			gpr.MapReg(MIPS_REG_VFPUCC, 0);
			VMOV(fpr.V(vd), gpr.R(MIPS_REG_VFPUCC));
		} else {
			ADDI2R(SCRATCHREG1, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCHREG2);
			VLDR(fpr.V(vd), SCRATCHREG1, 0);
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	} else {
		fpr.MapRegV(vd);
		MOVI2F(fpr.V(vd), 0.0f, SCRATCHREG1);
	}
}

// Compiles vmtvc: copies VFPU register vs into a VFPU control register,
// invalidating tracked prefix state when a prefix register is written.
void ArmJit::Comp_Vmtvc(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vmtvc);
	CONDITIONAL_DISABLE(VFPU_XFER);

	int vs = _VS;
	int imm = op & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vs);
		if (imm == VFPU_CTRL_CC) {
			gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY | MAP_NOINIT);
			VMOV(gpr.R(MIPS_REG_VFPUCC), fpr.V(vs));
		} else {
			ADDI2R(SCRATCHREG1, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + imm * 4, SCRATCHREG2);
			VSTR(fpr.V(vs), SCRATCHREG1, 0);
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();

		if (imm == VFPU_CTRL_SPREFIX) {
			js.prefixSFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_TPREFIX) {
			js.prefixTFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_DPREFIX) {
			js.prefixDFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		}
	}
}

// Compiles vmmov: matrix copy. No-op when src == dst; falls back to the
// interpreter when source and destination share a matrix (overlap).
void ArmJit::Comp_Vmmov(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vmmov);
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);

	// This probably ignores prefixes for all sane intents and purposes.
	if (_VS == _VD) {
		// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
		return;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, _VS);
	GetMatrixRegs(dregs, sz, _VD);

	// Rough overlap check.
	bool overlap = false;
	if (GetMtx(_VS) == GetMtx(_VD)) {
		// Potential overlap (guaranteed for 3x3 or more).
		overlap = true;
	}

	if (overlap) {
		// Not so common, fallback.
		DISABLE;
	} else {
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				fpr.MapDirtyInV(dregs[a * 4 + b], sregs[a * 4 + b]);
				VMOV(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[a * 4 + b]));
			}
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	}
}

// Compiles vscl: multiplies each lane of vector vs by the single scalar vt.
void ArmJit::Comp_VScl(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_VScl);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4], treg;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	// TODO: Prefixes seem strange...
	GetVectorRegsPrefixT(&treg, V_Single, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Move to S0 early, so we don't have to worry about overlap with scale.
	fpr.LoadToRegV(S0, treg);

	// For prefixes to work, we just have to ensure that none of the output registers spill
	// and that there's no overlap.
	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			// Need to use temp regs
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// The meat of the function!
	for (int i = 0; i < n; i++) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		VMUL(fpr.V(tempregs[i]), fpr.V(sregs[i]), S0);
	}

	for (int i = 0; i < n; i++) {
		// All must be mapped for prefixes to work.
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vmmul: matrix * matrix, accumulating each output element in S0
// with VMUL + chained VMLA. Falls back when outputs overlap inputs, or when
// compat requires the interpreter's more accurate implementation.
void ArmJit::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}
	NEON_IF_AVAILABLE(CompNEON_Vmmul);

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, _VS);
	GetMatrixRegs(tregs, sz, _VT);
	GetMatrixRegs(dregs, sz, _VD);

	// Rough overlap check.
	bool overlap = false;
	if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) {
		// Potential overlap (guaranteed for 3x3 or more).
		overlap = true;
	}

	if (overlap) {
		DISABLE;
	} else {
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				// Dot product of row/column, accumulated in S0.
				fpr.MapInInV(sregs[b * 4], tregs[a * 4]);
				VMUL(S0, fpr.V(sregs[b * 4]), fpr.V(tregs[a * 4]));
				for (int c = 1; c < n; c++) {
					fpr.MapInInV(sregs[b * 4 + c], tregs[a * 4 + c]);
					VMLA(S0, fpr.V(sregs[b * 4 + c]), fpr.V(tregs[a * 4 + c]));
				}
				fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT);
				VMOV(fpr.V(dregs[a * 4 + b]), S0);
			}
		}
		fpr.ReleaseSpillLocksAndDiscardTemps();
	}
}

// vmscl (matrix * scalar): not implemented in this JIT, interpreter fallback.
void ArmJit::Comp_Vmscl(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vmscl);
	DISABLE;
}

// Compiles vtfm/vhtfm: matrix-by-vector transform. For the homogeneous
// variant (n == ins), the vector is extended with an implicit 1.0 — handled
// by adding the matrix column directly (VADD) instead of multiplying.
void ArmJit::Comp_Vtfm(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vtfm);
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	// TODO: This probably ignores prefixes? Or maybe uses D?

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz) + 1);
		msz = (MatrixSize)((int)(msz) + 1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}

	u8 sregs[16], dregs[4], tregs[4];
	GetMatrixRegs(sregs, msz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// TODO: test overlap, optimize.
	int tempregs[4];
	for (int i = 0; i < n; i++) {
		fpr.MapInInV(sregs[i * 4], tregs[0]);
		VMUL(S0, fpr.V(sregs[i * 4]), fpr.V(tregs[0]));
		for (int k = 1; k < n; k++) {
			if (!homogenous || k != n - 1) {
				fpr.MapInInV(sregs[i * 4 + k], tregs[k]);
				VMLA(S0, fpr.V(sregs[i * 4 + k]), fpr.V(tregs[k]));
			} else {
				// Homogeneous last component: the implicit 1.0 turns the
				// multiply-accumulate into a plain add of the matrix element.
				fpr.MapRegV(sregs[i * 4 + k]);
				VADD(S0, S0, fpr.V(sregs[i * 4 + k]));
			}
		}

		// Results go to temporaries first: dregs may overlap sregs/tregs.
		int temp = fpr.GetTempV();
		fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
		fpr.SpillLockV(temp);
		VMOV(fpr.V(temp), S0);
		tempregs[i] = temp;
	}
	for (int i = 0; i < n; i++) {
		u8 temp = tempregs[i];
		fpr.MapRegV(dregs[i], MAP_NOINIT | MAP_DIRTY);
		VMOV(fpr.V(dregs[i]), fpr.V(temp));
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// vcrs: not implemented in this JIT, interpreter fallback.
void ArmJit::Comp_VCrs(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_VCrs);
	DISABLE;
}

// vdet: not implemented in this JIT, interpreter fallback.
void ArmJit::Comp_VDet(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_VDet);
	DISABLE;
}

// Compiles vi2c/vi2s (and would handle vi2uc/vi2us, currently disabled):
// packs the high bits of 32-bit integer lanes into 8-bit or 16-bit lanes,
// using NEON narrowing shifts on Q0/D0.
void ArmJit::Comp_Vi2x(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vi2x);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

	if (unsignedOp) {
		// Requires a tricky clamp operation that we can't do without more temps, see below
		DISABLE;
	}

	// These instructions pack pairs or quads of integers into 32 bits.
	// The unsigned (u) versions skip the sign bit when packing.
	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	// First, let's assemble the sregs into lanes of either D0 (pair) or Q0 (quad).
	bool quad = sz == V_Quad;
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	VMOV(S0, fpr.V(sregs[0]));
	VMOV(S1, fpr.V(sregs[1]));
	if (quad) {
		VMOV(S2, fpr.V(sregs[2]));
		VMOV(S3, fpr.V(sregs[3]));
	}

	// TODO: For "u" type ops, we clamp to zero and shift off the sign bit first.
	// Need some temp regs to do that efficiently, right?

	// At this point, we simply need to collect the high bits of each 32-bit lane into one register.
	if (bits == 8) {
		// Really want to do a VSHRN(..., 24) but that can't be encoded. So we synthesize it.
		VSHR(I_32, Q0, Q0, 16);
		VSHRN(I_32, D0, Q0, 8);
		VMOVN(I_16, D0, Q0);
	} else {
		VSHRN(I_32, D0, Q0, 16);
	}

	fpr.MapRegsAndSpillLockV(dregs, outsize, MAP_DIRTY|MAP_NOINIT);
	VMOV(fpr.V(dregs[0]), S0);
	if (outsize == V_Pair) {
		VMOV(fpr.V(dregs[1]), S1);
	}

	ApplyPrefixD(dregs, outsize);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vc2i/vs2i/vus2i (vuc2i disabled): unpacks 8- or 16-bit integer
// lanes into the top bits of 32-bit lanes via NEON widening shifts; the
// unsigned variants then shift right by one to clear the sign bit.
void ArmJit::Comp_Vx2i(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vx2i);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)

	if (bits == 8 && unsignedOp) {
		// vuc2i is odd and needs temp registers for implementation.
		DISABLE;
	}
	// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
	// at the top. vus2i shifts it an extra bit right afterward.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
	// at the top too. vuc2i is a bit special (see below.)
	// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
	// then use it for both.

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	if (sz == V_Single) {
		VMOV(S0, fpr.V(sregs[0]));
	} else if (sz == V_Pair) {
		VMOV(S0, fpr.V(sregs[0]));
		VMOV(S1, fpr.V(sregs[1]));
	} else if (bits == 8) {
		// For some reason, sz is quad on vc2i.
		VMOV(S0, fpr.V(sregs[0]));
	}

	if (bits == 16) {
		// Simply expand, to upper bits.
		VSHLL(I_16, Q0, D0, 16);
	} else if (bits == 8) {
		if (unsignedOp) {
			// vuc2i is a bit special. It spreads out the bits like this:
			// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
			// TODO
		} else {
			VSHLL(I_8, Q0, D0, 8);
			VSHLL(I_16, Q0, D0, 16);
		}
	}

	// At this point we have the regs in the 4 lanes.
	// In the "u" mode, we need to shift it out of the sign bit.
	if (unsignedOp) {
		ArmGen::ARMReg reg = (outsize == V_Quad) ? Q0 : D0;
		VSHR(I_32 | I_UNSIGNED, reg, reg, 1);
	}

	fpr.MapRegsAndSpillLockV(dregs, outsize, MAP_NOINIT);

	VMOV(fpr.V(dregs[0]), S0);
	VMOV(fpr.V(dregs[1]), S1);
	if (outsize == V_Quad) {
		VMOV(fpr.V(dregs[2]), S2);
		VMOV(fpr.V(dregs[3]), S3);
	}

	ApplyPrefixD(dregs, outsize);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vcrsp.t (triple: cross product) and vqmul.q (quad: quaternion
// product), computing components into S0/S1 and temps before writing dregs
// so that input and output overlap is safe.
void ArmJit::Comp_VCrossQuat(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_VCrossQuat);
	// This op does not support prefixes anyway.
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegs(sregs, sz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// Map everything into registers.
	fpr.MapRegsAndSpillLockV(sregs, sz, 0);
	fpr.MapRegsAndSpillLockV(tregs, sz, 0);

	if (sz == V_Triple) {
		MIPSReg temp3 = fpr.GetTempV();
		fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
		// Cross product vcrsp.t

		// Compute X
		VMUL(S0, fpr.V(sregs[1]), fpr.V(tregs[2]));
		VMLS(S0, fpr.V(sregs[2]), fpr.V(tregs[1]));

		// Compute Y
		VMUL(S1, fpr.V(sregs[2]), fpr.V(tregs[0]));
		VMLS(S1, fpr.V(sregs[0]), fpr.V(tregs[2]));

		// Compute Z
		VMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
		VMLS(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]));

		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
		VMOV(fpr.V(dregs[0]), S0);
		VMOV(fpr.V(dregs[1]), S1);
		VMOV(fpr.V(dregs[2]), fpr.V(temp3));
	} else if (sz == V_Quad) {
		MIPSReg temp3 = fpr.GetTempV();
		MIPSReg temp4 = fpr.GetTempV();
		fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
		fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);

		// Quaternion product  vqmul.q  untested
		// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];
		VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[3]));
		VMLA(S0, fpr.V(sregs[1]), fpr.V(tregs[2]));
		VMLS(S0, fpr.V(sregs[2]), fpr.V(tregs[1]));
		VMLA(S0, fpr.V(sregs[3]), fpr.V(tregs[0]));

		//d[1] = -s[0] * t[2] + s[1] * t[3] + s[2] * t[0] + s[3] * t[1];
		VNMUL(S1, fpr.V(sregs[0]), fpr.V(tregs[2]));
		VMLA(S1, fpr.V(sregs[1]), fpr.V(tregs[3]));
		VMLA(S1, fpr.V(sregs[2]), fpr.V(tregs[0]));
		VMLA(S1, fpr.V(sregs[3]), fpr.V(tregs[1]));

		//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];
		VMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
		VMLS(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]));
		VMLA(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[3]));
		VMLA(fpr.V(temp3), fpr.V(sregs[3]), fpr.V(tregs[2]));

		//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];
		VNMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[0]));
		VMLS(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[1]));
		VMLS(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[2]));
		VMLA(fpr.V(temp4), fpr.V(sregs[3]), fpr.V(tregs[3]));

		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
		VMOV(fpr.V(dregs[0]), S0);
		VMOV(fpr.V(dregs[1]), S1);
		VMOV(fpr.V(dregs[2]), fpr.V(temp3));
		VMOV(fpr.V(dregs[3]), fpr.V(temp4));
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vcmp: per-lane comparisons accumulated into the VFPU CC register.
// Builds the per-lane result bits in SCRATCHREG1 under condition codes, then
// computes the "all" (bit 5) and "any" (bit 4) aggregate bits and merges the
// affected bits into MIPS_REG_VFPUCC.
void ArmJit::Comp_Vcmp(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vcmp);
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	VCondition cond = (VCondition)(op & 0xF);

	u8 sregs[4], tregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);

	// Some, we just fall back to the interpreter.
	// ES is just really equivalent to (value & 0x7F800000) == 0x7F800000.

	switch (cond) {
	case VC_EI: // c = my_isinf(s[i]); break;
	case VC_NI: // c = !my_isinf(s[i]); break;
		DISABLE;
	case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;   // Tekken Dark Resurrection
	case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
	case VC_EN: // c = my_isnan(s[i]); break;
	case VC_NN: // c = !my_isnan(s[i]); break;
		if (_VS != _VT)
			DISABLE;
		break;

	case VC_EZ:
	case VC_NZ:
		break;
	default:
		;
	}

	// First, let's get the trivial ones.
	int affected_bits = (1 << 4) | (1 << 5); // 4 and 5

	MOVI2R(SCRATCHREG1, 0);
	for (int i = 0; i < n; ++i) {
		// Let's only handle the easy ones, and fall back on the interpreter for the rest.
		CCFlags flag = CC_AL;
		switch (cond) {
		case VC_FL: // c = 0;
			break;

		case VC_TR: // c = 1
			if (i == 0) {
				if (n == 1) {
					MOVI2R(SCRATCHREG1, 0x31);
				} else {
					MOVI2R(SCRATCHREG1, 1 << i);
				}
			} else {
				ORR(SCRATCHREG1, SCRATCHREG1, 1 << i);
			}
			break;

		case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;   // Tekken Dark Resurrection
		case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;
			// For these, we use the integer ALU as there is no support on ARM for testing for INF.
			// Testing for nan or inf is the same as testing for &= 0x7F800000 == 0x7F800000.
			// We need an extra temporary register so we store away SCRATCHREG1.
			STR(SCRATCHREG1, CTXREG, offsetof(MIPSState, temp));
			fpr.MapRegV(sregs[i], 0);
			MOVI2R(SCRATCHREG1, 0x7F800000);
			VMOV(SCRATCHREG2, fpr.V(sregs[i]));
			AND(SCRATCHREG2, SCRATCHREG2, SCRATCHREG1);
			CMP(SCRATCHREG2, SCRATCHREG1);   // (SCRATCHREG2 & 0x7F800000) == 0x7F800000
			flag = cond == VC_ES ? CC_EQ : CC_NEQ;
			LDR(SCRATCHREG1, CTXREG, offsetof(MIPSState, temp));
			break;

		case VC_EN: // c = my_isnan(s[i]); break;  // Tekken 6
			// Should we involve T? Where I found this used, it compared a register with itself so should be fine.
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_VS;  // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
			break;

		case VC_NN: // c = !my_isnan(s[i]); break;
			// Should we involve T? Where I found this used, it compared a register with itself so should be fine.
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_VC;  // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
			break;

		case VC_EQ: // c = s[i] == t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_EQ;
			break;

		case VC_LT: // c = s[i] < t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_LO;
			break;

		case VC_LE: // c = s[i] <= t[i];
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_LS;
			break;

		case VC_NE: // c = s[i] != t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_NEQ;
			break;

		case VC_GE: // c = s[i] >= t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_GE;
			break;

		case VC_GT: // c = s[i] > t[i]
			fpr.MapInInV(sregs[i], tregs[i]);
			VCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
			VMRS_APSR();
			flag = CC_GT;
			break;

		case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f
			fpr.MapRegV(sregs[i]);
			VCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
			VMRS_APSR();
			flag = CC_EQ;
			break;

		case VC_NZ: // c = s[i] != 0
			fpr.MapRegV(sregs[i]);
			VCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
			VMRS_APSR();
			flag = CC_NEQ;
			break;

		default:
			DISABLE;
		}
		if (flag != CC_AL) {
			SetCC(flag);
			if (i == 0) {
				if (n == 1) {
					MOVI2R(SCRATCHREG1, 0x31);
				} else {
					MOVI2R(SCRATCHREG1, 1); // 1 << i, but i == 0
				}
			} else {
				ORR(SCRATCHREG1, SCRATCHREG1, 1 << i);
			}
			SetCC(CC_AL);
		}

		affected_bits |= 1 << i;
	}

	// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison, which is the most common
	// after all.
	if (n > 1) {
		CMP(SCRATCHREG1, affected_bits & 0xF);
		SetCC(CC_EQ);
		ORR(SCRATCHREG1, SCRATCHREG1, 1 << 5);
		SetCC(CC_AL);

		CMP(SCRATCHREG1, 0);
		SetCC(CC_NEQ);
		ORR(SCRATCHREG1, SCRATCHREG1, 1 << 4);
		SetCC(CC_AL);
	}

	gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);
	BIC(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), affected_bits);
	ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCHREG1);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vcmovt/vcmovf: conditionally copies lanes from vs to vd based on
// VFPU CC bits. imm3 < 6 tests a single CC bit for all lanes; imm3 >= 6
// tests CC bit i for each lane i individually.
void ArmJit::Comp_Vcmov(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vcmov);
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);
	int tf = (op >> 19) & 1;
	int imm3 = (op >> 16) & 7;

	for (int i = 0; i < n; ++i) {
		// Simplification: Disable if overlap unsafe
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			DISABLE;
		}
	}

	if (imm3 < 6) {
		// Test one bit of CC. This bit decides whether none or all subregisters are copied.
		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
		fpr.MapRegsAndSpillLockV(sregs, sz, 0);
		gpr.MapReg(MIPS_REG_VFPUCC);
		TST(gpr.R(MIPS_REG_VFPUCC), 1 << imm3);
		SetCC(tf ? CC_EQ : CC_NEQ);
		for (int i = 0; i < n; i++) {
			VMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
		}
		SetCC(CC_AL);
	} else {
		// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
		fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
		fpr.MapRegsAndSpillLockV(sregs, sz, 0);
		gpr.MapReg(MIPS_REG_VFPUCC);
		for (int i = 0; i < n; i++) {
			TST(gpr.R(MIPS_REG_VFPUCC), 1 << i);
			SetCC(tf ? CC_EQ : CC_NEQ);
			VMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
			SetCC(CC_AL);
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles viim: loads a sign-extended 16-bit immediate into vt as a float.
void ArmJit::Comp_Viim(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Viim);
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	u8 dreg;
	GetVectorRegs(&dreg, V_Single, _VT);

	s32 imm = SignExtend16ToS32(op);
	fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
	MOVI2F(fpr.V(dreg), (float)imm, SCRATCHREG1);

	ApplyPrefixD(&dreg, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vfim: loads a 16-bit half-float immediate into vt, converted to
// 32-bit float at compile time.
void ArmJit::Comp_Vfim(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vfim);
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	u8 dreg;
	GetVectorRegs(&dreg, V_Single, _VT);

	FP16 half;
	half.u = op & 0xFFFF;
	FP32 fval = half_to_float_fast5(half);
	fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
	MOVI2F(fpr.V(dreg), fval.f, SCRATCHREG1);

	ApplyPrefixD(&dreg, V_Single);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vcst: broadcasts one of the VFPU's built-in constants
// (from the cst_constants table) into every lane of vd.
void ArmJit::Comp_Vcst(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vcst);
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	int conNum = (op >> 16) & 0x1f;
	int vd = _VD;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Load the constant from the host-side table, then fan it out.
	gpr.SetRegImm(SCRATCHREG1, (u32)(void *)&cst_constants[conNum]);
	VLDR(S0, SCRATCHREG1, 0);
	for (int i = 0; i < n; ++i)
		VMOV(fpr.V(dregs[i]), S0);

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Helper for Comp_VRot: computes sin/cos of the angle, packed into one
// double so the result fits the softfp R0/R1 (or hardfp D0) return.
static double SinCos(float angle) {
	union { struct { float sin; float cos; }; double out; } sincos;
	vfpu_sincos(angle, sincos.sin, sincos.cos);
	return sincos.out;
}

// Same as SinCos, but with the sine negated (for the vrot negsin forms).
static double SinCosNegSin(float angle) {
	union { struct { float sin; float cos; }; double out; } sincos;
	vfpu_sincos(angle, sincos.sin, sincos.cos);
	sincos.sin = -sincos.sin;
	return sincos.out;
}

// Scatters the sin (S0) / cos (S1) results of a vrot into the destination
// lanes according to the imm field: 'S' lanes get sin (negated when negSin),
// 'C' lanes get cos, '0' lanes get zero.
void ArmJit::CompVrotShuffle(u8 *dregs, int imm, VectorSize sz, bool negSin) {
	int n = GetNumVectorElements(sz);
	char what[4] = {'0', '0', '0', '0'};
	if (((imm >> 2) & 3) == (imm & 3)) {
		// Sin and cos index coincide: every lane gets sin.
		for (int i = 0; i < 4; i++)
			what[i] = 'S';
	}
	what[(imm >> 2) & 3] = 'S';
	what[imm & 3] = 'C';

	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY | MAP_NOINIT);
	for (int i = 0; i < n; i++) {
		switch (what[i]) {
		case 'C': VMOV(fpr.V(dregs[i]), S1); break;
		case 'S': if (negSin) VNEG(fpr.V(dregs[i]), S0); else VMOV(fpr.V(dregs[i]), S0); break;
		case '0':
			{
				MOVI2F(fpr.V(dregs[i]), 0.0f, SCRATCHREG1);
				break;
			}
		default:
			ERROR_LOG(Log::JIT, "Bad what in vrot");
			break;
		}
	}
}

// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
// calling the math library.
// Apparently this may not work on hardfp. I don't think we have any platforms using this though.
//
// Compiles vrot: calls the SinCos/SinCosNegSin helper for the angle, then
// shuffles sin/cos/0 into the destination lanes. When the next instruction
// is a vrot on the same source, both are compiled together and the second
// instruction is eaten.
void ArmJit::Comp_VRot(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_VRot);
	// VRot probably doesn't accept prefixes anyway.
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

#if PPSSPP_ARCH(ARM_HARDFP)
	DISABLE;
#endif

	int vd = _VD;
	int vs = _VS;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	u8 dregs2[4];

	MIPSOpcode nextOp = GetOffsetInstruction(1);
	int vd2 = -1;
	int imm2 = -1;
	if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
		// Pair of vrot. Let's join them.
		vd2 = MIPS_GET_VD(nextOp);
		imm2 = (nextOp >> 16) & 0x1f;
		// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
	}
	u8 sreg;
	GetVectorRegs(dregs, sz, vd);
	if (vd2 >= 0)
		GetVectorRegs(dregs2, sz, vd2);
	GetVectorRegs(&sreg, V_Single, vs);

	int imm = (op >> 16) & 0x1f;

	gpr.FlushBeforeCall();
	fpr.FlushAll();

	bool negSin1 = (imm & 0x10) ? true : false;

	fpr.MapRegV(sreg);
	// We should write a custom pure-asm function instead.
#if defined(__ARM_PCS_VFP) // Hardfp
	VMOV(S0, fpr.V(sreg));
#else // Softfp
	VMOV(R0, fpr.V(sreg));
#endif
	// FlushBeforeCall saves R1.
	QuickCallFunction(R1, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);
#if !defined(__ARM_PCS_VFP)
	// Returns D0 on hardfp and R0,R1 on softfp due to union joining the two floats
	VMOV(D0, R0, R1);
#endif
	CompVrotShuffle(dregs, imm, sz, false);
	if (vd2 != -1) {
		// If the negsin setting differs between the two joint invocations, we need to flip the second one.
		bool negSin2 = (imm2 & 0x10) ? true : false;
		CompVrotShuffle(dregs2, imm2, sz, negSin1 != negSin2);
		EatInstruction(nextOp);
	}

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vsgn: per lane, produces +/-1.0 for nonzero inputs (sign copied
// from the input's sign bit) and 0.0 for zero, built with integer ops under
// condition codes after a compare-with-zero.
void ArmJit::Comp_Vsgn(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vsgn);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		fpr.MapDirtyInV(tempregs[i], sregs[i]);
		VCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
		VMOV(SCRATCHREG1, fpr.V(sregs[i]));
		VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
		SetCC(CC_NEQ);
		// Nonzero: keep the sign bit, set the exponent/mantissa of 1.0f.
		AND(SCRATCHREG1, SCRATCHREG1, AssumeMakeOperand2(0x80000000));
		ORR(SCRATCHREG1, SCRATCHREG1, AssumeMakeOperand2(0x3F800000));
		SetCC(CC_EQ);
		// Zero input: result is +0.0.
		MOV(SCRATCHREG1, AssumeMakeOperand2(0x0));
		SetCC(CC_AL);
		VMOV(fpr.V(tempregs[i]), SCRATCHREG1);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Compiles vocp (d[i] = 1.0f - s[i]) by forcing prefix bits so the op
// becomes an add of a negated source and the constant 1.
void ArmJit::Comp_Vocp(MIPSOpcode op) {
	NEON_IF_AVAILABLE(CompNEON_Vocp);
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// This is a hack that modifies prefixes.  We eat them later, so just overwrite.
	// S prefix forces the negate flags.
	js.prefixS |= 0x000F0000;
	// T prefix forces constants on and regnum to 1.
	// That means negate still works, and abs activates a different constant.
	js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	// Note: T uses _VS as well; with the forced T prefix above, the actual
	// register number is overridden to the constant 1.
	GetVectorRegsPrefixT(tregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	MIPSReg tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
		VADD(fpr.V(tempregs[i]), fpr.V(tregs[i]), fpr.V(sregs[i]));
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapDirtyInV(dregs[i], tempregs[i]);
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocksAndDiscardTemps();
}

// Color conversion ops: not implemented in this JIT, interpreter fallback.
void ArmJit::Comp_ColorConv(MIPSOpcode op) {
	DISABLE;
}

// vbfy1/vbfy2 (butterfly): not implemented in this JIT, interpreter fallback.
void ArmJit::Comp_Vbfy(MIPSOpcode op) {
	DISABLE;
}
}  // closes the namespace opened before this excerpt

#endif // PPSSPP_ARCH(ARM)