CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/x86/CompVFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617// Table 13.10 in http://agner.org/optimize/optimizing_assembly.pdf is cool - generate constants with18// short instruction sequences. Surprisingly many are possible.1920#include "ppsspp_config.h"21#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)2223#include <cmath>24#include <limits>25#include <emmintrin.h>2627#include "Common/Math/math_util.h"2829#include "Common/CPUDetect.h"30#include "Common/Log.h"31#include "Core/Compatibility.h"32#include "Core/Config.h"33#include "Core/MemMap.h"34#include "Core/Reporting.h"35#include "Core/System.h"36#include "Core/MIPS/MIPSAnalyst.h"37#include "Core/MIPS/MIPSCodeUtils.h"38#include "Core/MIPS/MIPSVFPUUtils.h"39#include "Core/MIPS/x86/Jit.h"40#include "Core/MIPS/x86/RegCache.h"4142// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.43// Currently known non working ones should have DISABLE.4445// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }46#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }47#define DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }4849#define _RS MIPS_GET_RS(op)50#define _RT MIPS_GET_RT(op)51#define _RD MIPS_GET_RD(op)52#define _FS 
MIPS_GET_FS(op)53#define _FT MIPS_GET_FT(op)54#define _FD MIPS_GET_FD(op)55#define _SA MIPS_GET_SA(op)56#define _POS ((op>> 6) & 0x1F)57#define _SIZE ((op>>11) & 0x1F)58#define _IMM16 (signed short)(op & 0xFFFF)59#define _IMM26 (op & 0x03FFFFFF)6061namespace MIPSComp62{63using namespace Gen;64using namespace X64JitConstants;6566static const float one = 1.0f;67static const float minus_one = -1.0f;6869alignas(16) const u32 noSignMask[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};70alignas(16) const u32 signBitAll[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};71alignas(16) const u32 signBitLower[4] = {0x80000000, 0, 0, 0};72alignas(16) const float oneOneOneOne[4] = {1.0f, 1.0f, 1.0f, 1.0f};73alignas(16) const u32 fourinfnan[4] = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};74alignas(16) const float identityMatrix[4][4] = { { 1.0f, 0, 0, 0 }, { 0, 1.0f, 0, 0 }, { 0, 0, 1.0f, 0 }, { 0, 0, 0, 1.0f} };7576void Jit::Comp_VPFX(MIPSOpcode op)77{78CONDITIONAL_DISABLE(VFPU_XFER);79int data = op & 0xFFFFF;80int regnum = (op >> 24) & 3;81switch (regnum) {82case 0: // S83js.prefixS = data;84js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;85break;86case 1: // T87js.prefixT = data;88js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;89break;90case 2: // D91js.prefixD = data & 0x00000FFF;92js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;93break;94}95}9697void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {98if (prefix == 0xE4) return;99100int n = GetNumVectorElements(sz);101u8 origV[4];102static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};103104for (int i = 0; i < n; i++)105origV[i] = vregs[i];106107for (int i = 0; i < n; i++) {108int regnum = (prefix >> (i*2)) & 3;109int abs = (prefix >> (8+i)) & 1;110int negate = (prefix >> (16+i)) & 1;111int constants = (prefix >> (12+i)) & 1;112113// Unchanged, hurray.114if (!constants && regnum == i && !abs && !negate)115continue;116117// This puts the value into a temp reg, so we 
won't write the modified value back.118vregs[i] = fpr.GetTempV();119fpr.MapRegV(vregs[i], MAP_NOINIT | MAP_DIRTY);120121if (!constants) {122// Prefix may say "z, z, z, z" but if this is a pair, we force to x.123// TODO: But some ops seem to use const 0 instead?124if (regnum >= n) {125ERROR_LOG_REPORT(Log::CPU, "Invalid VFPU swizzle: %08x / %d", prefix, sz);126regnum = 0;127}128fpr.SimpleRegV(origV[regnum], 0);129MOVSS(fpr.VX(vregs[i]), fpr.V(origV[regnum]));130if (abs) {131if (RipAccessible(&noSignMask)) {132ANDPS(fpr.VX(vregs[i]), M(&noSignMask)); // rip accessible133} else {134MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));135ANDPS(fpr.VX(vregs[i]), MatR(TEMPREG));136}137}138} else {139if (RipAccessible(constantArray)) {140MOVSS(fpr.VX(vregs[i]), M(&constantArray[regnum + (abs << 2)])); // rip accessible141} else {142MOV(PTRBITS, R(TEMPREG), ImmPtr(&constantArray[regnum + (abs << 2)]));143MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));144}145}146147if (negate) {148if (RipAccessible(&signBitLower)) {149XORPS(fpr.VX(vregs[i]), M(&signBitLower)); // rip accessible150} else {151MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));152XORPS(fpr.VX(vregs[i]), MatR(TEMPREG));153}154}155// TODO: This probably means it will swap out soon, inefficiently...156fpr.ReleaseSpillLockV(vregs[i]);157}158}159160void Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {161_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);162163GetVectorRegs(regs, sz, vectorReg);164if (js.prefixD == 0)165return;166167int n = GetNumVectorElements(sz);168for (int i = 0; i < n; i++) {169// Hopefully this is rare, we'll just write it into a reg we drop.170if (js.VfpuWriteMask(i))171regs[i] = fpr.GetTempV();172}173}174175void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {176_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);177if (!js.prefixD) return;178179int n = GetNumVectorElements(sz);180for (int i = 0; i < n; i++) {181if (js.VfpuWriteMask(i))182continue;183184int sat = (js.prefixD >> (i * 2)) & 
3;185if (sat == 1) {186fpr.MapRegV(vregs[i], MAP_DIRTY);187188// Zero out XMM0 if it was <= +0.0f (but skip NAN.)189MOVSS(R(XMM0), fpr.VX(vregs[i]));190XORPS(XMM1, R(XMM1));191CMPLESS(XMM0, R(XMM1));192ANDNPS(XMM0, fpr.V(vregs[i]));193194// Retain a NAN in XMM0 (must be second operand.)195MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));196MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));197MINSS(fpr.VX(vregs[i]), R(XMM0));198} else if (sat == 3) {199fpr.MapRegV(vregs[i], MAP_DIRTY);200201// Check for < -1.0f, but careful of NANs.202MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));203MOVSS(XMM1, MatR(TEMPREG));204MOVSS(R(XMM0), fpr.VX(vregs[i]));205CMPLESS(XMM0, R(XMM1));206// If it was NOT less, the three ops below do nothing.207// Otherwise, they replace the value with -1.0f.208ANDPS(XMM1, R(XMM0));209ANDNPS(XMM0, fpr.V(vregs[i]));210ORPS(XMM0, R(XMM1));211212// Retain a NAN in XMM0 (must be second operand.)213MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));214MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));215MINSS(fpr.VX(vregs[i]), R(XMM0));216}217}218}219220// Vector regs can overlap in all sorts of swizzled ways.221// This does allow a single overlap in sregs[i].222bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {223for (int i = 0; i < sn; ++i) {224if (sregs[i] == dreg && i != di)225return false;226}227for (int i = 0; i < tn; ++i) {228if (tregs[i] == dreg)229return false;230}231232// Hurray, no overlap, we can write directly.233return true;234}235236bool IsOverlapSafe(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {237return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;238}239240void Jit::Comp_SV(MIPSOpcode op) {241CONDITIONAL_DISABLE(LSU_VFPU);242243s32 imm = (signed short)(op&0xFFFC);244int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);245MIPSGPReg rs = _RS;246247CheckMemoryBreakpoint(0, rs, imm);248249switch (op >> 26) {250case 50: //lv.s // VI(vt) = 
Memory::Read_U32(addr);251{252gpr.Lock(rs);253fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);254255JitSafeMem safe(this, rs, imm);256OpArg src;257if (safe.PrepareRead(src, 4)) {258MOVSS(fpr.VX(vt), safe.NextFastAddress(0));259}260if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {261MOVD_xmm(fpr.VX(vt), R(EAX));262}263safe.Finish();264265gpr.UnlockAll();266fpr.ReleaseSpillLocks();267}268break;269270case 58: //sv.s // Memory::Write_U32(VI(vt), addr);271{272gpr.Lock(rs);273274fpr.MapRegV(vt, 0);275276JitSafeMem safe(this, rs, imm);277OpArg dest;278if (safe.PrepareWrite(dest, 4)) {279MOVSS(safe.NextFastAddress(0), fpr.VX(vt));280}281if (safe.PrepareSlowWrite()) {282MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vt));283safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), 0);284}285safe.Finish();286287fpr.ReleaseSpillLocks();288gpr.UnlockAll();289}290break;291292default:293DISABLE;294}295}296297void Jit::Comp_SVQ(MIPSOpcode op) {298CONDITIONAL_DISABLE(LSU_VFPU);299300int imm = (signed short)(op&0xFFFC);301int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);302MIPSGPReg rs = _RS;303304CheckMemoryBreakpoint(0, rs, imm);305306switch (op >> 26) {307case 53: //lvl.q/lvr.q308{309if (!g_Config.bFastMemory) {310DISABLE;311}312DISABLE;313314gpr.MapReg(rs, true, false);315gpr.FlushLockX(ECX);316u8 vregs[4];317GetVectorRegs(vregs, V_Quad, vt);318MOV(32, R(EAX), gpr.R(rs));319ADD(32, R(EAX), Imm32(imm));320#ifdef MASKED_PSP_MEMORY321AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));322#endif323MOV(32, R(ECX), R(EAX));324SHR(32, R(EAX), Imm8(2));325AND(32, R(EAX), Imm32(0x3));326CMP(32, R(EAX), Imm32(0));327FixupBranch next = J_CC(CC_NE);328329auto PSPMemAddr = [](X64Reg scaled, int offset) {330#if PPSSPP_ARCH(X86)331return MDisp(scaled, (u32)Memory::base + offset);332#else333return MComplex(MEMBASEREG, scaled, 1, offset);334#endif335};336337fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY);338339// Offset = 0340MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 0));341342FixupBranch skip0 = 
J();343SetJumpTarget(next);344CMP(32, R(EAX), Imm32(1));345next = J_CC(CC_NE);346347// Offset = 1348MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 4));349MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 0));350351FixupBranch skip1 = J();352SetJumpTarget(next);353CMP(32, R(EAX), Imm32(2));354next = J_CC(CC_NE);355356// Offset = 2357MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 8));358MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 4));359MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 0));360361FixupBranch skip2 = J();362SetJumpTarget(next);363CMP(32, R(EAX), Imm32(3));364next = J_CC(CC_NE);365366// Offset = 3367MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 12));368MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 8));369MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 4));370MOVSS(fpr.RX(vregs[0]), PSPMemAddr(EAX, 0));371372SetJumpTarget(next);373SetJumpTarget(skip0);374SetJumpTarget(skip1);375SetJumpTarget(skip2);376377gpr.UnlockAll();378fpr.ReleaseSpillLocks();379}380break;381382case 54: //lv.q383{384gpr.Lock(rs);385// This must be in a reg or an immediate.386// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().387if (!gpr.IsImm(rs))388gpr.MapReg(rs, true, false);389390u8 vregs[4];391GetVectorRegs(vregs, V_Quad, vt);392393if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) {394JitSafeMem safe(this, rs, imm);395OpArg src;396if (safe.PrepareRead(src, 16)) {397// Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode.398if (g_Config.bFastMemory) {399MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0));400} else {401MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0));402}403}404if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {405for (int i = 0; i < 4; i++) {406safe.NextSlowRead(safeMemFuncs.readU32, i * 4);407// We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits.408MOVD_xmm(XMM0, R(EAX));409MOVSS(fpr.VSX(vregs), R(XMM0));410// Rotate things so we can read in the next higher float.411// By the end (4 rotates), they'll all be back into 
place.412SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1));413}414}415safe.Finish();416gpr.UnlockAll();417fpr.ReleaseSpillLocks();418return;419}420421fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);422423JitSafeMem safe(this, rs, imm);424OpArg src;425if (safe.PrepareRead(src, 16)) {426// Just copy 4 words the easiest way while not wasting registers.427for (int i = 0; i < 4; i++)428MOVSS(fpr.VX(vregs[i]), safe.NextFastAddress(i * 4));429}430if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {431for (int i = 0; i < 4; i++) {432safe.NextSlowRead(safeMemFuncs.readU32, i * 4);433MOVD_xmm(fpr.VX(vregs[i]), R(EAX));434}435}436safe.Finish();437438gpr.UnlockAll();439fpr.ReleaseSpillLocks();440}441break;442443case 62: //sv.q444{445gpr.Lock(rs);446// This must be in a reg or an immediate.447// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().448if (!gpr.IsImm(rs))449gpr.MapReg(rs, true, false);450451u8 vregs[4];452GetVectorRegs(vregs, V_Quad, vt);453454if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) {455JitSafeMem safe(this, rs, imm);456OpArg dest;457if (safe.PrepareWrite(dest, 16)) {458// Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode.459if (g_Config.bFastMemory) {460MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs));461} else {462MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs));463}464}465if (safe.PrepareSlowWrite()) {466MOVAPS(XMM0, fpr.VS(vregs));467for (int i = 0; i < 4; i++) {468MOVSS(MIPSSTATE_VAR(temp), XMM0);469SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));470safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);471}472}473safe.Finish();474gpr.UnlockAll();475fpr.ReleaseSpillLocks();476return;477}478479// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.480fpr.MapRegsV(vregs, V_Quad, 0);481482JitSafeMem safe(this, rs, imm);483OpArg dest;484if (safe.PrepareWrite(dest, 16)) {485for (int i = 0; i < 4; i++)486MOVSS(safe.NextFastAddress(i * 4), 
fpr.VX(vregs[i]));487}488if (safe.PrepareSlowWrite()) {489for (int i = 0; i < 4; i++) {490MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vregs[i]));491safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);492}493}494safe.Finish();495496gpr.UnlockAll();497fpr.ReleaseSpillLocks();498}499break;500501default:502DISABLE;503break;504}505}506507void Jit::Comp_VVectorInit(MIPSOpcode op) {508CONDITIONAL_DISABLE(VFPU_XFER);509510if (js.HasUnknownPrefix())511DISABLE;512513VectorSize sz = GetVecSize(op);514int type = (op >> 16) & 0xF;515u8 dregs[4];516GetVectorRegsPrefixD(dregs, sz, _VD);517518if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {519if (type == 6) {520XORPS(fpr.VSX(dregs), fpr.VS(dregs));521} else if (type == 7) {522if (RipAccessible(&oneOneOneOne)) {523MOVAPS(fpr.VSX(dregs), M(&oneOneOneOne)); // rip accessible524} else {525MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));526MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));527}528} else {529DISABLE;530}531ApplyPrefixD(dregs, sz);532fpr.ReleaseSpillLocks();533return;534}535536switch (type) {537case 6: // v=zeros; break; //vzero538XORPS(XMM0, R(XMM0));539break;540case 7: // v=ones; break; //vone541if (RipAccessible(&one)) {542MOVSS(XMM0, M(&one)); // rip accessible543} else {544MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));545MOVSS(XMM0, MatR(TEMPREG));546}547break;548default:549DISABLE;550break;551}552553int n = GetNumVectorElements(sz);554fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);555for (int i = 0; i < n; ++i)556MOVSS(fpr.VX(dregs[i]), R(XMM0));557ApplyPrefixD(dregs, sz);558559fpr.ReleaseSpillLocks();560}561562void Jit::Comp_VIdt(MIPSOpcode op) {563CONDITIONAL_DISABLE(VFPU_XFER);564if (js.HasUnknownPrefix())565DISABLE;566567int vd = _VD;568VectorSize sz = GetVecSize(op);569int n = GetNumVectorElements(sz);570571u8 dregs[4];572GetVectorRegsPrefixD(dregs, sz, _VD);573if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {574int row = vd & (n - 1);575if (RipAccessible(identityMatrix)) {576MOVAPS(fpr.VSX(dregs), 
M(identityMatrix[row])); // rip accessible577} else {578MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[row]));579MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));580}581ApplyPrefixD(dregs, sz);582fpr.ReleaseSpillLocks();583return;584}585586XORPS(XMM0, R(XMM0));587if (RipAccessible(&one)) {588MOVSS(XMM1, M(&one)); // rip accessible589} else {590MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));591MOVSS(XMM1, MatR(TEMPREG));592}593fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);594switch (sz) {595case V_Pair:596MOVSS(fpr.VX(dregs[0]), R((vd&1)==0 ? XMM1 : XMM0));597MOVSS(fpr.VX(dregs[1]), R((vd&1)==1 ? XMM1 : XMM0));598break;599case V_Quad:600MOVSS(fpr.VX(dregs[0]), R((vd&3)==0 ? XMM1 : XMM0));601MOVSS(fpr.VX(dregs[1]), R((vd&3)==1 ? XMM1 : XMM0));602MOVSS(fpr.VX(dregs[2]), R((vd&3)==2 ? XMM1 : XMM0));603MOVSS(fpr.VX(dregs[3]), R((vd&3)==3 ? XMM1 : XMM0));604break;605default:606_dbg_assert_msg_(false,"Trying to interpret instruction that can't be interpreted");607break;608}609ApplyPrefixD(dregs, sz);610fpr.ReleaseSpillLocks();611}612613void Jit::Comp_VDot(MIPSOpcode op) {614CONDITIONAL_DISABLE(VFPU_VEC);615616if (js.HasUnknownPrefix())617DISABLE;618619VectorSize sz = GetVecSize(op);620int n = GetNumVectorElements(sz);621622// TODO: Force read one of them into regs? probably not.623u8 sregs[4], tregs[4], dregs[1];624GetVectorRegsPrefixS(sregs, sz, _VS);625GetVectorRegsPrefixT(tregs, sz, _VT);626GetVectorRegsPrefixD(dregs, V_Single, _VD);627628// With SSE2, these won't really give any performance benefit on their own, but may reduce629// conversion costs from/to SIMD form. However, the SSE4.1 DPPS may be worth it.630// Benchmarking will have to decide whether to enable this on < SSE4.1. 
Also a HADDPS version631// for SSE3 could be written.632if (fpr.TryMapDirtyInInVS(dregs, V_Single, sregs, sz, tregs, sz)) {633switch (sz) {634case V_Pair:635if (cpu_info.bSSE4_1) {636if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {637MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));638DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x31);639} else {640MOVAPS(XMM0, fpr.VS(sregs));641DPPS(XMM0, fpr.VS(tregs), 0x31);642MOVAPS(fpr.VSX(dregs), R(XMM0));643}644} else {645MOVAPS(XMM0, fpr.VS(sregs));646MULPS(XMM0, fpr.VS(tregs));647MOVAPS(R(XMM1), XMM0);648SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(1, 1, 1, 1));649ADDPS(XMM1, R(XMM0));650MOVAPS(fpr.VS(dregs), XMM1);651}652break;653case V_Triple:654if (cpu_info.bSSE4_1) {655if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {656MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));657DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x71);658} else {659MOVAPS(XMM0, fpr.VS(sregs));660DPPS(XMM0, fpr.VS(tregs), 0x71);661MOVAPS(fpr.VSX(dregs), R(XMM0));662}663} else {664MOVAPS(XMM0, fpr.VS(sregs));665MULPS(XMM0, fpr.VS(tregs));666MOVAPS(R(XMM1), XMM0);667SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(3, 2, 1, 1));668ADDSS(XMM1, R(XMM0));669SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(3, 2, 2, 2));670ADDSS(XMM1, R(XMM0));671MOVAPS(fpr.VS(dregs), XMM1);672}673break;674case V_Quad:675if (cpu_info.bSSE4_1) {676if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {677MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));678DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0xF1);679} else {680MOVAPS(XMM0, fpr.VS(sregs));681DPPS(XMM0, fpr.VS(tregs), 0xF1);682MOVAPS(fpr.VSX(dregs), R(XMM0));683}684} /* else if (cpu_info.bSSE3) { // This is slower than the SSE2 solution on my Ivy!685MOVAPS(XMM0, fpr.VS(sregs));686MOVAPS(XMM1, fpr.VS(tregs));687HADDPS(XMM0, R(XMM1));688HADDPS(XMM0, R(XMM0));689MOVAPS(fpr.VSX(dregs), R(XMM0));690} */ else {691MOVAPS(XMM0, fpr.VS(sregs));692MOVAPS(XMM1, fpr.VS(tregs));693MULPS(XMM0, R(XMM1));694MOVAPS(XMM1, R(XMM0));695SHUFPS(XMM1, R(XMM1), 
_MM_SHUFFLE(2, 3, 0, 1));696ADDPS(XMM0, R(XMM1));697MOVAPS(XMM1, R(XMM0));698SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 1, 2, 3));699ADDSS(XMM0, R(XMM1));700MOVAPS(fpr.VSX(dregs), R(XMM0));701}702break;703default:704DISABLE;705}706ApplyPrefixD(dregs, V_Single);707fpr.ReleaseSpillLocks();708return;709}710711// Flush SIMD.712fpr.SimpleRegsV(sregs, sz, 0);713fpr.SimpleRegsV(tregs, sz, 0);714fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);715716X64Reg tempxreg = XMM0;717if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {718fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);719tempxreg = fpr.VX(dregs[0]);720}721722// Need to start with +0.0f so it doesn't result in -0.0f.723MOVSS(tempxreg, fpr.V(sregs[0]));724MULSS(tempxreg, fpr.V(tregs[0]));725for (int i = 1; i < n; i++)726{727// sum += s[i]*t[i];728MOVSS(XMM1, fpr.V(sregs[i]));729MULSS(XMM1, fpr.V(tregs[i]));730ADDSS(tempxreg, R(XMM1));731}732733if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {734fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);735MOVSS(fpr.V(dregs[0]), tempxreg);736}737738ApplyPrefixD(dregs, V_Single);739740fpr.ReleaseSpillLocks();741}742743744void Jit::Comp_VHdp(MIPSOpcode op) {745CONDITIONAL_DISABLE(VFPU_VEC);746747if (js.HasUnknownPrefix())748DISABLE;749750VectorSize sz = GetVecSize(op);751int n = GetNumVectorElements(sz);752753u8 sregs[4], tregs[4], dregs[1];754GetVectorRegsPrefixS(sregs, sz, _VS);755GetVectorRegsPrefixT(tregs, sz, _VT);756GetVectorRegsPrefixD(dregs, V_Single, _VD);757758// Flush SIMD.759fpr.SimpleRegsV(sregs, sz, 0);760fpr.SimpleRegsV(tregs, sz, 0);761fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);762763X64Reg tempxreg = XMM0;764if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {765fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);766tempxreg = fpr.VX(dregs[0]);767}768769// Need to start with +0.0f so it doesn't result in -0.0f.770MOVSS(tempxreg, fpr.V(sregs[0]));771MULSS(tempxreg, fpr.V(tregs[0]));772for (int i = 1; i < n; i++) {773// sum += (i == n-1) 
? t[i] : s[i]*t[i];774if (i == n - 1) {775ADDSS(tempxreg, fpr.V(tregs[i]));776} else {777MOVSS(XMM1, fpr.V(sregs[i]));778MULSS(XMM1, fpr.V(tregs[i]));779ADDSS(tempxreg, R(XMM1));780}781}782783if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {784fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);785MOVSS(fpr.V(dregs[0]), tempxreg);786}787788ApplyPrefixD(dregs, V_Single);789790fpr.ReleaseSpillLocks();791}792793void Jit::Comp_VCrossQuat(MIPSOpcode op) {794CONDITIONAL_DISABLE(VFPU_VEC);795796if (js.HasUnknownPrefix())797DISABLE;798799VectorSize sz = GetVecSize(op);800801u8 sregs[4], tregs[4], dregs[4];802GetVectorRegs(sregs, sz, _VS);803GetVectorRegs(tregs, sz, _VT);804GetVectorRegs(dregs, sz, _VD);805806if (sz == V_Triple) {807// Cross product vcrsp.t808if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {809MOVAPS(XMM0, fpr.VS(tregs));810MOVAPS(XMM1, fpr.VS(sregs));811SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));812SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 0, 2, 1));813MULPS(XMM0, fpr.VS(sregs));814MULPS(XMM1, fpr.VS(tregs));815SUBPS(XMM0, R(XMM1));816SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));817MOVAPS(fpr.VS(dregs), XMM0);818fpr.ReleaseSpillLocks();819return;820}821822// Flush SIMD.823fpr.SimpleRegsV(sregs, sz, 0);824fpr.SimpleRegsV(tregs, sz, 0);825fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);826827fpr.MapRegsV(sregs, sz, 0);828829// Compute X830MOVSS(XMM0, fpr.V(sregs[1]));831MULSS(XMM0, fpr.V(tregs[2]));832MOVSS(XMM1, fpr.V(sregs[2]));833MULSS(XMM1, fpr.V(tregs[1]));834SUBSS(XMM0, R(XMM1));835MOVSS(fpr.V(dregs[0]), XMM0);836837// Compute Y838MOVSS(XMM0, fpr.V(sregs[2]));839MULSS(XMM0, fpr.V(tregs[0]));840MOVSS(XMM1, fpr.V(sregs[0]));841MULSS(XMM1, fpr.V(tregs[2]));842SUBSS(XMM0, R(XMM1));843MOVSS(fpr.V(dregs[1]), XMM0);844845// Compute Z846MOVSS(XMM0, fpr.V(sregs[0]));847MULSS(XMM0, fpr.V(tregs[1]));848MOVSS(XMM1, fpr.V(sregs[1]));849MULSS(XMM1, fpr.V(tregs[0]));850SUBSS(XMM0, R(XMM1));851MOVSS(fpr.V(dregs[2]), XMM0);852} else if (sz == V_Quad) 
{853// Flush SIMD.854fpr.SimpleRegsV(sregs, sz, 0);855fpr.SimpleRegsV(tregs, sz, 0);856fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);857858// Quaternion product vqmul.q859fpr.MapRegsV(sregs, sz, 0);860861// Compute X862// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];863MOVSS(XMM0, fpr.V(sregs[0]));864MULSS(XMM0, fpr.V(tregs[3]));865MOVSS(XMM1, fpr.V(sregs[1]));866MULSS(XMM1, fpr.V(tregs[2]));867ADDSS(XMM0, R(XMM1));868MOVSS(XMM1, fpr.V(sregs[2]));869MULSS(XMM1, fpr.V(tregs[1]));870SUBSS(XMM0, R(XMM1));871MOVSS(XMM1, fpr.V(sregs[3]));872MULSS(XMM1, fpr.V(tregs[0]));873ADDSS(XMM0, R(XMM1));874MOVSS(fpr.V(dregs[0]), XMM0);875876// Compute Y877//d[1] = s[1] * t[3] + s[2] * t[0] + s[3] * t[1] - s[0] * t[2];878MOVSS(XMM0, fpr.V(sregs[1]));879MULSS(XMM0, fpr.V(tregs[3]));880MOVSS(XMM1, fpr.V(sregs[2]));881MULSS(XMM1, fpr.V(tregs[0]));882ADDSS(XMM0, R(XMM1));883MOVSS(XMM1, fpr.V(sregs[3]));884MULSS(XMM1, fpr.V(tregs[1]));885ADDSS(XMM0, R(XMM1));886MOVSS(XMM1, fpr.V(sregs[0]));887MULSS(XMM1, fpr.V(tregs[2]));888SUBSS(XMM0, R(XMM1));889MOVSS(fpr.V(dregs[1]), XMM0);890891// Compute Z892//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];893MOVSS(XMM0, fpr.V(sregs[0]));894MULSS(XMM0, fpr.V(tregs[1]));895MOVSS(XMM1, fpr.V(sregs[1]));896MULSS(XMM1, fpr.V(tregs[0]));897SUBSS(XMM0, R(XMM1));898MOVSS(XMM1, fpr.V(sregs[2]));899MULSS(XMM1, fpr.V(tregs[3]));900ADDSS(XMM0, R(XMM1));901MOVSS(XMM1, fpr.V(sregs[3]));902MULSS(XMM1, fpr.V(tregs[2]));903ADDSS(XMM0, R(XMM1));904MOVSS(fpr.V(dregs[2]), XMM0);905906// Compute W907//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];908MOVSS(XMM0, fpr.V(sregs[3]));909MULSS(XMM0, fpr.V(tregs[3]));910MOVSS(XMM1, fpr.V(sregs[1]));911MULSS(XMM1, fpr.V(tregs[1]));912SUBSS(XMM0, R(XMM1));913MOVSS(XMM1, fpr.V(sregs[2]));914MULSS(XMM1, fpr.V(tregs[2]));915SUBSS(XMM0, R(XMM1));916MOVSS(XMM1, fpr.V(sregs[0]));917MULSS(XMM1, fpr.V(tregs[0]));918SUBSS(XMM0, R(XMM1));919MOVSS(fpr.V(dregs[3]), 
XMM0);920}921922fpr.ReleaseSpillLocks();923}924925void Jit::Comp_Vcmov(MIPSOpcode op) {926CONDITIONAL_DISABLE(VFPU_COMP);927928if (js.HasUnknownPrefix())929DISABLE;930931VectorSize sz = GetVecSize(op);932int n = GetNumVectorElements(sz);933934u8 sregs[4], dregs[4];935GetVectorRegsPrefixS(sregs, sz, _VS);936GetVectorRegsPrefixD(dregs, sz, _VD);937int tf = (op >> 19) & 1;938int imm3 = (op >> 16) & 7;939940// Flush SIMD.941fpr.SimpleRegsV(sregs, sz, 0);942943for (int i = 0; i < n; ++i) {944// Simplification: Disable if overlap unsafe945if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {946DISABLE;947}948}949950if (imm3 < 6) {951gpr.MapReg(MIPS_REG_VFPUCC, true, false);952fpr.MapRegsV(dregs, sz, MAP_DIRTY);953// Test one bit of CC. This bit decides whether none or all subregisters are copied.954TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << imm3));955FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);956for (int i = 0; i < n; i++) {957MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));958}959SetJumpTarget(skip);960} else {961gpr.MapReg(MIPS_REG_VFPUCC, true, false);962fpr.MapRegsV(dregs, sz, MAP_DIRTY);963// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.964for (int i = 0; i < n; i++) {965TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << i));966FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);967MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));968SetJumpTarget(skip);969}970}971972ApplyPrefixD(dregs, sz);973974fpr.ReleaseSpillLocks();975}976977static s32 DoVminSS(s32 treg) {978s32 sreg = currentMIPS->temp;979980// If both are negative, we flip the comparison (not two's compliment.)981if (sreg < 0 && treg < 0) {982// If at least one side is NAN, we take the highest mantissa bits.983return treg < sreg ? sreg : treg;984} else {985// Otherwise, we take the lowest value (negative or lowest mantissa.)986return treg > sreg ? 
sreg : treg;987}988}989990static s32 DoVmaxSS(s32 treg) {991s32 sreg = currentMIPS->temp;992993// This is the same logic as vmin, just reversed.994if (sreg < 0 && treg < 0) {995return treg < sreg ? treg : sreg;996} else {997return treg > sreg ? treg : sreg;998}999}10001001void Jit::Comp_VecDo3(MIPSOpcode op) {1002CONDITIONAL_DISABLE(VFPU_VEC);10031004if (js.HasUnknownPrefix())1005DISABLE;10061007// Check that we can support the ops, and prepare temporary values for ops that need it.1008bool allowSIMD = true;1009switch (op >> 26) {1010case 24: //VFPU01011switch ((op >> 23) & 7) {1012case 0: // d[i] = s[i] + t[i]; break; //vadd1013case 1: // d[i] = s[i] - t[i]; break; //vsub1014case 7: // d[i] = s[i] / t[i]; break; //vdiv1015break;1016default:1017DISABLE;1018}1019break;1020case 25: //VFPU11021switch ((op >> 23) & 7) {1022case 0: // d[i] = s[i] * t[i]; break; //vmul1023break;1024default:1025DISABLE;1026}1027break;1028case 27: //VFPU31029switch ((op >> 23) & 7) {1030case 2: // vmin1031case 3: // vmax1032allowSIMD = false;1033break;1034case 6: // vsge1035case 7: // vslt1036break;1037default:1038DISABLE;1039}1040break;1041default:1042DISABLE;1043break;1044}10451046VectorSize sz = GetVecSize(op);1047int n = GetNumVectorElements(sz);10481049u8 sregs[4], tregs[4], dregs[4];1050GetVectorRegsPrefixS(sregs, sz, _VS);1051GetVectorRegsPrefixT(tregs, sz, _VT);1052GetVectorRegsPrefixD(dregs, sz, _VD);10531054if (allowSIMD && fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {1055void (XEmitter::*opFunc)(X64Reg, OpArg) = nullptr;1056bool symmetric = false;1057switch (op >> 26) {1058case 24: //VFPU01059switch ((op >> 23) & 7) {1060case 0: // d[i] = s[i] + t[i]; break; //vadd1061opFunc = &XEmitter::ADDPS;1062symmetric = true;1063break;1064case 1: // d[i] = s[i] - t[i]; break; //vsub1065opFunc = &XEmitter::SUBPS;1066break;1067case 7: // d[i] = s[i] / t[i]; break; //vdiv1068opFunc = &XEmitter::DIVPS;1069break;1070}1071break;1072case 25: //VFPU11073switch ((op >> 23) & 
7)1074{1075case 0: // d[i] = s[i] * t[i]; break; //vmul1076opFunc = &XEmitter::MULPS;1077symmetric = true;1078break;1079}1080break;1081case 27: //VFPU31082switch ((op >> 23) & 7)1083{1084case 2: // vmin1085// TODO: Mishandles NaN. Disabled for now.1086MOVAPS(XMM1, fpr.VS(sregs));1087MINPS(XMM1, fpr.VS(tregs));1088MOVAPS(fpr.VSX(dregs), R(XMM1));1089break;1090case 3: // vmax1091// TODO: Mishandles NaN. Disabled for now.1092MOVAPS(XMM1, fpr.VS(sregs));1093MAXPS(XMM1, fpr.VS(tregs));1094MOVAPS(fpr.VSX(dregs), R(XMM1));1095break;1096case 6: // vsge1097MOVAPS(XMM0, fpr.VS(tregs));1098MOVAPS(XMM1, fpr.VS(sregs));1099CMPPS(XMM0, R(XMM1), CMP_ORD);1100CMPPS(XMM1, fpr.VS(tregs), CMP_NLT);11011102ANDPS(XMM1, R(XMM0));1103MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));1104ANDPS(XMM1, MatR(TEMPREG));1105MOVAPS(fpr.VSX(dregs), R(XMM1));1106break;1107case 7: // vslt1108MOVAPS(XMM1, fpr.VS(sregs));1109CMPPS(XMM1, fpr.VS(tregs), CMP_LT);1110MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));1111ANDPS(XMM1, MatR(TEMPREG));1112MOVAPS(fpr.VSX(dregs), R(XMM1));1113break;1114}1115break;1116}11171118if (opFunc != nullptr) {1119if (fpr.VSX(dregs) != fpr.VSX(tregs)) {1120if (fpr.VSX(dregs) != fpr.VSX(sregs)) {1121MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));1122}1123(this->*opFunc)(fpr.VSX(dregs), fpr.VS(tregs));1124} else if (symmetric) {1125// We already know d = t.1126(this->*opFunc)(fpr.VSX(dregs), fpr.VS(sregs));1127} else {1128MOVAPS(XMM1, fpr.VS(sregs));1129(this->*opFunc)(XMM1, fpr.VS(tregs));1130MOVAPS(fpr.VSX(dregs), R(XMM1));1131}1132}11331134ApplyPrefixD(dregs, sz);1135fpr.ReleaseSpillLocks();1136return;1137}11381139// Flush SIMD.1140fpr.SimpleRegsV(sregs, sz, 0);1141fpr.SimpleRegsV(tregs, sz, 0);1142fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);11431144X64Reg tempxregs[4];1145for (int i = 0; i < n; ++i)1146{1147if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs, n, tregs))1148{1149// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.1150// But for 
vmin/vmax/vsge, we need XMM0/XMM1, so avoid.1151if (i < 2 && (op >> 26) != 27)1152tempxregs[i] = (X64Reg) (XMM0 + i);1153else1154{1155int reg = fpr.GetTempV();1156fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);1157fpr.SpillLockV(reg);1158tempxregs[i] = fpr.VX(reg);1159}1160}1161else1162{1163fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);1164fpr.SpillLockV(dregs[i]);1165tempxregs[i] = fpr.VX(dregs[i]);1166}1167}11681169for (int i = 0; i < n; ++i)1170{1171if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))1172MOVSS(tempxregs[i], fpr.V(sregs[i]));1173}11741175for (int i = 0; i < n; ++i) {1176switch (op >> 26) {1177case 24: //VFPU01178switch ((op >> 23) & 7) {1179case 0: // d[i] = s[i] + t[i]; break; //vadd1180ADDSS(tempxregs[i], fpr.V(tregs[i]));1181break;1182case 1: // d[i] = s[i] - t[i]; break; //vsub1183SUBSS(tempxregs[i], fpr.V(tregs[i]));1184break;1185case 7: // d[i] = s[i] / t[i]; break; //vdiv1186DIVSS(tempxregs[i], fpr.V(tregs[i]));1187break;1188}1189break;1190case 25: //VFPU11191switch ((op >> 23) & 7)1192{1193case 0: // d[i] = s[i] * t[i]; break; //vmul1194MULSS(tempxregs[i], fpr.V(tregs[i]));1195break;1196}1197break;1198case 27: //VFPU31199switch ((op >> 23) & 7)1200{1201case 2: // vmin1202{1203MOVSS(XMM0, fpr.V(tregs[i]));1204UCOMISS(tempxregs[i], R(XMM0));1205FixupBranch skip = J_CC(CC_NP, true);12061207MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);1208MOVD_xmm(R(EAX), XMM0);1209CallProtectedFunction(&DoVminSS, R(EAX));1210MOVD_xmm(tempxregs[i], R(EAX));1211FixupBranch finish = J();12121213SetJumpTarget(skip);1214MINSS(tempxregs[i], R(XMM0));1215SetJumpTarget(finish);1216}1217break;1218case 3: // vmax1219{1220MOVSS(XMM0, fpr.V(tregs[i]));1221UCOMISS(tempxregs[i], R(XMM0));1222FixupBranch skip = J_CC(CC_NP, true);12231224MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);1225MOVD_xmm(R(EAX), XMM0);1226CallProtectedFunction(&DoVmaxSS, R(EAX));1227MOVD_xmm(tempxregs[i], R(EAX));1228FixupBranch finish = J();12291230SetJumpTarget(skip);1231MAXSS(tempxregs[i], 
R(XMM0));
			SetJumpTarget(finish);
		}
			break;
		case 6: // vsge
			// We can't just reverse, because of 0/-0.
			// ORD mask (neither operand NaN) ANDed with NLT (s >= t), then
			// forced to exactly 1.0f bits via oneOneOneOne.
			MOVSS(XMM0, fpr.V(tregs[i]));
			MOVSS(XMM1, R(tempxregs[i]));
			CMPORDSS(XMM1, R(XMM0));
			CMPNLTSS(tempxregs[i], R(XMM0));
			ANDPS(tempxregs[i], R(XMM1));
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
			ANDPS(tempxregs[i], MatR(TEMPREG));
			break;
		case 7: // vslt
			// s < t produces an all-ones mask; AND with 1.0f bits to get 0.0/1.0.
			CMPLTSS(tempxregs[i], fpr.V(tregs[i]));
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
			ANDPS(tempxregs[i], MatR(TEMPREG));
			break;
		}
		break;
	}
	}

	// Write results back from the temporaries to the destination regs.
	for (int i = 0; i < n; ++i)
	{
		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}

// Per-lane CC masks for vcmp, indexed by [n - 1][lane].
// Each lane's entry carries its own CC bit (bits 0-3) plus 0x10 so that
// OR-ing the masked lane results also accumulates the "any" bit (bit 4).
// The single-element mask 0x31 additionally carries bit 5 ("all"), which for
// n > 1 is computed separately below via SETcc.
alignas(16) static const u32 vcmpMask[4][4] = {
	{0x00000031, 0x00000000, 0x00000000, 0x00000000},
	{0x00000011, 0x00000012, 0x00000000, 0x00000000},
	{0x00000011, 0x00000012, 0x00000014, 0x00000000},
	{0x00000011, 0x00000012, 0x00000014, 0x00000018},
};

// vcmp: compares VS against VT lane-wise using the condition in the low 4 bits
// of the opcode, and updates MIPS_REG_VFPUCC: one bit per lane, plus the
// aggregate bits 4 ("any lane true") and 5 ("all lanes true").
// Conditions that don't map well to SSE compares (EI/NI, and the NaN/inf
// checks when VS != VT) fall back to the interpreter.
void Jit::Comp_Vcmp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_COMP);

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	VCondition cond = (VCondition)(op & 0xF);

	u8 sregs[4], tregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);

	// Some, we just fall back to the interpreter.
	switch (cond) {
	case VC_EI: // c = my_isinf(s[i]); break;
	case VC_NI: // c = !my_isinf(s[i]); break;
		DISABLE;
		break;
	case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;   // Tekken Dark Resurrection
	case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
	case VC_EN: // c = my_isnan(s[i]); break;
	case VC_NN: // c = !my_isnan(s[i]); break;
		if (_VS != _VT)
			DISABLE;
		break;
	default:
		break;
	}

	// First, let's get the trivial ones.

	// All lane bits plus "any"/"all" bits, per element count.
	static const int true_bits[4] = {0x31, 0x33, 0x37, 0x3f};

	if (cond == VC_TR) {
		// Always-true: just set the bits, no compare needed.
		gpr.MapReg(MIPS_REG_VFPUCC, true, true);
		OR(32, gpr.R(MIPS_REG_VFPUCC), Imm32(true_bits[n-1]));
		return;
	} else if (cond == VC_FL) {
		// Always-false: clear the bits.
		gpr.MapReg(MIPS_REG_VFPUCC, true, true);
		AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~true_bits[n-1]));
		return;
	}

	if (n > 1)
		gpr.FlushLockX(ECX);

	// Start with zero in each lane for the compare to zero.
	if (cond == VC_EZ || cond == VC_NZ) {
		XORPS(XMM0, R(XMM0));
		if (n > 1) {
			XORPS(XMM1, R(XMM1));
		}
	}

	bool inverse = false;

	if (cond == VC_GE || cond == VC_GT) {
		// We flip, and we need them in regs so we don't clear the high lanes.
		fpr.SimpleRegsV(sregs, sz, 0);
		fpr.MapRegsV(tregs, sz, 0);
	} else {
		fpr.SimpleRegsV(tregs, sz, 0);
		fpr.MapRegsV(sregs, sz, 0);
	}

	// We go backwards because it's more convenient to put things in the right lanes.
	int affected_bits = (1 << 4) | (1 << 5); // 4 and 5
	for (int i = n - 1; i >= 0; --i) {
		// Alternate between XMM0 and XMM1
		X64Reg reg = i == 1 || i == 3 ? XMM1 : XMM0;
		if ((i == 0 || i == 1) && n > 2) {
			// We need to swap lanes... this also puts them in the right place.
			SHUFPS(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 1));
		}

		// Let's only handle the easy ones, and fall back on the interpreter for the rest.
		bool compareTwo = false;
		bool compareToZero = false;
		int comparison = -1;
		bool flip = false;

		switch (cond) {
		case VC_ES:
			comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
			MOVSS(reg, fpr.V(sregs[i]));
			break;

		case VC_NS:
			comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
			MOVSS(reg, fpr.V(sregs[i]));
			// Note that we do this all at once at the end.
			inverse = true;
			break;

		case VC_EN:
			comparison = CMP_UNORD;
			compareTwo = true;
			break;

		case VC_NN:
			comparison = CMP_UNORD;
			compareTwo = true;
			// Note that we do this all at once at the end.
			inverse = true;
			break;

		case VC_EQ: // c = s[i] == t[i]; break;
			comparison = CMP_EQ;
			compareTwo = true;
			break;

		case VC_LT: // c = s[i] < t[i]; break;
			comparison = CMP_LT;
			compareTwo = true;
			break;

		case VC_LE: // c = s[i] <= t[i]; break;
			comparison = CMP_LE;
			compareTwo = true;
			break;

		case VC_NE: // c = s[i] != t[i]; break;
			comparison = CMP_NEQ;
			compareTwo = true;
			break;

		case VC_GE: // c = s[i] >= t[i]; break;
			// No GE/GT predicates in SSE compares; swap operands and use LE/LT.
			comparison = CMP_LE;
			flip = true;
			compareTwo = true;
			break;

		case VC_GT: // c = s[i] > t[i]; break;
			comparison = CMP_LT;
			flip = true;
			compareTwo = true;
			break;

		case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f; break;
			comparison = CMP_EQ;
			compareToZero = true;
			break;

		case VC_NZ: // c = s[i] != 0; break;
			comparison = CMP_NEQ;
			compareToZero = true;
			break;

		default:
			DISABLE;
		}

		if (comparison != -1) {
			if (compareTwo) {
				if (!flip) {
					MOVSS(reg, fpr.V(sregs[i]));
					CMPSS(reg, fpr.V(tregs[i]), comparison);
				} else {
					MOVSS(reg, fpr.V(tregs[i]));
					CMPSS(reg, fpr.V(sregs[i]), comparison);
				}
			} else if (compareToZero) {
				CMPSS(reg, fpr.V(sregs[i]), comparison);
			}
		}

		affected_bits |= 1 << i;
	}

	if (n > 1) {
		XOR(32, R(ECX), R(ECX));

		// This combines them together.
		UNPCKLPS(XMM0, R(XMM1));

		// Finalize the comparison for ES/NS.
		if (cond == VC_ES || cond == VC_NS) {
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
			ANDPS(XMM0, MatR(TEMPREG));
			PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
			// It's inversed below for NS.
		}

		if (inverse) {
			// The canonical way to generate a bunch of ones, see https://stackoverflow.com/questions/35085059/what-are-the-best-instruction-sequences-to-generate-vector-constants-on-the-fly
			PCMPEQW(XMM1, R(XMM1));
			XORPS(XMM0, R(XMM1));
		}
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&vcmpMask[n - 1]));
		ANDPS(XMM0, MatR(TEMPREG));
		MOVAPS(MIPSSTATE_VAR(vcmpResult), XMM0);

		MOV(32, R(TEMPREG), MIPSSTATE_VAR(vcmpResult[0]));
		for (int i = 1; i < n; ++i) {
			OR(32, R(TEMPREG), MIPSSTATE_VAR_ELEM32(vcmpResult[0], i));
		}

		// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison,
		// which is the most common after all.
		// TEMPREG equals all affected lane bits exactly iff every lane was true;
		// that is the "all" condition, shifted into bit 5.
		CMP(32, R(TEMPREG), Imm8(affected_bits & 0x1F));
		SETcc(CC_E, R(ECX));
		SHL(32, R(ECX), Imm8(5));
		OR(32, R(TEMPREG), R(ECX));
	} else {
		// Finalize the comparison for ES/NS.
		if (cond == VC_ES || cond == VC_NS) {
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
			ANDPS(XMM0, MatR(TEMPREG));
			PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
			// It's inversed below for NS.
		}

		MOVD_xmm(R(TEMPREG), XMM0);
		if (inverse) {
			XOR(32, R(TEMPREG), Imm32(0xFFFFFFFF));
		}
		AND(32, R(TEMPREG), Imm32(0x31));
	}

	gpr.UnlockAllX();
	gpr.MapReg(MIPS_REG_VFPUCC, true, true);
	// Merge: clear the affected bits, then OR in the freshly computed ones.
	AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~affected_bits));
	OR(32, gpr.R(MIPS_REG_VFPUCC), R(TEMPREG));

	fpr.ReleaseSpillLocks();
}

// There are no immediates for floating point, so we need to load these
// from RAM.
// Might as well have a table ready.
// Scale factors 1/2^n for vi2f's post-conversion multiply (n = opcode imm).
extern const float mulTableVi2f[32] = {
	1.0f/(1UL<<0),1.0f/(1UL<<1),1.0f/(1UL<<2),1.0f/(1UL<<3),
	1.0f/(1UL<<4),1.0f/(1UL<<5),1.0f/(1UL<<6),1.0f/(1UL<<7),
	1.0f/(1UL<<8),1.0f/(1UL<<9),1.0f/(1UL<<10),1.0f/(1UL<<11),
	1.0f/(1UL<<12),1.0f/(1UL<<13),1.0f/(1UL<<14),1.0f/(1UL<<15),
	1.0f/(1UL<<16),1.0f/(1UL<<17),1.0f/(1UL<<18),1.0f/(1UL<<19),
	1.0f/(1UL<<20),1.0f/(1UL<<21),1.0f/(1UL<<22),1.0f/(1UL<<23),
	1.0f/(1UL<<24),1.0f/(1UL<<25),1.0f/(1UL<<26),1.0f/(1UL<<27),
	1.0f/(1UL<<28),1.0f/(1UL<<29),1.0f/(1UL<<30),1.0f/(1UL<<31),
};

// vi2f: converts each integer lane of VS to float (CVTDQ2PS), then scales the
// result by 1/2^imm where imm comes from bits 16-20 of the opcode.
void Jit::Comp_Vi2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	const float *mult = &mulTableVi2f[imm];

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Use temporaries where writing a dest lane early would clobber a
	// still-needed source lane.
	int tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Load the scale factor once into XMM1 (skipped when it's 1.0f).
	if (*mult != 1.0f) {
		if (RipAccessible(mult)) {
			MOVSS(XMM1, M(mult));  // rip accessible
		} else {
			MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
			MOVSS(XMM1, MatR(TEMPREG));
		}
	}
	for (int i = 0; i < n; i++) {
		fpr.MapRegV(tempregs[i], sregs[i] == dregs[i] ? MAP_DIRTY : MAP_NOINIT);
		if (fpr.V(sregs[i]).IsSimpleReg()) {
			CVTDQ2PS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
		} else {
			MOVSS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
			CVTDQ2PS(fpr.VX(tempregs[i]), R(fpr.VX(tempregs[i])));
		}
		if (*mult != 1.0f)
			MULSS(fpr.VX(tempregs[i]), R(XMM1));
	}

	// Copy any temporaries into the real destination registers.
	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
			MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
		}
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocks();
}

// Planning for true SIMD

// Sequence for gathering sparse registers into one SIMD:
// MOVSS(XMM0, fpr.R(sregs[0]));
// MOVSS(XMM1, fpr.R(sregs[1]));
// MOVSS(XMM2, fpr.R(sregs[2]));
// MOVSS(XMM3, fpr.R(sregs[3]));
// SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));   // XMM0 = S1 S1 S0 S0
// SHUFPS(XMM2, R(XMM3), _MM_SHUFFLE(0, 0, 0, 0));   // XMM2 = S3 S3 S2 S2
// SHUFPS(XMM0, R(XMM2), _MM_SHUFFLE(2, 0, 2, 0));   // XMM0 = S3 S2 S1 S0
// Some punpckwd etc would also work.
// Alternatively, MOVSS and three PINSRD (SSE4) with mem source.
// Why PINSRD instead of INSERTPS?
// http://software.intel.com/en-us/blogs/2009/01/07/using-sse41-for-mp3-encoding-quantization

// Sequence for scattering a SIMD register to sparse registers:
// (Very serial though, better methods may be possible)
// MOVSS(fpr.R(sregs[0]), XMM0);
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
// MOVSS(fpr.R(sregs[1]), XMM0);
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
// MOVSS(fpr.R(sregs[2]), XMM0);
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
// MOVSS(fpr.R(sregs[3]), XMM0);
// On SSE4 we should use EXTRACTPS.

// Translation of ryg's half_to_float5_SSE2
// vh2f: expands 1 (V_Single) or 2 (V_Pair) packed half-float lanes of VS into
// 2 or 4 single-precision lanes of VD, handling denormals/inf/NaN via the
// exponent-rebias trick below.
void Jit::Comp_Vh2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

#define SSE_CONST4(name, val) alignas(16) static const u32 name[4] = { (val), (val), (val), (val) }

	SSE_CONST4(mask_nosign, 0x7fff);
	SSE_CONST4(nan_mantissa, 0x800003ff);
	SSE_CONST4(magic, (254 - 15) << 23);
	SSE_CONST4(was_infnan, 0x7bff);
	SSE_CONST4(exp_infnan, 255 << 23);

	// Pre-resolve operand addressing so the main sequence below is uniform
	// whether or not the constants are RIP-accessible.
	OpArg mask_nosign_arg, nan_mantissa_arg, magic_arg, was_infnan_arg, exp_infnan_arg;
	if (RipAccessible(mask_nosign)) {
		mask_nosign_arg = M(&mask_nosign[0]);
		nan_mantissa_arg = M(&nan_mantissa[0]);
		magic_arg = M(&magic[0]);
		was_infnan_arg = M(&was_infnan[0]);
		exp_infnan_arg = M(&exp_infnan[0]);
	} else {
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&mask_nosign[0]));
		mask_nosign_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &mask_nosign[0]);
		nan_mantissa_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &nan_mantissa[0]);
		magic_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &magic[0]);
		was_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &was_infnan[0]);
		exp_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &exp_infnan[0]);
	}

#undef SSE_CONST4
	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	switch (sz) {
	case V_Single:
		outsize = V_Pair;
		break;
	case V_Pair:
		outsize = V_Quad;
		break;
	default:
		DISABLE;
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);

	// Force ourselves an extra xreg as temp space.
	X64Reg tempR = fpr.GetFreeXReg();

	MOVSS(XMM0, fpr.V(sregs[0]));
	if (sz != V_Single) {
		MOVSS(XMM1, fpr.V(sregs[1]));
		PUNPCKLDQ(XMM0, R(XMM1));
	}
	XORPS(XMM1, R(XMM1));
	PUNPCKLWD(XMM0, R(XMM1));

	// OK, 16 bits in each word.
	// Let's go. Deep magic here.
	MOVAPS(XMM1, R(XMM0));
	ANDPS(XMM0, mask_nosign_arg);  // xmm0 = expmant
	XORPS(XMM1, R(XMM0));  // xmm1 = justsign = expmant ^ xmm0
	MOVAPS(tempR, R(XMM0));
	PSLLD(XMM0, 13);
	MULPS(XMM0, magic_arg);  /// xmm0 = scaled
	PSLLD(XMM1, 16);  // xmm1 = sign
	ORPS(XMM0, R(XMM1));

	// Now create a NAN mask, adding in the sign.
	ORPS(XMM1, R(tempR));  // xmm1 = sign + original mantissa.
	ANDPS(XMM1, nan_mantissa_arg);  // xmm1 = original mantissa
	PCMPGTD(tempR, was_infnan_arg);  // xmm2 = b_wasinfnan
	ORPS(XMM1, exp_infnan_arg);  // xmm1 = infnan result
	ANDPS(XMM1, R(tempR));  // xmm1 = infnan result OR zero if not infnan
	ANDNPS(tempR, R(XMM0));  // tempR = result OR zero if infnan
	ORPS(XMM1, R(tempR));

	fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);

	// TODO: Could apply D-prefix in parallel here...

	// Scatter the result lanes to the destination registers, rotating the
	// vector down one lane per store.
	MOVSS(fpr.V(dregs[0]), XMM1);
	SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
	MOVSS(fpr.V(dregs[1]), XMM1);

	if (sz != V_Single) {
		SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
		MOVSS(fpr.V(dregs[2]), XMM1);
		SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
		MOVSS(fpr.V(dregs[3]), XMM1);
	}

	ApplyPrefixD(dregs, outsize);
	gpr.UnlockAllX();
	fpr.ReleaseSpillLocks();
}

// The goal is to map (reversed byte order for clarity):
// AABBCCDD -> 000000AA 000000BB 000000CC 000000DD
alignas(16) static s8 vc2i_shuffle[16] = { -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3 };
// AABBCCDD -> AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
alignas(16) static s8 vuc2i_shuffle[16] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 };

// vx2i family (vuc2i/vc2i/vus2i/vs2i): widens packed 8- or 16-bit integers
// from VS into full 32-bit lanes of VD, value in the top bits; unsigned
// variants shift right by one afterwards (see below).
void Jit::Comp_Vx2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ?
8 : 16;  // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0;  // vuc2i (0), vus2i (2)

	// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
	// at the top.  vus2i shifts it an extra bit right afterward.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
	// at the top too.  vuc2i is a bit special (see below.)
	// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
	// then use it for both.

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);

	if (bits == 16) {
		MOVSS(XMM1, fpr.V(sregs[0]));
		if (sz != V_Single) {
			MOVSS(XMM0, fpr.V(sregs[1]));
			PUNPCKLDQ(XMM1, R(XMM0));
		}

		// Unpack 16-bit words into 32-bit words, upper position, and we're done!
		PXOR(XMM0, R(XMM0));
		PUNPCKLWD(XMM0, R(XMM1));
	} else if (bits == 8) {
		if (unsignedOp) {
			// vuc2i is a bit special.  It spreads out the bits like this:
			// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
			MOVSS(XMM0, fpr.V(sregs[0]));
			if (cpu_info.bSSSE3 && RipAccessible(vuc2i_shuffle)) {
				// Not really different speed.  Generates a bit less code.
				PSHUFB(XMM0, M(&vuc2i_shuffle[0]));  // rip accessible
			} else {
				// First, we change 0xDDCCBBAA to 0xDDDDCCCCBBBBAAAA.
				PUNPCKLBW(XMM0, R(XMM0));
				// Now, interleave each 16 bits so they're all 32 bits wide.
				PUNPCKLWD(XMM0, R(XMM0));
			}
		} else {
			if (cpu_info.bSSSE3 && RipAccessible(vc2i_shuffle)) {
				MOVSS(XMM0, fpr.V(sregs[0]));
				PSHUFB(XMM0, M(&vc2i_shuffle[0]));
			} else {
				// Zero-interleave each byte into the top byte of a 32-bit lane.
				PXOR(XMM1, R(XMM1));
				MOVSS(XMM0, fpr.V(sregs[0]));
				PUNPCKLBW(XMM1, R(XMM0));
				PXOR(XMM0, R(XMM0));
				PUNPCKLWD(XMM0, R(XMM1));
			}
		}
	}

	// At this point we have the regs in the 4 lanes.
	// In the "u" mode, we need to shift it out of the sign bit.
	if (unsignedOp) {
		PSRLD(XMM0, 1);
	}

	if (fpr.TryMapRegsVS(dregs, outsize, MAP_NOINIT | MAP_DIRTY)) {
		// Destination lives as one SIMD register - single store.
		MOVAPS(fpr.VSX(dregs), R(XMM0));
	} else {
		// Done! TODO: The rest of this should be possible to extract into a function.
		fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);

		// TODO: Could apply D-prefix in parallel here...

		// Scatter lane by lane, shifting the vector down 4 bytes each time.
		MOVSS(fpr.V(dregs[0]), XMM0);
		PSRLDQ(XMM0, 4);
		MOVSS(fpr.V(dregs[1]), XMM0);

		if (outsize != V_Pair) {
			PSRLDQ(XMM0, 4);
			MOVSS(fpr.V(dregs[2]), XMM0);
			PSRLDQ(XMM0, 4);
			MOVSS(fpr.V(dregs[3]), XMM0);
		}
	}

	ApplyPrefixD(dregs, outsize);
	gpr.UnlockAllX();
	fpr.ReleaseSpillLocks();
}

// Scale factors 2^n for vf2i's pre-conversion multiply (n = opcode imm),
// stored as doubles since the conversion is done in double precision.
extern const double mulTableVf2i[32] = {
	(1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3),
	(1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7),
	(1ULL<<8),(1ULL<<9),(1ULL<<10),(1ULL<<11),
	(1ULL<<12),(1ULL<<13),(1ULL<<14),(1ULL<<15),
	(1ULL<<16),(1ULL<<17),(1ULL<<18),(1ULL<<19),
	(1ULL<<20),(1ULL<<21),(1ULL<<22),(1ULL<<23),
	(1ULL<<24),(1ULL<<25),(1ULL<<26),(1ULL<<27),
	(1ULL<<28),(1ULL<<29),(1ULL<<30),(1ULL<<31),
};

static const double maxMinIntAsDouble[2] = { (double)0x7fffffff, (double)(int)0x80000000 };  // that's not equal to 0x80000000

// vf2i family: scales each float lane by 2^imm, clamps to the s32 range (done
// in double precision - see the comment inside), and converts to integer using
// the rounding mode selected by bits 21-25 of the opcode.
void Jit::Comp_Vf2i(MIPSOpcode op)
{
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	const double *mult = &mulTableVf2i[imm];

	// Map the VFPU rounding mode to the value for MXCSR's rounding-control
	// field (bits 13-14): 0 = nearest, 1 = down, 2 = up, 3 = truncate.
	// -1 means "leave MXCSR alone".
	int setMXCSR = -1;
	int rmode = (op >> 21) & 0x1f;
	switch (rmode) {
	case 17:
		break;  //z - truncate. Easy to support.
	case 16:
		setMXCSR = 0;
		break;
	case 18:
		setMXCSR = 2;
		break;
	case 19:
		setMXCSR = 1;
		break;
	}

	// Small optimization: 0 is our default mode anyway.
	if (setMXCSR == 0 && !js.hasSetRounding) {
		setMXCSR = -1;
	}
	// Except for truncate, we need to update MXCSR to our preferred rounding mode.
	if (setMXCSR != -1) {
		// Save the current MXCSR (restored at the bottom), then patch in the
		// requested rounding-control bits.
		STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
		MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
		AND(32, R(TEMPREG), Imm32(~(3 << 13)));
		if (setMXCSR != 0) {
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
		}
		MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
		LDMXCSR(MIPSSTATE_VAR(temp));
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Really tricky to SIMD due to double precision requirement...

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_DIRTY | MAP_NOINIT);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// Load the scale factor once into XMM1 (skipped when it's 1.0).
	if (*mult != 1.0f) {
		if (RipAccessible(mult)) {
			MOVSD(XMM1, M(mult));  // rip accessible
		} else {
			MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
			MOVSD(XMM1, MatR(TEMPREG));
		}
	}

	fpr.MapRegsV(tempregs, sz, MAP_DIRTY | MAP_NOINIT);
	for (int i = 0; i < n; i++) {
		// Need to do this in double precision to clamp correctly as float
		// doesn't have enough precision to represent 0x7fffffff for example exactly.
		MOVSS(XMM0, fpr.V(sregs[i]));
		CVTSS2SD(XMM0, R(XMM0));  // convert to double precision
		if (*mult != 1.0f) {
			MULSD(XMM0, R(XMM1));
		}
		// Clamp into [INT_MIN, INT_MAX] before conversion.
		MOV(PTRBITS, R(TEMPREG), ImmPtr(maxMinIntAsDouble));
		MINSD(XMM0, MDisp(TEMPREG, 0));
		MAXSD(XMM0, MDisp(TEMPREG, sizeof(double)));
		// We've set the rounding mode above, so this part's easy.
		switch ((op >> 21) & 0x1f) {
		case 16: CVTSD2SI(TEMPREG, R(XMM0)); break;  //n
		case 17: CVTTSD2SI(TEMPREG, R(XMM0)); break;  //z - truncate
		case 18: CVTSD2SI(TEMPREG, R(XMM0)); break;  //u
		case 19: CVTSD2SI(TEMPREG, R(XMM0)); break;  //d
		}
		MOVD_xmm(fpr.VX(tempregs[i]), R(TEMPREG));
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
			MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
			fpr.DiscardV(tempregs[i]);
		}
	}

	// Restore the caller's MXCSR if we changed it.
	if (setMXCSR != -1) {
		LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
	}

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocks();
}

// vcst: loads the VFPU constant cst_constants[conNum] (conNum from bits 16-20
// of the opcode) and broadcasts it to every lane of VD.
void Jit::Comp_Vcst(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	if (js.HasUnknownPrefix())
		DISABLE;

	int conNum = (op >> 16) & 0x1f;
	int vd = _VD;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);

	if (RipAccessible(cst_constants)) {
		MOVSS(XMM0, M(&cst_constants[conNum]));  // rip accessible
	} else {
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&cst_constants[conNum]));
		MOVSS(XMM0, MatR(TEMPREG));
	}

	if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
		// Destination is a single SIMD register - splat and store once.
		SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0,0,0,0));
		MOVAPS(fpr.VS(dregs), XMM0);
		fpr.ReleaseSpillLocks();
		return;
	}

	fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
	for (int i = 0; i < n; i++) {
		MOVSS(fpr.V(dregs[i]), XMM0);
	}
	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocks();
}

// vsgn: d[i] = -1.0f, 0.0f or +1.0f depending on the sign of s[i], built by
// combining s[i]'s sign bit with the bit pattern of 1.0f and zapping exact
// zeros.
void Jit::Comp_Vsgn(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz =
GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Pick temporaries where the destination would clobber a still-needed source.
	X64Reg tempxregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			int reg = fpr.GetTempV();
			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
			fpr.SpillLockV(reg);
			tempxregs[i] = fpr.VX(reg);
		} else {
			fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
			fpr.SpillLockV(dregs[i]);
			tempxregs[i] = fpr.VX(dregs[i]);
		}
	}

	// Would be nice with more temp regs here so we could put signBitLower and oneOneOneOne into regs...
	for (int i = 0; i < n; ++i) {
		XORPS(XMM0, R(XMM0));
		CMPEQSS(XMM0, fpr.V(sregs[i]));  // XMM0 = s[i] == 0.0f
		MOVSS(XMM1, fpr.V(sregs[i]));
		// Preserve sign bit, replace rest with ones
		if (RipAccessible(signBitLower)) {
			ANDPS(XMM1, M(&signBitLower));  // rip accessible
			ORPS(XMM1, M(&oneOneOneOne));  // rip accessible
		} else {
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
			ANDPS(XMM1, MatR(TEMPREG));
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
			ORPS(XMM1, MatR(TEMPREG));
		}
		// If really was equal to zero, zap. Note that ANDN negates the destination.
		ANDNPS(XMM0, R(XMM1));
		MOVAPS(tempxregs[i], R(XMM0));
	}

	for (int i = 0; i < n; ++i) {
		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}

// vocp: d[i] = 1.0f - s[i].  Implemented by rewriting the prefixes so the S
// operand arrives negated and the T operand is the constant 1, then adding.
void Jit::Comp_Vocp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// This is a hack that modifies prefixes.  We eat them later, so just overwrite.
	// S prefix forces the negate flags.
	js.prefixS |= 0x000F0000;
	// T prefix forces constants on and regnum to 1.
	// That means negate still works, and abs activates a different constant.
	js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;

	u8 sregs[4], tregs[4], dregs[4];
	// Actually uses the T prefixes (despite being VS.)
	GetVectorRegsPrefixS(sregs, sz, _VS);
	if (js.prefixT != 0x0000F055)
		GetVectorRegsPrefixT(tregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	if (js.prefixT != 0x0000F055)
		fpr.SimpleRegsV(tregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	X64Reg tempxregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			int reg = fpr.GetTempV();
			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
			fpr.SpillLockV(reg);
			tempxregs[i] = fpr.VX(reg);
		} else {
			fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
			fpr.SpillLockV(dregs[i]);
			tempxregs[i] = fpr.VX(dregs[i]);
		}
	}

	// 0x0000F055 is the exact "plain constant 1" T prefix - in that common case
	// we can keep 1.0f loaded in XMM1 across the whole loop.
	if (js.prefixT == 0x0000F055) {
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
		MOVSS(XMM1, MatR(TEMPREG));
	}
	for (int i = 0; i < n; ++i) {
		if (js.prefixT == 0x0000F055) {
			MOVSS(XMM0, R(XMM1));
		} else {
			MOVSS(XMM0, fpr.V(tregs[i]));
		}
		ADDSS(XMM0, fpr.V(sregs[i]));
		MOVSS(tempxregs[i], R(XMM0));
	}

	for (int i = 0; i < n; ++i) {
		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}

// vbfy1/vbfy2: butterfly operations - pairwise sums and differences of the
// source lanes (see the subop handling below).  Only 2- and 4-element sizes
// are supported; everything else goes to the interpreter.
void Jit::Comp_Vbfy(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	if (n != 2 && n != 4) {
		DISABLE;
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);
	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	X64Reg tempxregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
			int reg = fpr.GetTempV();
			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
			fpr.SpillLockV(reg);
			tempxregs[i] = fpr.VX(reg);
		} else {
			fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ?
MAP_DIRTY : MAP_NOINIT);
			fpr.SpillLockV(dregs[i]);
			tempxregs[i] = fpr.VX(dregs[i]);
		}
	}

	int subop = (op >> 16) & 0x1F;
	if (subop == 3) {
		// vbfy2
		// d = { s0+s2, s1+s3, s0-s2, s1-s3 }
		MOVSS(tempxregs[0], fpr.V(sregs[0]));
		MOVSS(tempxregs[1], fpr.V(sregs[1]));
		MOVSS(tempxregs[2], fpr.V(sregs[0]));
		MOVSS(tempxregs[3], fpr.V(sregs[1]));
		ADDSS(tempxregs[0], fpr.V(sregs[2]));
		ADDSS(tempxregs[1], fpr.V(sregs[3]));
		SUBSS(tempxregs[2], fpr.V(sregs[2]));
		SUBSS(tempxregs[3], fpr.V(sregs[3]));
	} else if (subop == 2) {
		// vbfy1
		// d = { s0+s1, s0-s1 } (and likewise for the upper pair when n == 4).
		MOVSS(tempxregs[0], fpr.V(sregs[0]));
		MOVSS(tempxregs[1], fpr.V(sregs[0]));
		ADDSS(tempxregs[0], fpr.V(sregs[1]));
		SUBSS(tempxregs[1], fpr.V(sregs[1]));
		if (n == 4) {
			MOVSS(tempxregs[2], fpr.V(sregs[2]));
			MOVSS(tempxregs[3], fpr.V(sregs[2]));
			ADDSS(tempxregs[2], fpr.V(sregs[3]));
			SUBSS(tempxregs[3], fpr.V(sregs[3]));
		}
	} else {
		DISABLE;
	}

	for (int i = 0; i < n; ++i) {
		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}

// Wrapper so a float argument can be received as its raw 32-bit pattern on
// 32-bit builds (cdecl passes it on the stack - see SinCosArg below).
union u32float {
	u32 u;
	float f;

	operator float() const {
		return f;
	}

	inline u32float &operator *=(const float &other) {
		f *= other;
		return *this;
	}
};

#if PPSSPP_ARCH(AMD64)
typedef float SinCosArg;
#else
typedef u32float SinCosArg;
#endif

// Helper thunks invoked from JIT-generated code; each writes its result(s)
// into the output array (sin -> output[0], cos -> output[1]).
void SinCos(SinCosArg angle, float *output) {
	vfpu_sincos(angle, output[0], output[1]);
}

void SinOnly(SinCosArg angle, float *output) {
	output[0] = vfpu_sin(angle);
}

void NegSinOnly(SinCosArg angle, float *output) {
	output[0] = -vfpu_sin(angle);
}

void CosOnly(SinCosArg angle, float *output) {
	output[1] = vfpu_cos(angle);
}

void ASinScaled(SinCosArg sine, float *output) {
	output[0] = vfpu_asin(sine);
}

void SinCosNegSin(SinCosArg angle, float *output) {
	vfpu_sincos(angle, output[0], output[1]);
	output[0] = -output[0];
}

void Exp2(SinCosArg arg, float *output) {
	output[0] = vfpu_exp2(arg);
}

void Log2(SinCosArg arg, float *output) {
	output[0] = vfpu_log2(arg);
}

void RExp2(SinCosArg arg, float *output) {
	output[0] = vfpu_rexp2(arg);
}

// VV2Op: the one-operand vector ops (vmov/vabs/vneg/vsat0/vsat1/vrcp/vrsq/
// vsin/vcos/vexp2/vlog2/vsqrt/vasin/vnrcp/vnsin/vrexp2), dispatched on bits
// 16-20 of the opcode.  vmov/vabs/vneg get a whole-vector SIMD fast path;
// the transcendental ones call out to the helper thunks above.
void Jit::Comp_VV2Op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix())
		DISABLE;

	// Emits a call to one of the SinCosArg helpers above, passing sreg and the
	// address of mips_->sincostemp as arguments per the platform ABI.
	auto specialFuncCallHelper = [this](void (*specialFunc)(SinCosArg, float *output), u8 sreg) {
#if PPSSPP_ARCH(AMD64)
		MOVSS(XMM0, fpr.V(sreg));
		// TODO: This reg might be different on Linux...
#ifdef _WIN32
		LEA(64, RDX, MIPSSTATE_VAR(sincostemp[0]));
#else
		LEA(64, RDI, MIPSSTATE_VAR(sincostemp[0]));
#endif
		ABI_CallFunction(thunks.ProtectFunction((const void *)specialFunc, 0));
#else
		// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
		if (fpr.V(sreg).IsSimpleReg()) {
			MOVD_xmm(R(EAX), fpr.VX(sreg));
		} else {
			MOV(32, R(EAX), fpr.V(sreg));
		}
		CallProtectedFunction((const void *)specialFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
#endif
	};

	// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
	if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
		return;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	bool canSIMD = false;
	// Some can be SIMD'd.
	switch ((op >> 16) & 0x1f) {
	case 0:  // vmov
	case 1:  // vabs
	case 2:  // vneg
		canSIMD = true;
		break;
	}

	if (canSIMD && fpr.TryMapDirtyInVS(dregs, sz, sregs, sz)) {
		switch ((op >> 16) & 0x1f) {
		case 0:  // vmov
			MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
			break;
		case 1:  // vabs
			if (dregs[0] != sregs[0])
				MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
			if (RipAccessible(&noSignMask))
{
				ANDPS(fpr.VSX(dregs), M(&noSignMask));  // rip accessible
			} else {
				MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
				ANDPS(fpr.VSX(dregs), MatR(TEMPREG));
			}
			break;
		case 2:  // vneg
			if (dregs[0] != sregs[0])
				MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
			if (RipAccessible(&signBitAll)) {
				XORPS(fpr.VSX(dregs), M(&signBitAll));  // rip accessible
			} else {
				MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitAll));
				XORPS(fpr.VSX(dregs), MatR(TEMPREG));
			}
			break;
		}
		ApplyPrefixD(dregs, sz);
		fpr.ReleaseSpillLocks();
		return;
	}

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Pick temporaries where the destination would clobber a still-needed source.
	X64Reg tempxregs[4];
	for (int i = 0; i < n; ++i)
	{
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs))
		{
			int reg = fpr.GetTempV();
			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
			fpr.SpillLockV(reg);
			tempxregs[i] = fpr.VX(reg);
		}
		else
		{
			fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
			fpr.SpillLockV(dregs[i]);
			tempxregs[i] = fpr.VX(dregs[i]);
		}
	}

	// Warning: sregs[i] and tempxregs[i] may be the same reg.
	// Helps for vmov, hurts for vrcp, etc.
	for (int i = 0; i < n; ++i)
	{
		switch ((op >> 16) & 0x1f)
		{
		case 0: // d[i] = s[i]; break; //vmov
			// Probably for swizzle.
			if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
				MOVSS(tempxregs[i], fpr.V(sregs[i]));
			break;
		case 1: // d[i] = fabsf(s[i]); break; //vabs
			if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
				MOVSS(tempxregs[i], fpr.V(sregs[i]));
			if (RipAccessible(&noSignMask)) {
				ANDPS(tempxregs[i], M(&noSignMask));  // rip accessible
			} else {
				MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
				ANDPS(tempxregs[i], MatR(TEMPREG));
			}
			break;
		case 2: // d[i] = -s[i]; break; //vneg
			if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
				MOVSS(tempxregs[i], fpr.V(sregs[i]));
			if (RipAccessible(&signBitLower)) {
				XORPS(tempxregs[i], M(&signBitLower));  // rip accessible
			} else {
				MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
				XORPS(tempxregs[i], MatR(TEMPREG));
			}
			break;
		case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;    // vsat0
			if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
				MOVSS(tempxregs[i], fpr.V(sregs[i]));

			// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
			MOVSS(R(XMM0), tempxregs[i]);
			XORPS(XMM1, R(XMM1));
			CMPLESS(XMM0, R(XMM1));
			ANDNPS(XMM0, R(tempxregs[i]));

			// Retain a NAN in XMM0 (must be second operand.)
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
			MOVSS(tempxregs[i], MatR(TEMPREG));
			MINSS(tempxregs[i], R(XMM0));
			break;
		case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;  // vsat1
			if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
				MOVSS(tempxregs[i], fpr.V(sregs[i]));

			// Check for < -1.0f, but careful of NANs.
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
			MOVSS(XMM1, MatR(TEMPREG));
			MOVSS(R(XMM0), tempxregs[i]);
			CMPLESS(XMM0, R(XMM1));
			// If it was NOT less, the three ops below do nothing.
			// Otherwise, they replace the value with -1.0f.
			ANDPS(XMM1, R(XMM0));
			ANDNPS(XMM0, R(tempxregs[i]));
			ORPS(XMM0, R(XMM1));

			// Retain a NAN in XMM0 (must be second operand.)
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
			MOVSS(tempxregs[i], MatR(TEMPREG));
			MINSS(tempxregs[i], R(XMM0));
			break;
		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
			if (RipAccessible(&one)) {
				MOVSS(XMM0, M(&one));  // rip accessible
			} else {
				MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
				MOVSS(XMM0, MatR(TEMPREG));
			}
			DIVSS(XMM0, fpr.V(sregs[i]));
			MOVSS(tempxregs[i], R(XMM0));
			break;
		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
			SQRTSS(XMM0, fpr.V(sregs[i]));
			if (RipAccessible(&one)) {
				MOVSS(tempxregs[i], M(&one));  // rip accessible
			} else {
				MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
				MOVSS(tempxregs[i], MatR(TEMPREG));
			}
			DIVSS(tempxregs[i], R(XMM0));
			break;
		case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
			specialFuncCallHelper(&SinOnly, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
			break;
		case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
			specialFuncCallHelper(&CosOnly, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[1]));
			break;
		case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
			specialFuncCallHelper(&Exp2, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
			break;
		case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
			specialFuncCallHelper(&Log2, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
			break;
		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
			SQRTSS(tempxregs[i], fpr.V(sregs[i]));
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
			ANDPS(tempxregs[i], MatR(TEMPREG));
			break;
		case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
			specialFuncCallHelper(&ASinScaled, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
			break;
		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
			// Rare so let's not bother checking for RipAccessible.
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
			MOVSS(XMM0, MatR(TEMPREG));
			DIVSS(XMM0, fpr.V(sregs[i]));
			MOVSS(tempxregs[i], R(XMM0));
			break;
		case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
			specialFuncCallHelper(&NegSinOnly, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
			break;
		case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
			specialFuncCallHelper(&RExp2, sregs[i]);
			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
			break;
		}
	}
	for (int i = 0; i < n; ++i)
	{
		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}

void
Jit::Comp_Mftv(MIPSOpcode op) {
	// mfv/mfvc (op field 3) and mtv/mtvc (op field 7): moves between a GPR (rt)
	// and a VFPU register (imm < 128) or a VFPU control register (imm >= 128).
	CONDITIONAL_DISABLE(VFPU_XFER);

	int imm = op & 0xFF;
	MIPSGPReg rt = _RT;
	switch ((op >> 21) & 0x1f)
	{
	case 3: //mfv / mfvc
		// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
		if (rt != MIPS_REG_ZERO) {
			if (imm < 128) {  //R(rt) = VI(imm);
				fpr.SimpleRegV(imm, 0);
				if (fpr.V(imm).IsSimpleReg()) {
					fpr.MapRegV(imm, 0);
					gpr.MapReg(rt, false, true);
					MOVD_xmm(gpr.R(rt), fpr.VX(imm));
				} else {
					// Let's not bother mapping the vreg.
					gpr.MapReg(rt, false, true);
					MOV(32, gpr.R(rt), fpr.V(imm));
				}
			} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
				if (imm - 128 == VFPU_CTRL_CC) {
					if (gpr.IsImm(MIPS_REG_VFPUCC)) {
						gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
					} else {
						gpr.Lock(rt, MIPS_REG_VFPUCC);
						gpr.MapReg(rt, false, true);
						gpr.MapReg(MIPS_REG_VFPUCC, true, false);
						MOV(32, gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
						gpr.UnlockAll();
					}
				} else {
					// In case we have a saved prefix.
					FlushPrefixV();
					gpr.MapReg(rt, false, true);
					MOV(32, gpr.R(rt), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128));
				}
			} else {
				//ERROR - maybe need to make this value too an "interlock" value?
				_dbg_assert_msg_(false,"mfv - invalid register");
			}
		}
		break;

	case 7: //mtv
		if (imm < 128) { // VI(imm) = R(rt);
			fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
			// Let's not bother mapping rt if we don't have to.
			if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
				XORPS(fpr.VX(imm), fpr.V(imm));
			} else {
				gpr.KillImmediate(rt, true, false);
				MOVD_xmm(fpr.VX(imm), gpr.R(rt));
			}
		} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
			if (imm - 128 == VFPU_CTRL_CC) {
				if (gpr.IsImm(rt)) {
					gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
				} else {
					gpr.Lock(rt, MIPS_REG_VFPUCC);
					gpr.MapReg(rt, true, false);
					gpr.MapReg(MIPS_REG_VFPUCC, false, true);
					MOV(32, gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
					gpr.UnlockAll();
				}
			} else {
				gpr.MapReg(rt, true, false);
				MOV(32, MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128), gpr.R(rt));
			}

			// TODO: Optimization if rt is Imm?
			// Writing a prefix control reg invalidates the statically-known prefix state.
			if (imm - 128 == VFPU_CTRL_SPREFIX) {
				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
				js.blockWrotePrefixes = true;
			}
		} else {
			//ERROR
			_dbg_assert_msg_(false,"mtv - invalid register");
		}
		break;

	default:
		DISABLE;
	}
}

// vmfvc: copy a VFPU control register into VFPU register vd.
// Out-of-range control indices read as zero.
void Jit::Comp_Vmfvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	int vd = _VD;
	int imm = (op >> 8) & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
		if (imm == VFPU_CTRL_CC) {
			gpr.MapReg(MIPS_REG_VFPUCC, true, false);
			MOVD_xmm(fpr.VX(vd), gpr.R(MIPS_REG_VFPUCC));
		} else {
			MOVSS(fpr.VX(vd), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm));
		}
		fpr.ReleaseSpillLocks();
	} else {
		fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
		XORPS(fpr.VX(vd), fpr.V(vd));
		fpr.ReleaseSpillLocks();
	}
}

// vmtvc: copy VFPU register vs into a VFPU control register.
// Writing a prefix register invalidates the statically-known prefix state.
void Jit::Comp_Vmtvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	int vs = _VS;
	int imm = op & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		fpr.MapRegV(vs, 0);
		if (imm == VFPU_CTRL_CC) {
			gpr.MapReg(MIPS_REG_VFPUCC, false, true);
			MOVD_xmm(gpr.R(MIPS_REG_VFPUCC), fpr.VX(vs));
		} else {
			MOVSS(MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm), fpr.VX(vs));
		}
		fpr.ReleaseSpillLocks();

		if (imm == VFPU_CTRL_SPREFIX) {
			js.prefixSFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_TPREFIX) {
			js.prefixTFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		} else if (imm == VFPU_CTRL_DPREFIX) {
			js.prefixDFlag = JitState::PREFIX_UNKNOWN;
			js.blockWrotePrefixes = true;
		}
	}
}

// Matrix initializers: vmidt (identity, field 3), vmzero (field 6), vmone (field 7).
void Jit::Comp_VMatrixInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	if (js.HasUnknownPrefix())
		DISABLE;

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	// Not really about trying here, it will work if enabled.
	if (jo.enableVFPUSIMD) {
		VectorSize vsz = GetVectorSize(sz);
		u8 vecs[4];
		GetMatrixColumns(_VD, sz, vecs);
		switch ((op >> 16) & 0xF) {
		case 3:
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[0]));
			break;
		case 7:
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
			MOVAPS(XMM0, MatR(TEMPREG));
			break;
		}

		for (int i = 0; i < n; i++) {
			u8 vec[4];
			GetVectorRegs(vec, vsz, vecs[i]);
			fpr.MapRegsVS(vec, vsz, MAP_NOINIT | MAP_DIRTY);
			switch ((op >> 16) & 0xF) {
			case 3:
				MOVAPS(fpr.VSX(vec), MDisp(TEMPREG, 16 * i));
				break;
			case 6:
				XORPS(fpr.VSX(vec), fpr.VS(vec));
				break;
			case 7:
				MOVAPS(fpr.VSX(vec), R(XMM0));
				break;
			}
		}
		fpr.ReleaseSpillLocks();
		return;
	}

	u8 dregs[16];
	GetMatrixRegs(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	switch ((op >> 16) & 0xF) {
	case 3: // vmidt
		XORPS(XMM0, R(XMM0));
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
		MOVSS(XMM1, MatR(TEMPREG));
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				MOVSS(fpr.V(dregs[a * 4 + b]), a == b ?
XMM1 : XMM0);
			}
		}
		break;
	case 6: // vmzero
		XORPS(XMM0, R(XMM0));
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
			}
		}
		break;
	case 7: // vmone
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
		MOVSS(XMM0, MatR(TEMPREG));
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
			}
		}
		break;
	}

	fpr.ReleaseSpillLocks();
}

// vmmov: copy matrix VS to matrix VD, using temps when the matrices overlap.
void Jit::Comp_Vmmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);

	// TODO: This probably ignores prefixes?
	if (js.HasUnknownPrefix())
		DISABLE;

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	if (jo.enableVFPUSIMD) {
		VectorSize vsz = GetVectorSize(sz);
		u8 dest[4][4];
		MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);

		u8 vecs[4];
		if (overlap == OVERLAP_NONE) {
			GetMatrixColumns(_VD, sz, vecs);
			for (int i = 0; i < n; ++i) {
				GetVectorRegs(dest[i], vsz, vecs[i]);
			}
		} else {
			// Overlap: copy via temp column vectors first.
			for (int i = 0; i < n; ++i) {
				fpr.GetTempVS(dest[i], vsz);
			}
		}

		GetMatrixColumns(_VS, sz, vecs);
		for (int i = 0; i < n; i++) {
			u8 vec[4];
			GetVectorRegs(vec, vsz, vecs[i]);
			fpr.MapRegsVS(vec, vsz, 0);
			fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
			MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
			fpr.ReleaseSpillLocks();
		}

		if (overlap != OVERLAP_NONE) {
			// Okay, move from the temps to VD now.
			GetMatrixColumns(_VD, sz, vecs);
			for (int i = 0; i < n; i++) {
				u8 vec[4];
				GetVectorRegs(vec, vsz, vecs[i]);
				fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
				fpr.MapRegsVS(dest[i], vsz, 0);
				MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
				fpr.ReleaseSpillLocks();
			}
		}

		fpr.ReleaseSpillLocks();
		return;
	}

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, _VS);
	GetMatrixRegs(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// TODO: gas doesn't allow overlap, what does the PSP do?
	// Potentially detect overlap or the safe direction to move in, or just DISABLE?
	// This is very not optimal, blows the regcache everytime.
	u8 tempregs[16];
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			u8 temp = (u8) fpr.GetTempV();
			fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
			MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
			fpr.StoreFromRegisterV(temp);
			tempregs[a * 4 + b] = temp;
		}
	}
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			u8 temp = tempregs[a * 4 + b];
			fpr.MapRegV(temp, 0);
			MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
		}
	}

	fpr.ReleaseSpillLocks();
}

// vscl: multiply every element of vector VS by the single scalar VT, into VD.
void Jit::Comp_VScl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4], scale;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(&scale, V_Single, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, &scale, V_Single, true)) {
		MOVSS(XMM0, fpr.VS(&scale));
		if (sz != V_Single)
			SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
		if (dregs[0] != sregs[0]) {
			MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
		}
		MULPS(fpr.VSX(dregs), R(XMM0));
		ApplyPrefixD(dregs, sz);
		fpr.ReleaseSpillLocks();
		return;
	}

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(&scale, V_Single, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Move to XMM0 early, so we don't have to worry about overlap with scale.
	MOVSS(XMM0, fpr.V(scale));

	X64Reg tempxregs[4];
	for (int i = 0; i < n; ++i) {
		if (dregs[i] != scale || !IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			int reg = fpr.GetTempV();
			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
			fpr.SpillLockV(reg);
			tempxregs[i] = fpr.VX(reg);
		} else {
			fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
			fpr.SpillLockV(dregs[i]);
			tempxregs[i] = fpr.VX(dregs[i]);
		}
	}
	for (int i = 0; i < n; ++i) {
		if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(tempxregs[i], fpr.V(sregs[i]));
		MULSS(tempxregs[i], R(XMM0));
	}
	for (int i = 0; i < n; ++i) {
		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
	}
	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}

// vmmul: matrix * matrix multiply (VD = VS^T * VT in PSP convention).
void Jit::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	MatrixSize sz = GetMtxSize(op);
	VectorSize vsz = GetVectorSize(sz);
	int n = GetMatrixSide(sz);

	MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz);
	MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz);
	// If these overlap, we won't be able to map T as singles.
	MatrixOverlapType stoverlap = GetMatrixOverlap(_VS, _VT, sz);

	if (jo.enableVFPUSIMD && !soverlap && !toverlap && !stoverlap) {
		u8 scols[4], dcols[4], tregs[16];

		int vs = _VS;
		int vd = _VD;
		int vt = _VT;

		bool transposeDest = false;
		bool transposeS = false;

		if ((vd & 0x20) && sz == M_4x4) {
			vd ^= 0x20;
			transposeDest = true;
		}

		// Our algorithm needs a transposed S (which is the usual).
		if (!(vs & 0x20) && sz == M_4x4) {
			vs ^= 0x20;
			transposeS = true;
		}

		// The T matrix we will address individually.
		GetMatrixColumns(vd, sz, dcols);
		GetMatrixRows(vs, sz, scols);
		memset(tregs, 255, sizeof(tregs));
		GetMatrixRegs(tregs, sz, vt);
		for (int i = 0; i < 16; i++) {
			if (tregs[i] !=
255)
				fpr.StoreFromRegisterV(tregs[i]);
		}

		u8 scol[4][4];

		// Map all of S's columns into registers.
		for (int i = 0; i < n; i++) {
			if (transposeS){
				fpr.StoreFromRegisterV(scols[i]);
			}
			GetVectorRegs(scol[i], vsz, scols[i]);
			fpr.MapRegsVS(scol[i], vsz, 0);
			fpr.SpillLockV(scols[i], vsz);
		}

		// 4x4 in-register transpose via unpack instructions.
		// Shorter than manually stuffing the registers. But it feels like there's room for optimization here...
		auto transposeInPlace = [=](u8 col[4][4]) {
			MOVAPS(XMM0, fpr.VS(col[0]));
			UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[2]));
			UNPCKHPS(XMM0, fpr.VS(col[2]));

			MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
			UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[3]));
			UNPCKHPS(fpr.VSX(col[2]), fpr.VS(col[3]));

			MOVAPS(fpr.VSX(col[3]), fpr.VS(col[0]));
			UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[1]));
			UNPCKHPS(fpr.VSX(col[3]), fpr.VS(col[1]));

			MOVAPS(fpr.VSX(col[1]), R(XMM0));
			UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[2]));
			UNPCKHPS(XMM0, fpr.VS(col[2]));

			MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
			MOVAPS(fpr.VSX(col[1]), fpr.VS(col[3]));
			MOVAPS(fpr.VSX(col[3]), R(XMM0));
		};

		// Some games pass in S as an E matrix (transposed). Let's just transpose the data before we do the multiplication instead.
		// This is shorter than trying to combine a discontinuous matrix with lots of shufps.
		if (transposeS) {
			transposeInPlace(scol);
		}

		// Now, work our way through the matrix, loading things as we go.
		// TODO: With more temp registers, can generate much more efficient code.
		for (int i = 0; i < n; i++) {
			MOVSS(XMM1, fpr.V(tregs[4 * i]));  // TODO: AVX broadcastss to replace this and the SHUFPS
			MOVSS(XMM0, fpr.V(tregs[4 * i + 1]));
			SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
			SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
			MULPS(XMM1, fpr.VS(scol[0]));
			MULPS(XMM0, fpr.VS(scol[1]));
			ADDPS(XMM1, R(XMM0));
			for (int j = 2; j < n; j++) {
				MOVSS(XMM0, fpr.V(tregs[4 * i + j]));
				SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
				MULPS(XMM0, fpr.VS(scol[j]));
				ADDPS(XMM1, R(XMM0));
			}
			// Map the D column.
			u8 dcol[4];
			GetVectorRegs(dcol, vsz, dcols[i]);
#if !PPSSPP_ARCH(AMD64)
			fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT | MAP_NOLOCK);
#else
			fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT);
#endif
			MOVAPS(fpr.VS(dcol), XMM1);
		}
		if (transposeS){
			// The transposed-in-place S columns no longer hold their registers' values.
			for (int i = 0; i < n; i++){
				fpr.DiscardVS(scols[i]);
			}
		}

#if !PPSSPP_ARCH(AMD64)
		fpr.ReleaseSpillLocks();
#endif
		if (transposeDest) {
			u8 dcol[4][4];
			for (int i = 0; i < n; i++) {
				GetVectorRegs(dcol[i], vsz, dcols[i]);
				fpr.MapRegsVS(dcol[i], vsz, MAP_DIRTY);
			}
			transposeInPlace(dcol);
		}
		fpr.ReleaseSpillLocks();
		return;
	}

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, _VS);
	GetMatrixRegs(tregs, sz, _VT);
	GetMatrixRegs(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(tregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Rough overlap check.
	bool overlap = false;
	if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) {
		// Potential overlap (guaranteed for 3x3 or more).
		overlap = true;
	}

	if (overlap) {
		// Compute into temps first, then write out, so overlapping inputs stay intact.
		u8 tempregs[16];
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				MOVSS(XMM0, fpr.V(sregs[b * 4]));
				MULSS(XMM0, fpr.V(tregs[a * 4]));
				for (int c = 1; c < n; c++) {
					MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
					MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
					ADDSS(XMM0, R(XMM1));
				}
				u8 temp = (u8) fpr.GetTempV();
				fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
				MOVSS(fpr.VX(temp), R(XMM0));
				fpr.StoreFromRegisterV(temp);
				tempregs[a * 4 + b] = temp;
			}
		}
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				u8 temp = tempregs[a * 4 + b];
				fpr.MapRegV(temp, 0);
				MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
			}
		}
	} else {
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				MOVSS(XMM0, fpr.V(sregs[b * 4]));
				MULSS(XMM0, fpr.V(tregs[a * 4]));
				for (int c = 1; c < n; c++) {
					MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
					MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
					ADDSS(XMM0, R(XMM1));
				}
				MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
			}
		}
	}
	fpr.ReleaseSpillLocks();
}

// vmscl: multiply every element of matrix VS by the single scalar VT, into VD.
void Jit::Comp_Vmscl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);

	// TODO: This op probably ignores prefixes?
	if (js.HasUnknownPrefix())
		DISABLE;

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16], scale;
	GetMatrixRegs(sregs, sz, _VS);
	GetVectorRegs(&scale, V_Single, _VT);
	GetMatrixRegs(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(&scale, V_Single, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Move to XMM0 early, so we don't have to worry about overlap with scale.
	MOVSS(XMM0, fpr.V(scale));

	// TODO: test overlap, optimize.
	u8 tempregs[16];
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			u8 temp = (u8)
fpr.GetTempV();
			fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
			MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
			MULSS(fpr.VX(temp), R(XMM0));
			fpr.StoreFromRegisterV(temp);
			tempregs[a * 4 + b] = temp;
		}
	}
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			u8 temp = tempregs[a * 4 + b];
			fpr.MapRegV(temp, 0);
			MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
		}
	}

	fpr.ReleaseSpillLocks();
}

// vtfm/vhtfm: matrix * vector transform; the homogenous form treats the
// missing last element of T as 1.0 (the final column is added unscaled).
void Jit::Comp_Vtfm(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);

	// TODO: This probably ignores prefixes? Or maybe uses D?
	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz)+1);
		msz = (MatrixSize)((int)(msz)+1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}

	if (jo.enableVFPUSIMD) {
		u8 scols[4], dcol[4], tregs[4];

		int vs = _VS;
		int vd = _VD;
		int vt = _VT;  // vector!

		// The T matrix we will address individually.
		GetVectorRegs(dcol, sz, vd);
		GetMatrixRows(vs, msz, scols);
		GetVectorRegs(tregs, sz, vt);
		for (int i = 0; i < n; i++) {
			fpr.StoreFromRegisterV(tregs[i]);
		}

		// We need the T regs in individual regs, but they could overlap with S regs.
		// If that happens, we copy the T reg to a temp.
		auto flushConflictingTRegsToTemps = [&](u8 regs[4]) {
			for (int i = 0; i < n; ++i) {
				for (int j = 0; j < n; ++j) {
					if (regs[i] != tregs[j]) {
						continue;
					}

					// They match. Let's replace this treg with a temp reg.
					// Note that it will spill if there's contention, unfortunately...
					tregs[j] = fpr.GetTempV();
					fpr.MapRegV(tregs[j], MAP_NOINIT);
					MOVSS(fpr.VX(tregs[j]), fpr.V(regs[i]));
				}
			}
		};

		u8 scol[4][4];

		// Map all of S's columns into registers.
		for (int i = 0; i < n; i++) {
			GetVectorRegs(scol[i], sz, scols[i]);
			flushConflictingTRegsToTemps(scol[i]);
			fpr.MapRegsVS(scol[i], sz, 0);
		}

		// Now, work our way through the matrix, loading things as we go.
		// TODO: With more temp registers, can generate much more efficient code.
		MOVSS(XMM1, fpr.V(tregs[0]));  // TODO: AVX broadcastss to replace this and the SHUFPS (but take care of temps, unless we force store them.)
		SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
		MULPS(XMM1, fpr.VS(scol[0]));
		for (int j = 1; j < n; j++) {
			if (!homogenous || j != n - 1) {
				MOVSS(XMM0, fpr.V(tregs[j]));
				SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
				MULPS(XMM0, fpr.VS(scol[j]));
				ADDPS(XMM1, R(XMM0));
			} else {
				// Homogenous last element: implicit 1.0, just add the column.
				ADDPS(XMM1, fpr.VS(scol[j]));
			}
		}
		// Map the D column. Release first in case of overlap.
		for (int i = 0; i < n; i++) {
			fpr.ReleaseSpillLockV(scol[i], sz);
		}
		fpr.MapRegsVS(dcol, sz, MAP_DIRTY | MAP_NOINIT);
		MOVAPS(fpr.VS(dcol), XMM1);
		fpr.ReleaseSpillLocks();
		return;
	}

	u8 sregs[16], dregs[4], tregs[4];
	GetMatrixRegs(sregs, msz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, msz, 0);
	fpr.SimpleRegsV(tregs, sz, 0);
	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// TODO: test overlap, optimize.
	u8 tempregs[4];
	for (int i = 0; i < n; i++) {
		MOVSS(XMM0, fpr.V(sregs[i * 4]));
		MULSS(XMM0, fpr.V(tregs[0]));
		for (int k = 1; k < n; k++)
		{
			MOVSS(XMM1, fpr.V(sregs[i * 4 + k]));
			if (!homogenous || k != n - 1)
				MULSS(XMM1, fpr.V(tregs[k]));
			ADDSS(XMM0, R(XMM1));
		}

		u8 temp = (u8) fpr.GetTempV();
		fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
		MOVSS(fpr.VX(temp), R(XMM0));
		fpr.StoreFromRegisterV(temp);
		tempregs[i] = temp;
	}
	for (int i = 0; i < n; i++) {
		u8 temp = tempregs[i];
		fpr.MapRegV(temp, 0);
		MOVSS(fpr.V(dregs[i]), fpr.VX(temp));
	}

	fpr.ReleaseSpillLocks();
}

// vcrs: not implemented in this backend, falls back to the interpreter.
void Jit::Comp_VCrs(MIPSOpcode op) {
	DISABLE;
}

// vdet: not implemented in this backend, falls back to the interpreter.
void Jit::Comp_VDet(MIPSOpcode op) {
	DISABLE;
}

// PSHUFB control masks for vi2x packing (-1 lanes become zero).
// The goal is to map (reversed byte order for clarity):
// 000000AA 000000BB 000000CC 000000DD -> AABBCCDD
alignas(16) static const s8 vi2xc_shuffle[16] = { 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
// 0000AAAA 0000BBBB 0000CCCC 0000DDDD -> AAAABBBB CCCCDDDD
alignas(16) static const s8 vi2xs_shuffle[16] = { 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 };

// vi2uc/vi2c/vi2us/vi2s: pack integer lanes into bytes or halfwords.
void Jit::Comp_Vi2x(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ?
8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

	// These instructions pack pairs or quads of integers into 32 bits.
	// The unsigned (u) versions skip the sign bit when packing.

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);

	// Flush SIMD.
	fpr.SimpleRegsV(sregs, sz, 0);
	fpr.SimpleRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);

	// First, let's assemble the sregs into lanes of a single xmm reg.
	// For quad inputs, we need somewhere for the bottom regs. Ideally dregs[0].
	X64Reg dst0 = XMM0;
	if (sz == V_Quad) {
		int vreg = dregs[0];
		if (!IsOverlapSafeAllowS(dregs[0], 0, 4, sregs)) {
			// Will be discarded on release.
			vreg = fpr.GetTempV();
		}
		fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
		fpr.SpillLockV(vreg);
		dst0 = fpr.VX(vreg);
	} else {
		// Pair, let's check if we should use dregs[0] directly. No temp needed.
		int vreg = dregs[0];
		if (IsOverlapSafeAllowS(dregs[0], 0, 2, sregs)) {
			fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
			fpr.SpillLockV(vreg);
			dst0 = fpr.VX(vreg);
		}
	}

	if (!fpr.V(sregs[0]).IsSimpleReg(dst0)) {
		MOVSS(dst0, fpr.V(sregs[0]));
	}
	MOVSS(XMM1, fpr.V(sregs[1]));
	// With this, we have the lower half in dst0.
	PUNPCKLDQ(dst0, R(XMM1));
	if (sz == V_Quad) {
		MOVSS(XMM0, fpr.V(sregs[2]));
		MOVSS(XMM1, fpr.V(sregs[3]));
		PUNPCKLDQ(XMM0, R(XMM1));
		// Now we need to combine XMM0 into dst0.
		PUNPCKLQDQ(dst0, R(XMM0));
	} else {
		// Otherwise, we need to zero out the top 2.
		// We expect XMM1 to be zero below.
		PXOR(XMM1, R(XMM1));
		PUNPCKLQDQ(dst0, R(XMM1));
	}

	// For "u" type ops, we clamp to zero and shift off the sign bit first.
	if (unsignedOp) {
		if (cpu_info.bSSE4_1) {
			if (sz == V_Quad) {
				// Zeroed in the other case above.
				PXOR(XMM1, R(XMM1));
			}
			PMAXSD(dst0, R(XMM1));
			PSLLD(dst0, 1);
		} else {
			// Get a mask of the sign bit in dst0, then and in the values. This clamps to 0.
			MOVDQA(XMM1, R(dst0));
			PSRAD(dst0, 31);
			PSLLD(XMM1, 1);
			PANDN(dst0, R(XMM1));
		}
	}

	// At this point, everything is aligned in the high bits of our lanes.
	if (cpu_info.bSSSE3) {
		if (RipAccessible(vi2xc_shuffle)) {
			PSHUFB(dst0, bits == 8 ? M(vi2xc_shuffle) : M(vi2xs_shuffle));  // rip accessible
		} else {
			MOV(PTRBITS, R(TEMPREG), bits == 8 ? ImmPtr(vi2xc_shuffle) : ImmPtr(vi2xs_shuffle));
			PSHUFB(dst0, MatR(TEMPREG));
		}
	} else {
		// Let's *arithmetically* shift in the sign so we can use saturating packs.
		PSRAD(dst0, 32 - bits);
		// XMM1 used for the high part just so there's no dependency. It contains garbage or 0.
		PACKSSDW(dst0, R(XMM1));
		if (bits == 8) {
			PACKSSWB(dst0, R(XMM1));
		}
	}

	if (!fpr.V(dregs[0]).IsSimpleReg(dst0)) {
		MOVSS(fpr.V(dregs[0]), dst0);
	}
	if (outsize == V_Pair) {
		fpr.MapRegV(dregs[1], MAP_NOINIT | MAP_DIRTY);
		MOVDQA(fpr.V(dregs[1]), dst0);
		// Shift out the lower result to get the result we want.
		PSRLDQ(fpr.VX(dregs[1]), 4);
	}

	ApplyPrefixD(dregs, outsize);
	fpr.ReleaseSpillLocks();
}

// Reciprocal-of-count table used by vavg: 1/1, 1/2, 1/3, 1/4.
alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };

// vfad (sum of elements) and vavg (average of elements) into single VD.
void Jit::Comp_Vhoriz(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, V_Single, _VD);
	if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
		if (cpu_info.bSSE4_1) {
			MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
			switch (sz) {
			case V_Pair:
				MOVAPS(XMM0, fpr.VS(sregs));
				DPPS(XMM0, MatR(TEMPREG), 0x31);
				MOVAPS(fpr.VSX(dregs), R(XMM0));
				break;
			case V_Triple:
				MOVAPS(XMM0, fpr.VS(sregs));
				DPPS(XMM0, MatR(TEMPREG), 0x71);
				MOVAPS(fpr.VSX(dregs), R(XMM0));
				break;
			case V_Quad:
				XORPS(XMM1, R(XMM1));
				MOVAPS(XMM0, fpr.VS(sregs));
				DPPS(XMM0, MatR(TEMPREG), 0xF1);
				// In every other case, +0.0 is selected by the mask and added.
				// But, here we need to manually add it to the result.
				ADDPS(XMM0, R(XMM1));
				MOVAPS(fpr.VSX(dregs), R(XMM0));
				break;
			default:
				DISABLE;
			}
		} else {
			switch (sz) {
			case V_Pair:
				XORPS(XMM1, R(XMM1));
				MOVAPS(XMM0, fpr.VS(sregs));
				ADDPS(XMM1, R(XMM0));
				SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
				ADDPS(XMM0, R(XMM1));
				MOVAPS(fpr.VSX(dregs), R(XMM0));
				break;
			case V_Triple:
				XORPS(XMM1, R(XMM1));
				MOVAPS(XMM0, fpr.VS(sregs));
				ADDPS(XMM1,
R(XMM0));3415SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));3416ADDPS(XMM0, R(XMM1));3417SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 2));3418ADDPS(XMM0, R(XMM1));3419MOVAPS(fpr.VSX(dregs), R(XMM0));3420break;3421case V_Quad:3422XORPS(XMM1, R(XMM1));3423MOVAPS(XMM0, fpr.VS(sregs));3424// This flips the sign of any -0.000.3425ADDPS(XMM0, R(XMM1));3426MOVHLPS(XMM1, XMM0);3427ADDPS(XMM0, R(XMM1));3428MOVAPS(XMM1, R(XMM0));3429SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1, 1, 1, 1));3430ADDPS(XMM0, R(XMM1));3431MOVAPS(fpr.VSX(dregs), R(XMM0));3432break;3433default:3434DISABLE;3435}3436}3437if (((op >> 16) & 31) == 7) { // vavg3438MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));3439MULSS(fpr.VSX(dregs), MatR(TEMPREG));3440}3441ApplyPrefixD(dregs, V_Single);3442fpr.ReleaseSpillLocks();3443return;3444}34453446// Flush SIMD.3447fpr.SimpleRegsV(sregs, sz, 0);3448fpr.SimpleRegsV(dregs, V_Single, MAP_NOINIT | MAP_DIRTY);34493450X64Reg reg = XMM0;3451if (IsOverlapSafe(dregs[0], 0, n, sregs)) {3452fpr.MapRegV(dregs[0], dregs[0] == sregs[0] ? 
MAP_DIRTY : MAP_NOINIT);3453fpr.SpillLockV(dregs[0]);3454reg = fpr.VX(dregs[0]);3455}34563457// We have to start zt +0.000 in case any values are -0.000.3458XORPS(reg, R(reg));3459for (int i = 0; i < n; ++i) {3460ADDSS(reg, fpr.V(sregs[i]));3461}34623463switch ((op >> 16) & 31) {3464case 6: // vfad3465break;3466case 7: // vavg3467MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));3468MULSS(reg, MatR(TEMPREG));3469break;3470}34713472if (reg == XMM0) {3473MOVSS(fpr.V(dregs[0]), XMM0);3474}34753476ApplyPrefixD(dregs, V_Single);3477fpr.ReleaseSpillLocks();3478}34793480void Jit::Comp_Viim(MIPSOpcode op) {3481CONDITIONAL_DISABLE(VFPU_XFER);34823483if (js.HasUnknownPrefix())3484DISABLE;34853486u8 dreg;3487GetVectorRegs(&dreg, V_Single, _VT);34883489// Flush SIMD.3490fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);34913492s32 imm = SignExtend16ToS32(op);3493FP32 fp;3494fp.f = (float)imm;3495MOV(32, R(TEMPREG), Imm32(fp.u));3496fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);3497MOVD_xmm(fpr.VX(dreg), R(TEMPREG));34983499ApplyPrefixD(&dreg, V_Single);3500fpr.ReleaseSpillLocks();3501}35023503void Jit::Comp_Vfim(MIPSOpcode op) {3504CONDITIONAL_DISABLE(VFPU_XFER);35053506if (js.HasUnknownPrefix())3507DISABLE;35083509u8 dreg;3510GetVectorRegs(&dreg, V_Single, _VT);35113512// Flush SIMD.3513fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);35143515FP16 half;3516half.u = op & 0xFFFF;3517FP32 fval = half_to_float_fast5(half);3518MOV(32, R(TEMPREG), Imm32(fval.u));3519fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);3520MOVD_xmm(fpr.VX(dreg), R(TEMPREG));35213522ApplyPrefixD(&dreg, V_Single);3523fpr.ReleaseSpillLocks();3524}35253526void Jit::CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin) {3527char what[4] = { '0', '0', '0', '0' };3528if (((imm >> 2) & 3) == (imm & 3)) {3529for (int i = 0; i < 4; i++)3530what[i] = 'S';3531}3532what[(imm >> 2) & 3] = 'S';3533what[imm & 3] = 'C';35343535// TODO: shufps SIMD version35363537for (int i = 0; i < n; i++) 
// (Continuation: per-lane body of the CompVrotShuffle write loop begun above.)
	{
		fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
		switch (what[i]) {
		case 'C': MOVSS(fpr.V(dregs[i]), XMM1); break;
		case 'S':
			MOVSS(fpr.V(dregs[i]), XMM0);
			if (negSin) {
				// Flip the sign bit of the sin value we just wrote.
				if (RipAccessible(&signBitLower)) {
					XORPS(fpr.VX(dregs[i]), M(&signBitLower));  // rip accessible
				} else {
					MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
					XORPS(fpr.VX(dregs[i]), MatR(TEMPREG));
				}
			}
			break;
		case '0':
		{
			// Zero the lane (xor with itself).
			XORPS(fpr.VX(dregs[i]), fpr.V(dregs[i]));
			break;
		}
		default:
			ERROR_LOG(Log::JIT, "Bad what in vrot");
			break;
		}
	}
}

// Very heavily used by FF:CC
// vrot: computes sin/cos of a scalar angle at runtime (via a call out to
// SinCos/SinCosNegSin) and scatters the results into the destination vector
// through CompVrotShuffle. A directly following vrot that reads the same
// angle register is merged so the expensive sin/cos call happens only once.
void Jit::Comp_VRot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}
	if (!js.HasNoPrefix()) {
		// Prefixes work strangely for this, see IRCompVFPU.
		WARN_LOG_REPORT(Log::JIT, "vrot instruction using prefixes at %08x", GetCompilerPC());
		DISABLE;
	}

	int vd = _VD;
	int vs = _VS;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	u8 dregs2[4];

	// Peek at the next instruction: another vrot with the same source angle
	// register can share this one's sin/cos computation.
	MIPSOpcode nextOp = GetOffsetInstruction(1);
	int vd2 = -1;
	int imm2 = -1;
	if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
		// Pair of vrot with the same angle argument. Let's join them (can share sin/cos results).
		vd2 = MIPS_GET_VD(nextOp);
		imm2 = (nextOp >> 16) & 0x1f;
		// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
	}

	u8 sreg;
	GetVectorRegs(dregs, sz, vd);
	if (vd2 >= 0)
		GetVectorRegs(dregs2, sz, vd2);
	GetVectorRegs(&sreg, V_Single, vs);

	// Flush SIMD.
	fpr.SimpleRegsV(&sreg, V_Single, 0);

	int imm = (op >> 16) & 0x1f;

	// We're about to call out to a C function; all cached registers must be
	// written back first.
	gpr.FlushBeforeCall();
	fpr.Flush();

	// Bit 0x10 of the immediate requests a negated sin.
	bool negSin1 = (imm & 0x10) ? true : false;

#if PPSSPP_ARCH(AMD64)
	// SinCos(float angle, float *out): the angle travels in XMM0; the output
	// pointer lands in RDX on Win64 (second parameter slot) and in RDI on
	// SysV (first integer argument register).
#ifdef _WIN32
	LEA(64, RDX, MIPSSTATE_VAR(sincostemp));
#else
	LEA(64, RDI, MIPSSTATE_VAR(sincostemp));
#endif
	MOVSS(XMM0, fpr.V(sreg));
	ABI_CallFunction(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos);
#else
	// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
	ABI_CallFunctionAC(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos, fpr.V(sreg), (uintptr_t)mips_->sincostemp);
#endif

	// sincostemp[0] = sin, sincostemp[1] = cos (as consumed by CompVrotShuffle).
	MOVSS(XMM0, MIPSSTATE_VAR(sincostemp[0]));
	MOVSS(XMM1, MIPSSTATE_VAR(sincostemp[1]));

	// The first write never re-negates: when negSin1 is set we already called
	// SinCosNegSin above, so sincostemp[0] holds the negated sin.
	CompVrotShuffle(dregs, imm, n, false);
	if (vd2 != -1) {
		// If the negsin setting differs between the two joint invocations, we need to flip the second one.
		bool negSin2 = (imm2 & 0x10) ? true : false;
		CompVrotShuffle(dregs2, imm2, n, negSin1 != negSin2);
		EatInstruction(nextOp);
	}
	fpr.ReleaseSpillLocks();
}

// Color conversion (RGBA8888 -> 4444/5551/565 packing). Currently always
// handled by the interpreter: note the unconditional DISABLE below — the
// #if 0 region is only a sketch of a future SSE implementation.
void Jit::Comp_ColorConv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix())
		DISABLE;

	int vd = _VD;
	int vs = _VS;

	DISABLE;
#if 0
	VectorSize sz = V_Quad;
	int n = GetNumVectorElements(sz);

	switch ((op >> 16) & 3) {
	case 1:
		break;
	default:
		DISABLE;
	}

	u8 sregs[4];
	u8 dregs[1];
	// WARNING: Prefixes.
	GetVectorRegs(sregs, sz, vs);
	GetVectorRegs(dregs, V_Pair, vd);

	if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
		switch ((op >> 16) & 3) {
		case 1: // 4444
		{
			//int a = ((in >> 24) & 0xFF) >> 4;
			//int b = ((in >> 16) & 0xFF) >> 4;
			//int g = ((in >> 8) & 0xFF) >> 4;
			//int r = ((in)& 0xFF) >> 4;
			//col = (a << 12) | (b << 8) | (g << 4) | (r);
			//PACKUSW
			break;
		}
		case 2: // 5551
		{
			//int a = ((in >> 24) & 0xFF) >> 7;
			//int b = ((in >> 16) & 0xFF) >> 3;
			//int g = ((in >> 8) & 0xFF) >> 3;
			//int r = ((in)& 0xFF) >> 3;
			//col = (a << 15) | (b << 10) | (g << 5) | (r);
			break;
		}
		case 3: // 565
		{
			//int b = ((in >> 16) & 0xFF) >> 3;
			//int g = ((in >> 8) & 0xFF) >> 2;
			//int r = ((in)& 0xFF) >> 3;
			//col = (b << 11) | (g << 5) | (r);
			break;
		}
		}
		DISABLE;

		// Flush SIMD.
		fpr.SimpleRegsV(&sreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
		fpr.SimpleRegsV(&dreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
#endif

	// NOTE(review): in this rendering, the brace below (closing the
	// TryMapDirtyInVS block opened inside the #if 0 region) sits OUTSIDE the
	// #if 0 — verify brace/#endif placement against the canonical file before
	// ever enabling the dead code above.
	}
}

#endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)