CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/IR/IRCompVFPU.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cmath>1819#include "Common/CPUDetect.h"20#include "Common/Data/Convert/SmallDataConvert.h"21#include "Common/Math/math_util.h"22#include "Core/Compatibility.h"23#include "Core/Config.h"24#include "Core/MemMap.h"25#include "Core/MIPS/MIPS.h"26#include "Core/MIPS/MIPSTables.h"27#include "Core/MIPS/MIPSAnalyst.h"28#include "Core/MIPS/MIPSCodeUtils.h"29#include "Core/MIPS/IR/IRFrontend.h"30#include "Core/MIPS/IR/IRRegCache.h"31#include "Core/Reporting.h"32#include "Core/System.h"333435// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.36// Currently known non working ones should have DISABLE.3738// #define CONDITIONAL_DISABLE(flag) { Comp_Generic(op); return; }39#define CONDITIONAL_DISABLE(flag) if (opts.disableFlags & (uint32_t)JitDisable::flag) { Comp_Generic(op); return; }40#define DISABLE { Comp_Generic(op); return; }41#define INVALIDOP { Comp_Generic(op); return; }4243#define _RS MIPS_GET_RS(op)44#define _RT MIPS_GET_RT(op)45#define _RD MIPS_GET_RD(op)46#define _FS MIPS_GET_FS(op)47#define _FT MIPS_GET_FT(op)48#define _FD MIPS_GET_FD(op)49#define _SA MIPS_GET_SA(op)50#define _POS ((op>> 6) & 0x1F)51#define _SIZE ((op>>11) & 0x1F)52#define _IMM16 (signed short)(op & 
0xFFFF)53#define _IMM26 (op & 0x03FFFFFF)5455const int vfpuBase = 32; // skip the FP registers5657namespace MIPSComp {58static void ApplyVoffset(u8 regs[4], int count) {59for (int i = 0; i < count; i++) {60regs[i] = vfpuBase + voffset[regs[i]];61}62}6364static bool IsConsecutive2(const u8 regs[2]) {65return regs[1] == regs[0] + 1;66}6768static bool IsConsecutive3(const u8 regs[3]) {69return IsConsecutive2(regs) && regs[2] == regs[1] + 1;70}7172static bool IsConsecutive4(const u8 regs[4]) {73return IsConsecutive3(regs) && regs[3] == regs[2] + 1;74}7576static bool IsVec2(VectorSize sz, const u8 regs[2]) {77return sz == V_Pair && IsConsecutive2(regs) && (regs[0] & 1) == 0;78}7980static bool IsVec4(VectorSize sz, const u8 regs[4]) {81return sz == V_Quad && IsConsecutive4(regs) && (regs[0] & 3) == 0;82}8384static bool IsVec3of4(VectorSize sz, const u8 regs[4]) {85return sz == V_Triple && IsConsecutive3(regs) && (regs[0] & 3) == 0;86}8788static bool IsMatrixVec4(MatrixSize sz, const u8 regs[16]) {89if (sz != M_4x4)90return false;91if (!IsConsecutive4(®s[0]) || (regs[0] & 3) != 0)92return false;93if (!IsConsecutive4(®s[4]) || (regs[4] & 3) != 0)94return false;95if (!IsConsecutive4(®s[8]) || (regs[8] & 3) != 0)96return false;97if (!IsConsecutive4(®s[12]) || (regs[12] & 3) != 0)98return false;99return true;100}101102// Vector regs can overlap in all sorts of swizzled ways.103// This does allow a single overlap in sregs[i].104static bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {105for (int i = 0; i < sn; ++i) {106if (sregs[i] == dreg && i != di)107return false;108}109for (int i = 0; i < tn; ++i) {110if (tregs[i] == dreg)111return false;112}113114// Hurray, no overlap, we can write directly.115return true;116}117118static bool IsOverlapSafeAllowS(int dn, const u8 dregs[], int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {119for (int i = 0; i < dn; ++i) {120if (!IsOverlapSafeAllowS(dregs[i], i, sn, 
sregs, tn, tregs)) {121return false;122}123}124return true;125}126127static bool IsOverlapSafe(int dreg, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {128return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs);129}130131static bool IsOverlapSafe(int dn, const u8 dregs[], int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = nullptr) {132for (int i = 0; i < dn; ++i) {133if (!IsOverlapSafe(dregs[i], sn, sregs, tn, tregs)) {134return false;135}136}137return true;138}139140static bool IsPrefixWithinSize(u32 prefix, VectorSize sz) {141int n = GetNumVectorElements(sz);142for (int i = n; i < 4; i++) {143int regnum = (prefix >> (i * 2)) & 3;144int abs = (prefix >> (8 + i)) & 1;145int negate = (prefix >> (16 + i)) & 1;146int constants = (prefix >> (12 + i)) & 1;147if (regnum >= n && !constants) {148if (abs || negate || regnum != i)149return false;150}151}152153return true;154}155156static bool IsPrefixWithinSize(u32 prefix, MIPSOpcode op) {157return IsPrefixWithinSize(prefix, GetVecSize(op));158}159160void IRFrontend::Comp_VPFX(MIPSOpcode op) {161CONDITIONAL_DISABLE(VFPU_XFER);162// This is how prefixes are typically set.163int data = op & 0xFFFFF;164int regnum = (op >> 24) & 3;165switch (regnum) {166case 0: // S167js.prefixS = data;168js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;169break;170case 1: // T171js.prefixT = data;172js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;173break;174case 2: // D175js.prefixD = data & 0x00000FFF;176js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;177break;178default:179ERROR_LOG(Log::CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);180break;181}182}183184static void InitRegs(u8 *vregs, int reg) {185vregs[0] = reg;186vregs[1] = reg + 1;187vregs[2] = reg + 2;188vregs[3] = reg + 3;189}190191void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) {192if (prefix == 0xE4)193return;194195int n = GetNumVectorElements(sz);196u8 origV[4]{};197static const float constantArray[8] = { 0.f, 1.f, 
2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };198199for (int i = 0; i < n; i++)200origV[i] = vregs[i];201202// Some common vector prefixes203if (IsVec4(sz, vregs)) {204if (prefix == 0xF00E4) {205InitRegs(vregs, tempReg);206ir.Write(IROp::Vec4Neg, vregs[0], origV[0]);207return;208}209if (prefix == 0x00FE4) {210InitRegs(vregs, tempReg);211ir.Write(IROp::Vec4Abs, vregs[0], origV[0]);212return;213}214// Pure shuffle215if (prefix == (prefix & 0xFF)) {216InitRegs(vregs, tempReg);217ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix);218return;219}220221if ((prefix & 0x000FF000) == 0x0000F000) {222// Handle some easy and common cases.223Vec4Init init = Vec4Init::AllZERO;224bool useInit;225switch (prefix & 0xFFF) {226case 0x00: useInit = true; init = Vec4Init::AllZERO; break;227case 0x01: useInit = true; init = Vec4Init::Set_1000; break;228case 0x04: useInit = true; init = Vec4Init::Set_0100; break;229case 0x10: useInit = true; init = Vec4Init::Set_0010; break;230case 0x40: useInit = true; init = Vec4Init::Set_0001; break;231case 0x55: useInit = true; init = Vec4Init::AllONE; break;232default: useInit = false; break;233}234235if (useInit) {236InitRegs(vregs, tempReg);237ir.Write(IROp::Vec4Init, vregs[0], (int)init);238return;239}240}241242// Check if we're just zeroing certain lanes - this is common.243u32 zeroedLanes = 0;244for (int i = 0; i < 4; ++i) {245int regnum = (prefix >> (i * 2)) & 3;246int abs = (prefix >> (8 + i)) & 1;247int negate = (prefix >> (16 + i)) & 1;248int constants = (prefix >> (12 + i)) & 1;249250if (!constants && regnum == i && !abs && !negate)251continue;252if (constants && regnum == 0 && abs == 0 && !negate) {253zeroedLanes |= 1 << i;254continue;255}256257// Nope, it has something else going on.258zeroedLanes = -1;259break;260}261262if (zeroedLanes != -1) {263InitRegs(vregs, tempReg);264ir.Write(IROp::Vec4Init, vregs[0], (int)Vec4Init::AllZERO);265ir.Write(IROp::Vec4Blend, vregs[0], origV[0], vregs[0], zeroedLanes);266return;267}268}269270// 
Alright, fall back to the generic approach.271for (int i = 0; i < n; i++) {272int regnum = (prefix >> (i * 2)) & 3;273int abs = (prefix >> (8 + i)) & 1;274int negate = (prefix >> (16 + i)) & 1;275int constants = (prefix >> (12 + i)) & 1;276277// Unchanged, hurray.278if (!constants && regnum == i && !abs && !negate)279continue;280281// This puts the value into a temp reg, so we won't write the modified value back.282vregs[i] = tempReg + i;283if (!constants) {284if (regnum >= n) {285// Depends on the op, but often zero.286ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(0.0f));287} else if (abs) {288ir.Write(IROp::FAbs, vregs[i], origV[regnum]);289if (negate)290ir.Write(IROp::FNeg, vregs[i], vregs[i]);291} else {292if (negate)293ir.Write(IROp::FNeg, vregs[i], origV[regnum]);294else if (vregs[i] != origV[regnum])295ir.Write(IROp::FMov, vregs[i], origV[regnum]);296}297} else {298if (negate) {299ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(-constantArray[regnum + (abs << 2)]));300} else {301ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(constantArray[regnum + (abs << 2)]));302}303}304}305}306307void IRFrontend::GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {308::GetVectorRegs(regs, N, vectorReg);309ApplyVoffset(regs, N);310}311312void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {313::GetMatrixRegs(regs, N, matrixReg);314for (int i = 0; i < GetMatrixSide(N); i++) {315ApplyVoffset(regs + 4 * i, GetVectorSize(N));316}317}318319void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) {320_assert_(js.prefixSFlag & JitState::PREFIX_KNOWN);321GetVectorRegs(regs, sz, vectorReg);322ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S);323}324void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) {325_assert_(js.prefixTFlag & JitState::PREFIX_KNOWN);326GetVectorRegs(regs, sz, vectorReg);327ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T);328}329330void 
IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {331_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);332333GetVectorRegs(regs, sz, vectorReg);334int n = GetNumVectorElements(sz);335if (js.prefixD == 0)336return;337338if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {339// Use temps for all, we'll blend in the end (keeping in Vec4.)340for (int i = 0; i < 4; ++i)341regs[i] = IRVTEMP_PFX_D + i;342return;343}344345for (int i = 0; i < n; i++) {346// Hopefully this is rare, we'll just write it into a dumping ground reg.347if (js.VfpuWriteMask(i))348regs[i] = IRVTEMP_PFX_D + i;349}350}351352inline int GetDSat(int prefix, int i) {353return (prefix >> (i * 2)) & 3;354}355356// "D" prefix is really a post process. No need to allocate a temporary register (except357// dummies to simulate writemask, which is done in GetVectorRegsPrefixD358void IRFrontend::ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg) {359_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);360if (!js.prefixD)361return;362363ApplyPrefixDMask(vregs, sz, vectorReg);364365int n = GetNumVectorElements(sz);366for (int i = 0; i < n; i++) {367if (js.VfpuWriteMask(i))368continue;369int sat = GetDSat(js.prefixD, i);370if (sat == 1) {371// clamped = x < 0 ? (x > 1 ? 
1 : x) : x [0, 1]372ir.Write(IROp::FSat0_1, vregs[i], vregs[i]);373} else if (sat == 3) {374ir.Write(IROp::FSatMinus1_1, vregs[i], vregs[i]);375}376}377}378379void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) {380if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {381u8 origV[4];382GetVectorRegs(origV, sz, vectorReg);383384// Just keep the original values where it was masked.385ir.Write(IROp::Vec4Blend, origV[0], vregs[0], origV[0], js.VfpuWriteMask());386387// So that saturate works, change it back.388for (int i = 0; i < 4; ++i)389vregs[i] = origV[i];390}391}392393void IRFrontend::Comp_SV(MIPSOpcode op) {394CONDITIONAL_DISABLE(LSU_VFPU);395s32 offset = (signed short)(op & 0xFFFC);396int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);397MIPSGPReg rs = _RS;398399CheckMemoryBreakpoint(rs, offset);400401switch (op >> 26) {402case 50: //lv.s403ir.Write(IROp::LoadFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));404break;405406case 58: //sv.s407ir.Write(IROp::StoreFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));408break;409410default:411INVALIDOP;412}413}414415void IRFrontend::Comp_SVQ(MIPSOpcode op) {416CONDITIONAL_DISABLE(LSU_VFPU);417int imm = (signed short)(op & 0xFFFC);418int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5);419MIPSGPReg rs = _RS;420421u8 vregs[4];422GetVectorRegs(vregs, V_Quad, vt);423424CheckMemoryBreakpoint(rs, imm);425426enum class LSVType {427INVALID,428LVQ,429SVQ,430LVLQ,431LVRQ,432SVLQ,433SVRQ,434};435436LSVType optype = LSVType::INVALID;437switch (op >> 26) {438case 54: optype = LSVType::LVQ; break; // lv.q439case 62: optype = LSVType::SVQ; break; // sv.q440case 53: // lvl/lvr.q - highly unusual441optype = (op & 2) == 0 ? LSVType::LVLQ : LSVType::LVRQ;442break;443case 61: // svl/svr.q - highly unusual444optype = (op & 2) == 0 ? 
LSVType::SVLQ : LSVType::SVRQ;445break;446}447if (optype == LSVType::INVALID)448INVALIDOP;449450if ((optype == LSVType::LVRQ || optype == LSVType::SVRQ) && opts.unalignedLoadStoreVec4) {451// We don't bother with an op for this, but we do fuse unaligned stores which happen.452MIPSOpcode nextOp = GetOffsetInstruction(1);453if ((nextOp.encoding ^ op.encoding) == 0x0000000E) {454// Okay, it's an svr.q/svl.q pair, same registers. Treat as lv.q/sv.q.455EatInstruction(nextOp);456optype = optype == LSVType::LVRQ ? LSVType::LVQ : LSVType::SVQ;457}458}459460switch (optype) {461case LSVType::LVQ:462if (IsVec4(V_Quad, vregs)) {463ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));464} else {465// Let's not even bother with "vertical" loads for now.466if (!g_Config.bFastMemory)467ir.Write(IROp::ValidateAddress128, 0, (u8)rs, 0, (u32)imm);468ir.Write(IROp::LoadFloat, vregs[0], rs, ir.AddConstant(imm));469ir.Write(IROp::LoadFloat, vregs[1], rs, ir.AddConstant(imm + 4));470ir.Write(IROp::LoadFloat, vregs[2], rs, ir.AddConstant(imm + 8));471ir.Write(IROp::LoadFloat, vregs[3], rs, ir.AddConstant(imm + 12));472}473break;474475case LSVType::SVQ:476if (IsVec4(V_Quad, vregs)) {477ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));478} else {479// Let's not even bother with "vertical" stores for now.480if (!g_Config.bFastMemory)481ir.Write(IROp::ValidateAddress128, 0, (u8)rs, 1, (u32)imm);482ir.Write(IROp::StoreFloat, vregs[0], rs, ir.AddConstant(imm));483ir.Write(IROp::StoreFloat, vregs[1], rs, ir.AddConstant(imm + 4));484ir.Write(IROp::StoreFloat, vregs[2], rs, ir.AddConstant(imm + 8));485ir.Write(IROp::StoreFloat, vregs[3], rs, ir.AddConstant(imm + 12));486}487break;488489case LSVType::LVLQ:490case LSVType::LVRQ:491case LSVType::SVLQ:492case LSVType::SVRQ:493// These are pretty uncommon unless paired.494DISABLE;495break;496497default:498INVALIDOP;499}500}501502void IRFrontend::Comp_VVectorInit(MIPSOpcode op) {503CONDITIONAL_DISABLE(VFPU_XFER);504if 
(js.HasUnknownPrefix() || js.HasSPrefix()) {505DISABLE;506}507508// Vector init509// d[N] = CONST[N]510// Note: probably implemented as vmov with prefix hack.511512VectorSize sz = GetVecSize(op);513int type = (op >> 16) & 0xF;514int vd = _VD;515int n = GetNumVectorElements(sz);516u8 dregs[4];517GetVectorRegsPrefixD(dregs, sz, vd);518519if (IsVec4(sz, dregs)) {520ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE));521} else {522for (int i = 0; i < n; i++) {523ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));524}525}526ApplyPrefixD(dregs, sz, vd);527}528529void IRFrontend::Comp_VIdt(MIPSOpcode op) {530CONDITIONAL_DISABLE(VFPU_XFER);531if (js.HasUnknownPrefix() || js.HasSPrefix()) {532DISABLE;533}534535// Vector identity row536// d[N] = IDENTITY[N,m]537// Note: probably implemented as vmov with prefix hack.538539int vd = _VD;540VectorSize sz = GetVecSize(op);541u8 dregs[4];542GetVectorRegsPrefixD(dregs, sz, vd);543544if (IsVec4(sz, dregs)) {545int row = vd & 3;546Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row);547ir.Write(IROp::Vec4Init, dregs[0], (int)init);548} else {549switch (sz) {550case V_Pair:551ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 1) == 0 ? 1.0f : 0.0f));552ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 1) == 1 ? 1.0f : 0.0f));553break;554case V_Quad:555ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 3) == 0 ? 1.0f : 0.0f));556ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 3) == 1 ? 1.0f : 0.0f));557ir.Write(IROp::SetConstF, dregs[2], ir.AddConstantFloat((vd & 3) == 2 ? 1.0f : 0.0f));558ir.Write(IROp::SetConstF, dregs[3], ir.AddConstantFloat((vd & 3) == 3 ? 
1.0f : 0.0f));559break;560default:561INVALIDOP;562}563}564565ApplyPrefixD(dregs, sz, vd);566}567568void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {569CONDITIONAL_DISABLE(VFPU_XFER);570MatrixSize sz = GetMtxSize(op);571if (!js.HasNoPrefix()) {572DISABLE;573}574575// Matrix init (weird prefixes)576// d[N,M] = CONST[N,M]577578int vd = _VD;579if (IsMatrixTransposed(vd)) {580// All outputs are transpositionally symmetric, so should be fine.581vd = TransposeMatrixReg(vd);582}583584if (sz != M_4x4) {585// 3x3 is decently common. It expands a lot, but let's set each.586u8 dregs[16];587GetMatrixRegs(dregs, sz, vd);588589// TODO: It might be worth using Vec4Blend for 3x3 to mask w.590int n = GetMatrixSide(sz);591for (int y = 0; y < n; ++y) {592for (int x = 0; x < n; ++x) {593switch ((op >> 16) & 0xF) {594case 3: // vmidt595if (x == 0 && y == 0)596ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(1.0f));597else if (x == y)598ir.Write(IROp::FMov, dregs[y * 4 + x], dregs[0]);599else600ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(0.0f));601break;602case 6: // vmzero603// Likely to be fast.604ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(0.0f));605break;606case 7: // vmone607if (x == 0 && y == 0)608ir.Write(IROp::SetConstF, dregs[y * 4 + x], ir.AddConstantFloat(1.0f));609else610ir.Write(IROp::FMov, dregs[y * 4 + x], dregs[0]);611break;612default:613INVALIDOP;614}615}616}617return;618}619620// Not really about trying here, it will work if enabled.621VectorSize vsz = GetVectorSize(sz);622u8 vecs[4];623GetMatrixColumns(vd, sz, vecs);624for (int i = 0; i < 4; i++) {625u8 vec[4];626GetVectorRegs(vec, vsz, vecs[i]);627// As they are columns, they will be nicely consecutive.628Vec4Init init;629switch ((op >> 16) & 0xF) {630case 3:631init = Vec4Init((int)Vec4Init::Set_1000 + i);632break;633case 6:634init = Vec4Init::AllZERO;635break;636case 7:637init = 
Vec4Init::AllONE;638break;639default:640INVALIDOP;641return;642}643ir.Write(IROp::Vec4Init, vec[0], (int)init);644}645}646647void IRFrontend::Comp_VHdp(MIPSOpcode op) {648CONDITIONAL_DISABLE(VFPU_VEC);649if (js.HasUnknownPrefix() || js.HasSPrefix() || !IsPrefixWithinSize(js.prefixT, op)) {650DISABLE;651}652653// Vector homogenous dot product654// d[0] = s[0 .. n-2] dot t[0 .. n-2] + t[n-1]655// Note: s[n-1] is ignored / treated as 1 via prefix override.656657int vd = _VD;658int vs = _VS;659int vt = _VT;660VectorSize sz = GetVecSize(op);661int n = GetNumVectorElements(sz);662663if (js.prefixS & (0x0101 << (8 + n - 1)))664DISABLE;665666// TODO: Force read one of them into regs? probably not.667u8 sregs[4], tregs[4], dregs[1];668GetVectorRegsPrefixS(sregs, sz, vs);669GetVectorRegsPrefixT(tregs, sz, vt);670GetVectorRegsPrefixD(dregs, V_Single, vd);671672ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]);673674for (int i = 1; i < n; i++) {675if (i == n - 1) {676ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, tregs[i]);677} else {678ir.Write(IROp::FMul, IRVTEMP_0 + 1, sregs[i], tregs[i]);679ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, IRVTEMP_0 + 1);680}681}682683ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);684ApplyPrefixD(dregs, V_Single, vd);685}686687alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };688689void IRFrontend::Comp_Vhoriz(MIPSOpcode op) {690CONDITIONAL_DISABLE(VFPU_VEC);691if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {692DISABLE;693}694695// Vector horizontal add696// d[0] = s[0] + ... s[n-1]697// Vector horizontal average698// d[0] = s[0] / n + ... 
s[n-1] / n699// Note: Both are implemented as dot products against generated constants.700701VectorSize sz = GetVecSize(op);702int n = GetNumVectorElements(sz);703704u8 sregs[4], dregs[1];705GetVectorRegsPrefixS(sregs, sz, _VS);706GetVectorRegsPrefixD(dregs, V_Single, _VD);707708// We have to start at +0.000 in case any values are -0.000.709ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(0.0f));710for (int i = 0; i < n; ++i) {711ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, sregs[i]);712}713714switch ((op >> 16) & 31) {715case 6: // vfad716ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);717break;718case 7: // vavg719ir.Write(IROp::SetConstF, IRVTEMP_0 + 1, ir.AddConstantFloat(vavg_table[n - 1]));720ir.Write(IROp::FMul, dregs[0], IRVTEMP_0, IRVTEMP_0 + 1);721break;722}723724ApplyPrefixD(dregs, V_Single, _VD);725}726727void IRFrontend::Comp_VDot(MIPSOpcode op) {728CONDITIONAL_DISABLE(VFPU_VEC);729if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {730DISABLE;731}732733// Vector dot product734// d[0] = s[0 .. n-1] dot t[0 .. n-1]735736int vd = _VD;737int vs = _VS;738int vt = _VT;739740VectorSize sz = GetVecSize(op);741int n = GetNumVectorElements(sz);742743// TODO: Force read one of them into regs? 
probably not.744u8 sregs[4], tregs[4], dregs[1];745GetVectorRegsPrefixS(sregs, sz, vs);746GetVectorRegsPrefixT(tregs, sz, vt);747GetVectorRegsPrefixD(dregs, V_Single, vd);748749if (IsVec4(sz, sregs) && IsVec4(sz, tregs)) {750if (IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {751ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);752} else {753ir.Write(IROp::Vec4Dot, IRVTEMP_0, sregs[0], tregs[0]);754ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);755}756ApplyPrefixD(dregs, V_Single, vd);757return;758} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4Dot) {759// Note: this is often worse than separate muliplies and adds on x86.760if (IsOverlapSafe(dregs[0], n, tregs) || sregs[0] == tregs[0]) {761// Nice example of this in Fat Princess (US) in block 088181A0 (hot.)762// Create a temporary copy of S with the last element zeroed.763ir.Write(IROp::Vec4Init, IRVTEMP_0, (int)Vec4Init::AllZERO);764ir.Write(IROp::Vec4Blend, IRVTEMP_0, IRVTEMP_0, sregs[0], 0x7);765// Now we can just dot like normal, with the last element effectively masked.766ir.Write(IROp::Vec4Dot, dregs[0], IRVTEMP_0, sregs[0] == tregs[0] ? IRVTEMP_0 : tregs[0]);767ApplyPrefixD(dregs, V_Single, vd);768return;769}770}771772int temp0 = IRVTEMP_0;773int temp1 = IRVTEMP_0 + 1;774ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]);775for (int i = 1; i < n; i++) {776ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]);777ir.Write(IROp::FAdd, i == (n - 1) ? 
dregs[0] : temp0, temp0, temp1);778}779ApplyPrefixD(dregs, V_Single, vd);780}781782void IRFrontend::Comp_VecDo3(MIPSOpcode op) {783CONDITIONAL_DISABLE(VFPU_VEC);784if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {785DISABLE;786}787788// Vector arithmetic789// d[N] = OP(s[N], t[N]) (see below)790791enum class VecDo3Op : uint8_t {792INVALID,793VADD,794VSUB,795VDIV,796VMUL,797VMIN,798VMAX,799VSGE,800VSLT,801};802VecDo3Op type = VecDo3Op::INVALID;803VectorSize sz = GetVecSize(op);804int n = GetNumVectorElements(sz);805806// Check that we can support the ops, and prepare temporary values for ops that need it.807switch (op >> 26) {808case 24: //VFPU0809switch ((op >> 23) & 7) {810case 0: type = VecDo3Op::VADD; break;811case 1: type = VecDo3Op::VSUB; break;812case 7: type = VecDo3Op::VDIV; break;813default: INVALIDOP;814}815break;816case 25: //VFPU1817switch ((op >> 23) & 7) {818case 0: type = VecDo3Op::VMUL; break;819default: INVALIDOP;820}821break;822case 27: //VFPU3823switch ((op >> 23) & 7) {824case 2: type = VecDo3Op::VMIN; break;825case 3: type = VecDo3Op::VMAX; break;826case 6: type = VecDo3Op::VSGE; break;827case 7: type = VecDo3Op::VSLT; break;828default: INVALIDOP;829}830break;831default: INVALIDOP;832}833_assert_(type != VecDo3Op::INVALID);834835bool allowSIMD = true;836switch (type) {837case VecDo3Op::VADD:838case VecDo3Op::VSUB:839case VecDo3Op::VMUL:840break;841case VecDo3Op::VDIV:842if (js.HasUnknownPrefix() || (sz != V_Single && !js.HasNoPrefix()))843DISABLE;844// If it's single, we just need to check the prefixes are within the size.845if (!IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op))846DISABLE;847break;848case VecDo3Op::VMIN:849case VecDo3Op::VMAX:850case VecDo3Op::VSGE:851case VecDo3Op::VSLT:852allowSIMD = false;853break;854case VecDo3Op::INVALID: // Can't happen, but to avoid compiler warnings855break;856}857858u8 sregs[4], tregs[4], 
dregs[4];859GetVectorRegsPrefixS(sregs, sz, _VS);860GetVectorRegsPrefixT(tregs, sz, _VT);861GetVectorRegsPrefixD(dregs, sz, _VD);862863u8 tempregs[4];864for (int i = 0; i < n; i++) {865if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {866tempregs[i] = IRVTEMP_0 + i;867} else {868tempregs[i] = dregs[i];869}870}871872// If all three are consecutive 4, we're safe regardless of if we use temps so we should not check that here.873if (allowSIMD) {874IROp opFunc = IROp::Nop;875switch (type) {876case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd877opFunc = IROp::Vec4Add;878break;879case VecDo3Op::VSUB: // d[i] = s[i] - t[i]; break; //vsub880opFunc = IROp::Vec4Sub;881break;882case VecDo3Op::VDIV: // d[i] = s[i] / t[i]; break; //vdiv883opFunc = IROp::Vec4Div;884break;885case VecDo3Op::VMUL: // d[i] = s[i] * t[i]; break; //vmul886opFunc = IROp::Vec4Mul;887break;888default:889// Leave it Nop, disabled below.890break;891}892893if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {894if (opFunc != IROp::Nop) {895ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);896} else {897DISABLE;898}899ApplyPrefixD(dregs, sz, _VD);900return;901} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {902// This is actually pretty common. Use a temp + blend.903// We could post-process this, but it's easier to do it here.904if (opFunc == IROp::Nop)905DISABLE;906ir.Write(opFunc, IRVTEMP_0, sregs[0], tregs[0]);907ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);908ApplyPrefixD(dregs, sz, _VD);909return;910}911}912913if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {914// TODO: Consider a dedicated op? 
For now, we abuse FpCond a bit.915ir.Write(IROp::FpCondToReg, IRTEMP_0);916}917918for (int i = 0; i < n; ++i) {919switch (type) {920case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd921ir.Write(IROp::FAdd, tempregs[i], sregs[i], tregs[i]);922break;923case VecDo3Op::VSUB: // d[i] = s[i] - t[i]; break; //vsub924ir.Write(IROp::FSub, tempregs[i], sregs[i], tregs[i]);925break;926case VecDo3Op::VDIV: // d[i] = s[i] / t[i]; break; //vdiv927ir.Write(IROp::FDiv, tempregs[i], sregs[i], tregs[i]);928break;929case VecDo3Op::VMUL: // d[i] = s[i] * t[i]; break; //vmul930ir.Write(IROp::FMul, tempregs[i], sregs[i], tregs[i]);931break;932case VecDo3Op::VMIN: // vmin933ir.Write(IROp::FMin, tempregs[i], sregs[i], tregs[i]);934break;935case VecDo3Op::VMAX: // vmax936ir.Write(IROp::FMax, tempregs[i], sregs[i], tregs[i]);937break;938case VecDo3Op::VSGE: // vsge939ir.Write(IROp::FCmp, (int)IRFpCompareMode::LessUnordered, sregs[i], tregs[i]);940ir.Write(IROp::FpCondToReg, IRTEMP_1);941ir.Write(IROp::XorConst, IRTEMP_1, IRTEMP_1, ir.AddConstant(1));942ir.Write(IROp::FMovFromGPR, tempregs[i], IRTEMP_1);943ir.Write(IROp::FCvtSW, tempregs[i], tempregs[i]);944break;945case VecDo3Op::VSLT: // vslt946ir.Write(IROp::FCmp, (int)IRFpCompareMode::LessOrdered, sregs[i], tregs[i]);947ir.Write(IROp::FpCondToReg, IRTEMP_1);948ir.Write(IROp::FMovFromGPR, tempregs[i], IRTEMP_1);949ir.Write(IROp::FCvtSW, tempregs[i], tempregs[i]);950break;951case VecDo3Op::INVALID: // Can't happen, but to avoid compiler warnings952break;953}954}955956if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {957ir.Write(IROp::FpCondFromReg, IRTEMP_0);958}959960for (int i = 0; i < n; i++) {961if (dregs[i] != tempregs[i]) {962ir.Write(IROp::FMov, dregs[i], tempregs[i]);963}964}965966ApplyPrefixD(dregs, sz, _VD);967}968969void IRFrontend::Comp_VV2Op(MIPSOpcode op) {970CONDITIONAL_DISABLE(VFPU_VEC);971972if (js.HasUnknownPrefix()) {973DISABLE;974}975976int optype = (op >> 16) & 0x1f;977if (optype == 0) {978if 
(js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op))979DISABLE;980} else if (optype == 1 || optype == 2) {981// D prefix is fine for these, and used sometimes.982if (js.HasUnknownPrefix() || js.HasSPrefix())983DISABLE;984} else if (optype == 5 && js.HasDPrefix()) {985DISABLE;986}987988// Vector unary operation989// d[N] = OP(s[N]) (see below)990991int vs = _VS;992int vd = _VD;993VectorSize sz = GetVecSize(op);994int n = GetNumVectorElements(sz);995996if (optype >= 16 && !js.HasNoPrefix()) {997// Many of these apply the D prefix strangely or override parts of the S prefix.998if (js.HasUnknownPrefix() || sz != V_Single)999DISABLE;1000// If it's single, we just need to check the prefixes are within the size.1001if (!IsPrefixWithinSize(js.prefixS, op))1002DISABLE;1003// The negative ones seem to use negate flags as a prefix hack.1004if (optype >= 24 && (js.prefixS & 0x000F0000) != 0)1005DISABLE;1006}10071008// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure1009if (optype == 0 && vs == vd && js.HasNoPrefix()) {1010return;1011}10121013u8 sregs[4]{}, dregs[4]{};1014GetVectorRegsPrefixS(sregs, sz, vs);1015GetVectorRegsPrefixD(dregs, sz, vd);10161017bool usingTemps = false;1018u8 tempregs[4];1019for (int i = 0; i < n; ++i) {1020if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {1021usingTemps = true;1022tempregs[i] = IRVTEMP_0 + i;1023} else {1024tempregs[i] = dregs[i];1025}1026}10271028bool canSIMD = false;1029// Some can be SIMD'd.1030switch (optype) {1031case 0: // vmov1032case 1: // vabs1033case 2: // vneg1034canSIMD = true;1035break;1036}10371038if (canSIMD && !usingTemps) {1039IROp irop = IROp::Nop;1040switch (optype) {1041case 0: // vmov1042irop = IROp::Vec4Mov;1043break;1044case 1: // vabs1045irop = IROp::Vec4Abs;1046break;1047case 2: // vneg1048irop = IROp::Vec4Neg;1049break;1050}1051if (IsVec4(sz, sregs) && IsVec4(sz, dregs) && irop != IROp::Nop) {1052ir.Write(irop, dregs[0], sregs[0]);1053ApplyPrefixD(dregs, sz, vd);1054return;1055} 
else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && irop != IROp::Nop && opts.preferVec4) {1056// This is a simple case of vmov.t, just blend.1057if (irop == IROp::Vec4Mov) {1058ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], sregs[0], 0x7);1059} else {1060ir.Write(irop, IRVTEMP_0, sregs[0]);1061ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);1062}1063ApplyPrefixD(dregs, sz, vd);1064return;1065}1066}10671068for (int i = 0; i < n; ++i) {1069switch (optype) {1070case 0: // d[i] = s[i]; break; //vmov1071// Probably for swizzle.1072if (tempregs[i] != sregs[i])1073ir.Write(IROp::FMov, tempregs[i], sregs[i]);1074break;1075case 1: // d[i] = fabsf(s[i]); break; //vabs1076ir.Write(IROp::FAbs, tempregs[i], sregs[i]);1077break;1078case 2: // d[i] = -s[i]; break; //vneg1079ir.Write(IROp::FNeg, tempregs[i], sregs[i]);1080break;1081case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat01082ir.Write(IROp::FSat0_1, tempregs[i], sregs[i]);1083break;1084case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat11085ir.Write(IROp::FSatMinus1_1, tempregs[i], sregs[i]);1086break;1087case 16: // d[i] = 1.0f / s[i]; break; //vrcp1088ir.Write(IROp::FRecip, tempregs[i], sregs[i]);1089break;1090case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq1091ir.Write(IROp::FRSqrt, tempregs[i], sregs[i]);1092break;1093case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin1094ir.Write(IROp::FSin, tempregs[i], sregs[i]);1095break;1096case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos1097ir.Write(IROp::FCos, tempregs[i], sregs[i]);1098break;1099case 20: // d[i] = powf(2.0f, s[i]); break; //vexp21100DISABLE;1101break;1102case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog21103DISABLE;1104break;1105case 22: // d[i] = sqrtf(s[i]); break; //vsqrt1106ir.Write(IROp::FSqrt, tempregs[i], sregs[i]);1107break;1108case 23: // d[i] = asinf(s[i]) / M_PI_2; break; 
//vasin1109ir.Write(IROp::FAsin, tempregs[i], sregs[i]);1110break;1111case 24: // d[i] = -1.0f / s[i]; break; // vnrcp1112ir.Write(IROp::FRecip, tempregs[i], sregs[i]);1113ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);1114break;1115case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin1116ir.Write(IROp::FSin, tempregs[i], sregs[i]);1117ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);1118break;1119case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp21120DISABLE;1121break;1122default:1123INVALIDOP;1124}1125}1126for (int i = 0; i < n; i++) {1127if (dregs[i] != tempregs[i]) {1128ir.Write(IROp::FMov, dregs[i], tempregs[i]);1129}1130}11311132ApplyPrefixD(dregs, sz, vd);1133}11341135void IRFrontend::Comp_Vi2f(MIPSOpcode op) {1136CONDITIONAL_DISABLE(VFPU_VEC);1137if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {1138DISABLE;1139}11401141// Vector integer to float1142// d[N] = float(S[N]) * mult11431144VectorSize sz = GetVecSize(op);1145int n = GetNumVectorElements(sz);11461147uint8_t imm = (op >> 16) & 0x1f;11481149u8 sregs[4], dregs[4];1150GetVectorRegsPrefixS(sregs, sz, _VS);1151GetVectorRegsPrefixD(dregs, sz, _VD);11521153for (int i = 0; i < n; i++) {1154if (imm == 0)1155ir.Write(IROp::FCvtSW, dregs[i], sregs[i]);1156else1157ir.Write(IROp::FCvtScaledSW, dregs[i], sregs[i], imm);1158}1159ApplyPrefixD(dregs, sz, _VD);1160}11611162void IRFrontend::Comp_Vh2f(MIPSOpcode op) {1163CONDITIONAL_DISABLE(VFPU_VEC);1164if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {1165DISABLE;1166}11671168// Vector expand half to float1169// d[N*2] = float(lowerhalf(s[N])), d[N*2+1] = float(upperhalf(s[N]))11701171DISABLE;1172}11731174void IRFrontend::Comp_Vf2i(MIPSOpcode op) {1175CONDITIONAL_DISABLE(VFPU_VEC);1176if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {1177DISABLE;1178}11791180// Vector float to integer1181// d[N] = int(S[N] * mult)1182// Note: saturates on 
void IRFrontend::Comp_Vf2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	// Bail to the interpreter on unknown prefixes, an S prefix wider than this
	// vector, or any low D prefix bits (the D mask is applied manually below
	// via ApplyPrefixDMask, so other D prefix behavior isn't handled here.)
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {
		DISABLE;
	}

	// Vector float to integer
	// d[N] = int(S[N] * mult)
	// Note: saturates on overflow.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// Scale immediate; 0 selects the unscaled per-rounding-mode ops below.
	uint8_t imm = (op >> 16) & 0x1f;

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// Same values as FCR31.
	uint8_t rmode = (op >> 21) & 3;
	if (((op >> 21) & 0x1C) != 0x10)
		INVALIDOP;

	if (imm != 0) {
		// Scaled variant: pack both scale and rounding mode into the IR immediate.
		for (int i = 0; i < n; i++)
			ir.Write(IROp::FCvtScaledWS, dregs[i], sregs[i], imm | (rmode << 6));
	} else {
		for (int i = 0; i < n; i++) {
			// One IR op per hardware rounding mode.
			switch (IRRoundMode(rmode)) {
			case IRRoundMode::RINT_0: // vf2in
				ir.Write(IROp::FRound, dregs[i], sregs[i]);
				break;

			case IRRoundMode::CAST_1: // vf2iz
				ir.Write(IROp::FTrunc, dregs[i], sregs[i]);
				break;

			case IRRoundMode::CEIL_2: // vf2iu
				ir.Write(IROp::FCeil, dregs[i], sregs[i]);
				break;

			case IRRoundMode::FLOOR_3: // vf2id
				ir.Write(IROp::FFloor, dregs[i], sregs[i]);
				break;

			default:
				INVALIDOP;
			}
		}
	}

	// Only the write-mask part of the D prefix is honored (checked above.)
	ApplyPrefixDMask(dregs, sz, _VD);
}
(GetVFPUCtrlMask(imm - 128, &mask)) {1267if (mask != 0xFFFFFFFF) {1268ir.Write(IROp::AndConst, IRTEMP_0, rt, ir.AddConstant(mask));1269ir.Write(IROp::SetCtrlVFPUReg, imm - 128, IRTEMP_0);1270} else {1271ir.Write(IROp::SetCtrlVFPUReg, imm - 128, rt);1272}1273}12741275if (imm - 128 == VFPU_CTRL_SPREFIX) {1276js.prefixSFlag = JitState::PREFIX_UNKNOWN;1277js.blockWrotePrefixes = true;1278} else if (imm - 128 == VFPU_CTRL_TPREFIX) {1279js.prefixTFlag = JitState::PREFIX_UNKNOWN;1280js.blockWrotePrefixes = true;1281} else if (imm - 128 == VFPU_CTRL_DPREFIX) {1282js.prefixDFlag = JitState::PREFIX_UNKNOWN;1283js.blockWrotePrefixes = true;1284}1285} else {1286INVALIDOP;1287}1288break;12891290default:1291INVALIDOP;1292}1293}12941295void IRFrontend::Comp_Vmfvc(MIPSOpcode op) {1296CONDITIONAL_DISABLE(VFPU_XFER);12971298// Vector Move from vector control reg (no prefixes)1299// D[0] = VFPU_CTRL[i]13001301int vd = _VD;1302int imm = (op >> 8) & 0x7F;1303if (imm < VFPU_CTRL_MAX) {1304ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, imm);1305ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[vd], IRTEMP_0);1306} else {1307INVALIDOP;1308}1309}13101311void IRFrontend::Comp_Vmtvc(MIPSOpcode op) {1312CONDITIONAL_DISABLE(VFPU_XFER);13131314// Vector Move to vector control reg (no prefixes)1315// VFPU_CTRL[i] = S[0]13161317int vs = _VS;1318int imm = op & 0xFF;1319if (imm < VFPU_CTRL_MAX) {1320u32 mask;1321if (GetVFPUCtrlMask(imm, &mask)) {1322if (mask != 0xFFFFFFFF) {1323ir.Write(IROp::FMovToGPR, IRTEMP_0, vfpuBase + voffset[imm]);1324ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(mask));1325ir.Write(IROp::SetCtrlVFPUReg, imm, IRTEMP_0);1326} else {1327ir.Write(IROp::SetCtrlVFPUFReg, imm, vfpuBase + voffset[vs]);1328}1329}1330if (imm == VFPU_CTRL_SPREFIX) {1331js.prefixSFlag = JitState::PREFIX_UNKNOWN;1332js.blockWrotePrefixes = true;1333} else if (imm == VFPU_CTRL_TPREFIX) {1334js.prefixTFlag = JitState::PREFIX_UNKNOWN;1335js.blockWrotePrefixes = true;1336} else if (imm == 
void IRFrontend::Comp_Vmmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix move (weird prefixes)
	// D[N,M] = S[N,M]

	int vs = _VS;
	int vd = _VD;
	// This probably ignores prefixes for all sane intents and purposes.
	if (vs == vd) {
		// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
		return;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);

	// Overlapping source/destination matrices aren't handled here; punt to interp.
	switch (GetMatrixOverlap(vs, vd, sz)) {
	case OVERLAP_EQUAL:
		// In-place transpose
		DISABLE;
	case OVERLAP_PARTIAL:
		DISABLE;
	case OVERLAP_NONE:
	default:
		break;
	}
	// Fast path: matching transpose state on a 4x4 lets us copy whole columns.
	if (IsMatrixTransposed(vd) == IsMatrixTransposed(vs) && sz == M_4x4) {
		// Untranspose both matrices
		if (IsMatrixTransposed(vd)) {
			vd = TransposeMatrixReg(vd);
			vs = TransposeMatrixReg(vs);
		}
		// Get the columns
		u8 scols[4], dcols[4];
		GetMatrixColumns(vs, sz, scols);
		GetMatrixColumns(vd, sz, dcols);
		for (int i = 0; i < 4; i++) {
			u8 svec[4], dvec[4];
			GetVectorRegs(svec, GetVectorSize(sz), scols[i]);
			GetVectorRegs(dvec, GetVectorSize(sz), dcols[i]);
			ir.Write(IROp::Vec4Mov, dvec[0], svec[0]);
		}
		return;
	}
	// Slow path: element-by-element copy, skipping already-identical registers.
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			if (dregs[a * 4 + b] != sregs[a * 4 + b])
				ir.Write(IROp::FMov, dregs[a * 4 + b], sregs[a * 4 + b]);
		}
	}
}
void IRFrontend::Comp_Vmscl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix scale, matrix by scalar (weird prefixes)
	// d[N,M] = s[N,M] * t[0]
	// Note: behaves just slightly differently than a series of vscls.

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;

	// Only the 4x4 form is compiled here; smaller sizes go to the interpreter.
	MatrixSize sz = GetMtxSize(op);
	if (sz != M_4x4) {
		DISABLE;
	}
	// Scalar living inside the destination matrix would be clobbered mid-loop.
	if (GetMtx(vt) == GetMtx(vd)) {
		DISABLE;
	}
	int n = GetMatrixSide(sz);

	// The entire matrix is scaled equally, so transpose doesn't matter. Let's normalize.
	if (IsMatrixTransposed(vs) && IsMatrixTransposed(vd)) {
		vs = TransposeMatrixReg(vs);
		vd = TransposeMatrixReg(vd);
	}
	// Mixed transpose state (only one side transposed) is not handled.
	if (IsMatrixTransposed(vs) || IsMatrixTransposed(vd)) {
		DISABLE;
	}

	u8 sregs[16], dregs[16], tregs[1];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);
	GetVectorRegs(tregs, V_Single, vt);

	// One Vec4Scale per row (rows are consecutive after normalization above.)
	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::Vec4Scale, dregs[i * 4], sregs[i * 4], tregs[0]);
	}
}
void IRFrontend::Comp_VScl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector scale, vector by scalar
	// d[N] = s[N] * t[0]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	u8 sregs[4], dregs[4], treg;
	GetVectorRegsPrefixS(sregs, sz, vs);
	// T prefixes handled by interp.
	GetVectorRegs(&treg, V_Single, vt);
	GetVectorRegsPrefixD(dregs, sz, vd);

	bool overlap = false;
	// For prefixes to work, we just have to ensure that none of the output registers spill
	// and that there's no overlap.
	u8 tempregs[4];
	memcpy(tempregs, dregs, sizeof(tempregs));
	for (int i = 0; i < n; ++i) {
		// Conservative, can be improved
		if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) {
			// Need to use temp regs
			tempregs[i] = IRVTEMP_0 + i;
			overlap = true;
		}
	}

	// SIMD fast paths, taken when no temps are needed (or the overlap is benign.)
	if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
		if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
			ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
			ApplyPrefixD(dregs, sz, vd);
			return;
		} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && opts.preferVec4) {
			// Scale into a temp, then blend the low three lanes into d, preserving d's w lane.
			ir.Write(IROp::Vec4Scale, IRVTEMP_0, sregs[0], treg);
			ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
			ApplyPrefixD(dregs, sz, vd);
			return;
		}
	}

	// Scalar fallback: multiply into temps, then copy into the real destinations.
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FMul, tempregs[i], sregs[i], treg);
	}

	for (int i = 0; i < n; i++) {
		// All must be mapped for prefixes to work.
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, vd);
}
// This may or may not be a win when using the IR interpreter...
// Many more instructions to interpret.
void IRFrontend::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	// Matrix multiply (weird prefixes)
	// D[0 .. N, 0 .. M] = S[0 .. N, 0 .. M]' * T[0 .. N, 0 .. M]
	// Note: Behaves as if it's implemented through a series of vdots.
	// Important: this is a matrix multiply with a pre-transposed S.

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	MatrixOverlapType soverlap = GetMatrixOverlap(vs, vd, sz);
	MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz);

	// A very common arrangement. Rearrange to something we can handle.
	if (IsMatrixTransposed(vd)) {
		// Matrix identity says (At * Bt) = (B * A)t
		// D = S * T
		// Dt = (S * T)t = (Tt * St)
		vd = TransposeMatrixReg(vd);
		std::swap(vs, vt);
	}

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(tregs, sz, vt);
	GetMatrixRegs(dregs, sz, vd);

	// Any source/destination overlap would be clobbered mid-computation; punt.
	if (soverlap || toverlap) {
		DISABLE;
	}

	// dregs are always consecutive, thanks to our transpose trick.
	// However, not sure this is always worth it.
	if (IsMatrixVec4(sz, dregs)) {
		// TODO: The interpreter would like proper matrix ops better. Can generate those, and
		// expand them like this as needed on "real" architectures.
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_T;
		if (!IsMatrixVec4(sz, sregs)) {
			// METHOD 1: Handles AbC and Abc
			// Accumulate scaled source columns into s0, one destination column at a time.
			for (int j = 0; j < 4; j++) {
				ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]);
				for (int i = 1; i < 4; i++) {
					ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[j * 4 + i]);
					ir.Write(IROp::Vec4Add, s0, s0, s1);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else if (IsMatrixVec4(sz, tregs)) {
			// METHOD 2: Handles ABC only. Not efficient on CPUs that don't do fast dots.
			// Dots only work if tregs are consecutive.
			// TODO: Skip this and resort to method one and transpose the output?
			for (int j = 0; j < 4; j++) {
				for (int i = 0; i < 4; i++) {
					ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[j * 4]);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else {
			// ABc - s consecutive, t not.
			// Tekken uses this.
			// logBlocks = 1;
		}
	}

	// Fallback. Expands a LOT
	int temp0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			// Scalar dot product of S row b with T column a; the final FAdd
			// writes straight into the destination element.
			ir.Write(IROp::FMul, temp0, sregs[b * 4], tregs[a * 4]);
			for (int c = 1; c < n; c++) {
				ir.Write(IROp::FMul, temp1, sregs[b * 4 + c], tregs[a * 4 + c]);
				ir.Write(IROp::FAdd, (c == n - 1) ? dregs[a * 4 + b] : temp0, temp0, temp1);
			}
		}
	}
}
void IRFrontend::Comp_Vtfm(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Vertex transform, vector by matrix (weird prefixes)
	// d[N] = s[N*m .. N*m + n-1] dot t[0 .. n-1]
	// Homogenous means t[n-1] is treated as 1.
	// Note: this might be implemented as a series of vdots with special prefixes.

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	// Homogenous form: widen the sizes by one; the missing t lane acts as 1.0.
	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz)+1);
		msz = (MatrixSize)((int)(msz)+1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}

	u8 sregs[16], dregs[4], tregs[4];
	GetMatrixRegs(sregs, msz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// SIMD-optimized implementations - if sregs[0..3] is non-consecutive, it's transposed.
	if (msz == M_4x4 && !IsMatrixVec4(msz, sregs)) {
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_S;
		// For this algorithm, we don't care if tregs are consecutive or not,
		// they are accessed one at a time. This handles homogenous transforms correctly, as well.
		// We take advantage of sregs[0] + 1 being sregs[4] here.
		ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
		for (int i = 1; i < 4; i++) {
			if (!homogenous || (i != n - 1)) {
				ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
				ir.Write(IROp::Vec4Add, s0, s0, s1);
			} else {
				// Homogenous last lane: add the column unscaled (implicit t = 1.0).
				ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
			}
		}
		if (IsVec4(sz, dregs)) {
			ir.Write(IROp::Vec4Mov, dregs[0], s0);
		} else {
			for (int i = 0; i < 4; i++) {
				ir.Write(IROp::FMov, dregs[i], s0 + i);
			}
		}
		return;
	} else if (msz == M_4x4 && IsMatrixVec4(msz, sregs) && IsVec4(sz, tregs)) {
		IRReg t = tregs[0];
		if (homogenous) {
			// This is probably even what the hardware basically does, wiring t[3] to 1.0f.
			ir.Write(IROp::Vec4Init, IRVTEMP_PFX_T, (int)Vec4Init::AllONE);
			ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, IRVTEMP_PFX_T, t, 0x7);
			t = IRVTEMP_PFX_T;
		}
		// One dot product per output lane; stage in temps to tolerate overlap.
		for (int i = 0; i < 4; i++)
			ir.Write(IROp::Vec4Dot, IRVTEMP_PFX_D + i, sregs[i * 4], t);
		for (int i = 0; i < 4; i++)
			ir.Write(IROp::FMov, dregs[i], IRVTEMP_PFX_D + i);
		return;
	}

	// TODO: test overlap, optimize.
	u8 tempregs[4];
	int s0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int i = 0; i < n; i++) {
		// Scalar dot of matrix row i with t, accumulated in s0.
		ir.Write(IROp::FMul, s0, sregs[i * 4], tregs[0]);
		for (int k = 1; k < n; k++) {
			if (!homogenous || k != n - 1) {
				ir.Write(IROp::FMul, temp1, sregs[i * 4 + k], tregs[k]);
				ir.Write(IROp::FAdd, s0, s0, temp1);
			} else {
				ir.Write(IROp::FAdd, s0, s0, sregs[i * 4 + k]);
			}
		}
		int temp = IRVTEMP_PFX_T + i;
		ir.Write(IROp::FMov, temp, s0);
		tempregs[i] = temp;
	}
	for (int i = 0; i < n; i++) {
		if (tempregs[i] != dregs[i])
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
	}
}
void IRFrontend::Comp_VCrs(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix() || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector cross (half a cross product, n = 3)
	// d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y]
	// To do a full cross product: vcrs tmp1, s, t; vcrs tmp2 t, s; vsub d, tmp1, tmp2;
	// (or just use vcrsp.)
	// Note: this is possibly just a swizzle prefix hack for vmul.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	if (sz != V_Triple)
		DISABLE;

	u8 sregs[4], dregs[4], tregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
		// Use Vec4 where we can. First, apply shuffles.
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3));
		ir.Write(IROp::Vec4Mul, IRVTEMP_0, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		// Now just retain w and blend in our values.
		ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);
	} else {
		// Scalar fallback; route through temps if d overlaps s or t.
		u8 tempregs[4]{};
		if (!IsOverlapSafe(n, dregs, n, sregs, n, tregs)) {
			for (int i = 0; i < n; ++i)
				tempregs[i] = IRVTEMP_0 + i;
		} else {
			for (int i = 0; i < n; ++i)
				tempregs[i] = dregs[i];
		}

		ir.Write(IROp::FMul, tempregs[0], sregs[1], tregs[2]);
		ir.Write(IROp::FMul, tempregs[1], sregs[2], tregs[0]);
		ir.Write(IROp::FMul, tempregs[2], sregs[0], tregs[1]);

		for (int i = 0; i < n; i++) {
			if (tempregs[i] != dregs[i])
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz, _VD);
}
void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

	// These instructions pack pairs or quads of integers into 32 bits.
	// The unsigned (u) versions skip the sign bit when packing, first doing a signed clamp to 0 (so the sign bit won't ever be 1).

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		// Quad of ints -> one packed register.
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		// Pair -> one register, quad -> two registers.
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[2], srcregs[4], tempregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(srcregs, sregs, sizeof(sregs));
	memcpy(tempregs, dregs, sizeof(dregs));

	int nOut = GetNumVectorElements(outsize);

	// If src registers aren't contiguous, make them.
	if (!IsVec2(sz, sregs) && !IsVec4(sz, sregs)) {
		// T prefix is unused.
		for (int i = 0; i < GetNumVectorElements(sz); i++) {
			srcregs[i] = IRVTEMP_PFX_T + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	if (bits == 8) {
		if (unsignedOp) { //vi2uc
			// Output is only one register.
			ir.Write(IROp::Vec4ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec4Pack31To8, tempregs[0], IRVTEMP_0);
		} else { //vi2c
			ir.Write(IROp::Vec4Pack32To8, tempregs[0], srcregs[0]);
		}
	} else {
		// bits == 16
		if (unsignedOp) { //vi2us
			// Output is only one register.
			ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec2Pack31To16, tempregs[0], IRVTEMP_0);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0 + 2, srcregs[2]);
				ir.Write(IROp::Vec2Pack31To16, tempregs[1], IRVTEMP_0 + 2);
			}
		} else { //vi2s
			ir.Write(IROp::Vec2Pack32To16, tempregs[0], srcregs[0]);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2Pack32To16, tempregs[1], srcregs[2]);
			}
		}
	}

	// Copy packed results into the real destinations where they differ.
	for (int i = 0; i < nOut; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, outsize, _VD);
}
void IRFrontend::Comp_Vx2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)

	// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
	// at the top. vus2i shifts it an extra bit right afterward.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
	// at the top too. vuc2i is a bit special (see below.)
	// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
	// then use it for both.

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
		sz = V_Single; // For some reason, sz is set to Quad in this case though the outsize is Single.
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[2], dregs[4], tempregs[4], srcregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(tempregs, dregs, sizeof(dregs));
	memcpy(srcregs, sregs, sizeof(sregs));

	// Remap source regs to be consecutive. This is not required
	// but helpful when implementations can join two Vec2Expand.
	if (sz == V_Pair && !IsConsecutive2(srcregs)) {
		for (int i = 0; i < 2; i++) {
			srcregs[i] = IRVTEMP_0 + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	int nIn = GetNumVectorElements(sz);

	int nOut = 2;
	if (outsize == V_Quad)
		nOut = 4;

	// Remap dest regs. PFX_T is unused.
	if (outsize == V_Pair) {
		bool consecutive = IsConsecutive2(dregs);
		// We must have them consecutive, so all temps, or none.
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	} else if (outsize == V_Quad) {
		bool consecutive = IsVec4(outsize, dregs);
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	}

	if (bits == 16) {
		// Each Vec2Unpack expands one source register into two destination lanes.
		if (unsignedOp) {
			ir.Write(IROp::Vec2Unpack16To31, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To31, tempregs[2], srcregs[1]);
		} else {
			ir.Write(IROp::Vec2Unpack16To32, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To32, tempregs[2], srcregs[1]);
		}
	} else if (bits == 8) {
		if (unsignedOp) {
			// See the interpreter, this one is odd. Hardware bug?
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
			ir.Write(IROp::Vec4DuplicateUpperBitsAndShift1, tempregs[0], tempregs[0]);
		} else {
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
		}
	}

	// Copy unpacked results into the real destinations where they differ.
	for (int i = 0; i < nOut; i++) {
		if (tempregs[i] != dregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}
	ApplyPrefixD(dregs, outsize, _VD);
}
void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (!js.HasNoPrefix())
		DISABLE;

	// Vector cross product (n = 3, weird prefixes)
	// d[0 .. 2] = s[0 .. 2] X t[0 .. 2]
	// Vector quaternion product (n = 4, weird prefixes)
	// d[0 .. 2] = t[0 .. 2] X s[0 .. 2] + s[3] * t[0 .. 2] + t[3] * s[0 .. 2]
	// d[3] = s[3]*t[3] - s[0 .. 2] dot t[0 .. 3]
	// Note: Behaves as if it's implemented through a series of vdots.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegs(sregs, sz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	if (sz == V_Triple) {
		// Route through temps when the destination overlaps either source.
		u8 tempregs[4]{};
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
				tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things
			} else {
				tempregs[i] = dregs[i];
			}
		}

		int temp0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		// Compute X
		ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]);
		ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]);
		ir.Write(IROp::FSub, tempregs[0], temp0, temp1);

		// Compute Y
		ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]);
		ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]);
		ir.Write(IROp::FSub, tempregs[1], temp0, temp1);

		// Compute Z
		ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]);
		ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]);
		ir.Write(IROp::FSub, tempregs[2], temp0, temp1);

		for (int i = 0; i < n; i++) {
			if (tempregs[i] != dregs[i])
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	} else if (sz == V_Quad) {
		// Rather than using vdots, we organize this as SIMD multiplies and adds.
		// That means flipping the logic column-wise. Also, luckily no prefix temps used.
		if (!IsConsecutive4(sregs) || !IsConsecutive4(tregs) || !IsConsecutive4(dregs)) {
			DISABLE;
		}

		// shuffleImm packs four 2-bit lane selectors; blendConst packs a 4-bit lane mask.
		auto shuffleImm = [](int x, int y, int z, int w) { return x | (y << 2) | (z << 4) | (w << 6); };
		auto blendConst = [](int x, int y, int z, int w) { return x | (y << 1) | (z << 2) | (w << 3); };

		// Prepare some negatives.
		ir.Write(IROp::Vec4Neg, IRVTEMP_0, tregs[0]);

		// tmp = S[x,x,x,x] * T[w,-z,y,-x]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(1, 0, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(3, 2, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(0, 0, 0, 0));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_D, IRVTEMP_PFX_S, IRVTEMP_PFX_T);

		// tmp += S[y,y,y,y] * T[z,w,-x,-y]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(1, 1, 0, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(2, 3, 0, 1));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(1, 1, 1, 1));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);

		// tmp += S[z,z,z,z] * T[-y,x,w,-z]
		ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_S, tregs[0], IRVTEMP_0, blendConst(0, 1, 1, 0));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_S, shuffleImm(1, 0, 3, 2));
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(2, 2, 2, 2));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
		ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);

		// tmp += S[w,w,w,w] * T[x,y,z,w]
		ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(3, 3, 3, 3));
		ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, tregs[0]);
		ir.Write(IROp::Vec4Add, dregs[0], IRVTEMP_PFX_D, IRVTEMP_PFX_S);
	} else {
		INVALIDOP;
	}
}
IRFrontend::Comp_Vcmp(MIPSOpcode op) {2070CONDITIONAL_DISABLE(VFPU_COMP);2071if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {2072DISABLE;2073}20742075// Vector compare2076// VFPU_CC[N] = COMPARE(s[N], t[N])20772078VectorSize sz = GetVecSize(op);2079int n = GetNumVectorElements(sz);20802081u8 sregs[4], tregs[4];2082GetVectorRegsPrefixS(sregs, sz, _VS);2083GetVectorRegsPrefixT(tregs, sz, _VT);20842085int cond = op & 0xF;2086int mask = 0;2087for (int i = 0; i < n; i++) {2088ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]);2089mask |= (1 << i);2090}2091ir.Write(IROp::FCmpVfpuAggregate, mask);2092}20932094void IRFrontend::Comp_Vcmov(MIPSOpcode op) {2095CONDITIONAL_DISABLE(VFPU_COMP);2096if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {2097DISABLE;2098}20992100// Vector conditional move2101// imm3 >= 6: d[N] = VFPU_CC[N] == tf ? s[N] : d[N]2102// imm3 < 6: d[N] = VFPU_CC[imm3] == tf ? s[N] : d[N]21032104VectorSize sz = GetVecSize(op);2105int n = GetNumVectorElements(sz);21062107u8 sregs[4], dregs[4];2108GetVectorRegsPrefixS(sregs, sz, _VS);2109GetVectorRegsPrefixD(dregs, sz, _VD);2110int tf = (op >> 19) & 1;2111int imm3 = (op >> 16) & 7;21122113if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {2114// TODO: Could do a VfpuCC variant of Vec4Blend.2115}21162117for (int i = 0; i < n; ++i) {2118// Simplification: Disable if overlap unsafe2119if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {2120DISABLE;2121}2122}2123if (imm3 < 6) {2124// Test one bit of CC. 
This bit decides whether none or all subregisters are copied.2125for (int i = 0; i < n; i++) {2126ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (imm3) | ((!tf) << 7));2127}2128} else {2129// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.2130for (int i = 0; i < n; i++) {2131ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7));2132}2133}2134ApplyPrefixD(dregs, sz, _VD);2135}21362137void IRFrontend::Comp_Viim(MIPSOpcode op) {2138CONDITIONAL_DISABLE(VFPU_XFER);2139if (js.HasUnknownPrefix())2140DISABLE;21412142// Vector integer immediate2143// d[0] = float(imm)21442145s32 imm = SignExtend16ToS32(op);2146u8 dreg;2147GetVectorRegsPrefixD(&dreg, V_Single, _VT);2148ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm));2149ApplyPrefixD(&dreg, V_Single, _VT);2150}21512152void IRFrontend::Comp_Vfim(MIPSOpcode op) {2153CONDITIONAL_DISABLE(VFPU_XFER);2154if (js.HasUnknownPrefix())2155DISABLE;21562157// Vector half-float immediate2158// d[0] = float(imm)21592160FP16 half;2161half.u = op & 0xFFFF;2162FP32 fval = half_to_float_fast5(half);21632164u8 dreg;2165GetVectorRegsPrefixD(&dreg, V_Single, _VT);2166ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f));2167ApplyPrefixD(&dreg, V_Single, _VT);2168}21692170void IRFrontend::Comp_Vcst(MIPSOpcode op) {2171CONDITIONAL_DISABLE(VFPU_XFER);2172if (js.HasUnknownPrefix())2173DISABLE;21742175// Vector constant2176// d[N] = CONST21772178int conNum = (op >> 16) & 0x1f;2179int vd = _VD;21802181VectorSize sz = GetVecSize(op);2182int n = GetNumVectorElements(sz);21832184u8 dregs[4];2185GetVectorRegsPrefixD(dregs, sz, vd);21862187if (IsVec4(sz, dregs)) {2188ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));2189ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0);2190} else if (IsVec3of4(sz, dregs) && opts.preferVec4) {2191ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));2192ir.Write(IROp::Vec4Shuffle, 
IRVTEMP_0, IRVTEMP_0, 0);2193ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);2194} else {2195for (int i = 0; i < n; i++) {2196// Most of the time, materializing a float is slower than copying from another float.2197if (i == 0)2198ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));2199else2200ir.Write(IROp::FMov, dregs[i], dregs[0]);2201}2202}2203ApplyPrefixD(dregs, sz, vd);2204}22052206// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of2207// calling the math library.2208void IRFrontend::Comp_VRot(MIPSOpcode op) {2209CONDITIONAL_DISABLE(VFPU_VEC);2210if (!js.HasNoPrefix()) {2211// Prefixes work strangely for this:2212// * They never apply to cos (whether d or s prefixes.)2213// * They mostly apply to sin/0, e.g. 0:1, M, or |x|.2214DISABLE;2215}22162217// Vector rotation matrix (weird prefixes)2218// d[N] = SINCOSVAL(s[0], imm[N])2219// The imm selects: cos index, sin index, 0 or sin for others, sin sign flip.22202221int vd = _VD;2222int vs = _VS;2223int imm = (op >> 16) & 0x1f;2224VectorSize sz = GetVecSize(op);2225int n = GetNumVectorElements(sz);2226int sineLane = (imm >> 2) & 3;2227int cosineLane = imm & 3;2228bool negSin = (imm & 0x10) ? 
true : false;2229bool broadcastSine = sineLane == cosineLane;22302231char d[4] = { '0', '0', '0', '0' };2232if (broadcastSine) {2233for (int i = 0; i < 4; i++)2234d[i] = 's';2235}2236d[sineLane] = 's';2237d[cosineLane] = 'c';22382239u8 dregs[4];2240GetVectorRegs(dregs, sz, vd);2241u8 sreg[1];2242GetVectorRegs(sreg, V_Single, vs);22432244// If there's overlap, sin is calculated without it, but cosine uses the result.2245// This corresponds with prefix handling, where cosine doesn't get in prefixes.2246if (broadcastSine || !IsOverlapSafe(n, dregs, 1, sreg)) {2247ir.Write(IROp::FSin, IRVTEMP_0, sreg[0]);2248if (negSin)2249ir.Write(IROp::FNeg, IRVTEMP_0, IRVTEMP_0);2250}22512252for (int i = 0; i < n; i++) {2253switch (d[i]) {2254case '0':2255ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(0.0f));2256break;2257case 's':2258if (broadcastSine || !IsOverlapSafe(n, dregs, 1, sreg)) {2259ir.Write(IROp::FMov, dregs[i], IRVTEMP_0);2260} else {2261ir.Write(IROp::FSin, dregs[i], sreg[0]);2262if (negSin) {2263ir.Write(IROp::FNeg, dregs[i], dregs[i]);2264}2265}2266break;2267case 'c':2268if (IsOverlapSafe(n, dregs, 1, sreg))2269ir.Write(IROp::FCos, dregs[i], sreg[0]);2270else if (dregs[sineLane] == sreg[0])2271ir.Write(IROp::FCos, dregs[i], IRVTEMP_0);2272else2273ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(1.0f));2274break;2275}2276}2277}22782279void IRFrontend::Comp_Vsgn(MIPSOpcode op) {2280CONDITIONAL_DISABLE(VFPU_VEC);2281if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {2282DISABLE;2283}22842285// Vector extract sign2286// d[N] = signum(s[N])22872288VectorSize sz = GetVecSize(op);2289int n = GetNumVectorElements(sz);22902291u8 sregs[4], dregs[4];2292GetVectorRegsPrefixS(sregs, sz, _VS);2293GetVectorRegsPrefixD(dregs, sz, _VD);22942295u8 tempregs[4];2296for (int i = 0; i < n; ++i) {2297if (!IsOverlapSafe(dregs[i], n, sregs)) {2298tempregs[i] = IRTEMP_0 + i;2299} else {2300tempregs[i] = dregs[i];2301}2302}23032304for (int i 
= 0; i < n; ++i) {2305ir.Write(IROp::FSign, tempregs[i], sregs[i]);2306}23072308for (int i = 0; i < n; ++i) {2309if (dregs[i] != tempregs[i]) {2310ir.Write(IROp::FMov, dregs[i], tempregs[i]);2311}2312}23132314ApplyPrefixD(dregs, sz, _VD);2315}23162317void IRFrontend::Comp_Vocp(MIPSOpcode op) {2318CONDITIONAL_DISABLE(VFPU_VEC);2319if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {2320DISABLE;2321}23222323// Vector one's complement2324// d[N] = 1.0 - s[N]23252326VectorSize sz = GetVecSize(op);2327int n = GetNumVectorElements(sz);23282329// This is a hack that modifies prefixes. We eat them later, so just overwrite.2330// S prefix forces the negate flags.2331js.prefixS |= 0x000F0000;2332// T prefix forces constants on and regnum to 1.2333// That means negate still works, and abs activates a different constant.2334js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;23352336u8 sregs[4], tregs[4], dregs[4];2337GetVectorRegsPrefixS(sregs, sz, _VS);2338// There's no bits for t, so just reuse s. 
It'll be constants only.2339GetVectorRegsPrefixT(tregs, sz, _VS);2340GetVectorRegsPrefixD(dregs, sz, _VD);23412342if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {2343ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]);2344} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {2345ir.Write(IROp::Vec4Add, IRVTEMP_0, tregs[0], sregs[0]);2346ir.Write(IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7);2347} else {2348u8 tempregs[4];2349for (int i = 0; i < n; ++i) {2350if (!IsOverlapSafe(dregs[i], n, sregs)) {2351tempregs[i] = IRVTEMP_0 + i;2352} else {2353tempregs[i] = dregs[i];2354}2355}23562357for (int i = 0; i < n; ++i) {2358ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);2359}2360for (int i = 0; i < n; ++i) {2361if (dregs[i] != tempregs[i]) {2362ir.Write(IROp::FMov, dregs[i], tempregs[i]);2363}2364}2365}23662367ApplyPrefixD(dregs, sz, _VD);2368}23692370void IRFrontend::Comp_ColorConv(MIPSOpcode op) {2371CONDITIONAL_DISABLE(VFPU_VEC);2372if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {2373DISABLE;2374}23752376// Vector color conversion2377// d[N] = ConvertTo16(s[N*2]) | (ConvertTo16(s[N*2+1]) << 16)23782379DISABLE;2380}23812382void IRFrontend::Comp_Vbfy(MIPSOpcode op) {2383CONDITIONAL_DISABLE(VFPU_VEC);2384if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {2385DISABLE;2386}23872388// Vector butterfly operation2389// vbfy2: d[0] = s[0] + s[2], d[1] = s[1] + s[3], d[2] = s[0] - s[2], d[3] = s[1] - s[3]2390// vbfy1: d[N*2] = s[N*2] + s[N*2+1], d[N*2+1] = s[N*2] - s[N*2+1]23912392VectorSize sz = GetVecSize(op);2393int n = GetNumVectorElements(sz);2394if (n != 2 && n != 4) {2395// Bad instructions2396INVALIDOP;2397}23982399u8 sregs[4], dregs[4];2400GetVectorRegsPrefixS(sregs, sz, _VS);2401GetVectorRegsPrefixD(dregs, sz, _VD);24022403u8 tempregs[4];2404for (int i = 0; i < n; ++i) 
{2405if (!IsOverlapSafe(dregs[i], n, sregs)) {2406tempregs[i] = IRVTEMP_0 + i;2407} else {2408tempregs[i] = dregs[i];2409}2410}24112412int subop = (op >> 16) & 0x1F;2413if (subop == 3 && n == 4) {2414// vbfy22415ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[2]);2416ir.Write(IROp::FAdd, tempregs[1], sregs[1], sregs[3]);2417ir.Write(IROp::FSub, tempregs[2], sregs[0], sregs[2]);2418ir.Write(IROp::FSub, tempregs[3], sregs[1], sregs[3]);2419} else if (subop == 2) {2420// vbfy12421ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[1]);2422ir.Write(IROp::FSub, tempregs[1], sregs[0], sregs[1]);2423if (n == 4) {2424ir.Write(IROp::FAdd, tempregs[2], sregs[2], sregs[3]);2425ir.Write(IROp::FSub, tempregs[3], sregs[2], sregs[3]);2426}2427} else {2428INVALIDOP;2429}24302431for (int i = 0; i < n; ++i) {2432if (tempregs[i] != dregs[i])2433ir.Write(IROp::FMov, dregs[i], tempregs[i]);2434}24352436ApplyPrefixD(dregs, sz, _VD);2437}2438}243924402441