CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/x86/X64IRCompVec.cpp
Views: 1401
// Copyright (c) 2023- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)1920#include <algorithm>21#include "Common/CPUDetect.h"22#include "Core/MemMap.h"23#include "Core/MIPS/x86/X64IRJit.h"24#include "Core/MIPS/x86/X64IRRegCache.h"2526// This file contains compilation for vector instructions.27//28// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.29// Currently known non working ones should have DISABLE. No flags because that's in IR already.3031// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }32#define CONDITIONAL_DISABLE {}33#define DISABLE { CompIR_Generic(inst); return; }34#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }3536namespace MIPSComp {3738using namespace Gen;39using namespace X64IRJitConstants;4041static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {42return r1 < r2 + l2 && r1 + l1 > r2;43}4445void X64JitBackend::EmitVecConstants() {46static const float vec4InitData[8][4] = {47{ 0.0f, 0.0f, 0.0f, 0.0f },48{ 1.0f, 1.0f, 1.0f, 1.0f },49{ -1.0f, -1.0f, -1.0f, -1.0f },50{ 1.0f, 0.0f, 0.0f, 0.0f },51{ 0.0f, 1.0f, 0.0f, 0.0f },52{ 0.0f, 0.0f, 1.0f, 0.0f },53{ 0.0f, 0.0f, 0.0f, 1.0f },54};5556constants.vec4InitValues = (const Float4Constant *)GetCodePointer();57for (size_t type = 0; type < ARRAY_SIZE(vec4InitData); ++type) {58for (int i = 0; i < 4; ++i) {59uint32_t val;60memcpy(&val, &vec4InitData[type][i], sizeof(val));61Write32(val);62}63}64}6566void X64JitBackend::CompIR_VecArith(IRInst inst) {67CONDITIONAL_DISABLE;6869switch (inst.op) {70case IROp::Vec4Add:71regs_.Map(inst);72if (inst.dest == inst.src1) {73ADDPS(regs_.FX(inst.dest), regs_.F(inst.src2));74} else if (inst.dest == inst.src2) {75ADDPS(regs_.FX(inst.dest), regs_.F(inst.src1));76} else if (cpu_info.bAVX) {77VADDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));78} else {79MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));80ADDPS(regs_.FX(inst.dest), regs_.F(inst.src2));81}82break;8384case IROp::Vec4Sub:85if (inst.dest == inst.src1) {86regs_.Map(inst);87SUBPS(regs_.FX(inst.dest), regs_.F(inst.src2));88} else if (cpu_info.bAVX) {89regs_.Map(inst);90VSUBPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));91} else if (inst.dest == inst.src2) {92X64Reg tempReg = regs_.MapWithFPRTemp(inst);93MOVAPS(tempReg, regs_.F(inst.src2));94MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));95SUBPS(regs_.FX(inst.dest), R(tempReg));96} else {97regs_.Map(inst);98MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));99SUBPS(regs_.FX(inst.dest), regs_.F(inst.src2));100}101break;102103case IROp::Vec4Mul:104regs_.Map(inst);105if (inst.dest == inst.src1) {106MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));107} else if (inst.dest == inst.src2) {108MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));109} else if (cpu_info.bAVX) {110VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));111} else {112MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));113MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));114}115break;116117case IROp::Vec4Div:118if (inst.dest == inst.src1) {119regs_.Map(inst);120DIVPS(regs_.FX(inst.dest), regs_.F(inst.src2));121} else if (cpu_info.bAVX) {122regs_.Map(inst);123VDIVPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));124} else if (inst.dest == inst.src2) {125X64Reg tempReg = regs_.MapWithFPRTemp(inst);126MOVAPS(tempReg, regs_.F(inst.src2));127MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));128DIVPS(regs_.FX(inst.dest), R(tempReg));129} else {130regs_.Map(inst);131MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));132DIVPS(regs_.FX(inst.dest), regs_.F(inst.src2));133}134break;135136case IROp::Vec4Scale:137// TODO: Handle "aliasing" of sizes.138if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1))139DISABLE;140141regs_.Map(inst);142SHUFPS(regs_.FX(inst.src2), regs_.F(inst.src2), 0);143if (inst.dest == inst.src1) {144MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));145} else if (inst.dest == inst.src2) {146MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));147} else if (cpu_info.bAVX) {148VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));149} else {150MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));151MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));152}153break;154155case IROp::Vec4Neg:156regs_.Map(inst);157if (cpu_info.bAVX) {158VXORPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible159} else {160if (inst.dest != inst.src1)161MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));162XORPS(regs_.FX(inst.dest), M(constants.signBitAll)); // rip accessible163}164break;165166case IROp::Vec4Abs:167regs_.Map(inst);168if (cpu_info.bAVX) {169VANDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), M(constants.noSignMask)); // rip accessible170} else {171if (inst.dest != inst.src1)172MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));173ANDPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible174}175break;176177default:178INVALIDOP;179break;180}181}182183void X64JitBackend::CompIR_VecAssign(IRInst inst) {184CONDITIONAL_DISABLE;185186switch (inst.op) {187case IROp::Vec4Init:188regs_.Map(inst);189if (inst.src1 == (int)Vec4Init::AllZERO) {190XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));191} else {192MOVAPS(regs_.FX(inst.dest), M(&constants.vec4InitValues[inst.src1])); // rip accessible193}194break;195196case IROp::Vec4Shuffle:197if (regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0) {198// This is a broadcast. If dest == src1, this won't clear it.199regs_.SpillLockFPR(inst.src1);200regs_.MapVec4(inst.dest, MIPSMap::NOINIT);201} else {202regs_.Map(inst);203}204if (cpu_info.bAVX) {205VPERMILPS(128, regs_.FX(inst.dest), regs_.F(inst.src1), inst.src2);206} else {207if (inst.dest != inst.src1)208MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));209SHUFPS(regs_.FX(inst.dest), regs_.F(inst.dest), inst.src2);210}211break;212213case IROp::Vec4Blend:214if (cpu_info.bAVX) {215regs_.Map(inst);216VBLENDPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2), (uint8_t)inst.constant);217} else if (cpu_info.bSSE4_1) {218regs_.Map(inst);219if (inst.dest == inst.src1) {220BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src2), (uint8_t)inst.constant);221} else if (inst.dest == inst.src2) {222BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src1), (uint8_t)~inst.constant);223} else {224MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));225BLENDPS(regs_.FX(inst.dest), regs_.F(inst.src2), (uint8_t)inst.constant);226}227} else {228// Could use some shuffles...229DISABLE;230}231break;232233case IROp::Vec4Mov:234regs_.Map(inst);235MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));236break;237238default:239INVALIDOP;240break;241}242}243244void X64JitBackend::CompIR_VecClamp(IRInst inst) {245CONDITIONAL_DISABLE;246247switch (inst.op) {248case IROp::Vec4ClampToZero:249case IROp::Vec2ClampToZero:250CompIR_Generic(inst);251break;252253default:254INVALIDOP;255break;256}257}258259void X64JitBackend::CompIR_VecHoriz(IRInst inst) {260CONDITIONAL_DISABLE;261262switch (inst.op) {263case IROp::Vec4Dot:264{265// TODO: Handle "aliasing" of sizes. In theory it should be fine if not dirty...266if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4))267DISABLE;268269X64Reg tempReg = regs_.MapWithFPRTemp(inst);270271if (inst.dest == inst.src1) {272MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));273} else if (inst.dest == inst.src2) {274MULPS(regs_.FX(inst.dest), regs_.F(inst.src1));275} else if (cpu_info.bAVX) {276VMULPS(128, regs_.FX(inst.dest), regs_.FX(inst.src1), regs_.F(inst.src2));277} else if (cpu_info.bSSE4_1) {278MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));279MULPS(regs_.FX(inst.dest), regs_.F(inst.src2));280}281282// This shuffle can be done in one op for SSE3/AVX, but it's not always faster.283MOVAPS(tempReg, regs_.F(inst.dest));284SHUFPS(tempReg, regs_.F(inst.dest), VFPU_SWIZZLE(1, 0, 3, 2));285ADDPS(regs_.FX(inst.dest), R(tempReg));286MOVHLPS(tempReg, regs_.FX(inst.dest));287ADDSS(regs_.FX(inst.dest), R(tempReg));288break;289}290291default:292INVALIDOP;293break;294}295}296297void X64JitBackend::CompIR_VecPack(IRInst inst) {298CONDITIONAL_DISABLE;299300switch (inst.op) {301case IROp::Vec2Unpack16To31:302case IROp::Vec4Pack32To8:303case IROp::Vec2Pack31To16:304case IROp::Vec4Unpack8To32:305case IROp::Vec2Unpack16To32:306case IROp::Vec4DuplicateUpperBitsAndShift1:307case IROp::Vec4Pack31To8:308case IROp::Vec2Pack32To16:309CompIR_Generic(inst);310break;311312default:313INVALIDOP;314break;315}316}317318} // namespace MIPSComp319320#endif321322323