CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/RiscV/RiscVCompVec.cpp
Views: 1401
// Copyright (c) 2023- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <algorithm>18#include "Core/MemMap.h"19#include "Core/MIPS/RiscV/RiscVJit.h"20#include "Core/MIPS/RiscV/RiscVRegCache.h"2122// This file contains compilation for vector instructions.23//24// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.25// Currently known non working ones should have DISABLE. No flags because that's in IR already.2627// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }28#define CONDITIONAL_DISABLE {}29#define DISABLE { CompIR_Generic(inst); return; }30#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }3132namespace MIPSComp {3334using namespace RiscVGen;35using namespace RiscVJitConstants;3637static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {38return r1 < r2 + l2 && r1 + l1 > r2;39}4041void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {42CONDITIONAL_DISABLE;4344switch (inst.op) {45case IROp::Vec4Init:46regs_.Map(inst);4748// TODO: Check if FCVT/FMV/FL is better.49switch ((Vec4Init)inst.src1) {50case Vec4Init::AllZERO:51for (int i = 0; i < 4; ++i)52FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);53break;5455case Vec4Init::AllONE:56if (CanFLI(32, 1.0f)) {57for (int i = 0; i < 4; ++i)58FLI(32, regs_.F(inst.dest + i), 1.0f);59} else {60LI(SCRATCH1, 1.0f);61FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);62for (int i = 1; i < 4; ++i)63FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));64}65break;6667case Vec4Init::AllMinusONE:68if (CanFLI(32, -1.0f)) {69for (int i = 0; i < 4; ++i)70FLI(32, regs_.F(inst.dest + i), -1.0f);71} else {72LI(SCRATCH1, -1.0f);73FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);74for (int i = 1; i < 4; ++i)75FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));76}77break;7879case Vec4Init::Set_1000:80if (!CanFLI(32, 1.0f))81LI(SCRATCH1, 1.0f);82for (int i = 0; i < 4; ++i) {83if (i == 0) {84if (CanFLI(32, 1.0f))85FLI(32, regs_.F(inst.dest + i), 1.0f);86else87FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);88} else {89FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);90}91}92break;9394case Vec4Init::Set_0100:95if (!CanFLI(32, 1.0f))96LI(SCRATCH1, 1.0f);97for (int i = 0; i < 4; ++i) {98if (i == 1) {99if (CanFLI(32, 1.0f))100FLI(32, regs_.F(inst.dest + i), 1.0f);101else102FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);103} else {104FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);105}106}107break;108109case Vec4Init::Set_0010:110if (!CanFLI(32, 1.0f))111LI(SCRATCH1, 1.0f);112for (int i = 0; i < 4; ++i) {113if (i == 2) {114if (CanFLI(32, 1.0f))115FLI(32, regs_.F(inst.dest + i), 1.0f);116else117FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);118} else {119FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);120}121}122break;123124case Vec4Init::Set_0001:125if (!CanFLI(32, 1.0f))126LI(SCRATCH1, 1.0f);127for (int i = 0; i < 4; ++i) {128if (i == 3) {129if (CanFLI(32, 1.0f))130FLI(32, regs_.F(inst.dest + i), 1.0f);131else132FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);133} else {134FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);135}136}137break;138}139break;140141case IROp::Vec4Shuffle:142if (inst.dest == inst.src1) {143RiscVReg tempReg = regs_.MapWithFPRTemp(inst);144145// Try to find the least swaps needed to move in place, never worse than 6 FMVs.146// Would be better with a vmerge and vector regs.147int state[4]{ 0, 1, 2, 3 };148int goal[4]{ (inst.src2 >> 0) & 3, (inst.src2 >> 2) & 3, (inst.src2 >> 4) & 3, (inst.src2 >> 6) & 3 };149150static constexpr int NOT_FOUND = 4;151auto findIndex = [](int *arr, int val, int start = 0) {152return (int)(std::find(arr + start, arr + 4, val) - arr);153};154auto moveChained = [&](const std::vector<int> &lanes, bool rotate) {155int firstState = state[lanes.front()];156if (rotate)157FMV(32, tempReg, regs_.F(inst.dest + lanes.front()));158for (size_t i = 1; i < lanes.size(); ++i) {159FMV(32, regs_.F(inst.dest + lanes[i - 1]), regs_.F(inst.dest + lanes[i]));160state[lanes[i - 1]] = state[lanes[i]];161}162if (rotate) {163FMV(32, regs_.F(inst.dest + lanes.back()), tempReg);164state[lanes.back()] = firstState;165}166};167168for (int i = 0; i < 4; ++i) {169// Overlap, so if they match, nothing to do.170if (goal[i] == state[i])171continue;172173int neededBy = findIndex(goal, state[i], i + 1);174int foundIn = findIndex(state, goal[i], 0);175_assert_(foundIn != NOT_FOUND);176177if (neededBy == NOT_FOUND || neededBy == foundIn) {178moveChained({ i, foundIn }, neededBy == foundIn);179continue;180}181182// Maybe we can avoid a swap and move the next thing into place.183int neededByDepth2 = findIndex(goal, state[neededBy], i + 1);184if (neededByDepth2 == NOT_FOUND || neededByDepth2 == foundIn) {185moveChained({ neededBy, i, foundIn }, neededByDepth2 == foundIn);186continue;187}188189// Since we only have 4 items, this is as deep as the chain could go.190int neededByDepth3 = findIndex(goal, state[neededByDepth2], i + 1);191moveChained({ neededByDepth2, neededBy, i, foundIn }, neededByDepth3 == foundIn);192}193} else {194regs_.Map(inst);195for (int i = 0; i < 4; ++i) {196int lane = (inst.src2 >> (i * 2)) & 3;197FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + lane));198}199}200break;201202case IROp::Vec4Blend:203regs_.Map(inst);204for (int i = 0; i < 4; ++i) {205int which = (inst.constant >> i) & 1;206IRReg srcReg = which ? inst.src2 : inst.src1;207if (inst.dest != srcReg)208FMV(32, regs_.F(inst.dest + i), regs_.F(srcReg + i));209}210break;211212case IROp::Vec4Mov:213if (inst.dest != inst.src1) {214regs_.Map(inst);215for (int i = 0; i < 4; ++i)216FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));217}218break;219220default:221INVALIDOP;222break;223}224}225226void RiscVJitBackend::CompIR_VecArith(IRInst inst) {227CONDITIONAL_DISABLE;228229switch (inst.op) {230case IROp::Vec4Add:231regs_.Map(inst);232for (int i = 0; i < 4; ++i)233FADD(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));234break;235236case IROp::Vec4Sub:237regs_.Map(inst);238for (int i = 0; i < 4; ++i)239FSUB(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));240break;241242case IROp::Vec4Mul:243regs_.Map(inst);244for (int i = 0; i < 4; ++i)245FMUL(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));246break;247248case IROp::Vec4Div:249regs_.Map(inst);250for (int i = 0; i < 4; ++i)251FDIV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));252break;253254case IROp::Vec4Scale:255regs_.Map(inst);256if (Overlap(inst.src2, 1, inst.dest, 3)) {257// We have to handle overlap, doing dest == src2 last.258for (int i = 0; i < 4; ++i) {259if (inst.src2 != inst.dest + i)260FMUL(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));261}262for (int i = 0; i < 4; ++i) {263if (inst.src2 == inst.dest + i)264FMUL(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));265}266} else {267for (int i = 0; i < 4; ++i)268FMUL(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));269}270break;271272case IROp::Vec4Neg:273regs_.Map(inst);274for (int i = 0; i < 4; ++i)275FNEG(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));276break;277278case IROp::Vec4Abs:279regs_.Map(inst);280for (int i = 0; i < 4; ++i)281FABS(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));282break;283284default:285INVALIDOP;286break;287}288}289290void RiscVJitBackend::CompIR_VecHoriz(IRInst inst) {291CONDITIONAL_DISABLE;292293switch (inst.op) {294case IROp::Vec4Dot:295regs_.Map(inst);296if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {297// This means inst.dest overlaps one of src1 or src2. We have to do that one first.298// Technically this may impact -0.0 and such, but dots accurately need to be aligned anyway.299for (int i = 0; i < 4; ++i) {300if (inst.dest == inst.src1 + i || inst.dest == inst.src2 + i)301FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));302}303for (int i = 0; i < 4; ++i) {304if (inst.dest != inst.src1 + i && inst.dest != inst.src2 + i)305FMADD(32, regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));306}307} else {308FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));309for (int i = 1; i < 4; ++i)310FMADD(32, regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));311}312break;313314default:315INVALIDOP;316break;317}318}319320void RiscVJitBackend::CompIR_VecPack(IRInst inst) {321CONDITIONAL_DISABLE;322323switch (inst.op) {324case IROp::Vec2Unpack16To31:325case IROp::Vec4Pack32To8:326case IROp::Vec2Pack31To16:327CompIR_Generic(inst);328break;329330case IROp::Vec4Unpack8To32:331// TODO: This works for now, but may need to handle aliasing for vectors.332regs_.Map(inst);333FMV(FMv::X, FMv::W, SCRATCH2, regs_.F(inst.src1));334for (int i = 0; i < 4; ++i) {335// Mask using walls.336if (i != 0) {337SRLI(SCRATCH1, SCRATCH2, i * 8);338SLLI(SCRATCH1, SCRATCH1, 24);339} else {340SLLI(SCRATCH1, SCRATCH2, 24);341}342FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);343}344break;345346case IROp::Vec2Unpack16To32:347// TODO: This works for now, but may need to handle aliasing for vectors.348regs_.Map(inst);349FMV(FMv::X, FMv::W, SCRATCH2, regs_.F(inst.src1));350SLLI(SCRATCH1, SCRATCH2, 16);351FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);352SRLI(SCRATCH1, SCRATCH2, 16);353SLLI(SCRATCH1, SCRATCH1, 16);354FMV(FMv::W, FMv::X, regs_.F(inst.dest + 1), SCRATCH1);355break;356357case IROp::Vec4DuplicateUpperBitsAndShift1:358regs_.Map(inst);359for (int i = 0; i < 4; i++) {360FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1 + i));361SRLIW(SCRATCH2, SCRATCH1, 8);362OR(SCRATCH1, SCRATCH1, SCRATCH2);363SRLIW(SCRATCH2, SCRATCH1, 16);364OR(SCRATCH1, SCRATCH1, SCRATCH2);365SRLIW(SCRATCH1, SCRATCH1, 1);366FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);367}368break;369370case IROp::Vec4Pack31To8:371// TODO: This works for now, but may need to handle aliasing for vectors.372regs_.Map(inst);373for (int i = 0; i < 4; ++i) {374FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1 + i));375SRLI(SCRATCH1, SCRATCH1, 23);376if (i == 0) {377ANDI(SCRATCH2, SCRATCH1, 0xFF);378} else {379ANDI(SCRATCH1, SCRATCH1, 0xFF);380SLLI(SCRATCH1, SCRATCH1, 8 * i);381OR(SCRATCH2, SCRATCH2, SCRATCH1);382}383}384385FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH2);386break;387388case IROp::Vec2Pack32To16:389// TODO: This works for now, but may need to handle aliasing for vectors.390regs_.Map(inst);391FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1));392FMV(FMv::X, FMv::W, SCRATCH2, regs_.F(inst.src1 + 1));393// Keep in mind, this was sign-extended, so we have to zero the upper.394SLLI(SCRATCH1, SCRATCH1, XLEN - 32);395// Now we just set (SCRATCH2 & 0xFFFF0000) | SCRATCH1.396SRLI(SCRATCH1, SCRATCH1, XLEN - 16);397// Use a wall to mask. We can ignore the upper 32 here.398SRLI(SCRATCH2, SCRATCH2, 16);399SLLI(SCRATCH2, SCRATCH2, 16);400OR(SCRATCH1, SCRATCH1, SCRATCH2);401// Okay, to the floating point register.402FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);403break;404405default:406INVALIDOP;407break;408}409}410411void RiscVJitBackend::CompIR_VecClamp(IRInst inst) {412CONDITIONAL_DISABLE;413414switch (inst.op) {415case IROp::Vec4ClampToZero:416regs_.Map(inst);417for (int i = 0; i < 4; i++) {418FMV(FMv::X, FMv::W, SCRATCH1, regs_.F(inst.src1 + i));419SRAIW(SCRATCH2, SCRATCH1, 31);420if (cpu_info.RiscV_Zbb) {421ANDN(SCRATCH1, SCRATCH1, SCRATCH2);422} else {423NOT(SCRATCH2, SCRATCH2);424AND(SCRATCH1, SCRATCH1, SCRATCH2);425}426FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);427}428break;429430case IROp::Vec2ClampToZero:431CompIR_Generic(inst);432break;433434default:435INVALIDOP;436break;437}438}439440} // namespace MIPSComp441442443