CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM64/Arm64IRCompVec.cpp
Views: 1401
// Copyright (c) 2023- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18// In other words, PPSSPP_ARCH(ARM64) || DISASM_ALL.19#if PPSSPP_ARCH(ARM64) || (PPSSPP_PLATFORM(WINDOWS) && !defined(__LIBRETRO__))2021#include <algorithm>22#include "Common/CPUDetect.h"23#include "Core/MemMap.h"24#include "Core/MIPS/ARM64/Arm64IRJit.h"25#include "Core/MIPS/ARM64/Arm64IRRegCache.h"2627// This file contains compilation for vector instructions.28//29// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.30// Currently known non working ones should have DISABLE. 
No flags because that's in IR already.3132// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }33#define CONDITIONAL_DISABLE {}34#define DISABLE { CompIR_Generic(inst); return; }35#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }3637namespace MIPSComp {3839using namespace Arm64Gen;40using namespace Arm64IRJitConstants;4142static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {43return r1 < r2 + l2 && r1 + l1 > r2;44}4546void Arm64JitBackend::CompIR_VecArith(IRInst inst) {47CONDITIONAL_DISABLE;4849switch (inst.op) {50case IROp::Vec4Add:51regs_.Map(inst);52fp_.FADD(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));53break;5455case IROp::Vec4Sub:56regs_.Map(inst);57fp_.FSUB(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));58break;5960case IROp::Vec4Mul:61regs_.Map(inst);62fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));63break;6465case IROp::Vec4Div:66regs_.Map(inst);67fp_.FDIV(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));68break;6970case IROp::Vec4Scale:71if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1)) {72// ARM64 can handle this, but we have to map specially.73regs_.SpillLockFPR(inst.dest, inst.src1);74regs_.MapVec4(inst.src1);75regs_.MapVec4(inst.src2 & ~3);76regs_.MapVec4(inst.dest, MIPSMap::NOINIT);77fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2 & ~3), inst.src2 & 3);78} else {79regs_.Map(inst);80fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2), 0);81}82break;8384case IROp::Vec4Neg:85regs_.Map(inst);86fp_.FNEG(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1));87break;8889case IROp::Vec4Abs:90regs_.Map(inst);91fp_.FABS(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1));92break;9394default:95INVALIDOP;96break;97}98}99100enum class Arm64Shuffle 
{101DUP0_AAAA,102DUP1_BBBB,103DUP2_CCCC,104DUP3_DDDD,105MOV_ABCD,106TRN1_AACC,107TRN2_BBDD,108UZP1_ACAC,109UZP2_BDBD,110ZIP1_AABB,111ZIP2_CCDD,112REV64_BADC,113EXT4_BCDA,114EXT8_CDAB,115EXT12_DABC,116117// These steps are more expensive and use a temp.118REV64_EXT8_CDBA,119REV64_EXT8_DCAB,120EXT4_UZP1_BDAC,121EXT4_UZP2_CABD,122EXT8_ZIP1_ACBD,123EXT8_ZIP2_CADB,124125// Any that don't fully replace dest must be after this point.126INS0_TO_1,127INS0_TO_2,128INS0_TO_3,129INS1_TO_0,130INS1_TO_2,131INS1_TO_3,132INS2_TO_0,133INS2_TO_1,134INS2_TO_3,135INS3_TO_0,136INS3_TO_1,137INS3_TO_2,138XTN2,139140// These hacks to prevent 4 instructions, but scoring isn't smart enough to avoid.141EXT12_ZIP1_ADBA,142DUP3_UZP1_DDAC,143144COUNT_NORMAL = EXT12_ZIP1_ADBA,145COUNT_SIMPLE = REV64_EXT8_CDBA,146COUNT_NOPREV = INS0_TO_1,147};148149uint8_t Arm64ShuffleMask(Arm64Shuffle method) {150// Hopefully optimized into a lookup table, this is a bit less confusing to read...151switch (method) {152case Arm64Shuffle::DUP0_AAAA: return 0x00;153case Arm64Shuffle::DUP1_BBBB: return 0x55;154case Arm64Shuffle::DUP2_CCCC: return 0xAA;155case Arm64Shuffle::DUP3_DDDD: return 0xFF;156case Arm64Shuffle::MOV_ABCD: return 0xE4;157case Arm64Shuffle::TRN1_AACC: return 0xA0;158case Arm64Shuffle::TRN2_BBDD: return 0xF5;159case Arm64Shuffle::UZP1_ACAC: return 0x88;160case Arm64Shuffle::UZP2_BDBD: return 0xDD;161case Arm64Shuffle::ZIP1_AABB: return 0x50;162case Arm64Shuffle::ZIP2_CCDD: return 0xFA;163case Arm64Shuffle::REV64_BADC: return 0xB1;164case Arm64Shuffle::EXT4_BCDA: return 0x39;165case Arm64Shuffle::EXT8_CDAB: return 0x4E;166case Arm64Shuffle::EXT12_DABC: return 0x93;167case Arm64Shuffle::REV64_EXT8_CDBA: return 0x1E;168case Arm64Shuffle::REV64_EXT8_DCAB: return 0x4B;169case Arm64Shuffle::EXT4_UZP1_BDAC: return 0x8D;170case Arm64Shuffle::EXT4_UZP2_CABD: return 0xD2;171case Arm64Shuffle::EXT8_ZIP1_ACBD: return 0xD8;172case Arm64Shuffle::EXT8_ZIP2_CADB: return 0x72;173case Arm64Shuffle::INS0_TO_1: return 
0xE0;174case Arm64Shuffle::INS0_TO_2: return 0xC4;175case Arm64Shuffle::INS0_TO_3: return 0x24;176case Arm64Shuffle::INS1_TO_0: return 0xE5;177case Arm64Shuffle::INS1_TO_2: return 0xD4;178case Arm64Shuffle::INS1_TO_3: return 0x64;179case Arm64Shuffle::INS2_TO_0: return 0xE6;180case Arm64Shuffle::INS2_TO_1: return 0xE8;181case Arm64Shuffle::INS2_TO_3: return 0xA4;182case Arm64Shuffle::INS3_TO_0: return 0xE7;183case Arm64Shuffle::INS3_TO_1: return 0xEC;184case Arm64Shuffle::INS3_TO_2: return 0xF4;185case Arm64Shuffle::XTN2: return 0x84;186case Arm64Shuffle::EXT12_ZIP1_ADBA: return 0x1C;187case Arm64Shuffle::DUP3_UZP1_DDAC: return 0x8F;188default:189_assert_(false);190return 0;191}192}193194void Arm64ShuffleApply(ARM64FloatEmitter &fp, Arm64Shuffle method, ARM64Reg vd, ARM64Reg vs) {195switch (method) {196case Arm64Shuffle::DUP0_AAAA: fp.DUP(32, vd, vs, 0); return;197case Arm64Shuffle::DUP1_BBBB: fp.DUP(32, vd, vs, 1); return;198case Arm64Shuffle::DUP2_CCCC: fp.DUP(32, vd, vs, 2); return;199case Arm64Shuffle::DUP3_DDDD: fp.DUP(32, vd, vs, 3); return;200case Arm64Shuffle::MOV_ABCD: _assert_(vd != vs); fp.MOV(vd, vs); return;201case Arm64Shuffle::TRN1_AACC: fp.TRN1(32, vd, vs, vs); return;202case Arm64Shuffle::TRN2_BBDD: fp.TRN2(32, vd, vs, vs); return;203case Arm64Shuffle::UZP1_ACAC: fp.UZP1(32, vd, vs, vs); return;204case Arm64Shuffle::UZP2_BDBD: fp.UZP2(32, vd, vs, vs); return;205case Arm64Shuffle::ZIP1_AABB: fp.ZIP1(32, vd, vs, vs); return;206case Arm64Shuffle::ZIP2_CCDD: fp.ZIP2(32, vd, vs, vs); return;207case Arm64Shuffle::REV64_BADC: fp.REV64(32, vd, vs); return;208case Arm64Shuffle::EXT4_BCDA: fp.EXT(vd, vs, vs, 4); return;209case Arm64Shuffle::EXT8_CDAB: fp.EXT(vd, vs, vs, 8); return;210case Arm64Shuffle::EXT12_DABC: fp.EXT(vd, vs, vs, 12); return;211212case Arm64Shuffle::REV64_EXT8_CDBA:213fp.REV64(32, EncodeRegToQuad(SCRATCHF1), vs);214fp.EXT(vd, vs, EncodeRegToQuad(SCRATCHF1), 8);215return;216217case Arm64Shuffle::REV64_EXT8_DCAB:218fp.REV64(32, 
EncodeRegToQuad(SCRATCHF1), vs);219fp.EXT(vd, EncodeRegToQuad(SCRATCHF1), vs, 8);220return;221222case Arm64Shuffle::EXT4_UZP1_BDAC:223fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 4);224fp.UZP1(32, vd, EncodeRegToQuad(SCRATCHF1), vs);225return;226227case Arm64Shuffle::EXT4_UZP2_CABD:228fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 4);229fp.UZP2(32, vd, EncodeRegToQuad(SCRATCHF1), vs);230return;231232case Arm64Shuffle::EXT8_ZIP1_ACBD:233fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 8);234fp.ZIP1(32, vd, vs, EncodeRegToQuad(SCRATCHF1));235return;236237case Arm64Shuffle::EXT8_ZIP2_CADB:238fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 8);239fp.ZIP2(32, vd, vs, EncodeRegToQuad(SCRATCHF1));240return;241242case Arm64Shuffle::INS0_TO_1: fp.INS(32, vd, 1, vs, 0); return;243case Arm64Shuffle::INS0_TO_2: fp.INS(32, vd, 2, vs, 0); return;244case Arm64Shuffle::INS0_TO_3: fp.INS(32, vd, 3, vs, 0); return;245case Arm64Shuffle::INS1_TO_0: fp.INS(32, vd, 0, vs, 1); return;246case Arm64Shuffle::INS1_TO_2: fp.INS(32, vd, 2, vs, 1); return;247case Arm64Shuffle::INS1_TO_3: fp.INS(32, vd, 3, vs, 1); return;248case Arm64Shuffle::INS2_TO_0: fp.INS(32, vd, 0, vs, 2); return;249case Arm64Shuffle::INS2_TO_1: fp.INS(32, vd, 1, vs, 2); return;250case Arm64Shuffle::INS2_TO_3: fp.INS(32, vd, 3, vs, 2); return;251case Arm64Shuffle::INS3_TO_0: fp.INS(32, vd, 0, vs, 3); return;252case Arm64Shuffle::INS3_TO_1: fp.INS(32, vd, 1, vs, 3); return;253case Arm64Shuffle::INS3_TO_2: fp.INS(32, vd, 2, vs, 3); return;254255case Arm64Shuffle::XTN2: fp.XTN2(32, vd, vs); return;256257case Arm64Shuffle::EXT12_ZIP1_ADBA:258fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 12);259fp.ZIP1(32, vd, vs, EncodeRegToQuad(SCRATCHF1));260return;261262case Arm64Shuffle::DUP3_UZP1_DDAC:263fp.DUP(32, EncodeRegToQuad(SCRATCHF1), vs, 3);264fp.UZP1(32, vd, EncodeRegToQuad(SCRATCHF1), vs);265return;266267default:268_assert_(false);269return;270}271}272273uint8_t Arm64ShuffleResult(uint8_t mask, uint8_t prev) {274if (prev == 0xE4)275return 
mask;276277uint8_t result = 0;278for (int i = 0; i < 4; ++i) {279int takeLane = (mask >> (i * 2)) & 3;280int lane = (prev >> (takeLane * 2)) & 3;281result |= lane << (i * 2);282}283return result;284}285286int Arm64ShuffleScore(uint8_t shuf, uint8_t goal, int steps = 1) {287if (shuf == goal)288return 100;289290int score = 0;291bool needs[4]{};292bool gets[4]{};293for (int i = 0; i < 4; ++i) {294uint8_t mask = 3 << (i * 2);295needs[(goal & mask) >> (i * 2)] = true;296gets[(shuf & mask) >> (i * 2)] = true;297if ((shuf & mask) == (goal & mask))298score += 4;299}300301for (int i = 0; i < 4; ++i) {302if (needs[i] && !gets[i])303return 0;304}305306// We need to look one level deeper to solve some, such as 1B (common) well.307if (steps > 0) {308int bestNextScore = 0;309for (int m = 0; m < (int)Arm64Shuffle::COUNT_NORMAL; ++m) {310uint8_t next = Arm64ShuffleResult(Arm64ShuffleMask((Arm64Shuffle)m), shuf);311int nextScore = Arm64ShuffleScore(next, goal, steps - 1);312if (nextScore > score) {313bestNextScore = nextScore;314if (bestNextScore == 100) {315// Take the earliest that gives us two steps, it's cheaper (not 2 instructions.)316score = 0;317break;318}319}320}321322score += bestNextScore / 2;323}324325return score;326}327328Arm64Shuffle Arm64BestShuffle(uint8_t goal, uint8_t prev, bool needsCopy) {329// A couple special cases for optimal shuffles.330if (goal == 0x7C && prev == 0xE4)331return Arm64Shuffle::REV64_BADC;332if (goal == 0x2B && prev == 0xE4)333return Arm64Shuffle::EXT8_CDAB;334if ((goal == 0x07 || goal == 0x1C) && prev == 0xE4)335return Arm64Shuffle::EXT12_ZIP1_ADBA;336if ((goal == 0x8F || goal == 0x2F) && prev == 0xE4)337return Arm64Shuffle::DUP3_UZP1_DDAC;338339// needsCopy true means insert isn't possible.340int attempts = needsCopy ? 
(int)Arm64Shuffle::COUNT_NOPREV : (int)Arm64Shuffle::COUNT_NORMAL;341342Arm64Shuffle best = Arm64Shuffle::MOV_ABCD;343int bestScore = 0;344for (int m = 0; m < attempts; ++m) {345uint8_t result = Arm64ShuffleResult(Arm64ShuffleMask((Arm64Shuffle)m), prev);346int score = Arm64ShuffleScore(result, goal);347// Slightly discount options that involve an extra instruction.348if (m >= (int)Arm64Shuffle::COUNT_SIMPLE && m < (int)Arm64Shuffle::COUNT_NOPREV)349score--;350if (score > bestScore) {351best = (Arm64Shuffle)m;352bestScore = score;353}354}355356_assert_(bestScore > 0);357return best;358}359360361static void Arm64ShufflePerform(ARM64FloatEmitter &fp, ARM64Reg vd, ARM64Reg vs, u8 shuf) {362// This performs all shuffles within 3 "steps" (some are two instructions, though.)363_assert_msg_(shuf != 0xE4, "Non-shuffles shouldn't get here");364365uint8_t state = 0xE4;366// If they're not the same, the first step needs to be a copy.367bool needsCopy = vd != vs;368for (int i = 0; i < 4 && state != shuf; ++i) {369// Figure out the next step and write it out.370Arm64Shuffle method = Arm64BestShuffle(shuf, state, needsCopy);371Arm64ShuffleApply(fp, method, vd, needsCopy ? 
vs : vd);372373// Update our state to where we've ended up, for next time.374needsCopy = false;375state = Arm64ShuffleResult(Arm64ShuffleMask(method), state);376}377378_assert_msg_(state == shuf, "Arm64ShufflePerform failed to resolve shuffle");379}380381void Arm64JitBackend::CompIR_VecAssign(IRInst inst) {382CONDITIONAL_DISABLE;383384switch (inst.op) {385case IROp::Vec4Init:386regs_.Map(inst);387switch (Vec4Init(inst.src1)) {388case Vec4Init::AllZERO:389fp_.MOVI(32, regs_.FQ(inst.dest), 0);390break;391392case Vec4Init::AllONE:393case Vec4Init::AllMinusONE:394fp_.MOVI2FDUP(regs_.FQ(inst.dest), 1.0f, INVALID_REG, Vec4Init(inst.src1) == Vec4Init::AllMinusONE);395break;396397case Vec4Init::Set_1000:398case Vec4Init::Set_0100:399case Vec4Init::Set_0010:400case Vec4Init::Set_0001:401fp_.MOVI(32, regs_.FQ(inst.dest), 0);402fp_.MOVI2FDUP(EncodeRegToQuad(SCRATCHF1), 1.0f);403fp_.INS(32, regs_.FQ(inst.dest), inst.src1 - (int)Vec4Init::Set_1000, EncodeRegToQuad(SCRATCHF1), inst.src1 - (int)Vec4Init::Set_1000);404break;405406default:407_assert_msg_(false, "Unexpected Vec4Init value %d", inst.src1);408DISABLE;409}410break;411412case IROp::Vec4Shuffle:413// There's not really an easy shuffle op on ARM64...414if (regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0x00) {415// This is a broadcast. 
If dest == src1, this won't clear it.416regs_.SpillLockFPR(inst.src1);417regs_.MapVec4(inst.dest, MIPSMap::NOINIT);418fp_.DUP(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), 0);419} else if (inst.src2 == 0xE4) {420if (inst.dest != inst.src1) {421regs_.Map(inst);422fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));423}424} else {425regs_.Map(inst);426Arm64ShufflePerform(fp_, regs_.FQ(inst.dest), regs_.FQ(inst.src1), inst.src2);427}428break;429430case IROp::Vec4Blend:431regs_.Map(inst);432if (inst.src1 == inst.src2) {433// Shouldn't really happen, just making sure the below doesn't have to think about it.434if (inst.dest != inst.src1)435fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));436break;437}438439// To reduce overlap cases to consider, let's inverse src1/src2 if dest == src2.440// Thus, dest could be src1, but no other overlap is possible.441if (inst.dest == inst.src2) {442std::swap(inst.src1, inst.src2);443inst.constant ^= 0xF;444}445446switch (inst.constant & 0xF) {447case 0b0000:448if (inst.dest != inst.src1)449fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));450break;451452case 0b0001:453if (inst.dest != inst.src1)454fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));455fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);456break;457458case 0b0010:459if (inst.dest != inst.src1)460fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));461fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);462break;463464case 0b0011:465if (inst.dest != inst.src1)466fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));467fp_.INS(64, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);468break;469470case 0b0100:471if (inst.dest != inst.src1)472fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));473fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);474break;475476case 0b0101:477// To get AbCd: REV64 to BADC, then TRN2 xAxC, xbxd.478fp_.REV64(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src2));479fp_.TRN2(32, regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1), 
regs_.FQ(inst.src1));480break;481482case 0b0110:483if (inst.dest != inst.src1)484fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));485fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);486fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);487break;488489case 0b0111:490if (inst.dest != inst.src1) {491fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));492fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src1), 3);493} else {494fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));495fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));496fp_.INS(32, regs_.FQ(inst.dest), 3, EncodeRegToQuad(SCRATCHF1), 3);497}498break;499500case 0b1000:501if (inst.dest != inst.src1)502fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));503fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);504break;505506case 0b1001:507if (inst.dest != inst.src1)508fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));509fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);510fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);511break;512513case 0b1010:514// To get aBcD: REV64 to badc, then TRN2 xaxc, xBxD.515fp_.REV64(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1));516fp_.TRN2(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.src2));517break;518519case 0b1011:520if (inst.dest != inst.src1) {521fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));522fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src1), 2);523} else {524fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));525fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));526fp_.INS(32, regs_.FQ(inst.dest), 2, EncodeRegToQuad(SCRATCHF1), 2);527}528break;529530case 0b1100:531if (inst.dest != inst.src1)532fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));533fp_.INS(64, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);534break;535536case 0b1101:537if (inst.dest != inst.src1) {538fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));539fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src1), 1);540} else 
{541fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));542fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));543fp_.INS(32, regs_.FQ(inst.dest), 1, EncodeRegToQuad(SCRATCHF1), 1);544}545break;546547case 0b1110:548if (inst.dest != inst.src1) {549fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));550fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src1), 0);551} else {552fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));553fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));554fp_.INS(32, regs_.FQ(inst.dest), 0, EncodeRegToQuad(SCRATCHF1), 0);555}556break;557558case 0b1111:559if (inst.dest != inst.src2)560fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));561break;562}563break;564565case IROp::Vec4Mov:566if (inst.dest != inst.src1) {567regs_.Map(inst);568fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));569}570break;571572default:573INVALIDOP;574break;575}576}577578void Arm64JitBackend::CompIR_VecClamp(IRInst inst) {579CONDITIONAL_DISABLE;580581switch (inst.op) {582case IROp::Vec4ClampToZero:583regs_.Map(inst);584fp_.MOVI(32, EncodeRegToQuad(SCRATCHF1), 0);585fp_.SMAX(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), EncodeRegToQuad(SCRATCHF1));586break;587588case IROp::Vec2ClampToZero:589regs_.Map(inst);590fp_.MOVI(32, EncodeRegToDouble(SCRATCHF1), 0);591fp_.SMAX(32, regs_.FD(inst.dest), regs_.FD(inst.src1), EncodeRegToDouble(SCRATCHF1));592break;593594default:595INVALIDOP;596break;597}598}599600void Arm64JitBackend::CompIR_VecHoriz(IRInst inst) {601CONDITIONAL_DISABLE;602603switch (inst.op) {604case IROp::Vec4Dot:605if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {606// To avoid overlap problems, map a little carefully.607regs_.SpillLockFPR(inst.src1, inst.src2);608regs_.MapVec4(inst.src1);609regs_.MapVec4(inst.src2);610regs_.MapVec4(inst.dest & ~3, MIPSMap::DIRTY);611fp_.FMUL(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), regs_.FQ(inst.src2));612fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), 
EncodeRegToQuad(SCRATCHF1));613fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));614fp_.INS(32, regs_.FQ(inst.dest & ~3), inst.dest & 3, EncodeRegToQuad(SCRATCHF1), 0);615} else {616regs_.Map(inst);617fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));618fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));619fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));620}621break;622623default:624INVALIDOP;625break;626}627}628629void Arm64JitBackend::CompIR_VecPack(IRInst inst) {630CONDITIONAL_DISABLE;631632switch (inst.op) {633case IROp::Vec4DuplicateUpperBitsAndShift1:634// This operation swizzles the high 8 bits and converts to a signed int.635// It's always after Vec4Unpack8To32.636// 000A000B000C000D -> AAAABBBBCCCCDDDD and then shift right one (to match INT_MAX.)637regs_.Map(inst);638// First, USHR+ORR to get 0A0A0B0B0C0C0D0D.639fp_.USHR(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), 16);640fp_.ORR(EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));641// Now again, but by 8.642fp_.USHR(32, regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1), 8);643fp_.ORR(regs_.FQ(inst.dest), regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1));644// Finally, shift away the sign. 
The goal is to saturate 0xFF -> 0x7FFFFFFF.645fp_.USHR(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), 1);646break;647648case IROp::Vec2Pack31To16:649// Same as Vec2Pack32To16, but we shift left 1 first to nuke the sign bit.650if (Overlap(inst.dest, 1, inst.src1, 2)) {651regs_.MapVec2(inst.src1, MIPSMap::DIRTY);652fp_.SHL(32, EncodeRegToDouble(SCRATCHF1), regs_.FD(inst.src1), 1);653fp_.UZP2(16, EncodeRegToDouble(SCRATCHF1), EncodeRegToDouble(SCRATCHF1), EncodeRegToDouble(SCRATCHF1));654fp_.INS(32, regs_.FD(inst.dest & ~1), inst.dest & 1, EncodeRegToDouble(SCRATCHF1), 0);655} else {656regs_.Map(inst);657fp_.SHL(32, regs_.FD(inst.dest), regs_.FD(inst.src1), 1);658fp_.UZP2(16, regs_.FD(inst.dest), regs_.FD(inst.dest), regs_.FD(inst.dest));659}660break;661662case IROp::Vec2Pack32To16:663// Viewed as 16 bit lanes: xAxB -> AB00... that's UZP2.664if (Overlap(inst.dest, 1, inst.src1, 2)) {665regs_.MapVec2(inst.src1, MIPSMap::DIRTY);666fp_.UZP2(16, EncodeRegToDouble(SCRATCHF1), regs_.FD(inst.src1), regs_.FD(inst.src1));667fp_.INS(32, regs_.FD(inst.dest & ~1), inst.dest & 1, EncodeRegToDouble(SCRATCHF1), 0);668} else {669regs_.Map(inst);670fp_.UZP2(16, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src1));671}672break;673674case IROp::Vec4Pack31To8:675if (Overlap(inst.dest, 1, inst.src1, 4)) {676regs_.MapVec4(inst.src1, MIPSMap::DIRTY);677} else {678regs_.Map(inst);679}680681// Viewed as 8-bit lanes, after a shift by 23: AxxxBxxxCxxxDxxx.682// So: UZP1 -> AxBxCxDx -> UZP1 again -> ABCD683fp_.USHR(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), 23);684fp_.UZP1(8, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));685// Second one directly to dest, if we can.686if (Overlap(inst.dest, 1, inst.src1, 4)) {687fp_.UZP1(8, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));688fp_.INS(32, regs_.FQ(inst.dest & ~3), inst.dest & 3, EncodeRegToQuad(SCRATCHF1), 0);689} else {690fp_.UZP1(8, regs_.FQ(inst.dest), 
EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));691}692break;693694case IROp::Vec4Pack32To8:695if (Overlap(inst.dest, 1, inst.src1, 4)) {696regs_.MapVec4(inst.src1, MIPSMap::DIRTY);697} else {698regs_.Map(inst);699}700701// Viewed as 8-bit lanes, after a shift by 24: AxxxBxxxCxxxDxxx.702// Same as Vec4Pack31To8, just a different shift.703fp_.USHR(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), 24);704fp_.UZP1(8, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));705// Second one directly to dest, if we can.706if (Overlap(inst.dest, 1, inst.src1, 4)) {707fp_.UZP1(8, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));708fp_.INS(32, regs_.FQ(inst.dest & ~3), inst.dest & 3, EncodeRegToQuad(SCRATCHF1), 0);709} else {710fp_.UZP1(8, regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));711}712break;713714case IROp::Vec2Unpack16To31:715// Viewed as 16-bit: ABxx -> 0A0B, then shift a zero into the sign place.716if (Overlap(inst.dest, 2, inst.src1, 1)) {717regs_.MapVec2(inst.dest, MIPSMap::DIRTY);718} else {719regs_.Map(inst);720}721if (inst.src1 == inst.dest + 1) {722fp_.USHLL2(16, regs_.FQ(inst.dest), regs_.FD(inst.src1), 15);723} else {724fp_.USHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.src1), 15);725}726break;727728case IROp::Vec2Unpack16To32:729// Just Vec2Unpack16To31, without the shift.730if (Overlap(inst.dest, 2, inst.src1, 1)) {731regs_.MapVec2(inst.dest, MIPSMap::DIRTY);732} else {733regs_.Map(inst);734}735if (inst.src1 == inst.dest + 1) {736fp_.SHLL2(16, regs_.FQ(inst.dest), regs_.FD(inst.src1));737} else {738fp_.SHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.src1));739}740break;741742case IROp::Vec4Unpack8To32:743// Viewed as 8-bit: ABCD -> 000A000B000C000D.744if (Overlap(inst.dest, 4, inst.src1, 1)) {745regs_.MapVec4(inst.dest, MIPSMap::DIRTY);746if (inst.dest == inst.src1 + 2) {747fp_.SHLL2(8, regs_.FQ(inst.dest), regs_.FD(inst.src1 & ~3));748} else if (inst.dest 
!= inst.src1) {749fp_.DUP(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), inst.src1 & 3);750fp_.SHLL(8, regs_.FQ(inst.dest), regs_.FD(inst.dest));751} else {752fp_.SHLL(8, regs_.FQ(inst.dest), regs_.FD(inst.src1));753}754fp_.SHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.dest));755} else {756regs_.Map(inst);757// Two steps: ABCD -> 0A0B0C0D, then to 000A000B000C000D.758fp_.SHLL(8, regs_.FQ(inst.dest), regs_.FD(inst.src1));759fp_.SHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.dest));760}761break;762763default:764INVALIDOP;765break;766}767}768769} // namespace MIPSComp770771#endif772773774