CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM/ArmCompVFPUNEONUtil.cpp
Views: 1401
// Copyright (c) 2013- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617// NEON VFPU18// This is where we will create an alternate implementation of the VFPU emulation19// that uses NEON Q registers to cache pairs/tris/quads, and so on.20// Will require major extensions to the reg cache and other things.2122// ARM NEON can only do pairs and quads, not tris and scalars.23// We can do scalars, though, for many operations if all the operands24// are below Q8 (D16, S32) using regular VFP instructions but really not sure25// if it's worth it.2627#include "ppsspp_config.h"28#if PPSSPP_ARCH(ARM)2930#include <cmath>3132#include "Common/Math/math_util.h"3334#include "Common/CPUDetect.h"35#include "Core/MemMap.h"36#include "Core/MIPS/MIPS.h"37#include "Core/MIPS/MIPSAnalyst.h"38#include "Core/MIPS/MIPSCodeUtils.h"39#include "Core/MIPS/MIPSVFPUUtils.h"40#include "Core/Config.h"41#include "Core/Reporting.h"4243#include "Core/MIPS/ARM/ArmJit.h"44#include "Core/MIPS/ARM/ArmRegCache.h"45#include "Core/MIPS/ARM/ArmCompVFPUNEONUtil.h"4647// TODO: Somehow #ifdef away on ARMv5eabi, without breaking the linker.4849#define _RS MIPS_GET_RS(op)50#define _RT MIPS_GET_RT(op)51#define _RD MIPS_GET_RD(op)52#define _FS MIPS_GET_FS(op)53#define _FT MIPS_GET_FT(op)54#define _FD MIPS_GET_FD(op)55#define _SA 
MIPS_GET_SA(op)56#define _POS ((op>> 6) & 0x1F)57#define _SIZE ((op>>11) & 0x1F)58#define _IMM16 (signed short)(op & 0xFFFF)59#define _IMM26 (op & 0x03FFFFFF)6061namespace MIPSComp {6263using namespace ArmGen;64using namespace ArmJitConstants;6566static const float minus_one = -1.0f;67static const float one = 1.0f;68static const float zero = 0.0f;6970// On NEON, we map triples to Q registers and singles to D registers.71// Sometimes, as when doing dot products, it matters what's in that unused reg. This zeroes it.72void ArmJit::NEONMaskToSize(ARMReg vs, VectorSize sz) {73// TODO74}7576ARMReg ArmJit::NEONMapPrefixST(int mipsReg, VectorSize sz, u32 prefix, int mapFlags) {77static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };78static const float constantArrayNegated[8] = { -0.f, -1.f, -2.f, -0.5f, -3.f, -1.f / 3.f, -0.25f, -1.f / 6.f };7980// Applying prefixes in SIMD fashion will actually be a lot easier than the old style.81if (prefix == 0xE4) {82return fpr.QMapReg(mipsReg, sz, mapFlags);83}8485int n = GetNumVectorElements(sz);8687int regnum[4] = { -1, -1, -1, -1 };88int abs[4] = { 0 };89int negate[4] = { 0 };90int constants[4] = { 0 };91int constNum[4] = { 0 };9293int full_mask = (1 << n) - 1;9495int abs_mask = (prefix >> 8) & full_mask;96int negate_mask = (prefix >> 16) & full_mask;97int constants_mask = (prefix >> 12) & full_mask;9899// Decode prefix to keep the rest readable100int permuteMask = 0;101for (int i = 0; i < n; i++) {102permuteMask |= 3 << (i * 2);103regnum[i] = (prefix >> (i * 2)) & 3;104abs[i] = (prefix >> (8 + i)) & 1;105negate[i] = (prefix >> (16 + i)) & 1;106constants[i] = (prefix >> (12 + i)) & 1;107108if (constants[i]) {109constNum[i] = regnum[i] + (abs[i] << 2);110abs[i] = 0;111}112}113abs_mask &= ~constants_mask;114115bool anyPermute = (prefix & permuteMask) != (0xE4 & permuteMask);116117if (constants_mask == full_mask) {118// It's all constants! 
Don't even bother mapping the input register,119// just allocate a temp one.120// If a single, this can sometimes be done cheaper. But meh.121ARMReg ar = fpr.QAllocTemp(sz);122for (int i = 0; i < n; i++) {123if ((i & 1) == 0) {124if (constNum[i] == constNum[i + 1]) {125// Replace two loads with a single immediate when easily possible.126ARMReg dest = i & 2 ? D_1(ar) : D_0(ar);127switch (constNum[i]) {128case 0:129case 1:130{131float c = constantArray[constNum[i]];132VMOV_immf(dest, negate[i] ? -c : c);133}134break;135// TODO: There are a few more that are doable.136default:137goto skip;138}139140i++;141continue;142skip:143;144}145}146MOVP2R(R0, (negate[i] ? constantArrayNegated : constantArray) + constNum[i]);147VLD1_lane(F_32, ar, R0, i, true);148}149return ar;150}151152// 1. Permute.153// 2. Abs154// If any constants:155// 3. Replace values with constants156// 4. Negate157158ARMReg inputAR = fpr.QMapReg(mipsReg, sz, mapFlags);159ARMReg ar = fpr.QAllocTemp(sz);160161if (!anyPermute) {162VMOV(ar, inputAR);163// No permutations!164} else {165bool allSame = false;166for (int i = 1; i < n; i++) {167if (regnum[0] == regnum[i])168allSame = true;169}170171if (allSame) {172// Easy, someone is duplicating one value onto all the reg parts.173// If this is happening and QMapReg must load, we can combine these two actions174// into a VLD1_lane. TODO175VDUP(F_32, ar, inputAR, regnum[0]);176} else {177// Do some special cases178if (regnum[0] == 1 && regnum[1] == 0) {179INFO_LOG(Log::HLE, "PREFIXST: Bottom swap!");180VREV64(I_32, ar, inputAR);181regnum[0] = 0;182regnum[1] = 1;183}184185// TODO: Make a generic fallback using another temp register186187bool match = true;188for (int i = 0; i < n; i++) {189if (regnum[i] != i)190match = false;191}192193// TODO: Cannot do this permutation yet!194if (!match) {195ERROR_LOG(Log::HLE, "PREFIXST: Unsupported permute! 
%i %i %i %i / %i", regnum[0], regnum[1], regnum[2], regnum[3], n);196VMOV(ar, inputAR);197}198}199}200201// ABS202// Two methods: If all lanes are "absoluted", it's easy.203if (abs_mask == full_mask) {204// TODO: elide the above VMOV (in !anyPermute) when possible205VABS(F_32, ar, ar);206} else if (abs_mask != 0) {207// Partial ABS!208if (abs_mask == 3) {209VABS(F_32, D_0(ar), D_0(ar));210} else {211// Horrifying fallback: Mov to Q0, abs, move back.212// TODO: Optimize for lower quads where we don't need to move.213VMOV(MatchSize(Q0, ar), ar);214for (int i = 0; i < n; i++) {215if (abs_mask & (1 << i)) {216VABS((ARMReg)(S0 + i), (ARMReg)(S0 + i));217}218}219VMOV(ar, MatchSize(Q0, ar));220INFO_LOG(Log::HLE, "PREFIXST: Partial ABS %i/%i! Slow fallback generated.", abs_mask, full_mask);221}222}223224if (negate_mask == full_mask) {225// TODO: elide the above VMOV when possible226VNEG(F_32, ar, ar);227} else if (negate_mask != 0) {228// Partial negate! I guess we build sign bits in another register229// and simply XOR.230if (negate_mask == 3) {231VNEG(F_32, D_0(ar), D_0(ar));232} else {233// Horrifying fallback: Mov to Q0, negate, move back.234// TODO: Optimize for lower quads where we don't need to move.235VMOV(MatchSize(Q0, ar), ar);236for (int i = 0; i < n; i++) {237if (negate_mask & (1 << i)) {238VNEG((ARMReg)(S0 + i), (ARMReg)(S0 + i));239}240}241VMOV(ar, MatchSize(Q0, ar));242INFO_LOG(Log::HLE, "PREFIXST: Partial Negate %i/%i! Slow fallback generated.", negate_mask, full_mask);243}244}245246// Insert constants where requested, and check negate!247for (int i = 0; i < n; i++) {248if (constants[i]) {249MOVP2R(R0, (negate[i] ? 
constantArrayNegated : constantArray) + constNum[i]);250VLD1_lane(F_32, ar, R0, i, true);251}252}253254return ar;255}256257ArmJit::DestARMReg ArmJit::NEONMapPrefixD(int vreg, VectorSize sz, int mapFlags) {258// Inverted from the actual bits, easier to reason about 1 == write259int writeMask = (~(js.prefixD >> 8)) & 0xF;260int n = GetNumVectorElements(sz);261int full_mask = (1 << n) - 1;262263DestARMReg dest;264dest.sz = sz;265if ((writeMask & full_mask) == full_mask) {266// No need to apply a write mask.267// Let's not make things complicated.268dest.rd = fpr.QMapReg(vreg, sz, mapFlags);269dest.backingRd = dest.rd;270} else {271// Allocate a temporary register.272ERROR_LOG(Log::JIT, "PREFIXD: Write mask allocated! %i/%i", writeMask, full_mask);273dest.rd = fpr.QAllocTemp(sz);274dest.backingRd = fpr.QMapReg(vreg, sz, mapFlags & ~MAP_NOINIT); // Force initialization of the backing reg.275}276return dest;277}278279void ArmJit::NEONApplyPrefixD(DestARMReg dest) {280// Apply clamps to dest.rd281int n = GetNumVectorElements(dest.sz);282283int sat1_mask = 0;284int sat3_mask = 0;285int full_mask = (1 << n) - 1;286for (int i = 0; i < n; i++) {287int sat = (js.prefixD >> (i * 2)) & 3;288if (sat == 1)289sat1_mask |= 1 << i;290if (sat == 3)291sat3_mask |= 1 << i;292}293294if (sat1_mask && sat3_mask) {295// Why would anyone do this?296ERROR_LOG(Log::JIT, "PREFIXD: Can't have both sat[0-1] and sat[-1-1] at the same time yet");297}298299if (sat1_mask) {300if (sat1_mask != full_mask) {301ERROR_LOG(Log::JIT, "PREFIXD: Can't have partial sat1 mask yet (%i vs %i)", sat1_mask, full_mask);302}303if (IsD(dest.rd)) {304VMOV_immf(D0, 0.0);305VMOV_immf(D1, 1.0);306VMAX(F_32, dest.rd, dest.rd, D0);307VMIN(F_32, dest.rd, dest.rd, D1);308} else {309VMOV_immf(Q0, 1.0);310VMIN(F_32, dest.rd, dest.rd, Q0);311VMOV_immf(Q0, 0.0);312VMAX(F_32, dest.rd, dest.rd, Q0);313}314}315316if (sat3_mask && sat1_mask != full_mask) {317if (sat3_mask != full_mask) {318ERROR_LOG(Log::JIT, "PREFIXD: Can't have 
partial sat3 mask yet (%i vs %i)", sat3_mask, full_mask);319}320if (IsD(dest.rd)) {321VMOV_immf(D0, 0.0);322VMOV_immf(D1, 1.0);323VMAX(F_32, dest.rd, dest.rd, D0);324VMIN(F_32, dest.rd, dest.rd, D1);325} else {326VMOV_immf(Q0, 1.0);327VMIN(F_32, dest.rd, dest.rd, Q0);328VMOV_immf(Q0, -1.0);329VMAX(F_32, dest.rd, dest.rd, Q0);330}331}332333// Check for actual mask operation (unrelated to the "masks" above).334if (dest.backingRd != dest.rd) {335// This means that we need to apply the write mask, from rd to backingRd.336// What a pain. We can at least shortcut easy cases like half the register.337// And we can generate the masks easily with some of the crazy vector imm modes. (bits2bytes for example).338// So no need to load them from RAM.339int writeMask = (~(js.prefixD >> 8)) & 0xF;340341if (writeMask == 3) {342INFO_LOG(Log::JIT, "Doing writemask = 3");343VMOV(D_0(dest.rd), D_0(dest.backingRd));344} else {345// TODO346ERROR_LOG(Log::JIT, "PREFIXD: Arbitrary write masks not supported (%i / %i)", writeMask, full_mask);347VMOV(dest.backingRd, dest.rd);348}349}350}351352ArmJit::MappedRegs ArmJit::NEONMapDirtyInIn(MIPSOpcode op, VectorSize dsize, VectorSize ssize, VectorSize tsize, bool applyPrefixes) {353MappedRegs regs;354if (applyPrefixes) {355regs.vs = NEONMapPrefixS(_VS, ssize, 0);356regs.vt = NEONMapPrefixT(_VT, tsize, 0);357} else {358regs.vs = fpr.QMapReg(_VS, ssize, 0);359regs.vt = fpr.QMapReg(_VT, ssize, 0);360}361362regs.overlap = GetVectorOverlap(_VD, dsize, _VS, ssize) > 0 || GetVectorOverlap(_VD, dsize, _VT, ssize);363if (applyPrefixes) {364regs.vd = NEONMapPrefixD(_VD, dsize, MAP_DIRTY | (regs.overlap ? 0 : MAP_NOINIT));365} else {366regs.vd.rd = fpr.QMapReg(_VD, dsize, MAP_DIRTY | (regs.overlap ? 
0 : MAP_NOINIT));367regs.vd.backingRd = regs.vd.rd;368regs.vd.sz = dsize;369}370return regs;371}372373ArmJit::MappedRegs ArmJit::NEONMapInIn(MIPSOpcode op, VectorSize ssize, VectorSize tsize, bool applyPrefixes) {374MappedRegs regs;375if (applyPrefixes) {376regs.vs = NEONMapPrefixS(_VS, ssize, 0);377regs.vt = NEONMapPrefixT(_VT, tsize, 0);378} else {379regs.vs = fpr.QMapReg(_VS, ssize, 0);380regs.vt = fpr.QMapReg(_VT, ssize, 0);381}382regs.vd.rd = INVALID_REG;383regs.vd.sz = V_Invalid;384return regs;385}386387ArmJit::MappedRegs ArmJit::NEONMapDirtyIn(MIPSOpcode op, VectorSize dsize, VectorSize ssize, bool applyPrefixes) {388MappedRegs regs;389regs.vs = NEONMapPrefixS(_VS, ssize, 0);390regs.overlap = GetVectorOverlap(_VD, dsize, _VS, ssize) > 0;391regs.vd = NEONMapPrefixD(_VD, dsize, MAP_DIRTY | (regs.overlap ? 0 : MAP_NOINIT));392return regs;393}394395// Requires quad registers.396void ArmJit::NEONTranspose4x4(ARMReg cols[4]) {397// 0123 _\ 0426398// 4567 / 1537399VTRN(F_32, cols[0], cols[1]);400401// 89ab _\ 8cae402// cdef / 9dbf403VTRN(F_32, cols[2], cols[3]);404405// 04[26] 048c406// 15 37 -> 1537407// [8c]ae 26ae408// 9d bf 9dbf409VSWP(D_1(cols[0]), D_0(cols[2]));410411// 04 8c 048c412// 15[37] -> 159d413// 26 ae 26ae414// [9d]bf 37bf415VSWP(D_1(cols[1]), D_0(cols[3]));416}417418} // namespace MIPSComp419420#endif // PPSSPP_ARCH(ARM)421422423