CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/ARM64/Arm64IRRegCache.cpp
Views: 1401
// Copyright (c) 2023- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
// In other words, PPSSPP_ARCH(ARM64) || DISASM_ALL.
#if PPSSPP_ARCH(ARM64) || (PPSSPP_PLATFORM(WINDOWS) && !defined(__LIBRETRO__))

#ifndef offsetof
#include <cstddef>
#endif

#include "Common/CPUDetect.h"
#include "Common/LogReporting.h"
#include "Core/MemMap.h"
#include "Core/MIPS/IR/IRInst.h"
#include "Core/MIPS/IR/IRAnalysis.h"
#include "Core/MIPS/ARM64/Arm64IRRegCache.h"
#include "Core/MIPS/JitCommon/JitState.h"

using namespace Arm64Gen;
using namespace Arm64IRJitConstants;

// Native register indices (IRNativeReg) are laid out as [0, NUM_X_REGS) for the
// X/W integer regs followed by [NUM_X_REGS, NUM_X_REGS + NUM_X_FREGS) for the
// FP/SIMD regs; see FromNativeReg()/GPRToNativeReg()/VFPToNativeReg() below.
Arm64IRRegCache::Arm64IRRegCache(MIPSComp::JitOptions *jo)
	: IRNativeRegCacheBase(jo) {
	// The S/D/Q regs overlap, so we just use one slot.  The numbers don't match ARM64Reg.
	config_.totalNativeRegs = NUM_X_REGS + NUM_X_FREGS;
	config_.mapFPUSIMD = true;
	// XMM regs are used for both FPU and Vec, so we don't need VREGs.
	config_.mapUseVRegs = false;
}

// Stores the emitters used to generate integer (emit_) and FP/SIMD (fp_) code.
void Arm64IRRegCache::Init(ARM64XEmitter *emitter, ARM64FloatEmitter *fp) {
	emit_ = emitter;
	fp_ = fp;
}

// Returns the preferred allocation order for the given register class.
// `base` is the offset the caller adds to entries to get native indices;
// `count` receives the table length.
const int *Arm64IRRegCache::GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const {
	if (type == MIPSLoc::REG) {
		// See register alloc remarks in Arm64Asm.cpp.
		base = W0;

		// W19-W23 are most suitable for static allocation. Those that are chosen for static allocation
		// should be omitted here and added in GetStaticAllocations.
		static const int allocationOrder[] = {
			W19, W20, W21, W22, W23, W24, W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15,
		};
		// Same order minus W19-W24, which GetStaticAllocations() owns in static-alloc mode.
		static const int allocationOrderStaticAlloc[] = {
			W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15,
		};

		if (jo_->useStaticAlloc) {
			count = ARRAY_SIZE(allocationOrderStaticAlloc);
			return allocationOrderStaticAlloc;
		}
		count = ARRAY_SIZE(allocationOrder);
		return allocationOrder;
	} else if (type == MIPSLoc::FREG) {
		base = S0 - NUM_X_REGS;

		// We don't really need four temps, probably.
		// We start with S8 for call flushes.
		static const int allocationOrder[] = {
			// Reserve four full 128-bit temp registers, should be plenty.
			S8, S9, S10, S11, // Partially callee-save (bottom 64 bits)
			S12, S13, S14, S15, // Partially callee-save (bottom 64 bits)
			S16, S17, S18, S19,
			S20, S21, S22, S23,
			S24, S25, S26, S27,
			S28, S29, S30, S31,
			S4, S5, S6, S7,
		};

		count = ARRAY_SIZE(allocationOrder);
		return allocationOrder;
	} else {
		_assert_msg_(false, "Allocation order not yet implemented");
		count = 0;
		return nullptr;
	}
}

// MIPS regs pinned permanently to callee-saved W regs when useStaticAlloc is
// enabled.  Note: V0/V1 and A0/A1 are adjacent MIPS regs, which lets the
// Emit{Load,Save}StaticRegisters() helpers below pair them with LDP/STP.
const Arm64IRRegCache::StaticAllocation *Arm64IRRegCache::GetStaticAllocations(int &count) const {
	static const StaticAllocation allocs[] = {
		{ MIPS_REG_SP, W19, MIPSLoc::REG, true },
		{ MIPS_REG_V0, W20, MIPSLoc::REG },
		{ MIPS_REG_V1, W21, MIPSLoc::REG },
		{ MIPS_REG_A0, W22, MIPSLoc::REG },
		{ MIPS_REG_A1, W23, MIPSLoc::REG },
		{ MIPS_REG_RA, W24, MIPSLoc::REG },
	};

	if (jo_->useStaticAlloc) {
		count = ARRAY_SIZE(allocs);
		return allocs;
	}
	return IRNativeRegCacheBase::GetStaticAllocations(count);
}

// Emits code to load all statically-allocated regs from the MIPS context.
// Adjacent MIPS regs are loaded pairwise with LDP; pointerified regs get
// Memory::base's high 32 bits merged in via MOVK.
void Arm64IRRegCache::EmitLoadStaticRegisters() {
	int count = 0;
	const StaticAllocation *allocs = GetStaticAllocations(count);
	for (int i = 0; i < count; ++i) {
		int offset = GetMipsRegOffset(allocs[i].mr);
		if (i + 1 < count && allocs[i].mr == allocs[i + 1].mr - 1) {
			// Consecutive context slots: one LDP covers both.
			_assert_(!allocs[i].pointerified && !allocs[i + 1].pointerified);
			emit_->LDP(INDEX_SIGNED, FromNativeReg(allocs[i].nr), FromNativeReg(allocs[i + 1].nr), CTXREG, offset);
			++i;
		} else {
			emit_->LDR(INDEX_UNSIGNED, FromNativeReg(allocs[i].nr), CTXREG, offset);
			if (allocs[i].pointerified && jo_->enablePointerify) {
				// Patch the high half so the 64-bit reg is value + Memory::base.
				ARM64Reg r64 = FromNativeReg64(allocs[i].nr);
				uint32_t membaseHigh = (uint32_t)((uint64_t)Memory::base >> 32);
				emit_->MOVK(r64, membaseHigh & 0xFFFF, SHIFT_32);
				if (membaseHigh & 0xFFFF0000)
					emit_->MOVK(r64, membaseHigh >> 16, SHIFT_48);
			}
		}
	}
}

// Emits code to write all statically-allocated regs back to the MIPS context.
// Only the low 32 bits are stored, so pointerified values need no fixup.
void Arm64IRRegCache::EmitSaveStaticRegisters() {
	int count = 0;
	const StaticAllocation *allocs = GetStaticAllocations(count);
	// This only needs to run once (by Asm) so checks don't need to be fast.
	for (int i = 0; i < count; ++i) {
		int offset = GetMipsRegOffset(allocs[i].mr);
		if (i + 1 < count && allocs[i].mr == allocs[i + 1].mr - 1) {
			emit_->STP(INDEX_SIGNED, FromNativeReg(allocs[i].nr), FromNativeReg(allocs[i + 1].nr), CTXREG, offset);
			++i;
		} else {
			emit_->STR(INDEX_UNSIGNED, FromNativeReg(allocs[i].nr), CTXREG, offset);
		}
	}
}

// Flushes every reg a C function call may clobber (per AAPCS64: W0-W18/W30,
// S0-S7, S16-S31, and the high halves of S8-S15), trying STP pairs first.
void Arm64IRRegCache::FlushBeforeCall() {
	// These registers are not preserved by function calls.
	auto isGPRSaved = [&](IRNativeReg nreg) {
		ARM64Reg ar = FromNativeReg(nreg);
		return ar >= W19 && ar <= W29;
	};
	auto isFPRSaved = [&](IRNativeReg nreg) {
		ARM64Reg ar = FromNativeReg(nreg);
		return ar >= S8 && ar <= S15;
	};

	// Go through by IR index first, to use STP where we can.
	for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
		if (mr[i].nReg == -1 || mr[i + 1].nReg == -1 || mr[i].isStatic || mr[i + 1].isStatic)
			continue;
		// Ignore multilane regs.
		if (mr[i].lane != -1 || mr[i + 1].lane != -1)
			continue;
		if (!nr[mr[i].nReg].isDirty || !nr[mr[i + 1].nReg].isDirty)
			continue;
		// Make sure not to try to pair a GPR and FPR.
		if (IsValidGPR(i) != IsValidGPR(i + 1))
			continue;

		int offset = GetMipsRegOffset(i);

		// Okay, it's a maybe. Are we flushing both as GPRs?
		// offset <= 252: STP of 32-bit regs takes a scaled signed 7-bit immediate,
		// so +252 is the largest reachable positive offset.
		if (!isGPRSaved(mr[i].nReg) && !isGPRSaved(mr[i + 1].nReg) && IsValidGPR(i) && offset <= 252) {
			// If either is mapped as a pointer, fix it.
			if (mr[i].loc == MIPSLoc::REG_AS_PTR)
				AdjustNativeRegAsPtr(mr[i].nReg, false);
			if (mr[i + 1].loc == MIPSLoc::REG_AS_PTR)
				AdjustNativeRegAsPtr(mr[i + 1].nReg, false);

			// That means we should use STP.
			emit_->STP(INDEX_SIGNED, FromNativeReg(mr[i].nReg), FromNativeReg(mr[i + 1].nReg), CTXREG, offset);

			DiscardNativeReg(mr[i].nReg);
			DiscardNativeReg(mr[i + 1].nReg);

			++i;
			continue;
		}

		// Perhaps as FPRs? Note: these must be single lane at this point.
		// TODO: Could use STP on quads etc. too, i.e. i & i + 4.
		if (!isFPRSaved(mr[i].nReg) && !isFPRSaved(mr[i + 1].nReg) && !IsValidGPR(i) && offset <= 252) {
			fp_->STP(32, INDEX_SIGNED, FromNativeReg(mr[i].nReg), FromNativeReg(mr[i + 1].nReg), CTXREG, offset);

			DiscardNativeReg(mr[i].nReg);
			DiscardNativeReg(mr[i + 1].nReg);

			++i;
			continue;
		}
	}

	// Alright, now go through any that didn't get flushed with STP.
	for (int i = 0; i < 19; ++i) {
		FlushNativeReg(GPRToNativeReg(ARM64Reg(W0 + i)));
	}
	FlushNativeReg(GPRToNativeReg(W30));

	for (int i = 0; i < 8; ++i) {
		FlushNativeReg(VFPToNativeReg(ARM64Reg(S0 + i)));
	}
	for (int i = 8; i < 16; ++i) {
		// These are preserved but only the low 64 bits.
		IRNativeReg nreg = VFPToNativeReg(ARM64Reg(S0 + i));
		if (nr[nreg].mipsReg != IRREG_INVALID && GetFPRLaneCount(nr[nreg].mipsReg - 32) > 2)
			FlushNativeReg(nreg);
	}
	for (int i = 16; i < 32; ++i) {
		FlushNativeReg(VFPToNativeReg(ARM64Reg(S0 + i)));
	}
}

// Tries to satisfy a read of immediate-valued GPR `r` without allocating:
// returns the mapped reg, WZR for zero, another reg already holding the same
// imm, or INVALID_REG if the caller must map/materialize it.
ARM64Reg Arm64IRRegCache::TryMapTempImm(IRReg r) {
	_dbg_assert_(IsValidGPR(r));

	// If already mapped, no need for a temporary.
	if (IsGPRMapped(r)) {
		return R(r);
	}

	if (mr[r].loc == MIPSLoc::IMM) {
		// Can we just use zero?
		if (mr[r].imm == 0)
			return WZR;

		// Try our luck - check for an exact match in another xreg.
		for (int i = 1; i < TOTAL_MAPPABLE_IRREGS; ++i) {
			if (mr[i].loc == MIPSLoc::REG_IMM && mr[i].imm == mr[r].imm) {
				// Awesome, let's just use this reg.
				return FromNativeReg(mr[i].nReg);
			}
		}
	}

	return INVALID_REG;
}

// Allocates a scratch W reg, temp-locked at the current IR index.
ARM64Reg Arm64IRRegCache::GetAndLockTempGPR() {
	IRNativeReg reg = AllocateReg(MIPSLoc::REG, MIPSMap::INIT);
	if (reg != -1) {
		nr[reg].tempLockIRIndex = irIndex_;
	}
	return FromNativeReg(reg);
}

// Allocates a scratch FP/SIMD reg, temp-locked at the current IR index.
ARM64Reg Arm64IRRegCache::GetAndLockTempFPR() {
	IRNativeReg reg = AllocateReg(MIPSLoc::FREG, MIPSMap::INIT);
	if (reg != -1) {
		nr[reg].tempLockIRIndex = irIndex_;
	}
	return FromNativeReg(reg);
}

// Maps the instruction's operands and returns an extra FPR temp.
ARM64Reg Arm64IRRegCache::MapWithFPRTemp(const IRInst &inst) {
	return FromNativeReg(MapWithTemp(inst, MIPSLoc::FREG));
}

// Maps a single MIPS GPR to a W reg.
ARM64Reg Arm64IRRegCache::MapGPR(IRReg mipsReg, MIPSMap mapFlags) {
	_dbg_assert_(IsValidGPR(mipsReg));

	// Okay, not mapped, so we need to allocate an arm64 register.
	IRNativeReg nreg = MapNativeReg(MIPSLoc::REG, mipsReg, 1, mapFlags);
	return FromNativeReg(nreg);
}

// Maps a MIPS GPR pair (mipsReg, mipsReg+1) into one 64-bit X reg.
ARM64Reg Arm64IRRegCache::MapGPR2(IRReg mipsReg, MIPSMap mapFlags) {
	_dbg_assert_(IsValidGPR(mipsReg) && IsValidGPR(mipsReg + 1));

	// Okay, not mapped, so we need to allocate an arm64 register.
	IRNativeReg nreg = MapNativeReg(MIPSLoc::REG, mipsReg, 2, mapFlags);
	return FromNativeReg64(nreg);
}

// Maps a GPR in pointer form (guest value + Memory::base), as a 64-bit reg.
ARM64Reg Arm64IRRegCache::MapGPRAsPointer(IRReg reg) {
	return FromNativeReg64(MapNativeRegAsPointer(reg));
}

// Maps a single MIPS FPR (mr index is mipsReg + 32) to an S reg.
ARM64Reg Arm64IRRegCache::MapFPR(IRReg mipsReg, MIPSMap mapFlags) {
	_dbg_assert_(IsValidFPR(mipsReg));
	_dbg_assert_(mr[mipsReg + 32].loc == MIPSLoc::MEM || mr[mipsReg + 32].loc == MIPSLoc::FREG);

	IRNativeReg nreg = MapNativeReg(MIPSLoc::FREG, mipsReg + 32, 1, mapFlags);
	if (nreg != -1)
		return FromNativeReg(nreg);
	return INVALID_REG;
}

// Maps an aligned FPR pair as one D reg.
ARM64Reg Arm64IRRegCache::MapVec2(IRReg first, MIPSMap mapFlags) {
	_dbg_assert_(IsValidFPR(first));
	_dbg_assert_((first & 1) == 0);
	_dbg_assert_(mr[first + 32].loc == MIPSLoc::MEM || mr[first + 32].loc == MIPSLoc::FREG);

	IRNativeReg nreg = MapNativeReg(MIPSLoc::FREG, first + 32, 2, mapFlags);
	if (nreg != -1)
		return EncodeRegToDouble(FromNativeReg(nreg));
	return INVALID_REG;
}

// Maps an aligned FPR quad as one Q reg.
ARM64Reg Arm64IRRegCache::MapVec4(IRReg first, MIPSMap mapFlags) {
	_dbg_assert_(IsValidFPR(first));
	_dbg_assert_((first & 3) == 0);
	_dbg_assert_(mr[first + 32].loc == MIPSLoc::MEM || mr[first + 32].loc == MIPSLoc::FREG);

	IRNativeReg nreg = MapNativeReg(MIPSLoc::FREG, first + 32, 4, mapFlags);
	if (nreg != -1)
		return EncodeRegToQuad(FromNativeReg(nreg));
	return INVALID_REG;
}

// Converts a cached GPR between plain-value form and pointer form
// (value + Memory::base).  state == true means "make it a pointer".
void Arm64IRRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
	_assert_(nreg >= 0 && nreg < (IRNativeReg)WZR);
	ARM64Reg r = FromNativeReg64(nreg);
	if (state) {
		if (!jo_->enablePointerify) {
#if defined(MASKED_PSP_MEMORY)
			// This destroys the value...
			_dbg_assert_(!nr[nreg].isDirty);
			emit_->ANDI2R(r, r, Memory::MEMVIEW32_MASK);
#endif
			emit_->ADD(r, r, MEMBASEREG);
		} else {
			// Pointerify mode: just overwrite the high 32 bits with membase's.
			uint32_t membaseHigh = (uint32_t)((uint64_t)Memory::base >> 32);
			emit_->MOVK(r, membaseHigh & 0xFFFF, SHIFT_32);
			if (membaseHigh & 0xFFFF0000)
				emit_->MOVK(r, membaseHigh >> 16, SHIFT_48);
		}
	} else {
		if (!jo_->enablePointerify) {
#if defined(MASKED_PSP_MEMORY)
			_dbg_assert_(!nr[nreg].isDirty);
#endif
			emit_->SUB(r, r, MEMBASEREG);
		} else {
			// Nothing to do, just ignore the high 32 bits.
		}
	}
}

// All native regs are interchangeable within their class on ARM64.
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
	// No special flags, skip the check for a little speed.
	return true;
}

// Emits the load of `lanes` consecutive MIPS regs (starting at `first`) from
// the context into native reg `nreg`.  GPRs support 1 lane (or 2 for LO/HI
// as one 64-bit load); FPRs support 1/2/4 lanes via S/D/Q loads.
void Arm64IRRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
	ARM64Reg r = FromNativeReg(nreg);
	_dbg_assert_(first != MIPS_REG_ZERO);
	if (nreg < NUM_X_REGS) {
		_assert_(lanes == 1 || (lanes == 2 && first == IRREG_LO));
		if (lanes == 1)
			emit_->LDR(INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else if (lanes == 2)
			emit_->LDR(INDEX_UNSIGNED, EncodeRegTo64(r), CTXREG, GetMipsRegOffset(first));
		else
			_assert_(false);
	} else {
		_dbg_assert_(nreg < NUM_X_REGS + NUM_X_FREGS);
		_assert_msg_(mr[first].loc == MIPSLoc::FREG, "Cannot load this type: %d", (int)mr[first].loc);
		if (lanes == 1)
			fp_->LDR(32, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else if (lanes == 2)
			fp_->LDR(64, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else if (lanes == 4)
			fp_->LDR(128, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else
			_assert_(false);
	}
}

// Mirror of LoadNativeReg: stores `lanes` lanes of `nreg` back to the context.
void Arm64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
	ARM64Reg r = FromNativeReg(nreg);
	_dbg_assert_(first != MIPS_REG_ZERO);
	if (nreg < NUM_X_REGS) {
		_assert_(lanes == 1 || (lanes == 2 && first == IRREG_LO));
		_assert_(mr[first].loc == MIPSLoc::REG || mr[first].loc == MIPSLoc::REG_IMM);
		if (lanes == 1)
			emit_->STR(INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else if (lanes == 2)
			emit_->STR(INDEX_UNSIGNED, EncodeRegTo64(r), CTXREG, GetMipsRegOffset(first));
		else
			_assert_(false);
	} else {
		_dbg_assert_(nreg < NUM_X_REGS + NUM_X_FREGS);
		_assert_msg_(mr[first].loc == MIPSLoc::FREG, "Cannot store this type: %d", (int)mr[first].loc);
		if (lanes == 1)
			fp_->STR(32, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else if (lanes == 2)
			fp_->STR(64, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else if (lanes == 4)
			fp_->STR(128, INDEX_UNSIGNED, r, CTXREG, GetMipsRegOffset(first));
		else
			_assert_(false);
	}
}

// Emits code to materialize a 32-bit immediate into a native GPR.
void Arm64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
	ARM64Reg r = FromNativeReg(nreg);
	_dbg_assert_(nreg >= 0 && nreg < (IRNativeReg)WZR);
	// On ARM64, MOVZ/MOVK is really fast.
	emit_->MOVI2R(r, imm);
}

// Stores `imm` directly to mreg's context slot, reusing WZR or a reg already
// holding the same immediate when possible to avoid a materialize.
void Arm64IRRegCache::StoreRegValue(IRReg mreg, uint32_t imm) {
	_assert_(IsValidGPRNoZero(mreg));
	// Try to optimize using a different reg.
	ARM64Reg storeReg = INVALID_REG;
	if (imm == 0)
		storeReg = WZR;

	// Could we get lucky? Check for an exact match in another xreg.
	for (int i = 1; i < TOTAL_MAPPABLE_IRREGS; ++i) {
		if (mr[i].loc == MIPSLoc::REG_IMM && mr[i].imm == imm) {
			// Awesome, let's just store this reg.
			// NOTE(review): raw cast of an IRNativeReg to ARM64Reg — elsewhere this
			// file goes through FromNativeReg().  Equivalent only while REG_IMM is
			// GPR-only (native index < NUM_X_REGS); confirm against upstream.
			storeReg = (ARM64Reg)mr[i].nReg;
			break;
		}
	}

	if (storeReg == INVALID_REG) {
		emit_->MOVI2R(SCRATCH1, imm);
		storeReg = SCRATCH1;
	}
	emit_->STR(INDEX_UNSIGNED, storeReg, CTXREG, GetMipsRegOffset(mreg));
}

// Re-shapes an already-mapped FREG to a different lane count (e.g. vec4 ->
// scalar or scalar -> vec4) without a round trip through memory when possible.
// Falls back to the base implementation otherwise.
bool Arm64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
	bool allowed = !mr[nr[nreg].mipsReg].isStatic;
	// There's currently no support for non-FREGs here.
	allowed = allowed && type == MIPSLoc::FREG;

	if (dest == -1)
		dest = nreg;

	if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
		// Alright, changing lane count (possibly including lane position.)
		IRReg oldfirst = nr[nreg].mipsReg;
		int oldlanes = 0;
		while (mr[oldfirst + oldlanes].nReg == nreg)
			oldlanes++;
		_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
		_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");

		if (lanes == 1 && TransferVecTo1(nreg, dest, first, oldlanes))
			return true;
		if (oldlanes == 1 && Transfer1ToVec(nreg, dest, first, lanes))
			return true;
	}

	return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
}

// Collapses a multi-lane FREG mapping down to a single lane (`first`),
// splitting still-useful sibling lanes into free regs and flushing the rest.
bool Arm64IRRegCache::TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes) {
	IRReg oldfirst = nr[nreg].mipsReg;

	// Is it worth preserving any of the old regs?
	int numKept = 0;
	for (int i = 0; i < oldlanes; ++i) {
		// Skip whichever one this is extracting.
		if (oldfirst + i == first)
			continue;
		// If 0 isn't being transfered, easy to keep in its original reg.
		if (i == 0 && dest != nreg) {
			numKept++;
			continue;
		}

		IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT);
		if (freeReg != -1 && IsRegRead(MIPSLoc::FREG, oldfirst + i)) {
			// If there's one free, use it. Don't modify nreg, though.
			fp_->DUP(32, FromNativeReg(freeReg), FromNativeReg(nreg), i);

			// Update accounting.
			nr[freeReg].isDirty = nr[nreg].isDirty;
			nr[freeReg].mipsReg = oldfirst + i;
			mr[oldfirst + i].lane = -1;
			mr[oldfirst + i].nReg = freeReg;
			numKept++;
		}
	}

	// Unless all other lanes were kept, store.
	if (nr[nreg].isDirty && numKept < oldlanes - 1) {
		StoreNativeReg(nreg, oldfirst, oldlanes);
		// Set false even for regs that were split out, since they were flushed too.
		for (int i = 0; i < oldlanes; ++i) {
			if (mr[oldfirst + i].nReg != -1)
				nr[mr[oldfirst + i].nReg].isDirty = false;
		}
	}

	// Next, move the desired element into first place.
	if (mr[first].lane > 0) {
		fp_->DUP(32, FromNativeReg(dest), FromNativeReg(nreg), mr[first].lane);
	} else if (mr[first].lane <= 0 && dest != nreg) {
		fp_->DUP(32, FromNativeReg(dest), FromNativeReg(nreg), 0);
	}

	// Now update accounting.
	for (int i = 0; i < oldlanes; ++i) {
		auto &mreg = mr[oldfirst + i];
		if (oldfirst + i == first) {
			mreg.lane = -1;
			mreg.nReg = dest;
		} else if (mreg.nReg == nreg && i == 0 && nreg != dest) {
			// Still in the same register, but no longer a vec.
			mreg.lane = -1;
		} else if (mreg.nReg == nreg) {
			// No longer in a register.
			mreg.nReg = -1;
			mreg.lane = -1;
			mreg.loc = MIPSLoc::MEM;
		}
	}

	if (dest != nreg) {
		nr[dest].isDirty = nr[nreg].isDirty;
		if (oldfirst == first) {
			nr[nreg].mipsReg = -1;
			nr[nreg].isDirty = false;
		}
	}
	nr[dest].mipsReg = first;

	return true;
}

// Widens a single-lane FREG mapping into a vec2/vec4 in `dest`, gathering the
// sibling lanes from regs and/or memory.  `blendMask` has a bit set per lane
// that must come from memory; the vec4 path enumerates each mask pattern with
// the cheapest ZIP1/UZP/INS/LDR sequence.  Returns false when it can't help
// (double-mapped lanes, nothing in a reg, or no usable temp).
bool Arm64IRRegCache::Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes) {
	ARM64Reg destReg = FromNativeReg(dest);
	ARM64Reg cur[4]{};
	int numInRegs = 0;
	u8 blendMask = 0;
	for (int i = 0; i < lanes; ++i) {
		if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
			// Can't do it, either double mapped or overlapping vec.
			return false;
		}

		if (mr[first + i].nReg == -1) {
			cur[i] = INVALID_REG;
			blendMask |= 1 << i;
		} else {
			cur[i] = FromNativeReg(mr[first + i].nReg);
			numInRegs++;
		}
	}

	// Shouldn't happen, this should only get called to transfer one in a reg.
	if (numInRegs == 0)
		return false;

	// If everything's currently in a reg, move it into this reg.
	if (lanes == 4) {
		// Go with an exhaustive approach, only 15 possibilities...
		if (blendMask == 0) {
			// y = yw##, x = xz##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3]));
			fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1]));
		} else if (blendMask == 0b0001) {
			// y = yw##, w = x###, w = xz##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3]));
			fp_->LDR(32, INDEX_UNSIGNED, cur[3], CTXREG, GetMipsRegOffset(first + 0));
			fp_->ZIP1(32, EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[2]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[1]));
		} else if (blendMask == 0b0010) {
			// x = xz##, z = y###, z = yw##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2]));
			fp_->LDR(32, INDEX_UNSIGNED, cur[2], CTXREG, GetMipsRegOffset(first + 1));
			fp_->ZIP1(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[3]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2]));
		} else if (blendMask == 0b0011 && (first & 1) == 0) {
			// z = zw##, w = xy##, dest = xyzw. Mixed lane sizes.
			fp_->ZIP1(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[3]));
			fp_->LDR(64, INDEX_UNSIGNED, EncodeRegToDouble(cur[3]), CTXREG, GetMipsRegOffset(first + 0));
			fp_->ZIP1(64, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[2]));
		} else if (blendMask == 0b0100) {
			// y = yw##, w = z###, x = xz##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3]));
			fp_->LDR(32, INDEX_UNSIGNED, cur[3], CTXREG, GetMipsRegOffset(first + 2));
			fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[3]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1]));
		} else if (blendMask == 0b0101 && (first & 3) == 0) {
			// y = yw##, w=x#z#, w = xz##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3]));
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(cur[3]), CTXREG, GetMipsRegOffset(first));
			fp_->UZP1(32, EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[1]));
		} else if (blendMask == 0b0110 && (first & 3) == 0) {
			if (destReg == cur[0]) {
				// w = wx##, dest = #yz#, dest = xyz#, dest = xyzw.
				fp_->ZIP1(32, EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[0]));
				fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
				fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(cur[3]), 1);
				fp_->INS(32, EncodeRegToQuad(destReg), 3, EncodeRegToQuad(cur[3]), 0);
			} else {
				// Assumes destReg may equal cur[3].
				// x = xw##, dest = #yz#, dest = xyz#, dest = xyzw.
				fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[3]));
				fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
				fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(cur[0]), 0);
				fp_->INS(32, EncodeRegToQuad(destReg), 3, EncodeRegToQuad(cur[0]), 1);
			}
		} else if (blendMask == 0b0111 && (first & 3) == 0 && destReg != cur[3]) {
			// dest = xyz#, dest = xyzw.
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
			fp_->INS(32, EncodeRegToQuad(destReg), 3, EncodeRegToQuad(cur[3]), 0);
		} else if (blendMask == 0b1000) {
			// x = xz##, z = w###, y = yw##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2]));
			fp_->LDR(32, INDEX_UNSIGNED, cur[2], CTXREG, GetMipsRegOffset(first + 3));
			fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[2]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1]));
		} else if (blendMask == 0b1001 && (first & 3) == 0) {
			if (destReg == cur[1]) {
				// w = zy##, dest = x##w, dest = xy#w, dest = xyzw.
				fp_->ZIP1(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[1]));
				fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
				fp_->INS(32, EncodeRegToQuad(destReg), 1, EncodeRegToQuad(cur[2]), 1);
				fp_->INS(32, EncodeRegToQuad(destReg), 2, EncodeRegToQuad(cur[2]), 0);
			} else {
				// Assumes destReg may equal cur[2].
				// y = yz##, dest = x##w, dest = xy#w, dest = xyzw.
				fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[2]));
				fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
				fp_->INS(32, EncodeRegToQuad(destReg), 1, EncodeRegToQuad(cur[1]), 0);
				fp_->INS(32, EncodeRegToQuad(destReg), 2, EncodeRegToQuad(cur[1]), 1);
			}
		} else if (blendMask == 0b1010 && (first & 3) == 0) {
			// x = xz##, z = #y#w, z=yw##, dest = xyzw.
			fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2]));
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(cur[2]), CTXREG, GetMipsRegOffset(first));
			fp_->UZP2(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]));
			fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2]));
		} else if (blendMask == 0b1011 && (first & 3) == 0 && destReg != cur[2]) {
			// dest = xy#w, dest = xyzw.
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
			fp_->INS(32, EncodeRegToQuad(destReg), 2, EncodeRegToQuad(cur[2]), 0);
		} else if (blendMask == 0b1100 && (first & 1) == 0) {
			// x = xy##, y = zw##, dest = xyzw. Mixed lane sizes.
			fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1]));
			fp_->LDR(64, INDEX_UNSIGNED, EncodeRegToDouble(cur[1]), CTXREG, GetMipsRegOffset(first + 2));
			fp_->ZIP1(64, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1]));
		} else if (blendMask == 0b1101 && (first & 3) == 0 && destReg != cur[1]) {
			// dest = x#zw, dest = xyzw.
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
			fp_->INS(32, EncodeRegToQuad(destReg), 1, EncodeRegToQuad(cur[1]), 0);
		} else if (blendMask == 0b1110 && (first & 3) == 0 && destReg != cur[0]) {
			// dest = #yzw, dest = xyzw.
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
			fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(cur[0]), 0);
		} else if (blendMask == 0b1110 && (first & 3) == 0) {
			// If dest == cur[0] (which may be common), we need a temp...
			IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT);
			// Very unfortunate.
			if (freeReg == INVALID_REG)
				return false;

			// free = x###, dest = #yzw, dest = xyzw.
			fp_->DUP(32, EncodeRegToQuad(FromNativeReg(freeReg)), EncodeRegToQuad(cur[0]), 0);
			fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first));
			fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(FromNativeReg(freeReg)), 0);
		} else {
			return false;
		}
	} else if (lanes == 2) {
		if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
			fp_->ZIP1(32, EncodeRegToDouble(destReg), EncodeRegToDouble(cur[0]), EncodeRegToDouble(cur[1]));
		} else if (cur[0] == INVALID_REG && dest != nreg) {
			fp_->LDR(32, INDEX_UNSIGNED, destReg, CTXREG, GetMipsRegOffset(first + 0));
			fp_->INS(32, EncodeRegToDouble(destReg), 1, EncodeRegToDouble(cur[1]), 0);
		} else {
			IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT);
			if (freeReg == INVALID_REG)
				return false;

			if (cur[0] == INVALID_REG) {
				fp_->LDR(32, INDEX_UNSIGNED, FromNativeReg(freeReg), CTXREG, GetMipsRegOffset(first + 0));
				fp_->ZIP1(32, EncodeRegToDouble(destReg), EncodeRegToDouble(FromNativeReg(freeReg)), EncodeRegToDouble(cur[1]));
			} else {
				fp_->LDR(32, INDEX_UNSIGNED, FromNativeReg(freeReg), CTXREG, GetMipsRegOffset(first + 1));
				fp_->ZIP1(32, EncodeRegToDouble(destReg), EncodeRegToDouble(cur[0]), EncodeRegToDouble(FromNativeReg(freeReg)));
			}
		}
	} else {
		return false;
	}

	mr[first].lane = 0;
	for (int i = 0; i < lanes; ++i) {
		if (mr[first + i].nReg != -1) {
			// If this was dirty, the combined reg is now dirty.
			if (nr[mr[first + i].nReg].isDirty)
				nr[dest].isDirty = true;

			// Throw away the other register we're no longer using.
			if (i != 0)
				DiscardNativeReg(mr[first + i].nReg);
		}

		// And set it as using the new one.
		mr[first + i].lane = i;
		mr[first + i].loc = MIPSLoc::FREG;
		mr[first + i].nReg = dest;
	}

	if (dest != nreg) {
		nr[dest].mipsReg = first;
		nr[nreg].mipsReg = -1;
		nr[nreg].isDirty = false;
	}

	return true;
}

// Flushes all dirty cached regs to the context, using paired stores (STP /
// a single 64-bit store of two imms) where adjacent IR regs allow it, then
// delegates the remainder to the base-class FlushAll.
void Arm64IRRegCache::FlushAll(bool gprs, bool fprs) {
	// Note: make sure not to change the registers when flushing:
	// Branching code may expect the armreg to retain its value.

	// NOTE(review): this predicate only passes regs with loc == MEM that still
	// have a dirty nReg, yet elsewhere in this file MEM is always set together
	// with nReg = -1 (see TransferVecTo1), and the IMM pairing branch below
	// tests loc == IMM, which this predicate would reject.  Looks like a
	// possible `!=`/`==` transcription slip — confirm against upstream.
	auto needsFlush = [&](IRReg i) {
		if (mr[i].loc != MIPSLoc::MEM || mr[i].isStatic)
			return false;
		if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty)
			return false;
		return true;
	};

	// Try to flush in pairs when possible.
	for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
		if (!needsFlush(i) || !needsFlush(i + 1))
			continue;
		// Ignore multilane regs. Could handle with more smartness...
		if (mr[i].lane != -1 || mr[i + 1].lane != -1)
			continue;

		int offset = GetMipsRegOffset(i);

		// If both are imms, let's materialize a single reg and store.
		if (mr[i].loc == MIPSLoc::IMM && mr[i + 1].loc == MIPSLoc::IMM) {
			// Only when 8-byte aligned, so one 64-bit STR covers both slots.
			if ((i & 1) == 0) {
				uint64_t fullImm = ((uint64_t) mr[i + 1].imm << 32) | mr[i].imm;
				emit_->MOVI2R(SCRATCH1_64, fullImm);
				emit_->STR(INDEX_UNSIGNED, SCRATCH1_64, CTXREG, offset);
				DiscardReg(i);
				DiscardReg(i + 1);
				++i;
			}
			continue;
		}

		// Okay, two dirty regs in a row, in need of flushing. Both GPRs?
		// offset <= 252: max positive scaled offset for a 32-bit STP.
		if (IsValidGPR(i) && IsValidGPR(i + 1) && offset <= 252) {
			auto setupForFlush = [&](ARM64Reg &ar, IRReg r) {
				if (mr[r].loc == MIPSLoc::IMM) {
					ar = TryMapTempImm(r);
					if (ar == INVALID_REG) {
						// Both cannot be imms, so this is safe.
						ar = SCRATCH1;
						emit_->MOVI2R(ar, mr[r].imm);
					}
				} else if (mr[r].loc == MIPSLoc::REG_AS_PTR) {
					// NOTE(review): `r` is an IRReg but AdjustNativeRegAsPtr takes an
					// IRNativeReg — FlushBeforeCall passes mr[i].nReg here.  Both are
					// ints so this compiles silently; confirm whether this should be
					// AdjustNativeRegAsPtr(mr[r].nReg, false).
					AdjustNativeRegAsPtr(r, false);
					ar = FromNativeReg(mr[r].nReg);
				} else {
					_dbg_assert_(mr[r].loc == MIPSLoc::REG || mr[r].loc == MIPSLoc::REG_IMM);
					ar = FromNativeReg(mr[r].nReg);
				}
			};

			ARM64Reg armRegs[2]{ INVALID_REG, INVALID_REG };
			setupForFlush(armRegs[0], i);
			setupForFlush(armRegs[1], i + 1);

			emit_->STP(INDEX_SIGNED, armRegs[0], armRegs[1], CTXREG, offset);
			DiscardReg(i);
			DiscardReg(i + 1);
			++i;
			continue;
		}

		// Perhaps as FPRs? Note: these must be single lane at this point.
		// TODO: Could use STP on quads etc. too, i.e. i & i + 4.
		if (i >= 32 && IsValidFPR(i - 32) && IsValidFPR(i + 1 - 32) && offset <= 252) {
			_dbg_assert_(mr[i].loc == MIPSLoc::FREG && mr[i + 1].loc == MIPSLoc::FREG);
			fp_->STP(32, INDEX_SIGNED, FromNativeReg(mr[i].nReg), FromNativeReg(mr[i + 1].nReg), CTXREG, offset);

			DiscardNativeReg(mr[i].nReg);
			DiscardNativeReg(mr[i + 1].nReg);

			++i;
			continue;
		}
	}

	// Flush all the rest that weren't done via STP.
	IRNativeRegCacheBase::FlushAll(gprs, fprs);
}

// Returns the W reg currently holding mapped GPR `mipsReg`.
ARM64Reg Arm64IRRegCache::R(IRReg mipsReg) {
	_dbg_assert_(IsValidGPR(mipsReg));
	_dbg_assert_(mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM);
	if (mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM) {
		return FromNativeReg(mr[mipsReg].nReg);
	} else {
		ERROR_LOG_REPORT(Log::JIT, "Reg %i not in arm64 reg", mipsReg);
		return INVALID_REG; // BAAAD
	}
}

// 64-bit (X) view of the same mapping as R().
ARM64Reg Arm64IRRegCache::R64(IRReg mipsReg) {
	return EncodeRegTo64(R(mipsReg));
}

// Returns the 64-bit reg holding `mipsReg` in pointer form; the reg must be
// mapped as a pointer or pointerified.
ARM64Reg Arm64IRRegCache::RPtr(IRReg mipsReg) {
	_dbg_assert_(IsValidGPR(mipsReg));
	_dbg_assert_(mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM || mr[mipsReg].loc == MIPSLoc::REG_AS_PTR);
	if (mr[mipsReg].loc == MIPSLoc::REG_AS_PTR) {
		return FromNativeReg64(mr[mipsReg].nReg);
	} else if (mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM) {
		int r = mr[mipsReg].nReg;
		_dbg_assert_(nr[r].pointerified);
		if (nr[r].pointerified) {
			return FromNativeReg64(mr[mipsReg].nReg);
		} else {
			ERROR_LOG(Log::JIT, "Tried to use a non-pointer register as a pointer");
			return INVALID_REG;
		}
	} else {
		ERROR_LOG_REPORT(Log::JIT, "Reg %i not in arm64 reg", mipsReg);
		return INVALID_REG; // BAAAD
	}
}

// Returns the S reg currently holding mapped FPR `mipsReg` (mr index +32).
ARM64Reg Arm64IRRegCache::F(IRReg mipsReg) {
	_dbg_assert_(IsValidFPR(mipsReg));
	_dbg_assert_(mr[mipsReg + 32].loc == MIPSLoc::FREG);
	if (mr[mipsReg + 32].loc == MIPSLoc::FREG) {
		return FromNativeReg(mr[mipsReg + 32].nReg);
	} else {
		ERROR_LOG_REPORT(Log::JIT, "Reg %i not in arm64 reg", mipsReg);
		return INVALID_REG; // BAAAD
	}
}

// D-reg (64-bit) view of F().
ARM64Reg Arm64IRRegCache::FD(IRReg mipsReg) {
	return EncodeRegToDouble(F(mipsReg));
}

// Q-reg (128-bit) view of F().
ARM64Reg Arm64IRRegCache::FQ(IRReg mipsReg) {
	return EncodeRegToQuad(F(mipsReg));
}

// ARM64Reg (W/X encoding) -> native index in [0, NUM_X_REGS).
IRNativeReg Arm64IRRegCache::GPRToNativeReg(ARM64Reg r) {
	_dbg_assert_msg_(r >= 0 && r < 0x40, "Not a GPR?");
	return (IRNativeReg)DecodeReg(r);
}

// ARM64Reg (S/D/Q encoding) -> native index in [NUM_X_REGS, NUM_X_REGS + NUM_X_FREGS).
IRNativeReg Arm64IRRegCache::VFPToNativeReg(ARM64Reg r) {
	_dbg_assert_msg_(r >= 0x40 && r < 0xE0, "Not VFP?");
	return (IRNativeReg)(NUM_X_REGS + (int)DecodeReg(r));
}

// Native index -> ARM64Reg: W form for GPR slots, S form for FP slots.
ARM64Reg Arm64IRRegCache::FromNativeReg(IRNativeReg r) {
	if (r >= NUM_X_REGS)
		return EncodeRegToSingle((Arm64Gen::ARM64Reg)r);
	return (Arm64Gen::ARM64Reg)r;
}

// Native GPR index -> 64-bit X form.
ARM64Reg Arm64IRRegCache::FromNativeReg64(IRNativeReg r) {
	_dbg_assert_msg_(r >= 0 && r < NUM_X_REGS, "Not a GPR?");
	return EncodeRegTo64((Arm64Gen::ARM64Reg)r);
}

#endif