Path: blob/master/Core/MIPS/x86/X64IRRegCache.cpp
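x86/x64 implementation of the IR native register cache for PPSSPP's MIPS JIT: it maps IR registers onto x64 GPRs and XMM registers, handling allocation order, flushing around calls, and lane transfers between scalar floats and Vec4s.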
// Copyright (c) 2023- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#ifndef offsetof
#include <cstddef>
#endif

#include "Common/CPUDetect.h"
#include "Core/MemMap.h"
#include "Core/MIPS/IR/IRInst.h"
#include "Core/MIPS/IR/IRAnalysis.h"
#include "Core/MIPS/x86/X64IRRegCache.h"
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/Reporting.h"

using namespace Gen;
using namespace X64IRJitConstants;

X64IRRegCache::X64IRRegCache(MIPSComp::JitOptions *jo)
	: IRNativeRegCacheBase(jo) {
	config_.totalNativeRegs = NUM_X_REGS + NUM_X_FREGS;
	config_.mapFPUSIMD = true;
	// XMM regs are used for both FPU and Vec, so we don't need VRegs.
	config_.mapUseVRegs = false;
}

void X64IRRegCache::Init(XEmitter *emitter) {
	emit_ = emitter;
}

const int *X64IRRegCache::GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const {
	if (type == MIPSLoc::REG) {
		base = RAX;

		static const int allocationOrder[] = {
#if PPSSPP_ARCH(AMD64)
#ifdef _WIN32
			RSI, RDI, R8, R9, R10, R11, R12, R13, RDX, RCX,
#else
			RBP, R8, R9, R10, R11, R12, R13, RDX, RCX,
#endif
			// Intentionally last.
			R15,
#elif PPSSPP_ARCH(X86)
			ESI, EDI, EDX, EBX, ECX,
#endif
		};

		if ((flags & X64Map::MASK) == X64Map::SHIFT) {
			// It's a single option for shifts.
			static const int shiftReg[] = { ECX };
			count = 1;
			return shiftReg;
		}
		if ((flags & X64Map::MASK) == X64Map::HIGH_DATA) {
			// It's a single option for high data (EDX).
			static const int highDataReg[] = { EDX };
			count = 1;
			return highDataReg;
		}
#if PPSSPP_ARCH(X86)
		if ((flags & X64Map::MASK) == X64Map::LOW_SUBREG) {
			static const int lowSubRegAllocationOrder[] = {
				EDX, EBX, ECX,
			};
			count = ARRAY_SIZE(lowSubRegAllocationOrder);
			return lowSubRegAllocationOrder;
		}
#else
		if (jo_->reserveR15ForAsm) {
			count = ARRAY_SIZE(allocationOrder) - 1;
			return allocationOrder;
		}
#endif
		count = ARRAY_SIZE(allocationOrder);
		return allocationOrder;
	} else if (type == MIPSLoc::FREG) {
		base = -NUM_X_REGS;

		// TODO: Might have to change this if we can't live without dedicated temps.
		static const int allocationOrder[] = {
#if PPSSPP_ARCH(AMD64)
			XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM1, XMM2, XMM3, XMM4, XMM5, XMM0,
#elif PPSSPP_ARCH(X86)
			XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM0,
#endif
		};

		if ((flags & X64Map::MASK) == X64Map::XMM0) {
			// Certain cases require this reg.
			static const int blendReg[] = { XMM0 };
			count = 1;
			return blendReg;
		}

		count = ARRAY_SIZE(allocationOrder);
		return allocationOrder;
	} else {
		_assert_msg_(false, "Allocation order not yet implemented");
		count = 0;
		return nullptr;
	}
}

void X64IRRegCache::FlushBeforeCall() {
	// These registers are not preserved by function calls.
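	// On Win64 only XMM0-XMM5 are volatile, which is why just six XMM regs are
	// flushed below; the other ABIs treat every XMM register as caller-saved.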
#if PPSSPP_ARCH(AMD64)
#ifdef _WIN32
	FlushNativeReg(GPRToNativeReg(RCX));
	FlushNativeReg(GPRToNativeReg(RDX));
	FlushNativeReg(GPRToNativeReg(R8));
	FlushNativeReg(GPRToNativeReg(R9));
	FlushNativeReg(GPRToNativeReg(R10));
	FlushNativeReg(GPRToNativeReg(R11));
	for (int i = 0; i < 6; ++i)
		FlushNativeReg(NUM_X_REGS + i);
#else
	FlushNativeReg(GPRToNativeReg(R8));
	FlushNativeReg(GPRToNativeReg(R9));
	FlushNativeReg(GPRToNativeReg(R10));
	FlushNativeReg(GPRToNativeReg(R11));
	for (int i = 0; i < NUM_X_FREGS; ++i)
		FlushNativeReg(NUM_X_REGS + i);
#endif
#elif PPSSPP_ARCH(X86)
	FlushNativeReg(GPRToNativeReg(ECX));
	FlushNativeReg(GPRToNativeReg(EDX));
	for (int i = 0; i < NUM_X_FREGS; ++i)
		FlushNativeReg(NUM_X_REGS + i);
#endif
}

void X64IRRegCache::FlushAll(bool gprs, bool fprs) {
	// Note: make sure not to change the registers when flushing:
	// Branching code may expect the x64reg to retain its value.

	auto needsFlush = [&](IRReg i) {
		if (mr[i].loc == MIPSLoc::MEM || mr[i].isStatic)
			return false;
		if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty)
			return false;
		return true;
	};

	auto isSingleFloat = [&](IRReg i) {
		if (mr[i].lane != -1 || mr[i].loc != MIPSLoc::FREG)
			return false;
		return true;
	};

	// Sometimes, float/vector regs may be in separate regs in a sequence.
	// It's worth combining and flushing together.
	for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
		if (!needsFlush(i) || !needsFlush(i + 1))
			continue;
		// GPRs are probably not worth it. Merging Vec2s might be, but pretty uncommon.
		if (!isSingleFloat(i) || !isSingleFloat(i + 1))
			continue;

		X64Reg regs[4]{ INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG };
		regs[0] = FromNativeReg(mr[i + 0].nReg);
		regs[1] = FromNativeReg(mr[i + 1].nReg);

		bool flushVec4 = i + 3 < TOTAL_MAPPABLE_IRREGS && needsFlush(i + 2) && needsFlush(i + 3);
		if (flushVec4 && isSingleFloat(i + 2) && isSingleFloat(i + 3) && (i & 3) == 0) {
			regs[2] = FromNativeReg(mr[i + 2].nReg);
			regs[3] = FromNativeReg(mr[i + 3].nReg);

			// Note that this doesn't change the low lane of any of these regs.
			emit_->UNPCKLPS(regs[1], ::R(regs[3]));
			emit_->UNPCKLPS(regs[0], ::R(regs[2]));
			emit_->UNPCKLPS(regs[0], ::R(regs[1]));
			emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]);

			for (int j = 0; j < 4; ++j)
				DiscardReg(i + j);
			i += 3;
			continue;
		}

		// TODO: Maybe this isn't always worth doing.
		emit_->UNPCKLPS(regs[0], ::R(regs[1]));
		emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]);

		DiscardReg(i);
		DiscardReg(i + 1);
		++i;
		continue;
	}

	IRNativeRegCacheBase::FlushAll(gprs, fprs);
}

X64Reg X64IRRegCache::TryMapTempImm(IRReg r, X64Map flags) {
	_dbg_assert_(IsValidGPR(r));

	auto canUseReg = [flags](X64Reg r) {
		switch (flags & X64Map::MASK) {
		case X64Map::NONE:
			return true;
		case X64Map::LOW_SUBREG:
			return HasLowSubregister(r);
		case X64Map::SHIFT:
			return r == RCX;
		case X64Map::HIGH_DATA:
			return r == RDX;
		default:
			_assert_msg_(false, "Unexpected flags");
		}
		return false;
	};

	// If already mapped, no need for a temporary.
	if (IsGPRMapped(r)) {
		if (canUseReg(RX(r)))
			return RX(r);
	}

	if (mr[r].loc == MIPSLoc::IMM) {
		// Try our luck - check for an exact match in another xreg.
		for (int i = 0; i < TOTAL_MAPPABLE_IRREGS; ++i) {
			if (mr[i].loc == MIPSLoc::REG_IMM && mr[i].imm == mr[r].imm) {
				// Awesome, let's just use this reg.
				if (canUseReg(FromNativeReg(mr[i].nReg)))
					return FromNativeReg(mr[i].nReg);
			}
		}
	}

	return INVALID_REG;
}

X64Reg X64IRRegCache::GetAndLockTempGPR() {
	IRNativeReg reg = AllocateReg(MIPSLoc::REG, MIPSMap::INIT);
	if (reg != -1) {
		nr[reg].tempLockIRIndex = irIndex_;
	}
	return FromNativeReg(reg);
}

X64Reg X64IRRegCache::GetAndLockTempFPR() {
	IRNativeReg reg = AllocateReg(MIPSLoc::FREG, MIPSMap::INIT);
	if (reg != -1) {
		nr[reg].tempLockIRIndex = irIndex_;
	}
	return FromNativeReg(reg);
}

void X64IRRegCache::ReserveAndLockXGPR(Gen::X64Reg r) {
	IRNativeReg nreg = GPRToNativeReg(r);
	if (nr[nreg].mipsReg != IRREG_INVALID)
		FlushNativeReg(nreg);
	nr[nreg].tempLockIRIndex = irIndex_;
}

X64Reg X64IRRegCache::MapWithFPRTemp(const IRInst &inst) {
	return FromNativeReg(MapWithTemp(inst, MIPSLoc::FREG));
}

void X64IRRegCache::MapWithFlags(IRInst inst, X64Map destFlags, X64Map src1Flags, X64Map src2Flags) {
	Mapping mapping[3];
	MappingFromInst(inst, mapping);

	mapping[0].flags = mapping[0].flags | destFlags;
	mapping[1].flags = mapping[1].flags | src1Flags;
	mapping[2].flags = mapping[2].flags | src2Flags;

	auto flushReg = [&](IRNativeReg nreg) {
		bool mustKeep = false;
		bool canDiscard = false;
		for (int i = 0; i < 3; ++i) {
			if (mapping[i].reg != nr[nreg].mipsReg)
				continue;

			if ((mapping[i].flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
				mustKeep = true;
				break;
			} else {
				canDiscard = true;
			}
		}

		if (mustKeep || !canDiscard) {
			FlushNativeReg(nreg);
		} else {
			DiscardNativeReg(nreg);
		}
	};

	// If there are any special rules, we might need to spill.
	for (int i = 0; i < 3; ++i) {
		switch (mapping[i].flags & X64Map::MASK) {
		case X64Map::SHIFT:
			if (nr[RCX].mipsReg != mapping[i].reg)
				flushReg(RCX);
			break;

		case X64Map::HIGH_DATA:
			if (nr[RDX].mipsReg != mapping[i].reg)
				flushReg(RDX);
			break;

		case X64Map::XMM0:
			if (nr[XMMToNativeReg(XMM0)].mipsReg != mapping[i].reg)
				flushReg(XMMToNativeReg(XMM0));
			break;

		default:
			break;
		}
	}

	ApplyMapping(mapping, 3);
	CleanupMapping(mapping, 3);
}

X64Reg X64IRRegCache::MapGPR(IRReg mipsReg, MIPSMap mapFlags) {
	_dbg_assert_(IsValidGPR(mipsReg));

	// Okay, not mapped, so we need to allocate an x64 register.
	IRNativeReg nreg = MapNativeReg(MIPSLoc::REG, mipsReg, 1, mapFlags);
	return FromNativeReg(nreg);
}

X64Reg X64IRRegCache::MapGPR2(IRReg mipsReg, MIPSMap mapFlags) {
	_dbg_assert_(IsValidGPR(mipsReg) && IsValidGPR(mipsReg + 1));

	// Okay, not mapped, so we need to allocate an x64 register.
	IRNativeReg nreg = MapNativeReg(MIPSLoc::REG, mipsReg, 2, mapFlags);
	return FromNativeReg(nreg);
}

X64Reg X64IRRegCache::MapGPRAsPointer(IRReg reg) {
	return FromNativeReg(MapNativeRegAsPointer(reg));
}

X64Reg X64IRRegCache::MapFPR(IRReg mipsReg, MIPSMap mapFlags) {
	_dbg_assert_(IsValidFPR(mipsReg));
	_dbg_assert_(mr[mipsReg + 32].loc == MIPSLoc::MEM || mr[mipsReg + 32].loc == MIPSLoc::FREG);

	IRNativeReg nreg = MapNativeReg(MIPSLoc::FREG, mipsReg + 32, 1, mapFlags);
	if (nreg != -1)
		return FromNativeReg(nreg);
	return INVALID_REG;
}

X64Reg X64IRRegCache::MapVec4(IRReg first, MIPSMap mapFlags) {
	_dbg_assert_(IsValidFPR(first));
	_dbg_assert_((first & 3) == 0);
	_dbg_assert_(mr[first + 32].loc == MIPSLoc::MEM || mr[first + 32].loc == MIPSLoc::FREG);

	IRNativeReg nreg = MapNativeReg(MIPSLoc::FREG, first + 32, 4, mapFlags);
	if (nreg != -1)
		return FromNativeReg(nreg);
	return INVALID_REG;
}

void X64IRRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
	_assert_(nreg >= 0 && nreg < NUM_X_REGS);
	X64Reg r = FromNativeReg(nreg);
	if (state) {
#if defined(MASKED_PSP_MEMORY)
		// This destroys the value...
		_dbg_assert_(!nr[nreg].isDirty);
		emit_->AND(PTRBITS, ::R(r), Imm32(Memory::MEMVIEW32_MASK));
		emit_->ADD(PTRBITS, ::R(r), ImmPtr(Memory::base));
#else
		emit_->ADD(PTRBITS, ::R(r), ::R(MEMBASEREG));
#endif
	} else {
#if defined(MASKED_PSP_MEMORY)
		_dbg_assert_(!nr[nreg].isDirty);
		emit_->SUB(PTRBITS, ::R(r), ImmPtr(Memory::base));
#else
		emit_->SUB(PTRBITS, ::R(r), ::R(MEMBASEREG));
#endif
	}
}

void X64IRRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
	X64Reg r = FromNativeReg(nreg);
	_dbg_assert_(first != MIPS_REG_ZERO);
	if (nreg < NUM_X_REGS) {
		_assert_(lanes == 1 || (lanes == 2 && first == IRREG_LO));
		if (lanes == 1)
			emit_->MOV(32, ::R(r), MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
#if PPSSPP_ARCH(AMD64)
		else if (lanes == 2)
			emit_->MOV(64, ::R(r), MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
#endif
		else
			_assert_(false);
	} else {
		_dbg_assert_(nreg < NUM_X_REGS + NUM_X_FREGS);
		_assert_msg_(mr[first].loc == MIPSLoc::FREG, "Cannot load this type: %d", (int)mr[first].loc);
		if (lanes == 1)
			emit_->MOVSS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
		else if (lanes == 2)
			emit_->MOVLPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
		else if (lanes == 4 && (first & 3) == 0)
			emit_->MOVAPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
		else if (lanes == 4)
			emit_->MOVUPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
		else
			_assert_(false);
	}
}

void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
	X64Reg r = FromNativeReg(nreg);
	_dbg_assert_(first != MIPS_REG_ZERO);
	if (nreg < NUM_X_REGS) {
		_assert_(lanes == 1 || (lanes == 2 && first == IRREG_LO));
		_assert_(mr[first].loc == MIPSLoc::REG || mr[first].loc == MIPSLoc::REG_IMM);
		if (lanes == 1)
			emit_->MOV(32, MDisp(CTXREG, -128 + GetMipsRegOffset(first)), ::R(r));
#if PPSSPP_ARCH(AMD64)
		else if (lanes == 2)
			emit_->MOV(64, MDisp(CTXREG, -128 + GetMipsRegOffset(first)), ::R(r));
#endif
		else
			_assert_(false);
	} else {
		_dbg_assert_(nreg < NUM_X_REGS + NUM_X_FREGS);
		_assert_msg_(mr[first].loc == MIPSLoc::FREG, "Cannot store this type: %d", (int)mr[first].loc);
		if (lanes == 1)
			emit_->MOVSS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
		else if (lanes == 2)
			emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
		else if (lanes == 4 && (first & 3) == 0)
			emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
		else if (lanes == 4)
			emit_->MOVUPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
		else
			_assert_(false);
	}
}

bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
	bool allowed = !mr[nr[nreg].mipsReg].isStatic;
	// There's currently no support for non-XMMs here.
	allowed = allowed && type == MIPSLoc::FREG;

	if (dest == -1)
		dest = nreg;

	if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
		// Alright, changing lane count (possibly including lane position.)
		IRReg oldfirst = nr[nreg].mipsReg;
		int oldlanes = 0;
		while (mr[oldfirst + oldlanes].nReg == nreg)
			oldlanes++;
		_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
		_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");

		if (lanes == 1 && TransferVecTo1(nreg, dest, first, oldlanes))
			return true;
		if (oldlanes == 1 && Transfer1ToVec(nreg, dest, first, lanes))
			return true;
	}

	return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
}

bool X64IRRegCache::TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes) {
	IRReg oldfirst = nr[nreg].mipsReg;

	// Is it worth preserving any of the old regs?
	int numKept = 0;
	for (int i = 0; i < oldlanes; ++i) {
		// Skip whichever one this is extracting.
		if (oldfirst + i == first)
			continue;
		// If 0 isn't being transferred, it's easy to keep in its original reg.
		if (i == 0 && dest != nreg) {
			numKept++;
			continue;
		}

		IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT);
		if (freeReg != -1 && IsRegRead(MIPSLoc::FREG, oldfirst + i)) {
			// If there's one free, use it. Don't modify nreg, though.
			u8 shuf = VFPU_SWIZZLE(i, i, i, i);
			if (i == 0) {
				emit_->MOVAPS(FromNativeReg(freeReg), ::R(FromNativeReg(nreg)));
			} else if (cpu_info.bAVX) {
				emit_->VPERMILPS(128, FromNativeReg(freeReg), ::R(FromNativeReg(nreg)), shuf);
			} else if (i == 2) {
				emit_->MOVHLPS(FromNativeReg(freeReg), FromNativeReg(nreg));
			} else {
				emit_->MOVAPS(FromNativeReg(freeReg), ::R(FromNativeReg(nreg)));
				emit_->SHUFPS(FromNativeReg(freeReg), ::R(FromNativeReg(freeReg)), shuf);
			}

			// Update accounting.
			nr[freeReg].isDirty = nr[nreg].isDirty;
			nr[freeReg].mipsReg = oldfirst + i;
			mr[oldfirst + i].lane = -1;
			mr[oldfirst + i].nReg = freeReg;
			numKept++;
		}
	}

	// Unless all other lanes were kept, store.
	if (nr[nreg].isDirty && numKept < oldlanes - 1) {
		StoreNativeReg(nreg, oldfirst, oldlanes);
		// Set false even for regs that were split out, since they were flushed too.
		for (int i = 0; i < oldlanes; ++i) {
			if (mr[oldfirst + i].nReg != -1)
				nr[mr[oldfirst + i].nReg].isDirty = false;
		}
	}

	// Next, shuffle the desired element into first place.
	u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
	if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
		emit_->VPERMILPS(128, FromNativeReg(dest), ::R(FromNativeReg(nreg)), shuf);
	} else if (mr[first].lane <= 0 && dest != nreg) {
		emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
	} else if (mr[first].lane == 2) {
		emit_->MOVHLPS(FromNativeReg(dest), FromNativeReg(nreg));
	} else if (mr[first].lane > 0) {
		if (dest != nreg)
			emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
		emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
	}

	// Now update accounting.
	for (int i = 0; i < oldlanes; ++i) {
		auto &mreg = mr[oldfirst + i];
		if (oldfirst + i == first) {
			mreg.lane = -1;
			mreg.nReg = dest;
		} else if (mreg.nReg == nreg && i == 0 && nreg != dest) {
			// Still in the same register, but no longer a vec.
			mreg.lane = -1;
		} else if (mreg.nReg == nreg) {
			// No longer in a register.
			mreg.nReg = -1;
			mreg.lane = -1;
			mreg.loc = MIPSLoc::MEM;
		}
	}

	if (dest != nreg) {
		nr[dest].isDirty = nr[nreg].isDirty;
		if (oldfirst == first) {
			nr[nreg].mipsReg = -1;
			nr[nreg].isDirty = false;
		}
	}
	nr[dest].mipsReg = first;

	return true;
}

bool X64IRRegCache::Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes) {
	X64Reg cur[4]{};
	int numInRegs = 0;
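	// Bit i of blendMask marks lane i as "not in a register" (it must come
	// from memory); the same mask doubles as the BLENDPS immediate below.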
	u8 blendMask = 0;
	for (int i = 0; i < lanes; ++i) {
		if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
			// Can't do it, either double mapped or overlapping vec.
			return false;
		}

		if (mr[first + i].nReg == -1) {
			cur[i] = INVALID_REG;
			blendMask |= 1 << i;
		} else {
			cur[i] = FromNativeReg(mr[first + i].nReg);
			numInRegs++;
		}
	}

	// Shouldn't happen, this should only get called to transfer one in a reg.
	if (numInRegs == 0)
		return false;

	// Move things together into a reg.
	if (lanes == 4 && cpu_info.bSSE4_1 && numInRegs == 1 && (first & 3) == 0) {
		// Use a blend to grab the rest. BLENDPS is pretty good.
		if (cpu_info.bAVX && nreg != dest) {
			if (cur[0] == INVALID_REG) {
				// Broadcast to all lanes, then blend from memory to replace.
				emit_->VPERMILPS(128, FromNativeReg(dest), ::R(FromNativeReg(nreg)), 0);
				emit_->BLENDPS(FromNativeReg(dest), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
			} else {
				emit_->VBLENDPS(128, FromNativeReg(dest), FromNativeReg(nreg), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
			}
			cur[0] = FromNativeReg(dest);
		} else {
			if (cur[0] == INVALID_REG)
				emit_->SHUFPS(FromNativeReg(nreg), ::R(FromNativeReg(nreg)), 0);
			emit_->BLENDPS(FromNativeReg(nreg), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
			// If this is not dest, it'll get moved there later.
			cur[0] = FromNativeReg(nreg);
		}
	} else if (lanes == 4) {
		if (blendMask == 0) {
			// y = yw##, x = xz##, x = xyzw.
			emit_->UNPCKLPS(cur[1], ::R(cur[3]));
			emit_->UNPCKLPS(cur[0], ::R(cur[2]));
			emit_->UNPCKLPS(cur[0], ::R(cur[1]));
		} else if (blendMask == 0b1100) {
			// x = xy##, then load zw.
			emit_->UNPCKLPS(cur[0], ::R(cur[1]));
			emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
		} else if (blendMask == 0b1010 && cpu_info.bSSE4_1 && (first & 3) == 0) {
			// x = x#z#, x = xyzw.
			emit_->SHUFPS(cur[0], ::R(cur[2]), VFPU_SWIZZLE(0, 0, 0, 0));
			emit_->BLENDPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
		} else if (blendMask == 0b0110 && cpu_info.bSSE4_1 && (first & 3) == 0) {
			// x = x##w, x = xyzw.
			emit_->SHUFPS(cur[0], ::R(cur[3]), VFPU_SWIZZLE(0, 0, 0, 0));
			emit_->BLENDPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
		} else if (blendMask == 0b1001 && cpu_info.bSSE4_1 && (first & 3) == 0) {
			// y = #yz#, y = xyzw.
			emit_->SHUFPS(cur[1], ::R(cur[2]), VFPU_SWIZZLE(0, 0, 0, 0));
			emit_->BLENDPS(cur[1], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
			// Will be moved to dest as needed.
			cur[0] = cur[1];
		} else if (blendMask == 0b0101 && cpu_info.bSSE4_1 && (first & 3) == 0) {
			// y = #y#w, y = xyzw.
			emit_->SHUFPS(cur[1], ::R(cur[3]), VFPU_SWIZZLE(0, 0, 0, 0));
			emit_->BLENDPS(cur[1], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
			// Will be moved to dest as needed.
			cur[0] = cur[1];
		} else if (blendMask == 0b1000) {
			// x = xz##, z = w###, y = yw##, x = xyzw.
			emit_->UNPCKLPS(cur[0], ::R(cur[2]));
			emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
			emit_->UNPCKLPS(cur[1], ::R(cur[2]));
			emit_->UNPCKLPS(cur[0], ::R(cur[1]));
		} else if (blendMask == 0b0100) {
			// y = yw##, w = z###, x = xz##, x = xyzw.
			emit_->UNPCKLPS(cur[1], ::R(cur[3]));
			emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
			emit_->UNPCKLPS(cur[0], ::R(cur[3]));
			emit_->UNPCKLPS(cur[0], ::R(cur[1]));
		} else if (blendMask == 0b0010) {
			// z = zw##, w = y###, x = xy##, x = xyzw.
			emit_->UNPCKLPS(cur[2], ::R(cur[3]));
			emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)));
			emit_->UNPCKLPS(cur[0], ::R(cur[3]));
			emit_->MOVLHPS(cur[0], cur[2]);
		} else if (blendMask == 0b0001) {
			// y = yw##, w = x###, w = xz##, w = xyzw.
			emit_->UNPCKLPS(cur[1], ::R(cur[3]));
			emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
			emit_->UNPCKLPS(cur[3], ::R(cur[2]));
			emit_->UNPCKLPS(cur[3], ::R(cur[1]));
			// Will be moved to dest as needed.
			cur[0] = cur[3];
		} else if (blendMask == 0b0011) {
			// z = zw##, w = xy##, w = xyzw.
			emit_->UNPCKLPS(cur[2], ::R(cur[3]));
			emit_->MOVLPS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
			emit_->MOVLHPS(cur[3], cur[2]);
			// Will be moved to dest as needed.
			cur[0] = cur[3];
		} else {
			// This must mean no SSE4, and numInRegs <= 2 in trickier cases.
			return false;
		}
	} else if (lanes == 2) {
		if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
			emit_->UNPCKLPS(cur[0], ::R(cur[1]));
		} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
			emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
		} else {
			return false;
		}
	} else {
		return false;
	}

	mr[first].lane = 0;
	for (int i = 0; i < lanes; ++i) {
		if (mr[first + i].nReg != -1) {
			// If this was dirty, the combined reg is now dirty.
			if (nr[mr[first + i].nReg].isDirty)
				nr[dest].isDirty = true;

			// Throw away the other register we're no longer using.
			if (i != 0)
				DiscardNativeReg(mr[first + i].nReg);
		}

		// And set it as using the new one.
		mr[first + i].lane = i;
		mr[first + i].loc = MIPSLoc::FREG;
		mr[first + i].nReg = dest;
	}

	if (cur[0] != FromNativeReg(dest))
		emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));

	if (dest != nreg) {
		nr[dest].mipsReg = first;
		nr[nreg].mipsReg = -1;
		nr[nreg].isDirty = false;
	}

	return true;
}

void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
	X64Reg r = FromNativeReg(nreg);
	_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);
	emit_->MOV(32, ::R(r), Imm32(imm));
}

void X64IRRegCache::StoreRegValue(IRReg mreg, uint32_t imm) {
	_assert_(IsValidGPRNoZero(mreg));
	// Try to optimize using a different reg.
	X64Reg storeReg = INVALID_REG;
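	// A register-to-memory MOV is a shorter encoding than MOV mem, imm32,
	// which has to embed all four immediate bytes.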
	// Could we get lucky? Check for an exact match in another xreg.
	for (int i = 0; i < TOTAL_MAPPABLE_IRREGS; ++i) {
		if (mr[i].loc == MIPSLoc::REG_IMM && mr[i].imm == imm) {
			// Awesome, let's just store this reg.
			storeReg = (X64Reg)mr[i].nReg;
			break;
		}
	}

	if (storeReg == INVALID_REG)
		emit_->MOV(32, MDisp(CTXREG, -128 + GetMipsRegOffset(mreg)), Imm32(imm));
	else
		emit_->MOV(32, MDisp(CTXREG, -128 + GetMipsRegOffset(mreg)), ::R(storeReg));
}

OpArg X64IRRegCache::R(IRReg mipsReg) {
	return ::R(RX(mipsReg));
}

OpArg X64IRRegCache::RPtr(IRReg mipsReg) {
	return ::R(RXPtr(mipsReg));
}

OpArg X64IRRegCache::F(IRReg mipsReg) {
	return ::R(FX(mipsReg));
}

X64Reg X64IRRegCache::RX(IRReg mipsReg) {
	_dbg_assert_(IsValidGPR(mipsReg));
	_dbg_assert_(mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM);
	if (mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM) {
		return FromNativeReg(mr[mipsReg].nReg);
	} else {
		ERROR_LOG_REPORT(Log::JIT, "Reg %i not in x64 reg", mipsReg);
		return INVALID_REG; // BAAAD
	}
}

X64Reg X64IRRegCache::RXPtr(IRReg mipsReg) {
	_dbg_assert_(IsValidGPR(mipsReg));
	_dbg_assert_(mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM || mr[mipsReg].loc == MIPSLoc::REG_AS_PTR);
	if (mr[mipsReg].loc == MIPSLoc::REG_AS_PTR) {
		return FromNativeReg(mr[mipsReg].nReg);
	} else if (mr[mipsReg].loc == MIPSLoc::REG || mr[mipsReg].loc == MIPSLoc::REG_IMM) {
		int r = mr[mipsReg].nReg;
		_dbg_assert_(nr[r].pointerified);
		if (nr[r].pointerified) {
			return FromNativeReg(mr[mipsReg].nReg);
		} else {
			ERROR_LOG(Log::JIT, "Tried to use a non-pointer register as a pointer");
			return INVALID_REG;
		}
	} else {
		ERROR_LOG_REPORT(Log::JIT, "Reg %i not in x64 reg", mipsReg);
		return INVALID_REG; // BAAAD
	}
}

X64Reg X64IRRegCache::FX(IRReg mipsReg) {
	_dbg_assert_(IsValidFPR(mipsReg));
	_dbg_assert_(mr[mipsReg + 32].loc == MIPSLoc::FREG);
	if (mr[mipsReg + 32].loc == MIPSLoc::FREG) {
		return FromNativeReg(mr[mipsReg + 32].nReg);
	} else {
		ERROR_LOG_REPORT(Log::JIT, "Reg %i not in x64 reg", mipsReg);
		return INVALID_REG; // BAAAD
	}
}

bool X64IRRegCache::HasLowSubregister(Gen::X64Reg reg) {
#if !PPSSPP_ARCH(AMD64)
	// Can't use ESI or EDI (which we use), no 8-bit versions. Only these.
	return reg == EAX || reg == EBX || reg == ECX || reg == EDX;
#else
	return true;
#endif
}

#endif