CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Software/SamplerX86.cpp
Views: 1401
// Copyright (c) 2017- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <emmintrin.h>
#include "Common/x64Emitter.h"
#include "Common/BitScan.h"
#include "Common/CPUDetect.h"
#include "GPU/GPUState.h"
#include "GPU/Software/Sampler.h"
#include "GPU/ge_constants.h"

using namespace Gen;
using namespace Rasterizer;

namespace Sampler {

// Emits a "fetch" sampler: reads one texel at (u, v) from a single level via
// Jit_ReadTextureFormat() and returns it in XMM0, zero-extended to four 32-bit
// channels.  Returns nullptr (after rolling the code pointer back) if the
// format isn't supported by the JIT.
FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
	_assert_msg_(id.fetch && !id.linear, "Only fetch should be set on sampler id");
	// Argument order here must match the FetchFunc signature.
	regCache_.SetupABI({
		RegCache::GEN_ARG_U,
		RegCache::GEN_ARG_V,
		RegCache::GEN_ARG_TEXPTR,
		RegCache::GEN_ARG_BUFW,
		RegCache::GEN_ARG_LEVEL,
		RegCache::GEN_ARG_ID,
	});
	regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
	regCache_.ForceRetain(RegCache::GEN_RESULT);
	regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);

	BeginWrite(2048);
	Describe("Init");
	const u8 *start = AlignCode16();

#if PPSSPP_PLATFORM(WINDOWS)
	// RET and shadow space.
	stackArgPos_ = 8 + 32;
	stackIDOffset_ = 8;
	stackLevelOffset_ = 0;
#else
	// POSIX: all arguments arrive in registers, nothing lives on the stack.
	stackArgPos_ = 0;
	stackIDOffset_ = -1;
	stackLevelOffset_ = -1;
#endif

	// Early exit on !srcPtr: jump to the common exit with XMM0 zeroed.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		CMP(PTRBITS, R(srcReg), Imm8(0));
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);

		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
		PXOR(vecResultReg, R(vecResultReg));
		regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// This reads the pixel data into resultReg from the args.
	if (!Jit_ReadTextureFormat(id)) {
		// Failure: discard everything emitted so far and report.
		regCache_.Reset(false);
		EndWrite();
		ResetCodePtr(GetOffset(start));
		ERROR_LOG(Log::G3D, "Failed to compile fetch %s", DescribeSamplerID(id).c_str());
		return nullptr;
	}

	if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.ForceRelease(RegCache::GEN_ARG_ID);

	X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);

	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	MOVD_xmm(vecResultReg, R(resultReg));
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	regCache_.ForceRelease(RegCache::GEN_RESULT);

	if (cpu_info.bSSE4_1) {
		// SSE4.1: zero-extend the four 8-bit channels straight to 32 bits.
		PMOVZXBD(vecResultReg, R(vecResultReg));
	} else {
		// Pre-SSE4.1: interleave with zero twice to widen 8 -> 16 -> 32 bits.
		X64Reg vecTempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		PXOR(vecTempReg, R(vecTempReg));
		PUNPCKLBW(vecResultReg, R(vecTempReg));
		PUNPCKLWD(vecResultReg, R(vecTempReg));
		regCache_.Release(vecTempReg, RegCache::VEC_TEMP0);
	}
	regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);

	Describe("Init");
	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	RET();

	regCache_.Reset(true);

	EndWrite();
	return (FetchFunc)start;
}

// Emits the nearest-neighbor sampler, optionally sampling and blending a
// second mip level when id.hasAnyMips.  Returns nullptr on failure.
NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
	_assert_msg_(!id.fetch && !id.linear, "Fetch and linear should be cleared on sampler id");
	BeginWrite(2048);
	Describe("Init");

	// Let's drop some helpful constants here.
	WriteConstantPool(id);

	const u8 *start = 
AlignCode16();

	// Argument order here must match the NearestFunc signature.
	regCache_.SetupABI({
		RegCache::VEC_ARG_S,
		RegCache::VEC_ARG_T,
		RegCache::VEC_ARG_COLOR,
		RegCache::GEN_ARG_TEXPTR_PTR,
		RegCache::GEN_ARG_BUFW_PTR,
		RegCache::GEN_ARG_LEVEL,
		RegCache::GEN_ARG_LEVELFRAC,
		RegCache::GEN_ARG_ID,
	});

#if PPSSPP_PLATFORM(WINDOWS)
	// RET + shadow space.
	stackArgPos_ = 8 + 32;

	// Positions: stackArgPos_+0=bufwptr, stackArgPos_+8=level, stackArgPos_+16=levelFrac
	stackIDOffset_ = 24;
	stackLevelOffset_ = 8;
#else
	stackArgPos_ = 0;
	// No args on the stack.
	stackIDOffset_ = -1;
	stackLevelOffset_ = -1;
#endif

	// Start out by saving some registers, since we'll need more.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	regCache_.Add(R15, RegCache::GEN_INVALID);
	regCache_.Add(R14, RegCache::GEN_INVALID);
	regCache_.Add(R13, RegCache::GEN_INVALID);
	regCache_.Add(R12, RegCache::GEN_INVALID);
	// Four 8-byte pushes moved the stack args further away.
	stackArgPos_ += 32;

#if PPSSPP_PLATFORM(WINDOWS)
	// Use the shadow space to save U1/V1.
	stackUV1Offset_ = -8;
#else
	// Use the red zone, but account for the R15-R12 we push just below.
	stackUV1Offset_ = -stackArgPos_ - 8;
#endif

	// We can throw these away right off if there are no mips.
	if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVEL) && id.useSharedClut)
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
	if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVELFRAC))
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);

	if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
		// On Linux, RCX is currently levelFrac, but we'll need it for other things.
		if (!cpu_info.bBMI2) {
			X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
			MOV(64, R(R15), R(levelFracReg));
			regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
			regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
			regCache_.ChangeReg(R15, RegCache::GEN_ARG_LEVELFRAC);
			regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
		}
	} else if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
		// Let's load bufwptr into regs. RDX is free.
		MOV(64, R(RDX), MDisp(RSP, stackArgPos_ + 0));
		regCache_.ChangeReg(RDX, RegCache::GEN_ARG_BUFW_PTR);
		regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
	}
	// Okay, now lock RCX as a shifting reg (without BMI2, variable shifts need CL.)
	if (!cpu_info.bBMI2) {
		regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
		regCache_.ForceRetain(RegCache::GEN_SHIFTVAL);
	}

	bool success = true;

	// Convert S/T + X/Y to U/V (and U1/V1 if appropriate.)
	success = success && Jit_GetTexelCoords(id);

	// At this point, XMM0 should be free. Swap it to the result.
	success = success && regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
	// Let's also pick a reg for GEN_RESULT - doesn't matter which.
	X64Reg resultReg = regCache_.Alloc(RegCache::GEN_RESULT);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	regCache_.ForceRetain(RegCache::GEN_RESULT);

	// Early exit on !srcPtr (either one.)
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		Describe("NullCheck");
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);

		if (id.hasAnyMips) {
			// AND the two level pointers together; zero if either is null.
			X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
			MOV(64, R(tempReg), MDisp(srcReg, 0));
			AND(64, R(tempReg), MDisp(srcReg, 8));

			CMP(PTRBITS, R(tempReg), Imm8(0));
			regCache_.Release(tempReg, RegCache::GEN_TEMP0);
		} else {
			CMP(PTRBITS, MatR(srcReg), Imm8(0));
		}
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		PXOR(XMM0, R(XMM0));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
	}

	// Loads the texel pointer and buffer width for the given mip level (0 or 1.)
	auto loadPtrs = [&](bool level1) {
		X64Reg bufwReg = regCache_.Alloc(RegCache::GEN_ARG_BUFW);
		X64Reg bufwPtrReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		MOVZX(32, 16, bufwReg, MDisp(bufwPtrReg, level1 ? 2 : 0));
		regCache_.Unlock(bufwPtrReg, RegCache::GEN_ARG_BUFW_PTR);
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
		regCache_.ForceRetain(RegCache::GEN_ARG_BUFW);

		X64Reg srcReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
		X64Reg srcPtrReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
		MOV(64, R(srcReg), MDisp(srcPtrReg, level1 ? 8 : 0));
		regCache_.Unlock(srcPtrReg, RegCache::GEN_ARG_TEXPTR_PTR);
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR);
	};

	loadPtrs(false);
	success = success && Jit_ReadTextureFormat(id);

	// Convert that to 16-bit from 8-bit channels.
	X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
	resultReg = regCache_.Find(RegCache::GEN_RESULT);
	MOVD_xmm(vecResultReg, R(resultReg));
	if (cpu_info.bSSE4_1) {
		PMOVZXBW(vecResultReg, R(vecResultReg));
	} else {
		X64Reg zeroReg = GetZeroVec();
		PUNPCKLBW(vecResultReg, R(zeroReg));
		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
	}
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);

	if (id.hasAnyMips) {
		// Sample the next level too, but only when levelFrac is non-zero.
		X64Reg vecResultReg = regCache_.Alloc(RegCache::VEC_RESULT1);

		if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
			X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
			CMP(8, R(levelFracReg), Imm8(0));
			regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
		} else {
			CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
		}
		FixupBranch skip = J_CC(CC_Z, true);

		// Modify the level, so the new level value is used. We don't need the old.
		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
			ADD(32, R(levelReg), Imm8(1));
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
		} else {
			// It's fine to just modify this in place.
			ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
		}

		// This is inside the conditional, but it's okay because we throw it away after.
		loadPtrs(true);
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);

		// Reload the level-1 U/V that Jit_GetTexelCoords stashed on the stack.
		X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
		MOV(32, R(uReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0));
		regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
		regCache_.ForceRetain(RegCache::GEN_ARG_U);

		X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
		MOV(32, R(vReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4));
		regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
		regCache_.ForceRetain(RegCache::GEN_ARG_V);

		bool hadId = regCache_.Has(RegCache::GEN_ID);
		bool hadZero = regCache_.Has(RegCache::VEC_ZERO);
		success = success && Jit_ReadTextureFormat(id);

		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		MOVD_xmm(vecResultReg, R(resultReg));
		if (cpu_info.bSSE4_1) {
			PMOVZXBW(vecResultReg, R(vecResultReg));
		} else {
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLBW(vecResultReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

		// Since we're inside a conditional, make sure these go away if we allocated them.
		if (!hadId && regCache_.Has(RegCache::GEN_ID))
			regCache_.ForceRelease(RegCache::GEN_ID);
		if (!hadZero && regCache_.Has(RegCache::VEC_ZERO))
			regCache_.ForceRelease(RegCache::VEC_ZERO);

		SetJumpTarget(skip);

		regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT1);
	} else {
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
	}

	// We're done with these now.
	if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR))
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
	if (regCache_.Has(RegCache::GEN_ARG_BUFW_PTR))
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
	if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
	if (regCache_.Has(RegCache::GEN_SHIFTVAL))
		regCache_.ForceRelease(RegCache::GEN_SHIFTVAL);
	regCache_.ForceRelease(RegCache::GEN_RESULT);

	if (id.hasAnyMips) {
		// Blend the two level results: (c0 * (16 - frac) + c1 * frac) >> 4.
		Describe("BlendMips");
		if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
			X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
			MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
			regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
			regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
		}

		X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
		CMP(8, R(levelFracReg), Imm8(0));
		FixupBranch skip = J_CC(CC_Z, true);

		// TODO: PMADDWD? Refactor shared?
		// First, broadcast the levelFrac value into an XMM.
		X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		MOVD_xmm(fracReg, R(levelFracReg));
		PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);

		// Multiply level1 color by the fraction.
		X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
		PMULLW(color1Reg, R(fracReg));

		// Okay, next we need an inverse for color 0.
		X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		MOVDQA(invFracReg, M(const10All16_));
		PSUBW(invFracReg, R(fracReg));

		// And multiply.
		PMULLW(XMM0, R(invFracReg));
		regCache_.Release(fracReg, RegCache::VEC_TEMP0);
		regCache_.Release(invFracReg, RegCache::VEC_TEMP1);

		// Okay, now sum and divide by 16 (which is what the fraction maxed at.)
		PADDW(XMM0, R(color1Reg));
		PSRLW(XMM0, 4);

		// And now we're done with color1Reg/VEC_RESULT1.
		regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
		regCache_.ForceRelease(RegCache::VEC_RESULT1);

		SetJumpTarget(skip);
	}

	// Finally, it's time to apply the texture function.
	success = success && Jit_ApplyTextureFunc(id);

	// Last of all, convert to 32-bit channels.
	Describe("Init");
	if (cpu_info.bSSE4_1) {
		PMOVZXWD(XMM0, R(XMM0));
	} else {
		X64Reg zeroReg = GetZeroVec();
		PUNPCKLWD(XMM0, R(zeroReg));
		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
	}

	regCache_.ForceRelease(RegCache::VEC_RESULT);
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.ForceRelease(RegCache::GEN_ARG_ID);

	if (!success) {
		// Failure: discard everything emitted so far and report.
		regCache_.Reset(false);
		EndWrite();
		ResetCodePtr(GetOffset(start));
		ERROR_LOG(Log::G3D, "Failed to compile nearest %s", DescribeSamplerID(id).c_str());
		return nullptr;
	}

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	regCache_.Reset(true);

	EndWrite();
	return (NearestFunc)start;
}

// Emits the bilinear sampler.  For DXT formats (id.TexFmt() >= GE_TFMT_DXT1),
// a nearest helper is emitted first and CALLed once per texel; otherwise the
// four texels are fetched with SIMD directly.  Returns nullptr on failure.
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(id.linear && !id.fetch, "Only linear should be set on sampler id");
	BeginWrite(2048);
	Describe("Init");

	// We don't use stackArgPos_ here, this is just for DXT.
	stackArgPos_ = -1;

	// Let's drop some helpful constants here.
	WriteConstantPool(id);

	const u8 *nearest = nullptr;
	if (id.TexFmt() >= GE_TFMT_DXT1) {
		// Argument order for the internal nearest helper CALL.
		regCache_.SetupABI({
			RegCache::GEN_ARG_U,
			RegCache::GEN_ARG_V,
			RegCache::GEN_ARG_TEXPTR,
			RegCache::GEN_ARG_BUFW,
			RegCache::GEN_ARG_LEVEL,
			// Avoid clobber.
			RegCache::GEN_ARG_LEVELFRAC,
		});
		auto lockReg = [&](X64Reg r, RegCache::Purpose p) {
			regCache_.ChangeReg(r, p);
			regCache_.ForceRetain(p);
		};
		lockReg(RAX, RegCache::GEN_RESULT);
		lockReg(XMM0, RegCache::VEC_ARG_U);
		lockReg(XMM1, RegCache::VEC_ARG_V);
		lockReg(XMM5, RegCache::VEC_RESULT);
#if !PPSSPP_PLATFORM(WINDOWS)
		if (id.hasAnyMips) {
			lockReg(XMM6, RegCache::VEC_U1);
			lockReg(XMM7, RegCache::VEC_V1);
			lockReg(XMM8, RegCache::VEC_RESULT1);
			lockReg(XMM12, RegCache::VEC_INDEX1);
		}
		lockReg(XMM9, RegCache::VEC_ARG_COLOR);
		lockReg(XMM10, RegCache::VEC_FRAC);
		lockReg(XMM11, RegCache::VEC_INDEX);
#endif

		// We'll first write the nearest sampler, which we will CALL.
		// This may differ slightly based on the "linear" flag.
		nearest = AlignCode16();

		if (!Jit_ReadTextureFormat(id)) {
			regCache_.Reset(false);
			EndWrite();
			ResetCodePtr(GetOffset(nearest));
			ERROR_LOG(Log::G3D, "Failed to compile linear nearest %s", DescribeSamplerID(id).c_str());
			return 
nullptr;
		}

		Describe("Init");
		RET();

		// The helper is done; release its hard-allocated registers.
		regCache_.ForceRelease(RegCache::GEN_RESULT);
		regCache_.ForceRelease(RegCache::VEC_ARG_U);
		regCache_.ForceRelease(RegCache::VEC_ARG_V);
		regCache_.ForceRelease(RegCache::VEC_RESULT);

		auto unlockOptReg = [&](RegCache::Purpose p) {
			if (regCache_.Has(p))
				regCache_.ForceRelease(p);
		};
		unlockOptReg(RegCache::GEN_ARG_LEVEL);
		unlockOptReg(RegCache::GEN_ARG_LEVELFRAC);
		unlockOptReg(RegCache::VEC_U1);
		unlockOptReg(RegCache::VEC_V1);
		unlockOptReg(RegCache::VEC_RESULT1);
		unlockOptReg(RegCache::VEC_ARG_COLOR);
		unlockOptReg(RegCache::VEC_FRAC);
		unlockOptReg(RegCache::VEC_INDEX);
		unlockOptReg(RegCache::VEC_INDEX1);
		regCache_.Reset(true);
	}
	EndWrite();

	// Now the actual linear func, which is exposed externally.
	const u8 *linearResetPos = GetCodePointer();
	Describe("Init");

	// Argument order here must match the LinearFunc signature.
	regCache_.SetupABI({
		RegCache::VEC_ARG_S,
		RegCache::VEC_ARG_T,
		RegCache::VEC_ARG_COLOR,
		RegCache::GEN_ARG_TEXPTR_PTR,
		RegCache::GEN_ARG_BUFW_PTR,
		RegCache::GEN_ARG_LEVEL,
		RegCache::GEN_ARG_LEVELFRAC,
		RegCache::GEN_ARG_ID,
	});

#if PPSSPP_PLATFORM(WINDOWS)
	// RET + shadow space.
	stackArgPos_ = 8 + 32;
	// Free up some more vector regs on Windows too, where we're a bit tight.
	stackArgPos_ += WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12 }, { R15, R14, R13, R12 });

	// Positions: stackArgPos_+0=bufwptr, stackArgPos_+8=level, stackArgPos_+16=levelFrac
	stackIDOffset_ = 24;
	stackLevelOffset_ = 8;

	// If needed, we could store UV1 data in shadow space, but we no longer do.
	stackUV1Offset_ = -8;
#else
	stackArgPos_ = 0;
	stackArgPos_ += WriteProlog(0, {}, { R15, R14, R13, R12 });
	stackIDOffset_ = -1;
	stackLevelOffset_ = -1;

	// Use the red zone.
	stackUV1Offset_ = -stackArgPos_ - 8;
#endif

	// This is what we'll put in them, anyway...
	if (nearest != nullptr) {
		regCache_.ChangeReg(XMM10, RegCache::VEC_FRAC);
		regCache_.ForceRetain(RegCache::VEC_FRAC);
		regCache_.ChangeReg(XMM11, RegCache::VEC_INDEX);
		regCache_.ForceRetain(RegCache::VEC_INDEX);
		if (id.hasAnyMips) {
			regCache_.ChangeReg(XMM12, RegCache::VEC_INDEX1);
			regCache_.ForceRetain(RegCache::VEC_INDEX1);
		}
	}

	// Reserve a couple regs that the nearest CALL won't use.
	if (id.hasAnyMips) {
		regCache_.ChangeReg(XMM6, RegCache::VEC_U1);
		regCache_.ChangeReg(XMM7, RegCache::VEC_V1);
		regCache_.ForceRetain(RegCache::VEC_U1);
		regCache_.ForceRetain(RegCache::VEC_V1);
	} else if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
	}

	// Save prim color for later in a different XMM too if we're using the nearest helper.
	if (nearest != nullptr) {
		X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
		MOVDQA(XMM9, R(primColorReg));
		regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
		regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
		regCache_.ChangeReg(XMM9, RegCache::VEC_ARG_COLOR);
		regCache_.ForceRetain(RegCache::VEC_ARG_COLOR);
	}

	// We also want to save src and bufw for later. Might be in a reg already.
	if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR) && regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		MOV(64, R(R14), R(srcReg));
		MOV(64, R(R15), R(bufwReg));
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
	} else if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR)) {
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
		MOV(64, R(R14), R(srcReg));
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
		MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 0));
	} else {
		MOV(64, R(R14), MDisp(RSP, stackArgPos_ + 0));
		MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 8));
	}

	// Okay, and now remember we moved to R14/R15.
	regCache_.ChangeReg(R14, RegCache::GEN_ARG_TEXPTR_PTR);
	regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR_PTR);
	if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
		regCache_.ChangeReg(R15, RegCache::GEN_ARG_BUFW_PTR);
		regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
	}

	bool success = true;

	// Our first goal is to convert S/T and X/Y into U/V and frac_u/frac_v.
	success = success && Jit_GetTexelCoordsQuad(id);

	// Early exit on !srcPtr (either one.)
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		Describe("NullCheck");
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);

		if (id.hasAnyMips) {
			// AND the two level pointers together; zero if either is null.
			X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
			MOV(64, R(tempReg), MDisp(srcReg, 0));
			AND(64, R(tempReg), MDisp(srcReg, 8));

			CMP(PTRBITS, R(tempReg), Imm8(0));
			regCache_.Release(tempReg, RegCache::GEN_TEMP0);
		} else {
			CMP(PTRBITS, MatR(srcReg), Imm8(0));
		}
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		PXOR(XMM0, R(XMM0));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
	}

	auto prepareDataOffsets = [&](RegCache::Purpose uPurpose, RegCache::Purpose vPurpose, bool level1) {
		X64Reg uReg = regCache_.Find(uPurpose);
		X64Reg vReg = regCache_.Find(vPurpose);
		success = success && Jit_PrepareDataOffsets(id, uReg, vReg, level1);
		regCache_.Unlock(uReg, uPurpose);
		regCache_.Unlock(vReg, vPurpose);
	};

	Describe("DataOffsets");
	prepareDataOffsets(RegCache::VEC_ARG_U, RegCache::VEC_ARG_V, false);
	if (id.hasAnyMips)
		prepareDataOffsets(RegCache::VEC_U1, RegCache::VEC_V1, true);

	// The data offset goes into V, except in the CLUT4 case and DXT (nearest func) cases.
	if (nearest == nullptr && id.TexFmt() != GE_TFMT_CLUT4)
		regCache_.ForceRelease(RegCache::VEC_ARG_U);

	// Hard allocate results if we're using the func method.
	if (nearest != nullptr) {
		regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT);
		regCache_.ForceRetain(RegCache::VEC_RESULT);
		if (id.hasAnyMips) {
			regCache_.ChangeReg(XMM8, RegCache::VEC_RESULT1);
			regCache_.ForceRetain(RegCache::VEC_RESULT1);
		}
	}

	// This stores the result in an XMM for later processing.
	// We map lookups to nearest CALLs, with arg order: u, v, src, bufw, level
	auto doNearestCall = [&](int off, bool level1) {
#if PPSSPP_PLATFORM(WINDOWS)
		static const X64Reg uArgReg = RCX;
		static const X64Reg vArgReg = RDX;
		static const X64Reg srcArgReg = R8;
		static const X64Reg bufwArgReg = R9;
#else
		static const X64Reg uArgReg = RDI;
		static const X64Reg vArgReg = RSI;
		static const X64Reg srcArgReg = RDX;
		static const X64Reg bufwArgReg = RCX;
#endif
		static const X64Reg resultReg = RAX;

		X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
		X64Reg vReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
		// Otherwise, we'll overwrite them...
		_assert_(level1 || (uReg == XMM0 && vReg == XMM1));

		// Extract lane off/4 of u/v into the call's argument registers.
		if (cpu_info.bSSE4_1) {
			PEXTRD(R(uArgReg), uReg, off / 4);
			PEXTRD(R(vArgReg), vReg, off / 4);
		} else {
			// No PEXTRD: pull out lane 0 and shift the next lane down for later calls.
			MOVD_xmm(R(uArgReg), uReg);
			MOVD_xmm(R(vArgReg), vReg);
			PSRLDQ(uReg, 4);
			PSRLDQ(vReg, 4);
		}
		regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
		regCache_.Unlock(vReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);

		X64Reg indexReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
		if (cpu_info.bSSE4_1) {
			PEXTRD(R(srcArgReg), indexReg, off / 4);
		} else {
			MOVD_xmm(R(srcArgReg), indexReg);
			PSRLDQ(indexReg, 4);
		}
		regCache_.Unlock(indexReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);

		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		ADD(64, R(srcArgReg), MDisp(srcReg, level1 ? 8 : 0));
		MOVZX(32, 16, bufwArgReg, MDisp(bufwReg, level1 ? 2 : 0));
		// Leave level/levelFrac, we just always load from RAM on Windows and lock on POSIX.
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);

		CALL(nearest);

		// The helper returns the texel in RAX; insert it into lane off/4.
		X64Reg vecResultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
		if (cpu_info.bSSE4_1) {
			PINSRD(vecResultReg, R(resultReg), off / 4);
		} else if (off == 0) {
			MOVD_xmm(vecResultReg, R(resultReg));
		} else {
			X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
			MOVD_xmm(tempReg, R(resultReg));
			PSLLDQ(tempReg, off);
			POR(vecResultReg, R(tempReg));
			regCache_.Release(tempReg, RegCache::VEC_TEMP0);
		}
		regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
	};

	if (nearest != nullptr) {
		Describe("Calls");
		doNearestCall(0, false);
		doNearestCall(4, false);
		doNearestCall(8, false);
		doNearestCall(12, false);

		// After doing the calls, certain cached things aren't safe.
		if (regCache_.Has(RegCache::GEN_ID))
			regCache_.ForceRelease(RegCache::GEN_ID);
		if (regCache_.Has(RegCache::VEC_ZERO))
			regCache_.ForceRelease(RegCache::VEC_ZERO);
	} else {
		success = success && Jit_FetchQuad(id, false);
	}

	if (id.hasAnyMips) {
		// Fetch the level-1 quad too, but only when levelFrac is non-zero.
		Describe("MipsCalls");
		if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
			X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
			CMP(8, R(levelFracReg), Imm8(0));
			regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
		} else {
			CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
		}
		FixupBranch skip = J_CC(CC_Z, true);

		// Modify the level, so the new level value is used. We don't need the old.
		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
			ADD(32, R(levelReg), Imm8(1));
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
		} else {
			// It's fine to just modify this in place.
			ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
		}

		if (nearest != nullptr) {
			Describe("MipsCalls");
			doNearestCall(0, true);
			doNearestCall(4, true);
			doNearestCall(8, true);
			doNearestCall(12, true);
		} else {
			success = success && Jit_FetchQuad(id, true);
		}

		SetJumpTarget(skip);
	}

	// We're done with these now.
	if (nearest != nullptr) {
		regCache_.ForceRelease(RegCache::VEC_ARG_U);
		regCache_.ForceRelease(RegCache::VEC_ARG_V);
		regCache_.ForceRelease(RegCache::VEC_INDEX);
	}
	if (regCache_.Has(RegCache::VEC_INDEX1))
		regCache_.ForceRelease(RegCache::VEC_INDEX1);
	if (regCache_.Has(RegCache::VEC_U1))
		regCache_.ForceRelease(RegCache::VEC_U1);
	if (regCache_.Has(RegCache::VEC_V1))
		regCache_.ForceRelease(RegCache::VEC_V1);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
	if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);

	success = success && Jit_DecodeQuad(id, false);
	success = success && Jit_BlendQuad(id, false);
	if (id.hasAnyMips) {
		// Blend the two level results: (c0 * (16 - frac) + c1 * frac) >> 4.
		Describe("BlendMips");
		if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
			X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
			MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
			regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
			regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
		}

		X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
		CMP(8, R(levelFracReg), Imm8(0));
		FixupBranch skip = J_CC(CC_Z, true);

		success = success && Jit_DecodeQuad(id, true);
		success = success && Jit_BlendQuad(id, true);

		Describe("BlendMips");
		// First, broadcast the levelFrac value into an XMM.
		X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		MOVD_xmm(fracReg, R(levelFracReg));
		PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
		regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);

		// Multiply level1 color by the fraction.
		X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
		PMULLW(color1Reg, R(fracReg));

		// Okay, next we need an inverse for color 0.
		X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		MOVDQA(invFracReg, M(const10All16_));
		PSUBW(invFracReg, R(fracReg));

		// And multiply.
		PMULLW(XMM0, R(invFracReg));
		regCache_.Release(fracReg, RegCache::VEC_TEMP0);
		regCache_.Release(invFracReg, RegCache::VEC_TEMP1);

		// Okay, now sum and divide by 16 (which is what the fraction maxed at.)
		PADDW(XMM0, R(color1Reg));
		PSRLW(XMM0, 4);

		// And now we're done with color1Reg/VEC_RESULT1.
		regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
		regCache_.ForceRelease(RegCache::VEC_RESULT1);

		SetJumpTarget(skip);
	}

	if (regCache_.Has(RegCache::VEC_FRAC))
		regCache_.ForceRelease(RegCache::VEC_FRAC);

	// Finally, it's time to apply the texture function.
	success = success && Jit_ApplyTextureFunc(id);

	// Last of all, convert to 32-bit channels.
	Describe("Init");
	if (cpu_info.bSSE4_1) {
		PMOVZXWD(XMM0, R(XMM0));
	} else {
		X64Reg zeroReg = GetZeroVec();
		PUNPCKLWD(XMM0, R(zeroReg));
		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
	}

	regCache_.ForceRelease(RegCache::VEC_RESULT);
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.ForceRelease(RegCache::GEN_ARG_ID);

	if (!success) {
		// Failure: roll back to before the nearest helper too, if we wrote one.
		regCache_.Reset(false);
		EndWrite();
		ResetCodePtr(GetOffset(nearest ? nearest : linearResetPos));
		ERROR_LOG(Log::G3D, "Failed to compile linear %s", DescribeSamplerID(id).c_str());
		return nullptr;
	}

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	const u8 *start = WriteFinalizedEpilog();
	regCache_.Reset(true);
	return (LinearFunc)start;
}

// Emits the SSE constant pool used by the compiled samplers.  Shared constants
// are only written once (their member pointers are checked for nullptr first);
// the width/height constants at the end depend on the SamplerID.
void SamplerJitCache::WriteConstantPool(const SamplerID &id) {
	// We reuse constants in any pool, because our code space is small.
	WriteSimpleConst8x16(const10All16_, 0x10);
	WriteSimpleConst16x8(const10All8_, 0x10);

	if (const10Low_ == nullptr) {
		// Four 16-bit 0x10 values in the low half, zeroes in the high half.
		const10Low_ = AlignCode16();
		for (int i = 0; i < 4; ++i)
			Write16(0x10);
		for (int i = 0; i < 4; ++i)
			Write16(0);
	}

	WriteSimpleConst4x32(constOnes32_, 1);
	WriteSimpleConst8x16(constOnes16_, 1);
	// This is the mask for clamp or wrap, the max texel in the S or T direction.
	WriteSimpleConst4x32(constMaxTexel32_, 511);

	if (constUNext_ == nullptr) {
		// Per-lane U offsets for a 2x2 quad: 0, 1, 0, 1.
		constUNext_ = AlignCode16();
		Write32(0); Write32(1); Write32(0); Write32(1);
	}

	if (constVNext_ == nullptr) {
		// Per-lane V offsets for a 2x2 quad: 0, 0, 1, 1.
		constVNext_ = AlignCode16();
		Write32(0); Write32(0); Write32(1); 
Write32(1);
	}

	WriteSimpleConst4x32(const5551Swizzle_, 0x00070707);
	WriteSimpleConst4x32(const5650Swizzle_, 0x00070307);

	// These are unique to the sampler ID.
	if (!id.hasAnyMips) {
		float w256f = (1 << id.width0Shift) * 256;
		float h256f = (1 << id.height0Shift) * 256;
		constWidthHeight256f_ = AlignCode16();
		// NOTE(review): type-punning float bits via a pointer cast is technically
		// UB under strict aliasing; memcpy into a uint32_t would be safer.
		Write32(*(uint32_t *)&w256f);
		Write32(*(uint32_t *)&h256f);
		Write32(*(uint32_t *)&w256f);
		Write32(*(uint32_t *)&h256f);

		// 511 matches constMaxTexel32_ above, the maximum texel coordinate.
		WriteDynamicConst4x32(constWidthMinus1i_, id.width0Shift > 9 ? 511 : (1 << id.width0Shift) - 1);
		WriteDynamicConst4x32(constHeightMinus1i_, id.height0Shift > 9 ? 511 : (1 << id.height0Shift) - 1);
	} else {
		constWidthHeight256f_ = nullptr;
		constWidthMinus1i_ = nullptr;
		constHeightMinus1i_ = nullptr;
	}
}

// Returns a locked register holding the sampler ID argument: the ABI arg reg
// when available, otherwise a GEN_ID reg loaded (once) from the stack slot at
// stackArgPos_ + stackIDOffset_.  Release with UnlockSamplerID().
RegCache::Reg SamplerJitCache::GetSamplerID() {
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		return regCache_.Find(RegCache::GEN_ARG_ID);
	if (!regCache_.Has(RegCache::GEN_ID)) {
		X64Reg r = regCache_.Alloc(RegCache::GEN_ID);
		// stackIDOffset_ == -1 means there is no stack copy to load from.
		_assert_(stackIDOffset_ != -1);
		MOV(PTRBITS, R(r), MDisp(RSP, stackArgPos_ + stackIDOffset_));
		return r;
	}
	return regCache_.Find(RegCache::GEN_ID);
}

// Unlocks a register previously returned by GetSamplerID().
void SamplerJitCache::UnlockSamplerID(RegCache::Reg &r) {
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.Unlock(r, RegCache::GEN_ARG_ID);
	else
		regCache_.Unlock(r, RegCache::GEN_ID);
}

// Emits code fetching four texels' raw data for the given format, dispatching
// to the per-format quad helpers (and CLUT transform/lookup for CLUT formats.)
// Returns false for formats without a SIMD path (DXT uses the nearest helper.)
bool SamplerJitCache::Jit_FetchQuad(const SamplerID &id, bool level1) {
	bool success = true;
	switch (id.TexFmt()) {
	case GE_TFMT_5650:
	case GE_TFMT_5551:
	case GE_TFMT_4444:
		success = Jit_GetDataQuad(id, level1, 16);
		// Mask away the high bits, if loaded via AVX2.
		if (cpu_info.bAVX2) {
			X64Reg destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
			PSLLD(destReg, 16);
			PSRLD(destReg, 16);
			regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
		}
		break;

	case GE_TFMT_8888:
		success = Jit_GetDataQuad(id, level1, 32);
		break;

	case GE_TFMT_CLUT32:
		success = Jit_GetDataQuad(id, level1, 32);
		if (success)
			success = Jit_TransformClutIndexQuad(id, 32);
		if (success)
			success = Jit_ReadClutQuad(id, level1);
		break;

	case GE_TFMT_CLUT16:
		success = Jit_GetDataQuad(id, level1, 16);
		if (success)
			success = Jit_TransformClutIndexQuad(id, 16);
		if (success)
			success = Jit_ReadClutQuad(id, level1);
		break;

	case GE_TFMT_CLUT8:
		success = Jit_GetDataQuad(id, level1, 8);
		if (success)
			success = Jit_TransformClutIndexQuad(id, 8);
		if (success)
			success = Jit_ReadClutQuad(id, level1);
		break;

	case GE_TFMT_CLUT4:
		success = Jit_GetDataQuad(id, level1, 4);
		if (success)
			success = Jit_TransformClutIndexQuad(id, 4);
		if (success)
			success = Jit_ReadClutQuad(id, level1);
		break;

	case GE_TFMT_DXT1:
	case GE_TFMT_DXT3:
	case GE_TFMT_DXT5:
		// No SIMD version currently, should use nearest helper path.
		success = false;
		break;

	default:
		success = false;
	}

	return success;
}

// Emits code gathering four texels' data (bitsPerTexel wide each) into one XMM,
// using VPGATHERDD when AVX2 is usable, else per-lane inserts.
bool SamplerJitCache::Jit_GetDataQuad(const SamplerID &id, bool level1, int bitsPerTexel) {
	Describe("DataQuad");
	bool success = true;

	X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
	MOV(64, R(baseReg), MDisp(srcReg, level1 ? 8 : 0));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);

	// CLUT formats produce indices (VEC_INDEX); others produce colors directly.
	X64Reg destReg = INVALID_REG;
	if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32)
		destReg = regCache_.Alloc(RegCache::VEC_INDEX);
	else if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
		destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
	else
		destReg = regCache_.Alloc(level1 ? 
RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);10441045X64Reg byteOffsetReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);1046if (cpu_info.bAVX2 && id.overReadSafe) {1047// We have to set a mask for which values to load. Load all 4.1048// Note this is overwritten with zeroes by the gather instruction.1049X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);1050PCMPEQD(maskReg, R(maskReg));1051VPGATHERDD(128, destReg, MComplex(baseReg, byteOffsetReg, SCALE_1, 0), maskReg);1052regCache_.Release(maskReg, RegCache::VEC_TEMP0);1053} else {1054if (bitsPerTexel != 32)1055PXOR(destReg, R(destReg));10561057// Grab each value separately... try to use the right memory access size.1058X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);1059if (cpu_info.bSSE4_1) {1060for (int i = 0; i < 4; ++i) {1061PEXTRD(R(temp2Reg), byteOffsetReg, i);1062if (bitsPerTexel <= 8)1063PINSRB(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 4);1064else if (bitsPerTexel == 16)1065PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);1066else if (bitsPerTexel == 32)1067PINSRD(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i);1068}1069} else {1070for (int i = 0; i < 4; ++i) {1071MOVD_xmm(R(temp2Reg), byteOffsetReg);1072if (i != 3)1073PSRLDQ(byteOffsetReg, 4);1074if (bitsPerTexel <= 8) {1075MOVZX(32, 8, temp2Reg, MComplex(baseReg, temp2Reg, SCALE_1, 0));1076PINSRW(destReg, R(temp2Reg), i * 2);1077} else if (bitsPerTexel == 16) {1078PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);1079} else if (bitsPerTexel == 32) {1080if (i == 0) {1081MOVD_xmm(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0));1082} else {1083// Maybe a temporary would be better, but this path should be rare.1084PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);1085PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 2), i * 2 + 1);1086}1087}1088}1089}1090regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);1091}1092regCache_.Unlock(byteOffsetReg, level1 ? 
RegCache::VEC_V1 : RegCache::VEC_ARG_V);1093regCache_.ForceRelease(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);1094regCache_.Release(baseReg, RegCache::GEN_ARG_TEXPTR);10951096if (bitsPerTexel == 4) {1097// Take only lowest bit, multiply by 4 with shifting.1098X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);1099// Next, shift away based on the odd U bits.1100if (cpu_info.bAVX2) {1101// This is really convenient with AVX. Just make the bit into a shift amount.1102PSLLD(uReg, 31);1103PSRLD(uReg, 29);1104VPSRLVD(128, destReg, destReg, R(uReg));1105} else {1106// This creates a mask - FFFFFFFF to shift, zero otherwise.1107PSLLD(uReg, 31);1108PSRAD(uReg, 31);11091110X64Reg unshiftedReg = regCache_.Alloc(RegCache::VEC_TEMP0);1111MOVDQA(unshiftedReg, R(destReg));1112PSRLD(destReg, 4);1113// Mask destReg (shifted) and reverse uReg to unshifted masked.1114PAND(destReg, R(uReg));1115PANDN(uReg, R(unshiftedReg));1116// Now combine.1117POR(destReg, R(uReg));1118regCache_.Release(unshiftedReg, RegCache::VEC_TEMP0);1119}1120regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);1121regCache_.ForceRelease(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);1122}11231124if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32) {1125regCache_.Unlock(destReg, RegCache::VEC_INDEX);1126} else {1127regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1128regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1129}11301131return success;1132}11331134bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) {1135Describe("TrCLUTQuad");1136GEPaletteFormat fmt = id.ClutFmt();1137if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {1138// This is simple - just mask.1139X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);1140// Mask to 8 bits for CLUT8/16/32, 4 bits for CLUT4.1141PSLLD(indexReg, bitsPerIndex >= 8 ? 
24 : 28);1142PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);1143regCache_.Unlock(indexReg, RegCache::VEC_INDEX);11441145return true;1146}11471148X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);1149bool maskedIndex = false;11501151// Okay, first load the actual samplerID clutformat bits we'll use.1152X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0);1153X64Reg idReg = GetSamplerID();1154if (cpu_info.bAVX2 && !id.hasClutShift)1155VPBROADCASTD(128, formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));1156else1157MOVD_xmm(formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));1158UnlockSamplerID(idReg);11591160// Shift = (clutformat >> 2) & 0x1F1161if (id.hasClutShift) {1162// Before shifting, let's mask if needed (we always read 32 bits.)1163// We have to do this here, because the bits should be zero even if F is used as a mask.1164if (bitsPerIndex < 32) {1165PSLLD(indexReg, 32 - bitsPerIndex);1166PSRLD(indexReg, 32 - bitsPerIndex);1167maskedIndex = true;1168}11691170X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);1171// Shift against walls to get 5 bits after the rightmost 2.1172PSLLD(shiftReg, formatReg, 32 - 7);1173PSRLD(shiftReg, 32 - 5);1174// The other lanes are zero, so we can use PSRLD.1175PSRLD(indexReg, R(shiftReg));1176regCache_.Release(shiftReg, RegCache::VEC_TEMP1);1177}11781179// With shifting done, we need the format in each lane.1180if (!cpu_info.bAVX2 || id.hasClutShift)1181PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0));11821183// Mask = (clutformat >> 8) & 0xFF1184if (id.hasClutMask) {1185X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);1186// If it was CLUT4, grab only 4 bits of the mask.1187PSLLD(maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);1188PSRLD(maskReg, bitsPerIndex == 4 ? 
28 : 24);11891190PAND(indexReg, R(maskReg));1191regCache_.Release(maskReg, RegCache::VEC_TEMP1);1192} else if (!maskedIndex || bitsPerIndex > 8) {1193// Apply the fixed 8 bit mask (or the CLUT4 mask if we didn't shift.)1194PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);1195PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);1196}11971198// Offset = (clutformat >> 12) & 0x01F01199if (id.hasClutOffset) {1200// Use walls to extract the 5 bits at 16, and then put them shifted left by 4.1201int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 4 : 5;1202PSRLD(formatReg, 16);1203PSLLD(formatReg, 32 - offsetBits);1204PSRLD(formatReg, 32 - offsetBits - 4);12051206POR(indexReg, R(formatReg));1207}12081209regCache_.Release(formatReg, RegCache::VEC_TEMP0);1210regCache_.Unlock(indexReg, RegCache::VEC_INDEX);1211return true;1212}12131214bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {1215Describe("ReadCLUTQuad");1216X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);12171218if (!id.useSharedClut) {1219X64Reg vecLevelReg = regCache_.Alloc(RegCache::VEC_TEMP0);12201221if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {1222X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);1223MOVD_xmm(vecLevelReg, R(levelReg));1224regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);1225} else {1226#if PPSSPP_PLATFORM(WINDOWS)1227if (cpu_info.bAVX2) {1228VPBROADCASTD(128, vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));1229} else {1230MOVD_xmm(vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));1231PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));1232}1233#else1234_assert_(false);1235#endif1236}12371238// Now we multiply by 16, and add.1239PSLLD(vecLevelReg, 4);1240PADDD(indexReg, R(vecLevelReg));1241regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);1242}12431244X64Reg idReg = GetSamplerID();1245X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);1246MOV(PTRBITS, R(clutBaseReg), MDisp(idReg, offsetof(SamplerID, 
cached.clut)));1247UnlockSamplerID(idReg);12481249X64Reg resultReg = INVALID_REG;1250if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))1251resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1252else1253resultReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1254X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);1255if (cpu_info.bAVX2 && id.overReadSafe)1256PCMPEQD(maskReg, R(maskReg));12571258switch (id.ClutFmt()) {1259case GE_CMODE_16BIT_BGR5650:1260case GE_CMODE_16BIT_ABGR5551:1261case GE_CMODE_16BIT_ABGR4444:1262if (cpu_info.bAVX2 && id.overReadSafe) {1263VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_2, 0), maskReg);1264// Clear out the top 16 bits.1265PCMPEQD(maskReg, R(maskReg));1266PSRLD(maskReg, 16);1267PAND(resultReg, R(maskReg));1268} else {1269PXOR(resultReg, R(resultReg));12701271X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);1272if (cpu_info.bSSE4_1) {1273for (int i = 0; i < 4; ++i) {1274PEXTRD(R(temp2Reg), indexReg, i);1275PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);1276}1277} else {1278for (int i = 0; i < 4; ++i) {1279MOVD_xmm(R(temp2Reg), indexReg);1280if (i != 3)1281PSRLDQ(indexReg, 4);1282PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);1283}1284}1285regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);1286}1287break;12881289case GE_CMODE_32BIT_ABGR8888:1290if (cpu_info.bAVX2 && id.overReadSafe) {1291VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_4, 0), maskReg);1292} else {1293X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);1294if (cpu_info.bSSE4_1) {1295for (int i = 0; i < 4; ++i) {1296PEXTRD(R(temp2Reg), indexReg, i);1297PINSRD(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0), i);1298}1299} else {1300for (int i = 0; i < 4; ++i) {1301MOVD_xmm(R(temp2Reg), indexReg);1302if (i != 3)1303PSRLDQ(indexReg, 4);13041305if (i == 0) {1306MOVD_xmm(resultReg , 
MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));1307} else {1308MOVD_xmm(maskReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));1309PSLLDQ(maskReg, 4 * i);1310POR(resultReg, R(maskReg));1311}1312}1313}1314regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);1315}1316break;1317}1318regCache_.Release(maskReg, RegCache::VEC_TEMP0);1319regCache_.Unlock(resultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1320regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);13211322regCache_.Release(clutBaseReg, RegCache::GEN_TEMP1);1323regCache_.Release(indexReg, RegCache::VEC_INDEX);1324return true;1325}13261327bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {1328Describe(level1 ? "BlendQuadMips" : "BlendQuad");13291330if (cpu_info.bSSE4_1 && cpu_info.bSSSE3) {1331// Let's start by rearranging from TL TR BL BR like this:1332// ABCD EFGH IJKL MNOP -> AI BJ CK DL EM FN GO HP -> AIEM BJFN CKGO DLHP1333// This way, all the RGBAs are next to each other, and in order TL BL TR BR.1334X64Reg quadReg = regCache_.Find(level1 ? 
RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1335X64Reg tempArrangeReg = regCache_.Alloc(RegCache::VEC_TEMP0);1336PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));1337PUNPCKLBW(quadReg, R(tempArrangeReg));1338// Okay, that's top and bottom interleaved, now for left and right.1339PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));1340PUNPCKLWD(quadReg, R(tempArrangeReg));1341regCache_.Release(tempArrangeReg, RegCache::VEC_TEMP0);13421343// Next up, we want to multiply and add using a repeated TB frac pair.1344// That's (0x10 - frac_v) in byte 1, frac_v in byte 2, repeating.1345X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);1346X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);1347X64Reg zeroReg = GetZeroVec();1348if (level1) {1349PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));1350} else {1351PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));1352}1353PSHUFB(fracReg, R(zeroReg));1354regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1355regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);13561357// Now, inverse fracReg, then interleave into the actual multiplier.1358// This gives us the repeated TB pairs we wanted.1359X64Reg multTBReg = regCache_.Alloc(RegCache::VEC_TEMP1);1360MOVDQA(multTBReg, M(const10All8_));1361PSUBB(multTBReg, R(fracReg));1362PUNPCKLBW(multTBReg, R(fracReg));1363regCache_.Release(fracReg, RegCache::VEC_TEMP0);13641365// Now we can multiply and add paired lanes in one go.1366// Note that since T+B=0x10, this gives us exactly 12 bits.1367PMADDUBSW(quadReg, R(multTBReg));1368regCache_.Release(multTBReg, RegCache::VEC_TEMP1);13691370// With that done, we need to multiply by LR, or rather 0L0R, and sum again.1371// Since RRRR was all next to each other, this gives us a clean total R.1372fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);1373allFracReg = regCache_.Find(RegCache::VEC_FRAC);1374if (level1) {1375PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));1376} else {1377// We can ignore the high bits, 
since we'll interleave those away anyway.1378PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));1379}1380regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);13811382// Again, we're inversing into an interleaved multiplier. L is the inversed one.1383// 0L0R is (0x10 - frac_u), frac_u - 2x16 repeated four times.1384X64Reg multLRReg = regCache_.Alloc(RegCache::VEC_TEMP1);1385MOVDQA(multLRReg, M(const10All16_));1386PSUBW(multLRReg, R(fracReg));1387PUNPCKLWD(multLRReg, R(fracReg));1388regCache_.Release(fracReg, RegCache::VEC_TEMP0);13891390// This gives us RGBA as dwords, but they're all shifted left by 8 from the multiplies.1391PMADDWD(quadReg, R(multLRReg));1392PSRLD(quadReg, 8);1393regCache_.Release(multLRReg, RegCache::VEC_TEMP1);13941395// Shrink to 16-bit, it's more convenient for later.1396if (level1) {1397PACKSSDW(quadReg, R(quadReg));1398regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);1399} else {1400if (cpu_info.bAVX) {1401VPACKSSDW(128, XMM0, quadReg, R(quadReg));1402} else {1403PACKSSDW(quadReg, R(quadReg));1404MOVDQA(XMM0, R(quadReg));1405}1406regCache_.Unlock(quadReg, RegCache::VEC_RESULT);14071408regCache_.ForceRelease(RegCache::VEC_RESULT);1409bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);1410_assert_msg_(changeSuccess, "Unexpected reg locked as destReg");1411}1412} else {1413X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);1414X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);14151416X64Reg quadReg = regCache_.Find(level1 ? 
RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);1417X64Reg zeroReg = GetZeroVec();1418PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));1419PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));1420PUNPCKLBW(topReg, R(zeroReg));1421PUNPCKLBW(bottomReg, R(zeroReg));1422regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1423if (!level1) {1424regCache_.Unlock(quadReg, RegCache::VEC_RESULT);1425regCache_.ForceRelease(RegCache::VEC_RESULT);1426}14271428// Grab frac_u and spread to lower (L) lanes.1429X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);1430X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);1431X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);1432if (level1) {1433PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));1434} else {1435PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));1436}1437regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);1438// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.1439MOVDQA(fracMulReg, M(const10Low_));1440PSUBW(fracMulReg, R(fracReg));1441// Then we just put the original frac_u in the upper bits.1442PUNPCKLQDQ(fracMulReg, R(fracReg));1443regCache_.Release(fracReg, RegCache::VEC_TEMP2);14441445// Okay, we have 8-bits in the top and bottom rows for the color.1446// Multiply by frac to get 12, which we keep for the next stage.1447PMULLW(topReg, R(fracMulReg));1448PMULLW(bottomReg, R(fracMulReg));1449regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);14501451// Time for frac_v. 
This time, we want it in all 8 lanes.1452fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);1453allFracReg = regCache_.Find(RegCache::VEC_FRAC);1454X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);1455if (level1) {1456PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));1457} else {1458PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));1459}1460PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));1461regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);14621463// Now, inverse fracReg into fracTopReg for the top row.1464MOVDQA(fracTopReg, M(const10All16_));1465PSUBW(fracTopReg, R(fracReg));14661467// We had 12, plus 4 frac, that gives us 16.1468PMULLW(bottomReg, R(fracReg));1469PMULLW(topReg, R(fracTopReg));1470regCache_.Release(fracReg, RegCache::VEC_TEMP2);1471regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);14721473// Finally, time to sum them all up and divide by 256 to get back to 8 bits.1474PADDUSW(bottomReg, R(topReg));1475regCache_.Release(topReg, RegCache::VEC_TEMP0);14761477if (level1) {1478PSHUFD(quadReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));1479PADDUSW(quadReg, R(bottomReg));1480PSRLW(quadReg, 8);1481regCache_.Release(bottomReg, RegCache::VEC_TEMP1);1482regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);1483} else {1484bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);1485if (!changeSuccess) {1486_assert_msg_(XMM0 == bottomReg, "Unexpected other reg locked as destReg");1487X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);1488PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));1489PADDUSW(bottomReg, R(otherReg));1490regCache_.Release(otherReg, RegCache::VEC_TEMP0);1491regCache_.Release(bottomReg, RegCache::VEC_TEMP1);14921493// Okay, now it can be changed.1494regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);1495} else {1496PSHUFD(XMM0, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));1497PADDUSW(XMM0, R(bottomReg));1498regCache_.Release(bottomReg, RegCache::VEC_TEMP1);1499}15001501PSRLW(XMM0, 8);1502}1503}15041505return 
true;1506}15071508bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {1509X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT);1510X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);1511X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);15121513auto useAlphaFrom = [&](X64Reg alphaColorReg) {1514if (cpu_info.bSSE4_1) {1515// Copy only alpha.1516PBLENDW(resultReg, R(alphaColorReg), 0x08);1517} else {1518PSRLDQ(alphaColorReg, 6);1519PSLLDQ(alphaColorReg, 6);1520// Zero out the result alpha and OR them together.1521PSLLDQ(resultReg, 10);1522PSRLDQ(resultReg, 10);1523POR(resultReg, R(alphaColorReg));1524}1525};15261527// Note: color is in DWORDs, but result is in WORDs.1528switch (id.TexFunc()) {1529case GE_TEXFUNC_MODULATE:1530Describe("Modulate");1531PACKSSDW(primColorReg, R(primColorReg));1532if (cpu_info.bAVX) {1533VPADDW(128, tempReg, primColorReg, M(constOnes16_));15341535// Okay, time to multiply. This produces 16 bits, neatly.1536VPMULLW(128, resultReg, tempReg, R(resultReg));1537} else {1538MOVDQA(tempReg, M(constOnes16_));1539PADDW(tempReg, R(primColorReg));15401541PMULLW(resultReg, R(tempReg));1542}15431544if (id.useColorDoubling)1545PSRLW(resultReg, 7);1546else1547PSRLW(resultReg, 8);15481549if (!id.useTextureAlpha) {1550useAlphaFrom(primColorReg);1551} else if (id.useColorDoubling) {1552// We still need to finish dividing alpha, it's currently doubled (from the 7 above.)1553PSRLW(primColorReg, resultReg, 1);1554useAlphaFrom(primColorReg);1555}1556break;15571558case GE_TEXFUNC_DECAL:1559Describe("Decal");1560PACKSSDW(primColorReg, R(primColorReg));1561if (id.useTextureAlpha) {1562// Get alpha into the tempReg.1563PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3));1564PADDW(resultReg, M(constOnes16_));1565PMULLW(resultReg, R(tempReg));15661567X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1);1568// Materialize some 255s, and subtract out alpha.1569PCMPEQD(invAlphaReg, R(invAlphaReg));1570PSRLW(invAlphaReg, 
8);1571PSUBW(invAlphaReg, R(tempReg));15721573MOVDQA(tempReg, R(primColorReg));1574PADDW(tempReg, M(constOnes16_));1575PMULLW(tempReg, R(invAlphaReg));1576regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1);15771578// Now sum, and divide.1579PADDW(resultReg, R(tempReg));1580if (id.useColorDoubling)1581PSRLW(resultReg, 7);1582else1583PSRLW(resultReg, 8);1584} else if (id.useColorDoubling) {1585PSLLW(resultReg, 1);1586}1587useAlphaFrom(primColorReg);1588break;15891590case GE_TEXFUNC_BLEND:1591{1592Describe("EnvBlend");1593PACKSSDW(primColorReg, R(primColorReg));15941595// First off, let's grab the color value.1596X64Reg idReg = GetSamplerID();1597X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1);1598if (cpu_info.bSSE4_1) {1599PMOVZXBW(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));1600} else {1601MOVD_xmm(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));1602X64Reg zeroReg = GetZeroVec();1603PUNPCKLBW(texEnvReg, R(zeroReg));1604regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1605}1606UnlockSamplerID(idReg);16071608// Now merge in the prim color so we have them interleaved, texenv low.1609PUNPCKLWD(texEnvReg, R(primColorReg));16101611// Okay, now materialize 255 for inversing resultReg and rounding.1612PCMPEQD(tempReg, R(tempReg));1613PSRLW(tempReg, 8);16141615// If alpha is used, we want the roundup and factor to be zero.1616if (id.useTextureAlpha)1617PSRLDQ(tempReg, 10);16181619// We're going to lose tempReg, so save the 255s.1620X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP2);1621MOVDQA(roundValueReg, R(tempReg));16221623// Okay, now inverse, then merge with resultReg low to match texenv low.1624PSUBUSW(tempReg, R(resultReg));1625PUNPCKLWD(resultReg, R(tempReg));16261627if (id.useTextureAlpha) {1628// Before we multiply, let's include alpha in that multiply.1629PADDW(primColorReg, M(constOnes16_));1630// Mask off everything but alpha, and move to the second highest short.1631PSRLDQ(primColorReg, 
6);1632PSLLDQ(primColorReg, 12);1633// Now simply merge in with texenv.1634POR(texEnvReg, R(primColorReg));1635}16361637// Alright, now to multiply and add all in one go. Note this gives us DWORDs.1638PMADDWD(resultReg, R(texEnvReg));1639regCache_.Release(texEnvReg, RegCache::VEC_TEMP1);16401641// Now convert back to 16 bit and add the 255s for rounding.1642if (cpu_info.bSSE4_1) {1643PACKUSDW(resultReg, R(resultReg));1644} else {1645PSLLD(resultReg, 16);1646PSRAD(resultReg, 16);1647PACKSSDW(resultReg, R(resultReg));1648}1649PADDW(resultReg, R(roundValueReg));1650regCache_.Release(roundValueReg, RegCache::VEC_TEMP2);16511652// Okay, divide by 256 or 128 depending on doubling (we want to preserve the precision.)1653if (id.useColorDoubling && id.useTextureAlpha) {1654// If doubling, we want to still divide alpha by 256.1655PSRLW(resultReg, 7);1656PSRLW(primColorReg, resultReg, 1);1657useAlphaFrom(primColorReg);1658} else if (id.useColorDoubling) {1659PSRLW(resultReg, 7);1660} else {1661PSRLW(resultReg, 8);1662}16631664if (!id.useTextureAlpha)1665useAlphaFrom(primColorReg);1666break;1667}16681669case GE_TEXFUNC_REPLACE:1670Describe("Replace");1671if (id.useColorDoubling && id.useTextureAlpha) {1672// We can abuse primColorReg as a temp.1673MOVDQA(primColorReg, R(resultReg));1674// Shift to zero out alpha in resultReg.1675PSLLDQ(resultReg, 10);1676PSRLDQ(resultReg, 10);1677// Now simply add them together, restoring alpha and doubling the colors.1678PADDW(resultReg, R(primColorReg));1679} else if (!id.useTextureAlpha) {1680if (id.useColorDoubling) {1681// Let's just double using shifting. 
Ignore alpha.1682PSLLW(resultReg, 1);1683}1684// Now we want prim_color in W, so convert, then shift-mask away the color.1685PACKSSDW(primColorReg, R(primColorReg));1686useAlphaFrom(primColorReg);1687}1688break;16891690case GE_TEXFUNC_ADD:1691case GE_TEXFUNC_UNKNOWN1:1692case GE_TEXFUNC_UNKNOWN2:1693case GE_TEXFUNC_UNKNOWN3:1694Describe("Add");1695PACKSSDW(primColorReg, R(primColorReg));1696if (id.useTextureAlpha) {1697MOVDQA(tempReg, M(constOnes16_));1698// Add and multiply the alpha (and others, but we'll mask them.)1699PADDW(tempReg, R(primColorReg));1700PMULLW(tempReg, R(resultReg));17011702// Now that we've extracted alpha, sum and double as needed.1703PADDW(resultReg, R(primColorReg));1704if (id.useColorDoubling)1705PSLLW(resultReg, 1);17061707// Divide by 256 to normalize alpha.1708PSRLW(tempReg, 8);1709useAlphaFrom(tempReg);1710} else {1711PADDW(resultReg, R(primColorReg));1712if (id.useColorDoubling)1713PSLLW(resultReg, 1);1714useAlphaFrom(primColorReg);1715}1716break;1717}17181719regCache_.Release(tempReg, RegCache::VEC_TEMP0);1720regCache_.Unlock(resultReg, RegCache::VEC_RESULT);1721regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);1722regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);1723return true;1724}17251726bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {1727GETextureFormat fmt = id.TexFmt();1728bool success = true;1729switch (fmt) {1730case GE_TFMT_5650:1731success = Jit_GetTexData(id, 16);1732if (success)1733success = Jit_Decode5650(id);1734break;17351736case GE_TFMT_5551:1737success = Jit_GetTexData(id, 16);1738if (success)1739success = Jit_Decode5551(id);1740break;17411742case GE_TFMT_4444:1743success = Jit_GetTexData(id, 16);1744if (success)1745success = Jit_Decode4444(id);1746break;17471748case GE_TFMT_8888:1749success = Jit_GetTexData(id, 32);1750break;17511752case GE_TFMT_CLUT32:1753success = Jit_GetTexData(id, 32);1754if (success)1755success = Jit_TransformClutIndex(id, 32);1756if (success)1757success = 
Jit_ReadClutColor(id);1758break;17591760case GE_TFMT_CLUT16:1761success = Jit_GetTexData(id, 16);1762if (success)1763success = Jit_TransformClutIndex(id, 16);1764if (success)1765success = Jit_ReadClutColor(id);1766break;17671768case GE_TFMT_CLUT8:1769success = Jit_GetTexData(id, 8);1770if (success)1771success = Jit_TransformClutIndex(id, 8);1772if (success)1773success = Jit_ReadClutColor(id);1774break;17751776case GE_TFMT_CLUT4:1777success = Jit_GetTexData(id, 4);1778if (success)1779success = Jit_TransformClutIndex(id, 4);1780if (success)1781success = Jit_ReadClutColor(id);1782break;17831784case GE_TFMT_DXT1:1785success = Jit_GetDXT1Color(id, 8, 255);1786break;17871788case GE_TFMT_DXT3:1789success = Jit_GetDXT1Color(id, 16, 0);1790if (success)1791success = Jit_ApplyDXTAlpha(id);1792break;17931794case GE_TFMT_DXT5:1795success = Jit_GetDXT1Color(id, 16, 0);1796if (success)1797success = Jit_ApplyDXTAlpha(id);1798break;17991800default:1801success = false;1802}18031804return success;1805}18061807// Note: afterward, srcReg points at the block, and uReg/vReg have offset into block.1808bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int alpha) {1809Describe("DXT1");1810// Like Jit_GetTexData, this gets the color into resultReg.1811// Note: color low bits are red, high bits are blue.1812_assert_msg_(blockSize == 8 || blockSize == 16, "Invalid DXT block size");18131814X64Reg colorIndexReg = INVALID_REG;1815if (!id.linear) {1816// First, we need to get the block's offset, which is:1817// blockPos = src + (v/4 * bufw/4 + u/4) * blockSize1818// We distribute the blockSize constant for convenience:1819// blockPos = src + (blockSize*v/4 * bufw/4 + blockSize*u/4)18201821// Copy u (we'll need it later), and round down to the nearest 4 after scaling.1822X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);1823X64Reg srcBaseReg = regCache_.Alloc(RegCache::GEN_TEMP0);1824LEA(32, srcBaseReg, MScaled(uReg, blockSize / 4, 0));1825AND(32, R(srcBaseReg), 
Imm32(blockSize == 8 ? ~7 : ~15));1826// Add in srcReg already, since we'll be multiplying soon.1827X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);1828ADD(64, R(srcBaseReg), R(srcReg));18291830X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);1831X64Reg srcOffsetReg = regCache_.Alloc(RegCache::GEN_TEMP1);1832LEA(32, srcOffsetReg, MScaled(vReg, blockSize / 4, 0));1833AND(32, R(srcOffsetReg), Imm32(blockSize == 8 ? ~7 : ~15));1834// Modify bufw in place and then multiply.1835X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);1836SHR(32, R(bufwReg), Imm8(2));1837IMUL(32, srcOffsetReg, R(bufwReg));1838regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);1839// We no longer need bufwReg.1840regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);18411842// And now let's chop off the offset for u and v.1843AND(32, R(uReg), Imm32(3));1844AND(32, R(vReg), Imm32(3));18451846// Okay, at this point srcBaseReg + srcOffsetReg = blockPos. To free up regs, put back in srcReg.1847LEA(64, srcReg, MRegSum(srcBaseReg, srcOffsetReg));1848regCache_.Release(srcBaseReg, RegCache::GEN_TEMP0);1849regCache_.Release(srcOffsetReg, RegCache::GEN_TEMP1);18501851// Make sure we don't grab this as colorIndexReg.1852if (uReg != ECX && !cpu_info.bBMI2)1853regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);18541855// The colorIndex is simply the 2 bits at blockPos + (v & 3), shifted right by (u & 3) twice.1856colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);1857MOVZX(32, 8, colorIndexReg, MRegSum(srcReg, vReg));1858regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);1859regCache_.Unlock(vReg, RegCache::GEN_ARG_V);1860// Only DXT3/5 need this reg later.1861if (id.TexFmt() == GE_TFMT_DXT1)1862regCache_.ForceRelease(RegCache::GEN_ARG_V);18631864if (uReg == ECX) {1865SHR(32, R(colorIndexReg), R(CL));1866SHR(32, R(colorIndexReg), R(CL));1867} else if (cpu_info.bBMI2) {1868SHRX(32, colorIndexReg, R(colorIndexReg), uReg);1869SHRX(32, colorIndexReg, R(colorIndexReg), uReg);1870} else {1871bool hasRCX = 
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);1872_assert_(hasRCX);1873LEA(32, ECX, MScaled(uReg, SCALE_2, 0));1874SHR(32, R(colorIndexReg), R(CL));1875}1876regCache_.Unlock(uReg, RegCache::GEN_ARG_U);1877// If DXT1, there's no alpha and we can toss this reg.1878if (id.TexFmt() == GE_TFMT_DXT1)1879regCache_.ForceRelease(RegCache::GEN_ARG_U);1880} else {1881// For linear, we already precalculated the block pos into srcReg.1882// uReg is the shift for the color index fomr the 32 bits of color index data.1883regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);1884// If we don't have alpha, we don't need vReg.1885if (id.TexFmt() == GE_TFMT_DXT1)1886regCache_.ForceRelease(RegCache::GEN_ARG_V);18871888// Make sure we don't grab this as colorIndexReg.1889X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);1890if (uReg != ECX && !cpu_info.bBMI2)1891regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);18921893// Shift and mask out the 2 bits we need into colorIndexReg.1894colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);1895X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);1896if (cpu_info.bBMI2) {1897SHRX(32, colorIndexReg, MatR(srcReg), uReg);1898} else {1899MOV(32, R(colorIndexReg), MatR(srcReg));1900if (uReg != RCX) {1901bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);1902_assert_(hasRCX);1903MOV(32, R(RCX), R(uReg));1904}1905SHR(32, R(colorIndexReg), R(CL));1906}1907regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);1908// We're done with U now.1909regCache_.Unlock(uReg, RegCache::GEN_ARG_U);1910regCache_.ForceRelease(RegCache::GEN_ARG_U);1911}19121913// Mask out the value.1914AND(32, R(colorIndexReg), Imm32(3));19151916X64Reg color1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);1917X64Reg color2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);1918X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);19191920// For colorIndex 0 or 1, we'll simply take the 565 color and convert.1921CMP(32, R(colorIndexReg), Imm32(1));1922FixupBranch handleSimple565 = 
J_CC(CC_BE);19231924// Otherwise, it depends if color1 or color2 is higher, so fetch them.1925X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);1926MOVZX(32, 16, color1Reg, MDisp(srcReg, 4));1927MOVZX(32, 16, color2Reg, MDisp(srcReg, 6));1928regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);19291930CMP(32, R(color1Reg), R(color2Reg));1931FixupBranch handleMix23 = J_CC(CC_A, true);19321933// If we're still here, then colorIndex is either 3 for 0 (easy) or 2 for 50% mix.1934XOR(32, R(resultReg), R(resultReg));1935CMP(32, R(colorIndexReg), Imm32(3));1936FixupBranch finishZero = J_CC(CC_E, true);19371938// At this point, resultReg, colorIndexReg, and maybe R12/R13 can be used as temps.1939// We'll add, then shift from 565 a bit less to "divide" by 2 for a 50/50 mix.19401941if (cpu_info.bBMI2_fast) {1942// Expand everything out to 0BGR at 8888, but halved.1943MOV(32, R(colorIndexReg), Imm32(0x007C7E7C));1944PDEP(32, color1Reg, color1Reg, R(colorIndexReg));1945PDEP(32, color2Reg, color2Reg, R(colorIndexReg));19461947// Now let's sum them together (this undoes our halving.)1948LEA(32, resultReg, MRegSum(color1Reg, color2Reg));19491950// Time to swap into order. Luckily we can ignore alpha.1951BSWAP(32, resultReg);1952SHR(32, R(resultReg), Imm8(8));1953} else {1954// We'll need more regs. 
Grab two more.1955PUSH(R12);1956PUSH(R13);19571958// Start with summing R, then shift into position.1959MOV(32, R(resultReg), R(color1Reg));1960AND(32, R(resultReg), Imm32(0x0000F800));1961MOV(32, R(colorIndexReg), R(color2Reg));1962AND(32, R(colorIndexReg), Imm32(0x0000F800));1963LEA(32, R12, MRegSum(resultReg, colorIndexReg));1964// The position is 9, instead of 8, due to doubling.1965SHR(32, R(R12), Imm8(9));19661967// For G, summing leaves it 4 right (doubling made it not need more.)1968MOV(32, R(resultReg), R(color1Reg));1969AND(32, R(resultReg), Imm32(0x000007E0));1970MOV(32, R(colorIndexReg), R(color2Reg));1971AND(32, R(colorIndexReg), Imm32(0x000007E0));1972LEA(32, resultReg, MRegSum(resultReg, colorIndexReg));1973SHL(32, R(resultReg), Imm8(5 - 1));1974// Now add G and R together.1975OR(32, R(resultReg), R(R12));19761977// At B, we're free to modify the regs in place, finally.1978AND(32, R(color1Reg), Imm32(0x0000001F));1979AND(32, R(color2Reg), Imm32(0x0000001F));1980LEA(32, colorIndexReg, MRegSum(color1Reg, color2Reg));1981// We shift left 2 into position (not 3 due to doubling), then 16 more into the B slot.1982SHL(32, R(colorIndexReg), Imm8(16 + 2));1983// And combine into the result.1984OR(32, R(resultReg), R(colorIndexReg));19851986POP(R13);1987POP(R12);1988}19891990FixupBranch finishMix50 = J(true);19911992// Simply load the 565 color, and convert to 0888.1993SetJumpTarget(handleSimple565);1994srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);1995MOVZX(32, 16, colorIndexReg, MComplex(srcReg, colorIndexReg, SCALE_2, 4));1996regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);1997// DXT1 is done with this reg.1998if (id.TexFmt() == GE_TFMT_DXT1)1999regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);20002001if (cpu_info.bBMI2_fast) {2002// We're only grabbing the high bits, no swizzle here.2003MOV(32, R(resultReg), Imm32(0x00F8FCF8));2004PDEP(32, resultReg, colorIndexReg, R(resultReg));2005BSWAP(32, resultReg);2006SHR(32, R(resultReg), Imm8(8));2007} else 
{2008// Start with R, shifting it into place.2009MOV(32, R(resultReg), R(colorIndexReg));2010AND(32, R(resultReg), Imm32(0x0000F800));2011SHR(32, R(resultReg), Imm8(8));20122013// Then take G and shift it too.2014MOV(32, R(color2Reg), R(colorIndexReg));2015AND(32, R(color2Reg), Imm32(0x000007E0));2016SHL(32, R(color2Reg), Imm8(5));2017// And now combine with R, shifting that in the process.2018OR(32, R(resultReg), R(color2Reg));20192020// Modify B in place and OR in.2021AND(32, R(colorIndexReg), Imm32(0x0000001F));2022SHL(32, R(colorIndexReg), Imm8(16 + 3));2023OR(32, R(resultReg), R(colorIndexReg));2024}2025FixupBranch finish565 = J(true);20262027// Here we'll mix color1 and color2 by 2/3 (which gets the 2 depends on colorIndexReg.)2028SetJumpTarget(handleMix23);20292030// If colorIndexReg is 2, it's color1Reg * 2 + color2Reg, but if colorIndexReg is 3, it's reversed.2031// Let's swap the regs in that case.2032CMP(32, R(colorIndexReg), Imm32(2));2033FixupBranch skipSwap23 = J_CC(CC_E);2034XCHG(32, R(color2Reg), R(color1Reg));2035SetJumpTarget(skipSwap23);20362037if (cpu_info.bBMI2_fast) {2038// Gather B, G, and R and space them apart by 14 or 15 bits.2039MOV(64, R(colorIndexReg), Imm64(0x00001F0003F0001FULL));2040PDEP(64, color1Reg, color1Reg, R(colorIndexReg));2041PDEP(64, color2Reg, color2Reg, R(colorIndexReg));2042LEA(64, resultReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));20432044// Now multiply all of them by a special constant to divide by 3.2045// This constant is (1 << 13) / 3, which is importantly less than 14 or 15.2046IMUL(64, resultReg, R(resultReg), Imm32(0x00000AAB));20472048// Now extract the BGR values to 8 bits each.2049// We subtract 3 from 13 to get 8 from 5 bits, then 2 from 20 + 13, and 3 from 40 + 13.2050MOV(64, R(colorIndexReg), Imm64((0xFFULL << 10) | (0xFFULL << 31) | (0xFFULL << 50)));2051PEXT(64, resultReg, resultReg, R(colorIndexReg));20522053// Finally swap B and R.2054BSWAP(32, resultReg);2055SHR(32, R(resultReg), Imm8(8));2056} 
else {2057// We'll need more regs. Grab two more to keep the stack aligned.2058PUSH(R12);2059PUSH(R13);20602061// Start off with R, adding together first...2062MOV(32, R(resultReg), R(color1Reg));2063AND(32, R(resultReg), Imm32(0x0000F800));2064MOV(32, R(colorIndexReg), R(color2Reg));2065AND(32, R(colorIndexReg), Imm32(0x0000F800));2066LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));2067// We'll overflow if we divide here, so shift into place already.2068SHR(32, R(resultReg), Imm8(8));2069// Now we divide that by 3, by actually multiplying by AAAB and shifting off.2070IMUL(32, R12, R(resultReg), Imm32(0x0000AAAB));2071// Now we SHR off the extra bits we added on.2072SHR(32, R(R12), Imm8(17));20732074// Now add up G. We leave this in place and shift right more.2075MOV(32, R(resultReg), R(color1Reg));2076AND(32, R(resultReg), Imm32(0x000007E0));2077MOV(32, R(colorIndexReg), R(color2Reg));2078AND(32, R(colorIndexReg), Imm32(0x000007E0));2079LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));2080// Again, multiply and now we use AAAB, this time masking.2081IMUL(32, resultReg, R(resultReg), Imm32(0x0000AAAB));2082SHR(32, R(resultReg), Imm8(17 - 5));2083AND(32, R(resultReg), Imm32(0x0000FF00));2084// Let's combine R in already.2085OR(32, R(resultReg), R(R12));20862087// Now for B, it starts in the lowest place so we'll need to mask.2088AND(32, R(color1Reg), Imm32(0x0000001F));2089AND(32, R(color2Reg), Imm32(0x0000001F));2090LEA(32, colorIndexReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));2091// Instead of shifting left, though, we multiply by a bit more.2092IMUL(32, colorIndexReg, R(colorIndexReg), Imm32(0x0002AAAB));2093AND(32, R(colorIndexReg), Imm32(0x00FF0000));2094OR(32, R(resultReg), R(colorIndexReg));20952096POP(R13);2097POP(R12);2098}20992100regCache_.Release(colorIndexReg, RegCache::GEN_TEMP0);2101regCache_.Release(color1Reg, RegCache::GEN_TEMP1);2102regCache_.Release(color2Reg, 
RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	SetJumpTarget(finishMix50);
	SetJumpTarget(finish565);
	// In all these cases, it's time to add in alpha. Zero doesn't get it.
	if (alpha != 0) {
		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		OR(32, R(resultReg), Imm32(alpha << 24));
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	}

	SetJumpTarget(finishZero);

	return true;
}

// Emits code that ORs the DXT3/DXT5 alpha bits into the result register.
// On entry, GEN_ARG_TEXPTR points at the DXT block; for id.linear, V holds a
// precalculated shift into the 64 bits of alpha data, otherwise U/V are texel
// offsets inside the block.  Success stays false (debug assert at the end) if
// the format is neither DXT3 nor DXT5.
bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
	GETextureFormat fmt = id.TexFmt();

	// At this point, srcReg points at the block, and u/v are offsets inside it.

	bool success = false;
	if (fmt == GE_TFMT_DXT3) {
		Describe("DXT3A");
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

		if (id.linear) {
			// We precalculated the shift for the 64 bits of alpha data in vReg.
			if (!cpu_info.bBMI2) {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			}

			if (cpu_info.bBMI2) {
				SHRX(64, srcReg, MDisp(srcReg, 8), vReg);
			} else {
				MOV(64, R(srcReg), MDisp(srcReg, 8));
				MOV(32, R(RCX), R(vReg));
				SHR(64, R(srcReg), R(CL));
			}
			// This will mask the 4 bits we want using a wall also.
			SHL(32, R(srcReg), Imm8(28));
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			OR(32, R(resultReg), R(srcReg));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

			success = true;
		} else {
			X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);

			if (uReg != RCX && !cpu_info.bBMI2) {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			}

			// DXT3 alpha: 4 bits per texel, 16 bits per row starting at offset 8.
			X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
			MOVZX(32, 16, temp1Reg, MComplex(srcReg, vReg, SCALE_2, 8));
			if (cpu_info.bBMI2) {
				LEA(32, uReg, MScaled(uReg, SCALE_4, 0));
				SHRX(32, temp1Reg, R(temp1Reg), uReg);
			} else {
				// Still depending on it being GEN_SHIFTVAL or GEN_ARG_U above.
				LEA(32, RCX, MScaled(uReg, SCALE_4, 0));
				SHR(32, R(temp1Reg), R(CL));
			}
			// Shift the 4 bits into the top so they become the alpha nibble.
			SHL(32, R(temp1Reg), Imm8(28));
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			OR(32, R(resultReg), R(temp1Reg));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
			regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);

			success = true;

			regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
		}

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
		regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
		regCache_.ForceRelease(RegCache::GEN_ARG_V);
	} else if (fmt == GE_TFMT_DXT5) {
		Describe("DXT5A");

		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		X64Reg alphaIndexReg = INVALID_REG;
		if (id.linear) {
			// We precalculated the shift for the 64 bits of alpha data in vReg.
			if (cpu_info.bBMI2) {
				alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
				SHRX(64, alphaIndexReg, MDisp(srcReg, 8), vReg);
			} else {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);

				MOV(64, R(alphaIndexReg), MDisp(srcReg, 8));
				MOV(32, R(RCX), R(vReg));
				SHR(64, R(alphaIndexReg), R(CL));
			}
			regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
			regCache_.ForceRelease(RegCache::GEN_ARG_V);
		} else {
			X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
			if (uReg != RCX && !cpu_info.bBMI2)
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
			alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);

			// Let's figure out the alphaIndex bit offset so we can read the right byte.
			// bitOffset = (u + v * 4) * 3;
			LEA(32, uReg, MComplex(uReg, vReg, SCALE_4, 0));
			LEA(32, uReg, MComplex(uReg, uReg, SCALE_2, 0));
			regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

			if (cpu_info.bBMI2) {
				SHRX(64, alphaIndexReg, MDisp(srcReg, 8), uReg);
			} else {
				// And now the byte offset and bit from there, from those.
				MOV(32, R(alphaIndexReg), R(uReg));
				SHR(32, R(alphaIndexReg), Imm8(3));
				AND(32, R(uReg), Imm32(7));

				// Load 16 bits and mask, in case it straddles bytes.
				MOVZX(32, 16, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8));
				// If not, it's in what was bufwReg.
				if (uReg != RCX) {
					_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
					MOV(32, R(RCX), R(uReg));
				}
				SHR(32, R(alphaIndexReg), R(CL));
			}
			regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
		}

		X64Reg alpha1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
		X64Reg alpha2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

		// Only the low 3 bits are the alpha index.
		AND(32, R(alphaIndexReg), Imm32(7));

		X64Reg temp3Reg = regCache_.Alloc(RegCache::GEN_TEMP3);

		// Okay, now check for 0 or 1 alphaIndex in alphaIndexReg, those are simple.
		CMP(32, R(alphaIndexReg), Imm32(1));
		FixupBranch handleSimple = J_CC(CC_BE, true);

		// Now load a1 and a2, since the rest depend on those values. Frees up srcReg.
		MOVZX(32, 8, alpha1Reg, MDisp(srcReg, 14));
		MOVZX(32, 8, alpha2Reg, MDisp(srcReg, 15));

		CMP(32, R(alpha1Reg), R(alpha2Reg));
		FixupBranch handleLerp8 = J_CC(CC_A);

		// Okay, check for zero or full alpha, at alphaIndex 6 or 7.
		CMP(32, R(alphaIndexReg), Imm32(6));
		FixupBranch finishZero = J_CC(CC_E, true);
		// Remember, MOV doesn't affect flags - finishFull still tests the CMP above.
		MOV(32, R(srcReg), Imm32(0xFF));
		FixupBranch finishFull = J_CC(CC_A, true);

		// At this point, we're handling a 6-step lerp between alpha1 and alpha2.
		SHL(32, R(alphaIndexReg), Imm8(8));
		// Prepare a multiplier in temp3Reg and multiply alpha1 by it.
		MOV(32, R(temp3Reg), Imm32(6 << 8));
		SUB(32, R(temp3Reg), R(alphaIndexReg));
		IMUL(32, alpha1Reg, R(temp3Reg));
		// And now the same for alpha2, using alphaIndexReg.
		SUB(32, R(alphaIndexReg), Imm32(1 << 8));
		IMUL(32, alpha2Reg, R(alphaIndexReg));

		// Let's skip a step and sum before dividing by 5, also adding the 31.
		LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 5 * 31));
		// To divide by 5, we will actually multiply by 0x3334 and shift.
		IMUL(32, srcReg, Imm32(0x3334));
		SHR(32, R(srcReg), Imm8(24));
		FixupBranch finishLerp6 = J(true);

		// This will be a 8-step lerp between alpha1 and alpha2.
		SetJumpTarget(handleLerp8);
		SHL(32, R(alphaIndexReg), Imm8(8));
		// Prepare a multiplier in temp3Reg and multiply alpha1 by it.
		MOV(32, R(temp3Reg), Imm32(8 << 8));
		SUB(32, R(temp3Reg), R(alphaIndexReg));
		IMUL(32, alpha1Reg, R(temp3Reg));
		// And now the same for alpha2, using alphaIndexReg.
		SUB(32, R(alphaIndexReg), Imm32(1 << 8));
		IMUL(32, alpha2Reg, R(alphaIndexReg));

		// And divide by 7 together here too, also adding the 31.
		LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 7 * 31));
		// Our magic constant here is 0x124A, but it's a bit more complex than just a shift.
		// (multiply-high followed by the average-and-shift fixup for dividing by 7.)
		IMUL(32, alpha1Reg, R(srcReg), Imm32(0x124A));
		SHR(32, R(alpha1Reg), Imm8(15));
		SUB(32, R(srcReg), R(alpha1Reg));
		SHR(32, R(srcReg), Imm8(1));
		ADD(32, R(srcReg), R(alpha1Reg));
		SHR(32, R(srcReg), Imm8(10));

		FixupBranch finishLerp8 = J();

		SetJumpTarget(handleSimple);
		// Just load the specified alpha byte.
		MOVZX(32, 8, srcReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 14));

		regCache_.Release(alphaIndexReg, RegCache::GEN_TEMP0);
		regCache_.Release(alpha1Reg, RegCache::GEN_TEMP1);
		regCache_.Release(alpha2Reg, RegCache::GEN_TEMP2);
		regCache_.Release(temp3Reg, RegCache::GEN_TEMP3);

		SetJumpTarget(finishFull);
		SetJumpTarget(finishLerp6);
		SetJumpTarget(finishLerp8);

		// Move the alpha byte into place and OR into the color.
		SHL(32, R(srcReg), Imm8(24));
		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		OR(32, R(resultReg), R(srcReg));
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
		success = true;

		SetJumpTarget(finishZero);

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
	}

	_dbg_assert_(success);
	return success;
}

// Emits code to fetch one texel's raw bits from a non-swizzled texture.
// On entry: U/V are texel coordinates, TEXPTR points at the level's data, and
// BUFW is the buffer stride (in pixels, given how it's scaled below.)  The
// fetched value ends up in GEN_RESULT.
bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
	if (id.swizzle) {
		return Jit_GetTexDataSwizzled(id, bitsPerTexel);
	}

	_assert_msg_(!id.linear, "Should not use this path for linear")
	Describe("TexData");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	// srcReg might be EDX, so let's copy and uReg that before we multiply.
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	bool success = true;
	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		LEA(64, temp1Reg, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		if (cpu_info.bBMI2_fast)
			MOV(32, R(temp2Reg), Imm32(0x0F));
		else
			XOR(32, R(temp2Reg), R(temp2Reg));
		// Halve u to a byte offset; the dropped bit lands in CF (tested later.)
		SHR(32, R(uReg), Imm8(1));
		FixupBranch skip =
J_CC(CC_NC);
		// Track whether we shifted a 1 off or not.
		if (cpu_info.bBMI2_fast)
			SHL(32, R(temp2Reg), Imm8(4));
		else
			MOV(32, R(temp2Reg), Imm32(4));
		SetJumpTarget(skip);
		LEA(64, temp1Reg, MRegSum(srcReg, uReg));
		break;
	}

	default:
		success = false;
		break;
	}
	// All done with u and texptr.
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	// Compute the row offset: v * bufw.
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
	MOV(32, R(resultReg), R(vReg));
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	IMUL(32, resultReg, R(bufwReg));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	if (bitsPerTexel == 4 && !cpu_info.bBMI2) {
		// We'll need CL for the variable shift below.
		bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
		_assert_(hasRCX);
	}

	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		MOVZX(32, bitsPerTexel, resultReg, MComplex(temp1Reg, resultReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		SHR(32, R(resultReg), Imm8(1));
		if (cpu_info.bBMI2_fast) {
			// temp2Reg holds the nibble mask (0x0F or 0xF0) - PEXT grabs it directly.
			MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
			PEXT(32, resultReg, resultReg, R(temp2Reg));
		} else if (cpu_info.bBMI2) {
			// temp2Reg holds the shift amount (0 or 4.)
			SHRX(32, resultReg, MRegSum(temp1Reg, resultReg), temp2Reg);
			AND(32, R(resultReg), Imm8(0x0F));
		} else {
			MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
			// RCX is now free.
			MOV(8, R(RCX), R(temp2Reg));
			SHR(8, R(resultReg), R(RCX));
			// Zero out any bits not shifted off.
			AND(32, R(resultReg), Imm8(0x0F));
		}
		break;
	}

	default:
		success = false;
		break;
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return success;
}

// Emits code to fetch a 4bpp texel from PSP-swizzled texture data.
// Swizzled tiles are 16 bytes wide by 8 rows (hence the v >> 3 and the
// 31/127 masks below); the result nibble ends up in GEN_RESULT.
bool SamplerJitCache::Jit_GetTexDataSwizzled4(const SamplerID &id) {
	Describe("TexDataS4");
	_assert_msg_(!id.linear, "Should not use this path for linear")
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

	// Get the horizontal tile pos into temp1Reg.
	LEA(32, temp1Reg, MScaled(uReg, SCALE_4, 0));
	// Note: imm8 sign extends negative.
	AND(32, R(temp1Reg), Imm8(~127));

	// Add vertical offset inside tile to temp1Reg.
	LEA(32, temp2Reg, MScaled(vReg, SCALE_4, 0));
	AND(32, R(temp2Reg), Imm8(31));
	LEA(32, temp1Reg, MComplex(temp1Reg, temp2Reg, SCALE_4, 0));
	// Add srcReg, since we'll need it at some point.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	ADD(64, R(temp1Reg), R(srcReg));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	// Now find the vertical tile pos, and add to temp1Reg.
	SHR(32, R(vReg), Imm8(3));
	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	LEA(32, temp2Reg, MScaled(bufwReg, SCALE_4, 0));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	IMUL(32, temp2Reg, R(vReg));
	ADD(64, R(temp1Reg), R(temp2Reg));
	// We no longer have a good value in vReg.
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	// Last and possibly also least, the horizontal offset inside the tile.
	AND(32, R(uReg), Imm8(31));
	// Halve u to bytes; the dropped bit (in CF) picks the nibble below.
	SHR(32, R(uReg), Imm8(1));
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	MOV(8, R(resultReg), MRegSum(temp1Reg,
uReg));
	FixupBranch skipNonZero = J_CC(CC_NC);
	// If the horizontal offset was odd, take the upper 4.
	SHR(8, R(resultReg), Imm8(4));
	SetJumpTarget(skipNonZero);
	// Zero out the rest of the bits.
	AND(32, R(resultReg), Imm8(0x0F));
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	// This destroyed u as well.
	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	return true;
}

// Emits code to fetch an 8/16/32bpp texel from PSP-swizzled texture data;
// 4bpp is delegated to Jit_GetTexDataSwizzled4.  Result lands in GEN_RESULT.
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4(id);
	}

	bool success = true;
	_assert_msg_(!id.linear, "Should not use this path for linear")

	Describe("TexDataS");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

	// Vertical offset within the tile, and v rounded down to the tile row.
	LEA(32, temp1Reg, MScaled(vReg, SCALE_4, 0));
	AND(32, R(temp1Reg), Imm8(31));
	AND(32, R(vReg), Imm8(~7));

	MOV(32, R(temp2Reg), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	// Scale u/v to 32-bit words per texel size.
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(temp2Reg), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(temp2Reg), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		success = false;
		break;
	}
	AND(32, R(temp2Reg), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(temp1Reg), R(temp2Reg));
	ADD(32, R(temp1Reg), R(resultReg));

	// We may clobber srcReg in the multiply, so let's grab it now.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	LEA(64, temp1Reg, MComplex(srcReg, temp1Reg, SCALE_4, 0));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	LEA(32, resultReg, MScaled(bufwReg, SCALE_4, 0));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	IMUL(32, resultReg, R(vReg));
	// We no longer have a good value in vReg.
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	// Finally, add the sub-word offset and load.
	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(temp1Reg, resultReg));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		LEA(32, resultReg, MComplex(resultReg, uReg, SCALE_2, 0));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(resultReg), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
		break;
	default:
		success = false;
		break;
	}

	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return success;
}

// Emits code to turn the S/T float coordinates into integer texel U/V coords
// (placed into GEN_ARG_U/GEN_ARG_V and retained), applying clamp or wrap.
// When mipmapping, the next level's UV are also computed and stashed on the
// stack (at stackUV1Offset_) for later use.
bool SamplerJitCache::Jit_GetTexelCoords(const SamplerID &id) {
	Describe("Texel");

	X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
	X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
	X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
	if (id.hasAnyMips) {
		// We have to figure out levels and the proper width, ugh.
		X64Reg idReg = GetSamplerID();
		X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);

		X64Reg levelReg = INVALID_REG;
		if
(regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
		} else {
			// Level was passed on the stack instead of in a register.
			levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
			MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
		}

		// We'll multiply these at the same time, so it's nice to put together.
		UNPCKLPS(sReg, R(tReg));
		SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));

		X64Reg sizesReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
		} else {
			MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLWD(sizesReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}

		// We just want this value as a float, times 256.
		PSLLD(sizesReg, 8);
		CVTDQ2PS(sizesReg, R(sizesReg));

		// Okay, we can multiply now, and convert back to integer.
		MULPS(sReg, R(sizesReg));
		CVTTPS2DQ(sReg, R(sReg));
		regCache_.Release(sizesReg, RegCache::VEC_TEMP0);

		// Drop the 8 fractional bits from the fixed-point result.
		PSRAD(sReg, 8);

		// Reuse tempXYReg for the level1 values.
		if (!cpu_info.bSSE4_1)
			PSHUFD(tReg, R(sReg), _MM_SHUFFLE(3, 2, 3, 2));

		// Extracts one lane to dest and clamps/wraps it against the level's size.
		auto applyClampWrap = [&](X64Reg dest, bool clamp, bool isY, bool isLevel1) {
			int offset = offsetof(SamplerID, cached.sizes[0].w) + (isY ? 2 : 0) + (isLevel1 ? 4 : 0);
			// Grab the size, already pre-shifted for us.
			MOVZX(32, 16, tempReg, MComplex(idReg, levelReg, SCALE_4, offset));

			// Grab the size from the multiply.
			if (cpu_info.bSSE4_1) {
				if (isY || isLevel1)
					PEXTRD(R(dest), sReg, (isY ? 1 : 0) + (isLevel1 ? 2 : 0));
				else
					MOVD_xmm(R(dest), sReg);
			} else {
				X64Reg srcReg = isLevel1 ? tReg : sReg;
				MOVD_xmm(R(dest), srcReg);
				if (!isY)
					PSRLDQ(srcReg, 4);
			}

			// size - 1, masked to the max texel range.
			SUB(32, R(tempReg), Imm8(1));
			AND(32, R(tempReg), Imm32(0x000001FF));
			if (clamp) {
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_G);
				XOR(32, R(tempReg), R(tempReg));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_L);
			} else {
				AND(32, R(dest), R(tempReg));
			}
		};

		// Do the next level first, so we can save them and reuse the regs.
		// Note: for non-SSE4, this must be in S/T order.
		applyClampWrap(uReg, id.clampS, false, true);
		applyClampWrap(vReg, id.clampT, true, true);

		// Okay, now stuff them on the stack - we'll load them again later.
		MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0), R(uReg));
		MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4), R(vReg));

		// And then the given level.
		// Note: for non-SSE4, this must be in S/T order.
		applyClampWrap(uReg, id.clampS, false, false);
		applyClampWrap(vReg, id.clampT, true, false);

		UnlockSamplerID(idReg);
		regCache_.Release(tempReg, RegCache::GEN_TEMP0);
		regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
	} else {
		// Multiply, then convert to integer...
		UNPCKLPS(sReg, R(tReg));
		MULPS(sReg, M(constWidthHeight256f_));
		CVTTPS2DQ(sReg, R(sReg));
		// Great, shift out the fraction.
		PSRAD(sReg, 8);

		// Square textures are kinda common.
		bool clampApplied = false;
		if (id.width0Shift == id.height0Shift) {
			if (!id.clampS && !id.clampT) {
				PAND(sReg, M(constWidthMinus1i_));
				clampApplied = true;
			} else if (id.clampS && id.clampT && cpu_info.bSSE4_1) {
				X64Reg zeroReg = GetZeroVec();
				PMINSD(sReg, M(constWidthMinus1i_));
				PMAXSD(sReg, R(zeroReg));
				regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
				clampApplied = true;
			}
		}

		// Now extract to do the clamping (unless we already did it.)
		MOVQ_xmm(R(uReg), sReg);
		MOV(64, R(vReg), R(uReg));
		SHR(64, R(vReg), Imm8(32));
		// Strip off the top bits (a 32-bit ALU op zero-extends the full register.)
		AND(32, R(uReg), R(uReg));

		auto applyClampWrap = [this](X64Reg dest, bool clamp, uint8_t shift) {
			// Clamp and wrap both max out at 512.
			if (shift > 9)
				shift = 9;

			if (clamp) {
				X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
				MOV(32, R(tempReg), Imm32((1 << shift) - 1));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_G);
				XOR(32, R(tempReg), R(tempReg));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_L);
				regCache_.Release(tempReg, RegCache::GEN_TEMP0);
			} else {
				AND(32, R(dest), Imm32((1 << shift) - 1));
			}
		};

		// Now apply clamp/wrap.
		if (!clampApplied) {
			applyClampWrap(uReg, id.clampS, id.width0Shift);
			applyClampWrap(vReg, id.clampT, id.height0Shift);
		}
	}

	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRetain(RegCache::GEN_ARG_U);
	regCache_.ForceRetain(RegCache::GEN_ARG_V);

	// And get rid of S and T, we're done with them now.
	regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
	regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
	regCache_.ForceRelease(RegCache::VEC_ARG_S);
	regCache_.ForceRelease(RegCache::VEC_ARG_T);

	return true;
}

// Vector variant for bilinear: turns S/T into per-lane integer UV coords
// (VEC_ARG_S/T become VEC_ARG_U/V), extracting the 4-bit interpolation
// fractions into VEC_FRAC along the way.  With mips, the next level's UV
// end up in VEC_U1/VEC_V1 as well.
bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
	Describe("TexelQuad");

	X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
	X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);

	// We use this if there are mips later, to apply wrap/clamp.
	X64Reg sizesReg = INVALID_REG;

	// Start by multiplying with the width/height... which might be complex with mips.
	if (id.hasAnyMips) {
		// We have to figure out levels and the proper width, ugh.
		X64Reg idReg = GetSamplerID();

		X64Reg levelReg = INVALID_REG;
		// To avoid ABI problems, we don't hold onto level.
		bool releaseLevelReg = !regCache_.Has(RegCache::GEN_ARG_LEVEL);
		if (!releaseLevelReg) {
			levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
		} else {
			releaseLevelReg = true;
			levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
			MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
		}

		// This will load the current and next level's sizes, 16x4.
		sizesReg = regCache_.Alloc(RegCache::VEC_TEMP5);
		// We actually want this in 32-bit, though, so extend.
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
		} else {
			MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLWD(sizesReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}

		if (releaseLevelReg)
			regCache_.Release(levelReg, RegCache::GEN_ARG_LEVEL);
		else
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
		UnlockSamplerID(idReg);

		// Now make a float version of sizesReg, times 256.
		X64Reg sizes256Reg = regCache_.Alloc(RegCache::VEC_TEMP0);
		PSLLD(sizes256Reg, sizesReg, 8);
		CVTDQ2PS(sizes256Reg, R(sizes256Reg));

		// Next off, move S and T into a single reg, which will become U0 V0 U1 V1.
		UNPCKLPS(sReg, R(tReg));
		SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));
		// And multiply by the sizes, all lined up already.
		MULPS(sReg, R(sizes256Reg));
		regCache_.Release(sizes256Reg, RegCache::VEC_TEMP0);

		// For wrap/clamp purposes, we want width or height minus one. Do that now.
		PSUBD(sizesReg, M(constOnes32_));
		PAND(sizesReg, M(constMaxTexel32_));
	} else {
		// Easy mode.
		UNPCKLPS(sReg, R(tReg));
		MULPS(sReg, M(constWidthHeight256f_));
	}

	// And now, convert to integers for all later processing.
	CVTPS2DQ(sReg, R(sReg));

	// Now adjust X and Y...
	X64Reg tempXYReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	// Produce a -128 constant (all ones shifted left by 7.)
	PCMPEQD(tempXYReg, R(tempXYReg));
	PSLLD(tempXYReg, 7);
	PADDD(sReg, R(tempXYReg));
	regCache_.Release(tempXYReg, RegCache::VEC_TEMP0);

	// We do want the fraction, though, so extract that to an XMM for later.
	X64Reg allFracReg = INVALID_REG;
	if (regCache_.Has(RegCache::VEC_FRAC))
		allFracReg = regCache_.Find(RegCache::VEC_FRAC);
	else
		allFracReg = regCache_.Alloc(RegCache::VEC_FRAC);
	// We only want the four bits after the first four, though.
	PSLLD(allFracReg, sReg, 24);
	PSRLD(allFracReg, 28);
	// It's convenient later if this is in the low words only.
	PACKSSDW(allFracReg, R(allFracReg));
	regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
	regCache_.ForceRetain(RegCache::VEC_FRAC);

	// With those extracted, we can now get rid of the fractional bits.
	PSRAD(sReg, 8);

	// Now it's time to separate the lanes into separate registers and add next UV offsets.
	if (id.hasAnyMips) {
		X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
		X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
		PSHUFD(u1Reg, R(sReg), _MM_SHUFFLE(2, 2, 2, 2));
		PSHUFD(v1Reg, R(sReg), _MM_SHUFFLE(3, 3, 3, 3));
		PADDD(u1Reg, M(constUNext_));
		PADDD(v1Reg, M(constVNext_));
		regCache_.Unlock(u1Reg, RegCache::VEC_U1);
		regCache_.Unlock(v1Reg, RegCache::VEC_V1);
	}

	PSHUFD(tReg, R(sReg), _MM_SHUFFLE(1, 1, 1, 1));
	PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
	PADDD(tReg, M(constVNext_));
	PADDD(sReg, M(constUNext_));

	X64Reg temp0ClampReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	bool temp0ClampZero =
false;28632864auto doClamp = [&](bool clamp, X64Reg stReg, const OpArg &bound) {2865if (!clamp) {2866// Wrapping is easy.2867PAND(stReg, bound);2868return;2869}28702871if (!temp0ClampZero)2872PXOR(temp0ClampReg, R(temp0ClampReg));2873temp0ClampZero = true;28742875if (cpu_info.bSSE4_1) {2876PMINSD(stReg, bound);2877PMAXSD(stReg, R(temp0ClampReg));2878} else {2879temp0ClampZero = false;2880// Set temp to max(0, stReg) = AND(NOT(0 > stReg), stReg).2881PCMPGTD(temp0ClampReg, R(stReg));2882PANDN(temp0ClampReg, R(stReg));28832884// Now make a mask where bound is greater than the ST value in temp0ClampReg.2885if (cpu_info.bAVX && bound.IsSimpleReg()) {2886VPCMPGTD(128, stReg, bound.GetSimpleReg(), R(temp0ClampReg));2887} else {2888MOVDQA(stReg, bound);2889PCMPGTD(stReg, R(temp0ClampReg));2890}2891// Throw away the values that are greater in our temp0ClampReg in progress result.2892PAND(temp0ClampReg, R(stReg));28932894// Now, set bound only where ST was too high.2895PANDN(stReg, bound);2896// And put in the values that were fine.2897POR(stReg, R(temp0ClampReg));2898}2899};29002901if (id.hasAnyMips) {2902// We'll spread sizes out into a temp.2903X64Reg spreadSizeReg = regCache_.Alloc(RegCache::VEC_TEMP1);29042905PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(0, 0, 0, 0));2906doClamp(id.clampS, sReg, R(spreadSizeReg));2907PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(1, 1, 1, 1));2908doClamp(id.clampT, tReg, R(spreadSizeReg));2909X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);2910X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);2911PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(2, 2, 2, 2));2912doClamp(id.clampS, u1Reg, R(spreadSizeReg));2913PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(3, 3, 3, 3));2914doClamp(id.clampT, v1Reg, R(spreadSizeReg));2915regCache_.Unlock(u1Reg, RegCache::VEC_U1);2916regCache_.Unlock(v1Reg, RegCache::VEC_V1);29172918regCache_.Release(spreadSizeReg, RegCache::VEC_TEMP1);2919} else {2920doClamp(id.clampS, sReg, 
M(constWidthMinus1i_));2921doClamp(id.clampT, tReg, M(constHeightMinus1i_));2922}29232924if (sizesReg != INVALID_REG)2925regCache_.Release(sizesReg, RegCache::VEC_TEMP5);2926regCache_.Release(temp0ClampReg, RegCache::VEC_TEMP0);29272928regCache_.Unlock(sReg, RegCache::VEC_ARG_S);2929regCache_.Unlock(tReg, RegCache::VEC_ARG_T);2930regCache_.Change(RegCache::VEC_ARG_S, RegCache::VEC_ARG_U);2931regCache_.Change(RegCache::VEC_ARG_T, RegCache::VEC_ARG_V);2932return true;2933}29342935bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1) {2936_assert_(id.linear);29372938bool success = true;2939int bits = 0;2940switch (id.TexFmt()) {2941case GE_TFMT_5650:2942case GE_TFMT_5551:2943case GE_TFMT_4444:2944case GE_TFMT_CLUT16:2945bits = 16;2946break;29472948case GE_TFMT_8888:2949case GE_TFMT_CLUT32:2950bits = 32;2951break;29522953case GE_TFMT_CLUT8:2954bits = 8;2955break;29562957case GE_TFMT_CLUT4:2958bits = 4;2959break;29602961case GE_TFMT_DXT1:2962bits = -8;2963break;29642965case GE_TFMT_DXT3:2966case GE_TFMT_DXT5:2967bits = -16;2968break;29692970default:2971success = false;2972}29732974if (success && bits != 0) {2975if (bits < 0) {2976success = Jit_PrepareDataDXTOffsets(id, uReg, vReg, level1, -bits);2977} else if (id.swizzle) {2978success = Jit_PrepareDataSwizzledOffsets(id, uReg, vReg, level1, bits);2979} else {2980success = Jit_PrepareDataDirectOffsets(id, uReg, vReg, level1, bits);2981}2982}29832984return success;2985}29862987bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {2988Describe("DataOff");2989X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);2990if (!id.useStandardBufw || id.hasAnyMips) {2991// Spread bufw into each lane.2992X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);2993if (cpu_info.bSSE4_1) {2994PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 
2 : 0));2995} else {2996PXOR(bufwVecReg, R(bufwVecReg));2997PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);2998}2999PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));3000regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);30013002if (bitsPerTexel == 4)3003PSRLD(bufwVecReg, 1);3004else if (bitsPerTexel == 16)3005PSLLD(bufwVecReg, 1);3006else if (bitsPerTexel == 32)3007PSLLD(bufwVecReg, 2);3008}30093010if (id.useStandardBufw && !id.hasAnyMips) {3011int amt = id.width0Shift;3012if (bitsPerTexel == 4)3013amt -= 1;3014else if (bitsPerTexel == 16)3015amt += 1;3016else if (bitsPerTexel == 32)3017amt += 2;3018// It's aligned to 16 bytes, so must at least be 16.3019PSLLD(vReg, std::max(4, amt));3020} else if (cpu_info.bSSE4_1) {3021// And now multiply. This is slow, but not worse than the SSE2 version...3022PMULLD(vReg, R(bufwVecReg));3023} else {3024// Copy that into another temp for multiply.3025X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);3026MOVDQA(vOddLaneReg, R(vReg));30273028// Okay, first, multiply to get XXXX CCCC XXXX AAAA.3029PMULUDQ(vReg, R(bufwVecReg));3030PSRLDQ(vOddLaneReg, 4);3031PSRLDQ(bufwVecReg, 4);3032// And now get XXXX DDDD XXXX BBBB.3033PMULUDQ(vOddLaneReg, R(bufwVecReg));30343035// We know everything is positive, so XXXX must be zero. 
Let's combine.3036PSLLDQ(vOddLaneReg, 4);3037POR(vReg, R(vOddLaneReg));3038regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);3039}3040regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);30413042if (bitsPerTexel == 4) {3043// Need to keep uvec for the odd bit.3044X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);3045MOVDQA(uCopyReg, R(uReg));3046PSRLD(uCopyReg, 1);3047PADDD(vReg, R(uCopyReg));3048regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);3049} else {3050// Destroy uvec, we won't use it again.3051if (bitsPerTexel == 16)3052PSLLD(uReg, 1);3053else if (bitsPerTexel == 32)3054PSLLD(uReg, 2);3055PADDD(vReg, R(uReg));3056}30573058return true;3059}30603061bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {3062Describe("DataOffS");3063// See Jit_GetTexDataSwizzled() for usage of this offset.30643065X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);3066if (!id.useStandardBufw || id.hasAnyMips) {3067// Spread bufw into each lane.3068X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);3069if (cpu_info.bSSE4_1) {3070PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));3071} else {3072PXOR(bufwVecReg, R(bufwVecReg));3073PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);3074}3075PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));3076regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);3077}30783079// Divide vvec by 8 in a temp.3080X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);3081PSRLD(vMultReg, vReg, 3);30823083// And now multiply by bufw. May be able to use a shift in a common case.3084int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);3085if (id.useStandardBufw && !id.hasAnyMips) {3086int amt = id.width0Shift;3087// Account for 16 byte minimum.3088amt = std::max(7 - shiftAmount, amt);3089shiftAmount += amt;3090} else if (cpu_info.bSSE4_1) {3091// And now multiply. 
This is slow, but not worse than the SSE2 version...3092PMULLD(vMultReg, R(bufwVecReg));3093} else {3094// Copy that into another temp for multiply.3095X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP2);3096MOVDQA(vOddLaneReg, R(vMultReg));30973098// Okay, first, multiply to get XXXX CCCC XXXX AAAA.3099PMULUDQ(vMultReg, R(bufwVecReg));3100PSRLDQ(vOddLaneReg, 4);3101PSRLDQ(bufwVecReg, 4);3102// And now get XXXX DDDD XXXX BBBB.3103PMULUDQ(vOddLaneReg, R(bufwVecReg));31043105// We know everything is positive, so XXXX must be zero. Let's combine.3106PSLLDQ(vOddLaneReg, 4);3107POR(vMultReg, R(vOddLaneReg));3108regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP2);3109}3110regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);31113112// Multiply the result by bitsPerTexel using a shift.3113PSLLD(vMultReg, shiftAmount);31143115// Now we're adding (v & 7) * 16. Use a 16-bit wall.3116PSLLW(vReg, 13);3117PSRLD(vReg, 9);3118PADDD(vReg, R(vMultReg));3119regCache_.Release(vMultReg, RegCache::VEC_TEMP1);31203121// Now get ((uvec / texels_per_tile) / 4) * 32 * 4 aka (uvec / (128 / bitsPerTexel)) << 7.3122X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);3123PSRLD(uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);3124PSLLD(uCopyReg, 7);3125// Add it in to our running total.3126PADDD(vReg, R(uCopyReg));31273128if (bitsPerTexel == 4) {3129// Finally, we want (uvec & 31) / 2. Use a 16-bit wall.3130PSLLW(uCopyReg, uReg, 11);3131PSRLD(uCopyReg, 12);3132// With that, this is our byte offset. uvec & 1 has which half.3133PADDD(vReg, R(uCopyReg));3134} else {3135// We can destroy uvec in this path. 
Clear all but 2 bits for 32, 3 for 16, or 4 for 8.3136PSLLW(uReg, 32 - clz32_nonzero(bitsPerTexel - 1) + 9);3137// Now that it's at the top of the 16 bits, we always shift that to the top of 4 bits.3138PSRLD(uReg, 12);3139PADDD(vReg, R(uReg));3140}3141regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);31423143return true;3144}31453146bool SamplerJitCache::Jit_PrepareDataDXTOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int blockSize) {3147Describe("DataOffDXT");3148// Wwe need to get the block's offset, which is:3149// blockPos = src + (v/4 * bufw/4 + u/4) * blockSize3150// We distribute the blockSize constant for convenience:3151// blockPos = src + (blockSize*v/4 * bufw/4 + blockSize*u/4)31523153X64Reg baseVReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);3154// This gives us the V factor for the block, which we multiply by bufw.3155PSRLD(baseVReg, vReg, 2);3156PSLLD(baseVReg, blockSize == 16 ? 4 : 3);31573158X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);3159if (!id.useStandardBufw || id.hasAnyMips) {3160// Spread bufw into each lane.3161X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);3162if (cpu_info.bSSE4_1) {3163PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));3164} else {3165PXOR(bufwVecReg, R(bufwVecReg));3166PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);3167}3168PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));3169regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);31703171// Divide by 4 before the multiply.3172PSRLD(bufwVecReg, 2);3173}31743175if (id.useStandardBufw && !id.hasAnyMips) {3176int amt = id.width0Shift - 2;3177if (amt < 0)3178PSRLD(baseVReg, -amt);3179else if (amt > 0)3180PSLLD(baseVReg, amt);3181} else if (cpu_info.bSSE4_1) {3182// And now multiply. 
This is slow, but not worse than the SSE2 version...3183PMULLD(baseVReg, R(bufwVecReg));3184} else {3185// Copy that into another temp for multiply.3186X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);3187MOVDQA(vOddLaneReg, R(baseVReg));31883189// Okay, first, multiply to get XXXX CCCC XXXX AAAA.3190PMULUDQ(baseVReg, R(bufwVecReg));3191PSRLDQ(vOddLaneReg, 4);3192PSRLDQ(bufwVecReg, 4);3193// And now get XXXX DDDD XXXX BBBB.3194PMULUDQ(vOddLaneReg, R(bufwVecReg));31953196// We know everything is positive, so XXXX must be zero. Let's combine.3197PSLLDQ(vOddLaneReg, 4);3198POR(baseVReg, R(vOddLaneReg));3199regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);3200}3201regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);32023203// Now add in the U factor for the block.3204X64Reg baseUReg = regCache_.Alloc(RegCache::VEC_TEMP0);3205PSRLD(baseUReg, uReg, 2);3206PSLLD(baseUReg, blockSize == 16 ? 4 : 3);3207PADDD(baseVReg, R(baseUReg));3208regCache_.Release(baseUReg, RegCache::VEC_TEMP0);32093210// Okay, the base index (block byte offset from src) is ready.3211regCache_.Unlock(baseVReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);3212regCache_.ForceRetain(level1 ? 
RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);32133214// For everything else, we only want the low two bits of U and V.3215PSLLD(uReg, 30);3216PSLLD(vReg, 30);32173218X64Reg alphaTempRegU = regCache_.Alloc(RegCache::VEC_TEMP0);3219if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5)3220PSRLD(alphaTempRegU, uReg, 30);32213222PSRLD(uReg, 30 - 1);3223PSRLD(vReg, 30 - 3);3224// At this point, uReg is now the bit offset of the color index.3225PADDD(uReg, R(vReg));32263227// Grab the alpha index into vReg next.3228if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5) {3229PSRLD(vReg, 1);3230PADDD(vReg, R(alphaTempRegU));32313232if (id.TexFmt() == GE_TFMT_DXT3) {3233PSLLD(vReg, 2);3234} else if (id.TexFmt() == GE_TFMT_DXT5) {3235// Multiply by 3.3236PSLLD(alphaTempRegU, vReg, 1);3237PADDD(vReg, R(alphaTempRegU));3238}3239}3240regCache_.Release(alphaTempRegU, RegCache::VEC_TEMP0);32413242return true;3243}32443245bool SamplerJitCache::Jit_DecodeQuad(const SamplerID &id, bool level1) {3246GETextureFormat decodeFmt = id.TexFmt();3247switch (id.TexFmt()) {3248case GE_TFMT_CLUT32:3249case GE_TFMT_CLUT16:3250case GE_TFMT_CLUT8:3251case GE_TFMT_CLUT4:3252// The values match, so just use the clut fmt.3253decodeFmt = (GETextureFormat)id.ClutFmt();3254break;32553256default:3257// We'll decode below.3258break;3259}32603261bool success = true;3262X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);32633264switch (decodeFmt) {3265case GE_TFMT_5650:3266success = Jit_Decode5650Quad(id, quadReg);3267break;32683269case GE_TFMT_5551:3270success = Jit_Decode5551Quad(id, quadReg);3271break;32723273case GE_TFMT_4444:3274success = Jit_Decode4444Quad(id, quadReg);3275break;32763277default:3278// Doesn't need decoding.3279break;3280}32813282regCache_.Unlock(quadReg, level1 ? 
RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);3283return success;3284}32853286bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {3287Describe("5650Quad");3288X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);3289X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);32903291// Filter out red only into temp1. We do this by shifting into a wall.3292PSLLD(temp1Reg, quadReg, 32 - 5);3293// Move it right to the top of the 8 bits.3294PSRLD(temp1Reg, 24);32953296// Now we bring in blue, since it's also 5 like red.3297// Luckily, we know the top 16 bits are zero. Shift right into a wall.3298PSRLD(temp2Reg, quadReg, 11);3299// Shift blue into place at 19, and merge back to temp1.3300PSLLD(temp2Reg, 19);3301POR(temp1Reg, R(temp2Reg));33023303// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.3304PSLLD(temp2Reg, temp1Reg, 1);33053306// We go to green last because it's the different one. Shift off red and blue.3307PSRLD(quadReg, 5);3308// Use a word shift to put a wall just at the right place, top 6 bits of second byte.3309PSLLW(quadReg, 10);3310// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)3311POR(temp2Reg, R(quadReg));3312POR(quadReg, R(temp1Reg));33133314// Now shift and mask temp2 for swizzle.3315PSRLD(temp2Reg, 6);3316PAND(temp2Reg, M(const5650Swizzle_));3317// And then OR that in too. Only alpha left now.3318POR(quadReg, R(temp2Reg));33193320if (id.useTextureAlpha) {3321// Just put a fixed FF in. 
Maybe we could even avoid this and act like it's FF later...3322PCMPEQD(temp2Reg, R(temp2Reg));3323PSLLD(temp2Reg, 24);3324POR(quadReg, R(temp2Reg));3325}33263327regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);3328regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);3329return true;3330}33313332bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {3333Describe("5650");3334X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);3335X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);3336X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);33373338if (cpu_info.bBMI2_fast) {3339// Start off with the high bits.3340MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));3341PDEP(32, temp1Reg, resultReg, R(temp1Reg));3342if (id.useTextureAlpha || id.fetch)3343OR(32, R(temp1Reg), Imm32(0xFF000000));33443345// Now grab the low bits (they end up packed.)3346MOV(32, R(temp2Reg), Imm32(0x0000E61C));3347PEXT(32, resultReg, resultReg, R(temp2Reg));3348// And spread them back out.3349MOV(32, R(temp2Reg), Imm32(0x00070307));3350PDEP(32, resultReg, resultReg, R(temp2Reg));33513352// Finally put the high bits in, we're done.3353OR(32, R(resultReg), R(temp1Reg));3354} else {3355MOV(32, R(temp2Reg), R(resultReg));3356AND(32, R(temp2Reg), Imm32(0x0000001F));33573358// B (we do R and B at the same time, they're both 5.)3359MOV(32, R(temp1Reg), R(resultReg));3360AND(32, R(temp1Reg), Imm32(0x0000F800));3361SHL(32, R(temp1Reg), Imm8(5));3362OR(32, R(temp2Reg), R(temp1Reg));33633364// Expand 5 -> 8. 
At this point we have 00BB00RR.3365MOV(32, R(temp1Reg), R(temp2Reg));3366SHL(32, R(temp2Reg), Imm8(3));3367SHR(32, R(temp1Reg), Imm8(2));3368OR(32, R(temp2Reg), R(temp1Reg));3369AND(32, R(temp2Reg), Imm32(0x00FF00FF));33703371// Now's as good a time to put in A as any.3372if (id.useTextureAlpha || id.fetch)3373OR(32, R(temp2Reg), Imm32(0xFF000000));33743375// Last, we need to align, extract, and expand G.3376// 3 to align to G, and then 2 to expand to 8.3377SHL(32, R(resultReg), Imm8(3 + 2));3378AND(32, R(resultReg), Imm32(0x0000FC00));3379MOV(32, R(temp1Reg), R(resultReg));3380// 2 to account for resultReg being preshifted, 4 for expansion.3381SHR(32, R(temp1Reg), Imm8(2 + 4));3382OR(32, R(resultReg), R(temp1Reg));3383AND(32, R(resultReg), Imm32(0x0000FF00));3384OR(32, R(resultReg), R(temp2Reg));3385}33863387regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);3388regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);3389regCache_.Unlock(resultReg, RegCache::GEN_RESULT);3390return true;3391}33923393bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {3394Describe("5551Quad");3395X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);3396X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);33973398// Filter out red only into temp1. We do this by shifting into a wall.3399PSLLD(temp1Reg, quadReg, 32 - 5);3400// Move it right to the top of the 8 bits.3401PSRLD(temp1Reg, 24);34023403// Add in green and shift into place (top 5 bits of byte 2.)3404PSRLD(temp2Reg, quadReg, 5);3405PSLLW(temp2Reg, 11);3406POR(temp1Reg, R(temp2Reg));34073408// First, extend alpha using an arithmetic shift.3409// We use 10 to meanwhile get rid of green too. 
The extra alpha bits are fine.3410PSRAW(quadReg, 10);3411// This gets rid of those extra alpha bits and puts blue in place too.3412PSLLD(quadReg, 19);34133414// Combine both together, we still need to swizzle.3415POR(quadReg, R(temp1Reg));3416PSRLD(temp1Reg, quadReg, 5);34173418// Now for swizzle, we'll mask carefully to avoid overflow.3419PAND(temp1Reg, M(const5551Swizzle_));3420// Then finally merge in the swizzle bits.3421POR(quadReg, R(temp1Reg));34223423regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);3424regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);3425return true;3426}34273428bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {3429Describe("5551");3430X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);3431X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);3432X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);34333434if (cpu_info.bBMI2_fast) {3435// First, grab the top bits.3436bool keepAlpha = id.useTextureAlpha || id.fetch;3437MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));3438PDEP(32, resultReg, resultReg, R(temp1Reg));34393440// Now make the swizzle bits.3441MOV(32, R(temp2Reg), R(resultReg));3442SHR(32, R(temp2Reg), Imm8(5));3443AND(32, R(temp2Reg), Imm32(0x00070707));34443445if (keepAlpha) {3446// Sign extend the alpha bit to 8 bits.3447SHL(32, R(resultReg), Imm8(7));3448SAR(32, R(resultReg), Imm8(7));3449}34503451OR(32, R(resultReg), R(temp2Reg));3452} else {3453MOV(32, R(temp2Reg), R(resultReg));3454MOV(32, R(temp1Reg), R(resultReg));3455AND(32, R(temp2Reg), Imm32(0x0000001F));3456AND(32, R(temp1Reg), Imm32(0x000003E0));3457SHL(32, R(temp1Reg), Imm8(3));3458OR(32, R(temp2Reg), R(temp1Reg));34593460MOV(32, R(temp1Reg), R(resultReg));3461AND(32, R(temp1Reg), Imm32(0x00007C00));3462SHL(32, R(temp1Reg), Imm8(6));3463OR(32, R(temp2Reg), R(temp1Reg));34643465// Expand 5 -> 8. 
After this is just A.3466MOV(32, R(temp1Reg), R(temp2Reg));3467SHL(32, R(temp2Reg), Imm8(3));3468SHR(32, R(temp1Reg), Imm8(2));3469// Chop off the bits that were shifted out.3470AND(32, R(temp1Reg), Imm32(0x00070707));3471OR(32, R(temp2Reg), R(temp1Reg));34723473if (id.useTextureAlpha || id.fetch) {3474// For A, we sign extend to get either 16 1s or 0s of alpha.3475SAR(16, R(resultReg), Imm8(15));3476// Now, shift left by 24 to get the lowest 8 of those at the top.3477SHL(32, R(resultReg), Imm8(24));3478OR(32, R(resultReg), R(temp2Reg));3479} else {3480MOV(32, R(resultReg), R(temp2Reg));3481}3482}34833484regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);3485regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);3486regCache_.Unlock(resultReg, RegCache::GEN_RESULT);3487return true;3488}34893490bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {3491Describe("4444Quad");3492X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);3493X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);34943495// Mask and move red into position within temp1.3496PSLLD(temp1Reg, quadReg, 28);3497PSRLD(temp1Reg, 24);34983499// Green is easy too, we use a word shift to get a free wall.3500PSRLD(temp2Reg, quadReg, 4);3501PSLLW(temp2Reg, 12);3502POR(temp1Reg, R(temp2Reg));35033504// Blue isn't last this time, but it's next.3505PSRLD(temp2Reg, quadReg, 8);3506PSLLD(temp2Reg, 28);3507PSRLD(temp2Reg, 8);3508POR(temp1Reg, R(temp2Reg));35093510if (id.useTextureAlpha) {3511// Last but not least, alpha.3512PSRLW(quadReg, 12);3513PSLLD(quadReg, 28);3514POR(quadReg, R(temp1Reg));35153516// Masking isn't necessary here since everything is 4 wide.3517PSRLD(temp1Reg, quadReg, 4);3518POR(quadReg, R(temp1Reg));3519} else {3520// Overwrite quadReg (we need temp1 as a copy anyway.)3521PSRLD(quadReg, temp1Reg, 4);3522POR(quadReg, R(temp1Reg));3523}35243525regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);3526regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);3527return 
true;3528}35293530alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };35313532bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {3533Describe("4444");3534X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);35353536if (cpu_info.bBMI2_fast) {3537X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);3538// First, spread the bits out with spaces.3539MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));3540PDEP(32, resultReg, resultReg, R(temp1Reg));35413542// Now swizzle the low bits in.3543MOV(32, R(temp1Reg), R(resultReg));3544SHR(32, R(temp1Reg), Imm8(4));3545OR(32, R(resultReg), R(temp1Reg));35463547regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);3548} else {3549X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);3550X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);3551X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);35523553MOVD_xmm(vecTemp1Reg, R(resultReg));3554PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));3555if (RipAccessible(color4444mask)) {3556PAND(vecTemp1Reg, M(color4444mask));3557} else {3558X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);3559MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));3560PAND(vecTemp1Reg, MatR(temp1Reg));3561regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);3562}3563MOVSS(vecTemp2Reg, R(vecTemp1Reg));3564MOVSS(vecTemp3Reg, R(vecTemp1Reg));3565PSRLW(vecTemp2Reg, 4);3566PSLLW(vecTemp3Reg, 4);3567POR(vecTemp1Reg, R(vecTemp2Reg));3568POR(vecTemp1Reg, R(vecTemp3Reg));3569MOVD_xmm(R(resultReg), vecTemp1Reg);35703571regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);3572regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);3573regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);3574}3575regCache_.Unlock(resultReg, RegCache::GEN_RESULT);3576return true;3577}35783579bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerIndex) {3580Describe("TrCLUT");3581GEPaletteFormat fmt = id.ClutFmt();3582if (!id.hasClutShift && !id.hasClutMask && 
!id.hasClutOffset) {3583// This is simple - just mask if necessary.3584if (bitsPerIndex > 8) {3585X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);3586AND(32, R(resultReg), Imm32(0x000000FF));3587regCache_.Unlock(resultReg, RegCache::GEN_RESULT);3588}3589return true;3590}35913592if (!cpu_info.bBMI2) {3593bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);3594_assert_msg_(hasRCX, "Could not obtain RCX, locked?");3595}35963597X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);3598X64Reg idReg = GetSamplerID();3599MOV(32, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));3600UnlockSamplerID(idReg);36013602X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);3603int shiftedToSoFar = 0;36043605// Shift = (clutformat >> 2) & 0x1F3606if (id.hasClutShift) {3607SHR(32, R(temp1Reg), Imm8(2 - shiftedToSoFar));3608shiftedToSoFar = 2;36093610if (cpu_info.bBMI2) {3611SHRX(32, resultReg, R(resultReg), temp1Reg);3612} else {3613_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));3614MOV(32, R(RCX), R(temp1Reg));3615SHR(32, R(resultReg), R(RCX));3616}3617}36183619// Mask = (clutformat >> 8) & 0xFF3620if (id.hasClutMask) {3621SHR(32, R(temp1Reg), Imm8(8 - shiftedToSoFar));3622shiftedToSoFar = 8;36233624AND(32, R(resultReg), R(temp1Reg));3625}36263627// We need to wrap any entries beyond the first 1024 bytes.3628u32 offsetMask = fmt == GE_CMODE_32BIT_ABGR8888 ? 
0x00FF : 0x01FF;36293630// We must mask to 0xFF before ORing 0x100 in 16 bit CMODEs.3631// But skip if we'll mask 0xFF after offset anyway.3632if (bitsPerIndex > 8 && (!id.hasClutOffset || offsetMask != 0x00FF)) {3633AND(32, R(resultReg), Imm32(0x000000FF));3634}36353636// Offset = (clutformat >> 12) & 0x01F03637if (id.hasClutOffset) {3638SHR(32, R(temp1Reg), Imm8(16 - shiftedToSoFar));3639SHL(32, R(temp1Reg), Imm8(4));3640OR(32, R(resultReg), R(temp1Reg));3641AND(32, R(resultReg), Imm32(offsetMask));3642}36433644regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);3645regCache_.Unlock(resultReg, RegCache::GEN_RESULT);3646return true;3647}36483649bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {3650Describe("ReadCLUT");3651X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);3652_assert_msg_(!id.linear, "Should not use this path for linear");36533654if (!id.useSharedClut) {3655X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);36563657if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {3658X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);3659// We need to multiply by 16 and add, LEA allows us to copy too.3660LEA(32, temp2Reg, MScaled(levelReg, SCALE_4, 0));3661regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);3662if (id.fetch)3663regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);3664} else {3665_assert_(stackLevelOffset_ != -1);3666// The argument was saved on the stack.3667MOV(32, R(temp2Reg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));3668LEA(32, temp2Reg, MScaled(temp2Reg, SCALE_4, 0));3669}36703671// Second step of the multiply by 16 (since we only multiplied by 4 before.)3672LEA(64, resultReg, MComplex(resultReg, temp2Reg, SCALE_4, 0));3673regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);3674}36753676X64Reg idReg = GetSamplerID();3677X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);3678MOV(PTRBITS, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clut)));3679UnlockSamplerID(idReg);36803681switch (id.ClutFmt()) {3682case 
GE_CMODE_16BIT_BGR5650:3683case GE_CMODE_16BIT_ABGR5551:3684case GE_CMODE_16BIT_ABGR4444:3685MOVZX(32, 16, resultReg, MComplex(temp1Reg, resultReg, SCALE_2, 0));3686break;36873688case GE_CMODE_32BIT_ABGR8888:3689MOV(32, R(resultReg), MComplex(temp1Reg, resultReg, SCALE_4, 0));3690break;3691}36923693regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);3694regCache_.Unlock(resultReg, RegCache::GEN_RESULT);36953696switch (id.ClutFmt()) {3697case GE_CMODE_16BIT_BGR5650:3698return Jit_Decode5650(id);36993700case GE_CMODE_16BIT_ABGR5551:3701return Jit_Decode5551(id);37023703case GE_CMODE_16BIT_ABGR4444:3704return Jit_Decode4444(id);37053706case GE_CMODE_32BIT_ABGR8888:3707return true;37083709default:3710return false;3711}3712}37133714};37153716#endif371737183719