CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Software/DrawPixelX86.cpp
Views: 1401
// Copyright (c) 2017- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#if PPSSPP_ARCH(AMD64)1920#include <emmintrin.h>21#include "Common/x64Emitter.h"22#include "Common/CPUDetect.h"23#include "Common/LogReporting.h"24#include "GPU/GPUState.h"25#include "GPU/Software/DrawPixel.h"26#include "GPU/Software/SoftGpu.h"27#include "GPU/ge_constants.h"2829using namespace Gen;3031namespace Rasterizer {3233SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) {34// Setup the reg cache and disallow spill for arguments.35regCache_.SetupABI({36RegCache::GEN_ARG_X,37RegCache::GEN_ARG_Y,38RegCache::GEN_ARG_Z,39RegCache::GEN_ARG_FOG,40RegCache::VEC_ARG_COLOR,41RegCache::GEN_ARG_ID,42});4344BeginWrite(64);45Describe("Init");46WriteConstantPool(id);4748const u8 *resetPos = AlignCode16();49EndWrite();50bool success = true;5152#if PPSSPP_PLATFORM(WINDOWS)53// RET + Windows reserves space to save args, half of 1 xmm + 4 ints before the id.54_assert_(!regCache_.Has(RegCache::GEN_ARG_ID));55int stackSpace = 0;56if (id.hasStencilTestMask)57stackSpace = WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15 }, { R12, R13, R14, R15 });58else59stackSpace = WriteProlog(0, {}, {});60stackIDOffset_ = stackSpace + 8 + 8 + 4 * PTRBITS / 
8;61#else62_assert_(regCache_.Has(RegCache::GEN_ARG_ID));63WriteProlog(0, {}, {});64stackIDOffset_ = -1;65#endif6667// Start with the depth range.68success = success && Jit_ApplyDepthRange(id);6970// Next, let's clamp the color (might affect alpha test, and everything expects it clamped.)71// We simply convert to 4x8-bit to clamp. Everything else expects color in this format.72Describe("ClampColor");73X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);74PACKSSDW(argColorReg, R(argColorReg));75PACKUSWB(argColorReg, R(argColorReg));76regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);77colorIs16Bit_ = false;7879success = success && Jit_AlphaTest(id);80// Fog is applied prior to color test. Maybe before alpha test too, but it doesn't affect it...81success = success && Jit_ApplyFog(id);82success = success && Jit_ColorTest(id);8384if (id.stencilTest && !id.clearMode)85success = success && Jit_StencilAndDepthTest(id);86else if (!id.clearMode)87success = success && Jit_DepthTest(id);88success = success && Jit_WriteDepth(id);8990success = success && Jit_AlphaBlend(id);91success = success && Jit_Dither(id);92success = success && Jit_WriteColor(id);9394for (auto &fixup : discards_) {95SetJumpTarget(fixup);96}97discards_.clear();9899if (regCache_.Has(RegCache::GEN_ARG_ID))100regCache_.ForceRelease(RegCache::GEN_ARG_ID);101102if (!success) {103ERROR_LOG_REPORT(Log::G3D, "Could not compile pixel func: %s", DescribePixelFuncID(id).c_str());104105regCache_.Reset(false);106EndWrite();107ResetCodePtr(GetOffset(resetPos));108return nullptr;109}110111const u8 *start = WriteFinalizedEpilog();112regCache_.Reset(true);113return (SingleFunc)start;114}115116RegCache::Reg PixelJitCache::GetPixelID() {117if (regCache_.Has(RegCache::GEN_ARG_ID))118return regCache_.Find(RegCache::GEN_ARG_ID);119if (!regCache_.Has(RegCache::GEN_ID)) {120X64Reg r = regCache_.Alloc(RegCache::GEN_ID);121_assert_(stackIDOffset_ != -1);122MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));123return 
r;124}125return regCache_.Find(RegCache::GEN_ID);126}127128void PixelJitCache::UnlockPixelID(RegCache::Reg &r) {129if (regCache_.Has(RegCache::GEN_ARG_ID))130regCache_.Unlock(r, RegCache::GEN_ARG_ID);131else132regCache_.Unlock(r, RegCache::GEN_ID);133}134135RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {136if (!regCache_.Has(RegCache::GEN_COLOR_OFF)) {137Describe("GetColorOff");138if (id.useStandardStride && !id.dithering) {139bool loadDepthOff = id.depthWrite || (id.DepthTestFunc() != GE_COMP_ALWAYS && !id.earlyZChecks);140X64Reg depthTemp = INVALID_REG;141X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);142X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);143144// In this mode, we force argXReg to the off, and throw away argYReg.145SHL(32, R(argYReg), Imm8(9));146ADD(32, R(argXReg), R(argYReg));147148// Now add the pointer for the color buffer.149if (loadDepthOff) {150_assert_(Accessible(&fb.data, &depthbuf.data));151depthTemp = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);152if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {153MOV(PTRBITS, R(argYReg), M(&fb.data));154} else {155MOV(PTRBITS, R(depthTemp), ImmPtr(&fb.data));156MOV(PTRBITS, R(argYReg), MatR(depthTemp));157}158} else {159if (RipAccessible(&fb.data)) {160MOV(PTRBITS, R(argYReg), M(&fb.data));161} else {162MOV(PTRBITS, R(argYReg), ImmPtr(&fb.data));163MOV(PTRBITS, R(argYReg), MatR(argYReg));164}165}166LEA(PTRBITS, argYReg, MComplex(argYReg, argXReg, id.FBFormat() == GE_FORMAT_8888 ? 
4 : 2, 0));167// With that, argYOff is now GEN_COLOR_OFF.168regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);169regCache_.Change(RegCache::GEN_ARG_Y, RegCache::GEN_COLOR_OFF);170// Retain it, because we can't recalculate this.171regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);172173// Next, also calculate the depth offset, unless we won't need it at all.174if (loadDepthOff) {175if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {176MOV(PTRBITS, R(depthTemp), M(&depthbuf.data));177} else {178MOV(PTRBITS, R(depthTemp), MAccessibleDisp(depthTemp, &fb.data, &depthbuf.data));179}180LEA(PTRBITS, argXReg, MComplex(depthTemp, argXReg, 2, 0));181regCache_.Release(depthTemp, RegCache::GEN_DEPTH_OFF);182183// Okay, same deal - release as GEN_DEPTH_OFF and force retain it.184regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);185regCache_.Change(RegCache::GEN_ARG_X, RegCache::GEN_DEPTH_OFF);186regCache_.ForceRetain(RegCache::GEN_DEPTH_OFF);187} else {188regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);189regCache_.ForceRelease(RegCache::GEN_ARG_X);190}191192return regCache_.Find(RegCache::GEN_COLOR_OFF);193}194195X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);196X64Reg r = regCache_.Alloc(RegCache::GEN_COLOR_OFF);197if (id.useStandardStride) {198MOV(32, R(r), R(argYReg));199SHL(32, R(r), Imm8(9));200} else {201if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {202X64Reg idReg = GetPixelID();203MOVZX(32, 16, r, MDisp(idReg, offsetof(PixelFuncID, cached.framebufStride)));204UnlockPixelID(idReg);205} else {206_assert_(stackIDOffset_ != -1);207MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));208MOVZX(32, 16, r, MDisp(r, offsetof(PixelFuncID, cached.framebufStride)));209}210211IMUL(32, r, R(argYReg));212}213regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);214215X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);216ADD(32, R(r), R(argXReg));217regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);218219X64Reg temp = 
regCache_.Alloc(RegCache::GEN_TEMP_HELPER);220if (RipAccessible(&fb.data)) {221MOV(PTRBITS, R(temp), M(&fb.data));222} else {223MOV(PTRBITS, R(temp), ImmPtr(&fb.data));224MOV(PTRBITS, R(temp), MatR(temp));225}226LEA(PTRBITS, r, MComplex(temp, r, id.FBFormat() == GE_FORMAT_8888 ? 4 : 2, 0));227regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);228229return r;230}231return regCache_.Find(RegCache::GEN_COLOR_OFF);232}233234RegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) {235if (!regCache_.Has(RegCache::GEN_DEPTH_OFF)) {236// If both color and depth use 512, the offsets are the same.237if (id.useStandardStride && !id.dithering) {238// Calculate once inside GetColorOff().239X64Reg colorOffReg = GetColorOff(id);240regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);241return regCache_.Find(RegCache::GEN_DEPTH_OFF);242}243244Describe("GetDepthOff");245X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);246X64Reg r = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);247if (id.useStandardStride) {248MOV(32, R(r), R(argYReg));249SHL(32, R(r), Imm8(9));250} else {251if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {252X64Reg idReg = GetPixelID();253MOVZX(32, 16, r, MDisp(idReg, offsetof(PixelFuncID, cached.depthbufStride)));254UnlockPixelID(idReg);255} else {256_assert_(stackIDOffset_ != -1);257MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));258MOVZX(32, 16, r, MDisp(r, offsetof(PixelFuncID, cached.depthbufStride)));259}260261IMUL(32, r, R(argYReg));262}263regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);264265X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);266ADD(32, R(r), R(argXReg));267regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);268269X64Reg temp = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);270if (RipAccessible(&depthbuf.data)) {271MOV(PTRBITS, R(temp), M(&depthbuf.data));272} else {273MOV(PTRBITS, R(temp), ImmPtr(&depthbuf.data));274MOV(PTRBITS, R(temp), MatR(temp));275}276LEA(PTRBITS, r, MComplex(temp, r, 2, 
0));277regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);278279return r;280}281return regCache_.Find(RegCache::GEN_DEPTH_OFF);282}283284285RegCache::Reg PixelJitCache::GetDestStencil(const PixelFuncID &id) {286// Skip if 565, since stencil is fixed zero.287if (id.FBFormat() == GE_FORMAT_565)288return INVALID_REG;289290X64Reg colorOffReg = GetColorOff(id);291Describe("GetDestStencil");292X64Reg stencilReg = regCache_.Alloc(RegCache::GEN_STENCIL);293if (id.FBFormat() == GE_FORMAT_8888) {294MOVZX(32, 8, stencilReg, MDisp(colorOffReg, 3));295} else if (id.FBFormat() == GE_FORMAT_5551) {296MOVZX(32, 8, stencilReg, MDisp(colorOffReg, 1));297SAR(8, R(stencilReg), Imm8(7));298} else if (id.FBFormat() == GE_FORMAT_4444) {299MOVZX(32, 8, stencilReg, MDisp(colorOffReg, 1));300SHR(32, R(stencilReg), Imm8(4));301X64Reg temp = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);302MOV(32, R(temp), R(stencilReg));303SHL(32, R(temp), Imm8(4));304OR(32, R(stencilReg), R(temp));305regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);306}307regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);308309return stencilReg;310}311312void PixelJitCache::Discard() {313discards_.push_back(J(true));314}315316void PixelJitCache::Discard(Gen::CCFlags cc) {317discards_.push_back(J_CC(cc, true));318}319320void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {321// This is used to add a fixed point 0.5 (as s.11.4) for blend factors to multiply accurately.322WriteSimpleConst8x16(constBlendHalf_11_4s_, 1 << 3);323324// This is used for shifted blend factors, to inverse them.325WriteSimpleConst8x16(constBlendInvert_11_4s_, 0xFF << 4);326}327328bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {329if (id.applyDepthRange && !id.earlyZChecks) {330Describe("ApplyDepthR");331X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);332X64Reg idReg = GetPixelID();333334// We expanded this to 32 bits, so it's convenient to compare.335CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, 
cached.minz)));336Discard(CC_L);337338// We load the low 16 bits, but compare all 32 of z. Above handles < 0.339CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.maxz)));340Discard(CC_G);341342UnlockPixelID(idReg);343regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);344}345346// Since this is early on, try to free up the z reg if we don't need it anymore.347if (id.clearMode && !id.DepthClear())348regCache_.ForceRelease(RegCache::GEN_ARG_Z);349else if (!id.clearMode && !id.depthWrite && (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks))350regCache_.ForceRelease(RegCache::GEN_ARG_Z);351352return true;353}354355bool PixelJitCache::Jit_AlphaTest(const PixelFuncID &id) {356// Take care of ALWAYS/NEVER first. ALWAYS is common, means disabled.357Describe("AlphaTest");358switch (id.AlphaTestFunc()) {359case GE_COMP_NEVER:360Discard();361return true;362363case GE_COMP_ALWAYS:364return true;365366default:367break;368}369370// Load alpha into its own general reg.371X64Reg alphaReg;372if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {373alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);374} else {375alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);376_assert_(!colorIs16Bit_);377X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);378MOVD_xmm(R(alphaReg), argColorReg);379regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);380SHR(32, R(alphaReg), Imm8(24));381}382383if (id.hasAlphaTestMask) {384// Unfortunate, we'll need pixelID to load the mask.385// Note: we leave the ALPHA purpose untouched and free it, because later code may reuse.386X64Reg idReg = GetPixelID();387X64Reg maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);388389MOVZX(32, 8, maskedReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaTestMask)));390UnlockPixelID(idReg);391AND(32, R(maskedReg), R(alphaReg));392regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);393394// Okay now do the rest using the masked reg, which we modified.395alphaReg = maskedReg;396}397398// We hardcode the ref 
into this jit func.399CMP(8, R(alphaReg), Imm8(id.alphaTestRef));400if (id.hasAlphaTestMask)401regCache_.Release(alphaReg, RegCache::GEN_TEMP0);402else403regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);404405switch (id.AlphaTestFunc()) {406case GE_COMP_NEVER:407case GE_COMP_ALWAYS:408break;409410case GE_COMP_EQUAL:411Discard(CC_NE);412break;413414case GE_COMP_NOTEQUAL:415Discard(CC_E);416break;417418case GE_COMP_LESS:419Discard(CC_AE);420break;421422case GE_COMP_LEQUAL:423Discard(CC_A);424break;425426case GE_COMP_GREATER:427Discard(CC_BE);428break;429430case GE_COMP_GEQUAL:431Discard(CC_B);432break;433}434435return true;436}437438bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) {439if (!id.colorTest || id.clearMode)440return true;441442// We'll have 4 with fog released, so we're using them all...443Describe("ColorTest");444X64Reg idReg = GetPixelID();445X64Reg funcReg = regCache_.Alloc(RegCache::GEN_TEMP0);446X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP1);447X64Reg refReg = regCache_.Alloc(RegCache::GEN_TEMP2);448449// First, load the registers: mask and ref.450MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestMask)));451MOV(32, R(refReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestRef)));452453X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);454if (colorIs16Bit_) {455// If it's expanded, we need to clamp anyway if it was fogged.456PACKUSWB(argColorReg, R(argColorReg));457colorIs16Bit_ = false;458}459460// Temporarily abuse funcReg to grab the color into maskReg.461MOVD_xmm(R(funcReg), argColorReg);462AND(32, R(maskReg), R(funcReg));463regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);464465// Now that we're setup, get the func and follow it.466MOVZX(32, 8, funcReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorTestFunc)));467UnlockPixelID(idReg);468469CMP(8, R(funcReg), Imm8(GE_COMP_ALWAYS));470// Discard for GE_COMP_NEVER...471Discard(CC_B);472FixupBranch skip = J_CC(CC_E);473474CMP(8, R(funcReg), 
Imm8(GE_COMP_EQUAL));475FixupBranch doEqual = J_CC(CC_E);476regCache_.Release(funcReg, RegCache::GEN_TEMP0);477478// The not equal path here... if they are equal, we discard.479CMP(32, R(refReg), R(maskReg));480Discard(CC_E);481FixupBranch skip2 = J();482483SetJumpTarget(doEqual);484CMP(32, R(refReg), R(maskReg));485Discard(CC_NE);486487regCache_.Release(maskReg, RegCache::GEN_TEMP1);488regCache_.Release(refReg, RegCache::GEN_TEMP2);489490SetJumpTarget(skip);491SetJumpTarget(skip2);492493return true;494}495496bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {497if (!id.applyFog) {498// Okay, anyone can use the fog register then.499regCache_.ForceRelease(RegCache::GEN_ARG_FOG);500return true;501}502503// Load fog and expand to 16 bit. Ignore the high 8 bits, which'll match up with A.504Describe("ApplyFog");505X64Reg fogColorReg = regCache_.Alloc(RegCache::VEC_TEMP1);506X64Reg idReg = GetPixelID();507if (cpu_info.bSSE4_1) {508PMOVZXBW(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));509} else {510X64Reg zeroReg = GetZeroVec();511MOVD_xmm(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));512PUNPCKLBW(fogColorReg, R(zeroReg));513regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);514}515UnlockPixelID(idReg);516517// Load a set of 255s at 16 bit into a reg for later...518X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);519PCMPEQW(invertReg, R(invertReg));520PSRLW(invertReg, 8);521522// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.523X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);524if (!colorIs16Bit_) {525if (cpu_info.bSSE4_1) {526PMOVZXBW(argColorReg, R(argColorReg));527} else {528X64Reg zeroReg = GetZeroVec();529PUNPCKLBW(argColorReg, R(zeroReg));530regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);531}532colorIs16Bit_ = true;533}534535// Save A so we can put it back, we don't "fog" A.536X64Reg alphaReg;537if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {538alphaReg = 
regCache_.Find(RegCache::GEN_SRC_ALPHA);539} else {540alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);541PEXTRW(alphaReg, argColorReg, 3);542}543544// Okay, let's broadcast fog to an XMM.545X64Reg fogMultReg = regCache_.Alloc(RegCache::VEC_TEMP3);546X64Reg argFogReg = regCache_.Find(RegCache::GEN_ARG_FOG);547MOVD_xmm(fogMultReg, R(argFogReg));548PSHUFLW(fogMultReg, R(fogMultReg), _MM_SHUFFLE(0, 0, 0, 0));549regCache_.Unlock(argFogReg, RegCache::GEN_ARG_FOG);550// We can free up the actual fog reg now.551regCache_.ForceRelease(RegCache::GEN_ARG_FOG);552553// Our goal here is to calculate this formula:554// (argColor * fog + fogColor * (255 - fog) + 255) / 256555556// Now we multiply the existing color by fog...557PMULLW(argColorReg, R(fogMultReg));558// Before inversing, let's add that 255 we loaded in as well, since we have it.559PADDW(argColorReg, R(invertReg));560// And then inverse the fog value using those 255s, and multiply by fog color.561PSUBW(invertReg, R(fogMultReg));562PMULLW(fogColorReg, R(invertReg));563// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.564PADDW(argColorReg, R(fogColorReg));565regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);566regCache_.Release(invertReg, RegCache::VEC_TEMP2);567regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);568569// Now we simply divide by 256, or in other words shift by 8.570PSRLW(argColorReg, 8);571572// Okay, put A back in, we'll shrink it to 8888 when needed.573PINSRW(argColorReg, R(alphaReg), 3);574regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);575576// We most likely won't use alphaReg again.577regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);578579return true;580}581582bool PixelJitCache::Jit_StencilAndDepthTest(const PixelFuncID &id) {583_assert_(!id.clearMode && id.stencilTest);584585X64Reg stencilReg = GetDestStencil(id);586Describe("StencilAndDepth");587X64Reg maskedReg = stencilReg;588if (id.hasStencilTestMask && stencilReg != INVALID_REG) 
{589X64Reg idReg = GetPixelID();590maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);591MOV(32, R(maskedReg), R(stencilReg));592AND(8, R(maskedReg), MDisp(idReg, offsetof(PixelFuncID, cached.stencilTestMask)));593UnlockPixelID(idReg);594}595596bool success = true;597success = success && Jit_StencilTest(id, stencilReg, maskedReg);598if (maskedReg != stencilReg)599regCache_.Release(maskedReg, RegCache::GEN_TEMP0);600601// Next up, the depth test.602if (stencilReg == INVALID_REG) {603// Just use the standard one, since we don't need to write stencil.604// We also don't need to worry about cleanup either.605return success && Jit_DepthTest(id);606}607608success = success && Jit_DepthTestForStencil(id, stencilReg);609success = success && Jit_ApplyStencilOp(id, id.ZPass(), stencilReg);610611// At this point, stencilReg can't be spilled. It contains the updated value.612regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);613regCache_.ForceRetain(RegCache::GEN_STENCIL);614615return success;616}617618bool PixelJitCache::Jit_StencilTest(const PixelFuncID &id, RegCache::Reg stencilReg, RegCache::Reg maskedReg) {619Describe("StencilTest");620621bool hasFixedResult = false;622bool fixedResult = false;623FixupBranch toPass;624if (stencilReg == INVALID_REG) {625// This means stencil is a fixed value 0.626hasFixedResult = true;627switch (id.StencilTestFunc()) {628case GE_COMP_NEVER: fixedResult = false; break;629case GE_COMP_ALWAYS: fixedResult = true; break;630case GE_COMP_EQUAL: fixedResult = id.stencilTestRef == 0; break;631case GE_COMP_NOTEQUAL: fixedResult = id.stencilTestRef != 0; break;632case GE_COMP_LESS: fixedResult = false; break;633case GE_COMP_LEQUAL: fixedResult = id.stencilTestRef == 0; break;634case GE_COMP_GREATER: fixedResult = id.stencilTestRef != 0; break;635case GE_COMP_GEQUAL: fixedResult = true; break;636}637} else if (id.StencilTestFunc() == GE_COMP_ALWAYS) {638// Fairly common, skip the CMP.639hasFixedResult = true;640fixedResult = true;641} else {642// 
Reversed here because of the imm, so tests below are reversed.643CMP(8, R(maskedReg), Imm8(id.stencilTestRef));644switch (id.StencilTestFunc()) {645case GE_COMP_NEVER:646hasFixedResult = true;647fixedResult = false;648break;649650case GE_COMP_ALWAYS:651_assert_(false);652break;653654case GE_COMP_EQUAL:655toPass = J_CC(CC_E);656break;657658case GE_COMP_NOTEQUAL:659toPass = J_CC(CC_NE);660break;661662case GE_COMP_LESS:663toPass = J_CC(CC_A);664break;665666case GE_COMP_LEQUAL:667toPass = J_CC(CC_AE);668break;669670case GE_COMP_GREATER:671toPass = J_CC(CC_B);672break;673674case GE_COMP_GEQUAL:675toPass = J_CC(CC_BE);676break;677}678}679680if (hasFixedResult && !fixedResult && stencilReg == INVALID_REG) {681Discard();682return true;683}684685bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);686bool hadIdReg = regCache_.Has(RegCache::GEN_ID);687688bool success = true;689if (stencilReg != INVALID_REG && (!hasFixedResult || !fixedResult)) {690// This is the fail path.691success = success && Jit_ApplyStencilOp(id, id.SFail(), stencilReg);692success = success && Jit_WriteStencilOnly(id, stencilReg);693694Discard();695}696697// If we allocated either id or colorOff in the conditional, forget.698if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))699regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);700if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))701regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);702703if (!hasFixedResult)704SetJumpTarget(toPass);705return success;706}707708bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, RegCache::Reg stencilReg) {709if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)710return true;711712X64Reg depthOffReg = GetDepthOff(id);713Describe("DepthTestStencil");714X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);715CMP(16, R(argZReg), MatR(depthOffReg));716regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);717regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);718719// 
We discard the opposite of the passing test.720FixupBranch skip;721switch (id.DepthTestFunc()) {722case GE_COMP_NEVER:723// Shouldn't happen, just do an extra CMP.724CMP(32, R(RAX), R(RAX));725// This is just to have a skip that is valid.726skip = J_CC(CC_NE);727break;728729case GE_COMP_ALWAYS:730// Shouldn't happen, just do an extra CMP.731CMP(32, R(RAX), R(RAX));732skip = J_CC(CC_E);733break;734735case GE_COMP_EQUAL:736skip = J_CC(CC_E);737break;738739case GE_COMP_NOTEQUAL:740skip = J_CC(CC_NE);741break;742743case GE_COMP_LESS:744skip = J_CC(CC_B);745break;746747case GE_COMP_LEQUAL:748skip = J_CC(CC_BE);749break;750751case GE_COMP_GREATER:752skip = J_CC(CC_A);753break;754755case GE_COMP_GEQUAL:756skip = J_CC(CC_AE);757break;758}759760bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);761bool hadIdReg = regCache_.Has(RegCache::GEN_ID);762763bool success = true;764success = success && Jit_ApplyStencilOp(id, id.ZFail(), stencilReg);765success = success && Jit_WriteStencilOnly(id, stencilReg);766Discard();767768// If we allocated either id or colorOff in the conditional, forget.769if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))770regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);771if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))772regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);773774SetJumpTarget(skip);775776// Like in Jit_DepthTest(), at this point we may not need this reg anymore.777if (!id.depthWrite)778regCache_.ForceRelease(RegCache::GEN_ARG_Z);779780return success;781}782783bool PixelJitCache::Jit_ApplyStencilOp(const PixelFuncID &id, GEStencilOp op, RegCache::Reg stencilReg) {784_assert_(stencilReg != INVALID_REG);785786Describe("ApplyStencil");787FixupBranch skip;788switch (op) {789case GE_STENCILOP_KEEP:790// Nothing to do.791break;792793case GE_STENCILOP_ZERO:794XOR(32, R(stencilReg), R(stencilReg));795break;796797case GE_STENCILOP_REPLACE:798if (id.hasStencilTestMask) {799// Load the unmasked 
value.800X64Reg idReg = GetPixelID();801MOVZX(32, 8, stencilReg, MDisp(idReg, offsetof(PixelFuncID, cached.stencilRef)));802UnlockPixelID(idReg);803} else {804MOV(8, R(stencilReg), Imm8(id.stencilTestRef));805}806break;807808case GE_STENCILOP_INVERT:809NOT(8, R(stencilReg));810break;811812case GE_STENCILOP_INCR:813switch (id.fbFormat) {814case GE_FORMAT_565:815break;816817case GE_FORMAT_5551:818MOV(8, R(stencilReg), Imm8(0xFF));819break;820821case GE_FORMAT_4444:822CMP(8, R(stencilReg), Imm8(0xF0));823skip = J_CC(CC_AE);824ADD(8, R(stencilReg), Imm8(0x11));825SetJumpTarget(skip);826break;827828case GE_FORMAT_8888:829CMP(8, R(stencilReg), Imm8(0xFF));830skip = J_CC(CC_E);831ADD(8, R(stencilReg), Imm8(0x01));832SetJumpTarget(skip);833break;834}835break;836837case GE_STENCILOP_DECR:838switch (id.fbFormat) {839case GE_FORMAT_565:840break;841842case GE_FORMAT_5551:843XOR(32, R(stencilReg), R(stencilReg));844break;845846case GE_FORMAT_4444:847CMP(8, R(stencilReg), Imm8(0x11));848skip = J_CC(CC_B);849SUB(8, R(stencilReg), Imm8(0x11));850SetJumpTarget(skip);851break;852853case GE_FORMAT_8888:854CMP(8, R(stencilReg), Imm8(0x00));855skip = J_CC(CC_E);856SUB(8, R(stencilReg), Imm8(0x01));857SetJumpTarget(skip);858break;859}860break;861}862863return true;864}865866bool PixelJitCache::Jit_WriteStencilOnly(const PixelFuncID &id, RegCache::Reg stencilReg) {867_assert_(stencilReg != INVALID_REG);868869// It's okay to destroy stencilReg here, we know we're the last writing it.870X64Reg colorOffReg = GetColorOff(id);871Describe("WriteStencil");872if (id.applyColorWriteMask) {873X64Reg idReg = GetPixelID();874X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP5);875876switch (id.fbFormat) {877case GE_FORMAT_565:878break;879880case GE_FORMAT_5551:881// Read the high 8 bits of the 16-bit color mask.882MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));883OR(8, R(maskReg), Imm8(0x7F));884885// Poor man's BIC...886NOT(32, R(stencilReg));887OR(32, 
R(stencilReg), R(maskReg));888NOT(32, R(stencilReg));889890AND(8, MDisp(colorOffReg, 1), R(maskReg));891OR(8, MDisp(colorOffReg, 1), R(stencilReg));892break;893894case GE_FORMAT_4444:895// Read the high 8 bits of the 16-bit color mask.896MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));897OR(8, R(maskReg), Imm8(0x0F));898899// Poor man's BIC...900NOT(32, R(stencilReg));901OR(32, R(stencilReg), R(maskReg));902NOT(32, R(stencilReg));903904AND(8, MDisp(colorOffReg, 1), R(maskReg));905OR(8, MDisp(colorOffReg, 1), R(stencilReg));906break;907908case GE_FORMAT_8888:909// Read the high 8 bits of the 32-bit color mask.910MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 3));911912// Poor man's BIC...913NOT(32, R(stencilReg));914OR(32, R(stencilReg), R(maskReg));915NOT(32, R(stencilReg));916917AND(8, MDisp(colorOffReg, 3), R(maskReg));918OR(8, MDisp(colorOffReg, 3), R(stencilReg));919break;920}921922regCache_.Release(maskReg, RegCache::GEN_TEMP5);923UnlockPixelID(idReg);924} else {925switch (id.fbFormat) {926case GE_FORMAT_565:927break;928929case GE_FORMAT_5551:930AND(8, R(stencilReg), Imm8(0x80));931AND(8, MDisp(colorOffReg, 1), Imm8(0x7F));932OR(8, MDisp(colorOffReg, 1), R(stencilReg));933break;934935case GE_FORMAT_4444:936AND(8, MDisp(colorOffReg, 1), Imm8(0x0F));937AND(8, R(stencilReg), Imm8(0xF0));938OR(8, MDisp(colorOffReg, 1), R(stencilReg));939break;940941case GE_FORMAT_8888:942MOV(8, MDisp(colorOffReg, 3), R(stencilReg));943break;944}945}946947regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);948return true;949}950951bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) {952if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)953return true;954955if (id.DepthTestFunc() == GE_COMP_NEVER) {956Discard();957// This should be uncommon, just keep going to have shared cleanup...958}959960X64Reg depthOffReg = GetDepthOff(id);961Describe("DepthTest");962X64Reg argZReg = 
regCache_.Find(RegCache::GEN_ARG_Z);963CMP(16, R(argZReg), MatR(depthOffReg));964regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);965regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);966967// We discard the opposite of the passing test.968switch (id.DepthTestFunc()) {969case GE_COMP_NEVER:970case GE_COMP_ALWAYS:971break;972973case GE_COMP_EQUAL:974Discard(CC_NE);975break;976977case GE_COMP_NOTEQUAL:978Discard(CC_E);979break;980981case GE_COMP_LESS:982Discard(CC_AE);983break;984985case GE_COMP_LEQUAL:986Discard(CC_A);987break;988989case GE_COMP_GREATER:990Discard(CC_BE);991break;992993case GE_COMP_GEQUAL:994Discard(CC_B);995break;996}997998// If we're not writing, we don't need Z anymore. We'll free GEN_DEPTH_OFF in Jit_WriteDepth().999if (!id.depthWrite)1000regCache_.ForceRelease(RegCache::GEN_ARG_Z);10011002return true;1003}10041005bool PixelJitCache::Jit_WriteDepth(const PixelFuncID &id) {1006// Clear mode shares depthWrite for DepthClear().1007if (id.depthWrite) {1008X64Reg depthOffReg = GetDepthOff(id);1009Describe("WriteDepth");1010X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);1011MOV(16, MatR(depthOffReg), R(argZReg));1012regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);1013regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);1014regCache_.ForceRelease(RegCache::GEN_ARG_Z);1015}10161017// We can free up this reg if we force locked it.1018if (regCache_.Has(RegCache::GEN_DEPTH_OFF)) {1019regCache_.ForceRelease(RegCache::GEN_DEPTH_OFF);1020}10211022return true;1023}10241025bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {1026if (!id.alphaBlend)1027return true;10281029// Check if we need to load and prep factors.1030PixelBlendState blendState;1031ComputePixelBlendState(blendState, id);10321033bool success = true;10341035// Step 1: Load and expand dest color.1036X64Reg dstReg = regCache_.Alloc(RegCache::VEC_TEMP0);1037if (!blendState.readsDstPixel) {1038// Let's load colorOff just for registers to be consistent.1039X64Reg colorOff = 
GetColorOff(id);1040regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);10411042PXOR(dstReg, R(dstReg));1043} else if (id.FBFormat() == GE_FORMAT_8888) {1044X64Reg colorOff = GetColorOff(id);1045Describe("AlphaBlend");1046MOVD_xmm(dstReg, MatR(colorOff));1047regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);1048} else {1049X64Reg colorOff = GetColorOff(id);1050Describe("AlphaBlend");1051X64Reg dstGenReg = regCache_.Alloc(RegCache::GEN_TEMP0);1052MOVZX(32, 16, dstGenReg, MatR(colorOff));1053regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);10541055X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);1056X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);10571058switch (id.fbFormat) {1059case GE_FORMAT_565:1060success = success && Jit_ConvertFrom565(id, dstGenReg, temp1Reg, temp2Reg);1061break;10621063case GE_FORMAT_5551:1064success = success && Jit_ConvertFrom5551(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);1065break;10661067case GE_FORMAT_4444:1068success = success && Jit_ConvertFrom4444(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);1069break;10701071case GE_FORMAT_8888:1072break;1073}10741075Describe("AlphaBlend");1076MOVD_xmm(dstReg, R(dstGenReg));10771078regCache_.Release(dstGenReg, RegCache::GEN_TEMP0);1079regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);1080regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);1081}10821083// Step 2: Load and apply factors.1084X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);1085if (blendState.usesFactors) {1086X64Reg srcFactorReg = regCache_.Alloc(RegCache::VEC_TEMP1);1087X64Reg dstFactorReg = regCache_.Alloc(RegCache::VEC_TEMP2);10881089// We apply these at 16-bit, because they can be doubled and have a half offset.1090if (cpu_info.bSSE4_1) {1091if (!colorIs16Bit_)1092PMOVZXBW(argColorReg, R(argColorReg));1093PMOVZXBW(dstReg, R(dstReg));1094} else {1095X64Reg zeroReg = GetZeroVec();1096if (!colorIs16Bit_)1097PUNPCKLBW(argColorReg, R(zeroReg));1098PUNPCKLBW(dstReg, 
R(zeroReg));1099regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1100}1101colorIs16Bit_ = true;11021103// Skip multiplying by factors if we can.1104bool multiplySrc = id.AlphaBlendSrc() != PixelBlendFactor::ZERO && id.AlphaBlendSrc() != PixelBlendFactor::ONE;1105bool multiplyDst = id.AlphaBlendDst() != PixelBlendFactor::ZERO && id.AlphaBlendDst() != PixelBlendFactor::ONE;1106// We also shift left by 4, so mulhi gives us a free shift1107// We also need to add a half bit later, so this gives us space.1108if (multiplySrc || blendState.srcColorAsFactor)1109PSLLW(argColorReg, 4);1110if (multiplyDst || blendState.dstColorAsFactor || blendState.usesDstAlpha)1111PSLLW(dstReg, 4);11121113// Okay, now grab our factors. Don't bother if they're known values.1114if (id.AlphaBlendSrc() < PixelBlendFactor::ZERO)1115success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());1116if (id.AlphaBlendDst() < PixelBlendFactor::ZERO)1117success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);11181119X64Reg halfReg = INVALID_REG;1120if (multiplySrc || multiplyDst) {1121halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);1122// We'll use this several times, so load into a reg.1123MOVDQA(halfReg, M(constBlendHalf_11_4s_));1124}11251126// Add in the half bit to the factors and color values, then multiply.1127// We take the high 16 bits to get a free right shift by 16.1128if (multiplySrc) {1129POR(srcFactorReg, R(halfReg));1130POR(argColorReg, R(halfReg));1131PMULHUW(argColorReg, R(srcFactorReg));1132} else if (id.AlphaBlendSrc() == PixelBlendFactor::ZERO) {1133PXOR(argColorReg, R(argColorReg));1134} else if (id.AlphaBlendSrc() == PixelBlendFactor::ONE) {1135if (blendState.srcColorAsFactor)1136PSRLW(argColorReg, 4);1137}11381139if (multiplyDst) {1140POR(dstFactorReg, R(halfReg));1141POR(dstReg, R(halfReg));1142PMULHUW(dstReg, R(dstFactorReg));1143} else if (id.AlphaBlendDst() == PixelBlendFactor::ZERO) {1144// No need to add or subtract zero, unless 
we're negating.1145// This is common for bloom preparation.1146if (id.AlphaBlendEq() == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)1147PXOR(dstReg, R(dstReg));1148} else if (id.AlphaBlendDst() == PixelBlendFactor::ONE) {1149if (blendState.dstColorAsFactor || blendState.usesDstAlpha)1150PSRLW(dstReg, 4);1151}11521153regCache_.Release(srcFactorReg, RegCache::VEC_TEMP1);1154regCache_.Release(dstFactorReg, RegCache::VEC_TEMP2);1155if (halfReg != INVALID_REG)1156regCache_.Release(halfReg, RegCache::VEC_TEMP3);1157} else if (colorIs16Bit_) {1158// If it's expanded, shrink and clamp for our min/max/absdiff handling.1159PACKUSWB(argColorReg, R(argColorReg));1160colorIs16Bit_ = false;1161}11621163// Step 3: Apply equation.1164// Note: below, we completely ignore what happens to the alpha bits.1165// It won't matter, since we'll replace those with stencil anyway.1166X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP1);1167switch (id.AlphaBlendEq()) {1168case GE_BLENDMODE_MUL_AND_ADD:1169if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)1170PADDUSW(argColorReg, R(dstReg));1171break;11721173case GE_BLENDMODE_MUL_AND_SUBTRACT:1174if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)1175PSUBUSW(argColorReg, R(dstReg));1176break;11771178case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:1179if (cpu_info.bAVX) {1180VPSUBUSW(128, argColorReg, dstReg, R(argColorReg));1181} else {1182MOVDQA(tempReg, R(argColorReg));1183MOVDQA(argColorReg, R(dstReg));1184PSUBUSW(argColorReg, R(tempReg));1185}1186break;11871188case GE_BLENDMODE_MIN:1189PMINUB(argColorReg, R(dstReg));1190break;11911192case GE_BLENDMODE_MAX:1193PMAXUB(argColorReg, R(dstReg));1194break;11951196case GE_BLENDMODE_ABSDIFF:1197// Calculate A=(dst-src < 0 ? 0 : dst-src) and B=(src-dst < 0 ? 
0 : src-dst)...1198MOVDQA(tempReg, R(dstReg));1199PSUBUSB(tempReg, R(argColorReg));1200PSUBUSB(argColorReg, R(dstReg));12011202// Now, one of those must be zero, and the other one is the result (could also be zero.)1203POR(argColorReg, R(tempReg));1204break;1205}12061207regCache_.Release(dstReg, RegCache::VEC_TEMP0);1208regCache_.Release(tempReg, RegCache::VEC_TEMP1);1209regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);12101211return success;1212}12131214bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor) {1215X64Reg idReg = INVALID_REG;1216X64Reg tempReg = INVALID_REG;1217X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);12181219// Everything below expects an expanded 16-bit color1220_assert_(colorIs16Bit_);12211222// Between source and dest factors, only DSTCOLOR, INVDSTCOLOR, and FIXA differ.1223// In those cases, it uses SRCCOLOR, INVSRCCOLOR, and FIXB respectively.12241225// Load the invert constant first off, if needed.1226switch (factor) {1227case PixelBlendFactor::INVOTHERCOLOR:1228case PixelBlendFactor::INVSRCALPHA:1229case PixelBlendFactor::INVDSTALPHA:1230case PixelBlendFactor::DOUBLEINVSRCALPHA:1231case PixelBlendFactor::DOUBLEINVDSTALPHA:1232MOVDQA(factorReg, M(constBlendInvert_11_4s_));1233break;12341235default:1236break;1237}12381239switch (factor) {1240case PixelBlendFactor::OTHERCOLOR:1241MOVDQA(factorReg, R(dstReg));1242break;12431244case PixelBlendFactor::INVOTHERCOLOR:1245PSUBUSW(factorReg, R(dstReg));1246break;12471248case PixelBlendFactor::SRCALPHA:1249PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));1250break;12511252case PixelBlendFactor::INVSRCALPHA:1253tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);12541255PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));1256PSUBUSW(factorReg, R(tempReg));1257break;12581259case PixelBlendFactor::DSTALPHA:1260PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));1261break;12621263case 
PixelBlendFactor::INVDSTALPHA:1264tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);12651266PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));1267PSUBUSW(factorReg, R(tempReg));1268break;12691270case PixelBlendFactor::DOUBLESRCALPHA:1271PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));1272PSLLW(factorReg, 1);1273break;12741275case PixelBlendFactor::DOUBLEINVSRCALPHA:1276tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);12771278PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));1279PSLLW(tempReg, 1);1280PSUBUSW(factorReg, R(tempReg));1281break;12821283case PixelBlendFactor::DOUBLEDSTALPHA:1284PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));1285PSLLW(factorReg, 1);1286break;12871288case PixelBlendFactor::DOUBLEINVDSTALPHA:1289tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);12901291PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));1292PSLLW(tempReg, 1);1293PSUBUSW(factorReg, R(tempReg));1294break;12951296case PixelBlendFactor::ZERO:1297// Special value meaning zero.1298PXOR(factorReg, R(factorReg));1299break;13001301case PixelBlendFactor::ONE:1302// Special value meaning all 255s.1303PCMPEQD(factorReg, R(factorReg));1304PSLLW(factorReg, 8);1305PSRLW(factorReg, 4);1306break;13071308case PixelBlendFactor::FIX:1309default:1310idReg = GetPixelID();1311if (cpu_info.bSSE4_1) {1312PMOVZXBW(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));1313} else {1314X64Reg zeroReg = GetZeroVec();1315MOVD_xmm(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));1316PUNPCKLBW(factorReg, R(zeroReg));1317regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1318}1319// Round it out by shifting into place.1320PSLLW(factorReg, 4);1321break;1322}13231324if (idReg != INVALID_REG)1325UnlockPixelID(idReg);1326if (tempReg != INVALID_REG)1327regCache_.Release(tempReg, RegCache::VEC_TEMP3);1328regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);13291330return true;1331}13321333bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, 
RegCache::Reg srcFactorReg, RegCache::Reg dstFactorReg, RegCache::Reg dstReg) {1334bool success = true;1335X64Reg idReg = INVALID_REG;1336X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);13371338// Everything below expects an expanded 16-bit color1339_assert_(colorIs16Bit_);13401341PixelBlendState blendState;1342ComputePixelBlendState(blendState, id);13431344// We might be able to reuse srcFactorReg for dst, in some cases.1345switch (id.AlphaBlendDst()) {1346case PixelBlendFactor::OTHERCOLOR:1347MOVDQA(dstFactorReg, R(argColorReg));1348break;13491350case PixelBlendFactor::INVOTHERCOLOR:1351MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));1352PSUBUSW(dstFactorReg, R(argColorReg));1353break;13541355case PixelBlendFactor::SRCALPHA:1356case PixelBlendFactor::INVSRCALPHA:1357case PixelBlendFactor::DSTALPHA:1358case PixelBlendFactor::INVDSTALPHA:1359case PixelBlendFactor::DOUBLESRCALPHA:1360case PixelBlendFactor::DOUBLEINVSRCALPHA:1361case PixelBlendFactor::DOUBLEDSTALPHA:1362case PixelBlendFactor::DOUBLEINVDSTALPHA:1363case PixelBlendFactor::ZERO:1364case PixelBlendFactor::ONE:1365// These are all equivalent for src factor, so reuse that logic.1366if (id.AlphaBlendSrc() == id.AlphaBlendDst()) {1367MOVDQA(dstFactorReg, R(srcFactorReg));1368} else if (blendState.dstFactorIsInverse) {1369MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));1370PSUBUSW(dstFactorReg, R(srcFactorReg));1371} else {1372success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, id.AlphaBlendDst());1373}1374break;13751376case PixelBlendFactor::FIX:1377default:1378idReg = GetPixelID();1379if (cpu_info.bSSE4_1) {1380PMOVZXBW(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));1381} else {1382X64Reg zeroReg = GetZeroVec();1383MOVD_xmm(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));1384PUNPCKLBW(dstFactorReg, R(zeroReg));1385regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1386}1387// Round it out by shifting into place.1388PSLLW(dstFactorReg, 
4);1389break;1390}13911392if (idReg != INVALID_REG)1393UnlockPixelID(idReg);1394regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);13951396return success;1397}13981399bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {1400if (!id.dithering)1401return true;14021403Describe("Dither");1404X64Reg valueReg = regCache_.Alloc(RegCache::GEN_TEMP0);14051406// Load the row dither matrix entry (will still need to get the X.)1407X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);1408MOV(32, R(valueReg), R(argYReg));1409AND(32, R(valueReg), Imm8(3));14101411// At this point, we're done with depth and y, so let's grab GEN_COLOR_OFF and retain it.1412// Then we can modify x and throw it away too, which is our actual goal.1413X64Reg colorOffReg = GetColorOff(id);1414Describe("Dither");1415regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);1416regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);1417// And get rid of y, we can use for other regs.1418regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);1419regCache_.ForceRelease(RegCache::GEN_ARG_Y);14201421X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);1422AND(32, R(argXReg), Imm32(3));14231424// Sum up (x + y * 4) + ditherMatrix offset to valueReg.1425LEA(32, valueReg, MComplex(argXReg, valueReg, 4, offsetof(PixelFuncID, cached.ditherMatrix)));14261427// Okay, now abuse argXReg to read the PixelFuncID pointer on the stack.1428if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {1429X64Reg idReg = GetPixelID();1430MOVSX(32, 8, valueReg, MRegSum(idReg, valueReg));1431UnlockPixelID(idReg);1432} else {1433_assert_(stackIDOffset_ != -1);1434MOV(PTRBITS, R(argXReg), MDisp(RSP, stackIDOffset_));1435MOVSX(32, 8, valueReg, MRegSum(argXReg, valueReg));1436}1437regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);1438regCache_.ForceRelease(RegCache::GEN_ARG_X);14391440// Copy that value into a vec to add to the color.1441X64Reg vecValueReg = regCache_.Alloc(RegCache::VEC_TEMP0);1442MOVD_xmm(vecValueReg, 
R(valueReg));1443regCache_.Release(valueReg, RegCache::GEN_TEMP0);14441445// Now we want to broadcast RGB in 16-bit, but keep A as 0.1446// Luckily, we know that third lane (in 16-bit) is zero from MOVD clearing it.1447// We use 16-bit because we need a signed add, but we also want to saturate.1448PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(2, 0, 0, 0));14491450// With that, now let's convert the color to 16 bit...1451X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);1452if (!colorIs16Bit_) {1453if (cpu_info.bSSE4_1) {1454PMOVZXBW(argColorReg, R(argColorReg));1455} else {1456X64Reg zeroReg = GetZeroVec();1457PUNPCKLBW(argColorReg, R(zeroReg));1458regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);1459}1460colorIs16Bit_ = true;1461}1462// And simply add the dither values.1463PADDSW(argColorReg, R(vecValueReg));1464regCache_.Release(vecValueReg, RegCache::VEC_TEMP0);1465regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);14661467return true;1468}14691470bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {1471X64Reg colorOff = GetColorOff(id);1472Describe("WriteColor");1473if (regCache_.Has(RegCache::GEN_ARG_X)) {1474// We normally toss x and y during dithering or useStandardStride with no dithering.1475// Free up the regs now to get more reg space.1476regCache_.ForceRelease(RegCache::GEN_ARG_X);1477regCache_.ForceRelease(RegCache::GEN_ARG_Y);14781479// But make sure we don't lose GEN_COLOR_OFF, we'll be lost without that now.1480regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);1481}14821483// Convert back to 8888 and clamp.1484X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);1485if (colorIs16Bit_) {1486PACKUSWB(argColorReg, R(argColorReg));1487colorIs16Bit_ = false;1488}14891490if (id.clearMode) {1491bool drawingDone = false;1492if (!id.ColorClear() && !id.StencilClear())1493drawingDone = true;1494if (!id.ColorClear() && id.FBFormat() == GE_FORMAT_565)1495drawingDone = true;14961497bool success = true;1498if (!id.ColorClear() && 
!drawingDone) {1499// Let's reuse Jit_WriteStencilOnly for this path.1500X64Reg alphaReg;1501if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {1502alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);1503} else {1504alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);1505MOVD_xmm(R(alphaReg), argColorReg);1506SHR(32, R(alphaReg), Imm8(24));1507}1508success = Jit_WriteStencilOnly(id, alphaReg);1509regCache_.Release(alphaReg, RegCache::GEN_SRC_ALPHA);15101511drawingDone = true;1512}15131514if (drawingDone) {1515regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);1516regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);1517regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);1518regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);1519return success;1520}15211522// In this case, we're clearing only color or only color and stencil. Proceed.1523}15241525X64Reg colorReg = regCache_.Alloc(RegCache::GEN_TEMP0);1526MOVD_xmm(R(colorReg), argColorReg);1527regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);1528regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);15291530X64Reg stencilReg = INVALID_REG;1531if (regCache_.Has(RegCache::GEN_STENCIL))1532stencilReg = regCache_.Find(RegCache::GEN_STENCIL);15331534X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);1535X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);1536bool convertAlpha = id.clearMode && id.StencilClear();1537bool writeAlpha = convertAlpha || stencilReg != INVALID_REG;1538uint32_t fixedKeepMask = 0x00000000;15391540bool success = true;15411542// Step 1: Load the color into colorReg.1543switch (id.fbFormat) {1544case GE_FORMAT_565:1545// In this case, stencil doesn't matter.1546success = success && Jit_ConvertTo565(id, colorReg, temp1Reg, temp2Reg);1547break;15481549case GE_FORMAT_5551:1550success = success && Jit_ConvertTo5551(id, colorReg, temp1Reg, temp2Reg, convertAlpha);15511552if (stencilReg != INVALID_REG) {1553// Truncate off the top bit of the stencil.1554SHR(32, R(stencilReg), Imm8(7));1555SHL(32, R(stencilReg), 
Imm8(15));1556} else if (!writeAlpha) {1557fixedKeepMask = 0x8000;1558}1559break;15601561case GE_FORMAT_4444:1562success = success && Jit_ConvertTo4444(id, colorReg, temp1Reg, temp2Reg, convertAlpha);15631564if (stencilReg != INVALID_REG) {1565// Truncate off the top bit of the stencil.1566SHR(32, R(stencilReg), Imm8(4));1567SHL(32, R(stencilReg), Imm8(12));1568} else if (!writeAlpha) {1569fixedKeepMask = 0xF000;1570}1571break;15721573case GE_FORMAT_8888:1574if (stencilReg != INVALID_REG) {1575SHL(32, R(stencilReg), Imm8(24));1576// Clear out the alpha bits so we can fit the stencil.1577AND(32, R(colorReg), Imm32(0x00FFFFFF));1578} else if (!writeAlpha) {1579fixedKeepMask = 0xFF000000;1580}1581break;1582}15831584// Step 2: Load write mask if needed.1585// Note that we apply the write mask at the destination bit depth.1586Describe("WriteColor");1587X64Reg maskReg = INVALID_REG;1588if (id.applyColorWriteMask) {1589maskReg = regCache_.Alloc(RegCache::GEN_TEMP3);1590// Load the pre-converted and combined write mask.1591if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {1592X64Reg idReg = GetPixelID();1593MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask)));1594UnlockPixelID(idReg);1595} else {1596_assert_(stackIDOffset_ != -1);1597MOV(PTRBITS, R(maskReg), MDisp(RSP, stackIDOffset_));1598MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask)));1599}1600}16011602// We've run out of regs, let's live without temp2 from here on.1603regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);16041605// Step 3: Apply logic op, combine stencil.1606skipStandardWrites_.clear();1607if (id.applyLogicOp) {1608// Note: we combine stencil during logic op, because it's a bit complex to retain.1609success = success && Jit_ApplyLogicOp(id, colorReg, maskReg);1610} else if (stencilReg != INVALID_REG) {1611OR(32, R(colorReg), R(stencilReg));1612}16131614// Step 4: Write and apply write 
mask.1615Describe("WriteColor");1616switch (id.fbFormat) {1617case GE_FORMAT_565:1618case GE_FORMAT_5551:1619case GE_FORMAT_4444:1620if (maskReg != INVALID_REG) {1621// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.1622AND(16, MatR(colorOff), R(maskReg));1623if (cpu_info.bBMI1) {1624ANDN(32, colorReg, maskReg, R(colorReg));1625} else {1626NOT(32, R(maskReg));1627AND(32, R(colorReg), R(maskReg));1628}1629OR(16, MatR(colorOff), R(colorReg));1630} else if (fixedKeepMask == 0) {1631MOV(16, MatR(colorOff), R(colorReg));1632} else {1633// Clear the non-stencil bits and or in the color.1634AND(16, MatR(colorOff), Imm16((uint16_t)fixedKeepMask));1635OR(16, MatR(colorOff), R(colorReg));1636}1637break;16381639case GE_FORMAT_8888:1640if (maskReg != INVALID_REG) {1641// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.1642AND(32, MatR(colorOff), R(maskReg));1643if (cpu_info.bBMI1) {1644ANDN(32, colorReg, maskReg, R(colorReg));1645} else {1646NOT(32, R(maskReg));1647AND(32, R(colorReg), R(maskReg));1648}1649OR(32, MatR(colorOff), R(colorReg));1650} else if (fixedKeepMask == 0) {1651MOV(32, MatR(colorOff), R(colorReg));1652} else if (fixedKeepMask == 0xFF000000) {1653// We want to set 24 bits only, since we're not changing stencil.1654// For now, let's do two writes rather than reading in the old stencil.1655MOV(16, MatR(colorOff), R(colorReg));1656SHR(32, R(colorReg), Imm8(16));1657MOV(8, MDisp(colorOff, 2), R(colorReg));1658} else {1659AND(32, MatR(colorOff), Imm32(fixedKeepMask));1660OR(32, MatR(colorOff), R(colorReg));1661}1662break;1663}16641665for (FixupBranch &fixup : skipStandardWrites_)1666SetJumpTarget(fixup);1667skipStandardWrites_.clear();16681669regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);1670regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);1671regCache_.Release(colorReg, RegCache::GEN_TEMP0);1672regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);1673if (maskReg != 
INVALID_REG)1674regCache_.Release(maskReg, RegCache::GEN_TEMP3);1675if (stencilReg != INVALID_REG) {1676regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);1677regCache_.ForceRelease(RegCache::GEN_STENCIL);1678}16791680return success;1681}16821683bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg maskReg) {1684Describe("LogicOp");1685X64Reg logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);1686if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {1687X64Reg idReg = GetPixelID();1688MOVZX(32, 8, logicOpReg, MDisp(idReg, offsetof(PixelFuncID, cached.logicOp)));1689UnlockPixelID(idReg);1690} else {1691_assert_(stackIDOffset_ != -1);1692MOV(PTRBITS, R(logicOpReg), MDisp(RSP, stackIDOffset_));1693MOVZX(32, 8, logicOpReg, MDisp(logicOpReg, offsetof(PixelFuncID, cached.logicOp)));1694}16951696X64Reg stencilReg = INVALID_REG;1697if (regCache_.Has(RegCache::GEN_STENCIL))1698stencilReg = regCache_.Find(RegCache::GEN_STENCIL);16991700// Should already be allocated.1701X64Reg colorOff = regCache_.Find(RegCache::GEN_COLOR_OFF);1702X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP5);17031704// We'll use these in several cases, so prepare.1705int bits = id.fbFormat == GE_FORMAT_8888 ? 
32 : 16;1706OpArg stencilMask, notStencilMask;1707switch (id.fbFormat) {1708case GE_FORMAT_565:1709stencilMask = Imm16(0);1710notStencilMask = Imm16(0xFFFF);1711break;1712case GE_FORMAT_5551:1713stencilMask = Imm16(0x8000);1714notStencilMask = Imm16(0x7FFF);1715break;1716case GE_FORMAT_4444:1717stencilMask = Imm16(0xF000);1718notStencilMask = Imm16(0x0FFF);1719break;1720case GE_FORMAT_8888:1721stencilMask = Imm32(0xFF000000);1722notStencilMask = Imm32(0x00FFFFFF);1723break;1724}17251726std::vector<FixupBranch> finishes;1727finishes.reserve(11);1728FixupBranch skipTable = J(true);1729const u8 *tableValues[16]{};17301731tableValues[GE_LOGIC_CLEAR] = GetCodePointer();1732if (stencilReg != INVALID_REG) {1733// If clearing and setting the stencil, that's easy - stencilReg has it.1734MOV(32, R(colorReg), R(stencilReg));1735finishes.push_back(J(true));1736} else if (maskReg != INVALID_REG) {1737// Just and out the unmasked bits (stencil already included in maskReg.)1738AND(bits, MatR(colorOff), R(maskReg));1739skipStandardWrites_.push_back(J(true));1740} else {1741// Otherwise, no mask, just AND the stencil bits to zero the rest.1742AND(bits, MatR(colorOff), stencilMask);1743skipStandardWrites_.push_back(J(true));1744}17451746tableValues[GE_LOGIC_AND] = GetCodePointer();1747if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {1748// Since we're ANDing, set the mask bits (AND will keep them as-is.)1749OR(32, R(colorReg), R(maskReg));1750OR(32, R(colorReg), R(stencilReg));17511752// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.1753NOT(32, R(maskReg));1754AND(bits, R(maskReg), stencilMask);1755OR(bits, MatR(colorOff), R(maskReg));1756} else if (stencilReg != INVALID_REG) {1757OR(32, R(colorReg), R(stencilReg));1758// No mask, so just or in the stencil bits so our AND can set any we want.1759OR(bits, MatR(colorOff), stencilMask);1760} else if (maskReg != INVALID_REG) {1761// Force in the mask (which includes all stencil bits) so 
both are kept as-is.1762OR(32, R(colorReg), R(maskReg));1763} else {1764// Force on the stencil bits so they AND and keep the existing value.1765if (stencilMask.GetImmValue() != 0)1766OR(bits, R(colorReg), stencilMask);1767}1768// Now the AND, which applies stencil and the logic op.1769AND(bits, MatR(colorOff), R(colorReg));1770skipStandardWrites_.push_back(J(true));17711772tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();1773// Reverse memory in a temp reg so we can apply the write mask easily.1774MOV(bits, R(temp1Reg), MatR(colorOff));1775if (cpu_info.bBMI1) {1776ANDN(32, colorReg, temp1Reg, R(colorReg));1777} else {1778NOT(32, R(temp1Reg));1779AND(32, R(colorReg), R(temp1Reg));1780}1781// Now add in the stencil bits (must be zero before, since we used AND.)1782if (stencilReg != INVALID_REG) {1783OR(32, R(colorReg), R(stencilReg));1784}1785finishes.push_back(J(true));17861787tableValues[GE_LOGIC_COPY] = GetCodePointer();1788// This is just a standard write, nothing complex.1789if (stencilReg != INVALID_REG) {1790OR(32, R(colorReg), R(stencilReg));1791}1792finishes.push_back(J(true));17931794tableValues[GE_LOGIC_AND_INVERTED] = GetCodePointer();1795if (stencilReg != INVALID_REG) {1796// Set the stencil bits, so they're zero when we invert.1797OR(bits, R(colorReg), stencilMask);1798NOT(32, R(colorReg));1799OR(32, R(colorReg), R(stencilReg));18001801if (maskReg != INVALID_REG) {1802// This way our AND will keep all those bits.1803OR(32, R(colorReg), R(maskReg));18041805// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.1806NOT(32, R(maskReg));1807AND(bits, R(maskReg), stencilMask);1808OR(bits, MatR(colorOff), R(maskReg));1809} else {1810// Force memory to take our stencil bits by ORing for the AND.1811OR(bits, MatR(colorOff), stencilMask);1812}1813} else if (maskReg != INVALID_REG) {1814NOT(32, R(colorReg));1815// This way our AND will keep all those bits.1816OR(32, R(colorReg), R(maskReg));1817} else {1818// Invert our 
color, but then add in stencil bits so the AND keeps them.1819NOT(32, R(colorReg));1820// We only do this for 8888 since the rest will have had 0 stencil bits (which turned to 1s.)1821if (id.FBFormat() == GE_FORMAT_8888)1822OR(bits, R(colorReg), stencilMask);1823}1824AND(bits, MatR(colorOff), R(colorReg));1825skipStandardWrites_.push_back(J(true));18261827tableValues[GE_LOGIC_NOOP] = GetCodePointer();1828if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {1829// Start by clearing masked bits from stencilReg.1830if (cpu_info.bBMI1) {1831ANDN(32, stencilReg, maskReg, R(stencilReg));1832} else {1833NOT(32, R(maskReg));1834AND(32, R(stencilReg), R(maskReg));1835NOT(32, R(maskReg));1836}18371838// Now mask out the stencil bits we're writing from memory.1839OR(bits, R(maskReg), notStencilMask);1840AND(bits, MatR(colorOff), R(maskReg));18411842// Now set those remaining stencil bits.1843OR(bits, MatR(colorOff), R(stencilReg));1844skipStandardWrites_.push_back(J(true));1845} else if (stencilReg != INVALID_REG) {1846// Clear and set just the stencil bits.1847AND(bits, MatR(colorOff), notStencilMask);1848OR(bits, MatR(colorOff), R(stencilReg));1849skipStandardWrites_.push_back(J(true));1850} else {1851Discard();1852}18531854tableValues[GE_LOGIC_XOR] = GetCodePointer();1855XOR(bits, R(colorReg), MatR(colorOff));1856if (stencilReg != INVALID_REG) {1857// Purge out the stencil bits from the XOR and copy ours in.1858AND(bits, R(colorReg), notStencilMask);1859OR(32, R(colorReg), R(stencilReg));1860} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {1861// XOR might've set some bits, and without a maskReg we won't clear them.1862AND(bits, R(colorReg), notStencilMask);1863}1864finishes.push_back(J(true));18651866tableValues[GE_LOGIC_OR] = GetCodePointer();1867if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {1868OR(32, R(colorReg), R(stencilReg));18691870// Clear the bits we should be masking out.1871if (cpu_info.bBMI1) {1872ANDN(32, colorReg, 
maskReg, R(colorReg));1873} else {1874NOT(32, R(maskReg));1875AND(32, R(colorReg), R(maskReg));1876NOT(32, R(maskReg));1877}18781879// Clear all the unmasked stencil bits, so we can set our own.1880OR(bits, R(maskReg), notStencilMask);1881AND(bits, MatR(colorOff), R(maskReg));1882} else if (stencilReg != INVALID_REG) {1883OR(32, R(colorReg), R(stencilReg));1884// AND out the stencil bits so we set our own.1885AND(bits, MatR(colorOff), notStencilMask);1886} else if (maskReg != INVALID_REG) {1887// Clear the bits we should be masking out.1888if (cpu_info.bBMI1) {1889ANDN(32, colorReg, maskReg, R(colorReg));1890} else {1891NOT(32, R(maskReg));1892AND(32, R(colorReg), R(maskReg));1893}1894} else if (id.FBFormat() == GE_FORMAT_8888) {1895// We only need to do this for 8888, the others already have 0 stencil.1896AND(bits, R(colorReg), notStencilMask);1897}1898// Now the OR, which applies stencil and the logic op itself.1899OR(bits, MatR(colorOff), R(colorReg));1900skipStandardWrites_.push_back(J(true));19011902tableValues[GE_LOGIC_NOR] = GetCodePointer();1903OR(bits, R(colorReg), MatR(colorOff));1904NOT(32, R(colorReg));1905if (stencilReg != INVALID_REG) {1906AND(bits, R(colorReg), notStencilMask);1907OR(32, R(colorReg), R(stencilReg));1908} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {1909// We need to clear the stencil bits since the standard write logic assumes they're zero.1910AND(bits, R(colorReg), notStencilMask);1911}1912finishes.push_back(J(true));19131914tableValues[GE_LOGIC_EQUIV] = GetCodePointer();1915XOR(bits, R(colorReg), MatR(colorOff));1916NOT(32, R(colorReg));1917if (stencilReg != INVALID_REG) {1918AND(bits, R(colorReg), notStencilMask);1919OR(32, R(colorReg), R(stencilReg));1920} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {1921// We need to clear the stencil bits since the standard write logic assumes they're zero.1922AND(bits, R(colorReg), 
notStencilMask);1923}1924finishes.push_back(J(true));19251926tableValues[GE_LOGIC_INVERTED] = GetCodePointer();1927// We just toss our color entirely.1928MOV(bits, R(colorReg), MatR(colorOff));1929NOT(32, R(colorReg));1930if (stencilReg != INVALID_REG) {1931AND(bits, R(colorReg), notStencilMask);1932OR(32, R(colorReg), R(stencilReg));1933} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {1934// We need to clear the stencil bits since the standard write logic assumes they're zero.1935AND(bits, R(colorReg), notStencilMask);1936}1937finishes.push_back(J(true));19381939tableValues[GE_LOGIC_OR_REVERSE] = GetCodePointer();1940// Reverse in a temp reg so we can mask properly.1941MOV(bits, R(temp1Reg), MatR(colorOff));1942NOT(32, R(temp1Reg));1943OR(32, R(colorReg), R(temp1Reg));1944if (stencilReg != INVALID_REG) {1945AND(bits, R(colorReg), notStencilMask);1946OR(32, R(colorReg), R(stencilReg));1947} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {1948// We need to clear the stencil bits since the standard write logic assumes they're zero.1949AND(bits, R(colorReg), notStencilMask);1950}1951finishes.push_back(J(true));19521953tableValues[GE_LOGIC_COPY_INVERTED] = GetCodePointer();1954NOT(32, R(colorReg));1955if (stencilReg != INVALID_REG) {1956AND(bits, R(colorReg), notStencilMask);1957OR(32, R(colorReg), R(stencilReg));1958} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {1959// We need to clear the stencil bits since the standard write logic assumes they're zero.1960AND(bits, R(colorReg), notStencilMask);1961}1962finishes.push_back(J(true));19631964tableValues[GE_LOGIC_OR_INVERTED] = GetCodePointer();1965NOT(32, R(colorReg));1966if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {1967AND(bits, R(colorReg), notStencilMask);1968OR(32, R(colorReg), R(stencilReg));19691970// Clear the bits we should be masking out.1971if (cpu_info.bBMI1) {1972ANDN(32, colorReg, maskReg, R(colorReg));1973} else {1974NOT(32, 
R(maskReg));1975AND(32, R(colorReg), R(maskReg));1976NOT(32, R(maskReg));1977}19781979// Clear all the unmasked stencil bits, so we can set our own.1980OR(bits, R(maskReg), notStencilMask);1981AND(bits, MatR(colorOff), R(maskReg));1982} else if (stencilReg != INVALID_REG) {1983AND(bits, R(colorReg), notStencilMask);1984OR(32, R(colorReg), R(stencilReg));1985// AND out the stencil bits so we set our own.1986AND(bits, MatR(colorOff), notStencilMask);1987} else if (maskReg != INVALID_REG) {1988// Clear the bits we should be masking out.1989NOT(32, R(maskReg));1990AND(32, R(colorReg), R(maskReg));1991} else if (id.FBFormat() == GE_FORMAT_8888) {1992// We only need to do this for 8888, the others already have 0 stencil.1993AND(bits, R(colorReg), notStencilMask);1994}1995OR(bits, MatR(colorOff), R(colorReg));1996skipStandardWrites_.push_back(J(true));19971998tableValues[GE_LOGIC_NAND] = GetCodePointer();1999AND(bits, R(temp1Reg), MatR(colorOff));2000NOT(32, R(colorReg));2001if (stencilReg != INVALID_REG) {2002AND(bits, R(colorReg), notStencilMask);2003OR(32, R(colorReg), R(stencilReg));2004} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {2005// We need to clear the stencil bits since the standard write logic assumes they're zero.2006AND(bits, R(colorReg), notStencilMask);2007}2008finishes.push_back(J(true));20092010tableValues[GE_LOGIC_SET] = GetCodePointer();2011if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {2012OR(32, R(colorReg), R(stencilReg));2013OR(bits, R(colorReg), notStencilMask);2014finishes.push_back(J(true));2015} else if (stencilReg != INVALID_REG) {2016// Set bits directly in stencilReg, and then put in memory.2017OR(bits, R(stencilReg), notStencilMask);2018MOV(bits, MatR(colorOff), R(stencilReg));2019skipStandardWrites_.push_back(J(true));2020} else if (maskReg != INVALID_REG) {2021// OR in the bits we're allowed to write (won't be any stencil.)2022NOT(32, R(maskReg));2023OR(bits, MatR(colorOff), 
R(maskReg));2024skipStandardWrites_.push_back(J(true));2025} else {2026OR(bits, MatR(colorOff), notStencilMask);2027skipStandardWrites_.push_back(J(true));2028}20292030const u8 *tablePtr = GetCodePointer();2031for (int i = 0; i < 16; ++i) {2032Write64((uintptr_t)tableValues[i]);2033}20342035SetJumpTarget(skipTable);2036LEA(64, temp1Reg, M(tablePtr));2037JMPptr(MComplex(temp1Reg, logicOpReg, 8, 0));20382039for (FixupBranch &fixup : finishes)2040SetJumpTarget(fixup);20412042regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);2043regCache_.Release(logicOpReg, RegCache::GEN_TEMP4);2044regCache_.Release(temp1Reg, RegCache::GEN_TEMP5);2045if (stencilReg != INVALID_REG)2046regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);20472048return true;2049}20502051bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {2052Describe("ConvertTo565");20532054if (cpu_info.bBMI2_fast) {2055MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));2056PEXT(32, colorReg, colorReg, R(temp1Reg));2057return true;2058}20592060// Assemble the 565 color, starting with R...2061MOV(32, R(temp1Reg), R(colorReg));2062SHR(32, R(temp1Reg), Imm8(3));2063AND(16, R(temp1Reg), Imm16(0x1F << 0));20642065// For G, move right 5 (because the top 6 are offset by 10.)2066MOV(32, R(temp2Reg), R(colorReg));2067SHR(32, R(temp2Reg), Imm8(5));2068AND(16, R(temp2Reg), Imm16(0x3F << 5));2069OR(32, R(temp1Reg), R(temp2Reg));20702071// And finally B, move right 8 (top 5 are offset by 19.)2072SHR(32, R(colorReg), Imm8(8));2073AND(16, R(colorReg), Imm16(0x1F << 11));2074OR(32, R(colorReg), R(temp1Reg));20752076return true;2077}20782079bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {2080Describe("ConvertTo5551");20812082if (cpu_info.bBMI2_fast) {2083MOV(32, R(temp1Reg), Imm32(keepAlpha ? 
0x80F8F8F8 : 0x00F8F8F8));2084PEXT(32, colorReg, colorReg, R(temp1Reg));2085return true;2086}20872088// This is R, pretty simple.2089MOV(32, R(temp1Reg), R(colorReg));2090SHR(32, R(temp1Reg), Imm8(3));2091AND(16, R(temp1Reg), Imm16(0x1F << 0));20922093// G moves right 6, to match the top 5 at 11.2094MOV(32, R(temp2Reg), R(colorReg));2095SHR(32, R(temp2Reg), Imm8(6));2096AND(16, R(temp2Reg), Imm16(0x1F << 5));2097OR(32, R(temp1Reg), R(temp2Reg));20982099if (keepAlpha) {2100// Grab A into tempReg2 before handling B.2101MOV(32, R(temp2Reg), R(colorReg));2102SHR(32, R(temp2Reg), Imm8(31));2103SHL(32, R(temp2Reg), Imm8(15));2104}21052106// B moves right 9, to match the top 5 at 19.2107SHR(32, R(colorReg), Imm8(9));2108AND(16, R(colorReg), Imm16(0x1F << 10));2109OR(32, R(colorReg), R(temp1Reg));21102111if (keepAlpha)2112OR(32, R(colorReg), R(temp2Reg));21132114return true;2115}21162117bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {2118Describe("ConvertTo4444");21192120if (cpu_info.bBMI2_fast) {2121MOV(32, R(temp1Reg), Imm32(keepAlpha ? 
0xF0F0F0F0 : 0x00F0F0F0));2122PEXT(32, colorReg, colorReg, R(temp1Reg));2123return true;2124}21252126// Shift and mask out R.2127MOV(32, R(temp1Reg), R(colorReg));2128SHR(32, R(temp1Reg), Imm8(4));2129AND(16, R(temp1Reg), Imm16(0xF << 0));21302131// Shift G into position and mask.2132MOV(32, R(temp2Reg), R(colorReg));2133SHR(32, R(temp2Reg), Imm8(8));2134AND(16, R(temp2Reg), Imm16(0xF << 4));2135OR(32, R(temp1Reg), R(temp2Reg));21362137if (keepAlpha) {2138// Grab A into tempReg2 before handling B.2139MOV(32, R(temp2Reg), R(colorReg));2140SHR(32, R(temp2Reg), Imm8(28));2141SHL(32, R(temp2Reg), Imm8(12));2142}21432144// B moves right 12, to match the top 4 at 20.2145SHR(32, R(colorReg), Imm8(12));2146AND(16, R(colorReg), Imm16(0xF << 8));2147OR(32, R(colorReg), R(temp1Reg));21482149if (keepAlpha)2150OR(32, R(colorReg), R(temp2Reg));21512152return true;2153}21542155bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {2156Describe("ConvertFrom565");21572158if (cpu_info.bBMI2_fast) {2159// Start off with the high bits.2160MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));2161PDEP(32, temp1Reg, colorReg, R(temp1Reg));21622163// Now grab the low bits (they end up packed.)2164MOV(32, R(temp2Reg), Imm32(0x0000E61C));2165PEXT(32, colorReg, colorReg, R(temp2Reg));2166// And spread them back out.2167MOV(32, R(temp2Reg), Imm32(0x00070307));2168PDEP(32, colorReg, colorReg, R(temp2Reg));21692170// Finally put the high bits in, we're done.2171OR(32, R(colorReg), R(temp1Reg));2172return true;2173}21742175// Filter out red only into temp1.2176MOV(32, R(temp1Reg), R(colorReg));2177AND(16, R(temp1Reg), Imm16(0x1F << 0));2178// Move it left to the top of the 8 bits.2179SHL(32, R(temp1Reg), Imm8(3));21802181// Now we bring in blue, since it's also 5 like red.2182MOV(32, R(temp2Reg), R(colorReg));2183AND(16, R(temp2Reg), Imm16(0x1F << 11));2184// Shift blue into place, 8 left (at 19), and merge back to 
temp1.2185SHL(32, R(temp2Reg), Imm8(8));2186OR(32, R(temp1Reg), R(temp2Reg));21872188// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.2189OR(32, R(temp2Reg), R(temp1Reg));2190SHL(32, R(temp2Reg), Imm8(1));21912192// We go to green last because it's the different one. Put it in place.2193AND(16, R(colorReg), Imm16(0x3F << 5));2194SHL(32, R(colorReg), Imm8(5));2195// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)2196OR(32, R(temp2Reg), R(colorReg));2197OR(32, R(colorReg), R(temp1Reg));21982199// Now shift and mask temp2 for swizzle.2200SHR(32, R(temp2Reg), Imm8(6));2201AND(32, R(temp2Reg), Imm32(0x00070307));2202// And then OR that in too. We're done.2203OR(32, R(colorReg), R(temp2Reg));22042205return true;2206}22072208bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {2209Describe("ConvertFrom5551");22102211if (cpu_info.bBMI2_fast) {2212// First, grab the top bits.2213MOV(32, R(temp1Reg), Imm32(keepAlpha ? 
0x01F8F8F8 : 0x00F8F8F8));2214PDEP(32, colorReg, colorReg, R(temp1Reg));22152216// Now make the swizzle bits.2217MOV(32, R(temp2Reg), R(colorReg));2218SHR(32, R(temp2Reg), Imm8(5));2219AND(32, R(temp2Reg), Imm32(0x00070707));22202221if (keepAlpha) {2222// Sign extend the alpha bit to 8 bits.2223SHL(32, R(colorReg), Imm8(7));2224SAR(32, R(colorReg), Imm8(7));2225}22262227OR(32, R(colorReg), R(temp2Reg));2228return true;2229}22302231// Filter out red only into temp1.2232MOV(32, R(temp1Reg), R(colorReg));2233AND(16, R(temp1Reg), Imm16(0x1F << 0));2234// Move it left to the top of the 8 bits.2235SHL(32, R(temp1Reg), Imm8(3));22362237// Add in green and shift into place (top bits.)2238MOV(32, R(temp2Reg), R(colorReg));2239AND(16, R(temp2Reg), Imm16(0x1F << 5));2240SHL(32, R(temp2Reg), Imm8(6));2241OR(32, R(temp1Reg), R(temp2Reg));22422243if (keepAlpha) {2244// Now take blue and alpha together.2245AND(16, R(colorReg), Imm16(0x8000 | (0x1F << 10)));2246// We move all the way left, then sign extend right to expand alpha.2247SHL(32, R(colorReg), Imm8(16));2248SAR(32, R(colorReg), Imm8(7));2249} else {2250AND(16, R(colorReg), Imm16(0x1F << 10));2251SHL(32, R(colorReg), Imm8(9));2252}22532254// Combine both together, we still need to swizzle.2255OR(32, R(colorReg), R(temp1Reg));2256OR(32, R(temp1Reg), R(colorReg));2257// Now for swizzle, we'll mask carefully to avoid overflow.2258SHR(32, R(temp1Reg), Imm8(5));2259AND(32, R(temp1Reg), Imm32(0x00070707));22602261// Then finally merge in the swizzle bits.2262OR(32, R(colorReg), R(temp1Reg));2263return true;2264}22652266bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {2267Describe("ConvertFrom4444");22682269if (cpu_info.bBMI2_fast) {2270// First, spread the bits out with spaces.2271MOV(32, R(temp1Reg), Imm32(keepAlpha ? 
0xF0F0F0F0 : 0x00F0F0F0));2272PDEP(32, colorReg, colorReg, R(temp1Reg));22732274// Now swizzle the low bits in.2275MOV(32, R(temp1Reg), R(colorReg));2276SHR(32, R(temp1Reg), Imm8(4));2277OR(32, R(colorReg), R(temp1Reg));2278return true;2279}22802281// Move red into position within temp1.2282MOV(32, R(temp1Reg), R(colorReg));2283AND(16, R(temp1Reg), Imm16(0xF << 0));2284SHL(32, R(temp1Reg), Imm8(4));22852286// Green is just as simple.2287MOV(32, R(temp2Reg), R(colorReg));2288AND(16, R(temp2Reg), Imm16(0xF << 4));2289SHL(32, R(temp2Reg), Imm8(8));2290OR(32, R(temp1Reg), R(temp2Reg));22912292// Blue isn't last this time, but it's next.2293MOV(32, R(temp2Reg), R(colorReg));2294AND(16, R(temp2Reg), Imm16(0xF << 8));2295SHL(32, R(temp2Reg), Imm8(12));2296OR(32, R(temp1Reg), R(temp2Reg));22972298if (keepAlpha) {2299// Last but not least, alpha.2300AND(16, R(colorReg), Imm16(0xF << 12));2301SHL(32, R(colorReg), Imm8(16));2302OR(32, R(colorReg), R(temp1Reg));23032304// Copy to temp1 again for swizzling.2305OR(32, R(temp1Reg), R(colorReg));2306} else {2307// Overwrite colorReg (we need temp1 as a copy anyway.)2308MOV(32, R(colorReg), R(temp1Reg));2309}23102311// Masking isn't necessary here since everything is 4 wide.2312SHR(32, R(temp1Reg), Imm8(4));2313OR(32, R(colorReg), R(temp1Reg));2314return true;2315}23162317};23182319#endif232023212322