CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Software/Rasterizer.cpp
Views: 1401
// Copyright (c) 2013- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#include <algorithm>19#include <cmath>2021#include "Common/Common.h"22#include "Common/CPUDetect.h"23#include "Common/Data/Convert/ColorConv.h"24#include "Common/Profiler/Profiler.h"25#include "Common/StringUtils.h"26#include "Core/Config.h"27#include "Core/Debugger/MemBlockInfo.h"28#include "Core/MemMap.h"29#include "GPU/GPUState.h"3031#include "GPU/Common/TextureDecoder.h"32#include "GPU/Software/BinManager.h"33#include "GPU/Software/DrawPixel.h"34#include "GPU/Software/Rasterizer.h"35#include "GPU/Software/Sampler.h"36#include "GPU/Software/SoftGpu.h"37#include "GPU/Software/TransformUnit.h"3839#if defined(_M_SSE)40#include <emmintrin.h>41#include <smmintrin.h>42#endif4344namespace Rasterizer {4546// Only OK on x64 where our stack is aligned47#if defined(_M_SSE) && !PPSSPP_ARCH(X86)48static inline __m128 InterpolateF(const __m128 &c0, const __m128 &c1, const __m128 &c2, int w0, int w1, int w2, float wsum) {49__m128 v = _mm_mul_ps(c0, _mm_cvtepi32_ps(_mm_set1_epi32(w0)));50v = _mm_add_ps(v, _mm_mul_ps(c1, _mm_cvtepi32_ps(_mm_set1_epi32(w1))));51v = _mm_add_ps(v, _mm_mul_ps(c2, _mm_cvtepi32_ps(_mm_set1_epi32(w2))));52return _mm_mul_ps(v, _mm_set_ps1(wsum));53}5455static inline __m128i InterpolateI(const __m128i &c0, const __m128i &c1, const __m128i &c2, int w0, int w1, int w2, float wsum) {56return _mm_cvtps_epi32(InterpolateF(_mm_cvtepi32_ps(c0), _mm_cvtepi32_ps(c1), _mm_cvtepi32_ps(c2), w0, w1, w2, wsum));57}58#elif PPSSPP_ARCH(ARM64_NEON)59static inline float32x4_t InterpolateF(const float32x4_t &c0, const float32x4_t &c1, const float32x4_t &c2, int w0, int w1, int w2, float wsum) {60float32x4_t v = vmulq_f32(c0, vcvtq_f32_s32(vdupq_n_s32(w0)));61v = vaddq_f32(v, vmulq_f32(c1, vcvtq_f32_s32(vdupq_n_s32(w1))));62v = vaddq_f32(v, vmulq_f32(c2, vcvtq_f32_s32(vdupq_n_s32(w2))));63return vmulq_f32(v, vdupq_n_f32(wsum));64}6566static inline int32x4_t InterpolateI(const int32x4_t &c0, const int32x4_t &c1, const int32x4_t &c2, int w0, int w1, int w2, float wsum) {67return vcvtq_s32_f32(InterpolateF(vcvtq_f32_s32(c0), vcvtq_f32_s32(c1), vcvtq_f32_s32(c2), w0, w1, w2, wsum));68}69#endif7071// NOTE: When not casting color0 and color1 to float vectors, this code suffers from severe overflow issues.72// Not sure if that should be regarded as a bug or if casting to float is a valid fix.7374static inline Vec4<int> Interpolate(const Vec4<int> &c0, const Vec4<int> &c1, const Vec4<int> &c2, int w0, int w1, int w2, float wsum) {75#if (defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)) && !PPSSPP_ARCH(X86)76return Vec4<int>(InterpolateI(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum));77#else78return ((c0.Cast<float>() * w0 + c1.Cast<float>() * w1 + c2.Cast<float>() * w2) * wsum).Cast<int>();79#endif80}8182static inline Vec3<int> Interpolate(const Vec3<int> &c0, const Vec3<int> &c1, const Vec3<int> &c2, int w0, int w1, int w2, float wsum) {83#if (defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)) && !PPSSPP_ARCH(X86)84return Vec3<int>(InterpolateI(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum));85#else86return ((c0.Cast<float>() * w0 + c1.Cast<float>() * w1 + c2.Cast<float>() * w2) * wsum).Cast<int>();87#endif88}8990static inline Vec4<float> Interpolate(const float &c0, const float &c1, const float &c2, const Vec4<float> &w0, const Vec4<float> &w1, const Vec4<float> &w2, const Vec4<float> &wsum_recip) {91#if defined(_M_SSE) && !PPSSPP_ARCH(X86)92__m128 v = _mm_mul_ps(w0.vec, _mm_set1_ps(c0));93v = _mm_add_ps(v, _mm_mul_ps(w1.vec, _mm_set1_ps(c1)));94v = _mm_add_ps(v, _mm_mul_ps(w2.vec, _mm_set1_ps(c2)));95return _mm_mul_ps(v, wsum_recip.vec);96#elif PPSSPP_ARCH(ARM64_NEON)97float32x4_t v = vmulq_f32(w0.vec, vdupq_n_f32(c0));98v = vaddq_f32(v, vmulq_f32(w1.vec, vdupq_n_f32(c1)));99v = vaddq_f32(v, vmulq_f32(w2.vec, vdupq_n_f32(c2)));100return vmulq_f32(v, wsum_recip.vec);101#else102return (w0 * c0 + w1 * c1 + w2 * c2) * wsum_recip;103#endif104}105106static inline Vec4<float> Interpolate(const float &c0, const float &c1, const float &c2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip) {107return Interpolate(c0, c1, c2, w0.Cast<float>(), w1.Cast<float>(), w2.Cast<float>(), wsum_recip);108}109110void ComputeRasterizerState(RasterizerState *state, BinManager *binner) {111ComputePixelFuncID(&state->pixelID);112state->drawPixel = Rasterizer::GetSingleFunc(state->pixelID, binner);113114state->enableTextures = gstate.isTextureMapEnabled() && !state->pixelID.clearMode;115if (state->enableTextures) {116ComputeSamplerID(&state->samplerID);117state->linear = Sampler::GetLinearFunc(state->samplerID, binner);118state->nearest = Sampler::GetNearestFunc(state->samplerID, binner);119120// Since the definitions are the same, just force this setting using the func pointer.121if (g_Config.iTexFiltering == TEX_FILTER_FORCE_LINEAR) {122state->nearest = state->linear;123} else if (g_Config.iTexFiltering == TEX_FILTER_FORCE_NEAREST) {124state->linear = state->nearest;125}126127state->maxTexLevel = state->samplerID.hasAnyMips ? gstate.getTextureMaxLevel() : 0;128129GETextureFormat texfmt = state->samplerID.TexFmt();130for (uint8_t i = 0; i <= state->maxTexLevel; i++) {131u32 texaddr = gstate.getTextureAddress(i);132state->texaddr[i] = texaddr;133state->texbufw[i] = (uint16_t)GetTextureBufw(i, texaddr, texfmt);134if (Memory::IsValidAddress(texaddr))135state->texptr[i] = Memory::GetPointerUnchecked(texaddr);136else137state->texptr[i] = nullptr;138}139140state->textureLodSlope = gstate.getTextureLodSlope();141state->texLevelMode = gstate.getTexLevelMode();142state->texLevelOffset = (int8_t)gstate.getTexLevelOffset16();143state->mipFilt = gstate.isMipmapFilteringEnabled();144state->minFilt = gstate.isMinifyFilteringEnabled();145state->magFilt = gstate.isMagnifyFilteringEnabled();146state->textureProj = gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_MATRIX;147if (state->textureProj) {148// We may be able to optimize this off. This is actually kinda common.149const bool qZeroST = gstate.tgenMatrix[2] == 0.0f && gstate.tgenMatrix[5] == 0.0f;150const bool qZeroQ = gstate.tgenMatrix[8] == 0.0f;151152// Two common cases: the source q factor is zero, OR source is UV.153const bool qFactorZero = gstate.getUVProjMode() == GE_PROJMAP_UV;154if (qZeroST && (qZeroQ || qFactorZero) && gstate.tgenMatrix[11] == 1.0f) {155state->textureProj = false;156}157}158}159160state->shadeGouraud = !gstate.isModeClear() && gstate.getShadeMode() == GE_SHADE_GOURAUD;161state->throughMode = gstate.isModeThrough();162state->antialiasLines = gstate.isAntiAliasEnabled();163164#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)165DisplayList currentList{};166if (gpuDebug)167gpuDebug->GetCurrentDisplayList(currentList);168state->listPC = currentList.pc;169#endif170}171172static inline void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0, bool useColor) {173if (useColor) {174if ((v0.color0 & 0x00FFFFFF) != 0x00FFFFFF)175state->flags |= RasterizerStateFlags::VERTEX_NON_FULL_WHITE;176uint8_t alpha = v0.color0 >> 24;177if (alpha != 0)178state->flags |= RasterizerStateFlags::VERTEX_ALPHA_NON_ZERO;179if (alpha != 0xFF)180state->flags |= RasterizerStateFlags::VERTEX_ALPHA_NON_FULL;181}182if (!(v0.fogdepth >= 1.0f))183state->flags |= RasterizerStateFlags::VERTEX_HAS_FOG;184}185186void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0) {187CalculateRasterStateFlags(state, v0, true);188}189190void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0, const VertexData &v1, bool forceFlat) {191CalculateRasterStateFlags(state, v0, !forceFlat && state->shadeGouraud);192CalculateRasterStateFlags(state, v1, true);193}194195void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0, const VertexData &v1, const VertexData &v2) {196CalculateRasterStateFlags(state, v0, state->shadeGouraud);197CalculateRasterStateFlags(state, v1, state->shadeGouraud);198CalculateRasterStateFlags(state, v2, true);199}200201static inline int OptimizePixelIDFlags(const RasterizerStateFlags &flags) {202return (int)flags & (int)RasterizerStateFlags::OPTIMIZED_PIXELID;203}204205static inline int OptimizeSamplerIDFlags(const RasterizerStateFlags &flags) {206return (int)flags & (int)RasterizerStateFlags::OPTIMIZED_SAMPLERID;207}208209static inline int OptimizeAllFlags(const RasterizerStateFlags &flags) {210return OptimizePixelIDFlags(flags) | OptimizeSamplerIDFlags(flags);211}212213static inline RasterizerStateFlags ClearFlags(const RasterizerStateFlags &flags, const RasterizerStateFlags &mask) {214int clearBits = (int)flags & (int)mask;215return (RasterizerStateFlags)((int)flags & ~clearBits);216}217218static inline RasterizerStateFlags ReplacePixelIDFlags(const RasterizerStateFlags &flags, const RasterizerStateFlags &replace) {219RasterizerStateFlags updated = ClearFlags(flags, RasterizerStateFlags::OPTIMIZED_PIXELID);220return updated | (RasterizerStateFlags)OptimizePixelIDFlags(replace);221}222223static inline RasterizerStateFlags ReplaceSamplerIDFlags(const RasterizerStateFlags &flags, const RasterizerStateFlags &replace) {224RasterizerStateFlags updated = ClearFlags(flags, RasterizerStateFlags::OPTIMIZED_SAMPLERID);225return updated | (RasterizerStateFlags)OptimizeSamplerIDFlags(replace);226}227228static bool CheckClutAlphaFull(RasterizerState *state) {229// We only need to check it once.230if (state->flags & RasterizerStateFlags::CLUT_ALPHA_CHECKED)231return !(state->flags & RasterizerStateFlags::CLUT_ALPHA_NON_FULL);232// For now, let's keep things simple.233const SamplerID &samplerID = state->samplerID;234if (samplerID.hasClutOffset || !samplerID.useSharedClut)235return false;236237uint32_t count = samplerID.TexFmt() == GE_TFMT_CLUT4 ? 16 : 256;238if (samplerID.hasClutMask)239count = std::min(count, ((samplerID.cached.clutFormat >> 8) & 0xFF) + 1);240241u32 alphaSum = 0xFFFFFFFF;242if (samplerID.ClutFmt() == GE_CMODE_32BIT_ABGR8888) {243CheckMask32((const uint32_t *)samplerID.cached.clut, count, &alphaSum);244} else {245CheckMask16((const uint16_t *)samplerID.cached.clut, count, &alphaSum);246}247248bool onlyFull = true;249switch (samplerID.ClutFmt()) {250case GE_CMODE_16BIT_BGR5650:251break;252253case GE_CMODE_16BIT_ABGR5551:254onlyFull = (alphaSum & 0x8000) != 0;255break;256257case GE_CMODE_16BIT_ABGR4444:258onlyFull = (alphaSum & 0xF000) == 0xF000;259break;260261case GE_CMODE_32BIT_ABGR8888:262onlyFull = (alphaSum & 0xFF000000) == 0xFF000000;263break;264}265266// Might just be different patterns, but if alphaSum != 0, it can't contain zero.267if (alphaSum != 0)268state->flags |= RasterizerStateFlags::CLUT_ALPHA_NON_ZERO;269if (!onlyFull)270state->flags |= RasterizerStateFlags::CLUT_ALPHA_NON_FULL;271state->flags |= RasterizerStateFlags::CLUT_ALPHA_CHECKED;272273return onlyFull;274}275276static RasterizerStateFlags DetectStateOptimizations(RasterizerState *state) {277// Note: all optimizations must be undoable.278RasterizerStateFlags optimize = RasterizerStateFlags::NONE;279auto &pixelID = state->pixelID;280auto &samplerID = state->samplerID;281282bool alphaZero = !(state->flags & RasterizerStateFlags::VERTEX_ALPHA_NON_ZERO);283bool alphaFull = !(state->flags & RasterizerStateFlags::VERTEX_ALPHA_NON_FULL);284bool needTextureAlpha = state->enableTextures && samplerID.useTextureAlpha;285286if (!pixelID.clearMode) {287auto &cached = pixelID.cached;288289bool alphaBlend = pixelID.alphaBlend || (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_OFF);290if (needTextureAlpha && alphaBlend && alphaFull) {291bool usesClut = (samplerID.texfmt & 4) != 0;292if (usesClut && CheckClutAlphaFull(state))293needTextureAlpha = false;294}295296if (alphaBlend && !needTextureAlpha) {297PixelBlendFactor src = pixelID.AlphaBlendSrc();298PixelBlendFactor dst = pixelID.AlphaBlendDst();299if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_SRC)300src = PixelBlendFactor::SRCALPHA;301if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_DST)302dst = PixelBlendFactor::INVSRCALPHA;303304// Okay, we may be able to convert this to a fixed value.305if (alphaZero || alphaFull) {306// If it was already set and we still can, set it again.307if (src == PixelBlendFactor::SRCALPHA)308optimize |= RasterizerStateFlags::OPTIMIZED_BLEND_SRC;309if (dst == PixelBlendFactor::INVSRCALPHA)310optimize |= RasterizerStateFlags::OPTIMIZED_BLEND_DST;311}312if (alphaFull && (src == PixelBlendFactor::SRCALPHA || src == PixelBlendFactor::ONE) && (dst == PixelBlendFactor::INVSRCALPHA || dst == PixelBlendFactor::ZERO)) {313optimize |= RasterizerStateFlags::OPTIMIZED_BLEND_OFF;314}315}316317if (alphaBlend && (needTextureAlpha || !alphaFull)) {318// Okay, we're blending, and we need to. Are we alpha testing?319GEComparison alphaTestFunc = pixelID.AlphaTestFunc();320if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE)321alphaTestFunc = GE_COMP_NOTEQUAL;322if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT)323alphaTestFunc = GE_COMP_GREATER;324if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON)325alphaTestFunc = GE_COMP_ALWAYS;326327PixelBlendFactor src = pixelID.AlphaBlendSrc();328PixelBlendFactor dst = pixelID.AlphaBlendDst();329if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_SRC)330src = PixelBlendFactor::SRCALPHA;331if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_DST)332dst = PixelBlendFactor::INVSRCALPHA;333334if (alphaTestFunc == GE_COMP_ALWAYS && src == PixelBlendFactor::SRCALPHA && dst == PixelBlendFactor::INVSRCALPHA) {335bool usesClut = (samplerID.texfmt & 4) != 0;336bool couldHaveZeroTexAlpha = true;337if (usesClut && CheckClutAlphaFull(state))338couldHaveZeroTexAlpha = false;339if (state->flags & RasterizerStateFlags::CLUT_ALPHA_NON_ZERO)340couldHaveZeroTexAlpha = false;341342// Blending is expensive, since we read the target. Force alpha testing on.343if (!pixelID.depthWrite && !pixelID.stencilTest && couldHaveZeroTexAlpha)344optimize |= RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON;345}346}347348bool applyFog = pixelID.applyFog || (state->flags & RasterizerStateFlags::OPTIMIZED_FOG_OFF);349if (applyFog) {350bool hasFog = state->flags & RasterizerStateFlags::VERTEX_HAS_FOG;351if (!hasFog)352optimize |= RasterizerStateFlags::OPTIMIZED_FOG_OFF;353}354}355356if (state->enableTextures) {357bool colorFull = !(state->flags & RasterizerStateFlags::VERTEX_NON_FULL_WHITE);358if (colorFull && (!needTextureAlpha || alphaFull)) {359// Modulate is common, sometimes even with a fixed color. Replace is cheaper.360GETexFunc texFunc = samplerID.TexFunc();361if (state->flags & RasterizerStateFlags::OPTIMIZED_TEXREPLACE)362texFunc = GE_TEXFUNC_MODULATE;363364if (texFunc == GE_TEXFUNC_MODULATE)365optimize |= RasterizerStateFlags::OPTIMIZED_TEXREPLACE;366}367368bool usesClut = (samplerID.texfmt & 4) != 0;369if (usesClut && alphaFull && samplerID.useTextureAlpha) {370GEComparison alphaTestFunc = pixelID.AlphaTestFunc();371// We optimize > 0 to != 0, so this is especially common.372if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE)373alphaTestFunc = GE_COMP_NOTEQUAL;374// > 16, 8, or similar are also very common.375if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT)376alphaTestFunc = GE_COMP_GREATER;377if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON)378alphaTestFunc = GE_COMP_ALWAYS;379380bool alphaTest = (alphaTestFunc == GE_COMP_NOTEQUAL || alphaTestFunc == GE_COMP_GREATER) && pixelID.alphaTestRef < 0xFF && !state->pixelID.hasAlphaTestMask;381if (alphaTest) {382bool canSkipAlphaTest = CheckClutAlphaFull(state);383if ((state->flags & RasterizerStateFlags::CLUT_ALPHA_NON_ZERO) && pixelID.alphaTestRef == 0)384canSkipAlphaTest = true;385if (canSkipAlphaTest)386optimize |= alphaTestFunc == GE_COMP_NOTEQUAL ? RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE : RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT;387}388}389}390391return optimize;392}393394static bool ApplyStateOptimizations(RasterizerState *state, const RasterizerStateFlags &optimize) {395bool changed = false;396397// Check if we can compile the new funcs before replacing.398if (OptimizePixelIDFlags(state->flags) != OptimizePixelIDFlags(optimize)) {399bool canFull = !(state->flags & RasterizerStateFlags::VERTEX_ALPHA_NON_FULL);400401PixelFuncID pixelID = state->pixelID;402if (optimize & RasterizerStateFlags::OPTIMIZED_BLEND_OFF)403pixelID.alphaBlend = false;404else if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_OFF)405pixelID.alphaBlend = true;406if (optimize & RasterizerStateFlags::OPTIMIZED_BLEND_SRC)407pixelID.alphaBlendSrc = (uint8_t)(canFull ? PixelBlendFactor::ONE : PixelBlendFactor::ZERO);408else if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_SRC)409pixelID.alphaBlendSrc = (uint8_t)PixelBlendFactor::SRCALPHA;410if (optimize & RasterizerStateFlags::OPTIMIZED_BLEND_DST)411pixelID.alphaBlendDst = (uint8_t)(canFull ? PixelBlendFactor::ZERO : PixelBlendFactor::ONE);412else if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_DST)413pixelID.alphaBlendDst = (uint8_t)PixelBlendFactor::INVSRCALPHA;414if (optimize & RasterizerStateFlags::OPTIMIZED_FOG_OFF)415pixelID.applyFog = false;416else if (state->flags & RasterizerStateFlags::OPTIMIZED_FOG_OFF)417pixelID.applyFog = true;418if (optimize & (RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE | RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT))419pixelID.alphaTestFunc = GE_COMP_ALWAYS;420else if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE)421pixelID.alphaTestFunc = GE_COMP_NOTEQUAL;422else if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT)423pixelID.alphaTestFunc = GE_COMP_GREATER;424else if (optimize & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON) {425pixelID.alphaTestFunc = GE_COMP_NOTEQUAL;426pixelID.alphaTestRef = 0;427pixelID.hasAlphaTestMask = false;428} else if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON) {429pixelID.alphaTestFunc = GE_COMP_ALWAYS;430}431432SingleFunc drawPixel = Rasterizer::GetSingleFunc(pixelID, nullptr);433// Can't compile during runtime. This failing is a bit of a problem when undoing...434if (drawPixel) {435state->drawPixel = drawPixel;436memcpy(&state->pixelID, &pixelID, sizeof(PixelFuncID));437state->flags = ReplacePixelIDFlags(state->flags, optimize) | RasterizerStateFlags::OPTIMIZED;438changed = true;439}440}441442if (OptimizeSamplerIDFlags(state->flags) != OptimizeSamplerIDFlags(optimize)) {443SamplerID samplerID = state->samplerID;444if (optimize & RasterizerStateFlags::OPTIMIZED_TEXREPLACE)445samplerID.texFunc = (uint8_t)GE_TEXFUNC_REPLACE;446else if (state->flags & RasterizerStateFlags::OPTIMIZED_TEXREPLACE)447samplerID.texFunc = (uint8_t)GE_TEXFUNC_MODULATE;448449Sampler::LinearFunc linear = Sampler::GetLinearFunc(samplerID, nullptr);450Sampler::LinearFunc nearest = Sampler::GetNearestFunc(samplerID, nullptr);451// Can't compile during runtime. This failing is a bit of a problem when undoing...452if (linear && nearest) {453// Since the definitions are the same, just force this setting using the func pointer.454if (g_Config.iTexFiltering == TEX_FILTER_FORCE_LINEAR) {455state->nearest = linear;456state->linear = linear;457} else if (g_Config.iTexFiltering == TEX_FILTER_FORCE_NEAREST) {458state->nearest = nearest;459state->linear = nearest;460} else {461state->nearest = nearest;462state->linear = linear;463}464memcpy(&state->samplerID, &samplerID, sizeof(SamplerID));465state->flags = ReplaceSamplerIDFlags(state->flags, optimize) | RasterizerStateFlags::OPTIMIZED;466changed = true;467}468}469470state->lastFlags = state->flags;471return changed;472}473474bool OptimizeRasterState(RasterizerState *state) {475if (state->flags == state->lastFlags)476return false;477478RasterizerStateFlags optimize = DetectStateOptimizations(state);479480// If it was optimized before, just revert and don't churn.481if ((state->flags & RasterizerStateFlags::OPTIMIZED) && OptimizeAllFlags(state->flags) != OptimizeAllFlags(optimize)) {482optimize = RasterizerStateFlags::NONE;483} else if (optimize == RasterizerStateFlags::NONE && !(state->flags & RasterizerStateFlags::OPTIMIZED)) {484state->lastFlags = state->flags;485return false;486}487488return ApplyStateOptimizations(state, optimize);489}490491RasterizerState OptimizeFlatRasterizerState(const RasterizerState &origState, const VertexData &v1) {492uint8_t alpha = v1.color0 >> 24;493RasterizerState state = origState;494495// Sometimes, a particular draw can do better than the overall state.496state.flags = ClearFlags(state.flags, RasterizerStateFlags::VERTEX_FLAT_RESET);497CalculateRasterStateFlags(&state, v1, true);498499RasterizerStateFlags optimize = DetectStateOptimizations(&state);500if (OptimizeAllFlags(state.flags) != OptimizeAllFlags(optimize)) {501ApplyStateOptimizations(&state, optimize);502return state;503}504505return origState;506}507508static inline u8 ClampFogDepth(float fogdepth) {509union FloatBits {510float f;511u32 u;512};513FloatBits f;514f.f = fogdepth;515516u32 exp = f.u >> 23;517if ((f.u & 0x80000000) != 0 || exp <= 126 - 8)518return 0;519if (exp > 126)520return 255;521522u32 mantissa = (f.u & 0x007FFFFF) | 0x00800000;523return mantissa >> (16 + 126 - exp);524}525526static inline void GetTextureCoordinates(const VertexData& v0, const VertexData& v1, const float p, float &s, float &t) {527// Note that for environment mapping, texture coordinates have been calculated during lighting528float q0 = 1.f / v0.clipw;529float q1 = 1.f / v1.clipw;530float wq0 = p * q0;531float wq1 = (1.0f - p) * q1;532533float q_recip = 1.0f / (wq0 + wq1);534s = (v0.texturecoords.s() * wq0 + v1.texturecoords.s() * wq1) * q_recip;535t = (v0.texturecoords.t() * wq0 + v1.texturecoords.t() * wq1) * q_recip;536}537538static inline void GetTextureCoordinatesProj(const VertexData& v0, const VertexData& v1, const float p, float &s, float &t) {539// This is for texture matrix projection.540float q0 = 1.f / v0.clipw;541float q1 = 1.f / v1.clipw;542float wq0 = p * q0;543float wq1 = (1.0f - p) * q1;544545float q_recip = 1.0f / (v0.texturecoords.q() * wq0 + v1.texturecoords.q() * wq1);546547s = (v0.texturecoords.s() * wq0 + v1.texturecoords.s() * wq1) * q_recip;548t = (v0.texturecoords.t() * wq0 + v1.texturecoords.t() * wq1) * q_recip;549}550551static inline void GetTextureCoordinates(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {552// Note that for environment mapping, texture coordinates have been calculated during lighting.553float q0 = 1.f / v0.clipw;554float q1 = 1.f / v1.clipw;555float q2 = 1.f / v2.clipw;556Vec4<float> wq0 = w0.Cast<float>() * q0;557Vec4<float> wq1 = w1.Cast<float>() * q1;558Vec4<float> wq2 = w2.Cast<float>() * q2;559560Vec4<float> q_recip = (wq0 + wq1 + wq2).Reciprocal();561s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), wq0, wq1, wq2, q_recip);562t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), wq0, wq1, wq2, q_recip);563}564565static inline void GetTextureCoordinatesProj(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {566// This is for texture matrix projection.567float q0 = 1.f / v0.clipw;568float q1 = 1.f / v1.clipw;569float q2 = 1.f / v2.clipw;570Vec4<float> wq0 = w0.Cast<float>() * q0;571Vec4<float> wq1 = w1.Cast<float>() * q1;572Vec4<float> wq2 = w2.Cast<float>() * q2;573574// Here, Interpolate() is a bit suboptimal, since575// there's no need to multiply by 1.0f.576Vec4<float> q_recip = Interpolate(v0.texturecoords.q(), v1.texturecoords.q(), v2.texturecoords.q(), wq0, wq1, wq2, Vec4<float>::AssignToAll(1.0f)).Reciprocal();577578s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), wq0, wq1, wq2, q_recip);579t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), wq0, wq1, wq2, q_recip);580}581582static inline void SetPixelDepth(int x, int y, int stride, u16 value) {583depthbuf.Set16(x, y, stride, value);584}585586static inline bool IsRightSideOrFlatBottomLine(const Vec2<int>& vertex, const Vec2<int>& line1, const Vec2<int>& line2)587{588if (line1.y == line2.y) {589// just check if vertex is above us => bottom line parallel to x-axis590return vertex.y < line1.y;591} else {592// check if vertex is on our left => right side593return vertex.x < line1.x + (line2.x - line1.x) * (vertex.y - line1.y) / (line2.y - line1.y);594}595}596597static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(float s, float t, Vec4IntArg prim_color, int texlevel, int frac_texlevel, bool bilinear, const RasterizerState &state) {598const u8 **tptr0 = const_cast<const u8 **>(&state.texptr[texlevel]);599const uint16_t *bufw0 = &state.texbufw[texlevel];600601if (!bilinear) {602return state.nearest(s, t, prim_color, tptr0, bufw0, texlevel, frac_texlevel, state.samplerID);603}604return state.linear(s, t, prim_color, tptr0, bufw0, texlevel, frac_texlevel, state.samplerID);605}606607static inline Vec4IntResult SOFTRAST_CALL ApplyTexturingSingle(float s, float t, Vec4IntArg prim_color, int texlevel, int frac_texlevel, bool bilinear, const RasterizerState &state) {608return ApplyTexturing(s, t, prim_color, texlevel, frac_texlevel, bilinear, state);609}610611// Produces a signed 1.27.4 value.612static int TexLog2(float delta) {613union FloatBits {614float f;615u32 u;616};617FloatBits f;618f.f = delta;619// Use the exponent as the tex level, and the top mantissa bits for a frac.620// We can't support more than 4 bits of frac, so truncate.621int useful = (f.u >> 19) & 0x0FFF;622// Now offset so the exponent aligns with log2f (exp=127 is 0.)623return useful - 127 * 16;624}625626static inline void CalculateSamplingParams(const float ds, const float dt, float w, const RasterizerState &state, int &level, int &levelFrac, bool &filt) {627const int width = 1 << state.samplerID.width0Shift;628const int height = 1 << state.samplerID.height0Shift;629630// With 8 bits of fraction (because texslope can be fairly precise.)631int detail;632switch (state.TexLevelMode()) {633case GE_TEXLEVEL_MODE_AUTO:634detail = TexLog2(std::max(std::abs(ds * width), std::abs(dt * height)));635break;636case GE_TEXLEVEL_MODE_SLOPE:637// This is always offset by an extra texlevel.638detail = TexLog2(2.0f * w * state.textureLodSlope);639break;640case GE_TEXLEVEL_MODE_CONST:641default:642// Unused value 3 operates the same as CONST.643detail = 0;644break;645}646647// Add in the bias (used in all modes), with 4 bits of fraction.648detail += state.texLevelOffset;649650if (detail > 0 && state.maxTexLevel > 0) {651bool mipFilt = state.mipFilt;652653int level8 = std::min(detail, state.maxTexLevel * 16);654if (!mipFilt) {655// Round up at 1.5.656level8 += 8;657}658level = level8 >> 4;659levelFrac = mipFilt ? level8 & 0xF : 0;660} else {661level = 0;662levelFrac = 0;663}664665if (detail > 0)666filt = state.minFilt;667else668filt = state.magFilt;669}670671static inline void ApplyTexturing(const RasterizerState &state, Vec4<int> *prim_color, const Vec4<int> &mask, const Vec4<float> &s, const Vec4<float> &t, float w) {672float ds = s[1] - s[0];673float dt = t[2] - t[0];674675int level;676int levelFrac;677bool bilinear;678CalculateSamplingParams(ds, dt, w, state, level, levelFrac, bilinear);679680PROFILE_THIS_SCOPE("sampler");681for (int i = 0; i < 4; ++i) {682if (mask[i] >= 0)683prim_color[i] = ApplyTexturing(s[i], t[i], ToVec4IntArg(prim_color[i]), level, levelFrac, bilinear, state);684}685}686687static inline Vec4<int> SOFTRAST_CALL CheckDepthTestPassed4(const Vec4<int> &mask, GEComparison func, int x, int y, int stride, Vec4<int> z) {688// Skip the depth buffer read if we're masked already.689#if defined(_M_SSE)690__m128i result = SAFE_M128I(mask.ivec);691int maskbits = _mm_movemask_epi8(result);692if (maskbits >= 0xFFFF)693return mask;694#else695Vec4<int> result = mask;696if (mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0)697return result;698#endif699700// Read in the existing depth values.701#if defined(_M_SSE)702// Tried using flags from maskbits to skip dwords... seemed neutral.703__m128i refz = _mm_cvtsi32_si128(*(u32 *)depthbuf.Get16Ptr(x, y, stride));704refz = _mm_unpacklo_epi32(refz, _mm_cvtsi32_si128(*(u32 *)depthbuf.Get16Ptr(x, y + 1, stride)));705refz = _mm_unpacklo_epi16(refz, _mm_setzero_si128());706#else707Vec4<int> refz(depthbuf.Get16(x, y, stride), depthbuf.Get16(x + 1, y, stride), depthbuf.Get16(x, y + 1, stride), depthbuf.Get16(x + 1, y + 1, stride));708#endif709710switch (func) {711case GE_COMP_NEVER:712#if defined(_M_SSE)713result = _mm_set1_epi32(-1);714#else715result = Vec4<int>::AssignToAll(-1);716#endif717break;718719case GE_COMP_ALWAYS:720break;721722case GE_COMP_EQUAL:723#if defined(_M_SSE)724result = _mm_or_si128(result, _mm_xor_si128(_mm_cmpeq_epi32(z.ivec, refz), _mm_set1_epi32(-1)));725#else726for (int i = 0; i < 4; ++i)727result[i] |= z[i] != refz[i] ? -1 : 0;728#endif729break;730731case GE_COMP_NOTEQUAL:732#if defined(_M_SSE)733result = _mm_or_si128(result, _mm_cmpeq_epi32(z.ivec, refz));734#else735for (int i = 0; i < 4; ++i)736result[i] |= z[i] == refz[i] ? -1 : 0;737#endif738break;739740case GE_COMP_LESS:741#if defined(_M_SSE)742result = _mm_or_si128(result, _mm_cmpgt_epi32(z.ivec, refz));743result = _mm_or_si128(result, _mm_cmpeq_epi32(z.ivec, refz));744#else745for (int i = 0; i < 4; ++i)746result[i] |= z[i] >= refz[i] ? -1 : 0;747#endif748break;749750case GE_COMP_LEQUAL:751#if defined(_M_SSE)752result = _mm_or_si128(result, _mm_cmpgt_epi32(z.ivec, refz));753#else754for (int i = 0; i < 4; ++i)755result[i] |= z[i] > refz[i] ? -1 : 0;756#endif757break;758759case GE_COMP_GREATER:760#if defined(_M_SSE)761result = _mm_or_si128(result, _mm_cmplt_epi32(z.ivec, refz));762result = _mm_or_si128(result, _mm_cmpeq_epi32(z.ivec, refz));763#else764for (int i = 0; i < 4; ++i)765result[i] |= z[i] <= refz[i] ? -1 : 0;766#endif767break;768769case GE_COMP_GEQUAL:770#if defined(_M_SSE)771result = _mm_or_si128(result, _mm_cmplt_epi32(z.ivec, refz));772#else773for (int i = 0; i < 4; ++i)774result[i] |= z[i] < refz[i] ? -1 : 0;775#endif776break;777}778779return result;780}781782template <bool useSSE4>783struct TriangleEdge {784Vec4<int> Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin);785inline Vec4<int> StepX(const Vec4<int> &w);786inline Vec4<int> StepY(const Vec4<int> &w);787788inline void NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX);789inline Vec4<int> StepXTimes(const Vec4<int> &w, int c);790791Vec4<int> stepX;792Vec4<int> stepY;793};794795#if defined(_M_SSE) && !PPSSPP_ARCH(X86)796#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)797[[gnu::target("sse4.1")]]798#endif799static inline __m128i SOFTRAST_CALL TriangleEdgeStartSSE4(__m128i initX, __m128i initY, int xf, int yf, int c) {800initX = _mm_mullo_epi32(initX, _mm_set1_epi32(xf));801initY = _mm_mullo_epi32(initY, _mm_set1_epi32(yf));802return _mm_add_epi32(_mm_add_epi32(initX, initY), _mm_set1_epi32(c));803}804#endif805806template <bool useSSE4>807Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {808// Start at pixel centers.809static constexpr int centerOff = (SCREEN_SCALE_FACTOR / 2) - 1;810static constexpr int centerPlus1 = SCREEN_SCALE_FACTOR + centerOff;811Vec4<int> initX = Vec4<int>::AssignToAll(origin.x) + Vec4<int>(centerOff, centerPlus1, centerOff, centerPlus1);812Vec4<int> initY = Vec4<int>::AssignToAll(origin.y) + Vec4<int>(centerOff, centerOff, centerPlus1, centerPlus1);813814// orient2d refactored.815int xf = v0.y - v1.y;816int yf = v1.x - v0.x;817int c = v1.y * v0.x - v1.x * v0.y;818819stepX = Vec4<int>::AssignToAll(xf * SCREEN_SCALE_FACTOR * 2);820stepY = Vec4<int>::AssignToAll(yf * SCREEN_SCALE_FACTOR * 2);821822#if defined(_M_SSE) && !PPSSPP_ARCH(X86)823if constexpr (useSSE4)824return TriangleEdgeStartSSE4(initX.ivec, initY.ivec, xf, yf, c);825#endif826return Vec4<int>::AssignToAll(xf) * initX + Vec4<int>::AssignToAll(yf) * initY + Vec4<int>::AssignToAll(c);827}828829template <bool useSSE4>830inline Vec4<int> TriangleEdge<useSSE4>::StepX(const Vec4<int> &w) {831#if defined(_M_SSE) && !PPSSPP_ARCH(X86)832return _mm_add_epi32(w.ivec, stepX.ivec);833#elif PPSSPP_ARCH(ARM64_NEON)834return vaddq_s32(w.ivec, stepX.ivec);835#else836return w + stepX;837#endif838}839840template <bool useSSE4>841inline Vec4<int> TriangleEdge<useSSE4>::StepY(const Vec4<int> &w) {842#if defined(_M_SSE) && !PPSSPP_ARCH(X86)843return _mm_add_epi32(w.ivec, stepY.ivec);844#elif PPSSPP_ARCH(ARM64_NEON)845return vaddq_s32(w.ivec, stepY.ivec);846#else847return w + stepY;848#endif849}850851#if defined(_M_SSE) && !PPSSPP_ARCH(X86)852#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)853[[gnu::target("sse4.1")]]854#endif855static inline int SOFTRAST_CALL MaxWeightSSE4(__m128i w) {856__m128i max2 = _mm_max_epi32(w, _mm_shuffle_epi32(w, _MM_SHUFFLE(3, 2, 3, 2)));857__m128i max1 = _mm_max_epi32(max2, _mm_shuffle_epi32(max2, _MM_SHUFFLE(1, 1, 1, 1)));858return _mm_cvtsi128_si32(max1);859}860#endif861862template <bool useSSE4>863void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {864int wmax;865#if defined(_M_SSE) && !PPSSPP_ARCH(X86)866if constexpr (useSSE4) {867wmax = MaxWeightSSE4(w.ivec);868} else {869wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));870}871#elif PPSSPP_ARCH(ARM64_NEON)872int32x2_t wmax_temp = vpmax_s32(vget_low_s32(w.ivec), vget_high_s32(w.ivec));873wmax = vget_lane_s32(vpmax_s32(wmax_temp, wmax_temp), 0);874#else875wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));876#endif877if (wmax < 0) {878if (stepX.x > 0) {879int steps = -wmax / stepX.x;880rowMinX = std::max(rowMinX, minX + steps * SCREEN_SCALE_FACTOR * 2);881} else if (stepX.x <= 0) {882rowMinX = rowMaxX + 1;883}884}885886if (wmax >= 0 && stepX.x < 0) {887int steps = (-wmax / stepX.x) + 1;888rowMaxX = std::min(rowMaxX, minX + steps * SCREEN_SCALE_FACTOR * 2);889}890}891892#if defined(_M_SSE) && !PPSSPP_ARCH(X86)893#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)894[[gnu::target("sse4.1")]]895#endif896static inline __m128i SOFTRAST_CALL StepTimesSSE4(__m128i w, __m128i step, int c) {897return _mm_add_epi32(w, _mm_mullo_epi32(_mm_set1_epi32(c), step));898}899#endif900901template <bool useSSE4>902inline Vec4<int> TriangleEdge<useSSE4>::StepXTimes(const Vec4<int> &w, int c) {903#if defined(_M_SSE) && !PPSSPP_ARCH(X86)904if constexpr (useSSE4)905return StepTimesSSE4(w.ivec, stepX.ivec, c);906#elif PPSSPP_ARCH(ARM64_NEON)907return vaddq_s32(w.ivec, vmulq_s32(vdupq_n_s32(c), stepX.ivec));908#endif909return w + stepX * c;910}911912static inline Vec4<int> MakeMask(const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<int> &bias0, const Vec4<int> &bias1, const Vec4<int> &bias2, const Vec4<int> &scissor) {913#if defined(_M_SSE) && !PPSSPP_ARCH(X86)914__m128i biased0 = _mm_add_epi32(w0.ivec, bias0.ivec);915__m128i biased1 = _mm_add_epi32(w1.ivec, bias1.ivec);916__m128i biased2 = _mm_add_epi32(w2.ivec, bias2.ivec);917918return _mm_or_si128(_mm_or_si128(biased0, _mm_or_si128(biased1, biased2)), scissor.ivec);919#elif PPSSPP_ARCH(ARM64_NEON)920int32x4_t biased0 = vaddq_s32(w0.ivec, bias0.ivec);921int32x4_t biased1 = vaddq_s32(w1.ivec, bias1.ivec);922int32x4_t biased2 = vaddq_s32(w2.ivec, bias2.ivec);923924return vorrq_s32(vorrq_s32(biased0, vorrq_s32(biased1, biased2)), scissor.ivec);925#else926return (w0 + bias0) | (w1 + bias1) | (w2 + bias2) | scissor;927#endif928}929930#if defined(_M_SSE) && !PPSSPP_ARCH(X86)931#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)932[[gnu::target("sse4.1")]]933#endif934static inline bool SOFTRAST_CALL AnyMaskSSE4(__m128i mask) {935__m128i sig = _mm_srai_epi32(mask, 31);936return _mm_test_all_ones(sig) == 0;937}938#endif939940template <bool useSSE4>941static inline bool AnyMask(const Vec4<int> &mask) {942#if defined(_M_SSE) && !PPSSPP_ARCH(X86)943if constexpr (useSSE4) {944return AnyMaskSSE4(mask.ivec);945}946947// Source: https://fgiesen.wordpress.com/2013/02/10/optimizing-the-basic-rasterizer/#comment-6676948return _mm_movemask_ps(_mm_castsi128_ps(mask.ivec)) != 15;949#elif PPSSPP_ARCH(ARM64_NEON)950int64x2_t sig = vreinterpretq_s64_s32(vshrq_n_s32(mask.ivec, 31));951return vgetq_lane_s64(sig, 0) != -1 || vgetq_lane_s64(sig, 1) != -1;952#else953return mask.x >= 0 || mask.y >= 0 || mask.z >= 0 || mask.w >= 0;954#endif955}956957static inline Vec4<float> EdgeRecip(const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2) {958#if defined(_M_SSE) && !PPSSPP_ARCH(X86)959__m128i wsum = _mm_add_epi32(w0.ivec, _mm_add_epi32(w1.ivec, w2.ivec));960// _mm_rcp_ps loses too much precision.961return _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(wsum));962#elif PPSSPP_ARCH(ARM64_NEON)963int32x4_t wsum = vaddq_s32(w0.ivec, vaddq_s32(w1.ivec, w2.ivec));964return vdivq_f32(vdupq_n_f32(1.0f), vcvtq_f32_s32(wsum));965#else966return (w0 + w1 + w2).Cast<float>().Reciprocal();967#endif968}969970template <bool clearMode, bool useSSE4>971void DrawTriangleSlice(972const VertexData& v0, const VertexData& v1, const VertexData& v2,973int x1, int y1, int x2, int y2,974const RasterizerState &state)975{976Vec4<int> bias0 = Vec4<int>::AssignToAll(IsRightSideOrFlatBottomLine(v0.screenpos.xy(), v1.screenpos.xy(), v2.screenpos.xy()) ? -1 : 0);977Vec4<int> bias1 = Vec4<int>::AssignToAll(IsRightSideOrFlatBottomLine(v1.screenpos.xy(), v2.screenpos.xy(), v0.screenpos.xy()) ? -1 : 0);978Vec4<int> bias2 = Vec4<int>::AssignToAll(IsRightSideOrFlatBottomLine(v2.screenpos.xy(), v0.screenpos.xy(), v1.screenpos.xy()) ? -1 : 0);979980const PixelFuncID &pixelID = state.pixelID;981982TriangleEdge<useSSE4> e0;983TriangleEdge<useSSE4> e1;984TriangleEdge<useSSE4> e2;985986int64_t minX = x1, maxX = x2, minY = y1, maxY = y2;987988ScreenCoords pprime(minX, minY, 0);989Vec4<int> w0_base = e0.Start(v1.screenpos, v2.screenpos, pprime);990Vec4<int> w1_base = e1.Start(v2.screenpos, v0.screenpos, pprime);991Vec4<int> w2_base = e2.Start(v0.screenpos, v1.screenpos, pprime);992993// The sum of weights should remain constant as we move toward/away from the edges.994const Vec4<float> wsum_recip = EdgeRecip(w0_base, w1_base, w2_base);995996// All the z values are the same, no interpolation required.997// This is common, and when we interpolate, we lose accuracy.998const bool flatZ = v0.screenpos.z == v1.screenpos.z && v0.screenpos.z == v2.screenpos.z;999const bool flatColorAll = !state.shadeGouraud;1000const bool flatColor0 = flatColorAll || (v0.color0 == v1.color0 && v0.color0 == v2.color0);1001const bool flatColor1 = flatColorAll || (v0.color1 == v1.color1 && v0.color1 == v2.color1);1002const bool noFog = clearMode || !pixelID.applyFog || (v0.fogdepth >= 1.0f && v1.fogdepth >= 1.0f && v2.fogdepth >= 1.0f);10031004if (pixelID.applyDepthRange && flatZ) {1005if (v0.screenpos.z < pixelID.cached.minz || v0.screenpos.z > pixelID.cached.maxz)1006return;1007}10081009#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1010uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;1011std::string tag = StringFromFormat("DisplayListT_%08x", state.listPC);1012std::string ztag = StringFromFormat("DisplayListTZ_%08x", state.listPC);1013#endif10141015const Vec4<int> v0_c0 = Vec4<int>::FromRGBA(v0.color0);1016const Vec4<int> v1_c0 = Vec4<int>::FromRGBA(v1.color0);1017const Vec4<int> v2_c0 = Vec4<int>::FromRGBA(v2.color0);1018const Vec3<int> v0_c1 = Vec3<int>::FromRGB(v0.color1);1019const Vec3<int> v1_c1 = Vec3<int>::FromRGB(v1.color1);1020const Vec3<int> v2_c1 = Vec3<int>::FromRGB(v2.color1);10211022const Vec4<float> v0_z4 = Vec4<int>::AssignToAll(v0.screenpos.z).Cast<float>();1023const Vec4<float> v1_z4 = Vec4<int>::AssignToAll(v1.screenpos.z).Cast<float>();1024const Vec4<float> v2_z4 = Vec4<int>::AssignToAll(v2.screenpos.z).Cast<float>();1025const Vec4<int> minz = Vec4<int>::AssignToAll(pixelID.cached.minz);1026const Vec4<int> maxz = Vec4<int>::AssignToAll(pixelID.cached.maxz);10271028for (int64_t curY = minY; curY <= maxY; curY += SCREEN_SCALE_FACTOR * 2,1029w0_base = e0.StepY(w0_base),1030w1_base = e1.StepY(w1_base),1031w2_base = e2.StepY(w2_base)) {1032Vec4<int> w0 = w0_base;1033Vec4<int> w1 = w1_base;1034Vec4<int> w2 = w2_base;10351036DrawingCoords p = TransformUnit::ScreenToDrawing(minX, curY);10371038int64_t rowMinX = minX, rowMaxX = maxX;1039e0.NarrowMinMaxX(w0, minX, rowMinX, rowMaxX);1040e1.NarrowMinMaxX(w1, minX, rowMinX, rowMaxX);1041e2.NarrowMinMaxX(w2, minX, rowMinX, rowMaxX);10421043int skipX = (rowMinX - minX) / (SCREEN_SCALE_FACTOR * 2);1044w0 = e0.StepXTimes(w0, skipX);1045w1 = e1.StepXTimes(w1, skipX);1046w2 = e2.StepXTimes(w2, skipX);1047p.x = (p.x + 2 * skipX) & 0x3FF;10481049// TODO: Maybe we can clip the edges instead?1050int scissorYPlus1 = curY + SCREEN_SCALE_FACTOR > maxY ? -1 : 0;1051Vec4<int> scissor_mask = Vec4<int>(0, rowMaxX - rowMinX - SCREEN_SCALE_FACTOR, scissorYPlus1, (rowMaxX - rowMinX - SCREEN_SCALE_FACTOR) | scissorYPlus1);1052Vec4<int> scissor_step = Vec4<int>(0, -(SCREEN_SCALE_FACTOR * 2), 0, -(SCREEN_SCALE_FACTOR * 2));10531054for (int64_t curX = rowMinX; curX <= rowMaxX; curX += SCREEN_SCALE_FACTOR * 2,1055w0 = e0.StepX(w0),1056w1 = e1.StepX(w1),1057w2 = e2.StepX(w2),1058scissor_mask = scissor_mask + scissor_step,1059p.x = (p.x + 2) & 0x3FF) {10601061// If p is on or inside all edges, render pixel1062Vec4<int> mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask);1063if (AnyMask<useSSE4>(mask)) {1064Vec4<int> z;1065if (flatZ) {1066z = Vec4<int>::AssignToAll(v2.screenpos.z);1067} else {1068// Z is interpolated pretty much directly.1069Vec4<float> zfloats = w0.Cast<float>() * v0_z4 + w1.Cast<float>() * v1_z4 + w2.Cast<float>() * v2_z4;1070z = (zfloats * wsum_recip).Cast<int>();1071}10721073if (pixelID.earlyZChecks) {1074if (pixelID.applyDepthRange) {1075#if defined(_M_SSE)1076mask.ivec = _mm_or_si128(mask.ivec, _mm_or_si128(_mm_cmplt_epi32(z.ivec, minz.ivec), _mm_cmpgt_epi32(z.ivec, maxz.ivec)));1077#else1078for (int i = 0; i < 4; ++i) {1079if (z[i] < minz[i] || z[i] > maxz[i])1080mask[i] = -1;1081}1082#endif1083}1084mask = CheckDepthTestPassed4(mask, pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z);1085if (!AnyMask<useSSE4>(mask))1086continue;1087}10881089// Color interpolation is not perspective corrected on the PSP.1090Vec4<int> prim_color[4];1091if (!flatColor0) {1092for (int i = 0; i < 4; ++i) {1093if (mask[i] >= 0)1094prim_color[i] = Interpolate(v0_c0, v1_c0, v2_c0, w0[i], w1[i], w2[i], wsum_recip[i]);1095}1096} else {1097for (int i = 0; i < 4; ++i) {1098prim_color[i] = v2_c0;1099}1100}1101Vec3<int> sec_color[4];1102if (!flatColor1) {1103for (int i = 0; i < 4; ++i) {1104if (mask[i] >= 0)1105sec_color[i] = Interpolate(v0_c1, v1_c1, v2_c1, w0[i], w1[i], w2[i], wsum_recip[i]);1106}1107} else {1108for (int i = 0; i < 4; ++i) {1109sec_color[i] = v2_c1;1110}1111}11121113if (state.enableTextures) {1114if constexpr (!clearMode) {1115Vec4<float> s, t;1116if (state.throughMode) {1117s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), w0, w1,1118w2, wsum_recip);1119t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), w0, w1,1120w2, wsum_recip);11211122// For levels > 0, mipmapping is always based on level 0. Simpler to scale first.1123s *= 1.0f / (float) (1 << state.samplerID.width0Shift);1124t *= 1.0f / (float) (1 << state.samplerID.height0Shift);1125} else if (state.textureProj) {1126// Texture coordinate interpolation must definitely be perspective-correct.1127GetTextureCoordinatesProj(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);1128} else {1129// Texture coordinate interpolation must definitely be perspective-correct.1130GetTextureCoordinates(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);1131}11321133if (state.TexLevelMode() == GE_TEXLEVEL_MODE_SLOPE) {1134// Not sure what's right, but we need one value for the slope.1135float clipw = (v0.clipw * w0.x + v1.clipw * w1.x + v2.clipw * w2.x) * wsum_recip.x;1136ApplyTexturing(state, prim_color, mask, s, t, clipw);1137} else {1138ApplyTexturing(state, prim_color, mask, s, t, 0.0f);1139}1140}1141}11421143if constexpr (!clearMode) {1144for (int i = 0; i < 4; ++i) {1145#if defined(_M_SSE)1146// TODO: Tried making Vec4 do this, but things got slower.1147const __m128i sec = _mm_and_si128(sec_color[i].ivec, _mm_set_epi32(0, -1, -1, -1));1148prim_color[i].ivec = _mm_add_epi32(prim_color[i].ivec, sec);1149#elif PPSSPP_ARCH(ARM64_NEON)1150int32x4_t sec = vsetq_lane_s32(0, sec_color[i].ivec, 3);1151prim_color[i].ivec = vaddq_s32(prim_color[i].ivec, sec);1152#else1153prim_color[i] += Vec4<int>(sec_color[i], 0);1154#endif1155}1156}11571158Vec4<int> fog = Vec4<int>::AssignToAll(255);1159if (!noFog) {1160Vec4<float> fogdepths = w0.Cast<float>() * v0.fogdepth + w1.Cast<float>() * v1.fogdepth + w2.Cast<float>() * v2.fogdepth;1161fogdepths = fogdepths * wsum_recip;1162for (int i = 0; i < 4; ++i) {1163fog[i] = ClampFogDepth(fogdepths[i]);1164}1165}11661167PROFILE_THIS_SCOPE("draw_tri_px");1168DrawingCoords subp = p;1169for (int i = 0; i < 4; ++i) {1170if (mask[i] < 0) {1171continue;1172}1173subp.x = p.x + (i & 1);1174subp.y = p.y + (i / 2);11751176state.drawPixel(subp.x, subp.y, z[i], fog[i], ToVec4IntArg(prim_color[i]), pixelID);11771178#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED)1179uint32_t row = gstate.getFrameBufAddress() + subp.y * pixelID.cached.framebufStride * bpp;1180NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * bpp, bpp, tag.c_str(), tag.size());1181if (pixelID.depthWrite) {1182row = gstate.getDepthBufAddress() + subp.y * pixelID.cached.depthbufStride * 2;1183NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * 2, 2, ztag.c_str(), ztag.size());1184}1185#endif1186}1187}1188}1189}11901191#if !defined(SOFTGPU_MEMORY_TAGGING_DETAILED) && defined(SOFTGPU_MEMORY_TAGGING_BASIC)1192for (int y = minY; y <= maxY; y += SCREEN_SCALE_FACTOR) {1193DrawingCoords p = TransformUnit::ScreenToDrawing(minX, y);1194DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX, y);1195uint32_t row = gstate.getFrameBufAddress() + p.y * pixelID.cached.framebufStride * bpp;1196NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, (pend.x - p.x) * bpp, tag.c_str(), tag.size());11971198if (pixelID.depthWrite) {1199row = gstate.getDepthBufAddress() + p.y * pixelID.cached.depthbufStride * 2;1200NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, (pend.x - p.x) * 2, ztag.c_str(), ztag.size());1201}1202}1203#endif1204}12051206// Draws triangle, vertices specified in counter-clockwise direction1207void DrawTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const BinCoords &range, const RasterizerState &state) {1208PROFILE_THIS_SCOPE("draw_tri");12091210auto drawSlice = cpu_info.bSSE4_1 ?1211(state.pixelID.clearMode ? &DrawTriangleSlice<true, true> : &DrawTriangleSlice<false, true>) :1212(state.pixelID.clearMode ? &DrawTriangleSlice<true, false> : &DrawTriangleSlice<false, false>);12131214drawSlice(v0, v1, v2, range.x1, range.y1, range.x2, range.y2, state);1215}12161217void DrawRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &rastState) {1218int entireX1 = std::min(v0.screenpos.x, v1.screenpos.x);1219int entireY1 = std::min(v0.screenpos.y, v1.screenpos.y);1220int entireX2 = std::max(v0.screenpos.x, v1.screenpos.x) - 1;1221int entireY2 = std::max(v0.screenpos.y, v1.screenpos.y) - 1;1222int minX = std::max(entireX1 & ~(SCREEN_SCALE_FACTOR - 1), range.x1) | (SCREEN_SCALE_FACTOR / 2 - 1);1223int minY = std::max(entireY1 & ~(SCREEN_SCALE_FACTOR - 1), range.y1) | (SCREEN_SCALE_FACTOR / 2 - 1);1224int maxX = std::min(entireX2, range.x2);1225int maxY = std::min(entireY2, range.y2);12261227// If TL x or y was after the half, we don't draw the pixel.1228// TODO: Verify what center is used, allowing slight offset makes gpu/primitives/trianglefan pass.1229if (minX < entireX1 - 1)1230minX += SCREEN_SCALE_FACTOR;1231if (minY < entireY1 - 1)1232minY += SCREEN_SCALE_FACTOR;12331234RasterizerState state = OptimizeFlatRasterizerState(rastState, v1);12351236Vec2f rowST(0.0f, 0.0f);1237// Note: this is double the x or y movement.1238Vec2f stx(0.0f, 0.0f);1239Vec2f sty(0.0f, 0.0f);1240if (state.enableTextures) {1241// Note: texture projection is not handled here, those always turn into triangles.1242Vec2f tc0 = v0.texturecoords.uv();1243Vec2f tc1 = v1.texturecoords.uv();1244if (state.throughMode) {1245// For levels > 0, mipmapping is always based on level 0. Simpler to scale first.1246tc0.s() *= 1.0f / (float)(1 << state.samplerID.width0Shift);1247tc1.s() *= 1.0f / (float)(1 << state.samplerID.width0Shift);1248tc0.t() *= 1.0f / (float)(1 << state.samplerID.height0Shift);1249tc1.t() *= 1.0f / (float)(1 << state.samplerID.height0Shift);1250}12511252float diffX = (entireX2 - entireX1 + 1) / (float)SCREEN_SCALE_FACTOR;1253float diffY = (entireY2 - entireY1 + 1) / (float)SCREEN_SCALE_FACTOR;1254float diffS = tc1.s() - tc0.s();1255float diffT = tc1.t() - tc0.t();12561257if (v0.screenpos.x < v1.screenpos.x) {1258if (v0.screenpos.y < v1.screenpos.y) {1259// Okay, simple, TL -> BR. S and T move toward v1 with X and Y.1260rowST = tc0;1261stx = Vec2f(2.0f * diffS / diffX, 0.0f);1262sty = Vec2f(0.0f, 2.0f * diffT / diffY);1263} else {1264// BL to TR, rotated. We start at TL still.1265// X moves T (not S) toward v1, and Y moves S away from v1.1266rowST = Vec2f(tc1.s(), tc0.t());1267stx = Vec2f(0.0f, 2.0f * diffT / diffX);1268sty = Vec2f(2.0f * -diffS / diffY, 0.0f);1269}1270} else {1271if (v0.screenpos.y < v1.screenpos.y) {1272// TR to BL. Like BL to TR, rotated.1273// X moves T (not s) away from v1, and Y moves S toward v1.1274rowST = Vec2f(tc0.s(), tc1.t());1275stx = Vec2f(0.0f, 2.0f * -diffT / diffX);1276sty = Vec2f(2.0f * diffS / diffY, 0.0f);1277} else {1278// BR to TL, just inverse of TL to BR.1279rowST = Vec2f(tc1.s(), tc1.t());1280stx = Vec2f(2.0f * -diffS / diffX, 0.0f);1281sty = Vec2f(0.0f, 2.0f * -diffT / diffY);1282}1283}12841285// Okay, now move ST to the minX, minY position.1286rowST += (stx / (float)(SCREEN_SCALE_FACTOR * 2)) * (minX - entireX1 + 1);1287rowST += (sty / (float)(SCREEN_SCALE_FACTOR * 2)) * (minY - entireY1 + 1);1288}12891290// And now what we add to spread out to 4 values.1291const Vec4f sto4(0.0f, 0.5f * stx.s(), 0.5f * sty.s(), 0.5f * stx.s() + 0.5f * sty.s());1292const Vec4f tto4(0.0f, 0.5f * stx.t(), 0.5f * sty.t(), 0.5f * stx.t() + 0.5f * sty.t());12931294ScreenCoords pprime(minX, minY, 0);1295const Vec4<int> fog = Vec4<int>::AssignToAll(ClampFogDepth(v1.fogdepth));1296const Vec4<int> z = Vec4<int>::AssignToAll(v1.screenpos.z);1297const Vec4<int> c0 = Vec4<int>::FromRGBA(v1.color0);1298const Vec3<int> sec_color = Vec3<int>::FromRGB(v1.color1);12991300if (state.pixelID.applyDepthRange) {1301// We can bail early since the Z is flat.1302if (v1.screenpos.z < state.pixelID.cached.minz || v1.screenpos.z > state.pixelID.cached.maxz)1303return;1304}13051306#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1307uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;1308std::string tag = StringFromFormat("DisplayListR_%08x", state.listPC);1309std::string ztag = StringFromFormat("DisplayListRZ_%08x", state.listPC);1310#endif13111312for (int64_t curY = minY; curY < maxY; curY += SCREEN_SCALE_FACTOR * 2, rowST += sty) {1313DrawingCoords p = TransformUnit::ScreenToDrawing(minX, curY);13141315int scissorY2 = curY + SCREEN_SCALE_FACTOR > maxY ? -1 : 0;1316Vec4<int> scissor_mask = Vec4<int>(0, maxX - minX - SCREEN_SCALE_FACTOR, scissorY2, (maxX - minX - SCREEN_SCALE_FACTOR) | scissorY2);1317Vec4<int> scissor_step = Vec4<int>(0, -(SCREEN_SCALE_FACTOR * 2), 0, -(SCREEN_SCALE_FACTOR * 2));1318Vec2f st = rowST;13191320for (int64_t curX = minX; curX < maxX; curX += SCREEN_SCALE_FACTOR * 2,1321st += stx,1322scissor_mask += scissor_step,1323p.x = (p.x + 2) & 0x3FF) {1324Vec4<int> mask = scissor_mask;13251326Vec4<int> prim_color[4];1327for (int i = 0; i < 4; ++i) {1328prim_color[i] = c0;1329}13301331if (state.pixelID.earlyZChecks) {1332for (int i = 0; i < 4; ++i) {1333if (mask[i] < 0)1334continue;13351336int x = p.x + (i & 1);1337int y = p.y + (i / 2);1338if (!CheckDepthTestPassed(state.pixelID.DepthTestFunc(), x, y, state.pixelID.cached.depthbufStride, z[i])) {1339mask[i] = -1;1340}1341}1342}13431344if (state.enableTextures) {1345Vec4<float> s, t;1346s = Vec4<float>::AssignToAll(st.s()) + sto4;1347t = Vec4<float>::AssignToAll(st.t()) + tto4;13481349ApplyTexturing(state, prim_color, mask, s, t, v1.clipw);1350}13511352if (!state.pixelID.clearMode) {1353for (int i = 0; i < 4; ++i) {1354#if defined(_M_SSE)1355// TODO: Tried making Vec4 do this, but things got slower.1356const __m128i sec = _mm_and_si128(sec_color.ivec, _mm_set_epi32(0, -1, -1, -1));1357prim_color[i].ivec = _mm_add_epi32(prim_color[i].ivec, sec);1358#elif PPSSPP_ARCH(ARM64_NEON)1359int32x4_t sec = vsetq_lane_s32(0, sec_color.ivec, 3);1360prim_color[i].ivec = vaddq_s32(prim_color[i].ivec, sec);1361#else1362prim_color[i] += Vec4<int>(sec_color, 0);1363#endif1364}1365}13661367PROFILE_THIS_SCOPE("draw_rect_px");1368DrawingCoords subp = p;1369for (int i = 0; i < 4; ++i) {1370if (mask[i] < 0) {1371continue;1372}1373subp.x = p.x + (i & 1);1374subp.y = p.y + (i / 2);13751376state.drawPixel(subp.x, subp.y, z[i], fog[i], ToVec4IntArg(prim_color[i]), state.pixelID);13771378#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED)1379uint32_t row = gstate.getFrameBufAddress() + subp.y * state.pixelID.cached.framebufStride * bpp;1380NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * bpp, bpp, tag.c_str(), tag.size());1381if (state.pixelID.depthWrite) {1382row = gstate.getDepthBufAddress() + subp.y * state.pixelID.cached.depthbufStride * 2;1383NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * 2, 2, ztag.c_str(), ztag.size());1384}1385#endif1386}1387}1388}13891390#if !defined(SOFTGPU_MEMORY_TAGGING_DETAILED) && defined(SOFTGPU_MEMORY_TAGGING_BASIC)1391for (int y = minY; y <= maxY; y += SCREEN_SCALE_FACTOR) {1392DrawingCoords p = TransformUnit::ScreenToDrawing(minX, y);1393DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX, y);1394uint32_t row = gstate.getFrameBufAddress() + p.y * state.pixelID.cached.framebufStride * bpp;1395NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, (pend.x - p.x) * bpp, tag.c_str(), tag.size());13961397if (state.pixelID.depthWrite) {1398row = gstate.getDepthBufAddress() + p.y * state.pixelID.cached.depthbufStride * 2;1399NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, (pend.x - p.x) * 2, ztag.c_str(), ztag.size());1400}1401}1402#endif1403}14041405void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerState &state) {1406ScreenCoords pos = v0.screenpos;1407Vec4<int> prim_color = Vec4<int>::FromRGBA(v0.color0);14081409auto &pixelID = state.pixelID;1410auto &samplerID = state.samplerID;14111412DrawingCoords p = TransformUnit::ScreenToDrawing(pos);1413u16 z = pos.z;14141415if (pixelID.earlyZChecks) {1416if (pixelID.applyDepthRange) {1417if (z < pixelID.cached.minz || z > pixelID.cached.maxz)1418return;1419}14201421if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z)) {1422return;1423}1424}14251426if (state.enableTextures) {1427float s = v0.texturecoords.s();1428float t = v0.texturecoords.t();1429if (state.throughMode) {1430s *= 1.0f / (float)(1 << state.samplerID.width0Shift);1431t *= 1.0f / (float)(1 << state.samplerID.height0Shift);1432} else if (state.textureProj) {1433GetTextureCoordinatesProj(v0, v0, 0.0f, s, t);1434} else {1435// Texture coordinate interpolation must definitely be perspective-correct.1436GetTextureCoordinates(v0, v0, 0.0f, s, t);1437}14381439int texLevel;1440int texLevelFrac;1441bool bilinear;1442CalculateSamplingParams(0.0f, 0.0f, v0.clipw, state, texLevel, texLevelFrac, bilinear);1443PROFILE_THIS_SCOPE("sampler");1444prim_color = ApplyTexturingSingle(s, t, ToVec4IntArg(prim_color), texLevel, texLevelFrac, bilinear, state);1445}14461447if (!pixelID.clearMode) {1448Vec3<int> sec_color = Vec3<int>::FromRGB(v0.color1);1449prim_color += Vec4<int>(sec_color, 0);1450}14511452u8 fog = 255;1453if (pixelID.applyFog) {1454fog = ClampFogDepth(v0.fogdepth);1455}14561457PROFILE_THIS_SCOPE("draw_px");1458state.drawPixel(p.x, p.y, z, fog, ToVec4IntArg(prim_color), pixelID);14591460#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1461uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;1462std::string tag = StringFromFormat("DisplayListP_%08x", state.listPC);14631464uint32_t row = gstate.getFrameBufAddress() + p.y * pixelID.cached.framebufStride * bpp;1465NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, bpp, tag.c_str(), tag.size());14661467if (pixelID.depthWrite) {1468std::string ztag = StringFromFormat("DisplayListPZ_%08x", state.listPC);1469row = gstate.getDepthBufAddress() + p.y * pixelID.cached.depthbufStride * 2;1470NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, 2, ztag.c_str(), ztag.size());1471}1472#endif1473}14741475void ClearRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state) {1476int entireX1 = std::min(v0.screenpos.x, v1.screenpos.x);1477int entireY1 = std::min(v0.screenpos.y, v1.screenpos.y);1478int entireX2 = std::max(v0.screenpos.x, v1.screenpos.x) - 1;1479int entireY2 = std::max(v0.screenpos.y, v1.screenpos.y) - 1;1480int minX = std::max(entireX1 & ~(SCREEN_SCALE_FACTOR - 1), range.x1) | (SCREEN_SCALE_FACTOR / 2 - 1);1481int minY = std::max(entireY1 & ~(SCREEN_SCALE_FACTOR - 1), range.y1) | (SCREEN_SCALE_FACTOR / 2 - 1);1482int maxX = std::min(entireX2, range.x2);1483int maxY = std::min(entireY2, range.y2);14841485// If TL x or y was after the half, we don't draw the pixel.1486if (minX < entireX1 - 1)1487minX += SCREEN_SCALE_FACTOR;1488if (minY < entireY1 - 1)1489minY += SCREEN_SCALE_FACTOR;14901491const DrawingCoords pprime = TransformUnit::ScreenToDrawing(minX, minY);1492// Only include the end pixel when it's >= 0.5.1493const DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX - SCREEN_SCALE_FACTOR / 2, maxY - SCREEN_SCALE_FACTOR / 2);1494auto &pixelID = state.pixelID;1495auto &samplerID = state.samplerID;14961497const int w = pend.x - pprime.x + 1;1498if (w <= 0)1499return;15001501if (pixelID.DepthClear()) {1502const u16 z = v1.screenpos.z;1503const int stride = pixelID.cached.depthbufStride;15041505// If both bytes of Z equal, we can just use memset directly which is faster.1506if ((z & 0xFF) == (z >> 8)) {1507DrawingCoords p = pprime;1508for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1509u16 *row = depthbuf.Get16Ptr(p.x, p.y, stride);1510memset(row, z, w * 2);1511}1512} else {1513DrawingCoords p = pprime;1514for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1515for (int x = 0; x < w; ++x) {1516SetPixelDepth(p.x + x, p.y, pixelID.cached.depthbufStride, z);1517}1518}1519}15201521#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1522std::string tag = StringFromFormat("DisplayListXZ_%08x", state.listPC);1523for (int y = pprime.y; y <= pend.y; ++y) {1524uint32_t row = gstate.getDepthBufAddress() + y * pixelID.cached.depthbufStride * 2;1525NotifyMemInfo(MemBlockFlags::WRITE, row + pprime.x * 2, w * 2, tag.c_str(), tag.size());1526}1527#endif1528}15291530// Note: this stays 0xFFFFFFFF if keeping color and alpha, even for 16-bit.1531u32 keepOldMask = 0xFFFFFFFF;1532if (pixelID.ColorClear() && pixelID.StencilClear()) {1533keepOldMask = 0;1534} else {1535switch (pixelID.FBFormat()) {1536case GE_FORMAT_565:1537if (pixelID.ColorClear())1538keepOldMask = 0;1539break;15401541case GE_FORMAT_5551:1542if (pixelID.ColorClear())1543keepOldMask = 0xFFFF8000;1544else if (pixelID.StencilClear())1545keepOldMask = 0xFFFF7FFF;1546break;15471548case GE_FORMAT_4444:1549if (pixelID.ColorClear())1550keepOldMask = 0xFFFFF000;1551else if (pixelID.StencilClear())1552keepOldMask = 0xFFFF0FFF;1553break;15541555case GE_FORMAT_8888:1556default:1557if (pixelID.ColorClear())1558keepOldMask = 0xFF000000;1559else if (pixelID.StencilClear())1560keepOldMask = 0x00FFFFFF;1561break;1562}1563}15641565// The pixel write masks are respected in clear mode.1566if (pixelID.applyColorWriteMask) {1567keepOldMask |= pixelID.cached.colorWriteMask;1568}15691570const u32 new_color = v1.color0;1571u16 new_color16;1572switch (pixelID.FBFormat()) {1573case GE_FORMAT_565:1574new_color16 = RGBA8888ToRGB565(new_color);1575break;15761577case GE_FORMAT_5551:1578new_color16 = RGBA8888ToRGBA5551(new_color);1579break;15801581case GE_FORMAT_4444:1582new_color16 = RGBA8888ToRGBA4444(new_color);1583break;15841585case GE_FORMAT_8888:1586break;15871588case GE_FORMAT_INVALID:1589case GE_FORMAT_DEPTH16:1590case GE_FORMAT_CLUT8:1591_dbg_assert_msg_(false, "Software: invalid framebuf format.");1592break;1593}15941595if (keepOldMask == 0) {1596const int stride = pixelID.cached.framebufStride;15971598if (pixelID.FBFormat() == GE_FORMAT_8888) {1599const bool canMemsetColor = (new_color & 0xFF) == (new_color >> 8) && (new_color & 0xFFFF) == (new_color >> 16);1600if (canMemsetColor) {1601DrawingCoords p = pprime;1602for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1603u32 *row = fb.Get32Ptr(p.x, p.y, stride);1604memset(row, new_color, w * 4);1605}1606} else {1607DrawingCoords p = pprime;1608for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1609for (int x = 0; x < w; ++x) {1610fb.Set32(p.x + x, p.y, stride, new_color);1611}1612}1613}1614} else {1615const bool canMemsetColor = (new_color16 & 0xFF) == (new_color16 >> 8);1616if (canMemsetColor) {1617DrawingCoords p = pprime;1618for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1619u16 *row = fb.Get16Ptr(p.x, p.y, stride);1620memset(row, new_color16, w * 2);1621}1622} else {1623DrawingCoords p = pprime;1624for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1625for (int x = 0; x < w; ++x) {1626fb.Set16(p.x + x, p.y, stride, new_color16);1627}1628}1629}1630}1631} else if (keepOldMask != 0xFFFFFFFF) {1632const int stride = pixelID.cached.framebufStride;16331634if (pixelID.FBFormat() == GE_FORMAT_8888) {1635DrawingCoords p = pprime;1636for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1637for (int x = 0; x < w; ++x) {1638const u32 old_color = fb.Get32(p.x + x, p.y, stride);1639const u32 c = (old_color & keepOldMask) | (new_color & ~keepOldMask);1640fb.Set32(p.x + x, p.y, stride, c);1641}1642}1643} else {1644DrawingCoords p = pprime;1645for (p.y = pprime.y; p.y <= pend.y; ++p.y) {1646for (int x = 0; x < w; ++x) {1647const u16 old_color = fb.Get16(p.x + x, p.y, stride);1648const u16 c = (old_color & keepOldMask) | (new_color16 & ~keepOldMask);1649fb.Set16(p.x + x, p.y, stride, c);1650}1651}1652}1653}16541655#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1656if (keepOldMask != 0xFFFFFFFF) {1657uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;1658std::string tag = StringFromFormat("DisplayListX_%08x", state.listPC);1659for (int y = pprime.y; y < pend.y; ++y) {1660uint32_t row = gstate.getFrameBufAddress() + y * pixelID.cached.framebufStride * bpp;1661NotifyMemInfo(MemBlockFlags::WRITE, row + pprime.x * bpp, w * bpp, tag.c_str(), tag.size());1662}1663}1664#endif1665}16661667void DrawLine(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state) {1668// TODO: Use a proper line drawing algorithm that handles fractional endpoints correctly.1669Vec3<int> a(v0.screenpos.x, v0.screenpos.y, v0.screenpos.z);1670Vec3<int> b(v1.screenpos.x, v1.screenpos.y, v1.screenpos.z);16711672int dx = b.x - a.x;1673int dy = b.y - a.y;1674int dz = b.z - a.z;16751676int steps;1677if (abs(dx) < abs(dy))1678steps = abs(dy) / SCREEN_SCALE_FACTOR;1679else1680steps = abs(dx) / SCREEN_SCALE_FACTOR;16811682// Avoid going too far since we typically don't start at the pixel center.1683if (dx < 0 && dx >= -SCREEN_SCALE_FACTOR)1684dx++;1685if (dy < 0 && dy >= -SCREEN_SCALE_FACTOR)1686dy++;16871688double xinc = (double)dx / steps;1689double yinc = (double)dy / steps;1690double zinc = (double)dz / steps;16911692auto &pixelID = state.pixelID;1693auto &samplerID = state.samplerID;16941695const bool interpolateColor = !state.shadeGouraud || (v0.color0 == v1.color0 && v0.color1 == v1.color1);1696const Vec4<int> v0_c0 = Vec4<int>::FromRGBA(v0.color0);1697const Vec4<int> v1_c0 = Vec4<int>::FromRGBA(v1.color0);1698const Vec3<int> v0_c1 = Vec3<int>::FromRGB(v0.color1);1699const Vec3<int> v1_c1 = Vec3<int>::FromRGB(v1.color1);17001701#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1702std::string tag = StringFromFormat("DisplayListL_%08x", state.listPC);1703std::string ztag = StringFromFormat("DisplayListLZ_%08x", state.listPC);1704#endif17051706double x = a.x > b.x ? a.x - 1 : a.x;1707double y = a.y > b.y ? a.y - 1 : a.y;1708double z = a.z;1709const int steps1 = steps == 0 ? 1 : steps;1710for (int i = 0; i < steps; i++) {1711DrawingCoords p = TransformUnit::ScreenToDrawing(x, y);17121713bool maskOK = x >= range.x1 && y >= range.y1 && x <= range.x2 && y <= range.y2;1714if (maskOK) {1715if (pixelID.earlyZChecks) {1716if (pixelID.applyDepthRange) {1717if (z < pixelID.cached.minz || z > pixelID.cached.maxz)1718maskOK = false;1719}17201721if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z)) {1722maskOK = false;1723}1724}1725}17261727if (maskOK) {1728// Interpolate between the two points.1729Vec4<int> prim_color;1730Vec3<int> sec_color;1731if (interpolateColor) {1732prim_color = (v0_c0 * (steps - i) + v1_c0 * i) / steps1;1733sec_color = (v0_c1 * (steps - i) + v1_c1 * i) / steps1;1734} else {1735prim_color = v1_c0;1736sec_color = v1_c1;1737}17381739u8 fog = 255;1740if (pixelID.applyFog) {1741fog = ClampFogDepth((v0.fogdepth * (float)(steps - i) + v1.fogdepth * (float)i) / steps1);1742}17431744if (state.antialiasLines) {1745// TODO: Clearmode?1746// TODO: Calculate.1747prim_color.a() = 0x7F;1748}17491750if (state.enableTextures) {1751float s, s1;1752float t, t1;1753if (state.throughMode) {1754Vec2<float> tc = (v0.texturecoords.uv() * (float)(steps - i) + v1.texturecoords.uv() * (float)i) / steps1;1755Vec2<float> tc1 = (v0.texturecoords.uv() * (float)(steps - i - 1) + v1.texturecoords.uv() * (float)(i + 1)) / steps1;17561757s = tc.s() * (1.0f / (float)(1 << state.samplerID.width0Shift));1758s1 = tc1.s() * (1.0f / (float)(1 << state.samplerID.width0Shift));1759t = tc.t() * (1.0f / (float)(1 << state.samplerID.height0Shift));1760t1 = tc1.t() * (1.0f / (float)(1 << state.samplerID.height0Shift));1761} else if (state.textureProj) {1762GetTextureCoordinatesProj(v0, v1, (float)(steps - i) / steps1, s, t);1763GetTextureCoordinatesProj(v0, v1, (float)(steps - i - 1) / steps1, s1, t1);1764} else {1765// Texture coordinate interpolation must definitely be perspective-correct.1766GetTextureCoordinates(v0, v1, (float)(steps - i) / steps1, s, t);1767GetTextureCoordinates(v0, v1, (float)(steps - i - 1) / steps1, s1, t1);1768}17691770// If inc is 0, force the delta to zero.1771float ds = xinc == 0.0 ? 0.0f : (s1 - s) * (float)SCREEN_SCALE_FACTOR * (1.0f / xinc);1772float dt = yinc == 0.0 ? 0.0f : (t1 - t) * (float)SCREEN_SCALE_FACTOR * (1.0f / yinc);1773float w = (v0.clipw * (float)(steps - i) + v1.clipw * (float)i) / steps1;17741775int texLevel;1776int texLevelFrac;1777bool texBilinear;1778CalculateSamplingParams(ds, dt, w, state, texLevel, texLevelFrac, texBilinear);17791780if (state.antialiasLines) {1781// TODO: This is a naive and wrong implementation.1782DrawingCoords p0 = TransformUnit::ScreenToDrawing(x, y);1783s = ((float)p0.x + xinc / 32.0f) / 512.0f;1784t = ((float)p0.y + yinc / 32.0f) / 512.0f;17851786texBilinear = true;1787}17881789PROFILE_THIS_SCOPE("sampler");1790prim_color = ApplyTexturingSingle(s, t, ToVec4IntArg(prim_color), texLevel, texLevelFrac, texBilinear, state);1791}17921793if (!pixelID.clearMode)1794prim_color += Vec4<int>(sec_color, 0);17951796PROFILE_THIS_SCOPE("draw_px");1797state.drawPixel(p.x, p.y, z, fog, ToVec4IntArg(prim_color), pixelID);17981799#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)1800uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;1801uint32_t row = gstate.getFrameBufAddress() + p.y * pixelID.cached.framebufStride * bpp;1802NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, bpp, tag.c_str(), tag.size());18031804if (pixelID.depthWrite) {1805uint32_t row = gstate.getDepthBufAddress() + y * pixelID.cached.depthbufStride * 2;1806NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, 2, ztag.c_str(), ztag.size());1807}1808#endif1809}18101811x += xinc;1812y += yinc;1813z += zinc;1814}1815}18161817bool GetCurrentTexture(GPUDebugBuffer &buffer, int level)1818{1819if (!gstate.isTextureMapEnabled()) {1820return false;1821}18221823GETextureFormat texfmt = gstate.getTextureFormat();1824u32 texaddr = gstate.getTextureAddress(level);1825u32 texbufw = GetTextureBufw(level, texaddr, texfmt);1826int w = gstate.getTextureWidth(level);1827int h = gstate.getTextureHeight(level);18281829u32 sizeInBits = textureBitsPerPixel[texfmt] * (texbufw * (h - 1) + w);1830if (!texaddr || !Memory::IsValidRange(texaddr, sizeInBits / 8))1831return false;1832// We'll break trying to allocate this much.1833if (w >= 0x8000 && h >= 0x8000)1834return false;18351836buffer.Allocate(w, h, GE_FORMAT_8888, false);18371838SamplerID id;1839ComputeSamplerID(&id);1840id.cached.clut = clut;18411842// Slight annoyance, we may have to force a compile.1843Sampler::FetchFunc sampler = Sampler::GetFetchFunc(id, nullptr);1844if (!sampler) {1845Sampler::FlushJit();1846sampler = Sampler::GetFetchFunc(id, nullptr);1847if (!sampler)1848return false;1849}18501851u8 *texptr = Memory::GetPointerWrite(texaddr);1852u32 *row = (u32 *)buffer.GetData();1853for (int y = 0; y < h; ++y) {1854for (int x = 0; x < w; ++x) {1855row[x] = Vec4<int>(sampler(x, y, texptr, texbufw, level, id)).ToRGBA();1856}1857row += w;1858}1859return true;1860}18611862} // namespace186318641865