CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Common/FragmentShaderGenerator.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cstdio>18#include <sstream>1920#include "Common/Log.h"21#include "Common/StringUtils.h"22#include "Common/GPU/OpenGL/GLFeatures.h"23#include "Common/GPU/ShaderWriter.h"24#include "Common/GPU/thin3d.h"25#include "Core/Compatibility.h"26#include "Core/Config.h"27#include "Core/System.h"28#include "GPU/Common/GPUStateUtils.h"29#include "GPU/Common/ShaderId.h"30#include "GPU/Common/ShaderUniforms.h"31#include "GPU/Common/FragmentShaderGenerator.h"32#include "GPU/Vulkan/DrawEngineVulkan.h"33#include "GPU/ge_constants.h"34#include "GPU/GPUState.h"3536#define WRITE(p, ...) p.F(__VA_ARGS__)3738static const SamplerDef samplersMono[3] = {39{ 0, "tex" },40{ 1, "fbotex", SamplerFlags::ARRAY_ON_VULKAN },41{ 2, "pal" },42};4344static const SamplerDef samplersStereo[3] = {45{ 0, "tex", SamplerFlags::ARRAY_ON_VULKAN },46{ 1, "fbotex", SamplerFlags::ARRAY_ON_VULKAN },47{ 2, "pal" },48};4950bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLanguageDesc &compat, Draw::Bugs bugs, uint64_t *uniformMask, FragmentShaderFlags *fragmentShaderFlags, std::string *errorString) {51*uniformMask = 0;52*fragmentShaderFlags = (FragmentShaderFlags)0;53errorString->clear();5455bool useStereo = id.Bit(FS_BIT_STEREO);56bool highpFog = false;57bool highpTexcoord = false;58bool enableFragmentTestCache = gstate_c.Use(GPU_USE_FRAGMENT_TEST_CACHE);5960if (compat.gles) {61// PowerVR needs highp to do the fog in MHU correctly.62// Others don't, and some can't handle highp in the fragment shader.63highpFog = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) ? true : false;64highpTexcoord = highpFog;65}6667bool texture3D = id.Bit(FS_BIT_3D_TEXTURE);68bool arrayTexture = id.Bit(FS_BIT_SAMPLE_ARRAY_TEXTURE);6970ReplaceAlphaType stencilToAlpha = static_cast<ReplaceAlphaType>(id.Bits(FS_BIT_STENCIL_TO_ALPHA, 2));7172std::vector<const char*> extensions;73if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {74if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE && gl_extensions.EXT_blend_func_extended) {75extensions.push_back("#extension GL_EXT_blend_func_extended : require");76}77if (gl_extensions.EXT_gpu_shader4) {78extensions.push_back("#extension GL_EXT_gpu_shader4 : enable");79}80if (compat.framebufferFetchExtension) {81extensions.push_back(compat.framebufferFetchExtension);82}83if (gl_extensions.OES_texture_3D && texture3D) {84extensions.push_back("#extension GL_OES_texture_3D: enable");85}86}8788ShaderWriterFlags flags = ShaderWriterFlags::NONE;89if (useStereo) {90flags |= ShaderWriterFlags::FS_AUTO_STEREO;91}9293ShaderWriter p(buffer, compat, ShaderStage::Fragment, extensions, flags);94p.F("// %s\n", FragmentShaderDesc(id).c_str());9596p.ApplySamplerMetadata(arrayTexture ? samplersStereo : samplersMono);9798bool lmode = id.Bit(FS_BIT_LMODE);99bool doTexture = id.Bit(FS_BIT_DO_TEXTURE);100bool enableFog = id.Bit(FS_BIT_ENABLE_FOG);101bool enableAlphaTest = id.Bit(FS_BIT_ALPHA_TEST);102103bool alphaTestAgainstZero = id.Bit(FS_BIT_ALPHA_AGAINST_ZERO);104bool testForceToZero = id.Bit(FS_BIT_TEST_DISCARD_TO_ZERO);105bool enableColorTest = id.Bit(FS_BIT_COLOR_TEST);106bool colorTestAgainstZero = id.Bit(FS_BIT_COLOR_AGAINST_ZERO);107bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ);108109bool ubershader = id.Bit(FS_BIT_UBERSHADER);110// ubershader-controlled bits. If ubershader is on, these will not be used below (and will be false).111bool useTexAlpha = id.Bit(FS_BIT_TEXALPHA);112bool enableColorDouble = id.Bit(FS_BIT_DOUBLE_COLOR);113114if (texture3D && arrayTexture) {115*errorString = "Invalid combination of 3D texture and array texture, shouldn't happen";116return false;117}118if (compat.shaderLanguage != ShaderLanguage::GLSL_VULKAN && arrayTexture) {119*errorString = "We only do array textures for framebuffers in Vulkan.";120return false;121}122123bool flatBug = bugs.Has(Draw::Bugs::BROKEN_FLAT_IN_SHADER) && g_Config.bVendorBugChecksEnabled;124125bool doFlatShading = id.Bit(FS_BIT_FLATSHADE) && !flatBug;126if (doFlatShading) {127*fragmentShaderFlags |= FragmentShaderFlags::USES_FLAT_SHADING;128}129130ShaderDepalMode shaderDepalMode = (ShaderDepalMode)id.Bits(FS_BIT_SHADER_DEPAL_MODE, 2);131if (texture3D) {132shaderDepalMode = ShaderDepalMode::OFF;133}134if (!compat.bitwiseOps && shaderDepalMode != ShaderDepalMode::OFF) {135*errorString = "depal requires bitwise ops";136return false;137}138bool bgraTexture = id.Bit(FS_BIT_BGRA_TEXTURE);139bool colorWriteMask = id.Bit(FS_BIT_COLOR_WRITEMASK) && compat.bitwiseOps;140141GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);142GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);143bool needShaderTexClamp = id.Bit(FS_BIT_SHADER_TEX_CLAMP);144145GETexFunc texFunc = (GETexFunc)id.Bits(FS_BIT_TEXFUNC, 3);146147ReplaceBlendType replaceBlend = static_cast<ReplaceBlendType>(id.Bits(FS_BIT_REPLACE_BLEND, 3));148149bool blueToAlpha = false;150if (replaceBlend == ReplaceBlendType::REPLACE_BLEND_BLUE_TO_ALPHA) {151blueToAlpha = true;152}153154bool isModeClear = id.Bit(FS_BIT_CLEARMODE);155156const char *shading = "";157if (compat.glslES30 || compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {158shading = doFlatShading ? "flat" : "";159}160161bool forceDepthWritesOff = id.Bit(FS_BIT_DEPTH_TEST_NEVER);162163bool useDiscardStencilBugWorkaround = id.Bit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL) && !forceDepthWritesOff;164165GEBlendSrcFactor replaceBlendFuncA = (GEBlendSrcFactor)id.Bits(FS_BIT_BLENDFUNC_A, 4);166GEBlendDstFactor replaceBlendFuncB = (GEBlendDstFactor)id.Bits(FS_BIT_BLENDFUNC_B, 4);167GEBlendMode replaceBlendEq = (GEBlendMode)id.Bits(FS_BIT_BLENDEQ, 3);168StencilValueType replaceAlphaWithStencilType = (StencilValueType)id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4);169170// Distinct from the logic op simulation support.171GELogicOp replaceLogicOpType = isModeClear ? GE_LOGIC_COPY : (GELogicOp)id.Bits(FS_BIT_REPLACE_LOGIC_OP, 4);172bool replaceLogicOp = replaceLogicOpType != GE_LOGIC_COPY && compat.bitwiseOps;173174bool needFramebufferRead = replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER || colorWriteMask || replaceLogicOp;175176bool fetchFramebuffer = needFramebufferRead && id.Bit(FS_BIT_USE_FRAMEBUFFER_FETCH);177bool readFramebufferTex = needFramebufferRead && !id.Bit(FS_BIT_USE_FRAMEBUFFER_FETCH);178179if (fetchFramebuffer && (compat.shaderLanguage != GLSL_3xx || !compat.lastFragData)) {180*errorString = "framebuffer fetch requires GLSL 3xx";181return false;182}183184bool needFragCoord = readFramebufferTex || gstate_c.Use(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);185bool writeDepth = gstate_c.Use(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT) && !forceDepthWritesOff;186187// TODO: We could have a separate mechanism to support more ops using the shader blending mechanism,188// on hardware that can do proper bit math in fragment shaders.189SimulateLogicOpType simulateLogicOpType = (SimulateLogicOpType)id.Bits(FS_BIT_SIMULATE_LOGIC_OP_TYPE, 2);190191if (shaderDepalMode != ShaderDepalMode::OFF && !doTexture) {192*errorString = "depal requires a texture";193return false;194}195196// Currently only used by Vulkan.197std::vector<SamplerDef> samplers;198199if (compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {200if (useDiscardStencilBugWorkaround && !writeDepth) {201WRITE(p, "layout (depth_unchanged) out float gl_FragDepth;\n");202}203204WRITE(p, "layout (std140, set = 0, binding = %d) uniform baseUBO {\n%s};\n", DRAW_BINDING_DYNUBO_BASE, ub_baseStr);205if (doTexture) {206WRITE(p, "layout (set = 0, binding = %d) uniform %s%s tex;\n", DRAW_BINDING_TEXTURE, texture3D ? "sampler3D" : "sampler2D", arrayTexture ? "Array" : "");207}208209if (readFramebufferTex) {210// The framebuffer texture is always bound as an array.211p.F("layout (set = 0, binding = %d) uniform sampler2DArray fbotex;\n", DRAW_BINDING_2ND_TEXTURE);212}213214if (shaderDepalMode != ShaderDepalMode::OFF) {215WRITE(p, "layout (set = 0, binding = %d) uniform sampler2D pal;\n", DRAW_BINDING_DEPAL_TEXTURE);216}217218// Note: the precision qualifiers must match the vertex shader!219WRITE(p, "layout (location = 1) %s in lowp vec4 v_color0;\n", shading);220if (lmode) {221WRITE(p, "layout (location = 2) %s in lowp vec3 v_color1;\n", shading);222}223WRITE(p, "layout (location = 3) in highp float v_fogdepth;\n");224if (doTexture) {225WRITE(p, "layout (location = 0) in highp vec3 v_texcoord;\n");226}227228if (enableAlphaTest && !alphaTestAgainstZero) {229WRITE(p, "int roundAndScaleTo255i(in highp float x) { return int(floor(x * 255.0 + 0.5)); }\n");230}231if (enableColorTest && !colorTestAgainstZero) {232WRITE(p, "uint roundAndScaleTo8x4(in highp vec3 x) { uvec3 u = uvec3(floor(x * 255.0 + 0.5)); return u.r | (u.g << 8) | (u.b << 16); }\n");233WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");234}235236WRITE(p, "layout (location = 0, index = 0) out vec4 fragColor0;\n");237if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {238WRITE(p, "layout (location = 0, index = 1) out vec4 fragColor1;\n");239}240} else if (compat.shaderLanguage == HLSL_D3D11 || compat.shaderLanguage == HLSL_D3D9) {241if (compat.shaderLanguage == HLSL_D3D9) {242if (doTexture)243WRITE(p, "sampler tex : register(s0);\n");244245if (readFramebufferTex) {246WRITE(p, "vec2 u_fbotexSize : register(c%i);\n", CONST_PS_FBOTEXSIZE);247WRITE(p, "sampler fbotex : register(s1);\n");248}249250if (replaceBlend > REPLACE_BLEND_STANDARD) {251if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {252WRITE(p, "float3 u_blendFixA : register(c%i);\n", CONST_PS_BLENDFIXA);253}254if (replaceBlendFuncB >= GE_DSTBLEND_FIXB) {255WRITE(p, "float3 u_blendFixB : register(c%i);\n", CONST_PS_BLENDFIXB);256}257}258if (needShaderTexClamp && doTexture) {259WRITE(p, "vec4 u_texclamp : register(c%i);\n", CONST_PS_TEXCLAMP);260WRITE(p, "vec2 u_texclampoff : register(c%i);\n", CONST_PS_TEXCLAMPOFF);261}262263if (enableAlphaTest || enableColorTest) {264WRITE(p, "vec4 u_alphacolorref : register(c%i);\n", CONST_PS_ALPHACOLORREF);265WRITE(p, "vec4 u_alphacolormask : register(c%i);\n", CONST_PS_ALPHACOLORMASK);266}267if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {268WRITE(p, "float u_stencilReplaceValue : register(c%i);\n", CONST_PS_STENCILREPLACE);269}270if (doTexture) {271if (texFunc == GE_TEXFUNC_BLEND) {272WRITE(p, "float3 u_texenv : register(c%i);\n", CONST_PS_TEXENV);273}274if (ubershader) {275WRITE(p, "float2 u_texNoAlphaMul : register(c%i);\n", CONST_PS_TEX_NO_ALPHA_MUL);276}277}278if (enableFog) {279WRITE(p, "float3 u_fogcolor : register(c%i);\n", CONST_PS_FOGCOLOR);280}281if (texture3D) {282WRITE(p, "float u_mipBias : register(c%i);\n", CONST_PS_MIPBIAS);283}284} else {285WRITE(p, "SamplerState texSamp : register(s0);\n");286if (texture3D) {287WRITE(p, "Texture3D<vec4> tex : register(t0);\n");288} else {289WRITE(p, "Texture2D<vec4> tex : register(t0);\n");290}291if (readFramebufferTex) {292// No sampler required, we Load293WRITE(p, "Texture2D<vec4> fbotex : register(t1);\n");294}295296if (shaderDepalMode != ShaderDepalMode::OFF) {297WRITE(p, "SamplerState palSamp : register(s3);\n");298WRITE(p, "Texture2D<vec4> pal : register(t3);\n");299WRITE(p, "float2 textureSize(Texture2D<float4> tex, int mip) { float2 size; tex.GetDimensions(size.x, size.y); return size; }\n");300}301302WRITE(p, "cbuffer base : register(b0) {\n%s};\n", ub_baseStr);303}304305if (enableAlphaTest) {306if (compat.shaderLanguage == HLSL_D3D11) {307WRITE(p, "int roundAndScaleTo255i(float x) { return int(floor(x * 255.0f + 0.5f)); }\n");308} else {309// D3D11 level 9 gets to take this path.310WRITE(p, "float roundAndScaleTo255f(float x) { return floor(x * 255.0f + 0.5f); }\n");311}312}313if (enableColorTest) {314if (compat.shaderLanguage == HLSL_D3D11) {315WRITE(p, "uint roundAndScaleTo8x4(float3 x) { uvec3 u = (floor(x * 255.0f + 0.5f)); return u.r | (u.g << 8) | (u.b << 16); }\n");316WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");317} else {318WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");319}320}321322WRITE(p, "struct PS_IN {\n");323if (doTexture || compat.shaderLanguage == HLSL_D3D11) {324// In D3D11, if we always have a texcoord in the VS, we always need it in the PS too for the structs to match.325WRITE(p, " vec3 v_texcoord: TEXCOORD0;\n");326}327const char *colorInterpolation = doFlatShading && compat.shaderLanguage == HLSL_D3D11 ? "nointerpolation " : "";328WRITE(p, " %svec4 v_color0: COLOR0;\n", colorInterpolation);329if (lmode) {330WRITE(p, " vec3 v_color1: COLOR1;\n");331}332WRITE(p, " float v_fogdepth: TEXCOORD1;\n");333if (needFragCoord) {334if (compat.shaderLanguage == HLSL_D3D11) {335WRITE(p, " vec4 pixelPos : SV_POSITION;\n");336} else if (compat.shaderLanguage == HLSL_D3D9) {337WRITE(p, " vec4 pixelPos : VPOS;\n"); // VPOS is only supported for Shader Model 3.0, but we can probably forget about D3D9 SM2.0 at this point...338}339}340WRITE(p, "};\n");341342if (compat.shaderLanguage == HLSL_D3D11) {343WRITE(p, "struct PS_OUT {\n");344if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {345WRITE(p, " vec4 target : SV_Target0;\n");346WRITE(p, " vec4 target1 : SV_Target1;\n");347} else {348WRITE(p, " vec4 target : SV_Target;\n");349}350if (writeDepth) {351WRITE(p, " float depth : SV_Depth;\n");352}353WRITE(p, "};\n");354} else if (compat.shaderLanguage == HLSL_D3D9) {355WRITE(p, "struct PS_OUT {\n");356WRITE(p, " vec4 target : COLOR;\n");357if (writeDepth) {358WRITE(p, " float depth : DEPTH;\n");359}360WRITE(p, "};\n");361}362} else if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {363if ((shaderDepalMode != ShaderDepalMode::OFF || colorWriteMask) && gl_extensions.IsGLES) {364WRITE(p, "precision highp int;\n");365}366367if (doTexture) {368if (texture3D) {369// For whatever reason, a precision specifier is required here.370WRITE(p, "uniform lowp sampler3D tex;\n");371} else {372WRITE(p, "uniform sampler2D tex;\n");373}374*uniformMask |= DIRTY_TEX_ALPHA_MUL;375if (ubershader) {376WRITE(p, "uniform vec2 u_texNoAlphaMul;\n");377}378}379380if (readFramebufferTex) {381if (!compat.texelFetch) {382WRITE(p, "uniform vec2 u_fbotexSize;\n");383}384WRITE(p, "uniform sampler2D fbotex;\n");385}386387if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {388*uniformMask |= DIRTY_SHADERBLEND;389if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {390WRITE(p, "uniform vec3 u_blendFixA;\n");391}392if (replaceBlendFuncB >= GE_DSTBLEND_FIXB) {393WRITE(p, "uniform vec3 u_blendFixB;\n");394}395}396397if (needShaderTexClamp && doTexture) {398*uniformMask |= DIRTY_TEXCLAMP;399WRITE(p, "uniform vec4 u_texclamp;\n");400WRITE(p, "uniform vec2 u_texclampoff;\n");401}402403// TODO: Can get rid of some of this in the != 0 cases.404if (enableAlphaTest || enableColorTest) {405if (enableFragmentTestCache) {406WRITE(p, "uniform sampler2D testtex;\n");407} else {408*uniformMask |= DIRTY_ALPHACOLORREF;409if (compat.bitwiseOps) {410WRITE(p, "uniform uint u_alphacolorref;\n");411} else {412WRITE(p, "uniform vec4 u_alphacolorref;\n");413}414if (compat.bitwiseOps && ((enableColorTest && !colorTestAgainstZero) || (enableAlphaTest && !alphaTestAgainstZero))) {415*uniformMask |= DIRTY_ALPHACOLORMASK;416WRITE(p, "uniform uint u_alphacolormask;\n");417}418}419}420421if (shaderDepalMode != ShaderDepalMode::OFF) {422WRITE(p, "uniform sampler2D pal;\n");423WRITE(p, "uniform uint u_depal_mask_shift_off_fmt;\n");424*uniformMask |= DIRTY_DEPAL;425}426427if (colorWriteMask) {428WRITE(p, "uniform uint u_colorWriteMask;\n");429*uniformMask |= DIRTY_COLORWRITEMASK;430}431432if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {433*uniformMask |= DIRTY_STENCILREPLACEVALUE;434WRITE(p, "uniform float u_stencilReplaceValue;\n");435}436if (doTexture && texFunc == GE_TEXFUNC_BLEND) {437*uniformMask |= DIRTY_TEXENV;438WRITE(p, "uniform vec3 u_texenv;\n");439}440441if (texture3D) {442*uniformMask |= DIRTY_MIPBIAS;443WRITE(p, "uniform float u_mipBias;\n");444}445446WRITE(p, "%s %s lowp vec4 v_color0;\n", shading, compat.varying_fs);447if (lmode) {448WRITE(p, "%s %s lowp vec3 v_color1;\n", shading, compat.varying_fs);449}450if (enableFog) {451*uniformMask |= DIRTY_FOGCOLOR;452WRITE(p, "uniform vec3 u_fogcolor;\n");453}454WRITE(p, "%s %s float v_fogdepth;\n", compat.varying_fs, highpFog ? "highp" : "mediump");455if (doTexture) {456WRITE(p, "%s %s vec3 v_texcoord;\n", compat.varying_fs, highpTexcoord ? "highp" : "mediump");457}458459if (!enableFragmentTestCache) {460if (enableAlphaTest && !alphaTestAgainstZero) {461if (compat.bitwiseOps) {462WRITE(p, "int roundAndScaleTo255i(in float x) { return int(floor(x * 255.0 + 0.5)); }\n");463} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {464WRITE(p, "float roundTo255thf(in mediump float x) { mediump float y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");465} else {466WRITE(p, "float roundAndScaleTo255f(in float x) { return floor(x * 255.0 + 0.5); }\n");467}468}469if (enableColorTest && !colorTestAgainstZero) {470if (compat.bitwiseOps) {471WRITE(p, "uint roundAndScaleTo8x4(in vec3 x) { uvec3 u = uvec3(floor(x * 255.92)); return u.r | (u.g << 0x8u) | (u.b << 0x10u); }\n");472WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 0x8u) | (u.b << 0x10u); }\n");473} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {474WRITE(p, "vec3 roundTo255thv(in vec3 x) { vec3 y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");475} else {476WRITE(p, "vec3 roundAndScaleTo255v(in vec3 x) { return floor(x * 255.0 + 0.5); }\n");477}478}479}480481if (!strcmp(compat.fragColor0, "fragColor0")) {482const char *qualifierColor0 = "out";483if (fetchFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) {484qualifierColor0 = "inout";485}486// Output the output color definitions.487if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {488WRITE(p, "%s vec4 fragColor0;\n", qualifierColor0);489WRITE(p, "out vec4 fragColor1;\n");490} else {491WRITE(p, "%s vec4 fragColor0;\n", qualifierColor0);492}493}494}495496bool hasPackUnorm4x8 = false;497if (compat.shaderLanguage == GLSL_VULKAN) {498hasPackUnorm4x8 = true;499} else if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {500if (compat.gles) {501hasPackUnorm4x8 = compat.glslVersionNumber >= 310;502} else {503hasPackUnorm4x8 = compat.glslVersionNumber >= 400;504}505}506507const char *packSuffix = "";508if (!hasPackUnorm4x8) {509packSuffix = "R";510}511512// Provide implementations of packUnorm4x8 and unpackUnorm4x8 if not available.513if ((colorWriteMask || replaceLogicOp) && !hasPackUnorm4x8) {514WRITE(p, "uint packUnorm4x8%s(%svec4 v) {\n", packSuffix, compat.shaderLanguage == GLSL_VULKAN ? "highp " : "");515WRITE(p, " highp vec4 f = clamp(v, 0.0, 1.0);\n");516WRITE(p, " uvec4 u = uvec4(255.0 * f);\n");517WRITE(p, " return u.x | (u.y << 0x8u) | (u.z << 0x10u) | (u.w << 0x18u);\n");518WRITE(p, "}\n");519520WRITE(p, "vec4 unpackUnorm4x8%s(highp uint x) {\n", packSuffix);521WRITE(p, " highp uvec4 u = uvec4(x & 0xFFu, (x >> 0x8u) & 0xFFu, (x >> 0x10u) & 0xFFu, (x >> 0x18u) & 0xFFu);\n");522WRITE(p, " highp vec4 f = vec4(u);\n");523WRITE(p, " return f * (1.0 / 255.0);\n");524WRITE(p, "}\n");525}526527if (compat.bitwiseOps && enableColorTest) {528p.C("uvec3 unpackUVec3(highp uint x) {\n");529p.C(" return uvec3(x & 0xFFu, (x >> 0x8u) & 0xFFu, (x >> 0x10u) & 0xFFu);\n");530p.C("}\n");531}532533// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.534if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {535WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");536}537538if (compat.shaderLanguage == HLSL_D3D11) {539WRITE(p, "PS_OUT main( PS_IN In ) {\n");540WRITE(p, " PS_OUT outfragment;\n");541if (needFragCoord) {542WRITE(p, " vec4 gl_FragCoord = In.pixelPos;\n");543}544if (writeDepth) {545WRITE(p, " float gl_FragDepth;\n");546}547} else if (compat.shaderLanguage == HLSL_D3D9) {548WRITE(p, "PS_OUT main( PS_IN In ) {\n");549WRITE(p, " PS_OUT outfragment;\n");550if (needFragCoord) {551WRITE(p, " vec4 gl_FragCoord = In.pixelPos;\n");552}553} else {554WRITE(p, "void main() {\n");555}556557if (compat.shaderLanguage == HLSL_D3D11 || compat.shaderLanguage == HLSL_D3D9) {558WRITE(p, " vec4 v_color0 = In.v_color0;\n");559if (lmode) {560WRITE(p, " vec3 v_color1 = In.v_color1;\n");561}562if (enableFog) {563WRITE(p, " float v_fogdepth = In.v_fogdepth;\n");564}565if (doTexture) {566WRITE(p, " vec3 v_texcoord = In.v_texcoord;\n");567}568}569570// Two things read from the old framebuffer - shader replacement blending and bit-level masking.571if (readFramebufferTex) {572if (compat.shaderLanguage == HLSL_D3D11) {573WRITE(p, " vec4 destColor = fbotex.Load(int3((int)gl_FragCoord.x, (int)gl_FragCoord.y, 0));\n");574} else if (compat.shaderLanguage == HLSL_D3D9) {575WRITE(p, " vec4 destColor = tex2D(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);576} else if (compat.shaderLanguage == GLSL_VULKAN) {577WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec3(gl_FragCoord.x, gl_FragCoord.y, %s), 0);\n", compat.texelFetch, useStereo ? "float(gl_ViewIndex)" : "0");578} else if (!compat.texelFetch) {579WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);580} else {581WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);582}583} else if (fetchFramebuffer) {584// If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit.585// We can just read the prev value more directly.586if (compat.shaderLanguage == GLSL_3xx) {587WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);588} else if (compat.shaderLanguage == GLSL_VULKAN) {589WRITE(p, " lowp vec4 destColor = subpassLoad(inputColor);\n");590} else {591_assert_msg_(false, "Need fetch destColor, but not a compatible language");592}593}594595if (isModeClear) {596// Clear mode does not allow any fancy shading.597WRITE(p, " vec4 v = v_color0;\n");598} else {599const char *secondary = "";600// Secondary color for specular on top of texture601if (lmode) {602WRITE(p, " vec4 s = vec4(v_color1, 0.0);\n");603secondary = " + s";604}605606if (doTexture) {607char texcoord[64] = "v_texcoord";608// TODO: Not sure the right way to do this for projection.609// This path destroys resolution on older PowerVR no matter what I do if projection is needed,610// so we disable it on SGX 540 and lesser, and live with the consequences.611bool terriblePrecision = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_TERRIBLE) != 0;612bool clampDisabled = doTextureProjection && terriblePrecision;613// Also with terrible precision we can't do wrapping without destroying the image. See #9189614if (terriblePrecision && (!id.Bit(FS_BIT_CLAMP_S) || !id.Bit(FS_BIT_CLAMP_T))) {615clampDisabled = true;616}617if (needShaderTexClamp && !clampDisabled) {618// We may be clamping inside a larger surface (tex = 64x64, buffer=480x272).619// We may also be wrapping in such a surface, or either one in a too-small surface.620// Obviously, clamping to a smaller surface won't work. But better to clamp to something.621std::string ucoord = "v_texcoord.x";622std::string vcoord = "v_texcoord.y";623if (doTextureProjection) {624ucoord = "(v_texcoord.x / v_texcoord.z)";625vcoord = "(v_texcoord.y / v_texcoord.z)";626}627628std::string modulo = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) ? "mymod" : "mod";629630if (id.Bit(FS_BIT_CLAMP_S)) {631ucoord = "clamp(" + ucoord + ", u_texclamp.z, u_texclamp.x - u_texclamp.z)";632} else {633ucoord = modulo + "(" + ucoord + ", u_texclamp.x)";634}635if (id.Bit(FS_BIT_CLAMP_T)) {636vcoord = "clamp(" + vcoord + ", u_texclamp.w, u_texclamp.y - u_texclamp.w)";637} else {638vcoord = modulo + "(" + vcoord + ", u_texclamp.y)";639}640ucoord = "(" + ucoord + " + u_texclampoff.x)";641vcoord = "(" + vcoord + " + u_texclampoff.y)";642643WRITE(p, " vec2 fixedcoord = vec2(%s, %s);\n", ucoord.c_str(), vcoord.c_str());644truncate_cpy(texcoord, "fixedcoord");645// We already projected it.646doTextureProjection = false;647}648649switch (shaderDepalMode) {650case ShaderDepalMode::OFF:651if (compat.shaderLanguage == HLSL_D3D11) {652if (texture3D) {653if (doTextureProjection) {654WRITE(p, " vec4 t = tex.Sample(texSamp, vec3(v_texcoord.xy / v_texcoord.z, u_mipBias))%s;\n", bgraTexture ? ".bgra" : "");655} else {656WRITE(p, " vec4 t = tex.Sample(texSamp, vec3(%s.xy, u_mipBias))%s;\n", texcoord, bgraTexture ? ".bgra" : "");657}658} else {659if (doTextureProjection) {660WRITE(p, " vec4 t = tex.Sample(texSamp, v_texcoord.xy / v_texcoord.z)%s;\n", bgraTexture ? ".bgra" : "");661} else {662WRITE(p, " vec4 t = tex.Sample(texSamp, %s.xy)%s;\n", texcoord, bgraTexture ? ".bgra" : "");663}664}665} else if (compat.shaderLanguage == HLSL_D3D9) {666if (texture3D) {667if (doTextureProjection) {668WRITE(p, " vec4 t = tex3Dproj(tex, vec4(v_texcoord.x, v_texcoord.y, u_mipBias, v_texcoord.z))%s;\n", bgraTexture ? ".bgra" : "");669} else {670WRITE(p, " vec4 t = tex3D(tex, vec3(%s.x, %s.y, u_mipBias))%s;\n", texcoord, texcoord, bgraTexture ? ".bgra" : "");671}672} else {673if (doTextureProjection) {674WRITE(p, " vec4 t = tex2Dproj(tex, vec4(v_texcoord.x, v_texcoord.y, 0.0, v_texcoord.z))%s;\n", bgraTexture ? ".bgra" : "");675} else {676WRITE(p, " vec4 t = tex2D(tex, %s.xy)%s;\n", texcoord, bgraTexture ? ".bgra" : "");677}678}679} else {680// Note that here we're relying on the filter to be linear. We would have to otherwise to do two samples and manually filter in Z.681// Let's add that if we run into a case...682if (texture3D) {683if (doTextureProjection) {684WRITE(p, " vec4 t = %sProj(tex, vec4(%s.xy, u_mipBias, %s.z));\n", compat.texture3D, texcoord, texcoord);685} else {686WRITE(p, " vec4 t = %s(tex, vec3(%s.xy, u_mipBias));\n", compat.texture3D, texcoord);687}688} else if (arrayTexture) {689_dbg_assert_(compat.shaderLanguage == GLSL_VULKAN);690// Used for stereo rendering.691const char *arrayIndex = useStereo ? "float(gl_ViewIndex)" : "0.0";692if (doTextureProjection) {693// There's no textureProj for array textures, so we need to emulate it.694// Should be fine on any Vulkan-compatible hardware.695WRITE(p, " vec2 uv_proj = (%s.xy) / (%s.z);\n", texcoord, texcoord);696WRITE(p, " vec4 t = %s(tex, vec3(uv_proj, %s));\n", compat.texture, texcoord, arrayIndex);697} else {698WRITE(p, " vec4 t = %s(tex, vec3(%s.xy, %s));\n", compat.texture, texcoord, arrayIndex);699}700} else {701if (doTextureProjection) {702WRITE(p, " vec4 t = %sProj(tex, %s);\n", compat.texture, texcoord);703} else {704WRITE(p, " vec4 t = %s(tex, %s.xy);\n", compat.texture, texcoord);705}706}707}708break;709case ShaderDepalMode::SMOOTHED:710// Specific mode for Test Drive. Fixes the banding.711if (doTextureProjection) {712// We don't use textureProj because we need better control and it's probably not much of a savings anyway.713// However it is good for precision on older hardware like PowerVR.714p.F(" vec2 uv = %s.xy/%s.z;\n vec2 uv_round;\n", texcoord, texcoord);715} else {716p.F(" vec2 uv = %s.xy;\n vec2 uv_round;\n", texcoord);717}718// Restrictions on this are checked before setting the smoothed flag.719// Only RGB565 and RGBA5551 are supported, and only the specific shifts hitting the720// channels directly.721// Also, since we know the CLUT is smooth, we do not need to do the bilinear filter manually, we can just722// lookup with the filtered value once.723p.F(" vec4 t = ").SampleTexture2D("tex", "uv").C(";\n");724p.C(" uint depalShift = (u_depal_mask_shift_off_fmt >> 0x8u) & 0xFFu;\n");725p.C(" uint depalOffset = ((u_depal_mask_shift_off_fmt >> 0x10u) & 0xFFu) << 0x4u;\n");726p.C(" uint depalFmt = (u_depal_mask_shift_off_fmt >> 0x18u) & 0x3u;\n");727p.C(" float index0 = t.r;\n");728p.C(" float factor = 31.0 / 256.0;\n");729p.C(" if (depalFmt == 0x0u) {\n"); // yes, different versions of Test Drive use different formats. Could do compile time by adding more compat flags but meh.730p.C(" if (depalShift == 0x5u) { index0 = t.g; factor = 63.0 / 256.0; }\n");731p.C(" else if (depalShift == 0xBu) { index0 = t.b; }\n");732p.C(" } else {\n");733p.C(" if (depalShift == 0x5u) { index0 = t.g; }\n");734p.C(" else if (depalShift == 0xAu) { index0 = t.b; }\n");735p.C(" }\n");736p.C(" float offset = float(depalOffset) / 256.0;\n");737p.F(" t = ").SampleTexture2D("pal", "vec2((index0 * factor + offset) * 0.5 + 0.5 / 512.0, 0.0)").C(";\n"); // 0.5 for 512-entry CLUT.738break;739case ShaderDepalMode::NORMAL:740if (doTextureProjection) {741// We don't use textureProj because we need better control and it's probably not much of a savings anyway.742// However it is good for precision on older hardware like PowerVR.743WRITE(p, " vec2 uv = %s.xy/%s.z;\n vec2 uv_round;\n", texcoord, texcoord);744} else {745WRITE(p, " vec2 uv = %s.xy;\n vec2 uv_round;\n", texcoord);746}747WRITE(p, " vec2 tsize = vec2(textureSize(tex, 0).xy);\n");748WRITE(p, " vec2 fraction;\n");749WRITE(p, " bool bilinear = (u_depal_mask_shift_off_fmt >> 0x2Fu) != 0x0u;\n");750WRITE(p, " if (bilinear) {\n");751WRITE(p, " uv_round = uv * tsize - vec2(0.5, 0.5);\n");752WRITE(p, " fraction = fract(uv_round);\n");753WRITE(p, " uv_round = (uv_round - fraction + vec2(0.5, 0.5)) / tsize;\n"); // We want to take our four point samples at pixel centers.754WRITE(p, " } else {\n");755WRITE(p, " uv_round = uv;\n");756WRITE(p, " }\n");757p.C(" highp vec4 t = ").SampleTexture2D("tex", "uv_round").C(";\n");758p.C(" highp vec4 t1 = ").SampleTexture2DOffset("tex", "uv_round", 1, 0).C(";\n");759p.C(" highp vec4 t2 = ").SampleTexture2DOffset("tex", "uv_round", 0, 1).C(";\n");760p.C(" highp vec4 t3 = ").SampleTexture2DOffset("tex", "uv_round", 1, 1).C(";\n");761WRITE(p, " uint depalMask = (u_depal_mask_shift_off_fmt & 0xFFu);\n");762WRITE(p, " uint depalShift = (u_depal_mask_shift_off_fmt >> 0x8u) & 0xFFu;\n");763WRITE(p, " uint depalOffset = ((u_depal_mask_shift_off_fmt >> 0x10u) & 0xFFu) << 0x4u;\n");764WRITE(p, " uint depalFmt = (u_depal_mask_shift_off_fmt >> 0x18u) & 0x3u;\n");765WRITE(p, " uvec4 col; uint index0; uint index1; uint index2; uint index3;\n");766WRITE(p, " switch (int(depalFmt)) {\n"); // We might want to include fmt in the shader ID if this is a performance issue.767WRITE(p, " case 0:\n"); // 565768WRITE(p, " col = uvec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n");769WRITE(p, " index0 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");770WRITE(p, " if (bilinear) {\n");771WRITE(p, " col = uvec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0);\n");772WRITE(p, " index1 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");773WRITE(p, " col = uvec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0);\n");774WRITE(p, " index2 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");775WRITE(p, " col = uvec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0);\n");776WRITE(p, " index3 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");777WRITE(p, " }\n");778WRITE(p, " break;\n");779WRITE(p, " case 1:\n"); // 5551780WRITE(p, " col = uvec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");781WRITE(p, " index0 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");782WRITE(p, " if (bilinear) {\n");783WRITE(p, " col = uvec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");784WRITE(p, " index1 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");785WRITE(p, " col = uvec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");786WRITE(p, " index2 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");787WRITE(p, " col = uvec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");788WRITE(p, " index3 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");789WRITE(p, " }\n");790WRITE(p, " break;\n");791WRITE(p, " case 2:\n"); // 4444792WRITE(p, " col = uvec4(t.rgba * 15.99);\n");793WRITE(p, " index0 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");794WRITE(p, " if (bilinear) {\n");795WRITE(p, " col = uvec4(t1.rgba * 15.99);\n");796WRITE(p, " index1 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");797WRITE(p, " col = uvec4(t2.rgba * 15.99);\n");798WRITE(p, " index2 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");799WRITE(p, " col = uvec4(t3.rgba * 15.99);\n");800WRITE(p, " index3 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");801WRITE(p, " }\n");802WRITE(p, " break;\n");803WRITE(p, " case 3:\n"); // 8888804WRITE(p, " col = uvec4(t.rgba * 255.99);\n");805WRITE(p, " index0 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");806WRITE(p, " if (bilinear) {\n");807WRITE(p, " col = uvec4(t1.rgba * 255.99);\n");808WRITE(p, " index1 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");809WRITE(p, " col = uvec4(t2.rgba * 255.99);\n");810WRITE(p, " index2 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");811WRITE(p, " col = uvec4(t3.rgba * 255.99);\n");812WRITE(p, " index3 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");813WRITE(p, " }\n");814WRITE(p, " break;\n");815WRITE(p, " };\n");816WRITE(p, " index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n");817p.C(" t = ").LoadTexture2D("pal", "ivec2(index0, 0)", 0).C(";\n");818WRITE(p, " if (bilinear && !(index0 == index1 && index1 == index2 && index2 == index3)) {\n");819WRITE(p, " index1 = ((index1 >> depalShift) & depalMask) | depalOffset;\n");820WRITE(p, " index2 = ((index2 >> depalShift) & depalMask) | depalOffset;\n");821WRITE(p, " index3 = ((index3 >> depalShift) & depalMask) | depalOffset;\n");822p.C(" t1 = ").LoadTexture2D("pal", "ivec2(index1, 0)", 0).C(";\n");823p.C(" t2 = ").LoadTexture2D("pal", "ivec2(index2, 0)", 0).C(";\n");824p.C(" t3 = ").LoadTexture2D("pal", "ivec2(index3, 0)", 0).C(";\n");825WRITE(p, " t = mix(t, t1, fraction.x);\n");826WRITE(p, " t2 = mix(t2, t3, fraction.x);\n");827WRITE(p, " t = mix(t, t2, fraction.y);\n");828WRITE(p, " }\n");829break;830case ShaderDepalMode::CLUT8_8888:831if (doTextureProjection) {832// We don't use textureProj because we need better control and it's probably not much of a savings anyway.833// However it is good for precision on older hardware like PowerVR.834p.F(" vec2 uv = %s.xy/%s.z;\n vec2 uv_round;\n", texcoord, texcoord);835} else {836p.F(" vec2 uv = %s.xy;\n vec2 uv_round;\n", texcoord);837}838p.C(" vec2 tsize = vec2(textureSize(tex, 0).xy);\n");839p.C(" uv_round = floor(uv * tsize);\n");840p.C(" int component = int(uv_round.x) & 3;\n");841p.C(" uv_round.x *= 0.25;\n");842p.C(" uv_round /= tsize;\n");843p.C(" vec4 t = ").SampleTexture2D("tex", "uv_round").C(";\n");844p.C(" int index;\n");845p.C(" switch (component) {\n");846p.C(" case 0: index = int(t.x * 254.99); break;\n"); // TODO: Not sure why 254.99 instead of 255.99, but it's currently needed.847p.C(" case 1: index = int(t.y * 254.99); break;\n");848p.C(" case 2: index = int(t.z * 254.99); break;\n");849p.C(" case 3: index = int(t.w * 254.99); break;\n");850p.C(" }\n");851p.C(" t = ").LoadTexture2D("pal", "ivec2(index, 0)", 0).C(";\n");852break;853}854855WRITE(p, " vec4 p = v_color0;\n");856857if (texFunc != GE_TEXFUNC_REPLACE) {858if (ubershader) {859WRITE(p, " t.a = max(t.a, u_texNoAlphaMul.x);\n");860} else if (!useTexAlpha) {861WRITE(p, " t.a = 1.0;\n");862}863}864865switch (texFunc) {866case GE_TEXFUNC_MODULATE:867WRITE(p, " vec4 v = p * t%s;\n", secondary);868break;869case GE_TEXFUNC_DECAL:870WRITE(p, " vec4 v = vec4(mix(p.rgb, t.rgb, t.a), p.a)%s;\n", secondary);871break;872case GE_TEXFUNC_BLEND:873WRITE(p, " vec4 v = vec4(mix(p.rgb, u_texenv.rgb, t.rgb), p.a * t.a)%s;\n", secondary);874break;875case GE_TEXFUNC_REPLACE:876WRITE(p, " vec4 r = t;\n");877if (ubershader) {878WRITE(p, " r.a = mix(r.a, p.a, u_texNoAlphaMul.x);\n");879} else if (!useTexAlpha) {880WRITE(p, " r.a = p.a;\n");881}882WRITE(p, " vec4 v = r%s;\n", secondary);883break;884case GE_TEXFUNC_ADD:885case GE_TEXFUNC_UNKNOWN1:886case GE_TEXFUNC_UNKNOWN2:887case GE_TEXFUNC_UNKNOWN3:888WRITE(p, " vec4 v = vec4(p.rgb + t.rgb, p.a * t.a)%s;\n", secondary);889break;890default:891// Doesn't happen892WRITE(p, " vec4 v = p%s;\n", secondary); break;893break;894}895896// This happens before fog is applied.897*uniformMask |= DIRTY_TEX_ALPHA_MUL;898899// We only need a clamp if the color will be further processed. Otherwise the hardware color conversion will clamp for us.900if (ubershader) {901if (enableFog || enableColorTest || replaceBlend != REPLACE_BLEND_NO || simulateLogicOpType != LOGICOPTYPE_NORMAL || colorWriteMask || blueToAlpha) {902WRITE(p, " v.rgb = clamp(v.rgb * u_texNoAlphaMul.y, 0.0, 1.0);\n");903} else {904WRITE(p, " v.rgb *= u_texNoAlphaMul.y;\n");905}906} else if (enableColorDouble) {907p.C(" v.rgb = clamp(v.rgb * 2.0, 0.0, 1.0);\n");908}909} else {910// No texture mapping911WRITE(p, " vec4 v = v_color0%s;\n", secondary);912}913914if (enableFog) {915WRITE(p, " float fogCoef = clamp(v_fogdepth, 0.0, 1.0);\n");916WRITE(p, " v = mix(vec4(u_fogcolor, v.a), v, fogCoef);\n");917}918919// Texture access is at half texels [0.5/256, 255.5/256], but colors are normalized [0, 255].920// So we have to scale to account for the difference.921char alphaTestXCoord[64] = "0";922if (enableFragmentTestCache) {923if (enableColorTest && !colorTestAgainstZero) {924WRITE(p, " vec4 vScale256 = v * %f + %f;\n", 255.0 / 256.0, 0.5 / 256.0);925truncate_cpy(alphaTestXCoord, "vScale256.a");926} else if (enableAlphaTest && !alphaTestAgainstZero) {927snprintf(alphaTestXCoord, sizeof(alphaTestXCoord), "v.a * %f + %f", 255.0 / 256.0, 0.5 / 256.0);928}929}930931const char *discardStatement = testForceToZero ? "v.a = 0.0;" : "DISCARD;";932if (enableAlphaTest) {933*fragmentShaderFlags |= FragmentShaderFlags::USES_DISCARD;934935if (alphaTestAgainstZero) {936// When testing against 0 (extremely common), we can avoid some math.937// 0.002 is approximately half of 1.0 / 255.0.938if (alphaTestFunc == GE_COMP_NOTEQUAL || alphaTestFunc == GE_COMP_GREATER) {939WRITE(p, " if (v.a < 0.002) %s\n", discardStatement);940} else if (alphaTestFunc != GE_COMP_NEVER) {941// Anything else is a test for == 0. Happens sometimes, actually...942WRITE(p, " if (v.a > 0.002) %s\n", discardStatement);943} else {944// NEVER has been logged as used by games, although it makes little sense - statically failing.945// Maybe we could discard the drawcall, but it's pretty rare. Let's just statically discard here.946WRITE(p, " %s\n", discardStatement);947}948} else if (enableFragmentTestCache) {949WRITE(p, " float aResult = %s(testtex, vec2(%s, 0)).a;\n", compat.texture, alphaTestXCoord);950WRITE(p, " if (aResult < 0.5) %s\n", discardStatement);951} else {952const char *alphaTestFuncs[] = { "#", "#", " != ", " == ", " >= ", " > ", " <= ", " < " };953if (alphaTestFuncs[alphaTestFunc][0] != '#') {954if (compat.bitwiseOps) {955WRITE(p, " if ((roundAndScaleTo255i(v.a) & int(u_alphacolormask >> 0x18u)) %s int(u_alphacolorref >> 0x18u)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);956} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {957// Work around bad PVR driver problem where equality check + discard just doesn't work.958if (alphaTestFunc != GE_COMP_NOTEQUAL) {959WRITE(p, " if (roundTo255thf(v.a) %s u_alphacolorref.a) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);960}961} else {962WRITE(p, " if (roundAndScaleTo255f(v.a) %s u_alphacolorref.a) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);963}964} else {965// This means NEVER. See above.966WRITE(p, " %s\n", discardStatement);967}968}969}970971if (enableColorTest) {972*fragmentShaderFlags |= FragmentShaderFlags::USES_DISCARD;973974if (colorTestAgainstZero) {975// When testing against 0 (common), we can avoid some math.976// 0.002 is approximately half of 1.0 / 255.0.977if (colorTestFunc == GE_COMP_NOTEQUAL) {978if (compat.shaderLanguage == GLSL_VULKAN) {979// Old workaround for Adreno driver bug. We could make this the main path actually980// since the math is roughly equivalent given the non-negative inputs.981WRITE(p, " if (v.r + v.g + v.b < 0.002) %s\n", discardStatement);982} else {983WRITE(p, " if (v.r < 0.002 && v.g < 0.002 && v.b < 0.002) %s\n", discardStatement);984}985} else if (colorTestFunc != GE_COMP_NEVER) {986if (compat.shaderLanguage == GLSL_VULKAN) {987// See the GE_COMP_NOTEQUAL case.988WRITE(p, " if (v.r + v.g + v.b > 0.002) %s\n", discardStatement);989} else {990// Anything else is a test for == 0.991WRITE(p, " if (v.r > 0.002 || v.g > 0.002 || v.b > 0.002) %s\n", discardStatement);992}993} else {994// NEVER has been logged as used by games, although it makes little sense - statically failing.995// Maybe we could discard the drawcall, but it's pretty rare. Let's just statically discard here.996WRITE(p, " %s\n", discardStatement);997}998} else if (enableFragmentTestCache) {999WRITE(p, " float rResult = %s(testtex, vec2(vScale256.r, 0)).r;\n", compat.texture);1000WRITE(p, " float gResult = %s(testtex, vec2(vScale256.g, 0)).g;\n", compat.texture);1001WRITE(p, " float bResult = %s(testtex, vec2(vScale256.b, 0)).b;\n", compat.texture);1002if (colorTestFunc == GE_COMP_EQUAL) {1003// Equal means all parts must be equal (so discard if any is not.)1004WRITE(p, " if (rResult < 0.5 || gResult < 0.5 || bResult < 0.5) %s\n", discardStatement);1005} else {1006// Not equal means any part must be not equal.1007WRITE(p, " if (rResult < 0.5 && gResult < 0.5 && bResult < 0.5) %s\n", discardStatement);1008}1009} else {1010const char *colorTestFuncs[] = { "#", "#", " != ", " == " };1011const char *test = colorTestFuncs[colorTestFunc];1012if (test[0] != '#') {1013// TODO: Unify these paths better.1014if (compat.shaderLanguage == HLSL_D3D9) {1015// TODO: Use a texture to lookup bitwise ops instead?1016WRITE(p, " vec3 colortest = roundAndScaleTo255v(v.rgb);\n");1017WRITE(p, " if ((colortest.r %s u_alphacolorref.r) && (colortest.g %s u_alphacolorref.g) && (colortest.b %s u_alphacolorref.b)) %s\n", test, test, test, discardStatement);1018} else if (compat.bitwiseOps) {1019WRITE(p, " uint v_uint = roundAndScaleTo8x4(v.rgb);\n");1020WRITE(p, " uint v_masked = v_uint & u_alphacolormask;\n");1021WRITE(p, " uint colorTestRef = (u_alphacolorref & u_alphacolormask) & 0xFFFFFFu;\n");1022WRITE(p, " if (v_masked %s colorTestRef) %s\n", test, discardStatement);1023} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {1024WRITE(p, " if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);1025} else {1026WRITE(p, " if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);1027}1028} else {1029WRITE(p, " %s\n", discardStatement);1030}1031}1032}10331034if (replaceBlend == REPLACE_BLEND_2X_SRC) {1035WRITE(p, " v.rgb = v.rgb * 2.0;\n");1036}10371038// In some cases we need to replicate the first half of the blend equation here.1039// In case of blue-to-alpha, it's since we overwrite alpha with blue before the actual blend equation runs.1040if (replaceBlend == REPLACE_BLEND_PRE_SRC || replaceBlend == REPLACE_BLEND_PRE_SRC_2X_ALPHA || replaceBlend == REPLACE_BLEND_BLUE_TO_ALPHA) {1041const char *srcFactor = "ERROR";1042switch (replaceBlendFuncA) {1043case GE_SRCBLEND_DSTCOLOR: srcFactor = "ERROR"; break;1044case GE_SRCBLEND_INVDSTCOLOR: srcFactor = "ERROR"; break;1045case GE_SRCBLEND_SRCALPHA: srcFactor = "splat3(v.a)"; break;1046case GE_SRCBLEND_INVSRCALPHA: srcFactor = "splat3(1.0 - v.a)"; break;1047case GE_SRCBLEND_DSTALPHA: srcFactor = "ERROR"; break;1048case GE_SRCBLEND_INVDSTALPHA: srcFactor = "ERROR"; break;1049case GE_SRCBLEND_DOUBLESRCALPHA: srcFactor = "splat3(v.a * 2.0)"; break;1050case GE_SRCBLEND_DOUBLEINVSRCALPHA: srcFactor = "splat3(1.0 - v.a * 2.0)"; break;1051// PRE_SRC for REPLACE_BLEND_PRE_SRC_2X_ALPHA means "double the src."1052// It's close to the same, but clamping can still be an issue.1053case GE_SRCBLEND_DOUBLEDSTALPHA: srcFactor = "splat3(2.0)"; break;1054case GE_SRCBLEND_DOUBLEINVDSTALPHA: srcFactor = "ERROR"; break;1055case GE_SRCBLEND_FIXA: srcFactor = "u_blendFixA"; break;1056default: srcFactor = "u_blendFixA"; break;1057}10581059if (!strcmp(srcFactor, "ERROR")) {1060*errorString = "Bad replaceblend src factor";1061return false;1062}10631064WRITE(p, " v.rgb = v.rgb * %s;\n", srcFactor);1065}10661067if (replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER) {1068const char *srcFactor = nullptr;1069const char *dstFactor = nullptr;10701071switch (replaceBlendFuncA) {1072case GE_SRCBLEND_DSTCOLOR: srcFactor = "destColor.rgb"; break;1073case GE_SRCBLEND_INVDSTCOLOR: srcFactor = "(splat3(1.0) - destColor.rgb)"; break;1074case GE_SRCBLEND_SRCALPHA: srcFactor = "v.aaa"; break;1075case GE_SRCBLEND_INVSRCALPHA: srcFactor = "splat3(1.0 - v.a)"; break;1076case GE_SRCBLEND_DSTALPHA: srcFactor = "destColor.aaa"; break;1077case GE_SRCBLEND_INVDSTALPHA: srcFactor = "(splat3(1.0) - destColor.aaa)"; break;1078case GE_SRCBLEND_DOUBLESRCALPHA: srcFactor = "v.aaa * 2.0"; break;1079case GE_SRCBLEND_DOUBLEINVSRCALPHA: srcFactor = "(splat3(1.0) - v.aaa * 2.0)"; break;1080case GE_SRCBLEND_DOUBLEDSTALPHA: srcFactor = "destColor.aaa * 2.0"; break;1081case GE_SRCBLEND_DOUBLEINVDSTALPHA: srcFactor = "(splat3(1.0) - destColor.aaa * 2.0)"; break;1082case GE_SRCBLEND_FIXA: srcFactor = "u_blendFixA"; break;1083default: srcFactor = "u_blendFixA"; break;1084}1085switch (replaceBlendFuncB) {1086case GE_DSTBLEND_SRCCOLOR: dstFactor = "v.rgb"; break;1087case GE_DSTBLEND_INVSRCCOLOR: dstFactor = "(splat3(1.0) - v.rgb)"; break;1088case GE_DSTBLEND_SRCALPHA: dstFactor = "v.aaa"; break;1089case GE_DSTBLEND_INVSRCALPHA: dstFactor = "(splat3(1.0) - v.aaa)"; break;1090case GE_DSTBLEND_DSTALPHA: dstFactor = "destColor.aaa"; break;1091case GE_DSTBLEND_INVDSTALPHA: dstFactor = "(splat3(1.0) - destColor.aaa)"; break;1092case GE_DSTBLEND_DOUBLESRCALPHA: dstFactor = "v.aaa * 2.0"; break;1093case GE_DSTBLEND_DOUBLEINVSRCALPHA: dstFactor = "(splat3(1.0) - v.aaa * 2.0)"; break;1094case GE_DSTBLEND_DOUBLEDSTALPHA: dstFactor = "destColor.aaa * 2.0"; break;1095case GE_DSTBLEND_DOUBLEINVDSTALPHA: dstFactor = "(splat3(1.0) - destColor.aaa * 2.0)"; break;1096case GE_DSTBLEND_FIXB: dstFactor = "u_blendFixB"; break;1097default: dstFactor = "u_blendFixB"; break;1098}10991100switch (replaceBlendEq) {1101case GE_BLENDMODE_MUL_AND_ADD:1102WRITE(p, " v.rgb = v.rgb * %s + destColor.rgb * %s;\n", srcFactor, dstFactor);1103break;1104case GE_BLENDMODE_MUL_AND_SUBTRACT:1105WRITE(p, " v.rgb = v.rgb * %s - destColor.rgb * %s;\n", srcFactor, dstFactor);1106break;1107case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:1108WRITE(p, " v.rgb = destColor.rgb * %s - v.rgb * %s;\n", dstFactor, srcFactor);1109break;1110case GE_BLENDMODE_MIN:1111WRITE(p, " v.rgb = min(v.rgb, destColor.rgb);\n");1112break;1113case GE_BLENDMODE_MAX:1114WRITE(p, " v.rgb = max(v.rgb, destColor.rgb);\n");1115break;1116case GE_BLENDMODE_ABSDIFF:1117WRITE(p, " v.rgb = abs(v.rgb - destColor.rgb);\n");1118break;1119default:1120*errorString = "Bad replace blend eq";1121return false;1122}1123}11241125if (replaceBlend == REPLACE_BLEND_2X_ALPHA || replaceBlend == REPLACE_BLEND_PRE_SRC_2X_ALPHA) {1126WRITE(p, " v.a *= 2.0;\n");1127}1128}11291130char replacedAlpha[64] = "0.0";1131if (stencilToAlpha != REPLACE_ALPHA_NO) {1132switch (replaceAlphaWithStencilType) {1133case STENCIL_VALUE_UNIFORM:1134truncate_cpy(replacedAlpha, "u_stencilReplaceValue");1135break;11361137case STENCIL_VALUE_ZERO:1138truncate_cpy(replacedAlpha, "0.0");1139break;11401141case STENCIL_VALUE_ONE:1142case STENCIL_VALUE_INVERT:1143// In invert, we subtract by one, but we want to output one here.1144truncate_cpy(replacedAlpha, "1.0");1145break;11461147case STENCIL_VALUE_INCR_4:1148case STENCIL_VALUE_DECR_4:1149// We're adding/subtracting, just by the smallest value in 4-bit.1150snprintf(replacedAlpha, sizeof(replacedAlpha), "%f", 1.0 / 15.0);1151break;11521153case STENCIL_VALUE_INCR_8:1154case STENCIL_VALUE_DECR_8:1155// We're adding/subtracting, just by the smallest value in 8-bit.1156snprintf(replacedAlpha, sizeof(replacedAlpha), "%f", 1.0 / 255.0);1157break;11581159case STENCIL_VALUE_KEEP:1160// Do nothing. We'll mask out the alpha using color mask.1161break;1162}1163}11641165switch (stencilToAlpha) {1166case REPLACE_ALPHA_DUALSOURCE:1167WRITE(p, " %s = vec4(v.rgb, %s);\n", compat.fragColor0, replacedAlpha);1168WRITE(p, " %s = vec4(0.0, 0.0, 0.0, v.a);\n", compat.fragColor1);1169break;11701171case REPLACE_ALPHA_YES:1172WRITE(p, " %s = vec4(v.rgb, %s);\n", compat.fragColor0, replacedAlpha);1173break;11741175case REPLACE_ALPHA_NO:1176WRITE(p, " %s = v;\n", compat.fragColor0);1177break;11781179default:1180*errorString = "Bad stencil-to-alpha type, corrupt ID?";1181return false;1182}11831184switch (simulateLogicOpType) {1185case LOGICOPTYPE_ONE:1186WRITE(p, " %s.rgb = splat3(1.0);\n", compat.fragColor0);1187break;1188case LOGICOPTYPE_INVERT:1189WRITE(p, " %s.rgb = splat3(1.0) - %s.rgb;\n", compat.fragColor0, compat.fragColor0);1190break;1191case LOGICOPTYPE_NORMAL:1192break;11931194default:1195*errorString = "Bad logic op type, corrupt ID?";1196return false;1197}11981199// Final color computed - apply logic ops and bitwise color write mask, through shader blending, if specified.1200if (colorWriteMask || replaceLogicOp) {1201WRITE(p, " highp uint v32 = packUnorm4x8%s(%s);\n", packSuffix, compat.fragColor0);1202WRITE(p, " highp uint d32 = packUnorm4x8%s(destColor);\n", packSuffix);12031204// v32 is both the "s" to the logical operation, and the value that we'll merge to the destination with masking later.1205// d32 is the "d" to the logical operation.1206// NOTE: Alpha of v32 needs to be preserved. Same equations as in the software renderer.1207switch (replaceLogicOpType) {1208case GE_LOGIC_CLEAR: p.C(" v32 &= 0xFF000000u;\n"); break;1209case GE_LOGIC_AND: p.C(" v32 = v32 & (d32 | 0xFF000000u);\n"); break;1210case GE_LOGIC_AND_REVERSE: p.C(" v32 = v32 & (~d32 | 0xFF000000u);\n"); break;1211case GE_LOGIC_COPY: break; // source to dest, do nothing. Will be set to this, if not used.1212case GE_LOGIC_AND_INVERTED: p.C(" v32 = (~v32 & (d32 & 0x00FFFFFFu)) | (v32 & 0xFF000000u);\n"); break;1213case GE_LOGIC_NOOP: p.C(" v32 = (d32 & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1214case GE_LOGIC_XOR: p.C(" v32 = v32 ^ (d32 & 0x00FFFFFFu);\n"); break;1215case GE_LOGIC_OR: p.C(" v32 = v32 | (d32 & 0x00FFFFFFu);\n"); break;1216case GE_LOGIC_NOR: p.C(" v32 = (~(v32 | d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1217case GE_LOGIC_EQUIV: p.C(" v32 = (~(v32 ^ d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1218case GE_LOGIC_INVERTED: p.C(" v32 = (~d32 & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1219case GE_LOGIC_OR_REVERSE: p.C(" v32 = v32 | (~d32 & 0x00FFFFFFu);\n"); break;1220case GE_LOGIC_COPY_INVERTED: p.C(" v32 = (~v32 & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1221case GE_LOGIC_OR_INVERTED: p.C(" v32 = ((~v32 | d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1222case GE_LOGIC_NAND: p.C(" v32 = (~(v32 & d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;1223case GE_LOGIC_SET: p.C(" v32 |= 0x00FFFFFFu;\n"); break;1224}12251226// Note that the mask has already been flipped to the PC way - 1 means write.1227if (colorWriteMask) {1228if (stencilToAlpha != REPLACE_ALPHA_NO)1229WRITE(p, " v32 = (v32 & u_colorWriteMask) | (d32 & ~u_colorWriteMask);\n");1230else1231WRITE(p, " v32 = (v32 & u_colorWriteMask & 0x00FFFFFFu) | (d32 & (~u_colorWriteMask | 0xFF000000u));\n");1232}1233WRITE(p, " %s = unpackUnorm4x8%s(v32);\n", compat.fragColor0, packSuffix);1234}12351236if (blueToAlpha) {1237WRITE(p, " %s = vec4(0.0, 0.0, 0.0, %s.z); // blue to alpha\n", compat.fragColor0, compat.fragColor0);1238}12391240if (gstate_c.Use(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {1241DepthScaleFactors depthScale = GetDepthScaleFactors(gstate_c.UseFlags());12421243const double scale = depthScale.ScaleU16();12441245WRITE(p, " highp float z = gl_FragCoord.z;\n");1246if (gstate_c.Use(GPU_USE_ACCURATE_DEPTH)) {1247// We center the depth with an offset, but only its fraction matters.1248// When (DepthSliceFactor() - 1) is odd, it will be 0.5, otherwise 0.1249if (((int)(depthScale.Scale() - 1.0f) & 1) == 1) {1250WRITE(p, " z = (floor((z * %f) - (1.0 / 2.0)) + (1.0 / 2.0)) * (1.0 / %f);\n", scale, scale);1251} else {1252WRITE(p, " z = floor(z * %f) * (1.0 / %f);\n", scale, scale);1253}1254} else {1255WRITE(p, " z = (1.0 / 65535.0) * floor(z * 65535.0);\n");1256}1257WRITE(p, " gl_FragDepth = z;\n");1258} else if (useDiscardStencilBugWorkaround) {1259// Adreno and some Mali drivers apply early frag tests even with discard in the shader,1260// when only stencil is used. The exact situation seems to vary by driver.1261// Writing depth prevents the bug for both vendors, even with depth_unchanged specified.1262// This doesn't make a ton of sense, but empirically does work.1263WRITE(p, " gl_FragDepth = gl_FragCoord.z;\n");1264}12651266if (compat.shaderLanguage == HLSL_D3D11 || compat.shaderLanguage == HLSL_D3D9) {1267if (writeDepth) {1268WRITE(p, " outfragment.depth = gl_FragDepth;\n");1269}1270WRITE(p, " return outfragment;\n");1271}12721273WRITE(p, "}\n");12741275return true;1276}1277127812791280