CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Software/DrawPixel.cpp
Views: 1401
// Copyright (c) 2013- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#include <mutex>19#include "Common/Common.h"20#include "Common/Data/Convert/ColorConv.h"21#include "Core/Config.h"22#include "GPU/GPUState.h"23#include "GPU/Software/BinManager.h"24#include "GPU/Software/DrawPixel.h"25#include "GPU/Software/FuncId.h"26#include "GPU/Software/Rasterizer.h"27#include "GPU/Software/SoftGpu.h"2829using namespace Math3D;3031namespace Rasterizer {3233std::mutex jitCacheLock;34PixelJitCache *jitCache = nullptr;3536void Init() {37jitCache = new PixelJitCache();38}3940void FlushJit() {41jitCache->Flush();42}4344void Shutdown() {45delete jitCache;46jitCache = nullptr;47}4849bool DescribeCodePtr(const u8 *ptr, std::string &name) {50if (!jitCache->IsInSpace(ptr)) {51return false;52}5354name = jitCache->DescribeCodePtr(ptr);55return true;56}5758static inline u8 GetPixelStencil(GEBufferFormat fmt, int fbStride, int x, int y) {59if (fmt == GE_FORMAT_565) {60// Always treated as 0 for comparison purposes.61return 0;62} else if (fmt == GE_FORMAT_5551) {63return ((fb.Get16(x, y, fbStride) & 0x8000) != 0) ? 0xFF : 0;64} else if (fmt == GE_FORMAT_4444) {65return Convert4To8(fb.Get16(x, y, fbStride) >> 12);66} else {67return fb.Get32(x, y, fbStride) >> 24;68}69}7071static inline void SetPixelStencil(GEBufferFormat fmt, int fbStride, uint32_t targetWriteMask, int x, int y, u8 value) {72if (fmt == GE_FORMAT_565) {73// Do nothing74} else if (fmt == GE_FORMAT_5551) {75if ((targetWriteMask & 0x8000) == 0) {76u16 pixel = fb.Get16(x, y, fbStride) & ~0x8000;77pixel |= (value & 0x80) << 8;78fb.Set16(x, y, fbStride, pixel);79}80} else if (fmt == GE_FORMAT_4444) {81const u16 write_mask = targetWriteMask | 0x0FFF;82u16 pixel = fb.Get16(x, y, fbStride) & write_mask;83pixel |= ((u16)value << 8) & ~write_mask;84fb.Set16(x, y, fbStride, pixel);85} else {86const u32 write_mask = targetWriteMask | 0x00FFFFFF;87u32 pixel = fb.Get32(x, y, fbStride) & write_mask;88pixel |= ((u32)value << 24) & ~write_mask;89fb.Set32(x, y, fbStride, pixel);90}91}9293static inline u16 GetPixelDepth(int x, int y, int stride) {94return depthbuf.Get16(x, y, stride);95}9697static inline void SetPixelDepth(int x, int y, int stride, u16 value) {98depthbuf.Set16(x, y, stride, value);99}100101// NOTE: These likely aren't endian safe102static inline u32 GetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y) {103switch (fmt) {104case GE_FORMAT_565:105// A should be zero for the purposes of alpha blending.106return RGB565ToRGBA8888(fb.Get16(x, y, fbStride)) & 0x00FFFFFF;107108case GE_FORMAT_5551:109return RGBA5551ToRGBA8888(fb.Get16(x, y, fbStride));110111case GE_FORMAT_4444:112return RGBA4444ToRGBA8888(fb.Get16(x, y, fbStride));113114case GE_FORMAT_8888:115return fb.Get32(x, y, fbStride);116117default:118return 0;119}120}121122static inline void SetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y, u32 value, u32 old_value, u32 targetWriteMask) {123switch (fmt) {124case GE_FORMAT_565:125value = RGBA8888ToRGB565(value);126if (targetWriteMask != 0) {127old_value = RGBA8888ToRGB565(old_value);128value = (value & ~targetWriteMask) | (old_value & targetWriteMask);129}130fb.Set16(x, y, fbStride, value);131break;132133case GE_FORMAT_5551:134value = RGBA8888ToRGBA5551(value);135if (targetWriteMask != 0) {136old_value = RGBA8888ToRGBA5551(old_value);137value = (value & ~targetWriteMask) | (old_value & targetWriteMask);138}139fb.Set16(x, y, fbStride, value);140break;141142case GE_FORMAT_4444:143value = RGBA8888ToRGBA4444(value);144if (targetWriteMask != 0) {145old_value = RGBA8888ToRGBA4444(old_value);146value = (value & ~targetWriteMask) | (old_value & targetWriteMask);147}148fb.Set16(x, y, fbStride, value);149break;150151case GE_FORMAT_8888:152value = (value & ~targetWriteMask) | (old_value & targetWriteMask);153fb.Set32(x, y, fbStride, value);154break;155156default:157break;158}159}160161static inline bool AlphaTestPassed(const PixelFuncID &pixelID, int alpha) {162const u8 ref = pixelID.alphaTestRef;163if (pixelID.hasAlphaTestMask)164alpha &= pixelID.cached.alphaTestMask;165166switch (pixelID.AlphaTestFunc()) {167case GE_COMP_NEVER:168return false;169170case GE_COMP_ALWAYS:171return true;172173case GE_COMP_EQUAL:174return (alpha == ref);175176case GE_COMP_NOTEQUAL:177return (alpha != ref);178179case GE_COMP_LESS:180return (alpha < ref);181182case GE_COMP_LEQUAL:183return (alpha <= ref);184185case GE_COMP_GREATER:186return (alpha > ref);187188case GE_COMP_GEQUAL:189return (alpha >= ref);190}191return true;192}193194static inline bool ColorTestPassed(const PixelFuncID &pixelID, const Vec3<int> &color) {195const u32 mask = pixelID.cached.colorTestMask;196const u32 c = color.ToRGB() & mask;197const u32 ref = pixelID.cached.colorTestRef;198switch (pixelID.cached.colorTestFunc) {199case GE_COMP_NEVER:200return false;201202case GE_COMP_ALWAYS:203return true;204205case GE_COMP_EQUAL:206return c == ref;207208case GE_COMP_NOTEQUAL:209return c != ref;210211default:212return true;213}214}215216static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {217if (pixelID.hasStencilTestMask)218stencil &= pixelID.cached.stencilTestMask;219u8 ref = pixelID.stencilTestRef;220switch (pixelID.StencilTestFunc()) {221case GE_COMP_NEVER:222return false;223224case GE_COMP_ALWAYS:225return true;226227case GE_COMP_EQUAL:228return ref == stencil;229230case GE_COMP_NOTEQUAL:231return ref != stencil;232233case GE_COMP_LESS:234return ref < stencil;235236case GE_COMP_LEQUAL:237return ref <= stencil;238239case GE_COMP_GREATER:240return ref > stencil;241242case GE_COMP_GEQUAL:243return ref >= stencil;244}245return true;246}247248static inline u8 ApplyStencilOp(GEBufferFormat fmt, uint8_t stencilReplace, GEStencilOp op, u8 old_stencil) {249switch (op) {250case GE_STENCILOP_KEEP:251return old_stencil;252253case GE_STENCILOP_ZERO:254return 0;255256case GE_STENCILOP_REPLACE:257return stencilReplace;258259case GE_STENCILOP_INVERT:260return ~old_stencil;261262case GE_STENCILOP_INCR:263switch (fmt) {264case GE_FORMAT_8888:265if (old_stencil != 0xFF) {266return old_stencil + 1;267}268return old_stencil;269case GE_FORMAT_5551:270return 0xFF;271case GE_FORMAT_4444:272if (old_stencil < 0xF0) {273return old_stencil + 0x10;274}275return old_stencil;276default:277return old_stencil;278}279break;280281case GE_STENCILOP_DECR:282switch (fmt) {283case GE_FORMAT_4444:284if (old_stencil >= 0x10)285return old_stencil - 0x10;286break;287case GE_FORMAT_5551:288return 0;289default:290if (old_stencil != 0)291return old_stencil - 1;292return old_stencil;293}294break;295}296297return old_stencil;298}299300static inline bool DepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {301u16 reference_z = GetPixelDepth(x, y, stride);302303switch (func) {304case GE_COMP_NEVER:305return false;306307case GE_COMP_ALWAYS:308return true;309310case GE_COMP_EQUAL:311return (z == reference_z);312313case GE_COMP_NOTEQUAL:314return (z != reference_z);315316case GE_COMP_LESS:317return (z < reference_z);318319case GE_COMP_LEQUAL:320return (z <= reference_z);321322case GE_COMP_GREATER:323return (z > reference_z);324325case GE_COMP_GEQUAL:326return (z >= reference_z);327328default:329return 0;330}331}332333bool CheckDepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {334return DepthTestPassed(func, x, y, stride, z);335}336337static inline u32 ApplyLogicOp(GELogicOp op, u32 old_color, u32 new_color) {338// All of the operations here intentionally preserve alpha/stencil.339switch (op) {340case GE_LOGIC_CLEAR:341new_color &= 0xFF000000;342break;343344case GE_LOGIC_AND:345new_color = new_color & (old_color | 0xFF000000);346break;347348case GE_LOGIC_AND_REVERSE:349new_color = new_color & (~old_color | 0xFF000000);350break;351352case GE_LOGIC_COPY:353// No change to new_color.354break;355356case GE_LOGIC_AND_INVERTED:357new_color = (~new_color & (old_color & 0x00FFFFFF)) | (new_color & 0xFF000000);358break;359360case GE_LOGIC_NOOP:361new_color = (old_color & 0x00FFFFFF) | (new_color & 0xFF000000);362break;363364case GE_LOGIC_XOR:365new_color = new_color ^ (old_color & 0x00FFFFFF);366break;367368case GE_LOGIC_OR:369new_color = new_color | (old_color & 0x00FFFFFF);370break;371372case GE_LOGIC_NOR:373new_color = (~(new_color | old_color) & 0x00FFFFFF) | (new_color & 0xFF000000);374break;375376case GE_LOGIC_EQUIV:377new_color = (~(new_color ^ old_color) & 0x00FFFFFF) | (new_color & 0xFF000000);378break;379380case GE_LOGIC_INVERTED:381new_color = (~old_color & 0x00FFFFFF) | (new_color & 0xFF000000);382break;383384case GE_LOGIC_OR_REVERSE:385new_color = new_color | (~old_color & 0x00FFFFFF);386break;387388case GE_LOGIC_COPY_INVERTED:389new_color = (~new_color & 0x00FFFFFF) | (new_color & 0xFF000000);390break;391392case GE_LOGIC_OR_INVERTED:393new_color = ((~new_color | old_color) & 0x00FFFFFF) | (new_color & 0xFF000000);394break;395396case GE_LOGIC_NAND:397new_color = (~(new_color & old_color) & 0x00FFFFFF) | (new_color & 0xFF000000);398break;399400case GE_LOGIC_SET:401new_color |= 0x00FFFFFF;402break;403}404405return new_color;406}407408static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {409switch (factor) {410case PixelBlendFactor::OTHERCOLOR:411return dst.rgb();412413case PixelBlendFactor::INVOTHERCOLOR:414return Vec3<int>::AssignToAll(255) - dst.rgb();415416case PixelBlendFactor::SRCALPHA:417#if defined(_M_SSE)418return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));419#elif PPSSPP_ARCH(ARM64_NEON)420return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));421#else422return Vec3<int>::AssignToAll(source.a());423#endif424425case PixelBlendFactor::INVSRCALPHA:426#if defined(_M_SSE)427return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));428#elif PPSSPP_ARCH(ARM64_NEON)429return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));430#else431return Vec3<int>::AssignToAll(255 - source.a());432#endif433434case PixelBlendFactor::DSTALPHA:435return Vec3<int>::AssignToAll(dst.a());436437case PixelBlendFactor::INVDSTALPHA:438return Vec3<int>::AssignToAll(255 - dst.a());439440case PixelBlendFactor::DOUBLESRCALPHA:441return Vec3<int>::AssignToAll(2 * source.a());442443case PixelBlendFactor::DOUBLEINVSRCALPHA:444return Vec3<int>::AssignToAll(255 - std::min(2 * source.a(), 255));445446case PixelBlendFactor::DOUBLEDSTALPHA:447return Vec3<int>::AssignToAll(2 * dst.a());448449case PixelBlendFactor::DOUBLEINVDSTALPHA:450return Vec3<int>::AssignToAll(255 - std::min(2 * dst.a(), 255));451452case PixelBlendFactor::FIX:453default:454// All other dest factors (> 10) are treated as FIXA.455return Vec3<int>::FromRGB(fix);456457case PixelBlendFactor::ZERO:458return Vec3<int>::AssignToAll(0);459460case PixelBlendFactor::ONE:461return Vec3<int>::AssignToAll(255);462}463}464465static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {466switch (factor) {467case PixelBlendFactor::OTHERCOLOR:468return source.rgb();469470case PixelBlendFactor::INVOTHERCOLOR:471return Vec3<int>::AssignToAll(255) - source.rgb();472473case PixelBlendFactor::SRCALPHA:474#if defined(_M_SSE)475return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));476#elif PPSSPP_ARCH(ARM64_NEON)477return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));478#else479return Vec3<int>::AssignToAll(source.a());480#endif481482case PixelBlendFactor::INVSRCALPHA:483#if defined(_M_SSE)484return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));485#elif PPSSPP_ARCH(ARM64_NEON)486return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));487#else488return Vec3<int>::AssignToAll(255 - source.a());489#endif490491case PixelBlendFactor::DSTALPHA:492return Vec3<int>::AssignToAll(dst.a());493494case PixelBlendFactor::INVDSTALPHA:495return Vec3<int>::AssignToAll(255 - dst.a());496497case PixelBlendFactor::DOUBLESRCALPHA:498return Vec3<int>::AssignToAll(2 * source.a());499500case PixelBlendFactor::DOUBLEINVSRCALPHA:501return Vec3<int>::AssignToAll(255 - std::min(2 * source.a(), 255));502503case PixelBlendFactor::DOUBLEDSTALPHA:504return Vec3<int>::AssignToAll(2 * dst.a());505506case PixelBlendFactor::DOUBLEINVDSTALPHA:507return Vec3<int>::AssignToAll(255 - std::min(2 * dst.a(), 255));508509case PixelBlendFactor::FIX:510default:511// All other dest factors (> 10) are treated as FIXB.512return Vec3<int>::FromRGB(fix);513514case PixelBlendFactor::ZERO:515return Vec3<int>::AssignToAll(0);516517case PixelBlendFactor::ONE:518return Vec3<int>::AssignToAll(255);519}520}521522// Removed inline here - it was never chosen to be inlined by the compiler anyway, too complex.523static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst) {524// Note: These factors cannot go below 0, but they can go above 255 when doubling.525Vec3<int> srcfactor = GetSourceFactor(pixelID.AlphaBlendSrc(), source, dst, pixelID.cached.alphaBlendSrc);526Vec3<int> dstfactor = GetDestFactor(pixelID.AlphaBlendDst(), source, dst, pixelID.cached.alphaBlendDst);527528switch (pixelID.AlphaBlendEq()) {529case GE_BLENDMODE_MUL_AND_ADD:530{531#if defined(_M_SSE)532// We switch to 16 bit to use mulhi, and we use 4 bits of decimal to make the 16 bit shift free.533const __m128i half = _mm_set1_epi16(1 << 3);534535const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);536const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);537const __m128i s = _mm_mulhi_epi16(srgb, sf);538539const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);540const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);541const __m128i d = _mm_mulhi_epi16(drgb, df);542543return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));544#elif PPSSPP_ARCH(ARM64_NEON)545const int32x4_t half = vdupq_n_s32(1);546547const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);548const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);549const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);550551const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);552const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);553const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);554555return Vec3<int>(vaddq_s32(s, d));556#else557static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);558Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;559Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;560return lhs + rhs;561#endif562}563564case GE_BLENDMODE_MUL_AND_SUBTRACT:565{566#if defined(_M_SSE)567const __m128i half = _mm_set1_epi16(1 << 3);568569const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);570const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);571const __m128i s = _mm_mulhi_epi16(srgb, sf);572573const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);574const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);575const __m128i d = _mm_mulhi_epi16(drgb, df);576577return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));578#elif PPSSPP_ARCH(ARM64_NEON)579const int32x4_t half = vdupq_n_s32(1);580581const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);582const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);583const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);584585const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);586const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);587const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);588589return Vec3<int>(vqsubq_s32(s, d));590#else591static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);592Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;593Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;594return lhs - rhs;595#endif596}597598case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:599{600#if defined(_M_SSE)601const __m128i half = _mm_set1_epi16(1 << 3);602603const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);604const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);605const __m128i s = _mm_mulhi_epi16(srgb, sf);606607const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);608const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);609const __m128i d = _mm_mulhi_epi16(drgb, df);610611return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));612#elif PPSSPP_ARCH(ARM64_NEON)613const int32x4_t half = vdupq_n_s32(1);614615const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);616const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);617const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);618619const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);620const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);621const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);622623return Vec3<int>(vqsubq_s32(d, s));624#else625static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);626Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;627Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;628return rhs - lhs;629#endif630}631632case GE_BLENDMODE_MIN:633#if PPSSPP_ARCH(ARM64_NEON)634return Vec3<int>(vminq_s32(source.ivec, dst.ivec));635#else636return Vec3<int>(std::min(source.r(), dst.r()),637std::min(source.g(), dst.g()),638std::min(source.b(), dst.b()));639#endif640641case GE_BLENDMODE_MAX:642#if PPSSPP_ARCH(ARM64_NEON)643return Vec3<int>(vmaxq_s32(source.ivec, dst.ivec));644#else645return Vec3<int>(std::max(source.r(), dst.r()),646std::max(source.g(), dst.g()),647std::max(source.b(), dst.b()));648#endif649650case GE_BLENDMODE_ABSDIFF:651#if PPSSPP_ARCH(ARM64_NEON)652return Vec3<int>(vabdq_s32(source.ivec, dst.ivec));653#else654return Vec3<int>(::abs(source.r() - dst.r()),655::abs(source.g() - dst.g()),656::abs(source.b() - dst.b()));657#endif658659default:660return source.rgb();661}662}663664template <bool clearMode, GEBufferFormat fbFormat>665void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg color_in, const PixelFuncID &pixelID) {666Vec4<int> prim_color = Vec4<int>(color_in).Clamp(0, 255);667// Depth range test - applied in clear mode, if not through mode.668if (pixelID.applyDepthRange && !pixelID.earlyZChecks)669if (z < pixelID.cached.minz || z > pixelID.cached.maxz)670return;671672if (pixelID.AlphaTestFunc() != GE_COMP_ALWAYS && !clearMode)673if (!AlphaTestPassed(pixelID, prim_color.a()))674return;675676// Fog is applied prior to color test.677if (pixelID.applyFog && !clearMode) {678Vec3<int> fogColor = Vec3<int>::FromRGB(pixelID.cached.fogColor);679// This is very similar to the BLEND texfunc, and simply always rounds up.680static constexpr Vec3<int> roundup = Vec3<int>::AssignToAll(255);681fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256;682prim_color.r() = fogColor.r();683prim_color.g() = fogColor.g();684prim_color.b() = fogColor.b();685}686687if (pixelID.colorTest && !clearMode)688if (!ColorTestPassed(pixelID, prim_color.rgb()))689return;690691// In clear mode, it uses the alpha color as stencil.692uint32_t targetWriteMask = pixelID.applyColorWriteMask ? pixelID.cached.colorWriteMask : 0;693u8 stencil = clearMode ? prim_color.a() : GetPixelStencil(fbFormat, pixelID.cached.framebufStride, x, y);694if (clearMode) {695if (pixelID.DepthClear())696SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);697} else if (pixelID.stencilTest) {698const uint8_t stencilReplace = pixelID.hasStencilTestMask ? pixelID.cached.stencilRef : pixelID.stencilTestRef;699if (!StencilTestPassed(pixelID, stencil)) {700stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.SFail(), stencil);701SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);702return;703}704705// Also apply depth at the same time. If disabled, same as passing.706if (!pixelID.earlyZChecks && pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {707stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZFail(), stencil);708SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);709return;710}711712stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZPass(), stencil);713} else if (!pixelID.earlyZChecks) {714if (pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {715return;716}717}718719if (pixelID.depthWrite && !clearMode)720SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);721722const u32 old_color = GetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y);723u32 new_color;724725// Dithering happens before the logic op and regardless of framebuffer format or clear mode.726// We do it while alpha blending because it happens before clamping.727if (pixelID.alphaBlend && !clearMode) {728const Vec4<int> dst = Vec4<int>::FromRGBA(old_color);729Vec3<int> blended = AlphaBlendingResult(pixelID, prim_color, dst);730if (pixelID.dithering) {731blended += Vec3<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);732}733734// ToRGB() always automatically clamps.735new_color = blended.ToRGB();736new_color |= stencil << 24;737} else {738if (pixelID.dithering) {739// We'll discard alpha anyway.740prim_color += Vec4<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);741}742743#if defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)744new_color = Vec3<int>(prim_color.ivec).ToRGB();745new_color |= stencil << 24;746#else747new_color = Vec4<int>(prim_color.r(), prim_color.g(), prim_color.b(), stencil).ToRGBA();748#endif749}750751// Logic ops are applied after blending (if blending is enabled.)752if (pixelID.applyLogicOp && !clearMode) {753// Logic ops don't affect stencil, which happens inside ApplyLogicOp.754new_color = ApplyLogicOp(pixelID.cached.logicOp, old_color, new_color);755}756757if (clearMode) {758if (!pixelID.ColorClear())759new_color = (new_color & 0xFF000000) | (old_color & 0x00FFFFFF);760if (!pixelID.StencilClear())761new_color = (new_color & 0x00FFFFFF) | (old_color & 0xFF000000);762}763764SetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y, new_color, old_color, targetWriteMask);765}766767SingleFunc GetSingleFunc(const PixelFuncID &id, BinManager *binner) {768SingleFunc jitted = jitCache->GetSingle(id, binner);769if (jitted) {770return jitted;771}772773return jitCache->GenericSingle(id);774}775776SingleFunc PixelJitCache::GenericSingle(const PixelFuncID &id) {777if (id.clearMode) {778switch (id.fbFormat) {779case GE_FORMAT_565:780return &DrawSinglePixel<true, GE_FORMAT_565>;781case GE_FORMAT_5551:782return &DrawSinglePixel<true, GE_FORMAT_5551>;783case GE_FORMAT_4444:784return &DrawSinglePixel<true, GE_FORMAT_4444>;785case GE_FORMAT_8888:786return &DrawSinglePixel<true, GE_FORMAT_8888>;787}788}789switch (id.fbFormat) {790case GE_FORMAT_565:791return &DrawSinglePixel<false, GE_FORMAT_565>;792case GE_FORMAT_5551:793return &DrawSinglePixel<false, GE_FORMAT_5551>;794case GE_FORMAT_4444:795return &DrawSinglePixel<false, GE_FORMAT_4444>;796case GE_FORMAT_8888:797return &DrawSinglePixel<false, GE_FORMAT_8888>;798}799_assert_(false);800return nullptr;801}802803thread_local PixelJitCache::LastCache PixelJitCache::lastSingle_;804int PixelJitCache::clearGen_ = 0;805806// 256k should be plenty of space for plenty of variations.807PixelJitCache::PixelJitCache() : CodeBlock(1024 * 64 * 4), cache_(64) {808lastSingle_.gen = -1;809clearGen_++;810}811812void PixelJitCache::Clear() {813clearGen_++;814CodeBlock::Clear();815cache_.Clear();816addresses_.clear();817818constBlendHalf_11_4s_ = nullptr;819constBlendInvert_11_4s_ = nullptr;820}821822std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) {823constexpr bool USE_IDS = false;824ptrdiff_t dist = 0x7FFFFFFF;825if (USE_IDS) {826PixelFuncID found{};827for (const auto &it : addresses_) {828ptrdiff_t it_dist = ptr - it.second;829if (it_dist >= 0 && it_dist < dist) {830found = it.first;831dist = it_dist;832}833}834835return DescribePixelFuncID(found);836}837838return CodeBlock::DescribeCodePtr(ptr);839}840841void PixelJitCache::Flush() {842std::unique_lock<std::mutex> guard(jitCacheLock);843for (const auto &queued : compileQueue_) {844// Might've been compiled after enqueue, but before now.845size_t queuedKey = std::hash<PixelFuncID>()(queued);846if (!cache_.ContainsKey(queuedKey))847Compile(queued);848}849compileQueue_.clear();850}851852SingleFunc PixelJitCache::GetSingle(const PixelFuncID &id, BinManager *binner) {853if (!g_Config.bSoftwareRenderingJit)854return nullptr;855856const size_t key = std::hash<PixelFuncID>()(id);857if (lastSingle_.Match(key, clearGen_))858return lastSingle_.func;859860std::unique_lock<std::mutex> guard(jitCacheLock);861SingleFunc singleFunc;862if (cache_.Get(key, &singleFunc)) {863lastSingle_.Set(key, singleFunc, clearGen_);864return singleFunc;865}866867if (!binner) {868// Can't compile, let's try to do it later when there's an opportunity.869compileQueue_.insert(id);870return nullptr;871}872873guard.unlock();874binner->Flush("compile");875guard.lock();876877for (const auto &queued : compileQueue_) {878// Might've been compiled after enqueue, but before now.879size_t queuedKey = std::hash<PixelFuncID>()(queued);880if (!cache_.ContainsKey(queuedKey))881Compile(queued);882}883compileQueue_.clear();884885// Might've been in the queue.886if (!cache_.ContainsKey(key))887Compile(id);888889if (cache_.Get(key, &singleFunc)) {890lastSingle_.Set(key, singleFunc, clearGen_);891return singleFunc;892} else {893return nullptr;894}895}896897void PixelJitCache::Compile(const PixelFuncID &id) {898// x64 is typically 200-500 bytes, but let's be safe.899if (GetSpaceLeft() < 65536) {900Clear();901}902903#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)904addresses_[id] = GetCodePointer();905SingleFunc func = CompileSingle(id);906cache_.Insert(std::hash<PixelFuncID>()(id), func);907#endif908}909910void ComputePixelBlendState(PixelBlendState &state, const PixelFuncID &id) {911switch (id.AlphaBlendEq()) {912case GE_BLENDMODE_MUL_AND_ADD:913case GE_BLENDMODE_MUL_AND_SUBTRACT:914case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:915state.usesFactors = true;916break;917918case GE_BLENDMODE_MIN:919case GE_BLENDMODE_MAX:920case GE_BLENDMODE_ABSDIFF:921break;922}923924if (state.usesFactors) {925switch (id.AlphaBlendSrc()) {926case PixelBlendFactor::DSTALPHA:927case PixelBlendFactor::INVDSTALPHA:928case PixelBlendFactor::DOUBLEDSTALPHA:929case PixelBlendFactor::DOUBLEINVDSTALPHA:930state.usesDstAlpha = true;931break;932933case PixelBlendFactor::OTHERCOLOR:934case PixelBlendFactor::INVOTHERCOLOR:935state.dstColorAsFactor = true;936break;937938case PixelBlendFactor::SRCALPHA:939case PixelBlendFactor::INVSRCALPHA:940case PixelBlendFactor::DOUBLESRCALPHA:941case PixelBlendFactor::DOUBLEINVSRCALPHA:942state.srcColorAsFactor = true;943break;944945default:946break;947}948949switch (id.AlphaBlendDst()) {950case PixelBlendFactor::INVSRCALPHA:951state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::SRCALPHA;952state.srcColorAsFactor = true;953break;954955case PixelBlendFactor::DOUBLEINVSRCALPHA:956state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DOUBLESRCALPHA;957state.srcColorAsFactor = true;958break;959960case PixelBlendFactor::DSTALPHA:961state.usesDstAlpha = true;962break;963964case PixelBlendFactor::INVDSTALPHA:965state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DSTALPHA;966state.usesDstAlpha = true;967break;968969case PixelBlendFactor::DOUBLEDSTALPHA:970state.usesDstAlpha = true;971break;972973case PixelBlendFactor::DOUBLEINVDSTALPHA:974state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DOUBLEDSTALPHA;975state.usesDstAlpha = true;976break;977978case PixelBlendFactor::OTHERCOLOR:979case PixelBlendFactor::INVOTHERCOLOR:980state.srcColorAsFactor = true;981break;982983case PixelBlendFactor::SRCALPHA:984case PixelBlendFactor::DOUBLESRCALPHA:985state.srcColorAsFactor = true;986break;987988case PixelBlendFactor::ZERO:989state.readsDstPixel = state.dstColorAsFactor || state.usesDstAlpha;990break;991992default:993break;994}995}996}997998};99910001001