// CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
// Path: blob/master/GPU/Software/Sampler.cpp
// Views: 1401
// Copyright (c) 2017- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#include <unordered_map>19#include <mutex>20#include "Common/Common.h"21#include "Common/Data/Convert/ColorConv.h"22#include "Common/LogReporting.h"23#include "Common/StringUtils.h"24#include "Core/Config.h"25#include "GPU/Common/TextureDecoder.h"26#include "GPU/GPUState.h"27#include "GPU/Software/BinManager.h"28#include "GPU/Software/Rasterizer.h"29#include "GPU/Software/RasterizerRegCache.h"30#include "GPU/Software/Sampler.h"3132#if defined(_M_SSE)33#include <emmintrin.h>34#endif3536#if PPSSPP_ARCH(ARM_NEON)37#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)38#include <arm64_neon.h>39#else40#include <arm_neon.h>41#endif42#endif4344using namespace Math3D;45using namespace Rasterizer;4647namespace Sampler {4849static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);50static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);51static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID 
&samplerID);5253std::mutex jitCacheLock;54SamplerJitCache *jitCache = nullptr;5556void Init() {57jitCache = new SamplerJitCache();58}5960void FlushJit() {61jitCache->Flush();62}6364void Shutdown() {65delete jitCache;66jitCache = nullptr;67}6869bool DescribeCodePtr(const u8 *ptr, std::string &name) {70if (!jitCache->IsInSpace(ptr)) {71return false;72}7374name = jitCache->DescribeCodePtr(ptr);75return true;76}7778NearestFunc GetNearestFunc(SamplerID id, BinManager *binner) {79id.linear = false;80NearestFunc jitted = jitCache->GetNearest(id, binner);81if (jitted) {82return jitted;83}8485return &SampleNearest;86}8788LinearFunc GetLinearFunc(SamplerID id, BinManager *binner) {89id.linear = true;90LinearFunc jitted = jitCache->GetLinear(id, binner);91if (jitted) {92return jitted;93}9495return &SampleLinear;96}9798FetchFunc GetFetchFunc(SamplerID id, BinManager *binner) {99id.fetch = true;100FetchFunc jitted = jitCache->GetFetch(id, binner);101if (jitted) {102return jitted;103}104105return &SampleFetch;106}107108thread_local SamplerJitCache::LastCache SamplerJitCache::lastFetch_;109thread_local SamplerJitCache::LastCache SamplerJitCache::lastNearest_;110thread_local SamplerJitCache::LastCache SamplerJitCache::lastLinear_;111int SamplerJitCache::clearGen_ = 0;112113// 256k should be enough.114SamplerJitCache::SamplerJitCache() : Rasterizer::CodeBlock(1024 * 64 * 4), cache_(64) {115lastFetch_.gen = -1;116lastNearest_.gen = -1;117lastLinear_.gen = -1;118clearGen_++;119}120121void SamplerJitCache::Clear() {122clearGen_++;123CodeBlock::Clear();124cache_.Clear();125addresses_.clear();126127const10All16_ = nullptr;128const10Low_ = nullptr;129const10All8_ = nullptr;130131constWidthHeight256f_ = nullptr;132constWidthMinus1i_ = nullptr;133constHeightMinus1i_ = nullptr;134135constOnes32_ = nullptr;136constOnes16_ = nullptr;137constUNext_ = nullptr;138constVNext_ = nullptr;139140const5551Swizzle_ = nullptr;141const5650Swizzle_ = nullptr;142}143144std::string 
SamplerJitCache::DescribeCodePtr(const u8 *ptr) {145constexpr bool USE_IDS = false;146ptrdiff_t dist = 0x7FFFFFFF;147if (USE_IDS) {148SamplerID found{};149for (const auto &it : addresses_) {150ptrdiff_t it_dist = ptr - it.second;151if (it_dist >= 0 && it_dist < dist) {152found = it.first;153dist = it_dist;154}155}156157return DescribeSamplerID(found);158}159160return CodeBlock::DescribeCodePtr(ptr);161}162163void SamplerJitCache::Flush() {164std::unique_lock<std::mutex> guard(jitCacheLock);165for (const auto &queued : compileQueue_) {166// Might've been compiled after enqueue, but before now.167size_t queuedKey = std::hash<SamplerID>()(queued);168if (!cache_.ContainsKey(queuedKey))169Compile(queued);170}171compileQueue_.clear();172}173174NearestFunc SamplerJitCache::GetByID(const SamplerID &id, size_t key, BinManager *binner) {175std::unique_lock<std::mutex> guard(jitCacheLock);176177NearestFunc func;178if (cache_.Get(key, &func)) {179return func;180}181182if (!binner) {183// Can't compile, let's try to do it later when there's an opportunity.184compileQueue_.insert(id);185return nullptr;186}187188guard.unlock();189binner->Flush("compile");190guard.lock();191192for (const auto &queued : compileQueue_) {193// Might've been compiled after enqueue, but before now.194size_t queuedKey = std::hash<SamplerID>()(queued);195if (!cache_.ContainsKey(queuedKey))196Compile(queued);197}198compileQueue_.clear();199200if (!cache_.ContainsKey(key))201Compile(id);202203// Okay, should be there now.204if (cache_.Get(key, &func)) {205return func;206} else {207return nullptr;208}209}210211NearestFunc SamplerJitCache::GetNearest(const SamplerID &id, BinManager *binner) {212if (!g_Config.bSoftwareRenderingJit)213return nullptr;214215const size_t key = std::hash<SamplerID>()(id);216if (lastNearest_.Match(key, clearGen_))217return (NearestFunc)lastNearest_.func;218219auto func = GetByID(id, key, binner);220lastNearest_.Set(key, func, clearGen_);221return 
(NearestFunc)func;222}223224LinearFunc SamplerJitCache::GetLinear(const SamplerID &id, BinManager *binner) {225if (!g_Config.bSoftwareRenderingJit)226return nullptr;227228const size_t key = std::hash<SamplerID>()(id);229if (lastLinear_.Match(key, clearGen_))230return (LinearFunc)lastLinear_.func;231232auto func = GetByID(id, key, binner);233lastLinear_.Set(key, func, clearGen_);234return (LinearFunc)func;235}236237FetchFunc SamplerJitCache::GetFetch(const SamplerID &id, BinManager *binner) {238if (!g_Config.bSoftwareRenderingJit)239return nullptr;240241const size_t key = std::hash<SamplerID>()(id);242if (lastFetch_.Match(key, clearGen_))243return (FetchFunc)lastFetch_.func;244245auto func = GetByID(id, key, binner);246lastFetch_.Set(key, func, clearGen_);247return (FetchFunc)func;248}249250void SamplerJitCache::Compile(const SamplerID &id) {251// This should be sufficient.252if (GetSpaceLeft() < 16384) {253Clear();254}255256// We compile them together so the cache can't possibly be cleared in between.257// We might vary between nearest and linear, so we can't clear between.258#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)259SamplerID fetchID = id;260fetchID.linear = false;261fetchID.fetch = true;262addresses_[fetchID] = GetCodePointer();263cache_.Insert(std::hash<SamplerID>()(fetchID), (NearestFunc)CompileFetch(fetchID));264265SamplerID nearestID = id;266nearestID.linear = false;267nearestID.fetch = false;268addresses_[nearestID] = GetCodePointer();269cache_.Insert(std::hash<SamplerID>()(nearestID), (NearestFunc)CompileNearest(nearestID));270271SamplerID linearID = id;272linearID.linear = true;273linearID.fetch = false;274addresses_[linearID] = GetCodePointer();275cache_.Insert(std::hash<SamplerID>()(linearID), (NearestFunc)CompileLinear(linearID));276#endif277}278279template <uint32_t texel_size_bits>280static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint32_t v, bool swizzled) {281if (!swizzled)282return (v * (row_pitch_pixels * 
texel_size_bits >> 3)) + (u * texel_size_bits >> 3);283284const uint32_t tile_size_bits = 32;285const uint32_t tiles_in_block_horizontal = 4;286const uint32_t tiles_in_block_vertical = 8;287288constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;289uint32_t tile_u = u / texels_per_tile;290uint32_t tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +291// TODO: not sure if the *texel_size_bits/8 factor is correct292(v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +293(tile_u % tiles_in_block_horizontal) +294(tile_u / tiles_in_block_horizontal) * (tiles_in_block_horizontal*tiles_in_block_vertical);295296return tile_idx * (tile_size_bits / 8) + ((u % texels_per_tile) * texel_size_bits) / 8;297}298299static inline u32 LookupColor(unsigned int index, unsigned int level, const SamplerID &samplerID) {300const int clutSharingOffset = samplerID.useSharedClut ? 0 : level * 16;301302switch (samplerID.ClutFmt()) {303case GE_CMODE_16BIT_BGR5650:304return RGB565ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);305306case GE_CMODE_16BIT_ABGR5551:307return RGBA5551ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);308309case GE_CMODE_16BIT_ABGR4444:310return RGBA4444ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);311312case GE_CMODE_32BIT_ABGR8888:313return samplerID.cached.clut32[index + clutSharingOffset];314315default:316ERROR_LOG_REPORT(Log::G3D, "Software: Unsupported palette format: %x", samplerID.ClutFmt());317return 0;318}319}320321uint32_t TransformClutIndex(uint32_t index, const SamplerID &samplerID) {322if (samplerID.hasClutShift || samplerID.hasClutMask || samplerID.hasClutOffset) {323const uint8_t shift = (samplerID.cached.clutFormat >> 2) & 0x1F;324const uint8_t mask = (samplerID.cached.clutFormat >> 8) & 0xFF;325const uint16_t offset = ((samplerID.cached.clutFormat >> 16) & 0x1F) << 4;326// We need to wrap any entries beyond the 
first 1024 bytes.327const uint16_t offsetMask = samplerID.ClutFmt() == GE_CMODE_32BIT_ABGR8888 ? 0xFF : 0x1FF;328329return ((index >> shift) & mask) | (offset & offsetMask);330}331return index & 0xFF;332}333334struct Nearest4 {335alignas(16) u32 v[4];336337operator u32() const {338return v[0];339}340};341342template <int N>343inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N], const u8 *srcptr, uint16_t texbufw, int level, const SamplerID &samplerID) {344Nearest4 res;345if (!srcptr) {346memset(res.v, 0, sizeof(res.v));347return res;348}349350// TODO: Should probably check if textures are aligned properly...351352switch (samplerID.TexFmt()) {353case GE_TFMT_4444:354for (int i = 0; i < N; ++i) {355const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);356res.v[i] = RGBA4444ToRGBA8888(*(const u16 *)src);357}358return res;359360case GE_TFMT_5551:361for (int i = 0; i < N; ++i) {362const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);363res.v[i] = RGBA5551ToRGBA8888(*(const u16 *)src);364}365return res;366367case GE_TFMT_5650:368for (int i = 0; i < N; ++i) {369const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);370res.v[i] = RGB565ToRGBA8888(*(const u16 *)src);371}372return res;373374case GE_TFMT_8888:375for (int i = 0; i < N; ++i) {376const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i], samplerID.swizzle);377res.v[i] = *(const u32 *)src;378}379return res;380381case GE_TFMT_CLUT32:382for (int i = 0; i < N; ++i) {383const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i], samplerID.swizzle);384u32 val = src[0] + (src[1] << 8) + (src[2] << 16) + (src[3] << 24);385res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);386}387return res;388389case GE_TFMT_CLUT16:390for (int i = 0; i < N; ++i) {391const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);392u16 val = 
src[0] + (src[1] << 8);393res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);394}395return res;396397case GE_TFMT_CLUT8:398for (int i = 0; i < N; ++i) {399const u8 *src = srcptr + GetPixelDataOffset<8>(texbufw, u[i], v[i], samplerID.swizzle);400u8 val = *src;401res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);402}403return res;404405case GE_TFMT_CLUT4:406for (int i = 0; i < N; ++i) {407const u8 *src = srcptr + GetPixelDataOffset<4>(texbufw, u[i], v[i], samplerID.swizzle);408u8 val = (u[i] & 1) ? (src[0] >> 4) : (src[0] & 0xF);409// Only CLUT4 uses separate mipmap palettes.410res.v[i] = LookupColor(TransformClutIndex(val, samplerID), level, samplerID);411}412return res;413414case GE_TFMT_DXT1:415for (int i = 0; i < N; ++i) {416const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);417res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);418}419return res;420421case GE_TFMT_DXT3:422for (int i = 0; i < N; ++i) {423const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);424res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);425}426return res;427428case GE_TFMT_DXT5:429for (int i = 0; i < N; ++i) {430const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);431res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);432}433return res;434435default:436ERROR_LOG_REPORT(Log::G3D, "Software: Unsupported texture format: %x", samplerID.TexFmt());437memset(res.v, 0, sizeof(res.v));438return res;439}440}441442static inline int ClampUV(int v, int height) {443if (v >= height - 1)444return height - 1;445if (v >= 511)446return 511;447else if (v < 0)448return 0;449return v;450}451452static inline int WrapUV(int v, int height) {453return v & (height - 1) & 511;454}455456template <int N>457static inline void ApplyTexelClamp(int out_u[N], int out_v[N], const int u[N], const int v[N], int width, int height, const SamplerID &samplerID) 
{458if (samplerID.clampS) {459for (int i = 0; i < N; ++i) {460out_u[i] = ClampUV(u[i], width);461}462} else {463for (int i = 0; i < N; ++i) {464out_u[i] = WrapUV(u[i], width);465}466}467if (samplerID.clampT) {468for (int i = 0; i < N; ++i) {469out_v[i] = ClampUV(v[i], height);470}471} else {472for (int i = 0; i < N; ++i) {473out_v[i] = WrapUV(v[i], height);474}475}476}477478static inline void GetTexelCoordinates(int level, float s, float t, int &out_u, int &out_v, const SamplerID &samplerID) {479int width = samplerID.cached.sizes[level].w;480int height = samplerID.cached.sizes[level].h;481482int base_u = (int)(s * width * 256.0f);483int base_v = (int)(t * height * 256.0f);484485base_u >>= 8;486base_v >>= 8;487488ApplyTexelClamp<1>(&out_u, &out_v, &base_u, &base_v, width, height, samplerID);489}490491Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in, const SamplerID &samplerID) {492const Vec4<int> prim_color = prim_color_in;493const Vec4<int> texcolor = texcolor_in;494495Vec3<int> out_rgb;496int out_a;497498bool rgba = samplerID.useTextureAlpha;499500switch (samplerID.TexFunc()) {501case GE_TEXFUNC_MODULATE:502{503#if defined(_M_SSE)504// Modulate weights slightly on the tex color, by adding one to prim and dividing by 256.505const __m128i p = _mm_slli_epi16(_mm_packs_epi32(prim_color.ivec, prim_color.ivec), 4);506const __m128i pboost = _mm_add_epi16(p, _mm_set1_epi16(1 << 4));507__m128i t = _mm_slli_epi16(_mm_packs_epi32(texcolor.ivec, texcolor.ivec), 4);508if (samplerID.useColorDoubling) {509const __m128i amask = _mm_set_epi16(-1, 0, 0, 0, -1, 0, 0, 0);510const __m128i a = _mm_and_si128(t, amask);511const __m128i rgb = _mm_andnot_si128(amask, t);512t = _mm_or_si128(_mm_slli_epi16(rgb, 1), a);513}514const __m128i b = _mm_mulhi_epi16(pboost, t);515out_rgb.ivec = _mm_unpacklo_epi16(b, _mm_setzero_si128());516517if (rgba) {518return ToVec4IntResult(Vec4<int>(out_rgb.ivec));519} else {520out_a = 
prim_color.a();521}522#elif PPSSPP_ARCH(ARM64_NEON)523int32x4_t pboost = vaddq_s32(prim_color.ivec, vdupq_n_s32(1));524int32x4_t t = texcolor.ivec;525if (samplerID.useColorDoubling) {526static const int32_t rgbDouble[4] = { 1, 1, 1, 0 };527t = vshlq_s32(t, vld1q_s32(rgbDouble));528}529out_rgb.ivec = vshrq_n_s32(vmulq_s32(pboost, t), 8);530531if (rgba) {532return ToVec4IntResult(Vec4<int>(out_rgb.ivec));533}534out_a = prim_color.a();535#else536if (samplerID.useColorDoubling) {537out_rgb = ((prim_color.rgb() + Vec3<int>::AssignToAll(1)) * texcolor.rgb() * 2) / 256;538} else {539out_rgb = (prim_color.rgb() + Vec3<int>::AssignToAll(1)) * texcolor.rgb() / 256;540}541out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();542#endif543break;544}545546case GE_TEXFUNC_DECAL:547if (rgba) {548int t = texcolor.a();549int invt = 255 - t;550// Both colors are boosted here, making the alpha have more weight.551Vec3<int> one = Vec3<int>::AssignToAll(1);552out_rgb = ((prim_color.rgb() + one) * invt + (texcolor.rgb() + one) * t);553// Keep the bits of accuracy when doubling.554if (samplerID.useColorDoubling)555out_rgb /= 128;556else557out_rgb /= 256;558} else {559if (samplerID.useColorDoubling)560out_rgb = texcolor.rgb() * 2;561else562out_rgb = texcolor.rgb();563}564out_a = prim_color.a();565break;566567case GE_TEXFUNC_BLEND:568{569const Vec3<int> const255(255, 255, 255);570const Vec3<int> texenv = Vec3<int>::FromRGB(samplerID.cached.texBlendColor);571572// Unlike the others (and even alpha), this one simply always rounds up.573const Vec3<int> roundup = Vec3<int>::AssignToAll(255);574out_rgb = ((const255 - texcolor.rgb()) * prim_color.rgb() + texcolor.rgb() * texenv + roundup);575// Must divide by less to keep the precision for doubling to be accurate.576if (samplerID.useColorDoubling)577out_rgb /= 128;578else579out_rgb /= 256;580581out_a = (rgba) ? 
((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();582break;583}584585case GE_TEXFUNC_REPLACE:586out_rgb = texcolor.rgb();587// Doubling even happens for replace.588if (samplerID.useColorDoubling)589out_rgb *= 2;590out_a = (rgba) ? texcolor.a() : prim_color.a();591break;592593case GE_TEXFUNC_ADD:594case GE_TEXFUNC_UNKNOWN1:595case GE_TEXFUNC_UNKNOWN2:596case GE_TEXFUNC_UNKNOWN3:597// Don't need to clamp afterward, we always clamp before tests.598out_rgb = prim_color.rgb() + texcolor.rgb();599if (samplerID.useColorDoubling)600out_rgb *= 2;601602// Alpha is still blended the common way.603out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();604break;605}606607return ToVec4IntResult(Vec4<int>(out_rgb, out_a));608}609610static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID) {611int u, v;612613// Nearest filtering only. Round texcoords.614GetTexelCoordinates(level, s, t, u, v, samplerID);615Vec4<int> c0 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[0], bufw[0], level, samplerID).v[0]);616617if (levelFrac) {618GetTexelCoordinates(level + 1, s, t, u, v, samplerID);619Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);620621c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;622}623624return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);625}626627static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID) {628Nearest4 c = SampleNearest<1>(&u, &v, tptr, bufw, level, samplerID);629return ToVec4IntResult(Vec4<int>::FromRGBA(c.v[0]));630}631632static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuad(bool clamp, Vec4IntArg vec, int width) {633Vec4<int> result = vec;634#ifdef _M_SSE635if (clamp) {636// First, clamp to zero.637__m128i negmask = 
_mm_cmpgt_epi32(_mm_setzero_si128(), result.ivec);638result.ivec = _mm_andnot_si128(negmask, result.ivec);639640// Now the high bound.641__m128i bound = _mm_set1_epi32(width > 512 ? 511 : width - 1);642__m128i goodmask = _mm_cmpgt_epi32(bound, result.ivec);643// Clear the ones that were too high, then or in the high bound to those.644result.ivec = _mm_and_si128(goodmask, result.ivec);645result.ivec = _mm_or_si128(result.ivec, _mm_andnot_si128(goodmask, bound));646} else {647result.ivec = _mm_and_si128(result.ivec, _mm_set1_epi32((width - 1) & 511));648}649#elif PPSSPP_ARCH(ARM64_NEON)650if (clamp) {651// Let's start by clamping to the maximum.652result.ivec = vminq_s32(result.ivec, vdupq_n_s32(width > 512 ? 511 : width - 1));653// And then to zero.654result.ivec = vmaxq_s32(result.ivec, vdupq_n_s32(0));655} else {656result.ivec = vandq_s32(result.ivec, vdupq_n_s32((width - 1) & 511));657}658#else659if (clamp) {660for (int i = 0; i < 4; ++i) {661result[i] = ClampUV(result[i], width);662}663} else {664for (int i = 0; i < 4; ++i) {665result[i] = WrapUV(result[i], width);666}667}668#endif669670return ToVec4IntResult(result);671}672673static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuadS(bool clamp, int u, int width) {674#ifdef _M_SSE675__m128i uvec = _mm_add_epi32(_mm_set1_epi32(u), _mm_set_epi32(1, 0, 1, 0));676return ApplyTexelClampQuad(clamp, uvec, width);677#elif PPSSPP_ARCH(ARM64_NEON)678static const int32_t u2[4] = { 0, 1, 0, 1 };679int32x4_t uvec = vaddq_s32(vdupq_n_s32(u), vld1q_s32(u2));680return ApplyTexelClampQuad(clamp, uvec, width);681#else682Vec4<int> result = Vec4<int>::AssignToAll(u) + Vec4<int>(0, 1, 0, 1);683return ApplyTexelClampQuad(clamp, ToVec4IntArg(result), width);684#endif685}686687static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuadT(bool clamp, int v, int height) {688#ifdef _M_SSE689__m128i vvec = _mm_add_epi32(_mm_set1_epi32(v), _mm_set_epi32(1, 1, 0, 0));690return ApplyTexelClampQuad(clamp, vvec, height);691#elif 
PPSSPP_ARCH(ARM64_NEON)692static const int32_t v2[4] = { 0, 0, 1, 1 };693int32x4_t vvec = vaddq_s32(vdupq_n_s32(v), vld1q_s32(v2));694return ApplyTexelClampQuad(clamp, vvec, height);695#else696Vec4<int> result = Vec4<int>::AssignToAll(v) + Vec4<int>(0, 0, 1, 1);697return ApplyTexelClampQuad(clamp, ToVec4IntArg(result), height);698#endif699}700701static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadS(int level, float in_s, int &frac_u, const SamplerID &samplerID) {702int width = samplerID.cached.sizes[level].w;703704int base_u = (int)(in_s * width * 256) - 128;705frac_u = (int)(base_u >> 4) & 0x0F;706base_u >>= 8;707708// Need to generate and individually wrap/clamp the four sample coordinates. Ugh.709return ApplyTexelClampQuadS(samplerID.clampS, base_u, width);710}711712static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadT(int level, float in_t, int &frac_v, const SamplerID &samplerID) {713int height = samplerID.cached.sizes[level].h;714715int base_v = (int)(in_t * height * 256) - 128;716frac_v = (int)(base_v >> 4) & 0x0F;717base_v >>= 8;718719// Need to generate and individually wrap/clamp the four sample coordinates. 
Ugh.720return ApplyTexelClampQuadT(samplerID.clampT, base_v, height);721}722723static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8 *const *tptr, const uint16_t *bufw, int texlevel, const SamplerID &samplerID) {724int frac_u, frac_v;725const Vec4<int> u = GetTexelCoordinatesQuadS(texlevel, s, frac_u, samplerID);726const Vec4<int> v = GetTexelCoordinatesQuadT(texlevel, t, frac_v, samplerID);727Nearest4 c = SampleNearest<4>(u.AsArray(), v.AsArray(), tptr[0], bufw[0], texlevel, samplerID);728#ifdef _M_SSE729__m128i zero = _mm_setzero_si128();730__m128i samples = _mm_loadu_si128((const __m128i*)(c.v));731__m128i top = _mm_unpacklo_epi8(samples, zero);732__m128i bot = _mm_unpackhi_epi8(samples, zero);733// I just a want reasonably efficient734// __m128i mul_u = _mm_setr_epi16(0x10 - frac_u, 0x10 - frac_u, 0x10 - frac_u, 0x10 - frac_u, frac_u, frac_u, frac_u, frac_u);735// GCC/clang do something decent for that, MSVC - not so much.736// Hence this. (0x10 - frac_u) is expressed as (frac_u ^ 0xF) + 1,737// which REQUIRES 0 <= frac_u < 0x10.738__m128i mul_u = _mm_set1_epi16(frac_u);739mul_u = _mm_xor_si128(mul_u, _mm_setr_epi16(0xF, 0xF, 0xF, 0xF, 0x0, 0x0, 0x0, 0x0));740mul_u = _mm_add_epi16(mul_u, _mm_setr_epi16(0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0));741top = _mm_mullo_epi16(top, _mm_set1_epi16(0x10 - frac_v));742bot = _mm_mullo_epi16(bot, _mm_set1_epi16(frac_v));743__m128i sum = _mm_add_epi16(top, bot);744sum = _mm_mullo_epi16(sum, mul_u);745sum = _mm_add_epi16(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(3, 2, 3, 2)));746sum = _mm_srli_epi16(sum, 8);747sum = _mm_unpacklo_epi16(sum, zero);748return sum;749#else750Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);751Vec4<int> texcolor_tr = Vec4<int>::FromRGBA(c.v[1]);752Vec4<int> texcolor_bl = Vec4<int>::FromRGBA(c.v[2]);753Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);754Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;755Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + 
texcolor_br * frac_u;756return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> (4 + 4));757#endif758}759760static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {761Vec4<int> c0 = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);762if (levelFrac) {763const Vec4<int> c1 = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);764c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;765}766return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);767}768769};770771772