CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Common/TextureDecoder.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"1819#include "ext/xxhash.h"2021#include "Common/Common.h"22#include "Common/Data/Convert/ColorConv.h"23#include "Common/CPUDetect.h"24#include "Common/Log.h"25#include "Common/Math/CrossSIMD.h"2627#include "GPU/GPU.h"28#include "GPU/GPUState.h"29#include "GPU/Common/TextureDecoder.h"3031#ifdef _M_SSE32#include <emmintrin.h>33#include <smmintrin.h>34#endif3536#if PPSSPP_ARCH(ARM_NEON)37#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)38#include <arm64_neon.h>39#else40#include <arm_neon.h>41#endif42#endif4344const u8 textureBitsPerPixel[16] = {4516, //GE_TFMT_5650,4616, //GE_TFMT_5551,4716, //GE_TFMT_4444,4832, //GE_TFMT_8888,494, //GE_TFMT_CLUT4,508, //GE_TFMT_CLUT8,5116, //GE_TFMT_CLUT16,5232, //GE_TFMT_CLUT32,534, //GE_TFMT_DXT1,548, //GE_TFMT_DXT3,558, //GE_TFMT_DXT5,560, // INVALID,570, // INVALID,580, // INVALID,590, // INVALID,600, // INVALID,61};6263#ifdef _M_SSE6465static u32 QuickTexHashSSE2(const void *checkp, u32 size) {66u32 check = 0;6768if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {69__m128i cursor = _mm_set1_epi32(0);70__m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);71__m128i update = _mm_set1_epi16(0x2455U);72const __m128i 
*p = (const __m128i *)checkp;73for (u32 i = 0; i < size / 16; i += 4) {74__m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);75cursor = _mm_add_epi16(cursor, chunk);76cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));77cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));78chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);79cursor = _mm_xor_si128(cursor, chunk);80cursor2 = _mm_add_epi16(cursor2, update);81}82cursor = _mm_add_epi32(cursor, cursor2);83// Add the four parts into the low i32.84cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));85cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));86check = _mm_cvtsi128_si32(cursor);87} else {88const u32 *p = (const u32 *)checkp;89for (u32 i = 0; i < size / 8; ++i) {90check += *p++;91check ^= *p++;92}93}9495return check;96}97#endif9899#if PPSSPP_ARCH(ARM_NEON)100101alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };102103static u32 QuickTexHashNEON(const void *checkp, u32 size) {104u32 check = 0;105106if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {107#if PPSSPP_PLATFORM(IOS) || PPSSPP_ARCH(ARM64) || defined(_MSC_VER) || !PPSSPP_ARCH(ARMV7)108uint32x4_t cursor = vdupq_n_u32(0);109uint16x8_t cursor2 = vld1q_u16(QuickTexHashInitial);110uint16x8_t update = vdupq_n_u16(0x2455U);111112const u32 *p = (const u32 *)checkp;113const u32 *pend = p + size / 4;114while (p < pend) {115cursor = vreinterpretq_u32_u16(vmlaq_u16(vreinterpretq_u16_u32(cursor), vreinterpretq_u16_u32(vld1q_u32(&p[4 * 0])), cursor2));116cursor = veorq_u32(cursor, vld1q_u32(&p[4 * 1]));117cursor = vaddq_u32(cursor, vld1q_u32(&p[4 * 2]));118cursor = veorq_u32(cursor, vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(vld1q_u32(&p[4 * 3])), cursor2)));119cursor2 = vaddq_u16(cursor2, update);120121p += 4 * 4;122}123124cursor = vaddq_u32(cursor, vreinterpretq_u32_u16(cursor2));125uint32x2_t mixed = 
vadd_u32(vget_high_u32(cursor), vget_low_u32(cursor));126check = vget_lane_u32(mixed, 0) + vget_lane_u32(mixed, 1);127#else128// TODO: Why does this crash on iOS, but only certain devices?129// It's faster than the above, but I guess it sucks to be using an iPhone.130// As of 2020 clang, it's still faster by ~1.4%.131132// d0/d1 (q0) - cursor133// d2/d3 (q1) - cursor2134// d4/d5 (q2) - update135// d16-d23 (q8-q11) - memory transfer136asm volatile (137// Initialize cursor.138"vmov.i32 q0, #0\n"139140// Initialize cursor2.141"movw r0, 0xc00b\n"142"movt r0, 0x9bd9\n"143"movw r1, 0x4b73\n"144"movt r1, 0xb651\n"145"vmov d2, r0, r1\n"146"movw r0, 0x4d9b\n"147"movt r0, 0x4309\n"148"movw r1, 0x0083\n"149"movt r1, 0x0001\n"150"vmov d3, r0, r1\n"151152// Initialize update.153"movw r0, 0x2455\n"154"vdup.i16 q2, r0\n"155156// This is where we end.157"add r0, %1, %2\n"158159// Okay, do the memory hashing.160"QuickTexHashNEON_next:\n"161"pld [%2, #0xc0]\n"162"vldmia %2!, {d16-d23}\n"163"vmla.i16 q0, q1, q8\n"164"vmul.i16 q11, q11, q1\n"165"veor.i32 q0, q0, q9\n"166"cmp %2, r0\n"167"vadd.i32 q0, q0, q10\n"168"vadd.i16 q1, q1, q2\n"169"veor.i32 q0, q0, q11\n"170"blo QuickTexHashNEON_next\n"171172// Now let's get the result.173"vadd.i32 q0, q0, q1\n"174"vadd.i32 d0, d0, d1\n"175"vmov r0, r1, d0\n"176"add %0, r0, r1\n"177178: "=r"(check)179: "r"(size), "r"(checkp)180: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "cc"181);182#endif183} else {184const u32 size_u32 = size / 4;185const u32 *p = (const u32 *)checkp;186for (u32 i = 0; i < size_u32; i += 4) {187check += p[i + 0];188check ^= p[i + 1];189check += p[i + 2];190check ^= p[i + 3];191}192}193194return check;195}196197#endif // PPSSPP_ARCH(ARM_NEON)198199// Masks to downalign bufw to 16 bytes, and wrap at 2048.200static const u32 textureAlignMask16[16] = {2010x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5650,2020x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5551,2030x7FF & ~(((8 * 
16) / 16) - 1), //GE_TFMT_4444,2040x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_8888,2050x7FF & ~(((8 * 16) / 4) - 1), //GE_TFMT_CLUT4,2060x7FF & ~(((8 * 16) / 8) - 1), //GE_TFMT_CLUT8,2070x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_CLUT16,2080x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_CLUT32,2090x7FF, //GE_TFMT_DXT1,2100x7FF, //GE_TFMT_DXT3,2110x7FF, //GE_TFMT_DXT5,2120, // INVALID,2130, // INVALID,2140, // INVALID,2150, // INVALID,2160, // INVALID,217};218219u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {220// This is a hack to allow for us to draw the huge PPGe texture, which is always in kernel ram.221if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())222return gstate.texbufwidth[level] & 0x1FFF;223224u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];225if (bufw == 0 && format <= GE_TFMT_DXT5) {226// If it's less than 16 bytes, use 16 bytes.227bufw = (8 * 16) / textureBitsPerPixel[format];228}229return bufw;230}231232// Matches QuickTexHashNEON/SSE, see #7029.233static u32 QuickTexHashNonSSE(const void *checkp, u32 size) {234u32 check = 0;235236if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {237static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};238union u32x4_u16x8 {239#if defined(__GNUC__)240uint32_t x32 __attribute__((vector_size(16)));241uint16_t x16 __attribute__((vector_size(16)));242#else243u32 x32[4];244u16 x16[8];245#endif246};247u32x4_u16x8 cursor{};248u32x4_u16x8 cursor2;249static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};250251for (u32 j = 0; j < 8; ++j) {252cursor2.x16[j] = cursor2_initial[j];253}254255const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;256for (u32 i = 0; i < size / 16; i += 4) {257for (u32 j = 0; j < 8; ++j) {258const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];259cursor.x16[j] += temp;260}261for (u32 j = 0; j < 4; ++j) {262cursor.x32[j] ^= p[i + 
1].x32[j];263cursor.x32[j] += p[i + 2].x32[j];264}265for (u32 j = 0; j < 8; ++j) {266const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];267cursor.x16[j] ^= temp;268}269for (u32 j = 0; j < 8; ++j) {270cursor2.x16[j] += update[j];271}272}273274for (u32 j = 0; j < 4; ++j) {275cursor.x32[j] += cursor2.x32[j];276}277check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];278} else {279const u32 *p = (const u32 *)checkp;280for (u32 i = 0; i < size / 8; ++i) {281check += *p++;282check ^= *p++;283}284}285286return check;287}288289u32 StableQuickTexHash(const void *checkp, u32 size) {290#if defined(_M_SSE)291return QuickTexHashSSE2(checkp, size);292#elif PPSSPP_ARCH(ARM_NEON)293return QuickTexHashNEON(checkp, size);294#else295return QuickTexHashNonSSE(checkp, size);296#endif297}298299void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {300// ysrcp is in 32-bits, so this is convenient.301const u32 pitchBy32 = pitch >> 2;302#ifdef _M_SSE303if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {304__m128i *dest = (__m128i *)texptr;305// The pitch parameter is in bytes, so shift down for 128-bit.306// Note: it's always aligned to 16 bytes, so this is safe.307const u32 pitchBy128 = pitch >> 4;308for (int by = 0; by < byc; by++) {309const __m128i *xsrc = (const __m128i *)ysrcp;310for (int bx = 0; bx < bxc; bx++) {311const __m128i *src = xsrc;312for (int n = 0; n < 2; n++) {313// Textures are always 16-byte aligned so this is fine.314__m128i temp1 = _mm_load_si128(src);315src += pitchBy128;316__m128i temp2 = _mm_load_si128(src);317src += pitchBy128;318__m128i temp3 = _mm_load_si128(src);319src += pitchBy128;320__m128i temp4 = _mm_load_si128(src);321src += pitchBy128;322323_mm_store_si128(dest, temp1);324_mm_store_si128(dest + 1, temp2);325_mm_store_si128(dest + 2, temp3);326_mm_store_si128(dest + 3, temp4);327dest += 4;328}329xsrc++;330}331ysrcp += pitchBy32 * 8;332}333} else334#endif335{336u32 *dest = (u32 *)texptr;337for (int by = 0; by 
< byc; by++) {338const u32 *xsrc = ysrcp;339for (int bx = 0; bx < bxc; bx++) {340const u32 *src = xsrc;341for (int n = 0; n < 8; n++) {342memcpy(dest, src, 16);343src += pitchBy32;344dest += 4;345}346xsrc += 4;347}348ysrcp += pitchBy32 * 8;349}350}351}352353void DoUnswizzleTex16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {354// ydestp is in 32-bits, so this is convenient.355const u32 pitchBy32 = pitch >> 2;356357#ifdef _M_SSE358// This check is pretty much a given, right?359if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {360const __m128i *src = (const __m128i *)texptr;361// The pitch parameter is in bytes, so shift down for 128-bit.362// Note: it's always aligned to 16 bytes, so this is safe.363const u32 pitchBy128 = pitch >> 4;364for (int by = 0; by < byc; by++) {365__m128i *xdest = (__m128i *)ydestp;366for (int bx = 0; bx < bxc; bx++) {367__m128i *dest = xdest;368for (int n = 0; n < 2; n++) {369// Textures are always 16-byte aligned so this is fine.370__m128i temp1 = _mm_load_si128(src);371__m128i temp2 = _mm_load_si128(src + 1);372__m128i temp3 = _mm_load_si128(src + 2);373__m128i temp4 = _mm_load_si128(src + 3);374_mm_store_si128(dest, temp1);375dest += pitchBy128;376_mm_store_si128(dest, temp2);377dest += pitchBy128;378_mm_store_si128(dest, temp3);379dest += pitchBy128;380_mm_store_si128(dest, temp4);381dest += pitchBy128;382src += 4;383}384xdest++;385}386ydestp += pitchBy32 * 8;387}388} else389#elif PPSSPP_ARCH(ARM_NEON)390if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {391const u32 *src = (const u32 *)texptr;392for (int by = 0; by < byc; by++) {393u32 *xdest = ydestp;394for (int bx = 0; bx < bxc; bx++) {395u32 *dest = xdest;396for (int n = 0; n < 2; n++) {397// Textures are always 16-byte aligned so this is fine.398uint32x4_t temp1 = vld1q_u32(src);399uint32x4_t temp2 = vld1q_u32(src + 4);400uint32x4_t temp3 = vld1q_u32(src + 8);401uint32x4_t temp4 = vld1q_u32(src + 12);402vst1q_u32(dest, temp1);403dest += 
pitchBy32;404vst1q_u32(dest, temp2);405dest += pitchBy32;406vst1q_u32(dest, temp3);407dest += pitchBy32;408vst1q_u32(dest, temp4);409dest += pitchBy32;410src += 16;411}412xdest += 4;413}414ydestp += pitchBy32 * 8;415}416} else417#endif418{419const u32 *src = (const u32 *)texptr;420for (int by = 0; by < byc; by++) {421u32 *xdest = ydestp;422for (int bx = 0; bx < bxc; bx++) {423u32 *dest = xdest;424for (int n = 0; n < 8; n++) {425memcpy(dest, src, 16);426dest += pitchBy32;427src += 4;428}429xdest += 4;430}431ydestp += pitchBy32 * 8;432}433}434}435436// S3TC / DXT Decoder437class DXTDecoder {438public:439inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);440inline void DecodeAlphaDXT5(const DXT5Block *src);441inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height);442inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height);443inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height);444445bool AnyNonFullAlpha() const { return anyNonFullAlpha_; }446447protected:448u32 colors_[4];449u8 alpha_[8];450bool alphaMode_ = false;451bool anyNonFullAlpha_ = false;452};453454static inline u32 makecol(int r, int g, int b, int a) {455return (a << 24) | (b << 16) | (g << 8) | r;456}457458static inline int mix_2_3(int c1, int c2) {459return (c1 + c1 + c2) / 3;460}461462// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.463void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {464u16 c1 = src->color1;465u16 c2 = src->color2;466int blue1 = (c1 << 3) & 0xF8;467int blue2 = (c2 << 3) & 0xF8;468int green1 = (c1 >> 3) & 0xFC;469int green2 = (c2 >> 3) & 0xFC;470int red1 = (c1 >> 8) & 0xF8;471int red2 = (c2 >> 8) & 0xF8;472473// Keep alpha zero for non-DXT1 to skip masking the colors.474int alpha = ignore1bitAlpha ? 
0 : 255;475476colors_[0] = makecol(red1, green1, blue1, alpha);477colors_[1] = makecol(red2, green2, blue2, alpha);478if (c1 > c2) {479colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);480colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);481} else {482// Average - these are always left shifted, so no need to worry about ties.483int red3 = (red1 + red2) / 2;484int green3 = (green1 + green2) / 2;485int blue3 = (blue1 + blue2) / 2;486colors_[2] = makecol(red3, green3, blue3, alpha);487colors_[3] = makecol(0, 0, 0, 0);488if (alpha == 255) {489alphaMode_ = true;490}491}492}493494static inline u8 lerp8(const DXT5Block *src, int n) {495// These weights multiple alpha1/alpha2 to fixed 8.8 point.496int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;497int alpha2 = (src->alpha2 * (n << 8)) / 7;498return (u8)((alpha1 + alpha2 + 31) >> 8);499}500501static inline u8 lerp6(const DXT5Block *src, int n) {502int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;503int alpha2 = (src->alpha2 * (n << 8)) / 5;504return (u8)((alpha1 + alpha2 + 31) >> 8);505}506507void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {508alpha_[0] = src->alpha1;509alpha_[1] = src->alpha2;510if (alpha_[0] > alpha_[1]) {511alpha_[2] = lerp8(src, 1);512alpha_[3] = lerp8(src, 2);513alpha_[4] = lerp8(src, 3);514alpha_[5] = lerp8(src, 4);515alpha_[6] = lerp8(src, 5);516alpha_[7] = lerp8(src, 6);517} else {518alpha_[2] = lerp6(src, 1);519alpha_[3] = lerp6(src, 2);520alpha_[4] = lerp6(src, 3);521alpha_[5] = lerp6(src, 4);522alpha_[6] = 0;523alpha_[7] = 255;524}525}526527void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height) {528bool anyColor3 = false;529for (int y = 0; y < height; y++) {530int colordata = src->lines[y];531for (int x = 0; x < width; x++) {532int col = colordata & 3;533if (col == 3) {534anyColor3 = true;535}536dst[x] = colors_[col];537colordata >>= 2;538}539dst += 
pitch;540}541542if (alphaMode_ && anyColor3) {543anyNonFullAlpha_ = true;544}545}546547void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {548for (int y = 0; y < height; y++) {549int colordata = src->color.lines[y];550u32 alphadata = src->alphaLines[y];551for (int x = 0; x < width; x++) {552dst[x] = colors_[colordata & 3] | (alphadata << 28);553colordata >>= 2;554alphadata >>= 4;555}556dst += pitch;557}558}559560void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {561// 48 bits, 3 bit index per pixel, 12 bits per line.562u64 allAlpha = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;563564for (int y = 0; y < height; y++) {565uint32_t colordata = src->color.lines[y];566uint32_t alphadata = allAlpha >> (12 * y);567for (int x = 0; x < width; x++) {568dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);569colordata >>= 2;570alphadata >>= 3;571}572dst += pitch;573}574}575576uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {577_dbg_assert_(x >= 0 && x < 4);578_dbg_assert_(y >= 0 && y < 4);579580uint16_t c1 = src->color1;581uint16_t c2 = src->color2;582int blue1 = (c1 << 3) & 0xF8;583int blue2 = (c2 << 3) & 0xF8;584int green1 = (c1 >> 3) & 0xFC;585int green2 = (c2 >> 3) & 0xFC;586int red1 = (c1 >> 8) & 0xF8;587int red2 = (c2 >> 8) & 0xF8;588589int colorIndex = (src->lines[y] >> (x * 2)) & 3;590if (colorIndex == 0) {591return makecol(red1, green1, blue1, alpha);592} else if (colorIndex == 1) {593return makecol(red2, green2, blue2, alpha);594} else if (c1 > c2) {595if (colorIndex == 2) {596return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);597}598return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);599} else if (colorIndex == 3) {600return makecol(0, 0, 0, 0);601}602603// Average - these are always left shifted, so no need to worry about ties.604int red3 = 
(red1 + red2) / 2;605int green3 = (green1 + green2) / 2;606int blue3 = (blue1 + blue2) / 2;607return makecol(red3, green3, blue3, alpha);608}609610uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {611return GetDXTTexelColor(src, x, y, 255);612}613614uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {615uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);616u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;617return color | (alpha << 28);618}619620uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {621uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);622uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;623int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;624625if (alphaIndex == 0) {626return color | (src->alpha1 << 24);627} else if (alphaIndex == 1) {628return color | (src->alpha2 << 24);629} else if (src->alpha1 > src->alpha2) {630return color | (lerp8(src, alphaIndex - 1) << 24);631} else if (alphaIndex == 6) {632return color;633} else if (alphaIndex == 7) {634return color | 0xFF000000;635}636return color | (lerp6(src, alphaIndex - 1) << 24);637}638639// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.640void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha) {641DXTDecoder dxt;642dxt.DecodeColors(src, false);643dxt.WriteColorsDXT1(dst, src, pitch, width, height);644*alpha &= dxt.AnyNonFullAlpha() ? 
0 : 1;645}646647void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {648DXTDecoder dxt;649dxt.DecodeColors(&src->color, true);650dxt.WriteColorsDXT3(dst, src, pitch, width, height);651}652653void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {654DXTDecoder dxt;655dxt.DecodeColors(&src->color, true);656dxt.DecodeAlphaDXT5(src);657dxt.WriteColorsDXT5(dst, src, pitch, width, height);658}659660#ifdef _M_SSE661inline u32 SSEReduce32And(__m128i value) {662value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));663value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)));664return _mm_cvtsi128_si32(value);665}666inline u32 SSEReduce16And(__m128i value) {667u32 mask = SSEReduce32And(value);668return mask & (mask >> 16);669}670#endif671672#if PPSSPP_ARCH(ARM_NEON)673inline u32 NEONReduce32And(uint32x4_t value) {674// TODO: Maybe a shuffle and a vector and, or something?675return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3);676}677inline u32 NEONReduce16And(uint16x8_t value) {678uint32x4_t value32 = vreinterpretq_u32_u16(value);679// TODO: Maybe a shuffle and a vector and, or something?680u32 mask = vgetq_lane_u32(value32, 0) & vgetq_lane_u32(value32, 1) & vgetq_lane_u32(value32, 2) & vgetq_lane_u32(value32, 3);681return mask & (mask >> 16);682}683#endif684685// TODO: SSE/SIMD686// At least on x86, compiler actually SIMDs these pretty well.687void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) {688u16 mask = 0xFFFF;689#ifdef _M_SSE690if (width >= 8) {691__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);692while (width >= 8) {693__m128i color = _mm_loadu_si128((__m128i *)src);694wideMask = _mm_and_si128(wideMask, color);695_mm_storeu_si128((__m128i *)dst, color);696src += 8;697dst += 8;698width -= 8;699}700mask = SSEReduce16And(wideMask);701}702#elif PPSSPP_ARCH(ARM_NEON)703if (width 
>= 8) {704uint16x8_t wideMask = vdupq_n_u16(0xFFFF);705while (width >= 8) {706uint16x8_t colors = vld1q_u16(src);707wideMask = vandq_u16(wideMask, colors);708vst1q_u16(dst, colors);709src += 8;710dst += 8;711width -= 8;712}713mask = NEONReduce16And(wideMask);714}715#endif716717DO_NOT_VECTORIZE_LOOP718for (int i = 0; i < width; i++) {719u16 color = src[i];720mask &= color;721dst[i] = color;722}723*outMask &= (u32)mask;724}725726// Used in video playback so nice to have being fast.727void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) {728u32 mask = 0xFFFFFFFF;729#ifdef _M_SSE730if (width >= 4) {731__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);732while (width >= 4) {733__m128i color = _mm_loadu_si128((__m128i *)src);734wideMask = _mm_and_si128(wideMask, color);735_mm_storeu_si128((__m128i *)dst, color);736src += 4;737dst += 4;738width -= 4;739}740mask = SSEReduce32And(wideMask);741}742#elif PPSSPP_ARCH(ARM_NEON)743if (width >= 4) {744uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);745while (width >= 4) {746uint32x4_t colors = vld1q_u32(src);747wideMask = vandq_u32(wideMask, colors);748vst1q_u32(dst, colors);749src += 4;750dst += 4;751width -= 4;752}753mask = NEONReduce32And(wideMask);754}755#endif756757DO_NOT_VECTORIZE_LOOP758for (int i = 0; i < width; i++) {759u32 color = src[i];760mask &= color;761dst[i] = color;762}763*outMask &= (u32)mask;764}765766void CheckMask16(const u16 *src, int width, u32 *outMask) {767u16 mask = 0xFFFF;768#ifdef _M_SSE769if (width >= 8) {770__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);771while (width >= 8) {772wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));773src += 8;774width -= 8;775}776mask = SSEReduce16And(wideMask);777}778#elif PPSSPP_ARCH(ARM_NEON)779if (width >= 8) {780uint16x8_t wideMask = vdupq_n_u16(0xFFFF);781while (width >= 8) {782wideMask = vandq_u16(wideMask, vld1q_u16(src));783src += 8;784width -= 8;785}786mask = 
NEONReduce16And(wideMask);787}788#endif789790DO_NOT_VECTORIZE_LOOP791for (int i = 0; i < width; i++) {792mask &= src[i];793}794*outMask &= (u32)mask;795}796797void CheckMask32(const u32 *src, int width, u32 *outMask) {798u32 mask = 0xFFFFFFFF;799#ifdef _M_SSE800if (width >= 4) {801__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);802while (width >= 4) {803wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));804src += 4;805width -= 4;806}807mask = SSEReduce32And(wideMask);808}809#elif PPSSPP_ARCH(ARM_NEON)810if (width >= 4) {811uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);812while (width >= 4) {813wideMask = vandq_u32(wideMask, vld1q_u32(src));814src += 4;815width -= 4;816}817mask = NEONReduce32And(wideMask);818}819#endif820821DO_NOT_VECTORIZE_LOOP822for (int i = 0; i < width; i++) {823mask &= src[i];824}825*outMask &= (u32)mask;826}827828829