CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Data/Convert/ColorConv.cpp
Views: 1401
// Copyright (c) 2015- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"18#include "Common/Data/Convert/ColorConv.h"19#include "Common/Data/Convert/SmallDataConvert.h"20#include "Common/Common.h"21#include "Common/CPUDetect.h"2223#ifdef _M_SSE24#include <emmintrin.h>25#include <smmintrin.h>26#endif2728#if PPSSPP_ARCH(ARM_NEON)29#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)30#include <arm64_neon.h>31#else32#include <arm_neon.h>33#endif34#endif3536void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) {37#ifdef _M_SSE38const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);3940const __m128i *srcp = (const __m128i *)src;41__m128i *dstp = (__m128i *)dst;42u32 sseChunks = numPixels / 4;43if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {44sseChunks = 0;45}46for (u32 i = 0; i < sseChunks; ++i) {47__m128i c = _mm_load_si128(&srcp[i]);48__m128i rb = _mm_andnot_si128(maskGA, c);49c = _mm_and_si128(c, maskGA);5051__m128i b = _mm_srli_epi32(rb, 16);52__m128i r = _mm_slli_epi32(rb, 16);53c = _mm_or_si128(_mm_or_si128(c, r), b);54_mm_store_si128(&dstp[i], c);55}56// The remainder starts right after those done via SSE.57u32 i = sseChunks * 4;58#else59u32 i = 0;60#endif61for (; i < numPixels; i++) {62const u32 c = src[i];63dst[i] = ((c >> 16) & 0x000000FF) |64(c & 
0xFF00FF00) |65((c << 16) & 0x00FF0000);66}67}6869void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {70for (uint32_t x = 0; x < numPixels; ++x) {71uint32_t c = src[x];72dst[x * 3 + 0] = (c >> 16) & 0xFF;73dst[x * 3 + 1] = (c >> 8) & 0xFF;74dst[x * 3 + 2] = (c >> 0) & 0xFF;75}76}7778#if defined(_M_SSE)79#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)80[[gnu::target("sse4.1")]]81#endif82static inline void ConvertRGBA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {83const __m128i maskAG = _mm_set1_epi32(0x8000F800);84const __m128i maskRB = _mm_set1_epi32(0x00F800F8);85const __m128i mask = _mm_set1_epi32(0x0000FFFF);8687for (u32 i = 0; i < sseChunks; i += 2) {88__m128i c1 = _mm_load_si128(&srcp[i + 0]);89__m128i c2 = _mm_load_si128(&srcp[i + 1]);90__m128i ag, rb;9192ag = _mm_and_si128(c1, maskAG);93ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));94rb = _mm_and_si128(c1, maskRB);95rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));96c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);9798ag = _mm_and_si128(c2, maskAG);99ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));100rb = _mm_and_si128(c2, maskRB);101rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));102c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);103104_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));105}106}107#endif108109void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {110#if defined(_M_SSE)111const __m128i *srcp = (const __m128i *)src;112__m128i *dstp = (__m128i *)dst;113u32 sseChunks = (numPixels / 4) & ~1;114// SSE 4.1 required for _mm_packus_epi32.115if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {116sseChunks = 0;117} else {118ConvertRGBA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);119}120121// The remainder starts right after those done via SSE.122u32 i = sseChunks * 4;123#else124u32 i = 0;125#endif126for (; i < numPixels; i++) 
{127dst[i] = RGBA8888toRGBA5551(src[i]);128}129}130131#if defined(_M_SSE)132#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)133[[gnu::target("sse4.1")]]134#endif135static inline void ConvertBGRA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {136const __m128i maskAG = _mm_set1_epi32(0x8000F800);137const __m128i maskRB = _mm_set1_epi32(0x00F800F8);138const __m128i mask = _mm_set1_epi32(0x0000FFFF);139140for (u32 i = 0; i < sseChunks; i += 2) {141__m128i c1 = _mm_load_si128(&srcp[i + 0]);142__m128i c2 = _mm_load_si128(&srcp[i + 1]);143__m128i ag, rb;144145ag = _mm_and_si128(c1, maskAG);146ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));147rb = _mm_and_si128(c1, maskRB);148rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));149c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);150151ag = _mm_and_si128(c2, maskAG);152ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));153rb = _mm_and_si128(c2, maskRB);154rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));155c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);156157_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));158}159}160#endif161162void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {163#if defined(_M_SSE)164const __m128i *srcp = (const __m128i *)src;165__m128i *dstp = (__m128i *)dst;166u32 sseChunks = (numPixels / 4) & ~1;167// SSE 4.1 required for _mm_packus_epi32.168if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {169sseChunks = 0;170} else {171ConvertBGRA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);172}173174// The remainder starts right after those done via SSE.175u32 i = sseChunks * 4;176#else177u32 i = 0;178#endif179for (; i < numPixels; i++) {180dst[i] = BGRA8888toRGBA5551(src[i]);181}182}183184void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {185for (u32 i = 0; i < numPixels; i++) {186dst[i] = BGRA8888toRGB565(src[i]);187}188}189190void 
ConvertBGRA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {191for (u32 i = 0; i < numPixels; i++) {192dst[i] = BGRA8888toRGBA4444(src[i]);193}194}195196void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {197for (u32 x = 0; x < numPixels; ++x) {198dst[x] = RGBA8888toRGB565(src[x]);199}200}201202void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {203for (u32 x = 0; x < numPixels; ++x) {204dst[x] = RGBA8888toRGBA4444(src[x]);205}206}207208void ConvertRGBA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {209for (uint32_t x = 0; x < numPixels; ++x) {210memcpy(dst + x * 3, src + x, 3);211}212}213214void ConvertRGB565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {215#ifdef _M_SSE216const __m128i mask5 = _mm_set1_epi16(0x001f);217const __m128i mask6 = _mm_set1_epi16(0x003f);218const __m128i mask8 = _mm_set1_epi16(0x00ff);219220const __m128i *srcp = (const __m128i *)src;221__m128i *dstp = (__m128i *)dst32;222u32 sseChunks = numPixels / 8;223if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {224sseChunks = 0;225}226for (u32 i = 0; i < sseChunks; ++i) {227const __m128i c = _mm_load_si128(&srcp[i]);228229// Swizzle, resulting in RR00 RR00.230__m128i r = _mm_and_si128(c, mask5);231r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));232r = _mm_and_si128(r, mask8);233234// This one becomes 00GG 00GG.235__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask6);236g = _mm_or_si128(_mm_slli_epi16(g, 2), _mm_srli_epi16(g, 4));237g = _mm_slli_epi16(g, 8);238239// Almost done, we aim for BB00 BB00 again here.240__m128i b = _mm_and_si128(_mm_srli_epi16(c, 11), mask5);241b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));242b = _mm_and_si128(b, mask8);243244// Always set alpha to 00FF 00FF.245__m128i a = _mm_slli_epi16(mask8, 8);246247// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.248const __m128i rg = _mm_or_si128(r, g);249const __m128i ba = _mm_or_si128(b, a);250_mm_store_si128(&dstp[i * 
2 + 0], _mm_unpacklo_epi16(rg, ba));251_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));252}253u32 i = sseChunks * 8;254#else255u32 i = 0;256#endif257258u8 *dst = (u8 *)dst32;259for (u32 x = i; x < numPixels; x++) {260u16 col = src[x];261dst[x * 4] = Convert5To8((col) & 0x1f);262dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);263dst[x * 4 + 2] = Convert5To8((col >> 11) & 0x1f);264dst[x * 4 + 3] = 255;265}266}267268void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {269#ifdef _M_SSE270const __m128i mask5 = _mm_set1_epi16(0x001f);271const __m128i mask8 = _mm_set1_epi16(0x00ff);272273const __m128i *srcp = (const __m128i *)src;274__m128i *dstp = (__m128i *)dst32;275u32 sseChunks = numPixels / 8;276if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {277sseChunks = 0;278}279for (u32 i = 0; i < sseChunks; ++i) {280const __m128i c = _mm_load_si128(&srcp[i]);281282// Swizzle, resulting in RR00 RR00.283__m128i r = _mm_and_si128(c, mask5);284r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));285r = _mm_and_si128(r, mask8);286287// This one becomes 00GG 00GG.288__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask5);289g = _mm_or_si128(_mm_slli_epi16(g, 3), _mm_srli_epi16(g, 2));290g = _mm_slli_epi16(g, 8);291292// Almost done, we aim for BB00 BB00 again here.293__m128i b = _mm_and_si128(_mm_srli_epi16(c, 10), mask5);294b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));295b = _mm_and_si128(b, mask8);296297// 1 bit A to 00AA 00AA.298__m128i a = _mm_srai_epi16(c, 15);299a = _mm_slli_epi16(a, 8);300301// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.302const __m128i rg = _mm_or_si128(r, g);303const __m128i ba = _mm_or_si128(b, a);304_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));305_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));306}307u32 i = sseChunks * 8;308#else309u32 i = 0;310#endif311312u8 *dst = (u8 *)dst32;313for (u32 x = i; x < numPixels; x++) {314u16 col = 
src[x];315dst[x * 4] = Convert5To8((col) & 0x1f);316dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f);317dst[x * 4 + 2] = Convert5To8((col >> 10) & 0x1f);318dst[x * 4 + 3] = (col >> 15) ? 255 : 0;319}320}321322void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {323#ifdef _M_SSE324const __m128i mask4 = _mm_set1_epi16(0x000f);325326const __m128i *srcp = (const __m128i *)src;327__m128i *dstp = (__m128i *)dst32;328u32 sseChunks = numPixels / 8;329if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {330sseChunks = 0;331}332for (u32 i = 0; i < sseChunks; ++i) {333const __m128i c = _mm_load_si128(&srcp[i]);334335// Let's just grab R000 R000, without swizzling yet.336__m128i r = _mm_and_si128(c, mask4);337// And then 00G0 00G0.338__m128i g = _mm_and_si128(_mm_srli_epi16(c, 4), mask4);339g = _mm_slli_epi16(g, 8);340// Now B000 B000.341__m128i b = _mm_and_si128(_mm_srli_epi16(c, 8), mask4);342// And lastly 00A0 00A0. No mask needed, we have a wall.343__m128i a = _mm_srli_epi16(c, 12);344a = _mm_slli_epi16(a, 8);345346// We swizzle after combining - R0G0 R0G0 and B0A0 B0A0 -> RRGG RRGG and BBAA BBAA.347__m128i rg = _mm_or_si128(r, g);348__m128i ba = _mm_or_si128(b, a);349rg = _mm_or_si128(rg, _mm_slli_epi16(rg, 4));350ba = _mm_or_si128(ba, _mm_slli_epi16(ba, 4));351352// And then we can store.353_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));354_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));355}356u32 i = sseChunks * 8;357#else358u32 i = 0;359#endif360361u8 *dst = (u8 *)dst32;362for (u32 x = i; x < numPixels; x++) {363u16 col = src[x];364dst[x * 4] = Convert4To8(col & 0xf);365dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf);366dst[x * 4 + 2] = Convert4To8((col >> 8) & 0xf);367dst[x * 4 + 3] = Convert4To8(col >> 12);368}369}370371void ConvertBGR565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {372u8 *dst = (u8 *)dst32;373for (u32 x = 0; x < numPixels; x++) {374u16 col = src[x];375dst[x * 4] = Convert5To8((col >> 11) & 
0x1f);376dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);377dst[x * 4 + 2] = Convert5To8((col) & 0x1f);378dst[x * 4 + 3] = 255;379}380}381382void ConvertABGR1555ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {383u8 *dst = (u8 *)dst32;384for (u32 x = 0; x < numPixels; x++) {385u16 col = src[x];386dst[x * 4] = Convert5To8((col >> 11) & 0x1f);387dst[x * 4 + 1] = Convert5To8((col >> 6) & 0x1f);388dst[x * 4 + 2] = Convert5To8((col >> 1) & 0x1f);389dst[x * 4 + 3] = (col & 1) ? 255 : 0;390}391}392393void ConvertABGR4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {394u8 *dst = (u8 *)dst32;395for (u32 x = 0; x < numPixels; x++) {396u16 col = src[x];397dst[x * 4] = Convert4To8(col >> 12);398dst[x * 4 + 1] = Convert4To8((col >> 8) & 0xf);399dst[x * 4 + 2] = Convert4To8((col >> 4) & 0xf);400dst[x * 4 + 3] = Convert4To8(col & 0xf);401}402}403404void ConvertRGBA4444ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {405for (u32 x = 0; x < numPixels; x++) {406u16 c = src[x];407u32 r = Convert4To8(c & 0x000f);408u32 g = Convert4To8((c >> 4) & 0x000f);409u32 b = Convert4To8((c >> 8) & 0x000f);410u32 a = Convert4To8((c >> 12) & 0x000f);411412dst[x] = (a << 24) | (r << 16) | (g << 8) | b;413}414}415416void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {417for (u32 x = 0; x < numPixels; x++) {418u16 c = src[x];419u32 r = Convert5To8(c & 0x001f);420u32 g = Convert5To8((c >> 5) & 0x001f);421u32 b = Convert5To8((c >> 10) & 0x001f);422// We force an arithmetic shift to get the sign bits.423u32 a = SignExtend16ToU32(c) & 0xff000000;424425dst[x] = a | (r << 16) | (g << 8) | b;426}427}428429void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {430for (u32 x = 0; x < numPixels; x++) {431u16 c = src[x];432u32 r = Convert5To8(c & 0x001f);433u32 g = Convert6To8((c >> 5) & 0x003f);434u32 b = Convert5To8((c >> 11) & 0x001f);435436dst[x] = 0xFF000000 | (r << 16) | (g << 8) | b;437}438}439440void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 
*src, u32 numPixels) {441#ifdef _M_SSE442const __m128i mask0040 = _mm_set1_epi16(0x00F0);443444const __m128i *srcp = (const __m128i *)src;445__m128i *dstp = (__m128i *)dst;446u32 sseChunks = numPixels / 8;447if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {448sseChunks = 0;449}450for (u32 i = 0; i < sseChunks; ++i) {451const __m128i c = _mm_load_si128(&srcp[i]);452__m128i v = _mm_srli_epi16(c, 12);453v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 4), mask0040));454v = _mm_or_si128(v, _mm_slli_epi16(_mm_and_si128(c, mask0040), 4));455v = _mm_or_si128(v, _mm_slli_epi16(c, 12));456_mm_store_si128(&dstp[i], v);457}458// The remainder is done in chunks of 2, SSE was chunks of 8.459u32 i = sseChunks * 8 / 2;460#elif PPSSPP_ARCH(ARM_NEON)461const uint16x8_t mask0040 = vdupq_n_u16(0x00F0);462463if (((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0) {464u32 simdable = (numPixels / 8) * 8;465for (u32 i = 0; i < simdable; i += 8) {466uint16x8_t c = vld1q_u16(src);467468const uint16x8_t a = vshrq_n_u16(c, 12);469const uint16x8_t b = vandq_u16(vshrq_n_u16(c, 4), mask0040);470const uint16x8_t g = vshlq_n_u16(vandq_u16(c, mask0040), 4);471const uint16x8_t r = vshlq_n_u16(c, 12);472473uint16x8_t res = vorrq_u16(vorrq_u16(r, g), vorrq_u16(b, a));474vst1q_u16(dst, res);475476src += 8;477dst += 8;478}479numPixels -= simdable;480}481u32 i = 0; // already moved the pointers forward482#else483u32 i = 0;484#endif485486const u32 *src32 = (const u32 *)src;487u32 *dst32 = (u32 *)dst;488for (; i < numPixels / 2; i++) {489const u32 c = src32[i];490dst32[i] = ((c >> 12) & 0x000F000F) |491((c >> 4) & 0x00F000F0) |492((c << 4) & 0x0F000F00) |493((c << 12) & 0xF000F000);494}495496if (numPixels & 1) {497const u32 i = numPixels - 1;498const u16 c = src[i];499dst[i] = ((c >> 12) & 0x000F) |500((c >> 4) & 0x00F0) |501((c << 4) & 0x0F00) |502((c << 12) & 0xF000);503}504}505506void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {507#ifdef _M_SSE508const __m128i maskB 
= _mm_set1_epi16(0x003E);509const __m128i maskG = _mm_set1_epi16(0x07C0);510511const __m128i *srcp = (const __m128i *)src;512__m128i *dstp = (__m128i *)dst;513u32 sseChunks = numPixels / 8;514if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {515sseChunks = 0;516}517for (u32 i = 0; i < sseChunks; ++i) {518const __m128i c = _mm_load_si128(&srcp[i]);519__m128i v = _mm_srli_epi16(c, 15);520v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 9), maskB));521v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 1), maskG));522v = _mm_or_si128(v, _mm_slli_epi16(c, 11));523_mm_store_si128(&dstp[i], v);524}525// The remainder is done in chunks of 2, SSE was chunks of 8.526u32 i = sseChunks * 8 / 2;527#elif PPSSPP_ARCH(ARM_NEON)528const uint16x8_t maskB = vdupq_n_u16(0x003E);529const uint16x8_t maskG = vdupq_n_u16(0x07C0);530531if (((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0) {532u32 simdable = (numPixels / 8) * 8;533for (u32 i = 0; i < simdable; i += 8) {534uint16x8_t c = vld1q_u16(src);535536const uint16x8_t a = vshrq_n_u16(c, 15);537const uint16x8_t b = vandq_u16(vshrq_n_u16(c, 9), maskB);538const uint16x8_t g = vandq_u16(vshlq_n_u16(c, 1), maskG);539const uint16x8_t r = vshlq_n_u16(c, 11);540541uint16x8_t res = vorrq_u16(vorrq_u16(r, g), vorrq_u16(b, a));542vst1q_u16(dst, res);543544src += 8;545dst += 8;546}547numPixels -= simdable;548}549u32 i = 0;550#else551u32 i = 0;552#endif553554const u32 *src32 = (const u32 *)src;555u32 *dst32 = (u32 *)dst;556for (; i < numPixels / 2; i++) {557const u32 c = src32[i];558dst32[i] = ((c >> 15) & 0x00010001) |559((c >> 9) & 0x003E003E) |560((c << 1) & 0x07C007C0) |561((c << 11) & 0xF800F800);562}563564if (numPixels & 1) {565const u32 i = numPixels - 1;566const u16 c = src[i];567dst[i] = ((c >> 15) & 0x0001) |568((c >> 9) & 0x003E) |569((c << 1) & 0x07C0) |570((c << 11) & 0xF800);571}572}573574void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, u32 numPixels) {575#ifdef _M_SSE576const __m128i maskG = 
_mm_set1_epi16(0x07E0);577578const __m128i *srcp = (const __m128i *)src;579__m128i *dstp = (__m128i *)dst;580u32 sseChunks = numPixels / 8;581if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {582sseChunks = 0;583}584for (u32 i = 0; i < sseChunks; ++i) {585const __m128i c = _mm_load_si128(&srcp[i]);586__m128i v = _mm_srli_epi16(c, 11);587v = _mm_or_si128(v, _mm_and_si128(c, maskG));588v = _mm_or_si128(v, _mm_slli_epi16(c, 11));589_mm_store_si128(&dstp[i], v);590}591// The remainder is done in chunks of 2, SSE was chunks of 8.592u32 i = sseChunks * 8 / 2;593#elif PPSSPP_ARCH(ARM_NEON)594const uint16x8_t maskG = vdupq_n_u16(0x07E0);595596if (((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0) {597u32 simdable = (numPixels / 8) * 8;598for (u32 i = 0; i < simdable; i += 8) {599uint16x8_t c = vld1q_u16(src);600601const uint16x8_t b = vshrq_n_u16(c, 11);602const uint16x8_t g = vandq_u16(c, maskG);603const uint16x8_t r = vshlq_n_u16(c, 11);604605uint16x8_t res = vorrq_u16(vorrq_u16(r, g), b);606vst1q_u16(dst, res);607608src += 8;609dst += 8;610}611numPixels -= simdable;612}613614u32 i = 0;615#else616u32 i = 0;617#endif618619// TODO: Add a 64-bit loop too.620const u32 *src32 = (const u32 *)src;621u32 *dst32 = (u32 *)dst;622for (; i < numPixels / 2; i++) {623const u32 c = src32[i];624dst32[i] = ((c >> 11) & 0x001F001F) |625((c >> 0) & 0x07E007E0) |626((c << 11) & 0xF800F800);627}628629if (numPixels & 1) {630const u32 i = numPixels - 1;631const u16 c = src[i];632dst[i] = ((c >> 11) & 0x001F) |633((c >> 0) & 0x07E0) |634((c << 11) & 0xF800);635}636}637638void ConvertBGRA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {639const u32 *src32 = (const u32 *)src;640u32 *dst32 = (u32 *)dst;641for (u32 i = 0; i < numPixels / 2; i++) {642const u32 c = src32[i];643dst32[i] = ((c >> 15) & 0x00010001) | ((c << 1) & 0xFFFEFFFE);644}645646if (numPixels & 1) {647const u32 i = numPixels - 1;648const u16 c = src[i];649dst[i] = (c >> 15) | (c << 1);650}651}652653654