GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/TextureDecoder.cpp
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"

#include "ext/xxhash.h"

#include "Common/Common.h"
#include "Common/Data/Convert/ColorConv.h"
#include "Common/CPUDetect.h"
#include "Common/Log.h"
#include "Common/Math/CrossSIMD.h"

#include "GPU/GPU.h"
#include "GPU/GPUState.h"
#include "GPU/Common/TextureDecoder.h"

#ifdef _M_SSE
#include <emmintrin.h>
#include <smmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

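// Bits per pixel for each GETextureFormat. Zero marks the invalid formats.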
const u8 textureBitsPerPixel[16] = {
	16,  //GE_TFMT_5650,
	16,  //GE_TFMT_5551,
	16,  //GE_TFMT_4444,
	32,  //GE_TFMT_8888,
	4,   //GE_TFMT_CLUT4,
	8,   //GE_TFMT_CLUT8,
	16,  //GE_TFMT_CLUT16,
	32,  //GE_TFMT_CLUT32,
	4,   //GE_TFMT_DXT1,
	8,   //GE_TFMT_DXT3,
	8,   //GE_TFMT_DXT5,
	0,   // INVALID,
	0,   // INVALID,
	0,   // INVALID,
	0,   // INVALID,
	0,   // INVALID,
};

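// SSE2 variant of the quick texture hash used for change detection (see StableQuickTexHash below).
// For 16-byte-aligned data whose size is a multiple of 64 bytes, it mixes each 64-byte chunk with
// 16-bit multiplies, adds and xors; otherwise it falls back to a simple add/xor over u32 pairs.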
#ifdef _M_SSE

static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		__m128i cursor = _mm_set1_epi32(0);
		__m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);
		__m128i update = _mm_set1_epi16(0x2455U);
		const __m128i *p = (const __m128i *)checkp;
		for (u32 i = 0; i < size / 16; i += 4) {
			__m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);
			cursor = _mm_add_epi16(cursor, chunk);
			cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));
			cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));
			chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);
			cursor = _mm_xor_si128(cursor, chunk);
			cursor2 = _mm_add_epi16(cursor2, update);
		}
		cursor = _mm_add_epi32(cursor, cursor2);
		// Add the four parts into the low i32.
		cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
		cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
		check = _mm_cvtsi128_si32(cursor);
	} else {
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}
#endif

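// NEON variant of the quick texture hash. Note that the SSE2, NEON and plain C versions are
// expected to produce identical results for the same input (see QuickTexHashNonSSE below).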
#if PPSSPP_ARCH(ARM_NEON)

alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };

static u32 QuickTexHashNEON(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
#if PPSSPP_PLATFORM(IOS) || PPSSPP_ARCH(ARM64) || defined(_MSC_VER) || !PPSSPP_ARCH(ARMV7)
		uint32x4_t cursor = vdupq_n_u32(0);
		uint16x8_t cursor2 = vld1q_u16(QuickTexHashInitial);
		uint16x8_t update = vdupq_n_u16(0x2455U);

		const u32 *p = (const u32 *)checkp;
		const u32 *pend = p + size / 4;
		while (p < pend) {
			cursor = vreinterpretq_u32_u16(vmlaq_u16(vreinterpretq_u16_u32(cursor), vreinterpretq_u16_u32(vld1q_u32(&p[4 * 0])), cursor2));
			cursor = veorq_u32(cursor, vld1q_u32(&p[4 * 1]));
			cursor = vaddq_u32(cursor, vld1q_u32(&p[4 * 2]));
			cursor = veorq_u32(cursor, vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(vld1q_u32(&p[4 * 3])), cursor2)));
			cursor2 = vaddq_u16(cursor2, update);

			p += 4 * 4;
		}

		cursor = vaddq_u32(cursor, vreinterpretq_u32_u16(cursor2));
		uint32x2_t mixed = vadd_u32(vget_high_u32(cursor), vget_low_u32(cursor));
		check = vget_lane_u32(mixed, 0) + vget_lane_u32(mixed, 1);
#else
		// TODO: Why does this crash on iOS, but only certain devices?
		// It's faster than the above, but I guess it sucks to be using an iPhone.
		// As of 2020 clang, it's still faster by ~1.4%.

		// d0/d1 (q0) - cursor
		// d2/d3 (q1) - cursor2
		// d4/d5 (q2) - update
		// d16-d23 (q8-q11) - memory transfer
		asm volatile (
			// Initialize cursor.
			"vmov.i32 q0, #0\n"

			// Initialize cursor2.
			"movw r0, 0xc00b\n"
			"movt r0, 0x9bd9\n"
			"movw r1, 0x4b73\n"
			"movt r1, 0xb651\n"
			"vmov d2, r0, r1\n"
			"movw r0, 0x4d9b\n"
			"movt r0, 0x4309\n"
			"movw r1, 0x0083\n"
			"movt r1, 0x0001\n"
			"vmov d3, r0, r1\n"

			// Initialize update.
			"movw r0, 0x2455\n"
			"vdup.i16 q2, r0\n"

			// This is where we end.
			"add r0, %1, %2\n"

			// Okay, do the memory hashing.
			"QuickTexHashNEON_next:\n"
			"pld [%2, #0xc0]\n"
			"vldmia %2!, {d16-d23}\n"
			"vmla.i16 q0, q1, q8\n"
			"vmul.i16 q11, q11, q1\n"
			"veor.i32 q0, q0, q9\n"
			"cmp %2, r0\n"
			"vadd.i32 q0, q0, q10\n"
			"vadd.i16 q1, q1, q2\n"
			"veor.i32 q0, q0, q11\n"
			"blo QuickTexHashNEON_next\n"

			// Now let's get the result.
			"vadd.i32 q0, q0, q1\n"
			"vadd.i32 d0, d0, d1\n"
			"vmov r0, r1, d0\n"
			"add %0, r0, r1\n"

			: "=r"(check)
			: "r"(size), "r"(checkp)
			: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "cc"
		);
#endif
	} else {
		const u32 size_u32 = size / 4;
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size_u32; i += 4) {
			check += p[i + 0];
			check ^= p[i + 1];
			check += p[i + 2];
			check ^= p[i + 3];
		}
	}

	return check;
}

#endif // PPSSPP_ARCH(ARM_NEON)

// Masks to downalign bufw to 16 bytes, and wrap at 2048.
static const u32 textureAlignMask16[16] = {
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5650,
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5551,
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_4444,
	0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_8888,
	0x7FF & ~(((8 * 16) / 4) - 1),  //GE_TFMT_CLUT4,
	0x7FF & ~(((8 * 16) / 8) - 1),  //GE_TFMT_CLUT8,
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_CLUT16,
	0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_CLUT32,
	0x7FF, //GE_TFMT_DXT1,
	0x7FF, //GE_TFMT_DXT3,
	0x7FF, //GE_TFMT_DXT5,
	0, // INVALID,
	0, // INVALID,
	0, // INVALID,
	0, // INVALID,
	0, // INVALID,
};
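// Worked example: GE_TFMT_8888 is 32bpp, so 16 bytes is (8 * 16) / 32 = 4 pixels and the entry is 0x7FF & ~3.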

u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {
	// This is a hack to allow for us to draw the huge PPGe texture, which is always in kernel ram.
	if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())
		return gstate.texbufwidth[level] & 0x1FFF;

	u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];
	if (bufw == 0 && format <= GE_TFMT_DXT5) {
		// If it's less than 16 bytes, use 16 bytes.
		bufw = (8 * 16) / textureBitsPerPixel[format];
	}
	return bufw;
}
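// Worked example for the fallback above: GE_TFMT_CLUT4 is 4bpp, so the minimum bufw is (8 * 16) / 4 = 32 pixels.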

// Matches QuickTexHashNEON/SSE, see #7029.
static u32 QuickTexHashNonSSE(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
		union u32x4_u16x8 {
#if defined(__GNUC__)
			uint32_t x32 __attribute__((vector_size(16)));
			uint16_t x16 __attribute__((vector_size(16)));
#else
			u32 x32[4];
			u16 x16[8];
#endif
		};
		u32x4_u16x8 cursor{};
		u32x4_u16x8 cursor2;
		static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};

		for (u32 j = 0; j < 8; ++j) {
			cursor2.x16[j] = cursor2_initial[j];
		}

		const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;
		for (u32 i = 0; i < size / 16; i += 4) {
			for (u32 j = 0; j < 8; ++j) {
				const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];
				cursor.x16[j] += temp;
			}
			for (u32 j = 0; j < 4; ++j) {
				cursor.x32[j] ^= p[i + 1].x32[j];
				cursor.x32[j] += p[i + 2].x32[j];
			}
			for (u32 j = 0; j < 8; ++j) {
				const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];
				cursor.x16[j] ^= temp;
			}
			for (u32 j = 0; j < 8; ++j) {
				cursor2.x16[j] += update[j];
			}
		}

		for (u32 j = 0; j < 4; ++j) {
			cursor.x32[j] += cursor2.x32[j];
		}
		check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];
	} else {
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}

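// Dispatches to the fastest available implementation; all variants return the same hash for the same input.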
u32 StableQuickTexHash(const void *checkp, u32 size) {
#if defined(_M_SSE)
	return QuickTexHashSSE2(checkp, size);
#elif PPSSPP_ARCH(ARM_NEON)
	return QuickTexHashNEON(checkp, size);
#else
	return QuickTexHashNonSSE(checkp, size);
#endif
}

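// Swizzles a linear texture into the PSP's tiled layout: each block is 16 bytes wide and 8 rows
// tall, and blocks are written out contiguously. bxc/byc are the horizontal/vertical block counts
// and pitch is the source row stride in bytes.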
void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
	// ysrcp is in 32-bits, so this is convenient.
	const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
	if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {
		__m128i *dest = (__m128i *)texptr;
		// The pitch parameter is in bytes, so shift down for 128-bit.
		// Note: it's always aligned to 16 bytes, so this is safe.
		const u32 pitchBy128 = pitch >> 4;
		for (int by = 0; by < byc; by++) {
			const __m128i *xsrc = (const __m128i *)ysrcp;
			for (int bx = 0; bx < bxc; bx++) {
				const __m128i *src = xsrc;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					__m128i temp1 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp2 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp3 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp4 = _mm_load_si128(src);
					src += pitchBy128;

					_mm_store_si128(dest, temp1);
					_mm_store_si128(dest + 1, temp2);
					_mm_store_si128(dest + 2, temp3);
					_mm_store_si128(dest + 3, temp4);
					dest += 4;
				}
				xsrc++;
			}
			ysrcp += pitchBy32 * 8;
		}
	} else
#endif
	{
		u32 *dest = (u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			const u32 *xsrc = ysrcp;
			for (int bx = 0; bx < bxc; bx++) {
				const u32 *src = xsrc;
				for (int n = 0; n < 8; n++) {
					memcpy(dest, src, 16);
					src += pitchBy32;
					dest += 4;
				}
				xsrc += 4;
			}
			ysrcp += pitchBy32 * 8;
		}
	}
}

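// Inverse of DoSwizzleTex16: reads the 16-byte x 8-row blocks back out into a linear image with
// the given pitch (in bytes).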
void DoUnswizzleTex16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
	// ydestp is in 32-bits, so this is convenient.
	const u32 pitchBy32 = pitch >> 2;

#ifdef _M_SSE
	// This check is pretty much a given, right?
	if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
		const __m128i *src = (const __m128i *)texptr;
		// The pitch parameter is in bytes, so shift down for 128-bit.
		// Note: it's always aligned to 16 bytes, so this is safe.
		const u32 pitchBy128 = pitch >> 4;
		for (int by = 0; by < byc; by++) {
			__m128i *xdest = (__m128i *)ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				__m128i *dest = xdest;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					__m128i temp1 = _mm_load_si128(src);
					__m128i temp2 = _mm_load_si128(src + 1);
					__m128i temp3 = _mm_load_si128(src + 2);
					__m128i temp4 = _mm_load_si128(src + 3);
					_mm_store_si128(dest, temp1);
					dest += pitchBy128;
					_mm_store_si128(dest, temp2);
					dest += pitchBy128;
					_mm_store_si128(dest, temp3);
					dest += pitchBy128;
					_mm_store_si128(dest, temp4);
					dest += pitchBy128;
					src += 4;
				}
				xdest++;
			}
			ydestp += pitchBy32 * 8;
		}
	} else
#elif PPSSPP_ARCH(ARM_NEON)
	if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
		const u32 *src = (const u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			u32 *xdest = ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				u32 *dest = xdest;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					uint32x4_t temp1 = vld1q_u32(src);
					uint32x4_t temp2 = vld1q_u32(src + 4);
					uint32x4_t temp3 = vld1q_u32(src + 8);
					uint32x4_t temp4 = vld1q_u32(src + 12);
					vst1q_u32(dest, temp1);
					dest += pitchBy32;
					vst1q_u32(dest, temp2);
					dest += pitchBy32;
					vst1q_u32(dest, temp3);
					dest += pitchBy32;
					vst1q_u32(dest, temp4);
					dest += pitchBy32;
					src += 16;
				}
				xdest += 4;
			}
			ydestp += pitchBy32 * 8;
		}
	} else
#endif
	{
		const u32 *src = (const u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			u32 *xdest = ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				u32 *dest = xdest;
				for (int n = 0; n < 8; n++) {
					memcpy(dest, src, 16);
					dest += pitchBy32;
					src += 4;
				}
				xdest += 4;
			}
			ydestp += pitchBy32 * 8;
		}
	}
}

// S3TC / DXT Decoder
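// Each 4x4 DXT block stores two base colors in RGB565 plus sixteen 2-bit indices that select one
// of four derived colors; DXT3 adds 4-bit explicit alpha per pixel and DXT5 adds two alpha
// endpoints with 3-bit indices. The decoder below expands these blocks to RGBA8888.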
class DXTDecoder {
public:
	inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
	inline void DecodeAlphaDXT5(const DXT5Block *src);
	inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height);
	inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height);
	inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height);

	bool AnyNonFullAlpha() const { return anyNonFullAlpha_; }

protected:
	u32 colors_[4];
	u8 alpha_[8];
	bool alphaMode_ = false;
	bool anyNonFullAlpha_ = false;
};

static inline u32 makecol(int r, int g, int b, int a) {
	return (a << 24) | (b << 16) | (g << 8) | r;
}

static inline int mix_2_3(int c1, int c2) {
	return (c1 + c1 + c2) / 3;
}

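// Expands the two RGB565 endpoints to RGBA8888 and derives the two intermediate colors. When
// color1 <= color2 the block is in 3-color mode and color 3 is transparent black, which is how
// DXT1 encodes 1-bit alpha.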
// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {
	u16 c1 = src->color1;
	u16 c2 = src->color2;
	int blue1 = (c1 << 3) & 0xF8;
	int blue2 = (c2 << 3) & 0xF8;
	int green1 = (c1 >> 3) & 0xFC;
	int green2 = (c2 >> 3) & 0xFC;
	int red1 = (c1 >> 8) & 0xF8;
	int red2 = (c2 >> 8) & 0xF8;

	// Keep alpha zero for non-DXT1 to skip masking the colors.
	int alpha = ignore1bitAlpha ? 0 : 255;

	colors_[0] = makecol(red1, green1, blue1, alpha);
	colors_[1] = makecol(red2, green2, blue2, alpha);
	if (c1 > c2) {
		colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
		colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
	} else {
		// Average - these are always left shifted, so no need to worry about ties.
		int red3 = (red1 + red2) / 2;
		int green3 = (green1 + green2) / 2;
		int blue3 = (blue1 + blue2) / 2;
		colors_[2] = makecol(red3, green3, blue3, alpha);
		colors_[3] = makecol(0, 0, 0, 0);
		if (alpha == 255) {
			alphaMode_ = true;
		}
	}
}

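// Interpolate the two DXT5 alpha endpoints in 8.8 fixed point: lerp8 is used in the 8-value mode
// (alpha1 > alpha2), lerp6 in the 6-value mode where indices 6 and 7 are forced to 0 and 255.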
static inline u8 lerp8(const DXT5Block *src, int n) {
	// These weights multiply alpha1/alpha2 into 8.8 fixed point.
	int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;
	int alpha2 = (src->alpha2 * (n << 8)) / 7;
	return (u8)((alpha1 + alpha2 + 31) >> 8);
}

static inline u8 lerp6(const DXT5Block *src, int n) {
	int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;
	int alpha2 = (src->alpha2 * (n << 8)) / 5;
	return (u8)((alpha1 + alpha2 + 31) >> 8);
}

void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
	alpha_[0] = src->alpha1;
	alpha_[1] = src->alpha2;
	if (alpha_[0] > alpha_[1]) {
		alpha_[2] = lerp8(src, 1);
		alpha_[3] = lerp8(src, 2);
		alpha_[4] = lerp8(src, 3);
		alpha_[5] = lerp8(src, 4);
		alpha_[6] = lerp8(src, 5);
		alpha_[7] = lerp8(src, 6);
	} else {
		alpha_[2] = lerp6(src, 1);
		alpha_[3] = lerp6(src, 2);
		alpha_[4] = lerp6(src, 3);
		alpha_[5] = lerp6(src, 4);
		alpha_[6] = 0;
		alpha_[7] = 255;
	}
}

void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height) {
	bool anyColor3 = false;
	for (int y = 0; y < height; y++) {
		int colordata = src->lines[y];
		for (int x = 0; x < width; x++) {
			int col = colordata & 3;
			if (col == 3) {
				anyColor3 = true;
			}
			dst[x] = colors_[col];
			colordata >>= 2;
		}
		dst += pitch;
	}

	if (alphaMode_ && anyColor3) {
		anyNonFullAlpha_ = true;
	}
}

void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
	for (int y = 0; y < height; y++) {
		int colordata = src->color.lines[y];
		u32 alphadata = src->alphaLines[y];
		for (int x = 0; x < width; x++) {
			dst[x] = colors_[colordata & 3] | (alphadata << 28);
			colordata >>= 2;
			alphadata >>= 4;
		}
		dst += pitch;
	}
}

void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
	// 48 bits, 3 bit index per pixel, 12 bits per line.
	u64 allAlpha = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;

	for (int y = 0; y < height; y++) {
		uint32_t colordata = src->color.lines[y];
		uint32_t alphadata = allAlpha >> (12 * y);
		for (int x = 0; x < width; x++) {
			dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);
			colordata >>= 2;
			alphadata >>= 3;
		}
		dst += pitch;
	}
}

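// Decodes a single texel at (x, y) from a DXT color block, without expanding the whole block.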
uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {
	_dbg_assert_(x >= 0 && x < 4);
	_dbg_assert_(y >= 0 && y < 4);

	uint16_t c1 = src->color1;
	uint16_t c2 = src->color2;
	int blue1 = (c1 << 3) & 0xF8;
	int blue2 = (c2 << 3) & 0xF8;
	int green1 = (c1 >> 3) & 0xFC;
	int green2 = (c2 >> 3) & 0xFC;
	int red1 = (c1 >> 8) & 0xF8;
	int red2 = (c2 >> 8) & 0xF8;

	int colorIndex = (src->lines[y] >> (x * 2)) & 3;
	if (colorIndex == 0) {
		return makecol(red1, green1, blue1, alpha);
	} else if (colorIndex == 1) {
		return makecol(red2, green2, blue2, alpha);
	} else if (c1 > c2) {
		if (colorIndex == 2) {
			return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
		}
		return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
	} else if (colorIndex == 3) {
		return makecol(0, 0, 0, 0);
	}

	// Average - these are always left shifted, so no need to worry about ties.
	int red3 = (red1 + red2) / 2;
	int green3 = (green1 + green2) / 2;
	int blue3 = (blue1 + blue2) / 2;
	return makecol(red3, green3, blue3, alpha);
}

uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {
	return GetDXTTexelColor(src, x, y, 255);
}

uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {
	uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
	u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;
	return color | (alpha << 28);
}

uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
	uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
	uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;
	int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;

	if (alphaIndex == 0) {
		return color | (src->alpha1 << 24);
	} else if (alphaIndex == 1) {
		return color | (src->alpha2 << 24);
	} else if (src->alpha1 > src->alpha2) {
		return color | (lerp8(src, alphaIndex - 1) << 24);
	} else if (alphaIndex == 6) {
		return color;
	} else if (alphaIndex == 7) {
		return color | 0xFF000000;
	}
	return color | (lerp6(src, alphaIndex - 1) << 24);
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha) {
	DXTDecoder dxt;
	dxt.DecodeColors(src, false);
	dxt.WriteColorsDXT1(dst, src, pitch, width, height);
	*alpha &= dxt.AnyNonFullAlpha() ? 0 : 1;
}

void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.WriteColorsDXT3(dst, src, pitch, width, height);
}

void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.DecodeAlphaDXT5(src);
	dxt.WriteColorsDXT5(dst, src, pitch, width, height);
}

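// Horizontal AND-reductions: collapse a 128-bit vector of 32-bit (or 16-bit) lanes into one value.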
#ifdef _M_SSE
inline u32 SSEReduce32And(__m128i value) {
	value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
	value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)));
	return _mm_cvtsi128_si32(value);
}
inline u32 SSEReduce16And(__m128i value) {
	u32 mask = SSEReduce32And(value);
	return mask & (mask >> 16);
}
#endif

#if PPSSPP_ARCH(ARM_NEON)
inline u32 NEONReduce32And(uint32x4_t value) {
	// TODO: Maybe a shuffle and a vector and, or something?
	return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3);
}
inline u32 NEONReduce16And(uint16x8_t value) {
	uint32x4_t value32 = vreinterpretq_u32_u16(value);
	// TODO: Maybe a shuffle and a vector and, or something?
	u32 mask = vgetq_lane_u32(value32, 0) & vgetq_lane_u32(value32, 1) & vgetq_lane_u32(value32, 2) & vgetq_lane_u32(value32, 3);
	return mask & (mask >> 16);
}
#endif

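// The CopyAndSumMask/CheckMask helpers below AND together all texels in a row while (optionally)
// copying them, so the caller can tell from the accumulated mask whether, for example, the alpha
// bits were set in every pixel.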
// TODO: SSE/SIMD
// At least on x86, the compiler actually SIMDs these pretty well.
void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) {
	u16 mask = 0xFFFF;
#ifdef _M_SSE
	if (width >= 8) {
		__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
		while (width >= 8) {
			__m128i color = _mm_loadu_si128((__m128i *)src);
			wideMask = _mm_and_si128(wideMask, color);
			_mm_storeu_si128((__m128i *)dst, color);
			src += 8;
			dst += 8;
			width -= 8;
		}
		mask = SSEReduce16And(wideMask);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	if (width >= 8) {
		uint16x8_t wideMask = vdupq_n_u16(0xFFFF);
		while (width >= 8) {
			uint16x8_t colors = vld1q_u16(src);
			wideMask = vandq_u16(wideMask, colors);
			vst1q_u16(dst, colors);
			src += 8;
			dst += 8;
			width -= 8;
		}
		mask = NEONReduce16And(wideMask);
	}
#endif

	DO_NOT_VECTORIZE_LOOP
	for (int i = 0; i < width; i++) {
		u16 color = src[i];
		mask &= color;
		dst[i] = color;
	}
	*outMask &= (u32)mask;
}

// Used in video playback, so it's nice for this to be fast.
void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) {
	u32 mask = 0xFFFFFFFF;
#ifdef _M_SSE
	if (width >= 4) {
		__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
		while (width >= 4) {
			__m128i color = _mm_loadu_si128((__m128i *)src);
			wideMask = _mm_and_si128(wideMask, color);
			_mm_storeu_si128((__m128i *)dst, color);
			src += 4;
			dst += 4;
			width -= 4;
		}
		mask = SSEReduce32And(wideMask);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	if (width >= 4) {
		uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
		while (width >= 4) {
			uint32x4_t colors = vld1q_u32(src);
			wideMask = vandq_u32(wideMask, colors);
			vst1q_u32(dst, colors);
			src += 4;
			dst += 4;
			width -= 4;
		}
		mask = NEONReduce32And(wideMask);
	}
#endif

	DO_NOT_VECTORIZE_LOOP
	for (int i = 0; i < width; i++) {
		u32 color = src[i];
		mask &= color;
		dst[i] = color;
	}
	*outMask &= (u32)mask;
}

void CheckMask16(const u16 *src, int width, u32 *outMask) {
	u16 mask = 0xFFFF;
#ifdef _M_SSE
	if (width >= 8) {
		__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
		while (width >= 8) {
			wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));
			src += 8;
			width -= 8;
		}
		mask = SSEReduce16And(wideMask);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	if (width >= 8) {
		uint16x8_t wideMask = vdupq_n_u16(0xFFFF);
		while (width >= 8) {
			wideMask = vandq_u16(wideMask, vld1q_u16(src));
			src += 8;
			width -= 8;
		}
		mask = NEONReduce16And(wideMask);
	}
#endif

	DO_NOT_VECTORIZE_LOOP
	for (int i = 0; i < width; i++) {
		mask &= src[i];
	}
	*outMask &= (u32)mask;
}

void CheckMask32(const u32 *src, int width, u32 *outMask) {
	u32 mask = 0xFFFFFFFF;
#ifdef _M_SSE
	if (width >= 4) {
		__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
		while (width >= 4) {
			wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));
			src += 4;
			width -= 4;
		}
		mask = SSEReduce32And(wideMask);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	if (width >= 4) {
		uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
		while (width >= 4) {
			wideMask = vandq_u32(wideMask, vld1q_u32(src));
			src += 4;
			width -= 4;
		}
		mask = NEONReduce32And(wideMask);
	}
#endif

	DO_NOT_VECTORIZE_LOOP
	for (int i = 0; i < width; i++) {
		mask &= src[i];
	}
	*outMask &= (u32)mask;
}