CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Common/TextureDecoder.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "ppsspp_config.h"1819#include "ext/xxhash.h"2021#include "Common/Common.h"22#include "Common/Data/Convert/ColorConv.h"23#include "Common/CPUDetect.h"24#include "Common/Log.h"25#include "Common/Math/CrossSIMD.h"2627#include "GPU/GPU.h"28#include "GPU/GPUState.h"29#include "GPU/Common/TextureDecoder.h"3031#ifdef _M_SSE32#include <emmintrin.h>33#include <smmintrin.h>34#endif3536#if PPSSPP_ARCH(ARM_NEON)37#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)38#include <arm64_neon.h>39#else40#include <arm_neon.h>41#endif42#endif4344const u8 textureBitsPerPixel[16] = {4516, //GE_TFMT_5650,4616, //GE_TFMT_5551,4716, //GE_TFMT_4444,4832, //GE_TFMT_8888,494, //GE_TFMT_CLUT4,508, //GE_TFMT_CLUT8,5116, //GE_TFMT_CLUT16,5232, //GE_TFMT_CLUT32,534, //GE_TFMT_DXT1,548, //GE_TFMT_DXT3,558, //GE_TFMT_DXT5,560, // INVALID,570, // INVALID,580, // INVALID,590, // INVALID,600, // INVALID,61};6263#ifdef _M_SSE6465static u32 QuickTexHashSSE2(const void *checkp, u32 size) {66u32 check = 0;6768if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {69__m128i cursor = _mm_set1_epi32(0);70__m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);71__m128i update = _mm_set1_epi16(0x2455U);72const __m128i 
*p = (const __m128i *)checkp;73for (u32 i = 0; i < size / 16; i += 4) {74__m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);75cursor = _mm_add_epi16(cursor, chunk);76cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));77cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));78chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);79cursor = _mm_xor_si128(cursor, chunk);80cursor2 = _mm_add_epi16(cursor2, update);81}82cursor = _mm_add_epi32(cursor, cursor2);83// Add the four parts into the low i32.84cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));85cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));86check = _mm_cvtsi128_si32(cursor);87} else {88const u32 *p = (const u32 *)checkp;89for (u32 i = 0; i < size / 8; ++i) {90check += *p++;91check ^= *p++;92}93}9495return check;96}97#endif9899#if PPSSPP_ARCH(ARM_NEON)100101alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };102103static u32 QuickTexHashNEON(const void *checkp, u32 size) {104u32 check = 0;105106if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {107#if PPSSPP_PLATFORM(IOS) || PPSSPP_ARCH(ARM64) || defined(_MSC_VER) || !PPSSPP_ARCH(ARMV7)108uint32x4_t cursor = vdupq_n_u32(0);109uint16x8_t cursor2 = vld1q_u16(QuickTexHashInitial);110uint16x8_t update = vdupq_n_u16(0x2455U);111112const u32 *p = (const u32 *)checkp;113const u32 *pend = p + size / 4;114while (p < pend) {115cursor = vreinterpretq_u32_u16(vmlaq_u16(vreinterpretq_u16_u32(cursor), vreinterpretq_u16_u32(vld1q_u32(&p[4 * 0])), cursor2));116cursor = veorq_u32(cursor, vld1q_u32(&p[4 * 1]));117cursor = vaddq_u32(cursor, vld1q_u32(&p[4 * 2]));118cursor = veorq_u32(cursor, vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(vld1q_u32(&p[4 * 3])), cursor2)));119cursor2 = vaddq_u16(cursor2, update);120121p += 4 * 4;122}123124cursor = vaddq_u32(cursor, vreinterpretq_u32_u16(cursor2));125uint32x2_t mixed = 
vadd_u32(vget_high_u32(cursor), vget_low_u32(cursor));126check = vget_lane_u32(mixed, 0) + vget_lane_u32(mixed, 1);127#else128// TODO: Why does this crash on iOS, but only certain devices?129// It's faster than the above, but I guess it sucks to be using an iPhone.130// As of 2020 clang, it's still faster by ~1.4%.131132// d0/d1 (q0) - cursor133// d2/d3 (q1) - cursor2134// d4/d5 (q2) - update135// d16-d23 (q8-q11) - memory transfer136asm volatile (137// Initialize cursor.138"vmov.i32 q0, #0\n"139140// Initialize cursor2.141"movw r0, 0xc00b\n"142"movt r0, 0x9bd9\n"143"movw r1, 0x4b73\n"144"movt r1, 0xb651\n"145"vmov d2, r0, r1\n"146"movw r0, 0x4d9b\n"147"movt r0, 0x4309\n"148"movw r1, 0x0083\n"149"movt r1, 0x0001\n"150"vmov d3, r0, r1\n"151152// Initialize update.153"movw r0, 0x2455\n"154"vdup.i16 q2, r0\n"155156// This is where we end.157"add r0, %1, %2\n"158159// Okay, do the memory hashing.160"QuickTexHashNEON_next:\n"161"pld [%2, #0xc0]\n"162"vldmia %2!, {d16-d23}\n"163"vmla.i16 q0, q1, q8\n"164"vmul.i16 q11, q11, q1\n"165"veor.i32 q0, q0, q9\n"166"cmp %2, r0\n"167"vadd.i32 q0, q0, q10\n"168"vadd.i16 q1, q1, q2\n"169"veor.i32 q0, q0, q11\n"170"blo QuickTexHashNEON_next\n"171172// Now let's get the result.173"vadd.i32 q0, q0, q1\n"174"vadd.i32 d0, d0, d1\n"175"vmov r0, r1, d0\n"176"add %0, r0, r1\n"177178: "=r"(check)179: "r"(size), "r"(checkp)180: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "cc"181);182#endif183} else {184const u32 size_u32 = size / 4;185const u32 *p = (const u32 *)checkp;186for (u32 i = 0; i < size_u32; i += 4) {187check += p[i + 0];188check ^= p[i + 1];189check += p[i + 2];190check ^= p[i + 3];191}192}193194return check;195}196197#endif // PPSSPP_ARCH(ARM_NEON)198199// Masks to downalign bufw to 16 bytes, and wrap at 2048.200static const u32 textureAlignMask16[16] = {2010x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5650,2020x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5551,2030x7FF & ~(((8 * 
16) / 16) - 1), //GE_TFMT_4444,2040x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_8888,2050x7FF & ~(((8 * 16) / 4) - 1), //GE_TFMT_CLUT4,2060x7FF & ~(((8 * 16) / 8) - 1), //GE_TFMT_CLUT8,2070x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_CLUT16,2080x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_CLUT32,2090x7FF, //GE_TFMT_DXT1,2100x7FF, //GE_TFMT_DXT3,2110x7FF, //GE_TFMT_DXT5,2120, // INVALID,2130, // INVALID,2140, // INVALID,2150, // INVALID,2160, // INVALID,217};218219u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {220// This is a hack to allow for us to draw the huge PPGe texture, which is always in kernel ram.221if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())222return gstate.texbufwidth[level] & 0x1FFF;223224u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];225if (bufw == 0 && format <= GE_TFMT_DXT5) {226// If it's less than 16 bytes, use 16 bytes.227bufw = (8 * 16) / textureBitsPerPixel[format];228}229return bufw;230}231232// Matches QuickTexHashNEON/SSE, see #7029.233static u32 QuickTexHashNonSSE(const void *checkp, u32 size) {234u32 check = 0;235236if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {237static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};238union u32x4_u16x8 {239#if defined(__GNUC__)240uint32_t x32 __attribute__((vector_size(16)));241uint16_t x16 __attribute__((vector_size(16)));242#else243u32 x32[4];244u16 x16[8];245#endif246};247u32x4_u16x8 cursor{};248u32x4_u16x8 cursor2;249static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};250251for (u32 j = 0; j < 8; ++j) {252cursor2.x16[j] = cursor2_initial[j];253}254255const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;256for (u32 i = 0; i < size / 16; i += 4) {257for (u32 j = 0; j < 8; ++j) {258const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];259cursor.x16[j] += temp;260}261for (u32 j = 0; j < 4; ++j) {262cursor.x32[j] ^= p[i + 
1].x32[j];263cursor.x32[j] += p[i + 2].x32[j];264}265for (u32 j = 0; j < 8; ++j) {266const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];267cursor.x16[j] ^= temp;268}269for (u32 j = 0; j < 8; ++j) {270cursor2.x16[j] += update[j];271}272}273274for (u32 j = 0; j < 4; ++j) {275cursor.x32[j] += cursor2.x32[j];276}277check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];278} else {279const u32 *p = (const u32 *)checkp;280for (u32 i = 0; i < size / 8; ++i) {281check += *p++;282check ^= *p++;283}284}285286return check;287}288289u32 StableQuickTexHash(const void *checkp, u32 size) {290#if defined(_M_SSE)291return QuickTexHashSSE2(checkp, size);292#elif PPSSPP_ARCH(ARM_NEON)293return QuickTexHashNEON(checkp, size);294#else295return QuickTexHashNonSSE(checkp, size);296#endif297}298299void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {300// ysrcp is in 32-bits, so this is convenient.301const u32 pitchBy32 = pitch >> 2;302#ifdef _M_SSE303if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {304__m128i *dest = (__m128i *)texptr;305// The pitch parameter is in bytes, so shift down for 128-bit.306// Note: it's always aligned to 16 bytes, so this is safe.307const u32 pitchBy128 = pitch >> 4;308for (int by = 0; by < byc; by++) {309const __m128i *xsrc = (const __m128i *)ysrcp;310for (int bx = 0; bx < bxc; bx++) {311const __m128i *src = xsrc;312for (int n = 0; n < 2; n++) {313// Textures are always 16-byte aligned so this is fine.314__m128i temp1 = _mm_load_si128(src);315src += pitchBy128;316__m128i temp2 = _mm_load_si128(src);317src += pitchBy128;318__m128i temp3 = _mm_load_si128(src);319src += pitchBy128;320__m128i temp4 = _mm_load_si128(src);321src += pitchBy128;322323_mm_store_si128(dest, temp1);324_mm_store_si128(dest + 1, temp2);325_mm_store_si128(dest + 2, temp3);326_mm_store_si128(dest + 3, temp4);327dest += 4;328}329xsrc++;330}331ysrcp += pitchBy32 * 8;332}333} else334#endif335{336u32 *dest = (u32 *)texptr;337for (int by = 0; by 
< byc; by++) {338const u32 *xsrc = ysrcp;339for (int bx = 0; bx < bxc; bx++) {340const u32 *src = xsrc;341for (int n = 0; n < 8; n++) {342memcpy(dest, src, 16);343src += pitchBy32;344dest += 4;345}346xsrc += 4;347}348ysrcp += pitchBy32 * 8;349}350}351}352353void DoUnswizzleTex16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {354// ydestp is in 32-bits, so this is convenient.355const u32 pitchBy32 = pitch >> 2;356357#ifdef _M_SSE358// This check is pretty much a given, right?359if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {360const __m128i *src = (const __m128i *)texptr;361// The pitch parameter is in bytes, so shift down for 128-bit.362// Note: it's always aligned to 16 bytes, so this is safe.363const u32 pitchBy128 = pitch >> 4;364for (int by = 0; by < byc; by++) {365__m128i *xdest = (__m128i *)ydestp;366for (int bx = 0; bx < bxc; bx++) {367__m128i *dest = xdest;368for (int n = 0; n < 2; n++) {369// Textures are always 16-byte aligned so this is fine.370__m128i temp1 = _mm_load_si128(src);371__m128i temp2 = _mm_load_si128(src + 1);372__m128i temp3 = _mm_load_si128(src + 2);373__m128i temp4 = _mm_load_si128(src + 3);374_mm_store_si128(dest, temp1);375dest += pitchBy128;376_mm_store_si128(dest, temp2);377dest += pitchBy128;378_mm_store_si128(dest, temp3);379dest += pitchBy128;380_mm_store_si128(dest, temp4);381dest += pitchBy128;382src += 4;383}384xdest++;385}386ydestp += pitchBy32 * 8;387}388} else389#elif PPSSPP_ARCH(ARM_NEON)390if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {391const u32 *src = (const u32 *)texptr;392for (int by = 0; by < byc; by++) {393u32 *xdest = ydestp;394for (int bx = 0; bx < bxc; bx++) {395u32 *dest = xdest;396for (int n = 0; n < 2; n++) {397// Textures are always 16-byte aligned so this is fine.398uint32x4_t temp1 = vld1q_u32(src);399uint32x4_t temp2 = vld1q_u32(src + 4);400uint32x4_t temp3 = vld1q_u32(src + 8);401uint32x4_t temp4 = vld1q_u32(src + 12);402vst1q_u32(dest, temp1);403dest += 
pitchBy32;404vst1q_u32(dest, temp2);405dest += pitchBy32;406vst1q_u32(dest, temp3);407dest += pitchBy32;408vst1q_u32(dest, temp4);409dest += pitchBy32;410src += 16;411}412xdest += 4;413}414ydestp += pitchBy32 * 8;415}416} else417#endif418{419const u32 *src = (const u32 *)texptr;420for (int by = 0; by < byc; by++) {421u32 *xdest = ydestp;422for (int bx = 0; bx < bxc; bx++) {423u32 *dest = xdest;424for (int n = 0; n < 8; n++) {425memcpy(dest, src, 16);426dest += pitchBy32;427src += 4;428}429xdest += 4;430}431ydestp += pitchBy32 * 8;432}433}434}435436// S3TC / DXT Decoder437class DXTDecoder {438public:439inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);440inline void DecodeAlphaDXT5(const DXT5Block *src);441inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height);442inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height);443inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height);444445bool AnyNonFullAlpha() const { return anyNonFullAlpha_; }446447protected:448u32 colors_[4];449u8 alpha_[8];450bool alphaMode_ = false;451bool anyNonFullAlpha_ = false;452};453454static inline u32 makecol(int r, int g, int b, int a) {455return (a << 24) | (b << 16) | (g << 8) | r;456}457458static inline int mix_2_3(int c1, int c2) {459return (c1 + c1 + c2) / 3;460}461462// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.463void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {464u16 c1 = src->color1;465u16 c2 = src->color2;466int blue1 = (c1 << 3) & 0xF8;467int blue2 = (c2 << 3) & 0xF8;468int green1 = (c1 >> 3) & 0xFC;469int green2 = (c2 >> 3) & 0xFC;470int red1 = (c1 >> 8) & 0xF8;471int red2 = (c2 >> 8) & 0xF8;472473// Keep alpha zero for non-DXT1 to skip masking the colors.474int alpha = ignore1bitAlpha ? 
0 : 255;475476colors_[0] = makecol(red1, green1, blue1, alpha);477colors_[1] = makecol(red2, green2, blue2, alpha);478if (c1 > c2) {479colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);480colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);481} else {482// Average - these are always left shifted, so no need to worry about ties.483int red3 = (red1 + red2) / 2;484int green3 = (green1 + green2) / 2;485int blue3 = (blue1 + blue2) / 2;486colors_[2] = makecol(red3, green3, blue3, alpha);487colors_[3] = makecol(0, 0, 0, 0);488if (alpha == 255) {489alphaMode_ = true;490}491}492}493494static inline u8 lerp8(const DXT5Block *src, int n) {495// These weights multiple alpha1/alpha2 to fixed 8.8 point.496int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;497int alpha2 = (src->alpha2 * (n << 8)) / 7;498return (u8)((alpha1 + alpha2 + 31) >> 8);499}500501static inline u8 lerp6(const DXT5Block *src, int n) {502int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;503int alpha2 = (src->alpha2 * (n << 8)) / 5;504return (u8)((alpha1 + alpha2 + 31) >> 8);505}506507void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {508alpha_[0] = src->alpha1;509alpha_[1] = src->alpha2;510if (alpha_[0] > alpha_[1]) {511alpha_[2] = lerp8(src, 1);512alpha_[3] = lerp8(src, 2);513alpha_[4] = lerp8(src, 3);514alpha_[5] = lerp8(src, 4);515alpha_[6] = lerp8(src, 5);516alpha_[7] = lerp8(src, 6);517} else {518alpha_[2] = lerp6(src, 1);519alpha_[3] = lerp6(src, 2);520alpha_[4] = lerp6(src, 3);521alpha_[5] = lerp6(src, 4);522alpha_[6] = 0;523alpha_[7] = 255;524}525}526527void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height) {528bool anyColor3 = false;529for (int y = 0; y < height; y++) {530int colordata = src->lines[y];531for (int x = 0; x < width; x++) {532int col = colordata & 3;533if (col == 3) {534anyColor3 = true;535}536dst[x] = colors_[col];537colordata >>= 2;538}539dst += 
pitch;540}541542if (alphaMode_ && anyColor3) {543anyNonFullAlpha_ = true;544}545}546547void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {548for (int y = 0; y < height; y++) {549int colordata = src->color.lines[y];550u32 alphadata = src->alphaLines[y];551for (int x = 0; x < width; x++) {552dst[x] = colors_[colordata & 3] | (alphadata << 28);553colordata >>= 2;554alphadata >>= 4;555}556dst += pitch;557}558}559560void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {561// 48 bits, 3 bit index per pixel, 12 bits per line.562u64 allAlpha = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;563564for (int y = 0; y < height; y++) {565uint32_t colordata = src->color.lines[y];566uint32_t alphadata = allAlpha >> (12 * y);567for (int x = 0; x < width; x++) {568dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);569colordata >>= 2;570alphadata >>= 3;571}572dst += pitch;573}574}575576uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {577_dbg_assert_(x >= 0 && x < 4);578_dbg_assert_(y >= 0 && y < 4);579580uint16_t c1 = src->color1;581uint16_t c2 = src->color2;582int blue1 = (c1 << 3) & 0xF8;583int blue2 = (c2 << 3) & 0xF8;584int green1 = (c1 >> 3) & 0xFC;585int green2 = (c2 >> 3) & 0xFC;586int red1 = (c1 >> 8) & 0xF8;587int red2 = (c2 >> 8) & 0xF8;588589int colorIndex = (src->lines[y] >> (x * 2)) & 3;590if (colorIndex == 0) {591return makecol(red1, green1, blue1, alpha);592} else if (colorIndex == 1) {593return makecol(red2, green2, blue2, alpha);594} else if (c1 > c2) {595if (colorIndex == 2) {596return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);597}598return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);599} else if (colorIndex == 3) {600return makecol(0, 0, 0, 0);601}602603// Average - these are always left shifted, so no need to worry about ties.604int red3 = 
(red1 + red2) / 2;605int green3 = (green1 + green2) / 2;606int blue3 = (blue1 + blue2) / 2;607return makecol(red3, green3, blue3, alpha);608}609610uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {611return GetDXTTexelColor(src, x, y, 255);612}613614uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {615uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);616u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;617return color | (alpha << 28);618}619620uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {621uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);622uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;623int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;624625if (alphaIndex == 0) {626return color | (src->alpha1 << 24);627} else if (alphaIndex == 1) {628return color | (src->alpha2 << 24);629} else if (src->alpha1 > src->alpha2) {630return color | (lerp8(src, alphaIndex - 1) << 24);631} else if (alphaIndex == 6) {632return color;633} else if (alphaIndex == 7) {634return color | 0xFF000000;635}636return color | (lerp6(src, alphaIndex - 1) << 24);637}638639// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.640void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha) {641DXTDecoder dxt;642dxt.DecodeColors(src, false);643dxt.WriteColorsDXT1(dst, src, pitch, width, height);644*alpha &= dxt.AnyNonFullAlpha() ? 
0 : 1;645}646647void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {648DXTDecoder dxt;649dxt.DecodeColors(&src->color, true);650dxt.WriteColorsDXT3(dst, src, pitch, width, height);651}652653void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {654DXTDecoder dxt;655dxt.DecodeColors(&src->color, true);656dxt.DecodeAlphaDXT5(src);657dxt.WriteColorsDXT5(dst, src, pitch, width, height);658}659660#ifdef _M_SSE661inline u32 SSEReduce32And(__m128i value) {662value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));663value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)));664return _mm_cvtsi128_si32(value);665}666inline u32 SSEReduce16And(__m128i value) {667u32 mask = SSEReduce32And(value);668return mask & (mask >> 16);669}670#endif671672#if PPSSPP_ARCH(ARM_NEON)673inline u32 NEONReduce32And(uint32x4_t value) {674// TODO: Maybe a shuffle and a vector and, or something?675return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3);676}677inline u32 NEONReduce16And(uint16x8_t value) {678uint32x4_t value32 = vreinterpretq_u32_u16(value);679// TODO: Maybe a shuffle and a vector and, or something?680u32 mask = vgetq_lane_u32(value32, 0) & vgetq_lane_u32(value32, 1) & vgetq_lane_u32(value32, 2) & vgetq_lane_u32(value32, 3);681return mask & (mask >> 16);682}683#endif684685// TODO: SSE/SIMD686// At least on x86, compiler actually SIMDs these pretty well.687void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) {688u16 mask = 0xFFFF;689#ifdef _M_SSE690if (width >= 8) {691__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);692while (width >= 8) {693__m128i color = _mm_loadu_si128((__m128i *)src);694wideMask = _mm_and_si128(wideMask, color);695_mm_storeu_si128((__m128i *)dst, color);696src += 8;697dst += 8;698width -= 8;699}700mask = SSEReduce16And(wideMask);701}702#elif PPSSPP_ARCH(ARM_NEON)703if (width 
>= 8) {704uint16x8_t wideMask = vdupq_n_u16(0xFFFF);705while (width >= 8) {706uint16x8_t colors = vld1q_u16(src);707wideMask = vandq_u16(wideMask, colors);708vst1q_u16(dst, colors);709src += 8;710dst += 8;711width -= 8;712}713mask = NEONReduce16And(wideMask);714}715#endif716717DO_NOT_VECTORIZE_LOOP718for (int i = 0; i < width; i++) {719u16 color = src[i];720mask &= color;721dst[i] = color;722}723*outMask &= (u32)mask;724}725726// Used in video playback so nice to have being fast.727void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) {728u32 mask = 0xFFFFFFFF;729#ifdef _M_SSE730if (width >= 4) {731__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);732while (width >= 4) {733__m128i color = _mm_loadu_si128((__m128i *)src);734wideMask = _mm_and_si128(wideMask, color);735_mm_storeu_si128((__m128i *)dst, color);736src += 4;737dst += 4;738width -= 4;739}740mask = SSEReduce32And(wideMask);741}742#elif PPSSPP_ARCH(ARM_NEON)743if (width >= 4) {744uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);745while (width >= 4) {746uint32x4_t colors = vld1q_u32(src);747wideMask = vandq_u32(wideMask, colors);748vst1q_u32(dst, colors);749src += 4;750dst += 4;751width -= 4;752}753mask = NEONReduce32And(wideMask);754}755#endif756757DO_NOT_VECTORIZE_LOOP758for (int i = 0; i < width; i++) {759u32 color = src[i];760mask &= color;761dst[i] = color;762}763*outMask &= (u32)mask;764}765766void CheckMask16(const u16 *src, int width, u32 *outMask) {767u16 mask = 0xFFFF;768#ifdef _M_SSE769if (width >= 8) {770__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);771while (width >= 8) {772wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));773src += 8;774width -= 8;775}776mask = SSEReduce16And(wideMask);777}778#elif PPSSPP_ARCH(ARM_NEON)779if (width >= 8) {780uint16x8_t wideMask = vdupq_n_u16(0xFFFF);781while (width >= 8) {782wideMask = vandq_u16(wideMask, vld1q_u16(src));783src += 8;784width -= 8;785}786mask = 
NEONReduce16And(wideMask);787}788#endif789790DO_NOT_VECTORIZE_LOOP791for (int i = 0; i < width; i++) {792mask &= src[i];793}794*outMask &= (u32)mask;795}796797void CheckMask32(const u32 *src, int width, u32 *outMask) {798u32 mask = 0xFFFFFFFF;799#ifdef _M_SSE800if (width >= 4) {801__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);802while (width >= 4) {803wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));804src += 4;805width -= 4;806}807mask = SSEReduce32And(wideMask);808}809#elif PPSSPP_ARCH(ARM_NEON)810if (width >= 4) {811uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);812while (width >= 4) {813wideMask = vandq_u32(wideMask, vld1q_u32(src));814src += 4;815width -= 4;816}817mask = NEONReduce32And(wideMask);818}819#endif820821DO_NOT_VECTORIZE_LOOP822for (int i = 0; i < width; i++) {823mask &= src[i];824}825*outMask &= (u32)mask;826}827828829