CoCalc -- DepalettizeShaderCommon.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/DepalettizeShaderCommon.cpp
Views: ¹⁴⁰¹
1
// Copyright (c) 2014- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include <cstdio>
19

20
#include "Common/GPU/Shader.h"
21
#include "Common/GPU/ShaderWriter.h"
22

23
#include "GPU/Common/ShaderCommon.h"
24
#include "Common/StringUtils.h"
25
#include "Common/Log.h"
26
#include "Common/LogReporting.h"
27
#include "GPU/Common/GPUStateUtils.h"
28
#include "GPU/Common/DepalettizeShaderCommon.h"
29
#include "GPU/Common/Draw2D.h"
30

31
// Vertex attribute layout (2D position + one texture coordinate set).
// NOTE(review): nothing in the visible part of this file references this table.
static const InputDef vsInputs[2] = {
	{ "vec2", "a_position", Draw::SEM_POSITION },
	{ "vec2", "a_texcoord0", Draw::SEM_TEXCOORD0 },
};
35

36
// TODO: Deduplicate with TextureShaderCommon.cpp
37
static const SamplerDef samplers[2] = {
38
	{ 0, "tex", SamplerFlags::ARRAY_ON_VULKAN },
39
	{ 1, "pal" },
40
};
41

42
// The only varying the depal fragment shaders read: the texture coordinate, declared "highp".
static const VaryingDef varyings[1] = {
	{ "vec2", "v_texcoord", Draw::SEM_TEXCOORD0, 0, "highp" },
};
45

46
// Uses integer instructions available since OpenGL 3.0, ES 3.0 (and 2.0 with extensions), and of course Vulkan and D3D11.
47
void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) {
48
	const int shift = config.shift;
49
	const int mask = config.mask;
50

51
	writer.C("  vec2 texcoord = v_texcoord;\n");
52

53
	// Implement the swizzle we need to simulate, if a game uses 8888 framebuffers and any other mode than "6" to access depth textures.
54
	// This implements the "2" mode swizzle (it fixes up the Y direction but not X. See comments on issue #15898, Tantalus games)
55
	// NOTE: This swizzle can be made to work with any power-of-2 resolution scaleFactor by shifting
56
	// the bits around, but not sure how to handle 3x scaling. For now this is 1x-only (rough edges at higher resolutions).
57
	if (config.bufferFormat == GE_FORMAT_DEPTH16) {
58
		if (config.depthUpperBits == 0x2) {
59
			writer.C(R"(
60
  int x = int((texcoord.x / scaleFactor) * texSize.x);
61
  int xclear = x & 0x01F0;
62
  int temp = (x - xclear) | ((x >> 1) & 0xF0) | ((x << 4) & 0x100);
63
  texcoord.x = (float(temp) / texSize.x) * scaleFactor;
64
)");
65
		}
66
	}
67

68
	// Sampling turns our texture into floating point. To avoid this, might be able
69
	// to declare them as isampler2D objects, but these require integer textures, which needs more work.
70
	// Anyhow, we simply work around this by converting back to integer, which is fine.
71
	// Use the mask to skip reading some components.
72

73
	// TODO: Since we actually have higher precision color data here, we might want to apply a dithering pattern here
74
	// in the 5551, 565 and 4444 modes. This would benefit Test Drive which renders at 16-bit on the real hardware
75
	// and dithers immediately, while we render at higher color depth and thus don't dither resulting in banding
76
	// when we sample it at low color depth like this.
77

78
	// An alternative would be to have a special mode where we keep some extra precision here and sample the CLUT linearly - works for ramps such
79
	// as those that Test Drive uses for its color remapping. But would need game specific flagging.
80

81
	writer.C("  vec4 color = ").SampleTexture2D("tex", "texcoord").C(";\n");
82

83
	int shiftedMask = mask << shift;
84
	switch (config.bufferFormat) {
85
	case GE_FORMAT_CLUT8:
86
		writer.C("  int index = int(color.r * 255.99);\n");
87
		break;
88
	case GE_FORMAT_8888:
89
		if (shiftedMask & 0xFF) writer.C("  int r = int(color.r * 255.99);\n"); else writer.C("  int r = 0;\n");
90
		if (shiftedMask & 0xFF00) writer.C("  int g = int(color.g * 255.99);\n"); else writer.C("  int g = 0;\n");
91
		if (shiftedMask & 0xFF0000) writer.C("  int b = int(color.b * 255.99);\n"); else writer.C("  int b = 0;\n");
92
		if (shiftedMask & 0xFF000000) writer.C("  int a = int(color.a * 255.99);\n"); else writer.C("  int a = 0;\n");
93
		writer.C("  int index = (a << 24) | (b << 16) | (g << 8) | (r);\n");
94
		break;
95
	case GE_FORMAT_4444:
96
		if (shiftedMask & 0xF) writer.C("  int r = int(color.r * 15.99);\n"); else writer.C("  int r = 0;\n");
97
		if (shiftedMask & 0xF0) writer.C("  int g = int(color.g * 15.99);\n"); else writer.C("  int g = 0;\n");
98
		if (shiftedMask & 0xF00) writer.C("  int b = int(color.b * 15.99);\n"); else writer.C("  int b = 0;\n");
99
		if (shiftedMask & 0xF000) writer.C("  int a = int(color.a * 15.99);\n"); else writer.C("  int a = 0;\n");
100
		writer.C("  int index = (a << 12) | (b << 8) | (g << 4) | (r);\n");
101
		break;
102
	case GE_FORMAT_565:
103
		if (shiftedMask & 0x1F) writer.C("  int r = int(color.r * 31.99);\n"); else writer.C("  int r = 0;\n");
104
		if (shiftedMask & 0x7E0) writer.C("  int g = int(color.g * 63.99);\n"); else writer.C("  int g = 0;\n");
105
		if (shiftedMask & 0xF800) writer.C("  int b = int(color.b * 31.99);\n"); else writer.C("  int b = 0;\n");
106
		writer.C("  int index = (b << 11) | (g << 5) | (r);\n");
107
		break;
108
	case GE_FORMAT_5551:
109
		if (config.textureFormat == GE_TFMT_CLUT8) {
110
			// SOCOM case. We need to make sure the next few lines load the right bits, see below.
111
			shiftedMask <<= 8;
112
		}
113
		if (shiftedMask & 0x1F) writer.C("  int r = int(color.r * 31.99);\n"); else writer.C("  int r = 0;\n");
114
		if (shiftedMask & 0x3E0) writer.C("  int g = int(color.g * 31.99);\n"); else writer.C("  int g = 0;\n");
115
		if (shiftedMask & 0x7C00) writer.C("  int b = int(color.b * 31.99);\n"); else writer.C("  int b = 0;\n");
116
		if (shiftedMask & 0x8000) writer.C("  int a = int(color.a);\n"); else writer.C("  int a = 0;\n");
117
		writer.C("  int index = (a << 15) | (b << 10) | (g << 5) | (r);\n");
118

119
		if (config.textureFormat == GE_TFMT_CLUT8) {
120
			// SOCOM case. #16210
121
			// To debug the issue, remove this shift to see the texture (check for clamping etc).
122
			writer.C("  index >>= 8;\n");
123
		}
124

125
		break;
126
	case GE_FORMAT_DEPTH16:
127
		// Decode depth buffer.
128
		writer.C("  float depth = (color.x - z_offset) * z_scale * 65535.0f;\n");
129

130
		if (config.bufferFormat == GE_FORMAT_DEPTH16 && config.textureFormat == GE_TFMT_5650) {
131
			// Convert depth to 565, without going through a CLUT.
132
			// TODO: Make "depal without a CLUT" a separate concept, to avoid redundantly creating a CLUT texture.
133
			writer.C("  int idepth = int(clamp(depth, 0.0, 65535.0));\n");
134
			writer.C("  float r = float(idepth & 31) / 31.0;\n");
135
			writer.C("  float g = float((idepth >> 5) & 63) / 63.0;\n");
136
			writer.C("  float b = float((idepth >> 11) & 31) / 31.0;\n");
137
			writer.C("  vec4 outColor = vec4(r, g, b, 1.0);\n");
138
			return;
139
		}
140

141
		writer.C("  int index = int(clamp(depth, 0.0, 65535.0));\n");
142
		break;
143
	default:
144
		break;
145
	}
146

147
	float texturePixels = 512.0f;
148

149
	if (shift) {
150
		writer.F("  index = (int(uint(index) >> uint(%d)) & 0x%02x)", shift, mask);
151
	} else {
152
		writer.F("  index = (index & 0x%02x)", mask);
153
	}
154
	if (config.startPos) {
155
		writer.F(" | %d;\n", config.startPos);  // '|' matches what we have in gstate.h
156
	} else {
157
		writer.F(";\n");
158
	}
159

160
	writer.F("  vec2 uv = vec2((float(index) + 0.5) * %f, 0.0);\n", 1.0f / texturePixels);
161
	writer.C("  vec4 outColor = ").SampleTexture2D("pal", "uv").C(";\n");
162
}
163

164
// FP only, to suit GL(ES) 2.0 and DX9
165
void GenerateDepalShaderFloat(ShaderWriter &writer, const DepalConfig &config) {
166
	char lookupMethod[128] = "index.r";
167

168
	const int shift = config.shift;
169
	const int mask = config.mask;
170

171
	if (config.bufferFormat == GE_FORMAT_DEPTH16) {
172
		DepthScaleFactors factors = GetDepthScaleFactors(gstate_c.UseFlags());
173
		writer.ConstFloat("z_scale", factors.ScaleU16());
174
		writer.ConstFloat("z_offset", factors.Offset());
175
	}
176

177
	writer.C("  vec4 index = ").SampleTexture2D("tex", "v_texcoord").C(";\n");
178

179
	float index_multiplier = 1.0f;
180
	// pixelformat is the format of the texture we are sampling.
181
	bool formatOK = true;
182
	switch (config.bufferFormat) {
183
	case GE_FORMAT_CLUT8:
184
		if (shift == 0 && mask == 0xFF) {
185
			// Easy peasy.
186
			if (writer.Lang().shaderLanguage == HLSL_D3D9)
187
				snprintf(lookupMethod, sizeof(lookupMethod), "index.a");
188
			else
189
				snprintf(lookupMethod, sizeof(lookupMethod), "index.r");
190
			formatOK = true;
191
		} else {
192
			// Deal with this if we find it.
193
			formatOK = false;
194
		}
195
		break;
196
	case GE_FORMAT_8888:
197
		if ((mask & (mask + 1)) == 0) {
198
			// If the value has all bits contiguous (bitmask check above), we can mod by it + 1.
199
			const char *rgba = "rrrrrrrrggggggggbbbbbbbbaaaaaaaa";
200
			const u8 rgba_shift = shift & 7;
201
			if (rgba_shift == 0 && mask == 0xFF) {
202
				snprintf(lookupMethod, sizeof(lookupMethod), "index.%c", rgba[shift]);
203
			} else {
204
				snprintf(lookupMethod, sizeof(lookupMethod), "mod(index.%c * %f, %d.0)", rgba[shift], 255.99f / (1 << rgba_shift), mask + 1);
205
				index_multiplier = 1.0f / 256.0f;
206
				// Format was OK if there weren't bits from another component.
207
				formatOK = mask <= 255 - (1 << rgba_shift);
208
			}
209
		} else {
210
			formatOK = false;
211
		}
212
		break;
213
	case GE_FORMAT_4444:
214
		if ((mask & (mask + 1)) == 0 && shift < 16) {
215
			const char *rgba = "rrrrggggbbbbaaaa";
216
			const u8 rgba_shift = shift & 3;
217
			if (rgba_shift == 0 && mask == 0xF) {
218
				snprintf(lookupMethod, sizeof(lookupMethod), "index.%c", rgba[shift]);
219
				index_multiplier = 15.0f / 256.0f;
220
			} else {
221
				// Let's divide and mod to get the right bits.  A common case is shift=0, mask=01.
222
				snprintf(lookupMethod, sizeof(lookupMethod), "mod(index.%c * %f, %d.0)", rgba[shift], 15.99f / (1 << rgba_shift), mask + 1);
223
				index_multiplier = 1.0f / 256.0f;
224
				formatOK = mask <= 15 - (1 << rgba_shift);
225
			}
226
		} else {
227
			formatOK = false;
228
		}
229
		break;
230
	case GE_FORMAT_565:
231
		if ((mask & (mask + 1)) == 0 && shift < 16) {
232
			const u8 shifts[16] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4 };
233
			const int multipliers[16] = { 31, 31, 31, 31, 31, 63, 63, 63, 63, 63, 63, 31, 31, 31, 31, 31 };
234
			const char *rgba = "rrrrrggggggbbbbb";
235
			const u8 rgba_shift = shifts[shift];
236
			if (rgba_shift == 0 && mask == multipliers[shift]) {
237
				snprintf(lookupMethod, sizeof(lookupMethod), "index.%c", rgba[shift]);
238
				index_multiplier = multipliers[shift] / 256.0f;
239
			} else {
240
				// We just need to divide the right component by the right value, and then mod against the mask.
241
				// A common case is shift=1, mask=0f.
242
				snprintf(lookupMethod, sizeof(lookupMethod), "mod(index.%c * %f, %d.0)", rgba[shift], ((float)multipliers[shift] + 0.99f) / (1 << rgba_shift), mask + 1);
243
				index_multiplier = 1.0f / 256.0f;
244
				formatOK = mask <= multipliers[shift] - (1 << rgba_shift);
245
			}
246
		} else {
247
			formatOK = false;
248
		}
249
		break;
250
	case GE_FORMAT_5551:
251
		if (config.textureFormat == GE_TFMT_CLUT8 && mask == 0xFF && shift == 0) {
252
			// Follow the intent here, and ignore g (and let's not round unnecessarily).
253
			snprintf(lookupMethod, sizeof(lookupMethod), "floor(floor(index.a) * 128.0 + index.b * 64.0)");
254
			index_multiplier = 1.0f / 256.0f;
255
			// SOCOM case. #16210
256
		} else if ((mask & (mask + 1)) == 0 && shift < 16) {
257
			const char *rgba = "rrrrrgggggbbbbba";
258
			const u8 rgba_shift = shift % 5;
259
			if (rgba_shift == 0 && mask == 0x1F) {
260
				snprintf(lookupMethod, sizeof(lookupMethod), "index.%c", rgba[shift]);
261
				index_multiplier = 31.0f / 256.0f;
262
			} else if (shift == 15 && mask == 1) {
263
				snprintf(lookupMethod, sizeof(lookupMethod), "index.%c", rgba[shift]);
264
				index_multiplier = 1.0f / 256.0f;
265
			} else {
266
				// A isn't possible here.
267
				snprintf(lookupMethod, sizeof(lookupMethod), "mod(index.%c * %f, %d.0)", rgba[shift], 31.99f / (1 << rgba_shift), mask + 1);
268
				index_multiplier = 1.0f / 256.0f;
269
				formatOK = mask <= 31 - (1 << rgba_shift);
270
			}
271
		} else {
272
			formatOK = false;
273
		}
274
		break;
275
	case GE_FORMAT_DEPTH16:
276
	{
277
		// TODO: I think we can handle most scenarios here, but texturing from depth buffers requires an extension on ES 2.0 anyway.
278
		// Not on D3D9 though, so this path is still relevant.
279

280
		if (config.bufferFormat == GE_FORMAT_DEPTH16 && config.textureFormat == GE_TFMT_5650) {
281
			// Convert depth to 565, without going through a CLUT.
282
			writer.C("  float depth = (index.x - z_offset) * z_scale;\n");
283
			writer.C("  float idepth = floor(clamp(depth, 0.0, 65535.0));\n");
284
			writer.C("  float r = mod(idepth, 32.0) / 31.0;\n");
285
			writer.C("  float g = mod(floor(idepth / 32.0), 64.0) / 63.0;\n");
286
			writer.C("  float b = mod(floor(idepth / 2048.0), 32.0) / 31.0;\n");
287
			writer.C("  vec4 outColor = vec4(r, g, b, 1.0);\n");
288
			return;
289
		}
290

291
		if (shift < 16) {
292
			index_multiplier = 1.0f / (float)(1 << shift);
293
			truncate_cpy(lookupMethod, "((index.x - z_offset) * z_scale)");
294

295
			if ((mask & (mask + 1)) != 0) {
296
				// But we'll try with the above anyway.
297
				formatOK = false;
298
			}
299
		} else {
300
			formatOK = false;
301
		}
302
		break;
303
	}
304
	default:
305
		break;
306
	}
307

308
	// We always use 512-sized textures now.
309
	float texturePixels = 512.f;
310
	index_multiplier *= 0.5f;
311

312
	// Adjust index_multiplier, similar to the use of 15.99 instead of 16 in the ES 3 path.
313
	// index_multiplier -= 0.01f / texturePixels;
314

315
	if (!formatOK) {
316
		ERROR_LOG_REPORT_ONCE(depal, Log::G3D, "%s depal unsupported: shift=%d mask=%02x offset=%d", GeBufferFormatToString(config.bufferFormat), shift, mask, config.startPos);
317
	}
318

319
	// Offset by half a texel (plus clutBase) to turn NEAREST filtering into FLOOR.
320
	// Technically, the clutBase should be |'d, not added, but that's hard with floats.
321
	float texel_offset = ((float)config.startPos + 0.5f) / texturePixels;
322
	if (writer.Lang().shaderLanguage == HLSL_D3D9) {
323
		// Seems to need a half-pixel offset fix?  Might mean it was rendered wrong...
324
		texel_offset += 0.5f / texturePixels;
325
	}
326
	writer.F("  float coord = (%s * %f) + %f;\n", lookupMethod, index_multiplier, texel_offset);
327
	writer.C("  vec4 outColor = ").SampleTexture2D("pal", "vec2(coord, 0.0)").C(";\n");
328
}
329

330
// Emits the body of a "smoothed" depal fragment shader: one color channel of "tex" is scaled
// to its integer range (without being truncated to an integer) and used as a palette
// coordinate, and "outColor" is declared from sampling "pal" there.
//
// Only whole channels are handled: r/g/b of 5551 (mask 0x1F) and r/g/b of 565 (mask 0x1F or
// 0x3F). Anything else trips a debug assert; the channel name is then left as "error".
void GenerateDepalSmoothed(ShaderWriter &writer, const DepalConfig &config) {
	const char *channel = "error";
	// Maximum value of the selected channel: 31 for 5-bit channels, 63 for 565 green.
	float channelMax = 31.0f;

	switch (config.bufferFormat) {
	case GE_FORMAT_5551:
		_dbg_assert_(config.mask == 0x1F);
		if (config.shift == 0) {
			channel = "r";
		} else if (config.shift == 5) {
			channel = "g";
		} else if (config.shift == 10) {
			channel = "b";
		} else {
			_dbg_assert_(false);
		}
		break;

	case GE_FORMAT_565:
		_dbg_assert_(config.mask == 0x1F || config.mask == 0x3F);
		if (config.shift == 0) {
			channel = "r";
		} else if (config.shift == 5) {
			channel = "g";
			channelMax = 63.0f;
		} else if (config.shift == 11) {
			channel = "b";
		} else {
			_dbg_assert_(false);
		}
		break;

	default:
		_dbg_assert_(false);
		break;
	}

	writer.C("  float index = ");
	writer.SampleTexture2D("tex", "v_texcoord");
	writer.F(".%s * %0.1f;\n", channel, channelMax);

	// Width in texels used to convert the index into a texture coordinate.
	const float texturePixels = 512.f;
	// The +0.5 places the coordinate at the center of the texel for integral index values.
	writer.F("  float coord = (index + 0.5) * %f;\n", 1.0 / texturePixels);

	writer.C("  vec4 outColor = ");
	writer.SampleTexture2D("pal", "vec2(coord, 0.0)");
	writer.C(";\n");
}
359

360
// Entry point: writes a complete depal fragment shader into writer.
//
// Declares the "tex"/"pal" samplers, opens the fragment main (with g_draw2Duniforms only when
// the source is a DEPTH16 buffer), emits the body from one of the three generators, and
// closes main with "outColor" as the result.
void GenerateDepalFs(ShaderWriter &writer, const DepalConfig &config) {
	writer.DeclareSamplers(samplers);
	writer.HighPrecisionFloat();
	writer.BeginFSMain(config.bufferFormat == GE_FORMAT_DEPTH16 ? g_draw2Duniforms : Slice<UniformDef>::empty(), varyings);

	if (config.smoothedDepal) {
		// Handles a limited set of cases, but doesn't need any integer math so we don't
		// need two variants.
		GenerateDepalSmoothed(writer, config);
		writer.EndFSMain("outColor");
		return;
	}

	switch (writer.Lang().shaderLanguage) {
	case HLSL_D3D9:
	case GLSL_1xx:
		// These always get the float-only variant.
		GenerateDepalShaderFloat(writer, config);
		break;

	case GLSL_VULKAN:
	case GLSL_3xx:
	case HLSL_D3D11:
	{
		// Use the float shader for the SOCOM special.
		const bool socomSpecial = config.bufferFormat == GE_FORMAT_5551 && config.textureFormat == GE_TFMT_CLUT8;
		if (socomSpecial) {
			GenerateDepalShaderFloat(writer, config);
		} else {
			GenerateDepalShader300(writer, config);
		}
		break;
	}

	default:
		_assert_msg_(false, "Shader language not supported for depal: %d", (int)writer.Lang().shaderLanguage);
		break;
	}

	writer.EndFSMain("outColor");
}
390

391
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company