CoCalc -- GPUStateUtils.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/GPUStateUtils.cpp
Views: ¹⁴⁰¹
1
// Copyright (c) 2015- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include <algorithm>
19
#include <limits>
20

21
#include "Common/System/Display.h"
22

23
#include "Common/StringUtils.h"
24
#include "Core/Config.h"
25
#include "Core/ConfigValues.h"
26
#include "Core/System.h"
27

28
#include "GPU/ge_constants.h"
29
#include "GPU/GPUState.h"
30
#include "GPU/Math3D.h"
31
#include "GPU/Common/FramebufferManagerCommon.h"
32
#include "GPU/Common/PresentationCommon.h"
33
#include "GPU/Common/ShaderId.h"
34
#include "GPU/Common/VertexDecoderCommon.h"
35

36
#include "GPU/Common/GPUStateUtils.h"
37

38
bool IsStencilTestOutputDisabled() {
39
	// The mask applies on all stencil ops.
40
	if (gstate.isStencilTestEnabled() && (gstate.pmska & 0xFF) != 0xFF) {
41
		if (gstate_c.framebufFormat == GE_FORMAT_565) {
42
			return true;
43
		}
44
		return gstate.getStencilOpZPass() == GE_STENCILOP_KEEP && gstate.getStencilOpZFail() == GE_STENCILOP_KEEP && gstate.getStencilOpSFail() == GE_STENCILOP_KEEP;
45
	}
46
	return true;
47
}
48

49
bool NeedsTestDiscard() {
50
	// We assume this is called only when enabled and not trivially true (may also be for color testing.)
51
	if (gstate.isStencilTestEnabled() && (gstate.pmska & 0xFF) != 0xFF)
52
		return true;
53
	if (gstate.isDepthTestEnabled() && gstate.isDepthWriteEnabled())
54
		return true;
55
	if (!gstate.isAlphaBlendEnabled())
56
		return true;
57
	if (gstate.getBlendFuncA() != GE_SRCBLEND_SRCALPHA && gstate.getBlendFuncA() != GE_SRCBLEND_DOUBLESRCALPHA)
58
		return true;
59
	// GE_DSTBLEND_DOUBLEINVSRCALPHA is actually inverse double src alpha, and doubling zero is still zero.
60
	if (gstate.getBlendFuncB() != GE_DSTBLEND_INVSRCALPHA && gstate.getBlendFuncB() != GE_DSTBLEND_DOUBLEINVSRCALPHA) {
61
		if (gstate.getBlendFuncB() != GE_DSTBLEND_FIXB || gstate.getFixB() != 0xFFFFFF)
62
			return true;
63
	}
64
	if (gstate.getBlendEq() != GE_BLENDMODE_MUL_AND_ADD && gstate.getBlendEq() != GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
65
		return true;
66
	if (gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY)
67
		return true;
68

69
	return false;
70
}
71

72
bool IsAlphaTestTriviallyTrue() {
73
	switch (gstate.getAlphaTestFunction()) {
74
	case GE_COMP_NEVER:
75
		return false;
76

77
	case GE_COMP_ALWAYS:
78
		return true;
79

80
	case GE_COMP_GEQUAL:
81
		if (gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed()))
82
			return true;  // If alpha is full, it doesn't matter what the ref value is.
83
		return gstate.getAlphaTestRef() == 0;
84

85
		// Non-zero check. If we have no depth testing (and thus no depth writing), and an alpha func that will result in no change if zero alpha, get rid of the alpha test.
86
		// Speeds up Lumines by a LOT on PowerVR.
87
	case GE_COMP_NOTEQUAL:
88
		if (gstate.getAlphaTestRef() == 255) {
89
			// Likely to be rare. Let's just skip the vertexFullAlpha optimization here instead of adding
90
			// complicated code to discard the draw or whatnot.
91
			return false;
92
		}
93
		// Fallthrough on purpose
94

95
	case GE_COMP_GREATER:
96
	{
97
		// If the texture and vertex only use 1.0 alpha, then the ref value doesn't matter.
98
		if (gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed()))
99
			return true;
100
		return gstate.getAlphaTestRef() == 0 && !NeedsTestDiscard();
101
	}
102

103
	case GE_COMP_LEQUAL:
104
		return gstate.getAlphaTestRef() == 255;
105

106
	case GE_COMP_EQUAL:
107
	case GE_COMP_LESS:
108
		return false;
109

110
	default:
111
		return false;
112
	}
113
}
114

115
bool IsAlphaTestAgainstZero() {
116
	return gstate.getAlphaTestRef() == 0 && gstate.getAlphaTestMask() == 0xFF;
117
}
118

119
bool IsColorTestAgainstZero() {
120
	return gstate.getColorTestRef() == 0 && gstate.getColorTestMask() == 0xFFFFFF;
121
}
122

123
bool IsColorTestTriviallyTrue() {
124
	switch (gstate.getColorTestFunction()) {
125
	case GE_COMP_NEVER:
126
		return false;
127

128
	case GE_COMP_ALWAYS:
129
		return true;
130

131
	case GE_COMP_EQUAL:
132
	case GE_COMP_NOTEQUAL:
133
		return false;
134
	default:
135
		return false;
136
	}
137
}
138

139
bool IsDepthTestEffectivelyDisabled() {
140
	if (!gstate.isDepthTestEnabled())
141
		return true;
142
	// We can ignore stencil, because ALWAYS and disabled choose the same stencil path.
143
	if (gstate.getDepthTestFunction() != GE_COMP_ALWAYS)
144
		return false;
145
	return !gstate.isDepthWriteEnabled();
146
}
147

148
// Indexed by the source blend factor (a 16-entry table). An entry is true when that
// factor does not read the source alpha; the src-alpha based factors are false.
// Entries past GE_SRCBLEND_FIXA have no named factor and are set to true.
const bool nonAlphaSrcFactors[16] = {
	// GE_SRCBLEND_DSTCOLOR, GE_SRCBLEND_INVDSTCOLOR
	true, true,
	// GE_SRCBLEND_SRCALPHA, GE_SRCBLEND_INVSRCALPHA
	false, false,
	// GE_SRCBLEND_DSTALPHA, GE_SRCBLEND_INVDSTALPHA
	true, true,
	// GE_SRCBLEND_DOUBLESRCALPHA, GE_SRCBLEND_DOUBLEINVSRCALPHA
	false, false,
	// GE_SRCBLEND_DOUBLEDSTALPHA, GE_SRCBLEND_DOUBLEINVDSTALPHA
	true, true,
	// GE_SRCBLEND_FIXA
	true,
	// Remaining (unnamed) values.
	true, true, true, true, true,
};
166

167
// Indexed by the destination blend factor (a 16-entry table). An entry is true when
// that factor does not read the source alpha; the src-alpha based factors are false.
// Entries past GE_DSTBLEND_FIXB have no named factor and are set to true.
const bool nonAlphaDestFactors[16] = {
	// GE_DSTBLEND_SRCCOLOR, GE_DSTBLEND_INVSRCCOLOR
	true, true,
	// GE_DSTBLEND_SRCALPHA, GE_DSTBLEND_INVSRCALPHA
	false, false,
	// GE_DSTBLEND_DSTALPHA, GE_DSTBLEND_INVDSTALPHA
	true, true,
	// GE_DSTBLEND_DOUBLESRCALPHA, GE_DSTBLEND_DOUBLEINVSRCALPHA
	false, false,
	// GE_DSTBLEND_DOUBLEDSTALPHA, GE_DSTBLEND_DOUBLEINVDSTALPHA
	true, true,
	// GE_DSTBLEND_FIXB
	true,
	// Remaining (unnamed) values.
	true, true, true, true, true,
};
185

186
// Chooses how the shader's output alpha should be replaced by the stencil value,
// given how blending is being handled (replaceBlend).
//  - REPLACE_ALPHA_NO:         leave alpha alone (no stencil output, clear mode, or
//                              the blend needs src alpha and dual source is unavailable).
//  - REPLACE_ALPHA_YES:        write the stencil value as alpha.
//  - REPLACE_ALPHA_DUALSOURCE: blend factors need src alpha, so use dual source blending.
ReplaceAlphaType ReplaceAlphaWithStencil(ReplaceBlendType replaceBlend) {
	if (IsStencilTestOutputDisabled() || gstate.isModeClear()) {
		return REPLACE_ALPHA_NO;
	}

	const bool blendUsesFactors = replaceBlend != REPLACE_BLEND_NO && replaceBlend != REPLACE_BLEND_READ_FRAMEBUFFER;
	if (blendUsesFactors) {
		// If neither factor reads src alpha, overwriting alpha cannot disturb the blend.
		const bool factorsIgnoreSrcAlpha = nonAlphaSrcFactors[gstate.getBlendFuncA()] && nonAlphaDestFactors[gstate.getBlendFuncB()];
		if (factorsIgnoreSrcAlpha) {
			return REPLACE_ALPHA_YES;
		}
		return gstate_c.Use(GPU_USE_DUALSOURCE_BLEND) ? REPLACE_ALPHA_DUALSOURCE : REPLACE_ALPHA_NO;
	}

	// NOTE(review): replaceBlend can only be REPLACE_BLEND_NO or REPLACE_BLEND_READ_FRAMEBUFFER
	// at this point, so this check looks unreachable. Kept to preserve the existing behavior.
	if (replaceBlend == ReplaceBlendType::REPLACE_BLEND_BLUE_TO_ALPHA) {
		return REPLACE_ALPHA_NO;  // irrelevant
	}

	return REPLACE_ALPHA_YES;
}
209

210
// Maps the current framebuffer format and the stencil Z-pass op to the kind of value
// that should be written into the alpha channel. Falls back to STENCIL_VALUE_KEEP for
// any format or op that is not explicitly handled.
StencilValueType ReplaceAlphaWithStencilType() {
	switch (gstate_c.framebufFormat) {
	case GE_FORMAT_565:
		// There's never a stencil value.  Maybe the right alpha is 1?
		return STENCIL_VALUE_ONE;

	case GE_FORMAT_5551:
	{
		// Only one stencil bit here, so technically this should only ever use zero/one.
		const auto zpassOp = gstate.getStencilOpZPass();
		if (zpassOp == GE_STENCILOP_REPLACE) {
			// The top bit of the reference value selects one or zero.
			const bool topBitSet = (gstate.getStencilTestRef() & 0x80) != 0;
			return topBitSet ? STENCIL_VALUE_ONE : STENCIL_VALUE_ZERO;
		}
		// Decrementing always zeros, since there's only one bit.
		if (zpassOp == GE_STENCILOP_DECR || zpassOp == GE_STENCILOP_ZERO) {
			return STENCIL_VALUE_ZERO;
		}
		// Incrementing always fills, since there's only one bit.
		if (zpassOp == GE_STENCILOP_INCR) {
			return STENCIL_VALUE_ONE;
		}
		if (zpassOp == GE_STENCILOP_INVERT) {
			return STENCIL_VALUE_INVERT;
		}
		// GE_STENCILOP_KEEP and anything unrecognized.
		return STENCIL_VALUE_KEEP;
	}

	case GE_FORMAT_4444:
	case GE_FORMAT_8888:
	case GE_FORMAT_INVALID:
	case GE_FORMAT_DEPTH16:
	case GE_FORMAT_CLUT8:
	{
		// 4444 uses the 4-bit increment/decrement variants, everything else the 8-bit ones.
		const bool fourBitStencil = gstate_c.framebufFormat == GE_FORMAT_4444;
		const auto zpassOp = gstate.getStencilOpZPass();
		if (zpassOp == GE_STENCILOP_REPLACE) {
			// TODO: Could detect zero here and force ZERO - less uniform updates?
			return STENCIL_VALUE_UNIFORM;
		}
		if (zpassOp == GE_STENCILOP_ZERO) {
			return STENCIL_VALUE_ZERO;
		}
		if (zpassOp == GE_STENCILOP_DECR) {
			return fourBitStencil ? STENCIL_VALUE_DECR_4 : STENCIL_VALUE_DECR_8;
		}
		if (zpassOp == GE_STENCILOP_INCR) {
			return fourBitStencil ? STENCIL_VALUE_INCR_4 : STENCIL_VALUE_INCR_8;
		}
		if (zpassOp == GE_STENCILOP_INVERT) {
			return STENCIL_VALUE_INVERT;
		}
		// GE_STENCILOP_KEEP and anything unrecognized.
		return STENCIL_VALUE_KEEP;
	}
	}

	return STENCIL_VALUE_KEEP;
}
269

270
// Decides how the configured PSP blend mode is realized for a render target of the
// given format:
//  - REPLACE_BLEND_NO:               no blending needed at all.
//  - REPLACE_BLEND_STANDARD:         plain fixed-function blending.
//  - REPLACE_BLEND_PRE_SRC:          the shader pre-multiplies the source color.
//  - REPLACE_BLEND_PRE_SRC_2X_ALPHA: as above, and the shader also doubles alpha.
//  - REPLACE_BLEND_2X_ALPHA:         the shader doubles alpha only.
//  - REPLACE_BLEND_READ_FRAMEBUFFER: the blend needs the destination color in the shader.
//  - REPLACE_BLEND_BLUE_TO_ALPHA:    returned whenever gstate_c.blueToAlpha is set.
// On a 565 target several destination-alpha cases are settled without reading the
// destination (one of the original notes: "Dest alpha should be zero").
ReplaceBlendType ReplaceBlendWithShader(GEBufferFormat bufferFormat) {
	if (gstate_c.blueToAlpha) {
		return REPLACE_BLEND_BLUE_TO_ALPHA;
	}

	if (!gstate.isAlphaBlendEnabled() || gstate.isModeClear()) {
		return REPLACE_BLEND_NO;
	}

	// Let's get the non-factor modes out of the way first.
	const GEBlendMode blendEq = gstate.getBlendEq();
	if (blendEq == GE_BLENDMODE_ABSDIFF) {
		return REPLACE_BLEND_READ_FRAMEBUFFER;
	}
	if (blendEq == GE_BLENDMODE_MIN || blendEq == GE_BLENDMODE_MAX) {
		return gstate_c.Use(GPU_USE_BLEND_MINMAX) ? REPLACE_BLEND_STANDARD : REPLACE_BLEND_READ_FRAMEBUFFER;
	}
	const bool factorBasedEq = blendEq == GE_BLENDMODE_MUL_AND_ADD ||
		blendEq == GE_BLENDMODE_MUL_AND_SUBTRACT ||
		blendEq == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE;
	if (!factorBasedEq) {
		// Other blend equations simply don't blend on hardware.
		return REPLACE_BLEND_NO;
	}

	const GEBlendSrcFactor funcA = gstate.getBlendFuncA();
	const GEBlendDstFactor funcB = gstate.getBlendFuncB();

	// Classification of the destination factor, shared by all source factor groups.
	const bool is565 = bufferFormat == GE_FORMAT_565;
	const bool dstIsSrcColor = funcB == GE_DSTBLEND_SRCCOLOR || funcB == GE_DSTBLEND_INVSRCCOLOR;
	const bool dstIsDoubleSrcAlpha = funcB == GE_DSTBLEND_DOUBLESRCALPHA;
	const bool dstIsDoubleInvSrcAlpha = funcB == GE_DSTBLEND_DOUBLEINVSRCALPHA;
	const bool dstIsDoubleDstAlpha = funcB == GE_DSTBLEND_DOUBLEDSTALPHA || funcB == GE_DSTBLEND_DOUBLEINVDSTALPHA;

	switch (funcA) {
	case GE_SRCBLEND_DSTCOLOR:
	case GE_SRCBLEND_INVDSTCOLOR:
	case GE_SRCBLEND_SRCALPHA:
	case GE_SRCBLEND_INVSRCALPHA:
	case GE_SRCBLEND_DSTALPHA:
	case GE_SRCBLEND_INVDSTALPHA:
	{
		// Non-doubled source factors. If the source factor itself uses src alpha, the
		// source color has to be pre-multiplied before alpha may be doubled.
		const bool srcUsesSrcAlpha = funcA == GE_SRCBLEND_SRCALPHA || funcA == GE_SRCBLEND_INVSRCALPHA;
		const ReplaceBlendType doubledAlpha = srcUsesSrcAlpha ? REPLACE_BLEND_PRE_SRC_2X_ALPHA : REPLACE_BLEND_2X_ALPHA;

		if (dstIsDoubleSrcAlpha) {
			// Can't safely double alpha, will clamp.  However, a copy may easily be worse due to
			// overlap (copying here seems to cause overlap problems in Silent Hill games), so
			// only read the framebuffer when fetch is available and otherwise hope that doubling
			// alpha for the dst factor will not clamp too badly.
			if (gstate_c.Use(GPU_USE_FRAMEBUFFER_FETCH))
				return REPLACE_BLEND_READ_FRAMEBUFFER;
			return doubledAlpha;
		}
		if (dstIsDoubleInvSrcAlpha) {
			// For inverse, things are simpler.  Clamping isn't an issue, as long as we avoid
			// messing with the other factor's components.
			return doubledAlpha;
		}
		if (dstIsDoubleDstAlpha) {
			return is565 ? REPLACE_BLEND_STANDARD : REPLACE_BLEND_READ_FRAMEBUFFER;
		}
		return REPLACE_BLEND_STANDARD;
	}

	case GE_SRCBLEND_DOUBLESRCALPHA:
	case GE_SRCBLEND_DOUBLEINVSRCALPHA:
		// 2x alpha in the source function and not in the dest = source color doubling.
		// Even dest alpha is safe, since we're moving the * 2.0 into the src color.
		if (dstIsSrcColor) {
			// When inversing, alpha clamping isn't an issue.
			// Otherwise we can't double: we need the source color to be correct, and
			// doubling only alpha would clamp the src alpha incorrectly.
			return funcA == GE_SRCBLEND_DOUBLEINVSRCALPHA ? REPLACE_BLEND_2X_ALPHA : REPLACE_BLEND_READ_FRAMEBUFFER;
		}
		if (dstIsDoubleDstAlpha) {
			return is565 ? REPLACE_BLEND_2X_ALPHA : REPLACE_BLEND_READ_FRAMEBUFFER;
		}
		if (dstIsDoubleSrcAlpha) {
			// We can't technically do this correctly (due to clamping) without reading the dst color.
			// Using a copy isn't accurate either, though, when there's overlap.
			return gstate_c.Use(GPU_USE_FRAMEBUFFER_FETCH) ? REPLACE_BLEND_READ_FRAMEBUFFER : REPLACE_BLEND_PRE_SRC_2X_ALPHA;
		}
		if (dstIsDoubleInvSrcAlpha) {
			// For the inverse, doubling alpha is safe, because it will clamp correctly.
			return REPLACE_BLEND_PRE_SRC_2X_ALPHA;
		}
		// SRCALPHA, INVSRCALPHA, DSTALPHA, INVDSTALPHA, FIXB and anything else.
		// TODO: Could use vertexFullAlpha, but it's not calculated yet.
		// This outputs the original alpha for the dest factor.
		return REPLACE_BLEND_PRE_SRC;

	case GE_SRCBLEND_DOUBLEDSTALPHA:
	case GE_SRCBLEND_DOUBLEINVDSTALPHA:
		// Doubled dst alpha in the source factor can't be done accurately (clamping) without
		// reading the destination.  For the inverse variant, doubling the src color is
		// probably the wrong direction, halving might be more correct.
		if (!is565) {
			return REPLACE_BLEND_READ_FRAMEBUFFER;
		}
		// On 565 the source factor needs no read; only a doubled src alpha in the dest
		// factor still has to be applied in the shader.
		if (dstIsDoubleSrcAlpha || dstIsDoubleInvSrcAlpha) {
			return REPLACE_BLEND_2X_ALPHA;
		}
		return REPLACE_BLEND_STANDARD;

	case GE_SRCBLEND_FIXA:
	default:
	{
		if (dstIsDoubleSrcAlpha) {
			// Can't safely double alpha, will clamp.
			return REPLACE_BLEND_READ_FRAMEBUFFER;
		}
		if (dstIsDoubleInvSrcAlpha) {
			// Doubling alpha is safe for the inverse, will clamp to zero either way.
			return REPLACE_BLEND_2X_ALPHA;
		}
		if (dstIsDoubleDstAlpha) {
			return is565 ? REPLACE_BLEND_STANDARD : REPLACE_BLEND_READ_FRAMEBUFFER;
		}

		const bool dstIsPlainFactor = dstIsSrcColor ||
			funcB == GE_DSTBLEND_SRCALPHA || funcB == GE_DSTBLEND_INVSRCALPHA ||
			funcB == GE_DSTBLEND_DSTALPHA || funcB == GE_DSTBLEND_INVDSTALPHA;
		if (dstIsPlainFactor) {
			return REPLACE_BLEND_STANDARD;
		}

		// GE_DSTBLEND_FIXB (or an unnamed dest factor): look at the fixed colors.
		const u32 fixA = gstate.getFixA();
		const u32 fixB = gstate.getFixB();
		if (fixA == 0xFFFFFF && fixB == 0x000000) {
			// Some games specify this.  Some cards may prefer blending off entirely.
			return REPLACE_BLEND_NO;
		}
		const bool fixAIsZeroOrOne = fixA == 0xFFFFFF || fixA == 0x000000;
		const bool fixBIsZeroOrOne = fixB == 0xFFFFFF || fixB == 0x000000;
		if (fixAIsZeroOrOne || fixBIsZeroOrOne) {
			return REPLACE_BLEND_STANDARD;
		}
		// Multiply the src color in the shader, that way it's always accurate.
		return REPLACE_BLEND_PRE_SRC;
	}
	}

	// Should never get here.
	return REPLACE_BLEND_STANDARD;
}
508

509
// Depth slice factors handed out by DepthSliceFactor() and used by GetDepthScaleFactors().
// 16BIT applies when GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT is set; HIGH applies to accurate
// depth with neither that flag nor GPU_USE_DEPTH_CLAMP.
static const float DEPTH_SLICE_FACTOR_16BIT = 256.0f;
static const float DEPTH_SLICE_FACTOR_HIGH = 4.0f;
511

512
// The supported flag combinations. TODO: Maybe they should be distilled down into an enum.
//
// 0 - "Old"-style GL depth.
//     Or "non-accurate depth": effectively ignores minz / maxz. Z values are mapped based on the viewport, which clamps.
//     This skews depth in many instances. Depth can be inverted in this mode if the viewport says so.
//     This is completely wrong, but works in some cases (probably because some game devs assumed it was how it worked)
//     and avoids some depth clamp issues.
//
// GPU_USE_ACCURATE_DEPTH:
//     Accurate depth: Z in the framebuffer matches the range of Z used on the PSP linearly in some way. We choose
//     a centered range, to simulate clamping by letting otherwise out-of-range pixels survive the 0 and 1 cutoffs.
//     Depth is clipped based on minz/maxz, and the viewport is just a means to scale and center the value,
//     not to clip or map to stored values.
//
// GPU_USE_ACCURATE_DEPTH | GPU_USE_DEPTH_CLAMP:
//     Variant of GPU_USE_ACCURATE_DEPTH, except that the range is the nice and convenient 0-1, since we can use
//     hardware depth clamp. Only viable in accurate depth mode. It clamps depth and therefore uses the full 0-1 range.
//     Using the full 0-1 range is not what "accurate" means; it's implied by depth clamp (which also means we're clamping.)
//
// GPU_USE_ACCURATE_DEPTH | GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT:
// GPU_USE_ACCURATE_DEPTH | GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT | GPU_USE_DEPTH_CLAMP:
//     Only viable in accurate depth mode. Means to use a range of the 24-bit depth values available
//     from the GPU to represent the 16-bit values the PSP had, to try to make everything round and
//     z-fight (close to) the same way as on hardware, cheaply (cheaper than rounding depth in the fragment shader).
//     We automatically switch to this if Z tests for equality are used.
//     Depth clamp has no effect on the depth scaling here if set, though it will still be enabled
//     and will clamp wildly out-of-line values.
//
// Any other combinations of these particular flags are bogus (for example, a lonely GPU_USE_DEPTH_CLAMP).
539

540
// Returns the depth slice factor for a set of GPU use flags:
//  - 1.0 for old-style (non-accurate) depth, and for accurate depth with depth clamp
//    (the full range can be used since clamping is available);
//  - DEPTH_SLICE_FACTOR_16BIT when accurate depth is squished to 16-bit resolution;
//  - DEPTH_SLICE_FACTOR_HIGH for standard accurate depth.
float DepthSliceFactor(u32 useFlags) {
	const bool accurateDepth = (useFlags & GPU_USE_ACCURATE_DEPTH) != 0;
	const bool scaledTo16Bit = (useFlags & GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT) != 0;
	const bool depthClamp = (useFlags & GPU_USE_DEPTH_CLAMP) != 0;

	if (accurateDepth && scaledTo16Bit) {
		// The 16-bit scaling takes priority over depth clamp.
		return DEPTH_SLICE_FACTOR_16BIT;
	}
	if (accurateDepth && !depthClamp) {
		return DEPTH_SLICE_FACTOR_HIGH;
	}
	return 1.0f;
}
557

558
// See class DepthScaleFactors for how to apply.
559
DepthScaleFactors GetDepthScaleFactors(u32 useFlags) {
560
	if (!(useFlags & GPU_USE_ACCURATE_DEPTH)) {
561
		return DepthScaleFactors(0.0f, 65535.0f);
562
	}
563

564
	if (useFlags & GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT) {
565
		const double offset = 0.5 * (DEPTH_SLICE_FACTOR_16BIT - 1.0) / DEPTH_SLICE_FACTOR_16BIT;
566
		// Use one bit for each value, rather than 1.0 / (65535.0 * 256.0).
567
		const double scale = 16777215.0;
568
		return DepthScaleFactors(offset, scale);
569
	} else if (useFlags & GPU_USE_DEPTH_CLAMP) {
570
		return DepthScaleFactors(0.0f, 65535.0f);
571
	} else {
572
		const double offset = 0.5f * (DEPTH_SLICE_FACTOR_HIGH - 1.0f) * (1.0f / DEPTH_SLICE_FACTOR_HIGH);
573
		return DepthScaleFactors(offset, (float)(DEPTH_SLICE_FACTOR_HIGH * 65535.0));
574
	}
575
}
576

577
void ConvertViewportAndScissor(bool useBufferedRendering, float renderWidth, float renderHeight, int bufferWidth, int bufferHeight, ViewportAndScissor &out) {
578
	out.throughMode = gstate.isModeThrough();
579

580
	float renderWidthFactor, renderHeightFactor;
581
	float renderX = 0.0f, renderY = 0.0f;
582
	float displayOffsetX, displayOffsetY;
583
	if (useBufferedRendering) {
584
		displayOffsetX = 0.0f;
585
		displayOffsetY = 0.0f;
586
		renderWidthFactor = (float)renderWidth / (float)bufferWidth;
587
		renderHeightFactor = (float)renderHeight / (float)bufferHeight;
588
	} else {
589
		float pixelW = PSP_CoreParameter().pixelWidth;
590
		float pixelH = PSP_CoreParameter().pixelHeight;
591
		FRect frame = GetScreenFrame(pixelW, pixelH);
592
		FRect rc;
593
		CalculateDisplayOutputRect(&rc, 480, 272, frame, ROTATION_LOCKED_HORIZONTAL);
594
		displayOffsetX = rc.x;
595
		displayOffsetY = rc.y;
596
		renderWidth = rc.w;
597
		renderHeight = rc.h;
598
		renderWidthFactor = renderWidth / 480.0f;
599
		renderHeightFactor = renderHeight / 272.0f;
600
	}
601

602
	// We take care negative offsets of in the projection matrix.
603
	// These come from split framebuffers (Killzone).
604
	// TODO: Might be safe to do get rid of this here and do the same for positive offsets?
605
	renderX = std::max(gstate_c.curRTOffsetX, 0);
606
	renderY = std::max(gstate_c.curRTOffsetY, 0);
607

608
	// Scissor
609
	int scissorX1 = gstate.getScissorX1();
610
	int scissorY1 = gstate.getScissorY1();
611
	int scissorX2 = gstate.getScissorX2() + 1;
612
	int scissorY2 = gstate.getScissorY2() + 1;
613

614
	if (scissorX2 < scissorX1 || scissorY2 < scissorY1) {
615
		out.scissorX = 0;
616
		out.scissorY = 0;
617
		out.scissorW = 0;
618
		out.scissorH = 0;
619
	} else {
620
		out.scissorX = (renderX * renderWidthFactor) + displayOffsetX + scissorX1 * renderWidthFactor;
621
		out.scissorY = (renderY * renderHeightFactor) + displayOffsetY + scissorY1 * renderHeightFactor;
622
		out.scissorW = (scissorX2 - scissorX1) * renderWidthFactor;
623
		out.scissorH = (scissorY2 - scissorY1) * renderHeightFactor;
624
	}
625

626
	int curRTWidth = gstate_c.curRTWidth;
627
	int curRTHeight = gstate_c.curRTHeight;
628

629
	float offsetX = gstate.getOffsetX();
630
	float offsetY = gstate.getOffsetY();
631

632
	DepthScaleFactors depthScale = GetDepthScaleFactors(gstate_c.UseFlags());
633

634
	if (out.throughMode) {
635
		// If renderX/renderY are offset to compensate for a split framebuffer,
636
		// applying the offset to the viewport isn't enough, since the viewport clips.
637
		// We need to apply either directly to the vertices, or to the "through" projection matrix.
638
		out.viewportX = renderX * renderWidthFactor + displayOffsetX;
639
		out.viewportY = renderY * renderHeightFactor + displayOffsetY;
640
		out.viewportW = curRTWidth * renderWidthFactor;
641
		out.viewportH = curRTHeight * renderHeightFactor;
642
		out.depthRangeMin = depthScale.EncodeFromU16(0.0f);
643
		out.depthRangeMax = depthScale.EncodeFromU16(65536.0f);
644
	} else {
645
		// These we can turn into a glViewport call, offset by offsetX and offsetY. Math after.
646
		float vpXScale = gstate.getViewportXScale();
647
		float vpXCenter = gstate.getViewportXCenter();
648
		float vpYScale = gstate.getViewportYScale();
649
		float vpYCenter = gstate.getViewportYCenter();
650

651
		// The viewport transform appears to go like this:
652
		// Xscreen = -offsetX + vpXCenter + vpXScale * Xview
653
		// Yscreen = -offsetY + vpYCenter + vpYScale * Yview
654
		// Zscreen = vpZCenter + vpZScale * Zview
655

656
		// The viewport is normally centered at 2048,2048 but can also be centered at other locations.
657
		// Offset is subtracted from the viewport center and is also set to values in those ranges, and is set so that the viewport will cover
658
		// the desired screen area ([0-480)x[0-272)), so 1808,1912.
659

660
		// This means that to get the analogue glViewport we must:
661
		float vpX0 = vpXCenter - offsetX - fabsf(vpXScale);
662
		float vpY0 = vpYCenter - offsetY - fabsf(vpYScale);
663
		gstate_c.vpWidth = vpXScale * 2.0f;
664
		gstate_c.vpHeight = vpYScale * 2.0f;
665

666
		float vpWidth = fabsf(gstate_c.vpWidth);
667
		float vpHeight = fabsf(gstate_c.vpHeight);
668

669
		float left = renderX + vpX0;
670
		float top = renderY + vpY0;
671
		float right = left + vpWidth;
672
		float bottom = top + vpHeight;
673

674
		out.widthScale = 1.0f;
675
		out.xOffset = 0.0f;
676
		out.heightScale = 1.0f;
677
		out.yOffset = 0.0f;
678

679
		// If we're within the bounds, we want clipping the viewport way.  So leave it be.
680
		{
681
			float overageLeft = std::max(-left, 0.0f);
682
			float overageRight = std::max(right - bufferWidth, 0.0f);
683

684
			// Expand viewport to cover scissor region. The viewport doesn't clip on the PSP.
685
			if (right < scissorX2) {
686
				overageRight -= scissorX2 - right;
687
			}
688
			if (left > scissorX1) {
689
				overageLeft += scissorX1 - left;
690
			}
691

692
			// Our center drifted by the difference in overages.
693
			float drift = overageRight - overageLeft;
694

695
			if (overageLeft != 0.0f || overageRight != 0.0f) {
696
				left += overageLeft;
697
				right -= overageRight;
698

699
				// Protect against the viewport being entirely outside the scissor.
700
				// Emit a tiny but valid viewport. Really, we should probably emit a flag to ignore draws.
701
				if (right <= left) {
702
					right = left + 1.0f;
703
				}
704

705
				out.widthScale = vpWidth / (right - left);
706
				out.xOffset = drift / (right - left);
707
			}
708
		}
709

710
		{
711
			float overageTop = std::max(-top, 0.0f);
712
			float overageBottom = std::max(bottom - bufferHeight, 0.0f);
713

714
			// Expand viewport to cover scissor region. The viewport doesn't clip on the PSP.
715
			if (bottom < scissorY2) {
716
				overageBottom -= scissorY2 - bottom;
717
			}
718
			if (top > scissorY1) {
719
				overageTop += scissorY1 - top;
720
			}
721
			// Our center drifted by the difference in overages.
722
			float drift = overageBottom - overageTop;
723

724
			if (overageTop != 0.0f || overageBottom != 0.0f) {
725
				top += overageTop;
726
				bottom -= overageBottom;
727

728
				// Protect against the viewport being entirely outside the scissor.
729
				// Emit a tiny but valid  viewport. Really, we should probably emit a flag to ignore draws.
730
				if (bottom <= top) {
731
					bottom = top + 1.0f;
732
				}
733

734
				out.heightScale = vpHeight / (bottom - top);
735
				out.yOffset = drift / (bottom - top);
736
			}
737
		}
738

739
		out.viewportX = left * renderWidthFactor + displayOffsetX;
740
		out.viewportY = top * renderHeightFactor + displayOffsetY;
741
		out.viewportW = (right - left) * renderWidthFactor;
742
		out.viewportH = (bottom - top) * renderHeightFactor;
743

744
		// The depth viewport parameters are the same, but we handle it a bit differently.
745
		// When clipping is enabled, depth is clamped to [0, 65535].  And minz/maxz discard.
746
		// So, we apply the depth range as minz/maxz, and transform for the viewport.
747
		float vpZScale = gstate.getViewportZScale();
748
		float vpZCenter = gstate.getViewportZCenter();
749
		// TODO: This clip the entire draw if minz > maxz.
750
		float minz = gstate.getDepthRangeMin();
751
		float maxz = gstate.getDepthRangeMax();
752

753
		if (gstate.isDepthClampEnabled() && (minz == 0 || maxz == 65535)) {
754
			// Here, we should "clamp."  But clamping per fragment would be slow.
755
			// So, instead, we just increase the available range and hope.
756
			// If depthSliceFactor is 4, it means (75% / 2) of the depth lies in each direction.
757
			float fullDepthRange = 65535.0f * (depthScale.Scale() - 1.0f) * (1.0f / 2.0f);
758
			if (minz == 0) {
759
				minz -= fullDepthRange;
760
			}
761
			if (maxz == 65535) {
762
				maxz += fullDepthRange;
763
			}
764
		} else if (maxz == 65535) {
765
			// This means clamp isn't enabled, but we still want to allow values up to 65535.99.
766
			// If DepthSliceFactor() is 1.0, though, this would make out.depthRangeMax exceed 1.
767
			// Since that would clamp, it would make Z=1234 not match between draws when maxz changes.
768
			if (depthScale.Scale() > 1.0f)
769
				maxz = 65535.99f;
770
		}
771

772
		// Okay.  So, in our shader, -1 will map to minz, and +1 will map to maxz.
773
		float halfActualZRange = (maxz - minz) * (1.0f / 2.0f);
774
		out.depthScale = halfActualZRange < std::numeric_limits<float>::epsilon() ? 1.0f : vpZScale / halfActualZRange;
775
		// This adjusts the center from halfActualZRange to vpZCenter.
776
		out.zOffset = halfActualZRange < std::numeric_limits<float>::epsilon() ? 0.0f : (vpZCenter - (minz + halfActualZRange)) / halfActualZRange;
777

778
		if (!gstate_c.Use(GPU_USE_ACCURATE_DEPTH)) {
779
			out.depthScale = 1.0f;
780
			out.zOffset = 0.0f;
781
			out.depthRangeMin = depthScale.EncodeFromU16(vpZCenter - vpZScale);
782
			out.depthRangeMax = depthScale.EncodeFromU16(vpZCenter + vpZScale);
783
		} else {
784
			out.depthRangeMin = depthScale.EncodeFromU16(minz);
785
			out.depthRangeMax = depthScale.EncodeFromU16(maxz);
786
		}
787

788
		// OpenGL will clamp these for us anyway, and Direct3D will error if not clamped.
789
		// Of course, if this happens we've skewed out.depthScale/out.zOffset and may get z-fighting.
790
		out.depthRangeMin = std::max(out.depthRangeMin, 0.0f);
791
		out.depthRangeMax = std::min(out.depthRangeMax, 1.0f);
792
	}
793
}
794

795
void UpdateCachedViewportState(const ViewportAndScissor &vpAndScissor) {
796
	if (vpAndScissor.throughMode)
797
		return;
798

799
	bool scaleChanged = gstate_c.vpWidthScale != vpAndScissor.widthScale || gstate_c.vpHeightScale != vpAndScissor.heightScale;
800
	bool offsetChanged = gstate_c.vpXOffset != vpAndScissor.xOffset || gstate_c.vpYOffset != vpAndScissor.yOffset;
801
	bool depthChanged = gstate_c.vpDepthScale != vpAndScissor.depthScale || gstate_c.vpZOffset != vpAndScissor.zOffset;
802
	if (scaleChanged || offsetChanged || depthChanged) {
803
		gstate_c.vpWidthScale = vpAndScissor.widthScale;
804
		gstate_c.vpHeightScale = vpAndScissor.heightScale;
805
		gstate_c.vpDepthScale = vpAndScissor.depthScale;
806
		gstate_c.vpXOffset = vpAndScissor.xOffset;
807
		gstate_c.vpYOffset = vpAndScissor.yOffset;
808
		gstate_c.vpZOffset = vpAndScissor.zOffset;
809

810
		gstate_c.Dirty(DIRTY_PROJMATRIX);
811
		if (depthChanged) {
812
			gstate_c.Dirty(DIRTY_DEPTHRANGE);
813
		}
814
	}
815
}
816

817
// Source ("A") blend factor translation, indexed by the GE source blend function
// (ConvertBlendState clamps the index to GE_SRCBLEND_FIXA before using it).
// The "double" variants map to their non-doubled factor here. The final FIXA entry is
// not read by ConvertBlendState, which resolves fixed colors through blendColor2Func instead.
static const BlendFactor genericALookup[11] = {
	BlendFactor::DST_COLOR,
	BlendFactor::ONE_MINUS_DST_COLOR,
	BlendFactor::SRC_ALPHA,
	BlendFactor::ONE_MINUS_SRC_ALPHA,
	BlendFactor::DST_ALPHA,
	BlendFactor::ONE_MINUS_DST_ALPHA,
	BlendFactor::SRC_ALPHA,			// GE_SRCBLEND_DOUBLESRCALPHA
	BlendFactor::ONE_MINUS_SRC_ALPHA,		// GE_SRCBLEND_DOUBLEINVSRCALPHA
	BlendFactor::DST_ALPHA,			// GE_SRCBLEND_DOUBLEDSTALPHA
	BlendFactor::ONE_MINUS_DST_ALPHA,		// GE_SRCBLEND_DOUBLEINVDSTALPHA
	BlendFactor::CONSTANT_COLOR,		// FIXA
};
830

831
// Destination ("B") blend factor translation, indexed by the GE destination blend function
// (ConvertBlendState clamps the index to GE_DSTBLEND_FIXB before using it).
// The "double" variants map to their non-doubled factor here. The final FIXB entry is
// not read by ConvertBlendState, which resolves fixed colors through blendColor2Func instead.
static const BlendFactor genericBLookup[11] = {
	BlendFactor::SRC_COLOR,
	BlendFactor::ONE_MINUS_SRC_COLOR,
	BlendFactor::SRC_ALPHA,
	BlendFactor::ONE_MINUS_SRC_ALPHA,
	BlendFactor::DST_ALPHA,
	BlendFactor::ONE_MINUS_DST_ALPHA,
	BlendFactor::SRC_ALPHA,			// GE_DSTBLEND_DOUBLESRCALPHA
	BlendFactor::ONE_MINUS_SRC_ALPHA,		// GE_DSTBLEND_DOUBLEINVSRCALPHA
	BlendFactor::DST_ALPHA,			// GE_DSTBLEND_DOUBLEDSTALPHA
	BlendFactor::ONE_MINUS_DST_ALPHA,		// GE_DSTBLEND_DOUBLEINVDSTALPHA
	BlendFactor::CONSTANT_COLOR,		// FIXB
};
844

845
// Blend equation translation used when GPU_USE_BLEND_MINMAX is not available, indexed by the
// GE blend mode. MIN, MAX and ABSDIFF all degrade to ADD in this table.
static const BlendEq eqLookupNoMinMax[] = {
	BlendEq::ADD,
	BlendEq::SUBTRACT,
	BlendEq::REVERSE_SUBTRACT,
	BlendEq::ADD,			// GE_BLENDMODE_MIN
	BlendEq::ADD,			// GE_BLENDMODE_MAX
	BlendEq::ADD,			// GE_BLENDMODE_ABSDIFF
};
853

854
// Blend equation translation used when GPU_USE_BLEND_MINMAX is available, indexed by the
// GE blend mode. ABSDIFF has no direct equivalent in this table and is mapped to MAX.
static const BlendEq eqLookup[] = {
	BlendEq::ADD,
	BlendEq::SUBTRACT,
	BlendEq::REVERSE_SUBTRACT,
	BlendEq::MIN,			// GE_BLENDMODE_MIN
	BlendEq::MAX,			// GE_BLENDMODE_MAX
	BlendEq::MAX,			// GE_BLENDMODE_ABSDIFF
};
862

863
// Rewrites the source-alpha based factors to their dual-source (SRC1) counterparts.
// Every other factor is returned unchanged.
static BlendFactor toDualSource(BlendFactor blendfunc) {
	switch (blendfunc) {
	case BlendFactor::SRC_ALPHA:
		return BlendFactor::SRC1_ALPHA;
	case BlendFactor::ONE_MINUS_SRC_ALPHA:
		return BlendFactor::ONE_MINUS_SRC1_ALPHA;
	default:
		return blendfunc;
	}
}
873

874
// Tries to express a fixed blend color as a plain ONE or ZERO factor.
//
// fix:    the 24-bit fixed color.
// approx: set to true whenever fix is not exactly 0xFFFFFF or 0 (including when INVALID is
//         returned). It is never cleared here, so the caller must initialize it to false.
//
// Returns ONE/ZERO for an exact or near match (all channels >= 0.99 or <= 0.01 after
// Vec3f::FromRGB), otherwise BlendFactor::INVALID to signal that a constant color is needed.
static BlendFactor blendColor2Func(u32 fix, bool &approx) {
	if (fix == 0xFFFFFF)
		return BlendFactor::ONE;
	if (fix == 0)
		return BlendFactor::ZERO;

	// Otherwise, it's approximate if we pick ONE/ZERO.
	approx = true;

	const Vec3f fix3 = Vec3f::FromRGB(fix);
	if (fix3.x >= 0.99 && fix3.y >= 0.99 && fix3.z >= 0.99)
		return BlendFactor::ONE;
	else if (fix3.x <= 0.01 && fix3.y <= 0.01 && fix3.z <= 0.01)
		return BlendFactor::ZERO;
	return BlendFactor::INVALID;
}
890

891
// abs is a quagmire of compiler incompatibilities, so...
// Returns the absolute value of x. Negating the most negative int is not representable,
// so callers should not pass it.
inline int iabs(int x) {
	return x >= 0 ? x : -x;
}
895

896
static inline bool blendColorSimilar(uint32_t a, uint32_t b, int margin = 25) {   // 25 ~= 0.1 * 255
897
	int diffx = iabs((a & 0xff) - (b & 0xff));
898
	int diffy = iabs(((a >> 8) & 0xff) - ((b >> 8) & 0xff));
899
	int diffz = iabs(((a >> 16) & 0xff) - ((b >> 16) & 0xff));
900
	if (diffx <= margin && diffy <= margin && diffz <= margin)
901
		return true;
902
	return false;
903
}
904

905
// Try to simulate some common logic ops by using blend, if needed.
906
// The shader might also need modification, the below function SimulateLogicOpShaderTypeIfNeeded
907
// takes care of that.
908
static bool SimulateLogicOpIfNeeded(BlendFactor &srcBlend, BlendFactor &dstBlend, BlendEq &blendEq) {
909
	if (!gstate.isLogicOpEnabled())
910
		return false;
911

912
	// Note: our shader solution applies logic ops BEFORE blending, not correctly after.
913
	// This is however fine for the most common ones, like CLEAR/NOOP/SET, etc.
914
	if (!gstate_c.Use(GPU_USE_LOGIC_OP)) {
915
		switch (gstate.getLogicOp()) {
916
		case GE_LOGIC_CLEAR:
917
			srcBlend = BlendFactor::ZERO;
918
			dstBlend = BlendFactor::ZERO;
919
			blendEq = BlendEq::ADD;
920
			return true;
921
		case GE_LOGIC_AND:
922
		case GE_LOGIC_AND_REVERSE:
923
			WARN_LOG_REPORT_ONCE(d3dLogicOpAnd, Log::G3D, "Unsupported AND logic op: %x", gstate.getLogicOp());
924
			break;
925
		case GE_LOGIC_COPY:
926
			// This is the same as off.
927
			break;
928
		case GE_LOGIC_COPY_INVERTED:
929
			// Handled in the shader.
930
			break;
931
		case GE_LOGIC_AND_INVERTED:
932
		case GE_LOGIC_NOR:
933
		case GE_LOGIC_NAND:
934
		case GE_LOGIC_EQUIV:
935
			// Handled in the shader.
936
			WARN_LOG_REPORT_ONCE(d3dLogicOpAndInverted, Log::G3D, "Attempted invert for logic op: %x", gstate.getLogicOp());
937
			break;
938
		case GE_LOGIC_INVERTED:
939
			srcBlend = BlendFactor::ONE;
940
			dstBlend = BlendFactor::ONE;
941
			blendEq = BlendEq::SUBTRACT;
942
			WARN_LOG_REPORT_ONCE(d3dLogicOpInverted, Log::G3D, "Attempted inverse for logic op: %x", gstate.getLogicOp());
943
			return true;
944
		case GE_LOGIC_NOOP:
945
			srcBlend = BlendFactor::ZERO;
946
			dstBlend = BlendFactor::ONE;
947
			blendEq = BlendEq::ADD;
948
			return true;
949
		case GE_LOGIC_XOR:
950
			WARN_LOG_REPORT_ONCE(d3dLogicOpOrXor, Log::G3D, "Unsupported XOR logic op: %x", gstate.getLogicOp());
951
			break;
952
		case GE_LOGIC_OR:
953
		case GE_LOGIC_OR_INVERTED:
954
			// Inverted in shader.
955
			srcBlend = BlendFactor::ONE;
956
			dstBlend = BlendFactor::ONE;
957
			blendEq = BlendEq::ADD;
958
			WARN_LOG_REPORT_ONCE(d3dLogicOpOr, Log::G3D, "Attempted or for logic op: %x", gstate.getLogicOp());
959
			return true;
960
		case GE_LOGIC_OR_REVERSE:
961
			WARN_LOG_REPORT_ONCE(d3dLogicOpOrReverse, Log::G3D, "Unsupported OR REVERSE logic op: %x", gstate.getLogicOp());
962
			break;
963
		case GE_LOGIC_SET:
964
			srcBlend = BlendFactor::ONE;
965
			dstBlend = BlendFactor::ONE;
966
			blendEq = BlendEq::ADD;
967
			WARN_LOG_REPORT_ONCE(d3dLogicOpSet, Log::G3D, "Attempted set for logic op: %x", gstate.getLogicOp());
968
			return true;
969
		}
970
	} else {
971
		// Even if we support hardware logic ops, alpha is handled wrong.
972
		// It's better to override blending for the simple cases.
973
		switch (gstate.getLogicOp()) {
974
		case GE_LOGIC_CLEAR:
975
			srcBlend = BlendFactor::ZERO;
976
			dstBlend = BlendFactor::ZERO;
977
			blendEq = BlendEq::ADD;
978
			return true;
979
		case GE_LOGIC_NOOP:
980
			srcBlend = BlendFactor::ZERO;
981
			dstBlend = BlendFactor::ONE;
982
			blendEq = BlendEq::ADD;
983
			return true;
984

985
		default:
986
			// Let's hope hardware gets it right.
987
			return false;
988
		}
989
	}
990
	return false;
991
}
992

993
// Choose the shader part of the above logic op fallback simulation.
994
SimulateLogicOpType SimulateLogicOpShaderTypeIfNeeded() {
995
	if (!gstate_c.Use(GPU_USE_LOGIC_OP) && gstate.isLogicOpEnabled()) {
996
		switch (gstate.getLogicOp()) {
997
		case GE_LOGIC_COPY_INVERTED:
998
		case GE_LOGIC_AND_INVERTED:
999
		case GE_LOGIC_OR_INVERTED:
1000
		case GE_LOGIC_NOR:
1001
		case GE_LOGIC_NAND:
1002
		case GE_LOGIC_EQUIV:
1003
			return LOGICOPTYPE_INVERT;
1004
		case GE_LOGIC_INVERTED:
1005
			return LOGICOPTYPE_ONE;
1006
		case GE_LOGIC_SET:
1007
			return LOGICOPTYPE_ONE;
1008
		default:
1009
			return LOGICOPTYPE_NORMAL;
1010
		}
1011
	}
1012
	return LOGICOPTYPE_NORMAL;
1013
}
1014

1015
// Sets up blendState for draws where the game's own blending is not applied.
// When replaceAlphaWithStencil is REPLACE_ALPHA_YES, the stencil op type decides whether the alpha
// channel needs an add/subtract/reverse-subtract "blend" to emulate INCR/DECR/INVERT; the color
// channel always uses src * ONE + dst * ZERO. For all other stencil types blending is disabled.
void ApplyStencilReplaceAndLogicOpIgnoreBlend(ReplaceAlphaType replaceAlphaWithStencil, GenericBlendState &blendState) {
	StencilValueType stencilType = STENCIL_VALUE_KEEP;
	if (replaceAlphaWithStencil == REPLACE_ALPHA_YES) {
		stencilType = ReplaceAlphaWithStencilType();
	}

	// Normally, we would add src + 0 with blending off, but the logic op may have us do differently.
	// NOTE(review): nothing in this function modifies these three values, so as written the
	// else-branch under "default" below cannot be reached - confirm whether a logic op
	// simulation call is intended here.
	BlendFactor srcBlend = BlendFactor::ONE;
	BlendFactor dstBlend = BlendFactor::ZERO;
	BlendEq blendEq = BlendEq::ADD;

	// We're not blending, but we may still want to "blend" for stencil.
	// This is only useful for INCR/DECR/INVERT.  Others can write directly.
	switch (stencilType) {
	case STENCIL_VALUE_INCR_4:
	case STENCIL_VALUE_INCR_8:
		// We'll add the incremented value output by the shader.
		blendState.blendEnabled = true;
		blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ONE);
		blendState.setEquation(blendEq, BlendEq::ADD);
		break;

	case STENCIL_VALUE_DECR_4:
	case STENCIL_VALUE_DECR_8:
		// We'll subtract the incremented value output by the shader.
		blendState.blendEnabled = true;
		blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ONE);
		blendState.setEquation(blendEq, BlendEq::SUBTRACT);
		break;

	case STENCIL_VALUE_INVERT:
		// The shader will output one, and reverse subtracting will essentially invert.
		blendState.blendEnabled = true;
		blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ONE);
		blendState.setEquation(blendEq, BlendEq::REVERSE_SUBTRACT);
		break;

	default:
		if (srcBlend == BlendFactor::ONE && dstBlend == BlendFactor::ZERO && blendEq == BlendEq::ADD) {
			blendState.blendEnabled = false;
		} else {
			blendState.blendEnabled = true;
			blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ZERO);
			blendState.setEquation(blendEq, BlendEq::ADD);
		}
		break;
	}
}
1063

1064
// If we can we emulate the colorMask by simply toggling the full R G B A masks offered
1065
// by modern hardware, we do that. This is 99.9% of the time.
1066
// When that's not enough, we fall back on a technique similar to shader blending,
1067
// we read from the framebuffer (or a copy of it).
1068
// We also prepare uniformMask so that if doing this in the shader gets forced-on,
1069
// we have the right mask already.
1070
static void ConvertMaskState(GenericMaskState &maskState, bool shaderBitOpsSupported) {
1071
	if (gstate_c.blueToAlpha) {
1072
		maskState.applyFramebufferRead = false;
1073
		maskState.uniformMask = 0xFF000000;
1074
		maskState.channelMask = 0x8;
1075
		return;
1076
	}
1077

1078
	// Invert to convert masks from the PSP's format where 1 is don't draw to PC where 1 is draw.
1079
	uint32_t colorMask = ~((gstate.pmskc & 0xFFFFFF) | (gstate.pmska << 24));
1080

1081
	maskState.uniformMask = colorMask;
1082
	maskState.applyFramebufferRead = false;
1083
	maskState.channelMask = 0;
1084
	for (int i = 0; i < 4; i++) {
1085
		uint32_t channelMask = (colorMask >> (i * 8)) & 0xFF;
1086
		switch (channelMask) {
1087
		case 0x0:
1088
			break;
1089
		case 0xFF:
1090
			maskState.channelMask |= 1 << i;
1091
			break;
1092
		default:
1093
			if (shaderBitOpsSupported && PSP_CoreParameter().compat.flags().ShaderColorBitmask) {
1094
				// Shaders can emulate masking accurately. Let's make use of that.
1095
				maskState.applyFramebufferRead = true;
1096
				maskState.channelMask |= 1 << i;
1097
			} else {
1098
				// Use the old inaccurate heuristic.
1099
				if (channelMask >= 128) {
1100
					maskState.channelMask |= 1 << i;
1101
				}
1102
			}
1103
		}
1104
	}
1105

1106
	// Let's not write to alpha if stencil isn't enabled.
1107
	// Also if the stencil type is set to KEEP, we shouldn't write to the stencil/alpha channel.
1108
	if (IsStencilTestOutputDisabled() || ReplaceAlphaWithStencilType() == STENCIL_VALUE_KEEP) {
1109
		maskState.channelMask &= ~8;
1110
		maskState.uniformMask &= ~0xFF000000;
1111
	}
1112

1113
	// For 5551, only the top alpha bit matters.  We might even want to swizzle 4444.
1114
	// Alpha should correctly read as 255 from a 5551 texture.
1115
	if (gstate.FrameBufFormat() == GE_FORMAT_5551) {
1116
		if ((maskState.uniformMask & 0x80000000) != 0)
1117
			maskState.uniformMask |= 0xFF000000;
1118
		else
1119
			maskState.uniformMask &= ~0xFF000000;
1120
	}
1121
}
1122

1123
// Called even if AlphaBlendEnable == false - it also deals with stencil-related blend state.
1124
static void ConvertBlendState(GenericBlendState &blendState, bool forceReplaceBlend) {
1125
	// Blending is a bit complex to emulate.  This is due to several reasons:
1126
	//
1127
	//  * Doubled blend modes (src, dst, inversed) aren't supported in OpenGL.
1128
	//    If possible, we double the src color or src alpha in the shader to account for these.
1129
	//    These may clip incorrectly, so we avoid unfortunately.
1130
	//  * OpenGL only has one arbitrary fixed color.  We premultiply the other in the shader.
1131
	//  * The written output alpha should actually be the stencil value.  Alpha is not written.
1132
	//
1133
	// If we can't apply blending, we make a copy of the framebuffer and do it manually.
1134

1135
	blendState.applyFramebufferRead = false;
1136
	blendState.dirtyShaderBlendFixValues = false;
1137
	blendState.useBlendColor = false;
1138

1139
	ReplaceBlendType replaceBlend = ReplaceBlendWithShader(gstate_c.framebufFormat);
1140
	if (forceReplaceBlend) {
1141
		// Enforce blend replacement if enabled. If not, shouldn't do anything of course.
1142
		replaceBlend = gstate.isAlphaBlendEnabled() ? REPLACE_BLEND_READ_FRAMEBUFFER : REPLACE_BLEND_NO;
1143
	}
1144

1145
	blendState.replaceBlend = replaceBlend;
1146

1147
	blendState.simulateLogicOpType = SimulateLogicOpShaderTypeIfNeeded();
1148

1149
	ReplaceAlphaType replaceAlphaWithStencil = ReplaceAlphaWithStencil(replaceBlend);
1150
	blendState.replaceAlphaWithStencil = replaceAlphaWithStencil;
1151

1152
	bool usePreSrc = false;
1153

1154
	bool blueToAlpha = false;
1155

1156
	switch (replaceBlend) {
1157
	case REPLACE_BLEND_NO:
1158
		// We may still want to do something about stencil -> alpha.
1159
		ApplyStencilReplaceAndLogicOpIgnoreBlend(replaceAlphaWithStencil, blendState);
1160

1161
		if (forceReplaceBlend) {
1162
			// If this is true, the logic and mask replacements will be applied, at least. In that case,
1163
			// we should not apply any logic op simulation.
1164
			blendState.simulateLogicOpType = LOGICOPTYPE_NORMAL;
1165
		}
1166
		return;
1167

1168
	case REPLACE_BLEND_BLUE_TO_ALPHA:
1169
		blueToAlpha = true;
1170
		blendState.blendEnabled = gstate.isAlphaBlendEnabled();
1171
		// We'll later convert the color blend to blend in the alpha channel.
1172
		break;
1173

1174
	case REPLACE_BLEND_READ_FRAMEBUFFER:
1175
		blendState.blendEnabled = true;
1176
		blendState.applyFramebufferRead = true;
1177
		blendState.simulateLogicOpType = LOGICOPTYPE_NORMAL;
1178
		break;
1179

1180
	case REPLACE_BLEND_PRE_SRC:
1181
	case REPLACE_BLEND_PRE_SRC_2X_ALPHA:
1182
		blendState.blendEnabled = true;
1183
		usePreSrc = true;
1184
		break;
1185

1186
	case REPLACE_BLEND_STANDARD:
1187
	case REPLACE_BLEND_2X_ALPHA:
1188
	case REPLACE_BLEND_2X_SRC:
1189
		blendState.blendEnabled = true;
1190
		break;
1191
	}
1192

1193
	const GEBlendMode blendFuncEq = gstate.getBlendEq();
1194
	GEBlendSrcFactor blendFuncA = gstate.getBlendFuncA();
1195
	GEBlendDstFactor blendFuncB = gstate.getBlendFuncB();
1196
	const u32 fixA = gstate.getFixA();
1197
	const u32 fixB = gstate.getFixB();
1198

1199
	if (blendFuncA > GE_SRCBLEND_FIXA)
1200
		blendFuncA = GE_SRCBLEND_FIXA;
1201
	if (blendFuncB > GE_DSTBLEND_FIXB)
1202
		blendFuncB = GE_DSTBLEND_FIXB;
1203

1204
	int constantAlpha = 255;
1205
	BlendFactor constantAlphaGL = BlendFactor::ONE;
1206
	if (!IsStencilTestOutputDisabled() && replaceAlphaWithStencil == REPLACE_ALPHA_NO) {
1207
		switch (ReplaceAlphaWithStencilType()) {
1208
		case STENCIL_VALUE_UNIFORM:
1209
			constantAlpha = gstate.getStencilTestRef();
1210
			break;
1211

1212
		case STENCIL_VALUE_INCR_4:
1213
		case STENCIL_VALUE_DECR_4:
1214
			constantAlpha = 16;
1215
			break;
1216

1217
		case STENCIL_VALUE_INCR_8:
1218
		case STENCIL_VALUE_DECR_8:
1219
			constantAlpha = 1;
1220
			break;
1221

1222
		default:
1223
			break;
1224
		}
1225

1226
		// Otherwise it will stay GL_ONE.
1227
		if (constantAlpha <= 0) {
1228
			constantAlphaGL = BlendFactor::ZERO;
1229
		} else if (constantAlpha < 255) {
1230
			constantAlphaGL = BlendFactor::CONSTANT_ALPHA;
1231
		}
1232
	}
1233

1234
	// Shortcut by using GL_ONE where possible, no need to set blendcolor
1235
	bool approxFuncA = false;
1236
	BlendFactor glBlendFuncA = blendFuncA == GE_SRCBLEND_FIXA ? blendColor2Func(fixA, approxFuncA) : genericALookup[blendFuncA];
1237
	bool approxFuncB = false;
1238
	BlendFactor glBlendFuncB = blendFuncB == GE_DSTBLEND_FIXB ? blendColor2Func(fixB, approxFuncB) : genericBLookup[blendFuncB];
1239

1240
	if (gstate_c.framebufFormat == GE_FORMAT_565) {
1241
		if (blendFuncA == GE_SRCBLEND_DSTALPHA || blendFuncA == GE_SRCBLEND_DOUBLEDSTALPHA) {
1242
			glBlendFuncA = BlendFactor::ZERO;
1243
		}
1244
		if (blendFuncA == GE_SRCBLEND_INVDSTALPHA || blendFuncA == GE_SRCBLEND_DOUBLEINVDSTALPHA) {
1245
			glBlendFuncA = BlendFactor::ONE;
1246
		}
1247
		if (blendFuncB == GE_DSTBLEND_DSTALPHA || blendFuncB == GE_DSTBLEND_DOUBLEDSTALPHA) {
1248
			glBlendFuncB = BlendFactor::ZERO;
1249
		}
1250
		if (blendFuncB == GE_DSTBLEND_INVDSTALPHA || blendFuncB == GE_DSTBLEND_DOUBLEINVDSTALPHA) {
1251
			glBlendFuncB = BlendFactor::ONE;
1252
		}
1253
	}
1254

1255
	if (usePreSrc) {
1256
		glBlendFuncA = BlendFactor::ONE;
1257
		// Need to pull in the fixed color. TODO: If it hasn't changed, no need to dirty.
1258
		if (blendFuncA == GE_SRCBLEND_FIXA) {
1259
			blendState.dirtyShaderBlendFixValues = true;
1260
		}
1261
	}
1262

1263
	if (replaceAlphaWithStencil == REPLACE_ALPHA_DUALSOURCE) {
1264
		glBlendFuncA = toDualSource(glBlendFuncA);
1265
		glBlendFuncB = toDualSource(glBlendFuncB);
1266
	}
1267

1268
	if (blendFuncA == GE_SRCBLEND_FIXA || blendFuncB == GE_DSTBLEND_FIXB) {
1269
		if (glBlendFuncA == BlendFactor::INVALID && glBlendFuncB != BlendFactor::INVALID) {
1270
			// Can use blendcolor trivially.
1271
			blendState.setBlendColor(fixA, constantAlpha);
1272
			glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1273
		} else if (glBlendFuncA != BlendFactor::INVALID && glBlendFuncB == BlendFactor::INVALID) {
1274
			// Can use blendcolor trivially.
1275
			blendState.setBlendColor(fixB, constantAlpha);
1276
			glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1277
		} else if (glBlendFuncA == BlendFactor::INVALID && glBlendFuncB == BlendFactor::INVALID) {
1278
			if (blendColorSimilar(fixA, 0xFFFFFF ^ fixB)) {
1279
				glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1280
				glBlendFuncB = BlendFactor::ONE_MINUS_CONSTANT_COLOR;
1281
				blendState.setBlendColor(fixA, constantAlpha);
1282
			} else if (blendColorSimilar(fixA, fixB)) {
1283
				glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1284
				glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1285
				blendState.setBlendColor(fixA, constantAlpha);
1286
			} else {
1287
				DEBUG_LOG(Log::G3D, "ERROR INVALID blendcolorstate: FixA=%06x FixB=%06x FuncA=%i FuncB=%i", fixA, fixB, blendFuncA, blendFuncB);
1288
				// Let's approximate, at least.  Close is better than totally off.
1289
				const bool nearZeroA = blendColorSimilar(fixA, 0, 64);
1290
				const bool nearZeroB = blendColorSimilar(fixB, 0, 64);
1291
				if (nearZeroA || blendColorSimilar(fixA, 0xFFFFFF, 64)) {
1292
					glBlendFuncA = nearZeroA ? BlendFactor::ZERO : BlendFactor::ONE;
1293
					glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1294
					blendState.setBlendColor(fixB, constantAlpha);
1295
				} else {
1296
					// We need to pick something.  Let's go with A as the fixed color.
1297
					glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1298
					glBlendFuncB = nearZeroB ? BlendFactor::ZERO : BlendFactor::ONE;
1299
					blendState.setBlendColor(fixA, constantAlpha);
1300
				}
1301
			}
1302
		} else {
1303
			// We optimized both, but that's probably not necessary, so let's pick one to be constant.
1304
			if (blendFuncA == GE_SRCBLEND_FIXA && !usePreSrc && approxFuncA) {
1305
				glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1306
				blendState.setBlendColor(fixA, constantAlpha);
1307
			} else if (approxFuncB) {
1308
				glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1309
				blendState.setBlendColor(fixB, constantAlpha);
1310
			} else {
1311
				if (constantAlphaGL == BlendFactor::CONSTANT_ALPHA) {
1312
					blendState.defaultBlendColor(constantAlpha);
1313
				}
1314
			}
1315
		}
1316
	} else {
1317
		if (constantAlphaGL == BlendFactor::CONSTANT_ALPHA) {
1318
			blendState.defaultBlendColor(constantAlpha);
1319
		}
1320
	}
1321

1322
	// Some Android devices (especially old Mali, it seems) composite badly if there's alpha in the backbuffer.
1323
	// So in non-buffered rendering, we will simply consider the dest alpha to be zero in blending equations.
1324
#ifdef __ANDROID__
1325
	if (g_Config.bSkipBufferEffects) {
1326
		if (glBlendFuncA == BlendFactor::DST_ALPHA) glBlendFuncA = BlendFactor::ZERO;
1327
		if (glBlendFuncB == BlendFactor::DST_ALPHA) glBlendFuncB = BlendFactor::ZERO;
1328
		if (glBlendFuncA == BlendFactor::ONE_MINUS_DST_ALPHA) glBlendFuncA = BlendFactor::ONE;
1329
		if (glBlendFuncB == BlendFactor::ONE_MINUS_DST_ALPHA) glBlendFuncB = BlendFactor::ONE;
1330
	}
1331
#endif
1332

1333
	// At this point, through all paths above, glBlendFuncA and glBlendFuncB will be set right somehow.
1334
	BlendEq colorEq;
1335
	if (gstate_c.Use(GPU_USE_BLEND_MINMAX)) {
1336
		colorEq = eqLookup[blendFuncEq];
1337
	} else {
1338
		colorEq = eqLookupNoMinMax[blendFuncEq];
1339
	}
1340

1341
	// The stencil-to-alpha in fragment shader doesn't apply here (blending is enabled), and we shouldn't
1342
	// do any blending in the alpha channel as that doesn't seem to happen on PSP.  So, we attempt to
1343
	// apply the stencil to the alpha, since that's what should be stored.
1344
	BlendEq alphaEq = BlendEq::ADD;
1345
	if (replaceAlphaWithStencil != REPLACE_ALPHA_NO) {
1346
		// Let the fragment shader take care of it.
1347
		switch (ReplaceAlphaWithStencilType()) {
1348
		case STENCIL_VALUE_INCR_4:
1349
		case STENCIL_VALUE_INCR_8:
1350
			// We'll add the increment value.
1351
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1352
			break;
1353

1354
		case STENCIL_VALUE_DECR_4:
1355
		case STENCIL_VALUE_DECR_8:
1356
			// Like add with a small value, but subtracting.
1357
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1358
			alphaEq = BlendEq::SUBTRACT;
1359
			break;
1360

1361
		case STENCIL_VALUE_INVERT:
1362
			// This will subtract by one, effectively inverting the bits.
1363
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1364
			alphaEq = BlendEq::REVERSE_SUBTRACT;
1365
			break;
1366

1367
		default:
1368
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ZERO);
1369
			break;
1370
		}
1371
	} else if (!IsStencilTestOutputDisabled()) {
1372
		StencilValueType stencilValue = ReplaceAlphaWithStencilType();
1373
		if (stencilValue == STENCIL_VALUE_UNIFORM && constantAlpha == 0x00) {
1374
			stencilValue = STENCIL_VALUE_ZERO;
1375
		} else if (stencilValue == STENCIL_VALUE_UNIFORM && constantAlpha == 0xFF) {
1376
			stencilValue = STENCIL_VALUE_ONE;
1377
		}
1378
		switch (stencilValue) {
1379
		case STENCIL_VALUE_KEEP:
1380
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ZERO, BlendFactor::ONE);
1381
			break;
1382
		case STENCIL_VALUE_ONE:
1383
			// This won't give one but it's our best shot...
1384
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1385
			break;
1386
		case STENCIL_VALUE_ZERO:
1387
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ZERO, BlendFactor::ZERO);
1388
			break;
1389
		case STENCIL_VALUE_UNIFORM:
1390
			// This won't give a correct value (it multiplies) but it may be better than random values.
1391
			blendState.setFactors(glBlendFuncA, glBlendFuncB, constantAlphaGL, BlendFactor::ZERO);
1392
			break;
1393
		case STENCIL_VALUE_INCR_4:
1394
		case STENCIL_VALUE_INCR_8:
1395
			// This won't give a correct value always, but it will try to increase at least.
1396
			blendState.setFactors(glBlendFuncA, glBlendFuncB, constantAlphaGL, BlendFactor::ONE);
1397
			break;
1398
		case STENCIL_VALUE_DECR_4:
1399
		case STENCIL_VALUE_DECR_8:
1400
			// This won't give a correct value always, but it will try to decrease at least.
1401
			blendState.setFactors(glBlendFuncA, glBlendFuncB, constantAlphaGL, BlendFactor::ONE);
1402
			alphaEq = BlendEq::SUBTRACT;
1403
			break;
1404
		case STENCIL_VALUE_INVERT:
1405
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1406
			// If the output alpha is near 1, this will basically invert.  It's our best shot.
1407
			alphaEq = BlendEq::REVERSE_SUBTRACT;
1408
			break;
1409
		}
1410
	} else if (blueToAlpha) {
1411
		blendState.setFactors(BlendFactor::ZERO, BlendFactor::ZERO, BlendFactor::ONE, glBlendFuncB);
1412
		blendState.setEquation(BlendEq::ADD, colorEq);
1413
		return;
1414
	} else {
1415
		// Retain the existing value when stencil testing is off.
1416
		blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ZERO, BlendFactor::ONE);
1417
	}
1418

1419
	blendState.setEquation(colorEq, alphaEq);
1420
}
1421

1422
// Decides how the current GE logic op is realized and stores the decision in logicOpState.
//
// logicSupported:        hardware logic ops may be used.
// shaderBitOpsSupported: the shader can emulate logic ops by reading the framebuffer.
// forceApplyFramebuffer: a framebuffer read in the shader is being forced; it is passed straight
//                        through to applyFramebufferRead when no logic op is needed.
//
// Preference order once a real logic op is active: forced shader emulation, hardware logic op,
// shader emulation, and finally nothing (leaving it to the blend-based simulation fallback).
static void ConvertLogicOpState(GenericLogicState &logicOpState, bool logicSupported, bool shaderBitOpsSupported, bool forceApplyFramebuffer) {
	// TODO: We can get more detailed with checks here. Some logic ops don't involve the destination at all.
	// Several can be trivially supported even without any bitwise logic.
	if (!gstate.isLogicOpEnabled() || gstate.getLogicOp() == GE_LOGIC_COPY) {
		// No matter what, don't need to do anything.
		logicOpState.logicOpEnabled = false;
		logicOpState.logicOp = GE_LOGIC_COPY;
		logicOpState.applyFramebufferRead = forceApplyFramebuffer;
		return;
	}

	// From here on a logic op other than COPY is known to be enabled.
	if (forceApplyFramebuffer && shaderBitOpsSupported) {
		// We have to emulate logic ops in the shader.
		logicOpState.logicOpEnabled = false;  // Don't use any hardware logic op, supported or not.
		logicOpState.applyFramebufferRead = true;
		logicOpState.logicOp = gstate.getLogicOp();
	} else if (logicSupported) {
		// We can use hardware logic ops. The "logic op disabled" case was already handled
		// by the early return above, so no further check is needed here.
		logicOpState.applyFramebufferRead = false;
		logicOpState.logicOpEnabled = true;
		logicOpState.logicOp = gstate.getLogicOp();
	} else if (shaderBitOpsSupported) {
		// D3D11 and some OpenGL versions will end up here.
		// Logic ops not supported, bitops supported. Let's punt to the shader.
		// We should possibly always do this and never use the hardware ops, since they'll mishandle the alpha channel..
		logicOpState.logicOpEnabled = false;  // Don't use any hardware logic op, supported or not.
		logicOpState.applyFramebufferRead = true;
		logicOpState.logicOp = gstate.getLogicOp();
	} else {
		// In this case, the SIMULATE fallback should kick in.
		// Need to make sure this is checking for the same things though...
		logicOpState.logicOpEnabled = false;
		logicOpState.logicOp = GE_LOGIC_COPY;
		logicOpState.applyFramebufferRead = false;
	}
}
1463

1464
// Rewrites the stencil test and stencil ops in state for a 5551 framebuffer, where the buffer
// value is treated as either zero or non-zero (255). Comparisons are reduced to
// EQUAL/NOTEQUAL against 0 or to ALWAYS/NEVER where that is safe, and some ops are replaced by
// equivalents that behave better with a single effective stencil bit.
static void ConvertStencilFunc5551(GenericStencilFuncState &state) {
	// Flaws:
	// - INVERT should convert 1, 5, 0xFF to 0.  Currently it won't always.
	// - INCR twice shouldn't change the value.
	// - REPLACE should write 0 for 0x00 - 0x7F, and non-zero for 0x80 - 0xFF.
	// - Write mask may need double checking, but likely only the top bit matters.

	// usesRef:   some stencil op is REPLACE, so the ref value also ends up being written.
	// maskedRef: the ref as the test sees it.
	// usedRef:   the ref as it would effectively be written (only the top bit counts).
	const bool usesRef = state.sFail == GE_STENCILOP_REPLACE || state.zFail == GE_STENCILOP_REPLACE || state.zPass == GE_STENCILOP_REPLACE;
	const u8 maskedRef = state.testRef & state.testMask;
	const u8 usedRef = (state.testRef & 0x80) != 0 ? 0xFF : 0x00;

	auto rewriteFunc = [&](GEComparison func, u8 ref) {
		// We can only safely rewrite if it doesn't use the ref, or if the ref is the same.
		if (!usesRef || usedRef == ref) {
			state.testFunc = func;
			state.testRef = ref;
			state.testMask = 0xFF;
		}
	};
	auto rewriteRef = [&](bool always) {
		state.testFunc = always ? GE_COMP_ALWAYS : GE_COMP_NEVER;
		if (usesRef) {
			// Rewrite the ref (for REPLACE) to 0x00 or 0xFF (the "best" values) if safe.
			// This will only be called if the test doesn't need the ref.
			state.testRef = usedRef;
			// Nuke the mask as well, since this is always/never, just for consistency.
			state.testMask = 0xFF;
		} else {
			// Not used, so let's make the ref 0xFF which is a useful value later.
			state.testRef = 0xFF;
			state.testMask = 0xFF;
		}
	};

	// For 5551, we treat any non-zero value in the buffer as 255.  Only zero is treated as zero.
	// See: https://github.com/hrydgard/ppsspp/pull/4150#issuecomment-26211193
	switch (state.testFunc) {
	case GE_COMP_NEVER:
	case GE_COMP_ALWAYS:
		// Fine as is.
		rewriteRef(state.testFunc == GE_COMP_ALWAYS);
		break;
	case GE_COMP_EQUAL: // maskedRef == maskedBuffer
		if (maskedRef == 0) {
			// Remove any mask, we might have bits less than 255 but that should not match.
			rewriteFunc(GE_COMP_EQUAL, 0);
		} else if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// Equal to 255, for our buffer, means not equal to zero.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		} else {
			// This should never pass, regardless of buffer value.  Only 0 and 255 are directly equal.
			rewriteRef(false);
		}
		break;
	case GE_COMP_NOTEQUAL: // maskedRef != maskedBuffer
		if (maskedRef == 0) {
			// Remove the mask, since our buffer might not be exactly 255.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		} else if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// The only value != 255 is 0, in our buffer.
			rewriteFunc(GE_COMP_EQUAL, 0);
		} else {
			// Every other value evaluates as not equal, always.
			rewriteRef(true);
		}
		break;
	case GE_COMP_LESS: // maskedRef < maskedBuffer
		if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// No possible value is less than 255.
			rewriteRef(false);
		} else {
			// "0 < (0 or 255)" and "254 < (0 or 255)" can only work for non zero.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		}
		break;
	case GE_COMP_LEQUAL: // maskedRef <= maskedBuffer
		if (maskedRef == 0) {
			// 0 is <= every possible value.
			rewriteRef(true);
		} else {
			// "1 <= (0 or 255)" and "255 <= (0 or 255)" simply mean, anything but zero.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		}
		break;
	case GE_COMP_GREATER: // maskedRef > maskedBuffer
		if (maskedRef > 0) {
			// "1 > (0 or 255)" and "255 > (0 or 255)" can only match 0.
			rewriteFunc(GE_COMP_EQUAL, 0);
		} else {
			// 0 is never greater than any possible value.
			rewriteRef(false);
		}
		break;
	case GE_COMP_GEQUAL: // maskedRef >= maskedBuffer
		if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// 255 is >= every possible value.
			rewriteRef(true);
		} else {
			// "0 >= (0 or 255)" and "254 >= (0 or 255)" are the same, equal to zero.
			rewriteFunc(GE_COMP_EQUAL, 0);
		}
		break;
	}

	// Replaces every occurrence of one stencil op with another, across all three outcomes.
	auto rewriteOps = [&](GEStencilOp from, GEStencilOp to) {
		if (state.sFail == from)
			state.sFail = to;
		if (state.zFail == from)
			state.zFail = to;
		if (state.zPass == from)
			state.zPass = to;
	};

	// Decrement always zeros, so let's rewrite those to be safe (even if it's not 1.)
	rewriteOps(GE_STENCILOP_DECR, GE_STENCILOP_ZERO);

	if (state.testFunc == GE_COMP_NOTEQUAL && state.testRef == 0 && state.testMask != 0) {
		// If it's != 0 (as optimized above), then we can rewrite INVERT to ZERO.
		// With 1 bit of stencil, INVERT != 0 can only make it 0.
		rewriteOps(GE_STENCILOP_INVERT, GE_STENCILOP_ZERO);
	}
	if (state.testFunc == GE_COMP_EQUAL && state.testRef == 0 && state.testMask != 0) {
		// If it's == 0 (as optimized above), then we can rewrite INCR to INVERT.
		// Otherwise we get 1, which we mostly handle, but won't INVERT correctly.
		rewriteOps(GE_STENCILOP_INCR, GE_STENCILOP_INVERT);
	}
	if (!usesRef && state.testRef == 0xFF) {
		// Safe to use REPLACE instead of INCR.
		rewriteOps(GE_STENCILOP_INCR, GE_STENCILOP_REPLACE);
	}
}
1595

1596
// Reduces the stencil write mask to an all-or-nothing value for 5551 framebuffers:
// a mask of 0x80 or above becomes 0xff, anything lower becomes 0x00.
static void ConvertStencilMask5551(GenericStencilFuncState &state) {
	if (state.writeMask >= 0x80) {
		state.writeMask = 0xff;
	} else {
		state.writeMask = 0x00;
	}
}
1599

1600
// Fills `state` from the current GE stencil registers (gstate), then adjusts the
// result for the active framebuffer format (gstate_c.framebufFormat).
//
// When the stencil test is disabled only `writeMask` and `enabled` are filled in
// (the write mask is still needed since it is used for clears too).
void ConvertStencilFuncState(GenericStencilFuncState &state) {
	// The PSP's mask lists the bits NOT to write, so invert it to get a conventional write mask.
	// This is computed regardless of whether the test is enabled.
	state.writeMask = (~gstate.getStencilWriteMask()) & 0xFF;
	state.enabled = gstate.isStencilTestEnabled();

	if (!state.enabled) {
		// Only the mask matters here; 5551 still needs its mask collapsed.
		if (gstate_c.framebufFormat == GE_FORMAT_5551) {
			ConvertStencilMask5551(state);
		}
		return;
	}

	// Stencil operations for each outcome.
	state.sFail = gstate.getStencilOpSFail();
	state.zFail = gstate.getStencilOpZFail();
	state.zPass = gstate.getStencilOpZPass();

	// Stencil test parameters.
	state.testFunc = gstate.getStencilTestFunction();
	state.testRef = gstate.getStencilTestRef();
	state.testMask = gstate.getStencilTestMask();

	const bool depthTest = gstate.isDepthTestEnabled();
	const bool zFailMatches = (state.sFail == state.zFail) || !depthTest;
	const bool zPassMatches = (state.sFail == state.zPass);
	if (zFailMatches && zPassMatches) {
		// Common case: every reachable outcome applies the same op, i.e. we're writing
		// only to stencil (usually REPLACE/REPLACE/REPLACE.)
		// We want to write stencil to alpha in this case, so switch to ALWAYS if already masked.
		const bool depthWrite = gstate.isDepthWriteEnabled();
		const bool colorMasked = (gstate.getColorMask() & 0x00FFFFFF) == 0x00FFFFFF;
		const bool depthUnwritten = !depthTest || !depthWrite;
		if (colorMasked && depthUnwritten) {
			state.testFunc = GE_COMP_ALWAYS;
		}
	}

	// Format-specific fixups.
	if (gstate_c.framebufFormat == GE_FORMAT_565) {
		// Nothing is written to stencil for 565.
		state.writeMask = 0;
	} else if (gstate_c.framebufFormat == GE_FORMAT_5551) {
		ConvertStencilMask5551(state);
		ConvertStencilFunc5551(state);
	}
	// Otherwise: hard to do anything useful for 4444, and 8888 is fine.
}
1643

1644
void GenericMaskState::Log() {
1645
	WARN_LOG(Log::G3D, "Mask: %08x %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
1646
}
1647

1648
void GenericBlendState::Log() {
1649
	WARN_LOG(Log::G3D, "Blend: hwenable=%d readfb=%d replblend=%d replalpha=%d",
1650
		blendEnabled, applyFramebufferRead, replaceBlend, (int)replaceAlphaWithStencil);
1651
}
1652

1653
void ComputedPipelineState::Convert(bool shaderBitOpsSuppported) {
1654
	// Passing on the previous applyFramebufferRead as forceFrameBuffer read in the next one,
1655
	// thus propagating forward.
1656
	ConvertMaskState(maskState, shaderBitOpsSuppported);
1657
	ConvertLogicOpState(logicState, gstate_c.Use(GPU_USE_LOGIC_OP), shaderBitOpsSuppported, maskState.applyFramebufferRead);
1658
	ConvertBlendState(blendState, logicState.applyFramebufferRead);
1659

1660
	// Note: If the blend state decided it had to use framebuffer reads,
1661
	// we need to make sure that both mask and logic also use it, otherwise things will go wrong.
1662
	if (blendState.applyFramebufferRead || logicState.applyFramebufferRead) {
1663
		maskState.ConvertToShaderBlend();
1664
		logicState.ConvertToShaderBlend();
1665
	} else {
1666
		// If it isn't a read, we may need to change blending to apply the logic op.
1667
		logicState.ApplyToBlendState(blendState);
1668
	}
1669
}
1670

1671
// Tries to express the logic op through the colour blend settings of `blendState`
// via SimulateLogicOpIfNeeded. If that call reports success, blending is ensured to
// be on and this logic state is reset to a disabled plain copy; otherwise nothing changes.
void GenericLogicState::ApplyToBlendState(GenericBlendState &blendState) {
	const bool simulated = SimulateLogicOpIfNeeded(blendState.srcColor, blendState.dstColor, blendState.eqColor);
	if (!simulated) {
		return;
	}

	// The blend state now carries the logic op, so drop it here.
	logicOp = GE_LOGIC_COPY;
	logicOpEnabled = false;

	if (blendState.blendEnabled) {
		return;
	}

	// Blending wasn't turned on, so make sure it is now; the alpha settings are
	// set to ONE / ZERO / ADD.
	blendState.blendEnabled = true;
	blendState.srcAlpha = BlendFactor::ONE;
	blendState.dstAlpha = BlendFactor::ZERO;
	blendState.eqAlpha = BlendEq::ADD;
}
1684

1685
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company