CoCalc -- DrawPixel.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/DrawPixel.cpp
Views: 1401
1
// Copyright (c) 2013- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "ppsspp_config.h"
19
#include <mutex>
20
#include "Common/Common.h"
21
#include "Common/Data/Convert/ColorConv.h"
22
#include "Core/Config.h"
23
#include "GPU/GPUState.h"
24
#include "GPU/Software/BinManager.h"
25
#include "GPU/Software/DrawPixel.h"
26
#include "GPU/Software/FuncId.h"
27
#include "GPU/Software/Rasterizer.h"
28
#include "GPU/Software/SoftGpu.h"
29

30
using namespace Math3D;
31

32
namespace Rasterizer {
33

34
std::mutex jitCacheLock;
35
PixelJitCache *jitCache = nullptr;
36

37
void Init() {
38
	jitCache = new PixelJitCache();
39
}
40

41
void FlushJit() {
42
	jitCache->Flush();
43
}
44

45
void Shutdown() {
46
	delete jitCache;
47
	jitCache = nullptr;
48
}
49

50
bool DescribeCodePtr(const u8 *ptr, std::string &name) {
51
	if (!jitCache->IsInSpace(ptr)) {
52
		return false;
53
	}
54

55
	name = jitCache->DescribeCodePtr(ptr);
56
	return true;
57
}
58

59
// Reads the stencil value of the framebuffer pixel at (x, y), expanded to 8 bits.
// Stencil lives in the alpha bits of the pixel, so its precision depends on fmt.
static inline u8 GetPixelStencil(GEBufferFormat fmt, int fbStride, int x, int y) {
	switch (fmt) {
	case GE_FORMAT_565:
		// No alpha bits: always treated as 0 for comparison purposes.
		return 0;

	case GE_FORMAT_5551:
		// Single bit, expanded to all-ones or all-zeros.
		return (fb.Get16(x, y, fbStride) & 0x8000) != 0 ? 0xFF : 0;

	case GE_FORMAT_4444:
		return Convert4To8(fb.Get16(x, y, fbStride) >> 12);

	default:
		// Everything else is read as a 32-bit pixel with stencil in the top byte.
		return fb.Get32(x, y, fbStride) >> 24;
	}
}
71

72
// Writes only the stencil (alpha) bits of the framebuffer pixel at (x, y).
//
// value is an 8-bit stencil; its top bits are stored according to fmt.
// Bits set in targetWriteMask are write-protected and keep their old contents.
static inline void SetPixelStencil(GEBufferFormat fmt, int fbStride, uint32_t targetWriteMask, int x, int y, u8 value) {
	switch (fmt) {
	case GE_FORMAT_565:
		// No stencil bits to write.
		break;

	case GE_FORMAT_5551:
		// Only one stencil bit; skip the write entirely if it's masked off.
		if ((targetWriteMask & 0x8000) == 0) {
			const u16 updated = (fb.Get16(x, y, fbStride) & ~0x8000) | ((value & 0x80) << 8);
			fb.Set16(x, y, fbStride, updated);
		}
		break;

	case GE_FORMAT_4444:
	{
		// Protect the color bits in addition to whatever the mask protects.
		const u16 keep = targetWriteMask | 0x0FFF;
		u16 updated = fb.Get16(x, y, fbStride) & keep;
		updated |= ((u16)value << 8) & ~keep;
		fb.Set16(x, y, fbStride, updated);
		break;
	}

	default:
	{
		// 32-bit pixel: same idea, stencil is the top byte.
		const u32 keep = targetWriteMask | 0x00FFFFFF;
		u32 updated = fb.Get32(x, y, fbStride) & keep;
		updated |= ((u32)value << 24) & ~keep;
		fb.Set32(x, y, fbStride, updated);
		break;
	}
	}
}
93

94
// Returns the 16-bit value stored at (x, y) in the depth buffer.
static inline u16 GetPixelDepth(int x, int y, int stride) {
	const u16 depth = depthbuf.Get16(x, y, stride);
	return depth;
}
97

98
static inline void SetPixelDepth(int x, int y, int stride, u16 value) {
99
	depthbuf.Set16(x, y, stride, value);
100
}
101

102
// NOTE: These likely aren't endian safe
103
static inline u32 GetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y) {
104
	switch (fmt) {
105
	case GE_FORMAT_565:
106
		// A should be zero for the purposes of alpha blending.
107
		return RGB565ToRGBA8888(fb.Get16(x, y, fbStride)) & 0x00FFFFFF;
108

109
	case GE_FORMAT_5551:
110
		return RGBA5551ToRGBA8888(fb.Get16(x, y, fbStride));
111

112
	case GE_FORMAT_4444:
113
		return RGBA4444ToRGBA8888(fb.Get16(x, y, fbStride));
114

115
	case GE_FORMAT_8888:
116
		return fb.Get32(x, y, fbStride);
117

118
	default:
119
		return 0;
120
	}
121
}
122

123
// Converts an RGBA8888 color to the framebuffer format and stores it at (x, y).
//
// value:           new color, RGBA8888.
// old_value:       color currently in the framebuffer, RGBA8888.
// targetWriteMask: bits (in framebuffer format) that must keep the old contents.
//
// Formats other than the four color formats are ignored.
static inline void SetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y, u32 value, u32 old_value, u32 targetWriteMask) {
	if (fmt == GE_FORMAT_8888) {
		fb.Set32(x, y, fbStride, (value & ~targetWriteMask) | (old_value & targetWriteMask));
		return;
	}

	// The old color is only needed (and only converted) when something is masked.
	const bool masked = targetWriteMask != 0;
	switch (fmt) {
	case GE_FORMAT_565:
		value = RGBA8888ToRGB565(value);
		if (masked)
			old_value = RGBA8888ToRGB565(old_value);
		break;

	case GE_FORMAT_5551:
		value = RGBA8888ToRGBA5551(value);
		if (masked)
			old_value = RGBA8888ToRGBA5551(old_value);
		break;

	case GE_FORMAT_4444:
		value = RGBA8888ToRGBA4444(value);
		if (masked)
			old_value = RGBA8888ToRGBA4444(old_value);
		break;

	default:
		return;
	}

	if (masked) {
		value = (value & ~targetWriteMask) | (old_value & targetWriteMask);
	}
	fb.Set16(x, y, fbStride, value);
}
161

162
static inline bool AlphaTestPassed(const PixelFuncID &pixelID, int alpha) {
163
	const u8 ref = pixelID.alphaTestRef;
164
	if (pixelID.hasAlphaTestMask)
165
		alpha &= pixelID.cached.alphaTestMask;
166

167
	switch (pixelID.AlphaTestFunc()) {
168
	case GE_COMP_NEVER:
169
		return false;
170

171
	case GE_COMP_ALWAYS:
172
		return true;
173

174
	case GE_COMP_EQUAL:
175
		return (alpha == ref);
176

177
	case GE_COMP_NOTEQUAL:
178
		return (alpha != ref);
179

180
	case GE_COMP_LESS:
181
		return (alpha < ref);
182

183
	case GE_COMP_LEQUAL:
184
		return (alpha <= ref);
185

186
	case GE_COMP_GREATER:
187
		return (alpha > ref);
188

189
	case GE_COMP_GEQUAL:
190
		return (alpha >= ref);
191
	}
192
	return true;
193
}
194

195
static inline bool ColorTestPassed(const PixelFuncID &pixelID, const Vec3<int> &color) {
196
	const u32 mask = pixelID.cached.colorTestMask;
197
	const u32 c = color.ToRGB() & mask;
198
	const u32 ref = pixelID.cached.colorTestRef;
199
	switch (pixelID.cached.colorTestFunc) {
200
	case GE_COMP_NEVER:
201
		return false;
202

203
	case GE_COMP_ALWAYS:
204
		return true;
205

206
	case GE_COMP_EQUAL:
207
		return c == ref;
208

209
	case GE_COMP_NOTEQUAL:
210
		return c != ref;
211

212
	default:
213
		return true;
214
	}
215
}
216

217
static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {
218
	if (pixelID.hasStencilTestMask)
219
		stencil &= pixelID.cached.stencilTestMask;
220
	u8 ref = pixelID.stencilTestRef;
221
	switch (pixelID.StencilTestFunc()) {
222
	case GE_COMP_NEVER:
223
		return false;
224

225
	case GE_COMP_ALWAYS:
226
		return true;
227

228
	case GE_COMP_EQUAL:
229
		return ref == stencil;
230

231
	case GE_COMP_NOTEQUAL:
232
		return ref != stencil;
233

234
	case GE_COMP_LESS:
235
		return ref < stencil;
236

237
	case GE_COMP_LEQUAL:
238
		return ref <= stencil;
239

240
	case GE_COMP_GREATER:
241
		return ref > stencil;
242

243
	case GE_COMP_GEQUAL:
244
		return ref >= stencil;
245
	}
246
	return true;
247
}
248

249
// Computes the new 8-bit stencil value for a stencil operation.
//
// fmt:            framebuffer format; decides the step size for INCR / DECR.
// stencilReplace: value used by GE_STENCILOP_REPLACE.
// op:             operation to apply; unknown operations keep the old value.
// old_stencil:    current stencil, already expanded to 8 bits.
static inline u8 ApplyStencilOp(GEBufferFormat fmt, uint8_t stencilReplace, GEStencilOp op, u8 old_stencil) {
	switch (op) {
	case GE_STENCILOP_KEEP:
		return old_stencil;

	case GE_STENCILOP_ZERO:
		return 0;

	case GE_STENCILOP_REPLACE:
		return stencilReplace;

	case GE_STENCILOP_INVERT:
		return ~old_stencil;

	case GE_STENCILOP_INCR:
		// Saturating increment, in units of one stored stencil step.
		if (fmt == GE_FORMAT_8888)
			return old_stencil != 0xFF ? old_stencil + 1 : old_stencil;
		if (fmt == GE_FORMAT_5551)
			return 0xFF;
		if (fmt == GE_FORMAT_4444)
			return old_stencil < 0xF0 ? old_stencil + 0x10 : old_stencil;
		return old_stencil;

	case GE_STENCILOP_DECR:
		// Saturating decrement, in units of one stored stencil step.
		if (fmt == GE_FORMAT_4444)
			return old_stencil >= 0x10 ? old_stencil - 0x10 : old_stencil;
		if (fmt == GE_FORMAT_5551)
			return 0;
		return old_stencil != 0 ? old_stencil - 1 : old_stencil;
	}

	return old_stencil;
}
300

301
// Compares z against the depth buffer value at (x, y) using func.
// Unrecognized comparison functions fail.
static inline bool DepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const u16 stored = GetPixelDepth(x, y, stride);

	switch (func) {
	case GE_COMP_NEVER:    return false;
	case GE_COMP_ALWAYS:   return true;
	case GE_COMP_EQUAL:    return z == stored;
	case GE_COMP_NOTEQUAL: return z != stored;
	case GE_COMP_LESS:     return z < stored;
	case GE_COMP_LEQUAL:   return z <= stored;
	case GE_COMP_GREATER:  return z > stored;
	case GE_COMP_GEQUAL:   return z >= stored;
	default:               return false;
	}
}
333

334
// Externally visible wrapper around the file-local DepthTestPassed().
bool CheckDepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const bool passed = DepthTestPassed(func, x, y, stride, z);
	return passed;
}
337

338
// Applies a logic op between the new color and the framebuffer color.
//
// Only the low 24 bits (RGB) take part in the operation.  The top byte
// (alpha/stencil) of new_color is intentionally always preserved, and an
// unrecognized op leaves new_color unchanged.
static inline u32 ApplyLogicOp(GELogicOp op, u32 old_color, u32 new_color) {
	// Result of the operation; only its RGB bits are used below.
	u32 rgb = new_color;

	switch (op) {
	case GE_LOGIC_CLEAR:         rgb = 0; break;
	case GE_LOGIC_AND:           rgb = new_color & old_color; break;
	case GE_LOGIC_AND_REVERSE:   rgb = new_color & ~old_color; break;
	case GE_LOGIC_COPY:          rgb = new_color; break;
	case GE_LOGIC_AND_INVERTED:  rgb = ~new_color & old_color; break;
	case GE_LOGIC_NOOP:          rgb = old_color; break;
	case GE_LOGIC_XOR:           rgb = new_color ^ old_color; break;
	case GE_LOGIC_OR:            rgb = new_color | old_color; break;
	case GE_LOGIC_NOR:           rgb = ~(new_color | old_color); break;
	case GE_LOGIC_EQUIV:         rgb = ~(new_color ^ old_color); break;
	case GE_LOGIC_INVERTED:      rgb = ~old_color; break;
	case GE_LOGIC_OR_REVERSE:    rgb = new_color | ~old_color; break;
	case GE_LOGIC_COPY_INVERTED: rgb = ~new_color; break;
	case GE_LOGIC_OR_INVERTED:   rgb = ~new_color | old_color; break;
	case GE_LOGIC_NAND:          rgb = ~(new_color & old_color); break;
	case GE_LOGIC_SET:           rgb = 0x00FFFFFF; break;
	}

	// Recombine with the untouched alpha/stencil byte of the incoming color.
	return (rgb & 0x00FFFFFF) | (new_color & 0xFF000000);
}
408

409
// Returns the per-channel factor the source color is multiplied with when blending.
//
// factor: which factor to use.
// source: incoming (source) color, including alpha.
// dst:    framebuffer (destination) color, including alpha.
// fix:    fixed color, used for FIX and any unrecognized factor.
//
// The doubled factors may exceed 255.
static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	case PixelBlendFactor::OTHERCOLOR:
		// For the source side, the "other" color is the destination.
		return dst.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - dst.rgb();

	case PixelBlendFactor::SRCALPHA:
		// Broadcast source alpha to every lane.
#if defined(_M_SSE)
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
#if defined(_M_SSE)
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLESRCALPHA:
		return Vec3<int>::AssignToAll(2 * source.a());

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		return Vec3<int>::AssignToAll(255 - std::min(2 * source.a(), 255));

	case PixelBlendFactor::DOUBLEDSTALPHA:
		return Vec3<int>::AssignToAll(2 * dst.a());

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - std::min(2 * dst.a(), 255));

	case PixelBlendFactor::FIX:
	default:
		// All other source factors (> 10) are treated as FIXA.
		return Vec3<int>::FromRGB(fix);
	}
}
465

466
// Returns the per-channel factor the destination color is multiplied with when blending.
//
// factor: which factor to use.
// source: incoming (source) color, including alpha.
// dst:    framebuffer (destination) color, including alpha.
// fix:    fixed color, used for FIX and any unrecognized factor.
//
// The doubled factors may exceed 255.
static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	case PixelBlendFactor::OTHERCOLOR:
		// For the destination side, the "other" color is the source.
		return source.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - source.rgb();

	case PixelBlendFactor::SRCALPHA:
		// Broadcast source alpha to every lane.
#if defined(_M_SSE)
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
#if defined(_M_SSE)
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLESRCALPHA:
		return Vec3<int>::AssignToAll(2 * source.a());

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		return Vec3<int>::AssignToAll(255 - std::min(2 * source.a(), 255));

	case PixelBlendFactor::DOUBLEDSTALPHA:
		return Vec3<int>::AssignToAll(2 * dst.a());

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - std::min(2 * dst.a(), 255));

	case PixelBlendFactor::FIX:
	default:
		// All other dest factors (> 10) are treated as FIXB.
		return Vec3<int>::FromRGB(fix);
	}
}
522

523
// Removed inline here - it was never chosen to be inlined by the compiler anyway, too complex.
524
static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst) {
525
	// Note: These factors cannot go below 0, but they can go above 255 when doubling.
526
	Vec3<int> srcfactor = GetSourceFactor(pixelID.AlphaBlendSrc(), source, dst, pixelID.cached.alphaBlendSrc);
527
	Vec3<int> dstfactor = GetDestFactor(pixelID.AlphaBlendDst(), source, dst, pixelID.cached.alphaBlendDst);
528

529
	switch (pixelID.AlphaBlendEq()) {
530
	case GE_BLENDMODE_MUL_AND_ADD:
531
	{
532
#if defined(_M_SSE)
533
		// We switch to 16 bit to use mulhi, and we use 4 bits of decimal to make the 16 bit shift free.
534
		const __m128i half = _mm_set1_epi16(1 << 3);
535

536
		const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
537
		const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
538
		const __m128i s = _mm_mulhi_epi16(srgb, sf);
539

540
		const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
541
		const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
542
		const __m128i d = _mm_mulhi_epi16(drgb, df);
543

544
		return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
545
#elif PPSSPP_ARCH(ARM64_NEON)
546
		const int32x4_t half = vdupq_n_s32(1);
547

548
		const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
549
		const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
550
		const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);
551

552
		const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
553
		const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
554
		const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);
555

556
		return Vec3<int>(vaddq_s32(s, d));
557
#else
558
		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
559
		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
560
		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
561
		return lhs + rhs;
562
#endif
563
	}
564

565
	case GE_BLENDMODE_MUL_AND_SUBTRACT:
566
	{
567
#if defined(_M_SSE)
568
		const __m128i half = _mm_set1_epi16(1 << 3);
569

570
		const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
571
		const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
572
		const __m128i s = _mm_mulhi_epi16(srgb, sf);
573

574
		const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
575
		const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
576
		const __m128i d = _mm_mulhi_epi16(drgb, df);
577

578
		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
579
#elif PPSSPP_ARCH(ARM64_NEON)
580
		const int32x4_t half = vdupq_n_s32(1);
581

582
		const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
583
		const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
584
		const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);
585

586
		const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
587
		const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
588
		const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);
589

590
		return Vec3<int>(vqsubq_s32(s, d));
591
#else
592
		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
593
		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
594
		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
595
		return lhs - rhs;
596
#endif
597
	}
598

599
	case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
600
	{
601
#if defined(_M_SSE)
602
		const __m128i half = _mm_set1_epi16(1 << 3);
603

604
		const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
605
		const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
606
		const __m128i s = _mm_mulhi_epi16(srgb, sf);
607

608
		const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
609
		const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
610
		const __m128i d = _mm_mulhi_epi16(drgb, df);
611

612
		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
613
#elif PPSSPP_ARCH(ARM64_NEON)
614
		const int32x4_t half = vdupq_n_s32(1);
615

616
		const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
617
		const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
618
		const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);
619

620
		const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
621
		const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
622
		const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);
623

624
		return Vec3<int>(vqsubq_s32(d, s));
625
#else
626
		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
627
		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
628
		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
629
		return rhs - lhs;
630
#endif
631
	}
632

633
	case GE_BLENDMODE_MIN:
634
#if PPSSPP_ARCH(ARM64_NEON)
635
		return Vec3<int>(vminq_s32(source.ivec, dst.ivec));
636
#else
637
		return Vec3<int>(std::min(source.r(), dst.r()),
638
			std::min(source.g(), dst.g()),
639
			std::min(source.b(), dst.b()));
640
#endif
641

642
	case GE_BLENDMODE_MAX:
643
#if PPSSPP_ARCH(ARM64_NEON)
644
		return Vec3<int>(vmaxq_s32(source.ivec, dst.ivec));
645
#else
646
		return Vec3<int>(std::max(source.r(), dst.r()),
647
			std::max(source.g(), dst.g()),
648
			std::max(source.b(), dst.b()));
649
#endif
650

651
	case GE_BLENDMODE_ABSDIFF:
652
#if PPSSPP_ARCH(ARM64_NEON)
653
		return Vec3<int>(vabdq_s32(source.ivec, dst.ivec));
654
#else
655
		return Vec3<int>(::abs(source.r() - dst.r()),
656
			::abs(source.g() - dst.g()),
657
			::abs(source.b() - dst.b()));
658
#endif
659

660
	default:
661
		return source.rgb();
662
	}
663
}
664

665
template <bool clearMode, GEBufferFormat fbFormat>
666
void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg color_in, const PixelFuncID &pixelID) {
667
	Vec4<int> prim_color = Vec4<int>(color_in).Clamp(0, 255);
668
	// Depth range test - applied in clear mode, if not through mode.
669
	if (pixelID.applyDepthRange && !pixelID.earlyZChecks)
670
		if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
671
			return;
672

673
	if (pixelID.AlphaTestFunc() != GE_COMP_ALWAYS && !clearMode)
674
		if (!AlphaTestPassed(pixelID, prim_color.a()))
675
			return;
676

677
	// Fog is applied prior to color test.
678
	if (pixelID.applyFog && !clearMode) {
679
		Vec3<int> fogColor = Vec3<int>::FromRGB(pixelID.cached.fogColor);
680
		// This is very similar to the BLEND texfunc, and simply always rounds up.
681
		static constexpr Vec3<int> roundup = Vec3<int>::AssignToAll(255);
682
		fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256;
683
		prim_color.r() = fogColor.r();
684
		prim_color.g() = fogColor.g();
685
		prim_color.b() = fogColor.b();
686
	}
687

688
	if (pixelID.colorTest && !clearMode)
689
		if (!ColorTestPassed(pixelID, prim_color.rgb()))
690
			return;
691

692
	// In clear mode, it uses the alpha color as stencil.
693
	uint32_t targetWriteMask = pixelID.applyColorWriteMask ? pixelID.cached.colorWriteMask : 0;
694
	u8 stencil = clearMode ? prim_color.a() : GetPixelStencil(fbFormat, pixelID.cached.framebufStride, x, y);
695
	if (clearMode) {
696
		if (pixelID.DepthClear())
697
			SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);
698
	} else if (pixelID.stencilTest) {
699
		const uint8_t stencilReplace = pixelID.hasStencilTestMask ? pixelID.cached.stencilRef : pixelID.stencilTestRef;
700
		if (!StencilTestPassed(pixelID, stencil)) {
701
			stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.SFail(), stencil);
702
			SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
703
			return;
704
		}
705

706
		// Also apply depth at the same time.  If disabled, same as passing.
707
		if (!pixelID.earlyZChecks && pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
708
			stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZFail(), stencil);
709
			SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
710
			return;
711
		}
712

713
		stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZPass(), stencil);
714
	} else if (!pixelID.earlyZChecks) {
715
		if (pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
716
			return;
717
		}
718
	}
719

720
	if (pixelID.depthWrite && !clearMode)
721
		SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);
722

723
	const u32 old_color = GetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y);
724
	u32 new_color;
725

726
	// Dithering happens before the logic op and regardless of framebuffer format or clear mode.
727
	// We do it while alpha blending because it happens before clamping.
728
	if (pixelID.alphaBlend && !clearMode) {
729
		const Vec4<int> dst = Vec4<int>::FromRGBA(old_color);
730
		Vec3<int> blended = AlphaBlendingResult(pixelID, prim_color, dst);
731
		if (pixelID.dithering) {
732
			blended += Vec3<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
733
		}
734

735
		// ToRGB() always automatically clamps.
736
		new_color = blended.ToRGB();
737
		new_color |= stencil << 24;
738
	} else {
739
		if (pixelID.dithering) {
740
			// We'll discard alpha anyway.
741
			prim_color += Vec4<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
742
		}
743

744
#if defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)
745
		new_color = Vec3<int>(prim_color.ivec).ToRGB();
746
		new_color |= stencil << 24;
747
#else
748
		new_color = Vec4<int>(prim_color.r(), prim_color.g(), prim_color.b(), stencil).ToRGBA();
749
#endif
750
	}
751

752
	// Logic ops are applied after blending (if blending is enabled.)
753
	if (pixelID.applyLogicOp && !clearMode) {
754
		// Logic ops don't affect stencil, which happens inside ApplyLogicOp.
755
		new_color = ApplyLogicOp(pixelID.cached.logicOp, old_color, new_color);
756
	}
757

758
	if (clearMode) {
759
		if (!pixelID.ColorClear())
760
			new_color = (new_color & 0xFF000000) | (old_color & 0x00FFFFFF);
761
		if (!pixelID.StencilClear())
762
			new_color = (new_color & 0x00FFFFFF) | (old_color & 0xFF000000);
763
	}
764

765
	SetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y, new_color, old_color, targetWriteMask);
766
}
767

768
// Returns the function used to draw a single pixel for the given state:
// the jitted version when the cache provides one, else the generic C++ one.
SingleFunc GetSingleFunc(const PixelFuncID &id, BinManager *binner) {
	if (SingleFunc jitted = jitCache->GetSingle(id, binner)) {
		return jitted;
	}
	return jitCache->GenericSingle(id);
}
776

777
// Picks the DrawSinglePixel instantiation matching the id's clear mode and
// framebuffer format.  Asserts and returns nullptr for any other format.
SingleFunc PixelJitCache::GenericSingle(const PixelFuncID &id) {
	const bool clear = id.clearMode;

	switch (id.fbFormat) {
	case GE_FORMAT_565:
		if (clear)
			return &DrawSinglePixel<true, GE_FORMAT_565>;
		return &DrawSinglePixel<false, GE_FORMAT_565>;

	case GE_FORMAT_5551:
		if (clear)
			return &DrawSinglePixel<true, GE_FORMAT_5551>;
		return &DrawSinglePixel<false, GE_FORMAT_5551>;

	case GE_FORMAT_4444:
		if (clear)
			return &DrawSinglePixel<true, GE_FORMAT_4444>;
		return &DrawSinglePixel<false, GE_FORMAT_4444>;

	case GE_FORMAT_8888:
		if (clear)
			return &DrawSinglePixel<true, GE_FORMAT_8888>;
		return &DrawSinglePixel<false, GE_FORMAT_8888>;
	}

	_assert_(false);
	return nullptr;
}
803

804
thread_local PixelJitCache::LastCache PixelJitCache::lastSingle_;
805
int PixelJitCache::clearGen_ = 0;
806

807
// 256k should be plenty of space for plenty of variations.
808
PixelJitCache::PixelJitCache() : CodeBlock(1024 * 64 * 4), cache_(64) {
809
	lastSingle_.gen = -1;
810
	clearGen_++;
811
}
812

813
void PixelJitCache::Clear() {
814
	clearGen_++;
815
	CodeBlock::Clear();
816
	cache_.Clear();
817
	addresses_.clear();
818

819
	constBlendHalf_11_4s_ = nullptr;
820
	constBlendInvert_11_4s_ = nullptr;
821
}
822

823
std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) {
824
	constexpr bool USE_IDS = false;
825
	ptrdiff_t dist = 0x7FFFFFFF;
826
	if (USE_IDS) {
827
		PixelFuncID found{};
828
		for (const auto &it : addresses_) {
829
			ptrdiff_t it_dist = ptr - it.second;
830
			if (it_dist >= 0 && it_dist < dist) {
831
				found = it.first;
832
				dist = it_dist;
833
			}
834
		}
835

836
		return DescribePixelFuncID(found);
837
	}
838

839
	return CodeBlock::DescribeCodePtr(ptr);
840
}
841

842
void PixelJitCache::Flush() {
843
	std::unique_lock<std::mutex> guard(jitCacheLock);
844
	for (const auto &queued : compileQueue_) {
845
		// Might've been compiled after enqueue, but before now.
846
		size_t queuedKey = std::hash<PixelFuncID>()(queued);
847
		if (!cache_.ContainsKey(queuedKey))
848
			Compile(queued);
849
	}
850
	compileQueue_.clear();
851
}
852

853
// Looks up (and if possible compiles) the jitted single-pixel function for id.
//
// id:     pixel state to get a function for.
// binner: used to flush pending work before compiling; when null, nothing is
//         compiled now and the id is queued for a later Flush().
//
// Returns nullptr when the JIT is disabled in the config, when compilation
// had to be deferred, or when no function ended up in the cache.
SingleFunc PixelJitCache::GetSingle(const PixelFuncID &id, BinManager *binner) {
	if (!g_Config.bSoftwareRenderingJit) {
		return nullptr;
	}

	const std::hash<PixelFuncID> hasher;
	const size_t key = hasher(id);

	// Fast path: same id as this thread's last lookup, no lock required.
	if (lastSingle_.Match(key, clearGen_)) {
		return lastSingle_.func;
	}

	std::unique_lock<std::mutex> guard(jitCacheLock);
	SingleFunc singleFunc;
	if (cache_.Get(key, &singleFunc)) {
		lastSingle_.Set(key, singleFunc, clearGen_);
		return singleFunc;
	}

	if (binner == nullptr) {
		// Can't compile, let's try to do it later when there's an opportunity.
		compileQueue_.insert(id);
		return nullptr;
	}

	// The lock is not held across the binner flush.
	guard.unlock();
	binner->Flush("compile");
	guard.lock();

	// Take the opportunity to compile whatever had been deferred earlier.
	for (const auto &pending : compileQueue_) {
		// Might've been compiled after enqueue, but before now.
		if (cache_.ContainsKey(hasher(pending)))
			continue;
		Compile(pending);
	}
	compileQueue_.clear();

	// Might've been in the queue.
	if (!cache_.ContainsKey(key)) {
		Compile(id);
	}

	if (!cache_.Get(key, &singleFunc)) {
		return nullptr;
	}
	lastSingle_.Set(key, singleFunc, clearGen_);
	return singleFunc;
}
897

898
void PixelJitCache::Compile(const PixelFuncID &id) {
899
	// x64 is typically 200-500 bytes, but let's be safe.
900
	if (GetSpaceLeft() < 65536) {
901
		Clear();
902
	}
903

904
#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
905
	addresses_[id] = GetCodePointer();
906
	SingleFunc func = CompileSingle(id);
907
	cache_.Insert(std::hash<PixelFuncID>()(id), func);
908
#endif
909
}
910

911
// Fills in flags on state describing what the blend settings in id require.
//
// Flags are only ever set here (never reset), except dstFactorIsInverse and
// readsDstPixel which are assigned for specific destination factors.  Nothing
// beyond usesFactors is touched unless factors are in use.
void ComputePixelBlendState(PixelBlendState &state, const PixelFuncID &id) {
	// Only the multiply-based equations use the src/dst factors.
	const auto equation = id.AlphaBlendEq();
	if (equation == GE_BLENDMODE_MUL_AND_ADD ||
		equation == GE_BLENDMODE_MUL_AND_SUBTRACT ||
		equation == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE) {
		state.usesFactors = true;
	}

	if (!state.usesFactors) {
		return;
	}

	const PixelBlendFactor srcFactor = id.AlphaBlendSrc();
	switch (srcFactor) {
	case PixelBlendFactor::OTHERCOLOR:
	case PixelBlendFactor::INVOTHERCOLOR:
		state.dstColorAsFactor = true;
		break;

	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::INVSRCALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::INVDSTALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		state.usesDstAlpha = true;
		break;

	default:
		break;
	}

	switch (id.AlphaBlendDst()) {
	// Inverted factors: note whether the dst factor is the exact inverse of the src factor.
	case PixelBlendFactor::INVSRCALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::SRCALPHA;
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::DOUBLESRCALPHA;
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::INVDSTALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::DSTALPHA;
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::DOUBLEDSTALPHA;
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::OTHERCOLOR:
	case PixelBlendFactor::INVOTHERCOLOR:
	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::ZERO:
		// With a zero dst factor, dst is only read if the src side needs it.
		state.readsDstPixel = state.dstColorAsFactor || state.usesDstAlpha;
		break;

	default:
		break;
	}
}
998

999
};
1000

1001
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company