CoCalc -- Sampler.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/Sampler.cpp
Views: ¹⁴⁰¹
1
// Copyright (c) 2017- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "ppsspp_config.h"
19
#include <unordered_map>
20
#include <mutex>
21
#include "Common/Common.h"
22
#include "Common/Data/Convert/ColorConv.h"
23
#include "Common/LogReporting.h"
24
#include "Common/StringUtils.h"
25
#include "Core/Config.h"
26
#include "GPU/Common/TextureDecoder.h"
27
#include "GPU/GPUState.h"
28
#include "GPU/Software/BinManager.h"
29
#include "GPU/Software/Rasterizer.h"
30
#include "GPU/Software/RasterizerRegCache.h"
31
#include "GPU/Software/Sampler.h"
32

33
#if defined(_M_SSE)
34
#include <emmintrin.h>
35
#endif
36

37
#if PPSSPP_ARCH(ARM_NEON)
38
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
39
#include <arm64_neon.h>
40
#else
41
#include <arm_neon.h>
42
#endif
43
#endif
44

45
using namespace Math3D;
46
using namespace Rasterizer;
47

48
namespace Sampler {
49

50
// Prototypes for the non-JIT samplers defined later in this file, so the
// Get*Func() helpers below can hand out their addresses as fallbacks.
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID);
53

54
std::mutex jitCacheLock;
55
SamplerJitCache *jitCache = nullptr;
56

57
void Init() {
58
	jitCache = new SamplerJitCache();
59
}
60

61
void FlushJit() {
62
	jitCache->Flush();
63
}
64

65
void Shutdown() {
66
	delete jitCache;
67
	jitCache = nullptr;
68
}
69

70
bool DescribeCodePtr(const u8 *ptr, std::string &name) {
71
	if (!jitCache->IsInSpace(ptr)) {
72
		return false;
73
	}
74

75
	name = jitCache->DescribeCodePtr(ptr);
76
	return true;
77
}
78

79
// Returns the nearest-filter sampler for the given ID: the jitted routine when
// the JIT cache can provide one, otherwise the C++ SampleNearest fallback.
//
// The ID is taken by value and normalized to the exact flag combination that
// SamplerJitCache::Compile() registers for the nearest variant
// (linear = false, fetch = false). Clearing fetch matters: an ID arriving with
// fetch set would otherwise hash to the fetch variant's key and a fetch
// routine could be handed back as a NearestFunc.
NearestFunc GetNearestFunc(SamplerID id, BinManager *binner) {
	id.linear = false;
	id.fetch = false;
	if (NearestFunc jitted = jitCache->GetNearest(id, binner))
		return jitted;

	return &SampleNearest;
}
88

89
// Returns the bilinear-filter sampler for the given ID: the jitted routine
// when the JIT cache can provide one, otherwise the C++ SampleLinear fallback.
//
// The ID is taken by value and normalized to the flag combination that
// SamplerJitCache::Compile() registers for the linear variant
// (linear = true, fetch = false), so the lookup key always matches what was
// compiled.
LinearFunc GetLinearFunc(SamplerID id, BinManager *binner) {
	id.linear = true;
	id.fetch = false;
	if (LinearFunc jitted = jitCache->GetLinear(id, binner))
		return jitted;

	return &SampleLinear;
}
98

99
// Returns the raw texel fetch routine for the given ID: the jitted routine
// when the JIT cache can provide one, otherwise the C++ SampleFetch fallback.
//
// The ID is taken by value and normalized to the flag combination that
// SamplerJitCache::Compile() registers for the fetch variant
// (linear = false, fetch = true), so the lookup key always matches what was
// compiled.
FetchFunc GetFetchFunc(SamplerID id, BinManager *binner) {
	id.linear = false;
	id.fetch = true;
	if (FetchFunc jitted = jitCache->GetFetch(id, binner))
		return jitted;

	return &SampleFetch;
}
108

109
thread_local SamplerJitCache::LastCache SamplerJitCache::lastFetch_;
110
thread_local SamplerJitCache::LastCache SamplerJitCache::lastNearest_;
111
thread_local SamplerJitCache::LastCache SamplerJitCache::lastLinear_;
112
int SamplerJitCache::clearGen_ = 0;
113

114
// 256k should be enough.
115
SamplerJitCache::SamplerJitCache() : Rasterizer::CodeBlock(1024 * 64 * 4), cache_(64) {
116
	lastFetch_.gen = -1;
117
	lastNearest_.gen = -1;
118
	lastLinear_.gen = -1;
119
	clearGen_++;
120
}
121

122
void SamplerJitCache::Clear() {
123
	clearGen_++;
124
	CodeBlock::Clear();
125
	cache_.Clear();
126
	addresses_.clear();
127

128
	const10All16_ = nullptr;
129
	const10Low_ = nullptr;
130
	const10All8_ = nullptr;
131

132
	constWidthHeight256f_ = nullptr;
133
	constWidthMinus1i_ = nullptr;
134
	constHeightMinus1i_ = nullptr;
135

136
	constOnes32_ = nullptr;
137
	constOnes16_ = nullptr;
138
	constUNext_ = nullptr;
139
	constVNext_ = nullptr;
140

141
	const5551Swizzle_ = nullptr;
142
	const5650Swizzle_ = nullptr;
143
}
144

145
std::string SamplerJitCache::DescribeCodePtr(const u8 *ptr) {
146
	constexpr bool USE_IDS = false;
147
	ptrdiff_t dist = 0x7FFFFFFF;
148
	if (USE_IDS) {
149
		SamplerID found{};
150
		for (const auto &it : addresses_) {
151
			ptrdiff_t it_dist = ptr - it.second;
152
			if (it_dist >= 0 && it_dist < dist) {
153
				found = it.first;
154
				dist = it_dist;
155
			}
156
		}
157

158
		return DescribeSamplerID(found);
159
	}
160

161
	return CodeBlock::DescribeCodePtr(ptr);
162
}
163

164
void SamplerJitCache::Flush() {
165
	std::unique_lock<std::mutex> guard(jitCacheLock);
166
	for (const auto &queued : compileQueue_) {
167
		// Might've been compiled after enqueue, but before now.
168
		size_t queuedKey = std::hash<SamplerID>()(queued);
169
		if (!cache_.ContainsKey(queuedKey))
170
			Compile(queued);
171
	}
172
	compileQueue_.clear();
173
}
174

175
// Returns the compiled function stored under key, compiling it first if
// necessary.
//
// id:     sampler ID to compile when the key is missing.
// key:    hash of id, as computed by the callers.
// binner: when null, nothing can be compiled here; the ID is queued for a
//         later Flush() and nullptr is returned.
//
// Returns nullptr when the function is still missing after compiling (for
// instance when Compile() emits nothing on this platform).
NearestFunc SamplerJitCache::GetByID(const SamplerID &id, size_t key, BinManager *binner) {
	std::unique_lock<std::mutex> lock(jitCacheLock);

	NearestFunc found;
	if (cache_.Get(key, &found))
		return found;

	if (binner == nullptr) {
		// Can't compile, let's try to do it later when there's an opportunity.
		compileQueue_.insert(id);
		return nullptr;
	}

	// The lock is dropped while the binner flushes.
	// NOTE(review): presumably so work drained by the flush can take the lock itself - confirm.
	lock.unlock();
	binner->Flush("compile");
	lock.lock();

	// Take the opportunity to compile everything that was queued earlier.
	for (const auto &pending : compileQueue_) {
		const size_t pendingKey = std::hash<SamplerID>()(pending);
		// May already have been compiled between being queued and now.
		if (cache_.ContainsKey(pendingKey))
			continue;
		Compile(pending);
	}
	compileQueue_.clear();

	if (!cache_.ContainsKey(key))
		Compile(id);

	// Okay, should be there now.
	return cache_.Get(key, &found) ? found : nullptr;
}
211

212
// Returns the jitted nearest sampler for id, or nullptr when the software
// renderer JIT is disabled in the config or nothing could be compiled.
// The calling thread's lastNearest_ slot is checked first and updated on a miss.
NearestFunc SamplerJitCache::GetNearest(const SamplerID &id, BinManager *binner) {
	if (g_Config.bSoftwareRenderingJit) {
		const size_t key = std::hash<SamplerID>()(id);
		if (!lastNearest_.Match(key, clearGen_)) {
			auto compiled = GetByID(id, key, binner);
			lastNearest_.Set(key, compiled, clearGen_);
			return (NearestFunc)compiled;
		}
		return (NearestFunc)lastNearest_.func;
	}
	return nullptr;
}
224

225
// Returns the jitted linear sampler for id, or nullptr when the software
// renderer JIT is disabled in the config or nothing could be compiled.
// The calling thread's lastLinear_ slot is checked first and updated on a miss.
LinearFunc SamplerJitCache::GetLinear(const SamplerID &id, BinManager *binner) {
	if (g_Config.bSoftwareRenderingJit) {
		const size_t key = std::hash<SamplerID>()(id);
		if (!lastLinear_.Match(key, clearGen_)) {
			auto compiled = GetByID(id, key, binner);
			lastLinear_.Set(key, compiled, clearGen_);
			return (LinearFunc)compiled;
		}
		return (LinearFunc)lastLinear_.func;
	}
	return nullptr;
}
237

238
// Returns the jitted texel fetch routine for id, or nullptr when the software
// renderer JIT is disabled in the config or nothing could be compiled.
// The calling thread's lastFetch_ slot is checked first and updated on a miss.
FetchFunc SamplerJitCache::GetFetch(const SamplerID &id, BinManager *binner) {
	if (g_Config.bSoftwareRenderingJit) {
		const size_t key = std::hash<SamplerID>()(id);
		if (!lastFetch_.Match(key, clearGen_)) {
			auto compiled = GetByID(id, key, binner);
			lastFetch_.Set(key, compiled, clearGen_);
			return (FetchFunc)compiled;
		}
		return (FetchFunc)lastFetch_.func;
	}
	return nullptr;
}
250

251
void SamplerJitCache::Compile(const SamplerID &id) {
252
	// This should be sufficient.
253
	if (GetSpaceLeft() < 16384) {
254
		Clear();
255
	}
256

257
	// We compile them together so the cache can't possibly be cleared in between.
258
	// We might vary between nearest and linear, so we can't clear between.
259
#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
260
	SamplerID fetchID = id;
261
	fetchID.linear = false;
262
	fetchID.fetch = true;
263
	addresses_[fetchID] = GetCodePointer();
264
	cache_.Insert(std::hash<SamplerID>()(fetchID), (NearestFunc)CompileFetch(fetchID));
265

266
	SamplerID nearestID = id;
267
	nearestID.linear = false;
268
	nearestID.fetch = false;
269
	addresses_[nearestID] = GetCodePointer();
270
	cache_.Insert(std::hash<SamplerID>()(nearestID), (NearestFunc)CompileNearest(nearestID));
271

272
	SamplerID linearID = id;
273
	linearID.linear = true;
274
	linearID.fetch = false;
275
	addresses_[linearID] = GetCodePointer();
276
	cache_.Insert(std::hash<SamplerID>()(linearID), (NearestFunc)CompileLinear(linearID));
277
#endif
278
}
279

280
// Computes the byte offset of texel (u, v) within a texture level.
//
// texel_size_bits:  size of one texel in bits (4, 8, 16 or 32 in this file).
// row_pitch_pixels: row pitch of the level, in texels.
// swizzled:         false for plain row-major storage; true for the tiled
//                   layout, where texels are grouped in 32-bit tiles and tiles
//                   in blocks of 4 tiles across by 8 rows down.
//
// For 4-bit texels the result is the byte holding the texel; the caller picks
// the nibble.
template <uint32_t texel_size_bits>
static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint32_t v, bool swizzled) {
	static_assert(texel_size_bits != 0 && 32 % texel_size_bits == 0, "texel size must evenly divide a 32-bit tile");

	if (!swizzled)
		return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);

	constexpr uint32_t tile_size_bits = 32;
	constexpr uint32_t tiles_in_block_horizontal = 4;
	constexpr uint32_t tiles_in_block_vertical = 8;
	constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;

	const uint32_t tile_u = u / texels_per_tile;
	// Number of tiles in one full row of blocks.
	// TODO: not sure if the *texel_size_bits/8 factor is correct
	const uint32_t tiles_per_block_row = (row_pitch_pixels * texel_size_bits / tile_size_bits) * tiles_in_block_vertical;
	const uint32_t tile_idx = (v % tiles_in_block_vertical) * tiles_in_block_horizontal +
					(v / tiles_in_block_vertical) * tiles_per_block_row +
					(tile_u % tiles_in_block_horizontal) +
					(tile_u / tiles_in_block_horizontal) * (tiles_in_block_horizontal * tiles_in_block_vertical);

	// Byte offset of the tile plus the byte offset of the texel inside it.
	return tile_idx * (tile_size_bits / 8) + ((u % texels_per_tile) * texel_size_bits) / 8;
}
299

300
// Reads one palette entry and converts it to RGBA8888.
//
// index: palette index (already transformed by the caller).
// level: mip level; unless the CLUT is shared, each level is offset by 16 entries.
//
// Returns 0 and logs an error for an unrecognized palette format.
static inline u32 LookupColor(unsigned int index, unsigned int level, const SamplerID &samplerID) {
	unsigned int entry = index;
	if (!samplerID.useSharedClut)
		entry += level * 16;

	switch (samplerID.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
		return RGB565ToRGBA8888(samplerID.cached.clut16[entry]);

	case GE_CMODE_16BIT_ABGR5551:
		return RGBA5551ToRGBA8888(samplerID.cached.clut16[entry]);

	case GE_CMODE_16BIT_ABGR4444:
		return RGBA4444ToRGBA8888(samplerID.cached.clut16[entry]);

	case GE_CMODE_32BIT_ABGR8888:
		return samplerID.cached.clut32[entry];

	default:
		break;
	}

	ERROR_LOG_REPORT(Log::G3D, "Software: Unsupported palette format: %x", samplerID.ClutFmt());
	return 0;
}
321

322
uint32_t TransformClutIndex(uint32_t index, const SamplerID &samplerID) {
323
	if (samplerID.hasClutShift || samplerID.hasClutMask || samplerID.hasClutOffset) {
324
		const uint8_t shift = (samplerID.cached.clutFormat >> 2) & 0x1F;
325
		const uint8_t mask = (samplerID.cached.clutFormat >> 8) & 0xFF;
326
		const uint16_t offset = ((samplerID.cached.clutFormat >> 16) & 0x1F) << 4;
327
		// We need to wrap any entries beyond the first 1024 bytes.
328
		const uint16_t offsetMask = samplerID.ClutFmt() == GE_CMODE_32BIT_ABGR8888 ? 0xFF : 0x1FF;
329

330
		return ((index >> shift) & mask) | (offset & offsetMask);
331
	}
332
	return index & 0xFF;
333
}
334

335
struct Nearest4 {
336
	alignas(16) u32 v[4];
337

338
	operator u32() const {
339
		return v[0];
340
	}
341
};
342

343
template <int N>
344
inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N], const u8 *srcptr, uint16_t texbufw, int level, const SamplerID &samplerID) {
345
	Nearest4 res;
346
	if (!srcptr) {
347
		memset(res.v, 0, sizeof(res.v));
348
		return res;
349
	}
350

351
	// TODO: Should probably check if textures are aligned properly...
352

353
	switch (samplerID.TexFmt()) {
354
	case GE_TFMT_4444:
355
		for (int i = 0; i < N; ++i) {
356
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
357
			res.v[i] = RGBA4444ToRGBA8888(*(const u16 *)src);
358
		}
359
		return res;
360
	
361
	case GE_TFMT_5551:
362
		for (int i = 0; i < N; ++i) {
363
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
364
			res.v[i] = RGBA5551ToRGBA8888(*(const u16 *)src);
365
		}
366
		return res;
367

368
	case GE_TFMT_5650:
369
		for (int i = 0; i < N; ++i) {
370
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
371
			res.v[i] = RGB565ToRGBA8888(*(const u16 *)src);
372
		}
373
		return res;
374

375
	case GE_TFMT_8888:
376
		for (int i = 0; i < N; ++i) {
377
			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i], samplerID.swizzle);
378
			res.v[i] = *(const u32 *)src;
379
		}
380
		return res;
381

382
	case GE_TFMT_CLUT32:
383
		for (int i = 0; i < N; ++i) {
384
			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i], samplerID.swizzle);
385
			u32 val = src[0] + (src[1] << 8) + (src[2] << 16) + (src[3] << 24);
386
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);
387
		}
388
		return res;
389

390
	case GE_TFMT_CLUT16:
391
		for (int i = 0; i < N; ++i) {
392
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
393
			u16 val = src[0] + (src[1] << 8);
394
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);
395
		}
396
		return res;
397

398
	case GE_TFMT_CLUT8:
399
		for (int i = 0; i < N; ++i) {
400
			const u8 *src = srcptr + GetPixelDataOffset<8>(texbufw, u[i], v[i], samplerID.swizzle);
401
			u8 val = *src;
402
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);
403
		}
404
		return res;
405

406
	case GE_TFMT_CLUT4:
407
		for (int i = 0; i < N; ++i) {
408
			const u8 *src = srcptr + GetPixelDataOffset<4>(texbufw, u[i], v[i], samplerID.swizzle);
409
			u8 val = (u[i] & 1) ? (src[0] >> 4) : (src[0] & 0xF);
410
			// Only CLUT4 uses separate mipmap palettes.
411
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), level, samplerID);
412
		}
413
		return res;
414

415
	case GE_TFMT_DXT1:
416
		for (int i = 0; i < N; ++i) {
417
			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
418
			res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
419
		}
420
		return res;
421

422
	case GE_TFMT_DXT3:
423
		for (int i = 0; i < N; ++i) {
424
			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
425
			res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
426
		}
427
		return res;
428

429
	case GE_TFMT_DXT5:
430
		for (int i = 0; i < N; ++i) {
431
			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
432
			res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
433
		}
434
		return res;
435

436
	default:
437
		ERROR_LOG_REPORT(Log::G3D, "Software: Unsupported texture format: %x", samplerID.TexFmt());
438
		memset(res.v, 0, sizeof(res.v));
439
		return res;
440
	}
441
}
442

443
// Clamps a texel coordinate to the addressable range of a texture dimension.
//
// v:      texel coordinate (may be negative or past the edge).
// height: size of the dimension in texels (used for width as well).
//
// The upper bound is height - 1 but never more than 511, the same limit
// WrapUV() applies and the same bound the SSE/NEON paths of
// ApplyTexelClampQuad() use. Previously a dimension above 512 combined with
// v >= height - 1 returned height - 1, which is beyond that limit.
static inline int ClampUV(int v, int height) {
	const int bound = height > 512 ? 511 : height - 1;
	if (v < 0)
		return 0;
	return v < bound ? v : bound;
}
452

453
// Wraps a texel coordinate into a texture dimension by masking.
//
// v:      texel coordinate (may be negative or past the edge).
// height: size of the dimension in texels (used for width as well); the
//         masking only wraps correctly when this is a power of two.
//
// The result is additionally limited to 9 bits (0..511).
static inline int WrapUV(int v, int height) {
	return v & (height - 1) & 511;
}
456

457
template <int N>
458
static inline void ApplyTexelClamp(int out_u[N], int out_v[N], const int u[N], const int v[N], int width, int height, const SamplerID &samplerID) {
459
	if (samplerID.clampS) {
460
		for (int i = 0; i < N; ++i) {
461
			out_u[i] = ClampUV(u[i], width);
462
		}
463
	} else {
464
		for (int i = 0; i < N; ++i) {
465
			out_u[i] = WrapUV(u[i], width);
466
		}
467
	}
468
	if (samplerID.clampT) {
469
		for (int i = 0; i < N; ++i) {
470
			out_v[i] = ClampUV(v[i], height);
471
		}
472
	} else {
473
		for (int i = 0; i < N; ++i) {
474
			out_v[i] = WrapUV(v[i], height);
475
		}
476
	}
477
}
478

479
static inline void GetTexelCoordinates(int level, float s, float t, int &out_u, int &out_v, const SamplerID &samplerID) {
480
	int width = samplerID.cached.sizes[level].w;
481
	int height = samplerID.cached.sizes[level].h;
482

483
	int base_u = (int)(s * width * 256.0f);
484
	int base_v = (int)(t * height * 256.0f);
485

486
	base_u >>= 8;
487
	base_v >>= 8;
488

489
	ApplyTexelClamp<1>(&out_u, &out_v, &base_u, &base_v, width, height, samplerID);
490
}
491

492
// Combines the primitive color with a sampled texel according to the texture
// function selected in the sampler ID.
//
// prim_color_in: primitive color.
// texcolor_in:   sampled texture color.
// samplerID:     supplies TexFunc(), useTextureAlpha, useColorDoubling and,
//                for the blend function, cached.texBlendColor.
//
// When useTextureAlpha is off, the output alpha is always the primitive's
// alpha. With useColorDoubling, the RGB result is doubled (alpha is not).
// Results are not clamped here.
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in, const SamplerID &samplerID) {
	const Vec4<int> prim_color = prim_color_in;
	const Vec4<int> texcolor = texcolor_in;

	Vec3<int> out_rgb;
	int out_a;

	bool rgba = samplerID.useTextureAlpha;

	switch (samplerID.TexFunc()) {
	case GE_TEXFUNC_MODULATE:
	{
#if defined(_M_SSE)
		// Modulate weights slightly on the tex color, by adding one to prim and dividing by 256.
		// Both operands are pre-shifted left by 4 so the high half of the 16x16 product is (p + 1) * t / 256.
		const __m128i p = _mm_slli_epi16(_mm_packs_epi32(prim_color.ivec, prim_color.ivec), 4);
		const __m128i pboost = _mm_add_epi16(p, _mm_set1_epi16(1 << 4));
		__m128i t = _mm_slli_epi16(_mm_packs_epi32(texcolor.ivec, texcolor.ivec), 4);
		if (samplerID.useColorDoubling) {
			// Double only the RGB lanes; the alpha lanes are kept as they are.
			const __m128i amask = _mm_set_epi16(-1, 0, 0, 0, -1, 0, 0, 0);
			const __m128i a = _mm_and_si128(t, amask);
			const __m128i rgb = _mm_andnot_si128(amask, t);
			t = _mm_or_si128(_mm_slli_epi16(rgb, 1), a);
		}
		const __m128i b = _mm_mulhi_epi16(pboost, t);
		out_rgb.ivec = _mm_unpacklo_epi16(b, _mm_setzero_si128());

		if (rgba) {
			// All four lanes (including alpha) were modulated above.
			return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
		} else {
			out_a = prim_color.a();
		}
#elif PPSSPP_ARCH(ARM64_NEON)
		int32x4_t pboost = vaddq_s32(prim_color.ivec, vdupq_n_s32(1));
		int32x4_t t = texcolor.ivec;
		if (samplerID.useColorDoubling) {
			// Per-lane shift counts: double RGB, leave alpha alone.
			static const int32_t rgbDouble[4] = { 1, 1, 1, 0 };
			t = vshlq_s32(t, vld1q_s32(rgbDouble));
		}
		out_rgb.ivec = vshrq_n_s32(vmulq_s32(pboost, t), 8);

		if (rgba) {
			// All four lanes (including alpha) were modulated above.
			return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
		}
		out_a = prim_color.a();
#else
		if (samplerID.useColorDoubling) {
			out_rgb = ((prim_color.rgb() + Vec3<int>::AssignToAll(1)) * texcolor.rgb() * 2) / 256;
		} else {
			out_rgb = (prim_color.rgb() + Vec3<int>::AssignToAll(1)) * texcolor.rgb() / 256;
		}
		out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();
#endif
		break;
	}

	case GE_TEXFUNC_DECAL:
		if (rgba) {
			// Blend between primitive and texture color using the texture's alpha.
			int t = texcolor.a();
			int invt = 255 - t;
			// Both colors are boosted here, making the alpha have more weight.
			Vec3<int> one = Vec3<int>::AssignToAll(1);
			out_rgb = ((prim_color.rgb() + one) * invt + (texcolor.rgb() + one) * t);
			// Keep the bits of accuracy when doubling.
			if (samplerID.useColorDoubling)
				out_rgb /= 128;
			else
				out_rgb /= 256;
		} else {
			if (samplerID.useColorDoubling)
				out_rgb = texcolor.rgb() * 2;
			else
				out_rgb = texcolor.rgb();
		}
		// Decal never takes alpha from the texture.
		out_a = prim_color.a();
		break;

	case GE_TEXFUNC_BLEND:
	{
		const Vec3<int> const255(255, 255, 255);
		const Vec3<int> texenv = Vec3<int>::FromRGB(samplerID.cached.texBlendColor);

		// Unlike the others (and even alpha), this one simply always rounds up.
		const Vec3<int> roundup = Vec3<int>::AssignToAll(255);
		out_rgb = ((const255 - texcolor.rgb()) * prim_color.rgb() + texcolor.rgb() * texenv + roundup);
		// Must divide by less to keep the precision for doubling to be accurate.
		if (samplerID.useColorDoubling)
			out_rgb /= 128;
		else
			out_rgb /= 256;

		out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();
		break;
	}

	case GE_TEXFUNC_REPLACE:
		out_rgb = texcolor.rgb();
		// Doubling even happens for replace.
		if (samplerID.useColorDoubling)
			out_rgb *= 2;
		out_a = (rgba) ? texcolor.a() : prim_color.a();
		break;

	case GE_TEXFUNC_ADD:
	case GE_TEXFUNC_UNKNOWN1:
	case GE_TEXFUNC_UNKNOWN2:
	case GE_TEXFUNC_UNKNOWN3:
		// Don't need to clamp afterward, we always clamp before tests.
		out_rgb = prim_color.rgb() + texcolor.rgb();
		if (samplerID.useColorDoubling)
			out_rgb *= 2;

		// Alpha is still blended the common way.
		out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();
		break;
	}

	return ToVec4IntResult(Vec4<int>(out_rgb, out_a));
}
610

611
// Non-JIT nearest sampler: reads one texel at (s, t) from the given level,
// optionally blends it with the next level, and applies the texture function.
//
// tptr, bufw: texture pointers and row pitches; entry 0 is used for `level`
//             and entry 1 for `level + 1`.
// levelFrac:  blend weight toward the next level in 1/16 steps; 0 means only
//             `level` is sampled.
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID) {
	int u, v;

	// Nearest filtering only.  Round texcoords.
	GetTexelCoordinates(level, s, t, u, v, samplerID);
	const Nearest4 base = SampleNearest<1>(&u, &v, tptr[0], bufw[0], level, samplerID);
	Vec4<int> blended = Vec4<int>::FromRGBA(base.v[0]);

	if (levelFrac != 0) {
		// Sample the next mip level at its own resolution and mix the two.
		GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
		const Nearest4 next = SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID);
		const Vec4<int> nextColor = Vec4<int>::FromRGBA(next.v[0]);

		blended = (nextColor * levelFrac + blended * (16 - levelFrac)) >> 4;
	}

	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(blended), samplerID);
}
627

628
// Non-JIT texel fetch: reads the texel at integer coordinates (u, v) and
// returns it as a four-component color. No wrapping, clamping or texture
// function is applied here.
static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID) {
	const Nearest4 texel = SampleNearest<1>(&u, &v, tptr, bufw, level, samplerID);
	const Vec4<int> color = Vec4<int>::FromRGBA(texel.v[0]);
	return ToVec4IntResult(color);
}
632

633
// Clamps or wraps four texel coordinates at once.
//
// clamp: true to clamp to [0, bound], false to wrap by masking.
// vec:   the four coordinates.
// width: size of the dimension in texels (also used for heights).
//
// In the SIMD paths the clamp bound is width - 1, limited to 511; wrapping
// masks with (width - 1) & 511. The generic path defers to ClampUV()/WrapUV().
static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuad(bool clamp, Vec4IntArg vec, int width) {
	Vec4<int> result = vec;
#ifdef _M_SSE
	if (clamp) {
		// First, clamp to zero.
		__m128i negmask = _mm_cmpgt_epi32(_mm_setzero_si128(), result.ivec);
		result.ivec = _mm_andnot_si128(negmask, result.ivec);

		// Now the high bound.
		__m128i bound = _mm_set1_epi32(width > 512 ? 511 : width - 1);
		__m128i goodmask = _mm_cmpgt_epi32(bound, result.ivec);
		// Clear the ones that were too high, then or in the high bound to those.
		result.ivec = _mm_and_si128(goodmask, result.ivec);
		result.ivec = _mm_or_si128(result.ivec, _mm_andnot_si128(goodmask, bound));
	} else {
		result.ivec = _mm_and_si128(result.ivec, _mm_set1_epi32((width - 1) & 511));
	}
#elif PPSSPP_ARCH(ARM64_NEON)
	if (clamp) {
		// Let's start by clamping to the maximum.
		result.ivec = vminq_s32(result.ivec, vdupq_n_s32(width > 512 ? 511 : width - 1));
		// And then to zero.
		result.ivec = vmaxq_s32(result.ivec, vdupq_n_s32(0));
	} else {
		result.ivec = vandq_s32(result.ivec, vdupq_n_s32((width - 1) & 511));
	}
#else
	if (clamp) {
		for (int i = 0; i < 4; ++i) {
			result[i] = ClampUV(result[i], width);
		}
	} else {
		for (int i = 0; i < 4; ++i) {
			result[i] = WrapUV(result[i], width);
		}
	}
#endif

	return ToVec4IntResult(result);
}
673

674
// Builds the horizontal coordinates of a 2x2 texel footprint starting at u,
// i.e. { u, u + 1, u, u + 1 }, and clamps or wraps each of them.
static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuadS(bool clamp, int u, int width) {
#ifdef _M_SSE
	// _mm_set_epi32 lists lanes from highest to lowest, so this adds { 0, 1, 0, 1 }.
	__m128i uvec = _mm_add_epi32(_mm_set1_epi32(u), _mm_set_epi32(1, 0, 1, 0));
	return ApplyTexelClampQuad(clamp, uvec, width);
#elif PPSSPP_ARCH(ARM64_NEON)
	static const int32_t u2[4] = { 0, 1, 0, 1 };
	int32x4_t uvec = vaddq_s32(vdupq_n_s32(u), vld1q_s32(u2));
	return ApplyTexelClampQuad(clamp, uvec, width);
#else
	Vec4<int> result = Vec4<int>::AssignToAll(u) + Vec4<int>(0, 1, 0, 1);
	return ApplyTexelClampQuad(clamp, ToVec4IntArg(result), width);
#endif
}
687

688
// Builds the vertical coordinates of a 2x2 texel footprint starting at v,
// i.e. { v, v, v + 1, v + 1 }, and clamps or wraps each of them.
static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuadT(bool clamp, int v, int height) {
#ifdef _M_SSE
	// _mm_set_epi32 lists lanes from highest to lowest, so this adds { 0, 0, 1, 1 }.
	__m128i vvec = _mm_add_epi32(_mm_set1_epi32(v), _mm_set_epi32(1, 1, 0, 0));
	return ApplyTexelClampQuad(clamp, vvec, height);
#elif PPSSPP_ARCH(ARM64_NEON)
	static const int32_t v2[4] = { 0, 0, 1, 1 };
	int32x4_t vvec = vaddq_s32(vdupq_n_s32(v), vld1q_s32(v2));
	return ApplyTexelClampQuad(clamp, vvec, height);
#else
	Vec4<int> result = Vec4<int>::AssignToAll(v) + Vec4<int>(0, 0, 1, 1);
	return ApplyTexelClampQuad(clamp, ToVec4IntArg(result), height);
#endif
}
701

702
// Computes the four horizontal texel coordinates used for bilinear filtering
// at in_s on the given level.
//
// frac_u receives the 4-bit horizontal blend weight (0..15) between the left
// and right texels.
static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadS(int level, float in_s, int &frac_u, const SamplerID &samplerID) {
	const int width = samplerID.cached.sizes[level].w;

	// Texel position with 8 fractional bits, moved back by half a texel (128).
	const int fixed_u = (int)(in_s * width * 256) - 128;
	// The top four fractional bits become the blend weight.
	frac_u = (int)(fixed_u >> 4) & 0x0F;

	// Need to generate and individually wrap/clamp the four sample coordinates. Ugh.
	return ApplyTexelClampQuadS(samplerID.clampS, fixed_u >> 8, width);
}
712

713
// Computes the four vertical texel coordinates used for bilinear filtering
// at in_t on the given level.
//
// frac_v receives the 4-bit vertical blend weight (0..15) between the top
// and bottom texels.
static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadT(int level, float in_t, int &frac_v, const SamplerID &samplerID) {
	const int height = samplerID.cached.sizes[level].h;

	// Texel position with 8 fractional bits, moved back by half a texel (128).
	const int fixed_v = (int)(in_t * height * 256) - 128;
	// The top four fractional bits become the blend weight.
	frac_v = (int)(fixed_v >> 4) & 0x0F;

	// Need to generate and individually wrap/clamp the four sample coordinates. Ugh.
	return ApplyTexelClampQuadT(samplerID.clampT, fixed_v >> 8, height);
}
723

724
// Bilinearly filters one mip level at (s, t).
//
// tptr, bufw: texture pointers and row pitches; only entry 0 is read, so the
//             caller passes them already advanced to the wanted level.
// texlevel:   level whose dimensions (and CLUT4 palette) are used.
//
// The four texels of the 2x2 footprint are weighted with 4-bit fractions in
// each direction, so the weighted sum is divided by 256 at the end.
static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8 *const *tptr, const uint16_t *bufw, int texlevel, const SamplerID &samplerID) {
	int frac_u, frac_v;
	const Vec4<int> u = GetTexelCoordinatesQuadS(texlevel, s, frac_u, samplerID);
	const Vec4<int> v = GetTexelCoordinatesQuadT(texlevel, t, frac_v, samplerID);
	// Sample order: top-left, top-right, bottom-left, bottom-right.
	Nearest4 c = SampleNearest<4>(u.AsArray(), v.AsArray(), tptr[0], bufw[0], texlevel, samplerID);
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128i samples = _mm_loadu_si128((const __m128i*)(c.v));
	// Widen the 8-bit channels to 16 bits: `top` holds the two upper texels, `bot` the two lower ones.
	__m128i top = _mm_unpacklo_epi8(samples, zero);
	__m128i bot = _mm_unpackhi_epi8(samples, zero);
	// I just want a reasonably efficient
	// __m128i mul_u = _mm_setr_epi16(0x10 - frac_u, 0x10 - frac_u, 0x10 - frac_u, 0x10 - frac_u, frac_u, frac_u, frac_u, frac_u);
	// GCC/clang do something decent for that, MSVC - not so much.
	// Hence this. (0x10 - frac_u) is expressed as (frac_u ^ 0xF) + 1,
	// which REQUIRES 0 <= frac_u < 0x10.
	__m128i mul_u =	_mm_set1_epi16(frac_u);
	mul_u = _mm_xor_si128(mul_u, _mm_setr_epi16(0xF, 0xF, 0xF, 0xF, 0x0, 0x0, 0x0, 0x0));
	mul_u = _mm_add_epi16(mul_u, _mm_setr_epi16(0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0));
	// Blend vertically first, then weight the left/right halves horizontally.
	top = _mm_mullo_epi16(top, _mm_set1_epi16(0x10 - frac_v));
	bot = _mm_mullo_epi16(bot, _mm_set1_epi16(frac_v));
	__m128i sum = _mm_add_epi16(top, bot);
	sum = _mm_mullo_epi16(sum, mul_u);
	// Add the right half onto the left half, then divide by 16 * 16.
	sum = _mm_add_epi16(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(3, 2, 3, 2)));
	sum = _mm_srli_epi16(sum, 8);
	sum = _mm_unpacklo_epi16(sum, zero);
	return sum;
#else
	Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);
	Vec4<int> texcolor_tr = Vec4<int>::FromRGBA(c.v[1]);
	Vec4<int> texcolor_bl = Vec4<int>::FromRGBA(c.v[2]);
	Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
	Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
	Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> (4 + 4));
#endif
}
760

761
// Non-JIT bilinear sampler: filters `texlevel` at (s, t), optionally blends
// with the next level, and applies the texture function.
//
// levelFrac: blend weight toward the next level in 1/16 steps; 0 means only
//            `texlevel` is sampled.
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {
	Vec4<int> blended = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
	if (levelFrac != 0) {
		// The next level's pointer and pitch are the following array entries.
		const Vec4<int> next = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
		blended = (next * levelFrac + blended * (16 - levelFrac)) >> 4;
	}
	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(blended), samplerID);
}
769

770
};
771

772
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company