CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/Rasterizer.cpp
Views: 1401
1
// Copyright (c) 2013- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
#include <algorithm>
20
#include <cmath>
21
22
#include "Common/Common.h"
23
#include "Common/CPUDetect.h"
24
#include "Common/Data/Convert/ColorConv.h"
25
#include "Common/Profiler/Profiler.h"
26
#include "Common/StringUtils.h"
27
#include "Core/Config.h"
28
#include "Core/Debugger/MemBlockInfo.h"
29
#include "Core/MemMap.h"
30
#include "GPU/GPUState.h"
31
32
#include "GPU/Common/TextureDecoder.h"
33
#include "GPU/Software/BinManager.h"
34
#include "GPU/Software/DrawPixel.h"
35
#include "GPU/Software/Rasterizer.h"
36
#include "GPU/Software/Sampler.h"
37
#include "GPU/Software/SoftGpu.h"
38
#include "GPU/Software/TransformUnit.h"
39
40
#if defined(_M_SSE)
41
#include <emmintrin.h>
42
#include <smmintrin.h>
43
#endif
44
45
namespace Rasterizer {
46
47
// Only OK on x64 where our stack is aligned
48
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
49
// Weighted blend of three float vectors (SSE): (c0 * w0 + c1 * w1 + c2 * w2) * wsum.
// Each integer weight is broadcast to all four lanes and converted to float first.
static inline __m128 InterpolateF(const __m128 &c0, const __m128 &c1, const __m128 &c2, int w0, int w1, int w2, float wsum) {
	const __m128 weight0 = _mm_cvtepi32_ps(_mm_set1_epi32(w0));
	const __m128 weight1 = _mm_cvtepi32_ps(_mm_set1_epi32(w1));
	const __m128 weight2 = _mm_cvtepi32_ps(_mm_set1_epi32(w2));

	// Accumulate in c0, c1, c2 order so the float rounding stays the same.
	__m128 acc = _mm_mul_ps(c0, weight0);
	acc = _mm_add_ps(acc, _mm_mul_ps(c1, weight1));
	acc = _mm_add_ps(acc, _mm_mul_ps(c2, weight2));

	return _mm_mul_ps(acc, _mm_set_ps1(wsum));
}
55
56
// Integer variant (SSE): converts the three inputs to float, blends them with InterpolateF(),
// and converts the blended lanes back to 32-bit integers.
static inline __m128i InterpolateI(const __m128i &c0, const __m128i &c1, const __m128i &c2, int w0, int w1, int w2, float wsum) {
	const __m128 f0 = _mm_cvtepi32_ps(c0);
	const __m128 f1 = _mm_cvtepi32_ps(c1);
	const __m128 f2 = _mm_cvtepi32_ps(c2);
	const __m128 blended = InterpolateF(f0, f1, f2, w0, w1, w2, wsum);
	return _mm_cvtps_epi32(blended);
}
59
#elif PPSSPP_ARCH(ARM64_NEON)
60
// Weighted blend of three float vectors (NEON): (c0 * w0 + c1 * w1 + c2 * w2) * wsum.
// Each integer weight is broadcast to all four lanes and converted to float first.
static inline float32x4_t InterpolateF(const float32x4_t &c0, const float32x4_t &c1, const float32x4_t &c2, int w0, int w1, int w2, float wsum) {
	const float32x4_t weight0 = vcvtq_f32_s32(vdupq_n_s32(w0));
	const float32x4_t weight1 = vcvtq_f32_s32(vdupq_n_s32(w1));
	const float32x4_t weight2 = vcvtq_f32_s32(vdupq_n_s32(w2));

	// Accumulate in c0, c1, c2 order so the float rounding stays the same.
	float32x4_t acc = vmulq_f32(c0, weight0);
	acc = vaddq_f32(acc, vmulq_f32(c1, weight1));
	acc = vaddq_f32(acc, vmulq_f32(c2, weight2));

	return vmulq_f32(acc, vdupq_n_f32(wsum));
}
66
67
// Integer variant (NEON): converts the three inputs to float, blends them with InterpolateF(),
// and converts the blended lanes back to 32-bit integers.
static inline int32x4_t InterpolateI(const int32x4_t &c0, const int32x4_t &c1, const int32x4_t &c2, int w0, int w1, int w2, float wsum) {
	const float32x4_t f0 = vcvtq_f32_s32(c0);
	const float32x4_t f1 = vcvtq_f32_s32(c1);
	const float32x4_t f2 = vcvtq_f32_s32(c2);
	const float32x4_t blended = InterpolateF(f0, f1, f2, w0, w1, w2, wsum);
	return vcvtq_s32_f32(blended);
}
70
#endif
71
72
// NOTE: When not casting color0 and color1 to float vectors, this code suffers from severe overflow issues.
73
// Not sure if that should be regarded as a bug or if casting to float is a valid fix.
74
75
// Blends three 4-component integer vectors with integer weights, scaling the sum by wsum.
// The arithmetic is carried out in float and the result converted back to int.
static inline Vec4<int> Interpolate(const Vec4<int> &c0, const Vec4<int> &c1, const Vec4<int> &c2, int w0, int w1, int w2, float wsum) {
#if (defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)) && !PPSSPP_ARCH(X86)
	// SIMD path: hand the raw lanes to the vectorized helper.
	const auto lanes = InterpolateI(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum);
	return Vec4<int>(lanes);
#else
	// Portable path: same formula using the Vec4 operators.
	const Vec4<float> weighted = c0.Cast<float>() * w0 + c1.Cast<float>() * w1 + c2.Cast<float>() * w2;
	return (weighted * wsum).Cast<int>();
#endif
}
82
83
// Blends three 3-component integer vectors with integer weights, scaling the sum by wsum.
// The arithmetic is carried out in float and the result converted back to int.
static inline Vec3<int> Interpolate(const Vec3<int> &c0, const Vec3<int> &c1, const Vec3<int> &c2, int w0, int w1, int w2, float wsum) {
#if (defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)) && !PPSSPP_ARCH(X86)
	// SIMD path: hand the raw lanes to the vectorized helper.
	const auto lanes = InterpolateI(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum);
	return Vec3<int>(lanes);
#else
	// Portable path: same formula using the Vec3 operators.
	const Vec3<float> weighted = c0.Cast<float>() * w0 + c1.Cast<float>() * w1 + c2.Cast<float>() * w2;
	return (weighted * wsum).Cast<int>();
#endif
}
90
91
// Interpolates one scalar attribute for four samples at once.
// Each lane of w0/w1/w2 holds that sample's weight for c0/c1/c2; the weighted sum is then
// multiplied lane-wise by wsum_recip.
static inline Vec4<float> Interpolate(const float &c0, const float &c1, const float &c2, const Vec4<float> &w0, const Vec4<float> &w1, const Vec4<float> &w2, const Vec4<float> &wsum_recip) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
	const __m128 term0 = _mm_mul_ps(w0.vec, _mm_set1_ps(c0));
	const __m128 term1 = _mm_mul_ps(w1.vec, _mm_set1_ps(c1));
	const __m128 term2 = _mm_mul_ps(w2.vec, _mm_set1_ps(c2));
	// Sum as (term0 + term1) + term2 to keep the rounding order.
	const __m128 sum = _mm_add_ps(_mm_add_ps(term0, term1), term2);
	return _mm_mul_ps(sum, wsum_recip.vec);
#elif PPSSPP_ARCH(ARM64_NEON)
	const float32x4_t term0 = vmulq_f32(w0.vec, vdupq_n_f32(c0));
	const float32x4_t term1 = vmulq_f32(w1.vec, vdupq_n_f32(c1));
	const float32x4_t term2 = vmulq_f32(w2.vec, vdupq_n_f32(c2));
	// Sum as (term0 + term1) + term2 to keep the rounding order.
	const float32x4_t sum = vaddq_f32(vaddq_f32(term0, term1), term2);
	return vmulq_f32(sum, wsum_recip.vec);
#else
	const Vec4<float> sum = w0 * c0 + w1 * c1 + w2 * c2;
	return sum * wsum_recip;
#endif
}
106
107
// Convenience overload taking integer weights: casts them to float and forwards to the
// Vec4<float> weight overload.
static inline Vec4<float> Interpolate(const float &c0, const float &c1, const float &c2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip) {
	const Vec4<float> fw0 = w0.Cast<float>();
	const Vec4<float> fw1 = w1.Cast<float>();
	const Vec4<float> fw2 = w2.Cast<float>();
	return Interpolate(c0, c1, c2, fw0, fw1, fw2, wsum_recip);
}
110
111
// Fills in *state from the current global GPU state (gstate) and config.
//
// Looks up the pixel function, and - when texturing is enabled outside clear mode - the
// sampler functions, per-level texture addresses/strides/pointers and the LOD/filter settings.
// binner is passed through to the function lookups.
void ComputeRasterizerState(RasterizerState *state, BinManager *binner) {
	ComputePixelFuncID(&state->pixelID);
	state->drawPixel = Rasterizer::GetSingleFunc(state->pixelID, binner);

	state->enableTextures = gstate.isTextureMapEnabled() && !state->pixelID.clearMode;
	if (state->enableTextures) {
		ComputeSamplerID(&state->samplerID);
		state->linear = Sampler::GetLinearFunc(state->samplerID, binner);
		state->nearest = Sampler::GetNearestFunc(state->samplerID, binner);

		// The two function types share a definition, so a forced filter mode is
		// implemented by pointing both slots at the same function.
		if (g_Config.iTexFiltering == TEX_FILTER_FORCE_LINEAR) {
			state->nearest = state->linear;
		} else if (g_Config.iTexFiltering == TEX_FILTER_FORCE_NEAREST) {
			state->linear = state->nearest;
		}

		// Without mips only level 0 is ever sampled.
		state->maxTexLevel = state->samplerID.hasAnyMips ? gstate.getTextureMaxLevel() : 0;

		const GETextureFormat texfmt = state->samplerID.TexFmt();
		for (uint8_t level = 0; level <= state->maxTexLevel; level++) {
			const u32 addr = gstate.getTextureAddress(level);
			state->texaddr[level] = addr;
			state->texbufw[level] = (uint16_t)GetTextureBufw(level, addr, texfmt);
			// Invalid addresses are recorded as a null pointer.
			state->texptr[level] = Memory::IsValidAddress(addr) ? Memory::GetPointerUnchecked(addr) : nullptr;
		}

		state->textureLodSlope = gstate.getTextureLodSlope();
		state->texLevelMode = gstate.getTexLevelMode();
		state->texLevelOffset = (int8_t)gstate.getTexLevelOffset16();
		state->mipFilt = gstate.isMipmapFilteringEnabled();
		state->minFilt = gstate.isMinifyFilteringEnabled();
		state->magFilt = gstate.isMagnifyFilteringEnabled();

		bool useProj = gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_MATRIX;
		if (useProj) {
			// We may be able to optimize this off. This is actually kinda common.
			const bool qZeroST = gstate.tgenMatrix[2] == 0.0f && gstate.tgenMatrix[5] == 0.0f;
			const bool qZeroQ = gstate.tgenMatrix[8] == 0.0f;
			// Two common cases: the source q factor is zero, OR source is UV.
			const bool qFactorZero = gstate.getUVProjMode() == GE_PROJMAP_UV;

			if (qZeroST && (qZeroQ || qFactorZero) && gstate.tgenMatrix[11] == 1.0f)
				useProj = false;
		}
		state->textureProj = useProj;
	}

	state->shadeGouraud = !gstate.isModeClear() && gstate.getShadeMode() == GE_SHADE_GOURAUD;
	state->throughMode = gstate.isModeThrough();
	state->antialiasLines = gstate.isAntiAliasEnabled();

#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
	// Remember which display list PC produced this state, for memory tagging.
	DisplayList currentList{};
	if (gpuDebug)
		gpuDebug->GetCurrentDisplayList(currentList);
	state->listPC = currentList.pc;
#endif
}
172
173
// Accumulates per-vertex observations into state->flags.
// When useColor is set, the vertex color contributes the NON_FULL_WHITE / ALPHA_NON_ZERO /
// ALPHA_NON_FULL flags.  Fog is always inspected.
static inline void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0, bool useColor) {
	if (useColor) {
		const bool rgbIsWhite = (v0.color0 & 0x00FFFFFF) == 0x00FFFFFF;
		if (!rgbIsWhite)
			state->flags |= RasterizerStateFlags::VERTEX_NON_FULL_WHITE;

		const uint8_t vertexAlpha = v0.color0 >> 24;
		if (vertexAlpha != 0)
			state->flags |= RasterizerStateFlags::VERTEX_ALPHA_NON_ZERO;
		if (vertexAlpha != 0xFF)
			state->flags |= RasterizerStateFlags::VERTEX_ALPHA_NON_FULL;
	}

	// Written as a negated >= so that a NaN fogdepth also counts as "has fog".
	if (!(v0.fogdepth >= 1.0f))
		state->flags |= RasterizerStateFlags::VERTEX_HAS_FOG;
}
186
187
// Single-vertex form: the vertex color is always taken into account.
void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0) {
	const bool useColor = true;
	CalculateRasterStateFlags(state, v0, useColor);
}
190
191
// Two-vertex form: v1's color is always used; v0's color only counts when Gouraud
// shading is active and the caller did not force flat shading.
void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0, const VertexData &v1, bool forceFlat) {
	const bool firstColorMatters = !forceFlat && state->shadeGouraud;
	CalculateRasterStateFlags(state, v0, firstColorMatters);
	CalculateRasterStateFlags(state, v1, true);
}
195
196
// Three-vertex form: v2's color is always used; v0 and v1 colors only count when
// Gouraud shading is active.
void CalculateRasterStateFlags(RasterizerState *state, const VertexData &v0, const VertexData &v1, const VertexData &v2) {
	// The per-vertex helper only touches state->flags, so this can be read once.
	const bool gouraud = state->shadeGouraud;
	CalculateRasterStateFlags(state, v0, gouraud);
	CalculateRasterStateFlags(state, v1, gouraud);
	CalculateRasterStateFlags(state, v2, true);
}
201
202
static inline int OptimizePixelIDFlags(const RasterizerStateFlags &flags) {
203
return (int)flags & (int)RasterizerStateFlags::OPTIMIZED_PIXELID;
204
}
205
206
static inline int OptimizeSamplerIDFlags(const RasterizerStateFlags &flags) {
207
return (int)flags & (int)RasterizerStateFlags::OPTIMIZED_SAMPLERID;
208
}
209
210
static inline int OptimizeAllFlags(const RasterizerStateFlags &flags) {
211
return OptimizePixelIDFlags(flags) | OptimizeSamplerIDFlags(flags);
212
}
213
214
// Returns flags with every bit that is set in mask turned off.
static inline RasterizerStateFlags ClearFlags(const RasterizerStateFlags &flags, const RasterizerStateFlags &mask) {
	const int kept = (int)flags & ~(int)mask;
	return (RasterizerStateFlags)kept;
}
218
219
// Returns flags with its pixel-ID optimization bits swapped for the ones found in replace.
// All other bits of flags are preserved.
static inline RasterizerStateFlags ReplacePixelIDFlags(const RasterizerStateFlags &flags, const RasterizerStateFlags &replace) {
	const RasterizerStateFlags withoutPixelBits = ClearFlags(flags, RasterizerStateFlags::OPTIMIZED_PIXELID);
	const RasterizerStateFlags newPixelBits = (RasterizerStateFlags)OptimizePixelIDFlags(replace);
	return withoutPixelBits | newPixelBits;
}
223
224
// Returns flags with its sampler-ID optimization bits swapped for the ones found in replace.
// All other bits of flags are preserved.
static inline RasterizerStateFlags ReplaceSamplerIDFlags(const RasterizerStateFlags &flags, const RasterizerStateFlags &replace) {
	const RasterizerStateFlags withoutSamplerBits = ClearFlags(flags, RasterizerStateFlags::OPTIMIZED_SAMPLERID);
	const RasterizerStateFlags newSamplerBits = (RasterizerStateFlags)OptimizeSamplerIDFlags(replace);
	return withoutSamplerBits | newSamplerBits;
}
228
229
// Reports whether every scanned entry of the cached CLUT has full alpha.
//
// The outcome is cached in state->flags: CLUT_ALPHA_CHECKED marks that the scan was done,
// CLUT_ALPHA_NON_FULL records that some entry is not fully opaque, and CLUT_ALPHA_NON_ZERO
// records that no entry can have zero alpha.  When the CLUT uses an offset or is not shared,
// this returns false without scanning and without caching anything.
static bool CheckClutAlphaFull(RasterizerState *state) {
	// We only need to check it once.
	if (state->flags & RasterizerStateFlags::CLUT_ALPHA_CHECKED)
		return !(state->flags & RasterizerStateFlags::CLUT_ALPHA_NON_FULL);
	// For now, let's keep things simple.
	const SamplerID &samplerID = state->samplerID;
	if (samplerID.hasClutOffset || !samplerID.useSharedClut)
		return false;

	uint32_t count = samplerID.TexFmt() == GE_TFMT_CLUT4 ? 16 : 256;
	if (samplerID.hasClutMask)
		count = std::min(count, ((samplerID.cached.clutFormat >> 8) & 0xFF) + 1);

	// NOTE(review): assumes CheckMask32/CheckMask16 AND every entry into alphaSum, so a bit
	// left set afterwards is set in all entries - confirm against ColorConv.
	u32 alphaSum = 0xFFFFFFFF;
	if (samplerID.ClutFmt() == GE_CMODE_32BIT_ABGR8888) {
		CheckMask32((const uint32_t *)samplerID.cached.clut, count, &alphaSum);
	} else {
		CheckMask16((const uint16_t *)samplerID.cached.clut, count, &alphaSum);
	}

	// Which bits of an entry hold alpha.  Stays 0 for formats without an alpha channel.
	u32 alphaMask = 0;
	switch (samplerID.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
		break;

	case GE_CMODE_16BIT_ABGR5551:
		alphaMask = 0x8000;
		break;

	case GE_CMODE_16BIT_ABGR4444:
		alphaMask = 0xF000;
		break;

	case GE_CMODE_32BIT_ABGR8888:
		alphaMask = 0xFF000000;
		break;
	}

	// alphaSum also carries the color bits, so only the alpha bits may be inspected here.
	const u32 commonAlphaBits = alphaSum & alphaMask;
	const bool onlyFull = commonAlphaBits == alphaMask;

	// Might just be different patterns, but if some alpha bit is set in every entry, no entry
	// can have zero alpha.  Full alpha (including formats with no alpha bits) implies this too.
	if (onlyFull || commonAlphaBits != 0)
		state->flags |= RasterizerStateFlags::CLUT_ALPHA_NON_ZERO;
	if (!onlyFull)
		state->flags |= RasterizerStateFlags::CLUT_ALPHA_NON_FULL;
	state->flags |= RasterizerStateFlags::CLUT_ALPHA_CHECKED;

	return onlyFull;
}
276
277
// Works out which OPTIMIZED_* flags could be applied to the state, based on the vertex and
// CLUT observations accumulated in state->flags.  Nothing is applied here; the chosen set is
// returned.  May call CheckClutAlphaFull(), which caches CLUT_* flags in state->flags.
//
// Settings that an earlier optimization already replaced are evaluated as their original
// values, so the result does not depend on whether the state is currently optimized.
static RasterizerStateFlags DetectStateOptimizations(RasterizerState *state) {
	// Note: all optimizations must be undoable.
	RasterizerStateFlags found = RasterizerStateFlags::NONE;
	auto &pixelID = state->pixelID;
	auto &samplerID = state->samplerID;

	const bool vertexAlphaZero = !(state->flags & RasterizerStateFlags::VERTEX_ALPHA_NON_ZERO);
	const bool vertexAlphaFull = !(state->flags & RasterizerStateFlags::VERTEX_ALPHA_NON_FULL);
	const bool usesClut = (samplerID.texfmt & 4) != 0;
	bool needTextureAlpha = state->enableTextures && samplerID.useTextureAlpha;

	// Blend factors as they were before OPTIMIZED_BLEND_SRC / _DST replaced them.
	const auto originalBlendSrc = [&]() -> PixelBlendFactor {
		if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_SRC)
			return PixelBlendFactor::SRCALPHA;
		return pixelID.AlphaBlendSrc();
	};
	const auto originalBlendDst = [&]() -> PixelBlendFactor {
		if (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_DST)
			return PixelBlendFactor::INVSRCALPHA;
		return pixelID.AlphaBlendDst();
	};
	// Alpha test func as it was before any alpha test optimization.  Priority: ON, then GT, then NE.
	const auto originalAlphaTestFunc = [&]() -> GEComparison {
		if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON)
			return GE_COMP_ALWAYS;
		// > 16, 8, or similar are also very common.
		if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT)
			return GE_COMP_GREATER;
		// We optimize > 0 to != 0, so this is especially common.
		if (state->flags & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE)
			return GE_COMP_NOTEQUAL;
		return pixelID.AlphaTestFunc();
	};

	if (!pixelID.clearMode) {
		const bool alphaBlend = pixelID.alphaBlend || (state->flags & RasterizerStateFlags::OPTIMIZED_BLEND_OFF);

		// A CLUT whose entries are all opaque means texture alpha can be ignored.
		if (needTextureAlpha && alphaBlend && vertexAlphaFull && usesClut && CheckClutAlphaFull(state))
			needTextureAlpha = false;

		if (alphaBlend && !needTextureAlpha) {
			const PixelBlendFactor src = originalBlendSrc();
			const PixelBlendFactor dst = originalBlendDst();

			// Okay, we may be able to convert this to a fixed value.
			if (vertexAlphaZero || vertexAlphaFull) {
				// If it was already set and we still can, set it again.
				if (src == PixelBlendFactor::SRCALPHA)
					found |= RasterizerStateFlags::OPTIMIZED_BLEND_SRC;
				if (dst == PixelBlendFactor::INVSRCALPHA)
					found |= RasterizerStateFlags::OPTIMIZED_BLEND_DST;
			}

			const bool srcActsAsOne = src == PixelBlendFactor::SRCALPHA || src == PixelBlendFactor::ONE;
			const bool dstActsAsZero = dst == PixelBlendFactor::INVSRCALPHA || dst == PixelBlendFactor::ZERO;
			if (vertexAlphaFull && srcActsAsOne && dstActsAsZero)
				found |= RasterizerStateFlags::OPTIMIZED_BLEND_OFF;
		}

		if (alphaBlend && (needTextureAlpha || !vertexAlphaFull)) {
			// Okay, we're blending, and we need to. Are we alpha testing?
			const GEComparison alphaTestFunc = originalAlphaTestFunc();
			const PixelBlendFactor src = originalBlendSrc();
			const PixelBlendFactor dst = originalBlendDst();

			if (alphaTestFunc == GE_COMP_ALWAYS && src == PixelBlendFactor::SRCALPHA && dst == PixelBlendFactor::INVSRCALPHA) {
				// The CLUT scan has to run before CLUT_ALPHA_NON_ZERO is read, since it sets it.
				const bool clutAlphaFull = usesClut && CheckClutAlphaFull(state);
				const bool clutAlphaNonZero = state->flags & RasterizerStateFlags::CLUT_ALPHA_NON_ZERO;
				const bool couldHaveZeroTexAlpha = !clutAlphaFull && !clutAlphaNonZero;

				// Blending is expensive, since we read the target. Force alpha testing on.
				if (!pixelID.depthWrite && !pixelID.stencilTest && couldHaveZeroTexAlpha)
					found |= RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON;
			}
		}

		// Fog can be switched off when no vertex seen so far had any.
		const bool applyFog = pixelID.applyFog || (state->flags & RasterizerStateFlags::OPTIMIZED_FOG_OFF);
		if (applyFog && !(state->flags & RasterizerStateFlags::VERTEX_HAS_FOG))
			found |= RasterizerStateFlags::OPTIMIZED_FOG_OFF;
	}

	if (state->enableTextures) {
		const bool colorFull = !(state->flags & RasterizerStateFlags::VERTEX_NON_FULL_WHITE);
		if (colorFull && (!needTextureAlpha || vertexAlphaFull)) {
			// Modulate is common, sometimes even with a fixed color. Replace is cheaper.
			const bool wasModulate = (state->flags & RasterizerStateFlags::OPTIMIZED_TEXREPLACE) || samplerID.TexFunc() == GE_TEXFUNC_MODULATE;
			if (wasModulate)
				found |= RasterizerStateFlags::OPTIMIZED_TEXREPLACE;
		}

		if (usesClut && vertexAlphaFull && samplerID.useTextureAlpha) {
			const GEComparison alphaTestFunc = originalAlphaTestFunc();
			const bool comparesNonZero = alphaTestFunc == GE_COMP_NOTEQUAL || alphaTestFunc == GE_COMP_GREATER;
			const bool alphaTest = comparesNonZero && pixelID.alphaTestRef < 0xFF && !state->pixelID.hasAlphaTestMask;

			if (alphaTest) {
				// Run the scan first; the flag read below may be set by it.
				bool canSkipAlphaTest = CheckClutAlphaFull(state);
				if ((state->flags & RasterizerStateFlags::CLUT_ALPHA_NON_ZERO) && pixelID.alphaTestRef == 0)
					canSkipAlphaTest = true;

				if (canSkipAlphaTest)
					found |= alphaTestFunc == GE_COMP_NOTEQUAL ? RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE : RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT;
			}
		}
	}

	return found;
}
394
395
// Brings state in line with the requested optimization flags.
//
// For both the pixel ID and the sampler ID: builds a modified copy with each optimization
// either applied (flag set in optimize) or reverted (flag set in state->flags only), looks
// up the matching functions, and commits the copy only if the lookup succeeded.
// Always records state->flags into state->lastFlags.  Returns true if anything was committed.
static bool ApplyStateOptimizations(RasterizerState *state, const RasterizerStateFlags &optimize) {
	bool changed = false;

	// Check if we can compile the new funcs before replacing.
	if (OptimizePixelIDFlags(state->flags) != OptimizePixelIDFlags(optimize)) {
		// state->flags is not written until the commit below, so a snapshot is safe.
		const RasterizerStateFlags active = state->flags;
		const bool canFull = !(active & RasterizerStateFlags::VERTEX_ALPHA_NON_FULL);

		PixelFuncID newPixelID = state->pixelID;

		if (optimize & RasterizerStateFlags::OPTIMIZED_BLEND_OFF) {
			newPixelID.alphaBlend = false;
		} else if (active & RasterizerStateFlags::OPTIMIZED_BLEND_OFF) {
			newPixelID.alphaBlend = true;
		}

		if (optimize & RasterizerStateFlags::OPTIMIZED_BLEND_SRC) {
			newPixelID.alphaBlendSrc = (uint8_t)(canFull ? PixelBlendFactor::ONE : PixelBlendFactor::ZERO);
		} else if (active & RasterizerStateFlags::OPTIMIZED_BLEND_SRC) {
			newPixelID.alphaBlendSrc = (uint8_t)PixelBlendFactor::SRCALPHA;
		}

		if (optimize & RasterizerStateFlags::OPTIMIZED_BLEND_DST) {
			newPixelID.alphaBlendDst = (uint8_t)(canFull ? PixelBlendFactor::ZERO : PixelBlendFactor::ONE);
		} else if (active & RasterizerStateFlags::OPTIMIZED_BLEND_DST) {
			newPixelID.alphaBlendDst = (uint8_t)PixelBlendFactor::INVSRCALPHA;
		}

		if (optimize & RasterizerStateFlags::OPTIMIZED_FOG_OFF) {
			newPixelID.applyFog = false;
		} else if (active & RasterizerStateFlags::OPTIMIZED_FOG_OFF) {
			newPixelID.applyFog = true;
		}

		// Alpha test: this is a single chain, the first matching case wins.
		if (optimize & (RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE | RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT)) {
			newPixelID.alphaTestFunc = GE_COMP_ALWAYS;
		} else if (active & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_NE) {
			newPixelID.alphaTestFunc = GE_COMP_NOTEQUAL;
		} else if (active & RasterizerStateFlags::OPTIMIZED_ALPHATEST_OFF_GT) {
			newPixelID.alphaTestFunc = GE_COMP_GREATER;
		} else if (optimize & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON) {
			newPixelID.alphaTestFunc = GE_COMP_NOTEQUAL;
			newPixelID.alphaTestRef = 0;
			newPixelID.hasAlphaTestMask = false;
		} else if (active & RasterizerStateFlags::OPTIMIZED_ALPHATEST_ON) {
			newPixelID.alphaTestFunc = GE_COMP_ALWAYS;
		}

		const SingleFunc newDrawPixel = Rasterizer::GetSingleFunc(newPixelID, nullptr);
		// Can't compile during runtime. This failing is a bit of a problem when undoing...
		if (newDrawPixel) {
			state->drawPixel = newDrawPixel;
			memcpy(&state->pixelID, &newPixelID, sizeof(PixelFuncID));
			state->flags = ReplacePixelIDFlags(state->flags, optimize) | RasterizerStateFlags::OPTIMIZED;
			changed = true;
		}
	}

	if (OptimizeSamplerIDFlags(state->flags) != OptimizeSamplerIDFlags(optimize)) {
		SamplerID newSamplerID = state->samplerID;
		if (optimize & RasterizerStateFlags::OPTIMIZED_TEXREPLACE) {
			newSamplerID.texFunc = (uint8_t)GE_TEXFUNC_REPLACE;
		} else if (state->flags & RasterizerStateFlags::OPTIMIZED_TEXREPLACE) {
			newSamplerID.texFunc = (uint8_t)GE_TEXFUNC_MODULATE;
		}

		Sampler::LinearFunc linear = Sampler::GetLinearFunc(newSamplerID, nullptr);
		Sampler::LinearFunc nearest = Sampler::GetNearestFunc(newSamplerID, nullptr);
		// Can't compile during runtime. This failing is a bit of a problem when undoing...
		if (linear && nearest) {
			// Since the definitions are the same, just force this setting using the func pointer.
			const bool forceLinear = g_Config.iTexFiltering == TEX_FILTER_FORCE_LINEAR;
			const bool forceNearest = !forceLinear && g_Config.iTexFiltering == TEX_FILTER_FORCE_NEAREST;
			state->nearest = forceLinear ? linear : nearest;
			state->linear = forceNearest ? nearest : linear;

			memcpy(&state->samplerID, &newSamplerID, sizeof(SamplerID));
			state->flags = ReplaceSamplerIDFlags(state->flags, optimize) | RasterizerStateFlags::OPTIMIZED;
			changed = true;
		}
	}

	state->lastFlags = state->flags;
	return changed;
}
474
475
// Re-evaluates the state's optimizations if its flags changed since the last evaluation.
// Returns true when the pixel or sampler functions were replaced.
bool OptimizeRasterState(RasterizerState *state) {
	// Nothing new has been observed since last time.
	if (state->flags == state->lastFlags)
		return false;

	RasterizerStateFlags optimize = DetectStateOptimizations(state);

	// Read after detection, which may itself update state->flags.
	const bool alreadyOptimized = state->flags & RasterizerStateFlags::OPTIMIZED;
	if (alreadyOptimized) {
		// If it was optimized before, just revert and don't churn.
		if (OptimizeAllFlags(state->flags) != OptimizeAllFlags(optimize))
			optimize = RasterizerStateFlags::NONE;
	} else if (optimize == RasterizerStateFlags::NONE) {
		// Never optimized and nothing to do: just remember that these flags were evaluated.
		state->lastFlags = state->flags;
		return false;
	}

	return ApplyStateOptimizations(state, optimize);
}
491
492
// Tries to find a better-optimized state for a single flat-shaded draw whose color comes
// from v1.
//
// Works on a copy: the VERTEX_FLAT_RESET flags are cleared, the flags are recomputed from
// v1 alone, and optimizations are detected again.  If the detected set differs from what the
// copy currently has, it is applied and the copy returned; otherwise origState is returned
// untouched.
RasterizerState OptimizeFlatRasterizerState(const RasterizerState &origState, const VertexData &v1) {
	RasterizerState state = origState;

	// Sometimes, a particular draw can do better than the overall state.
	state.flags = ClearFlags(state.flags, RasterizerStateFlags::VERTEX_FLAT_RESET);
	CalculateRasterStateFlags(&state, v1, true);

	RasterizerStateFlags optimize = DetectStateOptimizations(&state);
	if (OptimizeAllFlags(state.flags) != OptimizeAllFlags(optimize)) {
		ApplyStateOptimizations(&state, optimize);
		return state;
	}

	return origState;
}
508
509
// Converts a fog depth to an 8-bit value by working directly on the IEEE 754 bit pattern.
//
// Negative inputs and inputs with a biased exponent of 118 or less (below 1/256) give 0;
// inputs with a biased exponent above 126 (1.0 and up, including infinity and positive NaN)
// give 255.  Everything in between is truncated to 8 bits of fraction.
static inline u8 ClampFogDepth(float fogdepth) {
	static_assert(sizeof(float) == sizeof(u32), "float must be 32 bits wide");

	// Copy the bits out with memcpy: reading the inactive member of a union is
	// undefined behavior in C++.
	u32 bits;
	memcpy(&bits, &fogdepth, sizeof(bits));

	// The sign bit is still part of exp here, but negative values return before it matters.
	u32 exp = bits >> 23;
	if ((bits & 0x80000000) != 0 || exp <= 126 - 8)
		return 0;
	if (exp > 126)
		return 255;

	// Restore the implicit leading one, then shift so that [0, 1) maps onto [0, 256).
	u32 mantissa = (bits & 0x007FFFFF) | 0x00800000;
	return static_cast<u8>(mantissa >> (16 + 126 - exp));
}
526
527
static inline void GetTextureCoordinates(const VertexData& v0, const VertexData& v1, const float p, float &s, float &t) {
528
// Note that for environment mapping, texture coordinates have been calculated during lighting
529
float q0 = 1.f / v0.clipw;
530
float q1 = 1.f / v1.clipw;
531
float wq0 = p * q0;
532
float wq1 = (1.0f - p) * q1;
533
534
float q_recip = 1.0f / (wq0 + wq1);
535
s = (v0.texturecoords.s() * wq0 + v1.texturecoords.s() * wq1) * q_recip;
536
t = (v0.texturecoords.t() * wq0 + v1.texturecoords.t() * wq1) * q_recip;
537
}
538
539
static inline void GetTextureCoordinatesProj(const VertexData& v0, const VertexData& v1, const float p, float &s, float &t) {
540
// This is for texture matrix projection.
541
float q0 = 1.f / v0.clipw;
542
float q1 = 1.f / v1.clipw;
543
float wq0 = p * q0;
544
float wq1 = (1.0f - p) * q1;
545
546
float q_recip = 1.0f / (v0.texturecoords.q() * wq0 + v1.texturecoords.q() * wq1);
547
548
s = (v0.texturecoords.s() * wq0 + v1.texturecoords.s() * wq1) * q_recip;
549
t = (v0.texturecoords.t() * wq0 + v1.texturecoords.t() * wq1) * q_recip;
550
}
551
552
static inline void GetTextureCoordinates(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {
553
// Note that for environment mapping, texture coordinates have been calculated during lighting.
554
float q0 = 1.f / v0.clipw;
555
float q1 = 1.f / v1.clipw;
556
float q2 = 1.f / v2.clipw;
557
Vec4<float> wq0 = w0.Cast<float>() * q0;
558
Vec4<float> wq1 = w1.Cast<float>() * q1;
559
Vec4<float> wq2 = w2.Cast<float>() * q2;
560
561
Vec4<float> q_recip = (wq0 + wq1 + wq2).Reciprocal();
562
s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), wq0, wq1, wq2, q_recip);
563
t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), wq0, wq1, wq2, q_recip);
564
}
565
566
static inline void GetTextureCoordinatesProj(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {
567
// This is for texture matrix projection.
568
float q0 = 1.f / v0.clipw;
569
float q1 = 1.f / v1.clipw;
570
float q2 = 1.f / v2.clipw;
571
Vec4<float> wq0 = w0.Cast<float>() * q0;
572
Vec4<float> wq1 = w1.Cast<float>() * q1;
573
Vec4<float> wq2 = w2.Cast<float>() * q2;
574
575
// Here, Interpolate() is a bit suboptimal, since
576
// there's no need to multiply by 1.0f.
577
Vec4<float> q_recip = Interpolate(v0.texturecoords.q(), v1.texturecoords.q(), v2.texturecoords.q(), wq0, wq1, wq2, Vec4<float>::AssignToAll(1.0f)).Reciprocal();
578
579
s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), wq0, wq1, wq2, q_recip);
580
t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), wq0, wq1, wq2, q_recip);
581
}
582
583
static inline void SetPixelDepth(int x, int y, int stride, u16 value) {
584
depthbuf.Set16(x, y, stride, value);
585
}
586
587
static inline bool IsRightSideOrFlatBottomLine(const Vec2<int>& vertex, const Vec2<int>& line1, const Vec2<int>& line2)
588
{
589
if (line1.y == line2.y) {
590
// just check if vertex is above us => bottom line parallel to x-axis
591
return vertex.y < line1.y;
592
} else {
593
// check if vertex is on our left => right side
594
return vertex.x < line1.x + (line2.x - line1.x) * (vertex.y - line1.y) / (line2.y - line1.y);
595
}
596
}
597
598
// Samples the texture for one pixel, using the state's linear sampler when bilinear is set
// and its nearest sampler otherwise.  The samplers receive pointers into the per-level
// arrays starting at texlevel.
static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(float s, float t, Vec4IntArg prim_color, int texlevel, int frac_texlevel, bool bilinear, const RasterizerState &state) {
	const u8 **levelPtrs = const_cast<const u8 **>(&state.texptr[texlevel]);
	const uint16_t *levelBufw = &state.texbufw[texlevel];

	if (bilinear) {
		return state.linear(s, t, prim_color, levelPtrs, levelBufw, texlevel, frac_texlevel, state.samplerID);
	}
	return state.nearest(s, t, prim_color, levelPtrs, levelBufw, texlevel, frac_texlevel, state.samplerID);
}
607
608
// Named entry point for texturing a single pixel; forwards unchanged to ApplyTexturing().
static inline Vec4IntResult SOFTRAST_CALL ApplyTexturingSingle(float s, float t, Vec4IntArg prim_color, int texlevel, int frac_texlevel, bool bilinear, const RasterizerState &state) {
	return ApplyTexturing(s, t, prim_color, texlevel, frac_texlevel, bilinear, state);
}
611
612
// Produces a signed 1.27.4 value.
//
// Approximates log2(delta) in fixed point with 4 fractional bits, straight from the IEEE 754
// bit pattern: the exponent provides the integer part and the top four mantissa bits the
// fraction.  The sign bit of delta is ignored.
static int TexLog2(float delta) {
	static_assert(sizeof(float) == sizeof(u32), "float must be 32 bits wide");

	// Copy the bits out with memcpy: reading the inactive member of a union is
	// undefined behavior in C++.
	u32 bits;
	memcpy(&bits, &delta, sizeof(bits));

	// Use the exponent as the tex level, and the top mantissa bits for a frac.
	// We can't support more than 4 bits of frac, so truncate.
	int useful = (bits >> 19) & 0x0FFF;
	// Now offset so the exponent aligns with log2f (exp=127 is 0.)
	return useful - 127 * 16;
}
626
627
static inline void CalculateSamplingParams(const float ds, const float dt, float w, const RasterizerState &state, int &level, int &levelFrac, bool &filt) {
628
const int width = 1 << state.samplerID.width0Shift;
629
const int height = 1 << state.samplerID.height0Shift;
630
631
// With 8 bits of fraction (because texslope can be fairly precise.)
632
int detail;
633
switch (state.TexLevelMode()) {
634
case GE_TEXLEVEL_MODE_AUTO:
635
detail = TexLog2(std::max(std::abs(ds * width), std::abs(dt * height)));
636
break;
637
case GE_TEXLEVEL_MODE_SLOPE:
638
// This is always offset by an extra texlevel.
639
detail = TexLog2(2.0f * w * state.textureLodSlope);
640
break;
641
case GE_TEXLEVEL_MODE_CONST:
642
default:
643
// Unused value 3 operates the same as CONST.
644
detail = 0;
645
break;
646
}
647
648
// Add in the bias (used in all modes), with 4 bits of fraction.
649
detail += state.texLevelOffset;
650
651
if (detail > 0 && state.maxTexLevel > 0) {
652
bool mipFilt = state.mipFilt;
653
654
int level8 = std::min(detail, state.maxTexLevel * 16);
655
if (!mipFilt) {
656
// Round up at 1.5.
657
level8 += 8;
658
}
659
level = level8 >> 4;
660
levelFrac = mipFilt ? level8 & 0xF : 0;
661
} else {
662
level = 0;
663
levelFrac = 0;
664
}
665
666
if (detail > 0)
667
filt = state.minFilt;
668
else
669
filt = state.magFilt;
670
}
671
672
static inline void ApplyTexturing(const RasterizerState &state, Vec4<int> *prim_color, const Vec4<int> &mask, const Vec4<float> &s, const Vec4<float> &t, float w) {
673
float ds = s[1] - s[0];
674
float dt = t[2] - t[0];
675
676
int level;
677
int levelFrac;
678
bool bilinear;
679
CalculateSamplingParams(ds, dt, w, state, level, levelFrac, bilinear);
680
681
PROFILE_THIS_SCOPE("sampler");
682
for (int i = 0; i < 4; ++i) {
683
if (mask[i] >= 0)
684
prim_color[i] = ApplyTexturing(s[i], t[i], ToVec4IntArg(prim_color[i]), level, levelFrac, bilinear, state);
685
}
686
}
687
688
// Depth-tests a 2x2 block of pixels whose top-left corner is (x, y).
//
// mask holds one value per pixel, where a negative value means "already rejected".  The
// result is mask with the lanes that fail func (comparing z against the depth buffer)
// additionally set to -1.  If all four lanes are already rejected, the depth buffer is not
// read at all.
static inline Vec4<int> SOFTRAST_CALL CheckDepthTestPassed4(const Vec4<int> &mask, GEComparison func, int x, int y, int stride, Vec4<int> z) {
#if defined(_M_SSE)
	// Skip the depth buffer read if we're masked already.
	__m128i rejected = SAFE_M128I(mask.ivec);
	if (_mm_movemask_epi8(rejected) >= 0xFFFF)
		return mask;

	// Read in the existing depth values: one 32-bit load per row fetches two 16-bit depths.
	// Tried using flags from maskbits to skip dwords... seemed neutral.
	const __m128i topRow = _mm_cvtsi32_si128(*(u32 *)depthbuf.Get16Ptr(x, y, stride));
	const __m128i bottomRow = _mm_cvtsi32_si128(*(u32 *)depthbuf.Get16Ptr(x, y + 1, stride));
	// Pack both rows together, then zero-extend each 16-bit depth to a 32-bit lane.
	const __m128i refz = _mm_unpacklo_epi16(_mm_unpacklo_epi32(topRow, bottomRow), _mm_setzero_si128());

	// Each case ORs in an all-ones lane wherever the comparison FAILS.
	switch (func) {
	case GE_COMP_NEVER:
		rejected = _mm_set1_epi32(-1);
		break;

	case GE_COMP_ALWAYS:
		break;

	case GE_COMP_EQUAL:
		rejected = _mm_or_si128(rejected, _mm_xor_si128(_mm_cmpeq_epi32(z.ivec, refz), _mm_set1_epi32(-1)));
		break;

	case GE_COMP_NOTEQUAL:
		rejected = _mm_or_si128(rejected, _mm_cmpeq_epi32(z.ivec, refz));
		break;

	case GE_COMP_LESS:
		// Fails when z >= refz.
		rejected = _mm_or_si128(rejected, _mm_or_si128(_mm_cmpgt_epi32(z.ivec, refz), _mm_cmpeq_epi32(z.ivec, refz)));
		break;

	case GE_COMP_LEQUAL:
		rejected = _mm_or_si128(rejected, _mm_cmpgt_epi32(z.ivec, refz));
		break;

	case GE_COMP_GREATER:
		// Fails when z <= refz.
		rejected = _mm_or_si128(rejected, _mm_or_si128(_mm_cmplt_epi32(z.ivec, refz), _mm_cmpeq_epi32(z.ivec, refz)));
		break;

	case GE_COMP_GEQUAL:
		rejected = _mm_or_si128(rejected, _mm_cmplt_epi32(z.ivec, refz));
		break;
	}

	return rejected;
#else
	// Skip the depth buffer read if we're masked already.
	Vec4<int> rejected = mask;
	if (mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0)
		return rejected;

	// Read in the existing depth values, in the order (x, y), (x+1, y), (x, y+1), (x+1, y+1).
	Vec4<int> refz(depthbuf.Get16(x, y, stride), depthbuf.Get16(x + 1, y, stride), depthbuf.Get16(x, y + 1, stride), depthbuf.Get16(x + 1, y + 1, stride));

	// Each case sets a lane to -1 wherever the comparison FAILS.
	switch (func) {
	case GE_COMP_NEVER:
		rejected = Vec4<int>::AssignToAll(-1);
		break;

	case GE_COMP_ALWAYS:
		break;

	case GE_COMP_EQUAL:
		for (int lane = 0; lane < 4; ++lane) {
			if (z[lane] != refz[lane])
				rejected[lane] = -1;
		}
		break;

	case GE_COMP_NOTEQUAL:
		for (int lane = 0; lane < 4; ++lane) {
			if (z[lane] == refz[lane])
				rejected[lane] = -1;
		}
		break;

	case GE_COMP_LESS:
		for (int lane = 0; lane < 4; ++lane) {
			if (z[lane] >= refz[lane])
				rejected[lane] = -1;
		}
		break;

	case GE_COMP_LEQUAL:
		for (int lane = 0; lane < 4; ++lane) {
			if (z[lane] > refz[lane])
				rejected[lane] = -1;
		}
		break;

	case GE_COMP_GREATER:
		for (int lane = 0; lane < 4; ++lane) {
			if (z[lane] <= refz[lane])
				rejected[lane] = -1;
		}
		break;

	case GE_COMP_GEQUAL:
		for (int lane = 0; lane < 4; ++lane) {
			if (z[lane] < refz[lane])
				rejected[lane] = -1;
		}
		break;
	}

	return rejected;
#endif
}
782
783
template <bool useSSE4>
784
struct TriangleEdge {
785
Vec4<int> Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin);
786
inline Vec4<int> StepX(const Vec4<int> &w);
787
inline Vec4<int> StepY(const Vec4<int> &w);
788
789
inline void NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX);
790
inline Vec4<int> StepXTimes(const Vec4<int> &w, int c);
791
792
Vec4<int> stepX;
793
Vec4<int> stepY;
794
};
795
796
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
// Per lane: initX * xf + initY * yf + c, using the SSE4.1 32-bit multiply.
static inline __m128i SOFTRAST_CALL TriangleEdgeStartSSE4(__m128i initX, __m128i initY, int xf, int yf, int c) {
	const __m128i xTerm = _mm_mullo_epi32(initX, _mm_set1_epi32(xf));
	const __m128i yTerm = _mm_mullo_epi32(initY, _mm_set1_epi32(yf));
	const __m128i linear = _mm_add_epi32(xTerm, yTerm);
	return _mm_add_epi32(linear, _mm_set1_epi32(c));
}
#endif
806
807
template <bool useSSE4>
808
Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
809
// Start at pixel centers.
810
static constexpr int centerOff = (SCREEN_SCALE_FACTOR / 2) - 1;
811
static constexpr int centerPlus1 = SCREEN_SCALE_FACTOR + centerOff;
812
Vec4<int> initX = Vec4<int>::AssignToAll(origin.x) + Vec4<int>(centerOff, centerPlus1, centerOff, centerPlus1);
813
Vec4<int> initY = Vec4<int>::AssignToAll(origin.y) + Vec4<int>(centerOff, centerOff, centerPlus1, centerPlus1);
814
815
// orient2d refactored.
816
int xf = v0.y - v1.y;
817
int yf = v1.x - v0.x;
818
int c = v1.y * v0.x - v1.x * v0.y;
819
820
stepX = Vec4<int>::AssignToAll(xf * SCREEN_SCALE_FACTOR * 2);
821
stepY = Vec4<int>::AssignToAll(yf * SCREEN_SCALE_FACTOR * 2);
822
823
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
824
if constexpr (useSSE4)
825
return TriangleEdgeStartSSE4(initX.ivec, initY.ivec, xf, yf, c);
826
#endif
827
return Vec4<int>::AssignToAll(xf) * initX + Vec4<int>::AssignToAll(yf) * initY + Vec4<int>::AssignToAll(c);
828
}
829
830
template <bool useSSE4>
831
inline Vec4<int> TriangleEdge<useSSE4>::StepX(const Vec4<int> &w) {
832
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
833
return _mm_add_epi32(w.ivec, stepX.ivec);
834
#elif PPSSPP_ARCH(ARM64_NEON)
835
return vaddq_s32(w.ivec, stepX.ivec);
836
#else
837
return w + stepX;
838
#endif
839
}
840
841
template <bool useSSE4>
842
inline Vec4<int> TriangleEdge<useSSE4>::StepY(const Vec4<int> &w) {
843
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
844
return _mm_add_epi32(w.ivec, stepY.ivec);
845
#elif PPSSPP_ARCH(ARM64_NEON)
846
return vaddq_s32(w.ivec, stepY.ivec);
847
#else
848
return w + stepY;
849
#endif
850
}
851
852
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
// Horizontal maximum of the four signed 32-bit lanes of w.
static inline int SOFTRAST_CALL MaxWeightSSE4(__m128i w) {
	// Fold lanes 2 and 3 onto lanes 0 and 1.
	const __m128i upperPair = _mm_shuffle_epi32(w, _MM_SHUFFLE(3, 2, 3, 2));
	const __m128i pairMax = _mm_max_epi32(w, upperPair);
	// Then fold lane 1 onto lane 0, which ends up holding the overall maximum.
	const __m128i lane1 = _mm_shuffle_epi32(pairMax, _MM_SHUFFLE(1, 1, 1, 1));
	return _mm_cvtsi128_si32(_mm_max_epi32(pairMax, lane1));
}
#endif
862
863
template <bool useSSE4>
864
void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
865
int wmax;
866
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
867
if constexpr (useSSE4) {
868
wmax = MaxWeightSSE4(w.ivec);
869
} else {
870
wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
871
}
872
#elif PPSSPP_ARCH(ARM64_NEON)
873
int32x2_t wmax_temp = vpmax_s32(vget_low_s32(w.ivec), vget_high_s32(w.ivec));
874
wmax = vget_lane_s32(vpmax_s32(wmax_temp, wmax_temp), 0);
875
#else
876
wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
877
#endif
878
if (wmax < 0) {
879
if (stepX.x > 0) {
880
int steps = -wmax / stepX.x;
881
rowMinX = std::max(rowMinX, minX + steps * SCREEN_SCALE_FACTOR * 2);
882
} else if (stepX.x <= 0) {
883
rowMinX = rowMaxX + 1;
884
}
885
}
886
887
if (wmax >= 0 && stepX.x < 0) {
888
int steps = (-wmax / stepX.x) + 1;
889
rowMaxX = std::min(rowMaxX, minX + steps * SCREEN_SCALE_FACTOR * 2);
890
}
891
}
892
893
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
// Per lane: w + c * step, using the SSE4.1 32-bit multiply.
static inline __m128i SOFTRAST_CALL StepTimesSSE4(__m128i w, __m128i step, int c) {
	const __m128i scaledStep = _mm_mullo_epi32(_mm_set1_epi32(c), step);
	return _mm_add_epi32(w, scaledStep);
}
#endif
901
902
template <bool useSSE4>
903
inline Vec4<int> TriangleEdge<useSSE4>::StepXTimes(const Vec4<int> &w, int c) {
904
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
905
if constexpr (useSSE4)
906
return StepTimesSSE4(w.ivec, stepX.ivec, c);
907
#elif PPSSPP_ARCH(ARM64_NEON)
908
return vaddq_s32(w.ivec, vmulq_s32(vdupq_n_s32(c), stepX.ivec));
909
#endif
910
return w + stepX * c;
911
}
912
913
// Combines the three biased edge weights and the scissor value into one mask per lane.
// The result is the bitwise OR of all four, so a lane is negative if any input is negative.
static inline Vec4<int> MakeMask(const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<int> &bias0, const Vec4<int> &bias1, const Vec4<int> &bias2, const Vec4<int> &scissor) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
	const __m128i edge0 = _mm_add_epi32(w0.ivec, bias0.ivec);
	const __m128i edge1 = _mm_add_epi32(w1.ivec, bias1.ivec);
	const __m128i edge2 = _mm_add_epi32(w2.ivec, bias2.ivec);

	const __m128i edges = _mm_or_si128(edge0, _mm_or_si128(edge1, edge2));
	return _mm_or_si128(edges, scissor.ivec);
#elif PPSSPP_ARCH(ARM64_NEON)
	const int32x4_t edge0 = vaddq_s32(w0.ivec, bias0.ivec);
	const int32x4_t edge1 = vaddq_s32(w1.ivec, bias1.ivec);
	const int32x4_t edge2 = vaddq_s32(w2.ivec, bias2.ivec);

	const int32x4_t edges = vorrq_s32(edge0, vorrq_s32(edge1, edge2));
	return vorrq_s32(edges, scissor.ivec);
#else
	return (w0 + bias0) | (w1 + bias1) | (w2 + bias2) | scissor;
#endif
}
930
931
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
// True when at least one 32-bit lane of mask is non-negative.
static inline bool SOFTRAST_CALL AnyMaskSSE4(__m128i mask) {
	// Smear each lane's sign bit across the lane; all ones then means every lane was negative.
	const __m128i signs = _mm_srai_epi32(mask, 31);
	return !_mm_test_all_ones(signs);
}
#endif
940
941
template <bool useSSE4>
942
static inline bool AnyMask(const Vec4<int> &mask) {
943
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
944
if constexpr (useSSE4) {
945
return AnyMaskSSE4(mask.ivec);
946
}
947
948
// Source: https://fgiesen.wordpress.com/2013/02/10/optimizing-the-basic-rasterizer/#comment-6676
949
return _mm_movemask_ps(_mm_castsi128_ps(mask.ivec)) != 15;
950
#elif PPSSPP_ARCH(ARM64_NEON)
951
int64x2_t sig = vreinterpretq_s64_s32(vshrq_n_s32(mask.ivec, 31));
952
return vgetq_lane_s64(sig, 0) != -1 || vgetq_lane_s64(sig, 1) != -1;
953
#else
954
return mask.x >= 0 || mask.y >= 0 || mask.z >= 0 || mask.w >= 0;
955
#endif
956
}
957
958
// Returns, per lane, 1.0f / (w0 + w1 + w2) as floats.
static inline Vec4<float> EdgeRecip(const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
	const __m128i total = _mm_add_epi32(w0.ivec, _mm_add_epi32(w1.ivec, w2.ivec));
	const __m128 totalF = _mm_cvtepi32_ps(total);
	// A real divide on purpose: _mm_rcp_ps loses too much precision.
	return _mm_div_ps(_mm_set1_ps(1.0f), totalF);
#elif PPSSPP_ARCH(ARM64_NEON)
	const int32x4_t total = vaddq_s32(w0.ivec, vaddq_s32(w1.ivec, w2.ivec));
	const float32x4_t totalF = vcvtq_f32_s32(total);
	return vdivq_f32(vdupq_n_f32(1.0f), totalF);
#else
	return (w0 + w1 + w2).Cast<float>().Reciprocal();
#endif
}
970
971
template <bool clearMode, bool useSSE4>
972
void DrawTriangleSlice(
973
const VertexData& v0, const VertexData& v1, const VertexData& v2,
974
int x1, int y1, int x2, int y2,
975
const RasterizerState &state)
976
{
977
Vec4<int> bias0 = Vec4<int>::AssignToAll(IsRightSideOrFlatBottomLine(v0.screenpos.xy(), v1.screenpos.xy(), v2.screenpos.xy()) ? -1 : 0);
978
Vec4<int> bias1 = Vec4<int>::AssignToAll(IsRightSideOrFlatBottomLine(v1.screenpos.xy(), v2.screenpos.xy(), v0.screenpos.xy()) ? -1 : 0);
979
Vec4<int> bias2 = Vec4<int>::AssignToAll(IsRightSideOrFlatBottomLine(v2.screenpos.xy(), v0.screenpos.xy(), v1.screenpos.xy()) ? -1 : 0);
980
981
const PixelFuncID &pixelID = state.pixelID;
982
983
TriangleEdge<useSSE4> e0;
984
TriangleEdge<useSSE4> e1;
985
TriangleEdge<useSSE4> e2;
986
987
int64_t minX = x1, maxX = x2, minY = y1, maxY = y2;
988
989
ScreenCoords pprime(minX, minY, 0);
990
Vec4<int> w0_base = e0.Start(v1.screenpos, v2.screenpos, pprime);
991
Vec4<int> w1_base = e1.Start(v2.screenpos, v0.screenpos, pprime);
992
Vec4<int> w2_base = e2.Start(v0.screenpos, v1.screenpos, pprime);
993
994
// The sum of weights should remain constant as we move toward/away from the edges.
995
const Vec4<float> wsum_recip = EdgeRecip(w0_base, w1_base, w2_base);
996
997
// All the z values are the same, no interpolation required.
998
// This is common, and when we interpolate, we lose accuracy.
999
const bool flatZ = v0.screenpos.z == v1.screenpos.z && v0.screenpos.z == v2.screenpos.z;
1000
const bool flatColorAll = !state.shadeGouraud;
1001
const bool flatColor0 = flatColorAll || (v0.color0 == v1.color0 && v0.color0 == v2.color0);
1002
const bool flatColor1 = flatColorAll || (v0.color1 == v1.color1 && v0.color1 == v2.color1);
1003
const bool noFog = clearMode || !pixelID.applyFog || (v0.fogdepth >= 1.0f && v1.fogdepth >= 1.0f && v2.fogdepth >= 1.0f);
1004
1005
if (pixelID.applyDepthRange && flatZ) {
1006
if (v0.screenpos.z < pixelID.cached.minz || v0.screenpos.z > pixelID.cached.maxz)
1007
return;
1008
}
1009
1010
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1011
uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
1012
std::string tag = StringFromFormat("DisplayListT_%08x", state.listPC);
1013
std::string ztag = StringFromFormat("DisplayListTZ_%08x", state.listPC);
1014
#endif
1015
1016
const Vec4<int> v0_c0 = Vec4<int>::FromRGBA(v0.color0);
1017
const Vec4<int> v1_c0 = Vec4<int>::FromRGBA(v1.color0);
1018
const Vec4<int> v2_c0 = Vec4<int>::FromRGBA(v2.color0);
1019
const Vec3<int> v0_c1 = Vec3<int>::FromRGB(v0.color1);
1020
const Vec3<int> v1_c1 = Vec3<int>::FromRGB(v1.color1);
1021
const Vec3<int> v2_c1 = Vec3<int>::FromRGB(v2.color1);
1022
1023
const Vec4<float> v0_z4 = Vec4<int>::AssignToAll(v0.screenpos.z).Cast<float>();
1024
const Vec4<float> v1_z4 = Vec4<int>::AssignToAll(v1.screenpos.z).Cast<float>();
1025
const Vec4<float> v2_z4 = Vec4<int>::AssignToAll(v2.screenpos.z).Cast<float>();
1026
const Vec4<int> minz = Vec4<int>::AssignToAll(pixelID.cached.minz);
1027
const Vec4<int> maxz = Vec4<int>::AssignToAll(pixelID.cached.maxz);
1028
1029
for (int64_t curY = minY; curY <= maxY; curY += SCREEN_SCALE_FACTOR * 2,
1030
w0_base = e0.StepY(w0_base),
1031
w1_base = e1.StepY(w1_base),
1032
w2_base = e2.StepY(w2_base)) {
1033
Vec4<int> w0 = w0_base;
1034
Vec4<int> w1 = w1_base;
1035
Vec4<int> w2 = w2_base;
1036
1037
DrawingCoords p = TransformUnit::ScreenToDrawing(minX, curY);
1038
1039
int64_t rowMinX = minX, rowMaxX = maxX;
1040
e0.NarrowMinMaxX(w0, minX, rowMinX, rowMaxX);
1041
e1.NarrowMinMaxX(w1, minX, rowMinX, rowMaxX);
1042
e2.NarrowMinMaxX(w2, minX, rowMinX, rowMaxX);
1043
1044
int skipX = (rowMinX - minX) / (SCREEN_SCALE_FACTOR * 2);
1045
w0 = e0.StepXTimes(w0, skipX);
1046
w1 = e1.StepXTimes(w1, skipX);
1047
w2 = e2.StepXTimes(w2, skipX);
1048
p.x = (p.x + 2 * skipX) & 0x3FF;
1049
1050
// TODO: Maybe we can clip the edges instead?
1051
int scissorYPlus1 = curY + SCREEN_SCALE_FACTOR > maxY ? -1 : 0;
1052
Vec4<int> scissor_mask = Vec4<int>(0, rowMaxX - rowMinX - SCREEN_SCALE_FACTOR, scissorYPlus1, (rowMaxX - rowMinX - SCREEN_SCALE_FACTOR) | scissorYPlus1);
1053
Vec4<int> scissor_step = Vec4<int>(0, -(SCREEN_SCALE_FACTOR * 2), 0, -(SCREEN_SCALE_FACTOR * 2));
1054
1055
for (int64_t curX = rowMinX; curX <= rowMaxX; curX += SCREEN_SCALE_FACTOR * 2,
1056
w0 = e0.StepX(w0),
1057
w1 = e1.StepX(w1),
1058
w2 = e2.StepX(w2),
1059
scissor_mask = scissor_mask + scissor_step,
1060
p.x = (p.x + 2) & 0x3FF) {
1061
1062
// If p is on or inside all edges, render pixel
1063
Vec4<int> mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask);
1064
if (AnyMask<useSSE4>(mask)) {
1065
Vec4<int> z;
1066
if (flatZ) {
1067
z = Vec4<int>::AssignToAll(v2.screenpos.z);
1068
} else {
1069
// Z is interpolated pretty much directly.
1070
Vec4<float> zfloats = w0.Cast<float>() * v0_z4 + w1.Cast<float>() * v1_z4 + w2.Cast<float>() * v2_z4;
1071
z = (zfloats * wsum_recip).Cast<int>();
1072
}
1073
1074
if (pixelID.earlyZChecks) {
1075
if (pixelID.applyDepthRange) {
1076
#if defined(_M_SSE)
1077
mask.ivec = _mm_or_si128(mask.ivec, _mm_or_si128(_mm_cmplt_epi32(z.ivec, minz.ivec), _mm_cmpgt_epi32(z.ivec, maxz.ivec)));
1078
#else
1079
for (int i = 0; i < 4; ++i) {
1080
if (z[i] < minz[i] || z[i] > maxz[i])
1081
mask[i] = -1;
1082
}
1083
#endif
1084
}
1085
mask = CheckDepthTestPassed4(mask, pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z);
1086
if (!AnyMask<useSSE4>(mask))
1087
continue;
1088
}
1089
1090
// Color interpolation is not perspective corrected on the PSP.
1091
Vec4<int> prim_color[4];
1092
if (!flatColor0) {
1093
for (int i = 0; i < 4; ++i) {
1094
if (mask[i] >= 0)
1095
prim_color[i] = Interpolate(v0_c0, v1_c0, v2_c0, w0[i], w1[i], w2[i], wsum_recip[i]);
1096
}
1097
} else {
1098
for (int i = 0; i < 4; ++i) {
1099
prim_color[i] = v2_c0;
1100
}
1101
}
1102
Vec3<int> sec_color[4];
1103
if (!flatColor1) {
1104
for (int i = 0; i < 4; ++i) {
1105
if (mask[i] >= 0)
1106
sec_color[i] = Interpolate(v0_c1, v1_c1, v2_c1, w0[i], w1[i], w2[i], wsum_recip[i]);
1107
}
1108
} else {
1109
for (int i = 0; i < 4; ++i) {
1110
sec_color[i] = v2_c1;
1111
}
1112
}
1113
1114
if (state.enableTextures) {
1115
if constexpr (!clearMode) {
1116
Vec4<float> s, t;
1117
if (state.throughMode) {
1118
s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), w0, w1,
1119
w2, wsum_recip);
1120
t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), w0, w1,
1121
w2, wsum_recip);
1122
1123
// For levels > 0, mipmapping is always based on level 0. Simpler to scale first.
1124
s *= 1.0f / (float) (1 << state.samplerID.width0Shift);
1125
t *= 1.0f / (float) (1 << state.samplerID.height0Shift);
1126
} else if (state.textureProj) {
1127
// Texture coordinate interpolation must definitely be perspective-correct.
1128
GetTextureCoordinatesProj(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);
1129
} else {
1130
// Texture coordinate interpolation must definitely be perspective-correct.
1131
GetTextureCoordinates(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);
1132
}
1133
1134
if (state.TexLevelMode() == GE_TEXLEVEL_MODE_SLOPE) {
1135
// Not sure what's right, but we need one value for the slope.
1136
float clipw = (v0.clipw * w0.x + v1.clipw * w1.x + v2.clipw * w2.x) * wsum_recip.x;
1137
ApplyTexturing(state, prim_color, mask, s, t, clipw);
1138
} else {
1139
ApplyTexturing(state, prim_color, mask, s, t, 0.0f);
1140
}
1141
}
1142
}
1143
1144
if constexpr (!clearMode) {
1145
for (int i = 0; i < 4; ++i) {
1146
#if defined(_M_SSE)
1147
// TODO: Tried making Vec4 do this, but things got slower.
1148
const __m128i sec = _mm_and_si128(sec_color[i].ivec, _mm_set_epi32(0, -1, -1, -1));
1149
prim_color[i].ivec = _mm_add_epi32(prim_color[i].ivec, sec);
1150
#elif PPSSPP_ARCH(ARM64_NEON)
1151
int32x4_t sec = vsetq_lane_s32(0, sec_color[i].ivec, 3);
1152
prim_color[i].ivec = vaddq_s32(prim_color[i].ivec, sec);
1153
#else
1154
prim_color[i] += Vec4<int>(sec_color[i], 0);
1155
#endif
1156
}
1157
}
1158
1159
Vec4<int> fog = Vec4<int>::AssignToAll(255);
1160
if (!noFog) {
1161
Vec4<float> fogdepths = w0.Cast<float>() * v0.fogdepth + w1.Cast<float>() * v1.fogdepth + w2.Cast<float>() * v2.fogdepth;
1162
fogdepths = fogdepths * wsum_recip;
1163
for (int i = 0; i < 4; ++i) {
1164
fog[i] = ClampFogDepth(fogdepths[i]);
1165
}
1166
}
1167
1168
PROFILE_THIS_SCOPE("draw_tri_px");
1169
DrawingCoords subp = p;
1170
for (int i = 0; i < 4; ++i) {
1171
if (mask[i] < 0) {
1172
continue;
1173
}
1174
subp.x = p.x + (i & 1);
1175
subp.y = p.y + (i / 2);
1176
1177
state.drawPixel(subp.x, subp.y, z[i], fog[i], ToVec4IntArg(prim_color[i]), pixelID);
1178
1179
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED)
1180
uint32_t row = gstate.getFrameBufAddress() + subp.y * pixelID.cached.framebufStride * bpp;
1181
NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * bpp, bpp, tag.c_str(), tag.size());
1182
if (pixelID.depthWrite) {
1183
row = gstate.getDepthBufAddress() + subp.y * pixelID.cached.depthbufStride * 2;
1184
NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * 2, 2, ztag.c_str(), ztag.size());
1185
}
1186
#endif
1187
}
1188
}
1189
}
1190
}
1191
1192
#if !defined(SOFTGPU_MEMORY_TAGGING_DETAILED) && defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1193
for (int y = minY; y <= maxY; y += SCREEN_SCALE_FACTOR) {
1194
DrawingCoords p = TransformUnit::ScreenToDrawing(minX, y);
1195
DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX, y);
1196
uint32_t row = gstate.getFrameBufAddress() + p.y * pixelID.cached.framebufStride * bpp;
1197
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, (pend.x - p.x) * bpp, tag.c_str(), tag.size());
1198
1199
if (pixelID.depthWrite) {
1200
row = gstate.getDepthBufAddress() + p.y * pixelID.cached.depthbufStride * 2;
1201
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, (pend.x - p.x) * 2, ztag.c_str(), ztag.size());
1202
}
1203
}
1204
#endif
1205
}
1206
1207
// Draws triangle, vertices specified in counter-clockwise direction
1208
void DrawTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const BinCoords &range, const RasterizerState &state) {
1209
PROFILE_THIS_SCOPE("draw_tri");
1210
1211
auto drawSlice = cpu_info.bSSE4_1 ?
1212
(state.pixelID.clearMode ? &DrawTriangleSlice<true, true> : &DrawTriangleSlice<false, true>) :
1213
(state.pixelID.clearMode ? &DrawTriangleSlice<true, false> : &DrawTriangleSlice<false, false>);
1214
1215
drawSlice(v0, v1, v2, range.x1, range.y1, range.x2, range.y2, state);
1216
}
1217
1218
void DrawRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &rastState) {
1219
int entireX1 = std::min(v0.screenpos.x, v1.screenpos.x);
1220
int entireY1 = std::min(v0.screenpos.y, v1.screenpos.y);
1221
int entireX2 = std::max(v0.screenpos.x, v1.screenpos.x) - 1;
1222
int entireY2 = std::max(v0.screenpos.y, v1.screenpos.y) - 1;
1223
int minX = std::max(entireX1 & ~(SCREEN_SCALE_FACTOR - 1), range.x1) | (SCREEN_SCALE_FACTOR / 2 - 1);
1224
int minY = std::max(entireY1 & ~(SCREEN_SCALE_FACTOR - 1), range.y1) | (SCREEN_SCALE_FACTOR / 2 - 1);
1225
int maxX = std::min(entireX2, range.x2);
1226
int maxY = std::min(entireY2, range.y2);
1227
1228
// If TL x or y was after the half, we don't draw the pixel.
1229
// TODO: Verify what center is used, allowing slight offset makes gpu/primitives/trianglefan pass.
1230
if (minX < entireX1 - 1)
1231
minX += SCREEN_SCALE_FACTOR;
1232
if (minY < entireY1 - 1)
1233
minY += SCREEN_SCALE_FACTOR;
1234
1235
RasterizerState state = OptimizeFlatRasterizerState(rastState, v1);
1236
1237
Vec2f rowST(0.0f, 0.0f);
1238
// Note: this is double the x or y movement.
1239
Vec2f stx(0.0f, 0.0f);
1240
Vec2f sty(0.0f, 0.0f);
1241
if (state.enableTextures) {
1242
// Note: texture projection is not handled here, those always turn into triangles.
1243
Vec2f tc0 = v0.texturecoords.uv();
1244
Vec2f tc1 = v1.texturecoords.uv();
1245
if (state.throughMode) {
1246
// For levels > 0, mipmapping is always based on level 0. Simpler to scale first.
1247
tc0.s() *= 1.0f / (float)(1 << state.samplerID.width0Shift);
1248
tc1.s() *= 1.0f / (float)(1 << state.samplerID.width0Shift);
1249
tc0.t() *= 1.0f / (float)(1 << state.samplerID.height0Shift);
1250
tc1.t() *= 1.0f / (float)(1 << state.samplerID.height0Shift);
1251
}
1252
1253
float diffX = (entireX2 - entireX1 + 1) / (float)SCREEN_SCALE_FACTOR;
1254
float diffY = (entireY2 - entireY1 + 1) / (float)SCREEN_SCALE_FACTOR;
1255
float diffS = tc1.s() - tc0.s();
1256
float diffT = tc1.t() - tc0.t();
1257
1258
if (v0.screenpos.x < v1.screenpos.x) {
1259
if (v0.screenpos.y < v1.screenpos.y) {
1260
// Okay, simple, TL -> BR. S and T move toward v1 with X and Y.
1261
rowST = tc0;
1262
stx = Vec2f(2.0f * diffS / diffX, 0.0f);
1263
sty = Vec2f(0.0f, 2.0f * diffT / diffY);
1264
} else {
1265
// BL to TR, rotated. We start at TL still.
1266
// X moves T (not S) toward v1, and Y moves S away from v1.
1267
rowST = Vec2f(tc1.s(), tc0.t());
1268
stx = Vec2f(0.0f, 2.0f * diffT / diffX);
1269
sty = Vec2f(2.0f * -diffS / diffY, 0.0f);
1270
}
1271
} else {
1272
if (v0.screenpos.y < v1.screenpos.y) {
1273
// TR to BL. Like BL to TR, rotated.
1274
// X moves T (not s) away from v1, and Y moves S toward v1.
1275
rowST = Vec2f(tc0.s(), tc1.t());
1276
stx = Vec2f(0.0f, 2.0f * -diffT / diffX);
1277
sty = Vec2f(2.0f * diffS / diffY, 0.0f);
1278
} else {
1279
// BR to TL, just inverse of TL to BR.
1280
rowST = Vec2f(tc1.s(), tc1.t());
1281
stx = Vec2f(2.0f * -diffS / diffX, 0.0f);
1282
sty = Vec2f(0.0f, 2.0f * -diffT / diffY);
1283
}
1284
}
1285
1286
// Okay, now move ST to the minX, minY position.
1287
rowST += (stx / (float)(SCREEN_SCALE_FACTOR * 2)) * (minX - entireX1 + 1);
1288
rowST += (sty / (float)(SCREEN_SCALE_FACTOR * 2)) * (minY - entireY1 + 1);
1289
}
1290
1291
// And now what we add to spread out to 4 values.
1292
const Vec4f sto4(0.0f, 0.5f * stx.s(), 0.5f * sty.s(), 0.5f * stx.s() + 0.5f * sty.s());
1293
const Vec4f tto4(0.0f, 0.5f * stx.t(), 0.5f * sty.t(), 0.5f * stx.t() + 0.5f * sty.t());
1294
1295
ScreenCoords pprime(minX, minY, 0);
1296
const Vec4<int> fog = Vec4<int>::AssignToAll(ClampFogDepth(v1.fogdepth));
1297
const Vec4<int> z = Vec4<int>::AssignToAll(v1.screenpos.z);
1298
const Vec4<int> c0 = Vec4<int>::FromRGBA(v1.color0);
1299
const Vec3<int> sec_color = Vec3<int>::FromRGB(v1.color1);
1300
1301
if (state.pixelID.applyDepthRange) {
1302
// We can bail early since the Z is flat.
1303
if (v1.screenpos.z < state.pixelID.cached.minz || v1.screenpos.z > state.pixelID.cached.maxz)
1304
return;
1305
}
1306
1307
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1308
uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
1309
std::string tag = StringFromFormat("DisplayListR_%08x", state.listPC);
1310
std::string ztag = StringFromFormat("DisplayListRZ_%08x", state.listPC);
1311
#endif
1312
1313
for (int64_t curY = minY; curY < maxY; curY += SCREEN_SCALE_FACTOR * 2, rowST += sty) {
1314
DrawingCoords p = TransformUnit::ScreenToDrawing(minX, curY);
1315
1316
int scissorY2 = curY + SCREEN_SCALE_FACTOR > maxY ? -1 : 0;
1317
Vec4<int> scissor_mask = Vec4<int>(0, maxX - minX - SCREEN_SCALE_FACTOR, scissorY2, (maxX - minX - SCREEN_SCALE_FACTOR) | scissorY2);
1318
Vec4<int> scissor_step = Vec4<int>(0, -(SCREEN_SCALE_FACTOR * 2), 0, -(SCREEN_SCALE_FACTOR * 2));
1319
Vec2f st = rowST;
1320
1321
for (int64_t curX = minX; curX < maxX; curX += SCREEN_SCALE_FACTOR * 2,
1322
st += stx,
1323
scissor_mask += scissor_step,
1324
p.x = (p.x + 2) & 0x3FF) {
1325
Vec4<int> mask = scissor_mask;
1326
1327
Vec4<int> prim_color[4];
1328
for (int i = 0; i < 4; ++i) {
1329
prim_color[i] = c0;
1330
}
1331
1332
if (state.pixelID.earlyZChecks) {
1333
for (int i = 0; i < 4; ++i) {
1334
if (mask[i] < 0)
1335
continue;
1336
1337
int x = p.x + (i & 1);
1338
int y = p.y + (i / 2);
1339
if (!CheckDepthTestPassed(state.pixelID.DepthTestFunc(), x, y, state.pixelID.cached.depthbufStride, z[i])) {
1340
mask[i] = -1;
1341
}
1342
}
1343
}
1344
1345
if (state.enableTextures) {
1346
Vec4<float> s, t;
1347
s = Vec4<float>::AssignToAll(st.s()) + sto4;
1348
t = Vec4<float>::AssignToAll(st.t()) + tto4;
1349
1350
ApplyTexturing(state, prim_color, mask, s, t, v1.clipw);
1351
}
1352
1353
if (!state.pixelID.clearMode) {
1354
for (int i = 0; i < 4; ++i) {
1355
#if defined(_M_SSE)
1356
// TODO: Tried making Vec4 do this, but things got slower.
1357
const __m128i sec = _mm_and_si128(sec_color.ivec, _mm_set_epi32(0, -1, -1, -1));
1358
prim_color[i].ivec = _mm_add_epi32(prim_color[i].ivec, sec);
1359
#elif PPSSPP_ARCH(ARM64_NEON)
1360
int32x4_t sec = vsetq_lane_s32(0, sec_color.ivec, 3);
1361
prim_color[i].ivec = vaddq_s32(prim_color[i].ivec, sec);
1362
#else
1363
prim_color[i] += Vec4<int>(sec_color, 0);
1364
#endif
1365
}
1366
}
1367
1368
PROFILE_THIS_SCOPE("draw_rect_px");
1369
DrawingCoords subp = p;
1370
for (int i = 0; i < 4; ++i) {
1371
if (mask[i] < 0) {
1372
continue;
1373
}
1374
subp.x = p.x + (i & 1);
1375
subp.y = p.y + (i / 2);
1376
1377
state.drawPixel(subp.x, subp.y, z[i], fog[i], ToVec4IntArg(prim_color[i]), state.pixelID);
1378
1379
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED)
1380
uint32_t row = gstate.getFrameBufAddress() + subp.y * state.pixelID.cached.framebufStride * bpp;
1381
NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * bpp, bpp, tag.c_str(), tag.size());
1382
if (state.pixelID.depthWrite) {
1383
row = gstate.getDepthBufAddress() + subp.y * state.pixelID.cached.depthbufStride * 2;
1384
NotifyMemInfo(MemBlockFlags::WRITE, row + subp.x * 2, 2, ztag.c_str(), ztag.size());
1385
}
1386
#endif
1387
}
1388
}
1389
}
1390
1391
#if !defined(SOFTGPU_MEMORY_TAGGING_DETAILED) && defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1392
for (int y = minY; y <= maxY; y += SCREEN_SCALE_FACTOR) {
1393
DrawingCoords p = TransformUnit::ScreenToDrawing(minX, y);
1394
DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX, y);
1395
uint32_t row = gstate.getFrameBufAddress() + p.y * state.pixelID.cached.framebufStride * bpp;
1396
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, (pend.x - p.x) * bpp, tag.c_str(), tag.size());
1397
1398
if (state.pixelID.depthWrite) {
1399
row = gstate.getDepthBufAddress() + p.y * state.pixelID.cached.depthbufStride * 2;
1400
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, (pend.x - p.x) * 2, ztag.c_str(), ztag.size());
1401
}
1402
}
1403
#endif
1404
}
1405
1406
void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerState &state) {
1407
ScreenCoords pos = v0.screenpos;
1408
Vec4<int> prim_color = Vec4<int>::FromRGBA(v0.color0);
1409
1410
auto &pixelID = state.pixelID;
1411
auto &samplerID = state.samplerID;
1412
1413
DrawingCoords p = TransformUnit::ScreenToDrawing(pos);
1414
u16 z = pos.z;
1415
1416
if (pixelID.earlyZChecks) {
1417
if (pixelID.applyDepthRange) {
1418
if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
1419
return;
1420
}
1421
1422
if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z)) {
1423
return;
1424
}
1425
}
1426
1427
if (state.enableTextures) {
1428
float s = v0.texturecoords.s();
1429
float t = v0.texturecoords.t();
1430
if (state.throughMode) {
1431
s *= 1.0f / (float)(1 << state.samplerID.width0Shift);
1432
t *= 1.0f / (float)(1 << state.samplerID.height0Shift);
1433
} else if (state.textureProj) {
1434
GetTextureCoordinatesProj(v0, v0, 0.0f, s, t);
1435
} else {
1436
// Texture coordinate interpolation must definitely be perspective-correct.
1437
GetTextureCoordinates(v0, v0, 0.0f, s, t);
1438
}
1439
1440
int texLevel;
1441
int texLevelFrac;
1442
bool bilinear;
1443
CalculateSamplingParams(0.0f, 0.0f, v0.clipw, state, texLevel, texLevelFrac, bilinear);
1444
PROFILE_THIS_SCOPE("sampler");
1445
prim_color = ApplyTexturingSingle(s, t, ToVec4IntArg(prim_color), texLevel, texLevelFrac, bilinear, state);
1446
}
1447
1448
if (!pixelID.clearMode) {
1449
Vec3<int> sec_color = Vec3<int>::FromRGB(v0.color1);
1450
prim_color += Vec4<int>(sec_color, 0);
1451
}
1452
1453
u8 fog = 255;
1454
if (pixelID.applyFog) {
1455
fog = ClampFogDepth(v0.fogdepth);
1456
}
1457
1458
PROFILE_THIS_SCOPE("draw_px");
1459
state.drawPixel(p.x, p.y, z, fog, ToVec4IntArg(prim_color), pixelID);
1460
1461
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1462
uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
1463
std::string tag = StringFromFormat("DisplayListP_%08x", state.listPC);
1464
1465
uint32_t row = gstate.getFrameBufAddress() + p.y * pixelID.cached.framebufStride * bpp;
1466
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, bpp, tag.c_str(), tag.size());
1467
1468
if (pixelID.depthWrite) {
1469
std::string ztag = StringFromFormat("DisplayListPZ_%08x", state.listPC);
1470
row = gstate.getDepthBufAddress() + p.y * pixelID.cached.depthbufStride * 2;
1471
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, 2, ztag.c_str(), ztag.size());
1472
}
1473
#endif
1474
}
1475
1476
void ClearRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state) {
1477
int entireX1 = std::min(v0.screenpos.x, v1.screenpos.x);
1478
int entireY1 = std::min(v0.screenpos.y, v1.screenpos.y);
1479
int entireX2 = std::max(v0.screenpos.x, v1.screenpos.x) - 1;
1480
int entireY2 = std::max(v0.screenpos.y, v1.screenpos.y) - 1;
1481
int minX = std::max(entireX1 & ~(SCREEN_SCALE_FACTOR - 1), range.x1) | (SCREEN_SCALE_FACTOR / 2 - 1);
1482
int minY = std::max(entireY1 & ~(SCREEN_SCALE_FACTOR - 1), range.y1) | (SCREEN_SCALE_FACTOR / 2 - 1);
1483
int maxX = std::min(entireX2, range.x2);
1484
int maxY = std::min(entireY2, range.y2);
1485
1486
// If TL x or y was after the half, we don't draw the pixel.
1487
if (minX < entireX1 - 1)
1488
minX += SCREEN_SCALE_FACTOR;
1489
if (minY < entireY1 - 1)
1490
minY += SCREEN_SCALE_FACTOR;
1491
1492
const DrawingCoords pprime = TransformUnit::ScreenToDrawing(minX, minY);
1493
// Only include the end pixel when it's >= 0.5.
1494
const DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX - SCREEN_SCALE_FACTOR / 2, maxY - SCREEN_SCALE_FACTOR / 2);
1495
auto &pixelID = state.pixelID;
1496
auto &samplerID = state.samplerID;
1497
1498
const int w = pend.x - pprime.x + 1;
1499
if (w <= 0)
1500
return;
1501
1502
if (pixelID.DepthClear()) {
1503
const u16 z = v1.screenpos.z;
1504
const int stride = pixelID.cached.depthbufStride;
1505
1506
// If both bytes of Z equal, we can just use memset directly which is faster.
1507
if ((z & 0xFF) == (z >> 8)) {
1508
DrawingCoords p = pprime;
1509
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1510
u16 *row = depthbuf.Get16Ptr(p.x, p.y, stride);
1511
memset(row, z, w * 2);
1512
}
1513
} else {
1514
DrawingCoords p = pprime;
1515
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1516
for (int x = 0; x < w; ++x) {
1517
SetPixelDepth(p.x + x, p.y, pixelID.cached.depthbufStride, z);
1518
}
1519
}
1520
}
1521
1522
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1523
std::string tag = StringFromFormat("DisplayListXZ_%08x", state.listPC);
1524
for (int y = pprime.y; y <= pend.y; ++y) {
1525
uint32_t row = gstate.getDepthBufAddress() + y * pixelID.cached.depthbufStride * 2;
1526
NotifyMemInfo(MemBlockFlags::WRITE, row + pprime.x * 2, w * 2, tag.c_str(), tag.size());
1527
}
1528
#endif
1529
}
1530
1531
// Note: this stays 0xFFFFFFFF if keeping color and alpha, even for 16-bit.
1532
u32 keepOldMask = 0xFFFFFFFF;
1533
if (pixelID.ColorClear() && pixelID.StencilClear()) {
1534
keepOldMask = 0;
1535
} else {
1536
switch (pixelID.FBFormat()) {
1537
case GE_FORMAT_565:
1538
if (pixelID.ColorClear())
1539
keepOldMask = 0;
1540
break;
1541
1542
case GE_FORMAT_5551:
1543
if (pixelID.ColorClear())
1544
keepOldMask = 0xFFFF8000;
1545
else if (pixelID.StencilClear())
1546
keepOldMask = 0xFFFF7FFF;
1547
break;
1548
1549
case GE_FORMAT_4444:
1550
if (pixelID.ColorClear())
1551
keepOldMask = 0xFFFFF000;
1552
else if (pixelID.StencilClear())
1553
keepOldMask = 0xFFFF0FFF;
1554
break;
1555
1556
case GE_FORMAT_8888:
1557
default:
1558
if (pixelID.ColorClear())
1559
keepOldMask = 0xFF000000;
1560
else if (pixelID.StencilClear())
1561
keepOldMask = 0x00FFFFFF;
1562
break;
1563
}
1564
}
1565
1566
// The pixel write masks are respected in clear mode.
1567
if (pixelID.applyColorWriteMask) {
1568
keepOldMask |= pixelID.cached.colorWriteMask;
1569
}
1570
1571
const u32 new_color = v1.color0;
1572
u16 new_color16;
1573
switch (pixelID.FBFormat()) {
1574
case GE_FORMAT_565:
1575
new_color16 = RGBA8888ToRGB565(new_color);
1576
break;
1577
1578
case GE_FORMAT_5551:
1579
new_color16 = RGBA8888ToRGBA5551(new_color);
1580
break;
1581
1582
case GE_FORMAT_4444:
1583
new_color16 = RGBA8888ToRGBA4444(new_color);
1584
break;
1585
1586
case GE_FORMAT_8888:
1587
break;
1588
1589
case GE_FORMAT_INVALID:
1590
case GE_FORMAT_DEPTH16:
1591
case GE_FORMAT_CLUT8:
1592
_dbg_assert_msg_(false, "Software: invalid framebuf format.");
1593
break;
1594
}
1595
1596
if (keepOldMask == 0) {
1597
const int stride = pixelID.cached.framebufStride;
1598
1599
if (pixelID.FBFormat() == GE_FORMAT_8888) {
1600
const bool canMemsetColor = (new_color & 0xFF) == (new_color >> 8) && (new_color & 0xFFFF) == (new_color >> 16);
1601
if (canMemsetColor) {
1602
DrawingCoords p = pprime;
1603
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1604
u32 *row = fb.Get32Ptr(p.x, p.y, stride);
1605
memset(row, new_color, w * 4);
1606
}
1607
} else {
1608
DrawingCoords p = pprime;
1609
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1610
for (int x = 0; x < w; ++x) {
1611
fb.Set32(p.x + x, p.y, stride, new_color);
1612
}
1613
}
1614
}
1615
} else {
1616
const bool canMemsetColor = (new_color16 & 0xFF) == (new_color16 >> 8);
1617
if (canMemsetColor) {
1618
DrawingCoords p = pprime;
1619
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1620
u16 *row = fb.Get16Ptr(p.x, p.y, stride);
1621
memset(row, new_color16, w * 2);
1622
}
1623
} else {
1624
DrawingCoords p = pprime;
1625
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1626
for (int x = 0; x < w; ++x) {
1627
fb.Set16(p.x + x, p.y, stride, new_color16);
1628
}
1629
}
1630
}
1631
}
1632
} else if (keepOldMask != 0xFFFFFFFF) {
1633
const int stride = pixelID.cached.framebufStride;
1634
1635
if (pixelID.FBFormat() == GE_FORMAT_8888) {
1636
DrawingCoords p = pprime;
1637
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1638
for (int x = 0; x < w; ++x) {
1639
const u32 old_color = fb.Get32(p.x + x, p.y, stride);
1640
const u32 c = (old_color & keepOldMask) | (new_color & ~keepOldMask);
1641
fb.Set32(p.x + x, p.y, stride, c);
1642
}
1643
}
1644
} else {
1645
DrawingCoords p = pprime;
1646
for (p.y = pprime.y; p.y <= pend.y; ++p.y) {
1647
for (int x = 0; x < w; ++x) {
1648
const u16 old_color = fb.Get16(p.x + x, p.y, stride);
1649
const u16 c = (old_color & keepOldMask) | (new_color16 & ~keepOldMask);
1650
fb.Set16(p.x + x, p.y, stride, c);
1651
}
1652
}
1653
}
1654
}
1655
1656
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1657
if (keepOldMask != 0xFFFFFFFF) {
1658
uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
1659
std::string tag = StringFromFormat("DisplayListX_%08x", state.listPC);
1660
for (int y = pprime.y; y < pend.y; ++y) {
1661
uint32_t row = gstate.getFrameBufAddress() + y * pixelID.cached.framebufStride * bpp;
1662
NotifyMemInfo(MemBlockFlags::WRITE, row + pprime.x * bpp, w * bpp, tag.c_str(), tag.size());
1663
}
1664
}
1665
#endif
1666
}
1667
1668
void DrawLine(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state) {
1669
// TODO: Use a proper line drawing algorithm that handles fractional endpoints correctly.
1670
Vec3<int> a(v0.screenpos.x, v0.screenpos.y, v0.screenpos.z);
1671
Vec3<int> b(v1.screenpos.x, v1.screenpos.y, v1.screenpos.z);
1672
1673
int dx = b.x - a.x;
1674
int dy = b.y - a.y;
1675
int dz = b.z - a.z;
1676
1677
int steps;
1678
if (abs(dx) < abs(dy))
1679
steps = abs(dy) / SCREEN_SCALE_FACTOR;
1680
else
1681
steps = abs(dx) / SCREEN_SCALE_FACTOR;
1682
1683
// Avoid going too far since we typically don't start at the pixel center.
1684
if (dx < 0 && dx >= -SCREEN_SCALE_FACTOR)
1685
dx++;
1686
if (dy < 0 && dy >= -SCREEN_SCALE_FACTOR)
1687
dy++;
1688
1689
double xinc = (double)dx / steps;
1690
double yinc = (double)dy / steps;
1691
double zinc = (double)dz / steps;
1692
1693
auto &pixelID = state.pixelID;
1694
auto &samplerID = state.samplerID;
1695
1696
const bool interpolateColor = !state.shadeGouraud || (v0.color0 == v1.color0 && v0.color1 == v1.color1);
1697
const Vec4<int> v0_c0 = Vec4<int>::FromRGBA(v0.color0);
1698
const Vec4<int> v1_c0 = Vec4<int>::FromRGBA(v1.color0);
1699
const Vec3<int> v0_c1 = Vec3<int>::FromRGB(v0.color1);
1700
const Vec3<int> v1_c1 = Vec3<int>::FromRGB(v1.color1);
1701
1702
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1703
std::string tag = StringFromFormat("DisplayListL_%08x", state.listPC);
1704
std::string ztag = StringFromFormat("DisplayListLZ_%08x", state.listPC);
1705
#endif
1706
1707
double x = a.x > b.x ? a.x - 1 : a.x;
1708
double y = a.y > b.y ? a.y - 1 : a.y;
1709
double z = a.z;
1710
const int steps1 = steps == 0 ? 1 : steps;
1711
for (int i = 0; i < steps; i++) {
1712
DrawingCoords p = TransformUnit::ScreenToDrawing(x, y);
1713
1714
bool maskOK = x >= range.x1 && y >= range.y1 && x <= range.x2 && y <= range.y2;
1715
if (maskOK) {
1716
if (pixelID.earlyZChecks) {
1717
if (pixelID.applyDepthRange) {
1718
if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
1719
maskOK = false;
1720
}
1721
1722
if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z)) {
1723
maskOK = false;
1724
}
1725
}
1726
}
1727
1728
if (maskOK) {
1729
// Interpolate between the two points.
1730
Vec4<int> prim_color;
1731
Vec3<int> sec_color;
1732
if (interpolateColor) {
1733
prim_color = (v0_c0 * (steps - i) + v1_c0 * i) / steps1;
1734
sec_color = (v0_c1 * (steps - i) + v1_c1 * i) / steps1;
1735
} else {
1736
prim_color = v1_c0;
1737
sec_color = v1_c1;
1738
}
1739
1740
u8 fog = 255;
1741
if (pixelID.applyFog) {
1742
fog = ClampFogDepth((v0.fogdepth * (float)(steps - i) + v1.fogdepth * (float)i) / steps1);
1743
}
1744
1745
if (state.antialiasLines) {
1746
// TODO: Clearmode?
1747
// TODO: Calculate.
1748
prim_color.a() = 0x7F;
1749
}
1750
1751
if (state.enableTextures) {
1752
float s, s1;
1753
float t, t1;
1754
if (state.throughMode) {
1755
Vec2<float> tc = (v0.texturecoords.uv() * (float)(steps - i) + v1.texturecoords.uv() * (float)i) / steps1;
1756
Vec2<float> tc1 = (v0.texturecoords.uv() * (float)(steps - i - 1) + v1.texturecoords.uv() * (float)(i + 1)) / steps1;
1757
1758
s = tc.s() * (1.0f / (float)(1 << state.samplerID.width0Shift));
1759
s1 = tc1.s() * (1.0f / (float)(1 << state.samplerID.width0Shift));
1760
t = tc.t() * (1.0f / (float)(1 << state.samplerID.height0Shift));
1761
t1 = tc1.t() * (1.0f / (float)(1 << state.samplerID.height0Shift));
1762
} else if (state.textureProj) {
1763
GetTextureCoordinatesProj(v0, v1, (float)(steps - i) / steps1, s, t);
1764
GetTextureCoordinatesProj(v0, v1, (float)(steps - i - 1) / steps1, s1, t1);
1765
} else {
1766
// Texture coordinate interpolation must definitely be perspective-correct.
1767
GetTextureCoordinates(v0, v1, (float)(steps - i) / steps1, s, t);
1768
GetTextureCoordinates(v0, v1, (float)(steps - i - 1) / steps1, s1, t1);
1769
}
1770
1771
// If inc is 0, force the delta to zero.
1772
float ds = xinc == 0.0 ? 0.0f : (s1 - s) * (float)SCREEN_SCALE_FACTOR * (1.0f / xinc);
1773
float dt = yinc == 0.0 ? 0.0f : (t1 - t) * (float)SCREEN_SCALE_FACTOR * (1.0f / yinc);
1774
float w = (v0.clipw * (float)(steps - i) + v1.clipw * (float)i) / steps1;
1775
1776
int texLevel;
1777
int texLevelFrac;
1778
bool texBilinear;
1779
CalculateSamplingParams(ds, dt, w, state, texLevel, texLevelFrac, texBilinear);
1780
1781
if (state.antialiasLines) {
1782
// TODO: This is a naive and wrong implementation.
1783
DrawingCoords p0 = TransformUnit::ScreenToDrawing(x, y);
1784
s = ((float)p0.x + xinc / 32.0f) / 512.0f;
1785
t = ((float)p0.y + yinc / 32.0f) / 512.0f;
1786
1787
texBilinear = true;
1788
}
1789
1790
PROFILE_THIS_SCOPE("sampler");
1791
prim_color = ApplyTexturingSingle(s, t, ToVec4IntArg(prim_color), texLevel, texLevelFrac, texBilinear, state);
1792
}
1793
1794
if (!pixelID.clearMode)
1795
prim_color += Vec4<int>(sec_color, 0);
1796
1797
PROFILE_THIS_SCOPE("draw_px");
1798
state.drawPixel(p.x, p.y, z, fog, ToVec4IntArg(prim_color), pixelID);
1799
1800
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
1801
uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
1802
uint32_t row = gstate.getFrameBufAddress() + p.y * pixelID.cached.framebufStride * bpp;
1803
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * bpp, bpp, tag.c_str(), tag.size());
1804
1805
if (pixelID.depthWrite) {
1806
uint32_t row = gstate.getDepthBufAddress() + y * pixelID.cached.depthbufStride * 2;
1807
NotifyMemInfo(MemBlockFlags::WRITE, row + p.x * 2, 2, ztag.c_str(), ztag.size());
1808
}
1809
#endif
1810
}
1811
1812
x += xinc;
1813
y += yinc;
1814
z += zinc;
1815
}
1816
}
1817
1818
// Fetches every texel of mip `level` of the texture described by gstate and stores it into
// `buffer` as 8888 pixels, one row after another.
//
// Returns false when texture mapping is disabled, the texture address/range is not valid memory,
// both dimensions are 0x8000 or more, or no fetch function is available even after a JIT flush.
// Returns true once the buffer has been filled.
bool GetCurrentTexture(GPUDebugBuffer &buffer, int level) {
	if (!gstate.isTextureMapEnabled())
		return false;

	const GETextureFormat format = gstate.getTextureFormat();
	const u32 address = gstate.getTextureAddress(level);
	const u32 bufw = GetTextureBufw(level, address, format);
	const int width = gstate.getTextureWidth(level);
	const int height = gstate.getTextureHeight(level);

	// Bits spanned from the first texel through the last texel of the final row.
	const u32 totalBits = textureBitsPerPixel[format] * (bufw * (height - 1) + width);
	if (!address)
		return false;
	if (!Memory::IsValidRange(address, totalBits / 8))
		return false;

	// We'll break trying to allocate this much.
	if (width >= 0x8000 && height >= 0x8000)
		return false;

	buffer.Allocate(width, height, GE_FORMAT_8888, false);

	SamplerID id;
	ComputeSamplerID(&id);
	id.cached.clut = clut;

	// Slight annoyance, we may have to force a compile.
	Sampler::FetchFunc fetch = Sampler::GetFetchFunc(id, nullptr);
	if (!fetch) {
		// Retry once after flushing the JIT.
		Sampler::FlushJit();
		fetch = Sampler::GetFetchFunc(id, nullptr);
	}
	if (!fetch)
		return false;

	u8 *texels = Memory::GetPointerWrite(address);
	// Output rows are packed back to back, so a single running pointer covers the whole image.
	u32 *dest = (u32 *)buffer.GetData();
	for (int ty = 0; ty < height; ++ty) {
		for (int tx = 0; tx < width; ++tx) {
			*dest++ = Vec4<int>(fetch(tx, ty, texels, bufw, level, id)).ToRGBA();
		}
	}
	return true;
}
1862
1863
} // namespace
1864
1865