CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/DrawPixelX86.cpp
Views: 1401
1
// Copyright (c) 2017- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
#if PPSSPP_ARCH(AMD64)
20
21
#include <emmintrin.h>
22
#include "Common/x64Emitter.h"
23
#include "Common/CPUDetect.h"
24
#include "Common/LogReporting.h"
25
#include "GPU/GPUState.h"
26
#include "GPU/Software/DrawPixel.h"
27
#include "GPU/Software/SoftGpu.h"
28
#include "GPU/ge_constants.h"
29
30
using namespace Gen;
31
32
namespace Rasterizer {
33
34
// Compiles a single-pixel draw function specialized for `id`.
// Emits the constant pool, a prolog, then each pixel pipeline stage in order, and finally the epilog.
// Returns the entry point of the generated code, or nullptr if any stage reported failure, in which
// case the code pointer is rewound to the aligned position recorded right after the constant pool.
SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) {
	// Describe the incoming arguments to the reg cache; spilling these is not allowed.
	regCache_.SetupABI({
		RegCache::GEN_ARG_X,
		RegCache::GEN_ARG_Y,
		RegCache::GEN_ARG_Z,
		RegCache::GEN_ARG_FOG,
		RegCache::VEC_ARG_COLOR,
		RegCache::GEN_ARG_ID,
	});

	BeginWrite(64);
	Describe("Init");
	WriteConstantPool(id);

	// Kept so a failed compile can rewind to here.
	const u8 *resetPos = AlignCode16();
	EndWrite();

#if PPSSPP_PLATFORM(WINDOWS)
	// Here the id is not held in a register, so it is fetched from the stack when needed.
	// RET + Windows reserves space to save args, half of 1 xmm + 4 ints before the id.
	_assert_(!regCache_.Has(RegCache::GEN_ARG_ID));
	const int stackSpace = id.hasStencilTestMask
		? WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15 }, { R12, R13, R14, R15 })
		: WriteProlog(0, {}, {});
	stackIDOffset_ = stackSpace + 8 + 8 + 4 * PTRBITS / 8;
#else
	// Here the id is passed in a register, so no stack offset applies.
	_assert_(regCache_.Has(RegCache::GEN_ARG_ID));
	WriteProlog(0, {}, {});
	stackIDOffset_ = -1;
#endif

	// The depth range check comes first.
	bool success = Jit_ApplyDepthRange(id);

	// Clamp the color next (it might affect alpha test, and everything expects it clamped.)
	// Packing down to 4x8-bit does the clamp; later stages expect the color in this format.
	Describe("ClampColor");
	X64Reg colorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	PACKSSDW(colorReg, R(colorReg));
	PACKUSWB(colorReg, R(colorReg));
	regCache_.Unlock(colorReg, RegCache::VEC_ARG_COLOR);
	colorIs16Bit_ = false;

	success = success && Jit_AlphaTest(id);
	// Fog is applied prior to color test. Maybe before alpha test too, but it doesn't affect it...
	success = success && Jit_ApplyFog(id);
	success = success && Jit_ColorTest(id);

	// Stencil and depth tests are skipped entirely in clear mode.
	if (!id.clearMode) {
		if (id.stencilTest)
			success = success && Jit_StencilAndDepthTest(id);
		else
			success = success && Jit_DepthTest(id);
	}
	success = success && Jit_WriteDepth(id);

	success = success && Jit_AlphaBlend(id);
	success = success && Jit_Dither(id);
	success = success && Jit_WriteColor(id);

	// Every discard recorded by the stages above lands here, just before the epilog.
	for (auto &pendingDiscard : discards_) {
		SetJumpTarget(pendingDiscard);
	}
	discards_.clear();

	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.ForceRelease(RegCache::GEN_ARG_ID);

	if (success) {
		const u8 *start = WriteFinalizedEpilog();
		regCache_.Reset(true);
		return (SingleFunc)start;
	}

	// Failure: report it and throw away whatever was emitted.
	ERROR_LOG_REPORT(Log::G3D, "Could not compile pixel func: %s", DescribePixelFuncID(id).c_str());

	regCache_.Reset(false);
	EndWrite();
	ResetCodePtr(GetOffset(resetPos));
	return nullptr;
}
116
117
// Returns a register holding the PixelFuncID pointer.
// Prefers the argument register, then an already loaded GEN_ID register; otherwise it allocates
// GEN_ID and emits a load of the pointer from the stack at stackIDOffset_.
RegCache::Reg PixelJitCache::GetPixelID() {
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		return regCache_.Find(RegCache::GEN_ARG_ID);
	if (regCache_.Has(RegCache::GEN_ID))
		return regCache_.Find(RegCache::GEN_ID);

	// Not in any register yet: fetch it from the stack slot.
	X64Reg idReg = regCache_.Alloc(RegCache::GEN_ID);
	_assert_(stackIDOffset_ != -1);
	MOV(PTRBITS, R(idReg), MDisp(RSP, stackIDOffset_));
	return idReg;
}
128
129
// Unlocks a register obtained from GetPixelID(), using whichever purpose it is tracked under.
void PixelJitCache::UnlockPixelID(RegCache::Reg &r) {
	if (regCache_.Has(RegCache::GEN_ARG_ID)) {
		regCache_.Unlock(r, RegCache::GEN_ARG_ID);
		return;
	}
	regCache_.Unlock(r, RegCache::GEN_ID);
}
135
136
// Returns a locked register holding the address of the target pixel inside the color buffer.
// The address computation is emitted the first time this is called; later calls reuse the register.
// With the standard stride and no dithering, the x/y argument registers are consumed in place and,
// when depth is tested or written, the depth buffer address is produced at the same time.
RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {
	// Already computed: hand back the existing register.
	if (regCache_.Has(RegCache::GEN_COLOR_OFF))
		return regCache_.Find(RegCache::GEN_COLOR_OFF);

	Describe("GetColorOff");
	// Index scale for the final LEA: 4 bytes per pixel for 8888, otherwise 2.
	const int pixelScale = id.FBFormat() == GE_FORMAT_8888 ? 4 : 2;

	if (id.useStandardStride && !id.dithering) {
		// The depth address is only worth computing if depth gets written or tested later.
		const bool needDepthOff = id.depthWrite || (id.DepthTestFunc() != GE_COMP_ALWAYS && !id.earlyZChecks);
		X64Reg depthBaseReg = INVALID_REG;
		X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
		X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);

		// In this mode xReg is turned into the pixel index (x + (y << 9)) and y is thrown away.
		SHL(32, R(yReg), Imm8(9));
		ADD(32, R(xReg), R(yReg));

		// Load the color buffer pointer into yReg.
		if (!needDepthOff) {
			if (RipAccessible(&fb.data)) {
				MOV(PTRBITS, R(yReg), M(&fb.data));
			} else {
				MOV(PTRBITS, R(yReg), ImmPtr(&fb.data));
				MOV(PTRBITS, R(yReg), MatR(yReg));
			}
		} else {
			_assert_(Accessible(&fb.data, &depthbuf.data));
			depthBaseReg = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
			if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
				MOV(PTRBITS, R(yReg), M(&fb.data));
			} else {
				// depthBaseReg keeps &fb.data; the MAccessibleDisp() load further down uses it.
				MOV(PTRBITS, R(depthBaseReg), ImmPtr(&fb.data));
				MOV(PTRBITS, R(yReg), MatR(depthBaseReg));
			}
		}
		LEA(PTRBITS, yReg, MComplex(yReg, xReg, pixelScale, 0));

		// yReg now carries the color address, so re-tag it as GEN_COLOR_OFF.
		regCache_.Unlock(yReg, RegCache::GEN_ARG_Y);
		regCache_.Change(RegCache::GEN_ARG_Y, RegCache::GEN_COLOR_OFF);
		// It can't be recalculated (y is gone), so it must be retained.
		regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);

		if (!needDepthOff) {
			// Nobody needs the index anymore, free the x argument.
			regCache_.Unlock(xReg, RegCache::GEN_ARG_X);
			regCache_.ForceRelease(RegCache::GEN_ARG_X);
		} else {
			// Also turn xReg into the depth address (2 bytes per depth value).
			if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
				MOV(PTRBITS, R(depthBaseReg), M(&depthbuf.data));
			} else {
				MOV(PTRBITS, R(depthBaseReg), MAccessibleDisp(depthBaseReg, &fb.data, &depthbuf.data));
			}
			LEA(PTRBITS, xReg, MComplex(depthBaseReg, xReg, 2, 0));
			regCache_.Release(depthBaseReg, RegCache::GEN_DEPTH_OFF);

			// Same deal as the color address: re-tag xReg as GEN_DEPTH_OFF and force retain it.
			regCache_.Unlock(xReg, RegCache::GEN_ARG_X);
			regCache_.Change(RegCache::GEN_ARG_X, RegCache::GEN_DEPTH_OFF);
			regCache_.ForceRetain(RegCache::GEN_DEPTH_OFF);
		}

		return regCache_.Find(RegCache::GEN_COLOR_OFF);
	}

	// General path: compute y * stride + x in a fresh register, leaving the arguments intact.
	X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
	X64Reg offReg = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
	if (!id.useStandardStride) {
		// The stride is read from the id at runtime.
		if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
			X64Reg idReg = GetPixelID();
			MOVZX(32, 16, offReg, MDisp(idReg, offsetof(PixelFuncID, cached.framebufStride)));
			UnlockPixelID(idReg);
		} else {
			// No id register around: borrow offReg to load the id pointer from the stack.
			_assert_(stackIDOffset_ != -1);
			MOV(PTRBITS, R(offReg), MDisp(RSP, stackIDOffset_));
			MOVZX(32, 16, offReg, MDisp(offReg, offsetof(PixelFuncID, cached.framebufStride)));
		}

		IMUL(32, offReg, R(yReg));
	} else {
		// Standard stride: y << 9.
		MOV(32, R(offReg), R(yReg));
		SHL(32, R(offReg), Imm8(9));
	}
	regCache_.Unlock(yReg, RegCache::GEN_ARG_Y);

	X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);
	ADD(32, R(offReg), R(xReg));
	regCache_.Unlock(xReg, RegCache::GEN_ARG_X);

	// Add the color buffer pointer, scaling the index by the pixel size.
	X64Reg baseReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
	if (RipAccessible(&fb.data)) {
		MOV(PTRBITS, R(baseReg), M(&fb.data));
	} else {
		MOV(PTRBITS, R(baseReg), ImmPtr(&fb.data));
		MOV(PTRBITS, R(baseReg), MatR(baseReg));
	}
	LEA(PTRBITS, offReg, MComplex(baseReg, offReg, pixelScale, 0));
	regCache_.Release(baseReg, RegCache::GEN_TEMP_HELPER);

	return offReg;
}
234
235
// Returns a locked register holding the address of the target pixel inside the depth buffer.
// The computation is emitted on first use; with the standard stride and no dithering it is
// delegated to GetColorOff(), which produces both addresses together.
RegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) {
	// Already computed: reuse it.
	if (regCache_.Has(RegCache::GEN_DEPTH_OFF))
		return regCache_.Find(RegCache::GEN_DEPTH_OFF);

	// If both color and depth use 512, the offsets are the same.
	if (id.useStandardStride && !id.dithering) {
		// GetColorOff() calculates both in one go; only the depth one is wanted here.
		X64Reg colorOffReg = GetColorOff(id);
		regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
		return regCache_.Find(RegCache::GEN_DEPTH_OFF);
	}

	Describe("GetDepthOff");
	X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
	X64Reg offReg = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
	if (!id.useStandardStride) {
		// The depth stride is read from the id at runtime.
		if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
			X64Reg idReg = GetPixelID();
			MOVZX(32, 16, offReg, MDisp(idReg, offsetof(PixelFuncID, cached.depthbufStride)));
			UnlockPixelID(idReg);
		} else {
			// No id register around: borrow offReg to load the id pointer from the stack.
			_assert_(stackIDOffset_ != -1);
			MOV(PTRBITS, R(offReg), MDisp(RSP, stackIDOffset_));
			MOVZX(32, 16, offReg, MDisp(offReg, offsetof(PixelFuncID, cached.depthbufStride)));
		}

		IMUL(32, offReg, R(yReg));
	} else {
		// Standard stride: y << 9.
		MOV(32, R(offReg), R(yReg));
		SHL(32, R(offReg), Imm8(9));
	}
	regCache_.Unlock(yReg, RegCache::GEN_ARG_Y);

	X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);
	ADD(32, R(offReg), R(xReg));
	regCache_.Unlock(xReg, RegCache::GEN_ARG_X);

	// Add the depth buffer pointer; each depth value is 2 bytes.
	X64Reg baseReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
	if (RipAccessible(&depthbuf.data)) {
		MOV(PTRBITS, R(baseReg), M(&depthbuf.data));
	} else {
		MOV(PTRBITS, R(baseReg), ImmPtr(&depthbuf.data));
		MOV(PTRBITS, R(baseReg), MatR(baseReg));
	}
	LEA(PTRBITS, offReg, MComplex(baseReg, offReg, 2, 0));
	regCache_.Release(baseReg, RegCache::GEN_TEMP_HELPER);

	return offReg;
}
284
285
286
// Emits a load of the destination pixel's stencil, expanded to a full 8-bit value, into a newly
// allocated GEN_STENCIL register and returns it.
// Returns INVALID_REG for 565, where nothing is loaded because stencil is a fixed zero.
RegCache::Reg PixelJitCache::GetDestStencil(const PixelFuncID &id) {
	const auto format = id.FBFormat();
	// Skip if 565, since stencil is fixed zero.
	if (format == GE_FORMAT_565)
		return INVALID_REG;

	X64Reg colorOffReg = GetColorOff(id);
	Describe("GetDestStencil");
	X64Reg stencilReg = regCache_.Alloc(RegCache::GEN_STENCIL);
	switch (format) {
	case GE_FORMAT_8888:
		// Stencil is the whole top byte of the pixel.
		MOVZX(32, 8, stencilReg, MDisp(colorOffReg, 3));
		break;

	case GE_FORMAT_5551:
		// Stencil is the top bit; the arithmetic shift smears it across the low byte.
		MOVZX(32, 8, stencilReg, MDisp(colorOffReg, 1));
		SAR(8, R(stencilReg), Imm8(7));
		break;

	case GE_FORMAT_4444: {
		// Stencil is the top nibble; duplicate it into both nibbles of the result.
		MOVZX(32, 8, stencilReg, MDisp(colorOffReg, 1));
		SHR(32, R(stencilReg), Imm8(4));
		X64Reg highNibbleReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
		MOV(32, R(highNibbleReg), R(stencilReg));
		SHL(32, R(highNibbleReg), Imm8(4));
		OR(32, R(stencilReg), R(highNibbleReg));
		regCache_.Release(highNibbleReg, RegCache::GEN_TEMP_HELPER);
		break;
	}

	default:
		break;
	}
	regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);

	return stencilReg;
}
312
313
void PixelJitCache::Discard() {
314
discards_.push_back(J(true));
315
}
316
317
// Emits a jump taken when condition `cc` holds that abandons the pixel, and records it in
// discards_; CompileSingle() points every recorded jump at the end of the function.
// NOTE(review): the `true` argument presumably selects the long jump encoding - confirm in the emitter.
void PixelJitCache::Discard(Gen::CCFlags cc) {
	FixupBranch toExit = J_CC(cc, true);
	discards_.push_back(toExit);
}
320
321
void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {
322
// This is used to add a fixed point 0.5 (as s.11.4) for blend factors to multiply accurately.
323
WriteSimpleConst8x16(constBlendHalf_11_4s_, 1 << 3);
324
325
// This is used for shifted blend factors, to inverse them.
326
WriteSimpleConst8x16(constBlendInvert_11_4s_, 0xFF << 4);
327
}
328
329
bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
330
if (id.applyDepthRange && !id.earlyZChecks) {
331
Describe("ApplyDepthR");
332
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
333
X64Reg idReg = GetPixelID();
334
335
// We expanded this to 32 bits, so it's convenient to compare.
336
CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.minz)));
337
Discard(CC_L);
338
339
// We load the low 16 bits, but compare all 32 of z. Above handles < 0.
340
CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.maxz)));
341
Discard(CC_G);
342
343
UnlockPixelID(idReg);
344
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
345
}
346
347
// Since this is early on, try to free up the z reg if we don't need it anymore.
348
if (id.clearMode && !id.DepthClear())
349
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
350
else if (!id.clearMode && !id.depthWrite && (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks))
351
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
352
353
return true;
354
}
355
356
bool PixelJitCache::Jit_AlphaTest(const PixelFuncID &id) {
357
// Take care of ALWAYS/NEVER first. ALWAYS is common, means disabled.
358
Describe("AlphaTest");
359
switch (id.AlphaTestFunc()) {
360
case GE_COMP_NEVER:
361
Discard();
362
return true;
363
364
case GE_COMP_ALWAYS:
365
return true;
366
367
default:
368
break;
369
}
370
371
// Load alpha into its own general reg.
372
X64Reg alphaReg;
373
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
374
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
375
} else {
376
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
377
_assert_(!colorIs16Bit_);
378
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
379
MOVD_xmm(R(alphaReg), argColorReg);
380
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
381
SHR(32, R(alphaReg), Imm8(24));
382
}
383
384
if (id.hasAlphaTestMask) {
385
// Unfortunate, we'll need pixelID to load the mask.
386
// Note: we leave the ALPHA purpose untouched and free it, because later code may reuse.
387
X64Reg idReg = GetPixelID();
388
X64Reg maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
389
390
MOVZX(32, 8, maskedReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaTestMask)));
391
UnlockPixelID(idReg);
392
AND(32, R(maskedReg), R(alphaReg));
393
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
394
395
// Okay now do the rest using the masked reg, which we modified.
396
alphaReg = maskedReg;
397
}
398
399
// We hardcode the ref into this jit func.
400
CMP(8, R(alphaReg), Imm8(id.alphaTestRef));
401
if (id.hasAlphaTestMask)
402
regCache_.Release(alphaReg, RegCache::GEN_TEMP0);
403
else
404
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
405
406
switch (id.AlphaTestFunc()) {
407
case GE_COMP_NEVER:
408
case GE_COMP_ALWAYS:
409
break;
410
411
case GE_COMP_EQUAL:
412
Discard(CC_NE);
413
break;
414
415
case GE_COMP_NOTEQUAL:
416
Discard(CC_E);
417
break;
418
419
case GE_COMP_LESS:
420
Discard(CC_AE);
421
break;
422
423
case GE_COMP_LEQUAL:
424
Discard(CC_A);
425
break;
426
427
case GE_COMP_GREATER:
428
Discard(CC_BE);
429
break;
430
431
case GE_COMP_GEQUAL:
432
Discard(CC_B);
433
break;
434
}
435
436
return true;
437
}
438
439
bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) {
440
if (!id.colorTest || id.clearMode)
441
return true;
442
443
// We'll have 4 with fog released, so we're using them all...
444
Describe("ColorTest");
445
X64Reg idReg = GetPixelID();
446
X64Reg funcReg = regCache_.Alloc(RegCache::GEN_TEMP0);
447
X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP1);
448
X64Reg refReg = regCache_.Alloc(RegCache::GEN_TEMP2);
449
450
// First, load the registers: mask and ref.
451
MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestMask)));
452
MOV(32, R(refReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestRef)));
453
454
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
455
if (colorIs16Bit_) {
456
// If it's expanded, we need to clamp anyway if it was fogged.
457
PACKUSWB(argColorReg, R(argColorReg));
458
colorIs16Bit_ = false;
459
}
460
461
// Temporarily abuse funcReg to grab the color into maskReg.
462
MOVD_xmm(R(funcReg), argColorReg);
463
AND(32, R(maskReg), R(funcReg));
464
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
465
466
// Now that we're setup, get the func and follow it.
467
MOVZX(32, 8, funcReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorTestFunc)));
468
UnlockPixelID(idReg);
469
470
CMP(8, R(funcReg), Imm8(GE_COMP_ALWAYS));
471
// Discard for GE_COMP_NEVER...
472
Discard(CC_B);
473
FixupBranch skip = J_CC(CC_E);
474
475
CMP(8, R(funcReg), Imm8(GE_COMP_EQUAL));
476
FixupBranch doEqual = J_CC(CC_E);
477
regCache_.Release(funcReg, RegCache::GEN_TEMP0);
478
479
// The not equal path here... if they are equal, we discard.
480
CMP(32, R(refReg), R(maskReg));
481
Discard(CC_E);
482
FixupBranch skip2 = J();
483
484
SetJumpTarget(doEqual);
485
CMP(32, R(refReg), R(maskReg));
486
Discard(CC_NE);
487
488
regCache_.Release(maskReg, RegCache::GEN_TEMP1);
489
regCache_.Release(refReg, RegCache::GEN_TEMP2);
490
491
SetJumpTarget(skip);
492
SetJumpTarget(skip2);
493
494
return true;
495
}
496
497
bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
498
if (!id.applyFog) {
499
// Okay, anyone can use the fog register then.
500
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
501
return true;
502
}
503
504
// Load fog and expand to 16 bit. Ignore the high 8 bits, which'll match up with A.
505
Describe("ApplyFog");
506
X64Reg fogColorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
507
X64Reg idReg = GetPixelID();
508
if (cpu_info.bSSE4_1) {
509
PMOVZXBW(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
510
} else {
511
X64Reg zeroReg = GetZeroVec();
512
MOVD_xmm(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
513
PUNPCKLBW(fogColorReg, R(zeroReg));
514
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
515
}
516
UnlockPixelID(idReg);
517
518
// Load a set of 255s at 16 bit into a reg for later...
519
X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
520
PCMPEQW(invertReg, R(invertReg));
521
PSRLW(invertReg, 8);
522
523
// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
524
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
525
if (!colorIs16Bit_) {
526
if (cpu_info.bSSE4_1) {
527
PMOVZXBW(argColorReg, R(argColorReg));
528
} else {
529
X64Reg zeroReg = GetZeroVec();
530
PUNPCKLBW(argColorReg, R(zeroReg));
531
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
532
}
533
colorIs16Bit_ = true;
534
}
535
536
// Save A so we can put it back, we don't "fog" A.
537
X64Reg alphaReg;
538
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
539
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
540
} else {
541
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
542
PEXTRW(alphaReg, argColorReg, 3);
543
}
544
545
// Okay, let's broadcast fog to an XMM.
546
X64Reg fogMultReg = regCache_.Alloc(RegCache::VEC_TEMP3);
547
X64Reg argFogReg = regCache_.Find(RegCache::GEN_ARG_FOG);
548
MOVD_xmm(fogMultReg, R(argFogReg));
549
PSHUFLW(fogMultReg, R(fogMultReg), _MM_SHUFFLE(0, 0, 0, 0));
550
regCache_.Unlock(argFogReg, RegCache::GEN_ARG_FOG);
551
// We can free up the actual fog reg now.
552
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
553
554
// Our goal here is to calculate this formula:
555
// (argColor * fog + fogColor * (255 - fog) + 255) / 256
556
557
// Now we multiply the existing color by fog...
558
PMULLW(argColorReg, R(fogMultReg));
559
// Before inversing, let's add that 255 we loaded in as well, since we have it.
560
PADDW(argColorReg, R(invertReg));
561
// And then inverse the fog value using those 255s, and multiply by fog color.
562
PSUBW(invertReg, R(fogMultReg));
563
PMULLW(fogColorReg, R(invertReg));
564
// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.
565
PADDW(argColorReg, R(fogColorReg));
566
regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);
567
regCache_.Release(invertReg, RegCache::VEC_TEMP2);
568
regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);
569
570
// Now we simply divide by 256, or in other words shift by 8.
571
PSRLW(argColorReg, 8);
572
573
// Okay, put A back in, we'll shrink it to 8888 when needed.
574
PINSRW(argColorReg, R(alphaReg), 3);
575
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
576
577
// We most likely won't use alphaReg again.
578
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
579
580
return true;
581
}
582
583
bool PixelJitCache::Jit_StencilAndDepthTest(const PixelFuncID &id) {
584
_assert_(!id.clearMode && id.stencilTest);
585
586
X64Reg stencilReg = GetDestStencil(id);
587
Describe("StencilAndDepth");
588
X64Reg maskedReg = stencilReg;
589
if (id.hasStencilTestMask && stencilReg != INVALID_REG) {
590
X64Reg idReg = GetPixelID();
591
maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
592
MOV(32, R(maskedReg), R(stencilReg));
593
AND(8, R(maskedReg), MDisp(idReg, offsetof(PixelFuncID, cached.stencilTestMask)));
594
UnlockPixelID(idReg);
595
}
596
597
bool success = true;
598
success = success && Jit_StencilTest(id, stencilReg, maskedReg);
599
if (maskedReg != stencilReg)
600
regCache_.Release(maskedReg, RegCache::GEN_TEMP0);
601
602
// Next up, the depth test.
603
if (stencilReg == INVALID_REG) {
604
// Just use the standard one, since we don't need to write stencil.
605
// We also don't need to worry about cleanup either.
606
return success && Jit_DepthTest(id);
607
}
608
609
success = success && Jit_DepthTestForStencil(id, stencilReg);
610
success = success && Jit_ApplyStencilOp(id, id.ZPass(), stencilReg);
611
612
// At this point, stencilReg can't be spilled. It contains the updated value.
613
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
614
regCache_.ForceRetain(RegCache::GEN_STENCIL);
615
616
return success;
617
}
618
619
bool PixelJitCache::Jit_StencilTest(const PixelFuncID &id, RegCache::Reg stencilReg, RegCache::Reg maskedReg) {
620
Describe("StencilTest");
621
622
bool hasFixedResult = false;
623
bool fixedResult = false;
624
FixupBranch toPass;
625
if (stencilReg == INVALID_REG) {
626
// This means stencil is a fixed value 0.
627
hasFixedResult = true;
628
switch (id.StencilTestFunc()) {
629
case GE_COMP_NEVER: fixedResult = false; break;
630
case GE_COMP_ALWAYS: fixedResult = true; break;
631
case GE_COMP_EQUAL: fixedResult = id.stencilTestRef == 0; break;
632
case GE_COMP_NOTEQUAL: fixedResult = id.stencilTestRef != 0; break;
633
case GE_COMP_LESS: fixedResult = false; break;
634
case GE_COMP_LEQUAL: fixedResult = id.stencilTestRef == 0; break;
635
case GE_COMP_GREATER: fixedResult = id.stencilTestRef != 0; break;
636
case GE_COMP_GEQUAL: fixedResult = true; break;
637
}
638
} else if (id.StencilTestFunc() == GE_COMP_ALWAYS) {
639
// Fairly common, skip the CMP.
640
hasFixedResult = true;
641
fixedResult = true;
642
} else {
643
// Reversed here because of the imm, so tests below are reversed.
644
CMP(8, R(maskedReg), Imm8(id.stencilTestRef));
645
switch (id.StencilTestFunc()) {
646
case GE_COMP_NEVER:
647
hasFixedResult = true;
648
fixedResult = false;
649
break;
650
651
case GE_COMP_ALWAYS:
652
_assert_(false);
653
break;
654
655
case GE_COMP_EQUAL:
656
toPass = J_CC(CC_E);
657
break;
658
659
case GE_COMP_NOTEQUAL:
660
toPass = J_CC(CC_NE);
661
break;
662
663
case GE_COMP_LESS:
664
toPass = J_CC(CC_A);
665
break;
666
667
case GE_COMP_LEQUAL:
668
toPass = J_CC(CC_AE);
669
break;
670
671
case GE_COMP_GREATER:
672
toPass = J_CC(CC_B);
673
break;
674
675
case GE_COMP_GEQUAL:
676
toPass = J_CC(CC_BE);
677
break;
678
}
679
}
680
681
if (hasFixedResult && !fixedResult && stencilReg == INVALID_REG) {
682
Discard();
683
return true;
684
}
685
686
bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
687
bool hadIdReg = regCache_.Has(RegCache::GEN_ID);
688
689
bool success = true;
690
if (stencilReg != INVALID_REG && (!hasFixedResult || !fixedResult)) {
691
// This is the fail path.
692
success = success && Jit_ApplyStencilOp(id, id.SFail(), stencilReg);
693
success = success && Jit_WriteStencilOnly(id, stencilReg);
694
695
Discard();
696
}
697
698
// If we allocated either id or colorOff in the conditional, forget.
699
if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
700
regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
701
if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
702
regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);
703
704
if (!hasFixedResult)
705
SetJumpTarget(toPass);
706
return success;
707
}
708
709
bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, RegCache::Reg stencilReg) {
710
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
711
return true;
712
713
X64Reg depthOffReg = GetDepthOff(id);
714
Describe("DepthTestStencil");
715
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
716
CMP(16, R(argZReg), MatR(depthOffReg));
717
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
718
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
719
720
// We discard the opposite of the passing test.
721
FixupBranch skip;
722
switch (id.DepthTestFunc()) {
723
case GE_COMP_NEVER:
724
// Shouldn't happen, just do an extra CMP.
725
CMP(32, R(RAX), R(RAX));
726
// This is just to have a skip that is valid.
727
skip = J_CC(CC_NE);
728
break;
729
730
case GE_COMP_ALWAYS:
731
// Shouldn't happen, just do an extra CMP.
732
CMP(32, R(RAX), R(RAX));
733
skip = J_CC(CC_E);
734
break;
735
736
case GE_COMP_EQUAL:
737
skip = J_CC(CC_E);
738
break;
739
740
case GE_COMP_NOTEQUAL:
741
skip = J_CC(CC_NE);
742
break;
743
744
case GE_COMP_LESS:
745
skip = J_CC(CC_B);
746
break;
747
748
case GE_COMP_LEQUAL:
749
skip = J_CC(CC_BE);
750
break;
751
752
case GE_COMP_GREATER:
753
skip = J_CC(CC_A);
754
break;
755
756
case GE_COMP_GEQUAL:
757
skip = J_CC(CC_AE);
758
break;
759
}
760
761
bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
762
bool hadIdReg = regCache_.Has(RegCache::GEN_ID);
763
764
bool success = true;
765
success = success && Jit_ApplyStencilOp(id, id.ZFail(), stencilReg);
766
success = success && Jit_WriteStencilOnly(id, stencilReg);
767
Discard();
768
769
// If we allocated either id or colorOff in the conditional, forget.
770
if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
771
regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
772
if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
773
regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);
774
775
SetJumpTarget(skip);
776
777
// Like in Jit_DepthTest(), at this point we may not need this reg anymore.
778
if (!id.depthWrite)
779
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
780
781
return success;
782
}
783
784
bool PixelJitCache::Jit_ApplyStencilOp(const PixelFuncID &id, GEStencilOp op, RegCache::Reg stencilReg) {
785
_assert_(stencilReg != INVALID_REG);
786
787
Describe("ApplyStencil");
788
FixupBranch skip;
789
switch (op) {
790
case GE_STENCILOP_KEEP:
791
// Nothing to do.
792
break;
793
794
case GE_STENCILOP_ZERO:
795
XOR(32, R(stencilReg), R(stencilReg));
796
break;
797
798
case GE_STENCILOP_REPLACE:
799
if (id.hasStencilTestMask) {
800
// Load the unmasked value.
801
X64Reg idReg = GetPixelID();
802
MOVZX(32, 8, stencilReg, MDisp(idReg, offsetof(PixelFuncID, cached.stencilRef)));
803
UnlockPixelID(idReg);
804
} else {
805
MOV(8, R(stencilReg), Imm8(id.stencilTestRef));
806
}
807
break;
808
809
case GE_STENCILOP_INVERT:
810
NOT(8, R(stencilReg));
811
break;
812
813
case GE_STENCILOP_INCR:
814
switch (id.fbFormat) {
815
case GE_FORMAT_565:
816
break;
817
818
case GE_FORMAT_5551:
819
MOV(8, R(stencilReg), Imm8(0xFF));
820
break;
821
822
case GE_FORMAT_4444:
823
CMP(8, R(stencilReg), Imm8(0xF0));
824
skip = J_CC(CC_AE);
825
ADD(8, R(stencilReg), Imm8(0x11));
826
SetJumpTarget(skip);
827
break;
828
829
case GE_FORMAT_8888:
830
CMP(8, R(stencilReg), Imm8(0xFF));
831
skip = J_CC(CC_E);
832
ADD(8, R(stencilReg), Imm8(0x01));
833
SetJumpTarget(skip);
834
break;
835
}
836
break;
837
838
case GE_STENCILOP_DECR:
839
switch (id.fbFormat) {
840
case GE_FORMAT_565:
841
break;
842
843
case GE_FORMAT_5551:
844
XOR(32, R(stencilReg), R(stencilReg));
845
break;
846
847
case GE_FORMAT_4444:
848
CMP(8, R(stencilReg), Imm8(0x11));
849
skip = J_CC(CC_B);
850
SUB(8, R(stencilReg), Imm8(0x11));
851
SetJumpTarget(skip);
852
break;
853
854
case GE_FORMAT_8888:
855
CMP(8, R(stencilReg), Imm8(0x00));
856
skip = J_CC(CC_E);
857
SUB(8, R(stencilReg), Imm8(0x01));
858
SetJumpTarget(skip);
859
break;
860
}
861
break;
862
}
863
864
return true;
865
}
866
867
bool PixelJitCache::Jit_WriteStencilOnly(const PixelFuncID &id, RegCache::Reg stencilReg) {
868
_assert_(stencilReg != INVALID_REG);
869
870
// It's okay to destroy stencilReg here, we know we're the last writing it.
871
X64Reg colorOffReg = GetColorOff(id);
872
Describe("WriteStencil");
873
if (id.applyColorWriteMask) {
874
X64Reg idReg = GetPixelID();
875
X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP5);
876
877
switch (id.fbFormat) {
878
case GE_FORMAT_565:
879
break;
880
881
case GE_FORMAT_5551:
882
// Read the high 8 bits of the 16-bit color mask.
883
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
884
OR(8, R(maskReg), Imm8(0x7F));
885
886
// Poor man's BIC...
887
NOT(32, R(stencilReg));
888
OR(32, R(stencilReg), R(maskReg));
889
NOT(32, R(stencilReg));
890
891
AND(8, MDisp(colorOffReg, 1), R(maskReg));
892
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
893
break;
894
895
case GE_FORMAT_4444:
896
// Read the high 8 bits of the 16-bit color mask.
897
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
898
OR(8, R(maskReg), Imm8(0x0F));
899
900
// Poor man's BIC...
901
NOT(32, R(stencilReg));
902
OR(32, R(stencilReg), R(maskReg));
903
NOT(32, R(stencilReg));
904
905
AND(8, MDisp(colorOffReg, 1), R(maskReg));
906
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
907
break;
908
909
case GE_FORMAT_8888:
910
// Read the high 8 bits of the 32-bit color mask.
911
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 3));
912
913
// Poor man's BIC...
914
NOT(32, R(stencilReg));
915
OR(32, R(stencilReg), R(maskReg));
916
NOT(32, R(stencilReg));
917
918
AND(8, MDisp(colorOffReg, 3), R(maskReg));
919
OR(8, MDisp(colorOffReg, 3), R(stencilReg));
920
break;
921
}
922
923
regCache_.Release(maskReg, RegCache::GEN_TEMP5);
924
UnlockPixelID(idReg);
925
} else {
926
switch (id.fbFormat) {
927
case GE_FORMAT_565:
928
break;
929
930
case GE_FORMAT_5551:
931
AND(8, R(stencilReg), Imm8(0x80));
932
AND(8, MDisp(colorOffReg, 1), Imm8(0x7F));
933
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
934
break;
935
936
case GE_FORMAT_4444:
937
AND(8, MDisp(colorOffReg, 1), Imm8(0x0F));
938
AND(8, R(stencilReg), Imm8(0xF0));
939
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
940
break;
941
942
case GE_FORMAT_8888:
943
MOV(8, MDisp(colorOffReg, 3), R(stencilReg));
944
break;
945
}
946
}
947
948
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
949
return true;
950
}
951
952
bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) {
953
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
954
return true;
955
956
if (id.DepthTestFunc() == GE_COMP_NEVER) {
957
Discard();
958
// This should be uncommon, just keep going to have shared cleanup...
959
}
960
961
X64Reg depthOffReg = GetDepthOff(id);
962
Describe("DepthTest");
963
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
964
CMP(16, R(argZReg), MatR(depthOffReg));
965
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
966
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
967
968
// We discard the opposite of the passing test.
969
switch (id.DepthTestFunc()) {
970
case GE_COMP_NEVER:
971
case GE_COMP_ALWAYS:
972
break;
973
974
case GE_COMP_EQUAL:
975
Discard(CC_NE);
976
break;
977
978
case GE_COMP_NOTEQUAL:
979
Discard(CC_E);
980
break;
981
982
case GE_COMP_LESS:
983
Discard(CC_AE);
984
break;
985
986
case GE_COMP_LEQUAL:
987
Discard(CC_A);
988
break;
989
990
case GE_COMP_GREATER:
991
Discard(CC_BE);
992
break;
993
994
case GE_COMP_GEQUAL:
995
Discard(CC_B);
996
break;
997
}
998
999
// If we're not writing, we don't need Z anymore. We'll free GEN_DEPTH_OFF in Jit_WriteDepth().
1000
if (!id.depthWrite)
1001
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
1002
1003
return true;
1004
}
1005
1006
bool PixelJitCache::Jit_WriteDepth(const PixelFuncID &id) {
1007
// Clear mode shares depthWrite for DepthClear().
1008
if (id.depthWrite) {
1009
X64Reg depthOffReg = GetDepthOff(id);
1010
Describe("WriteDepth");
1011
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
1012
MOV(16, MatR(depthOffReg), R(argZReg));
1013
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
1014
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
1015
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
1016
}
1017
1018
// We can free up this reg if we force locked it.
1019
if (regCache_.Has(RegCache::GEN_DEPTH_OFF)) {
1020
regCache_.ForceRelease(RegCache::GEN_DEPTH_OFF);
1021
}
1022
1023
return true;
1024
}
1025
1026
bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
1027
if (!id.alphaBlend)
1028
return true;
1029
1030
// Check if we need to load and prep factors.
1031
PixelBlendState blendState;
1032
ComputePixelBlendState(blendState, id);
1033
1034
bool success = true;
1035
1036
// Step 1: Load and expand dest color.
1037
X64Reg dstReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1038
if (!blendState.readsDstPixel) {
1039
// Let's load colorOff just for registers to be consistent.
1040
X64Reg colorOff = GetColorOff(id);
1041
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1042
1043
PXOR(dstReg, R(dstReg));
1044
} else if (id.FBFormat() == GE_FORMAT_8888) {
1045
X64Reg colorOff = GetColorOff(id);
1046
Describe("AlphaBlend");
1047
MOVD_xmm(dstReg, MatR(colorOff));
1048
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1049
} else {
1050
X64Reg colorOff = GetColorOff(id);
1051
Describe("AlphaBlend");
1052
X64Reg dstGenReg = regCache_.Alloc(RegCache::GEN_TEMP0);
1053
MOVZX(32, 16, dstGenReg, MatR(colorOff));
1054
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1055
1056
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
1057
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1058
1059
switch (id.fbFormat) {
1060
case GE_FORMAT_565:
1061
success = success && Jit_ConvertFrom565(id, dstGenReg, temp1Reg, temp2Reg);
1062
break;
1063
1064
case GE_FORMAT_5551:
1065
success = success && Jit_ConvertFrom5551(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
1066
break;
1067
1068
case GE_FORMAT_4444:
1069
success = success && Jit_ConvertFrom4444(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
1070
break;
1071
1072
case GE_FORMAT_8888:
1073
break;
1074
}
1075
1076
Describe("AlphaBlend");
1077
MOVD_xmm(dstReg, R(dstGenReg));
1078
1079
regCache_.Release(dstGenReg, RegCache::GEN_TEMP0);
1080
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
1081
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1082
}
1083
1084
// Step 2: Load and apply factors.
1085
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1086
if (blendState.usesFactors) {
1087
X64Reg srcFactorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1088
X64Reg dstFactorReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1089
1090
// We apply these at 16-bit, because they can be doubled and have a half offset.
1091
if (cpu_info.bSSE4_1) {
1092
if (!colorIs16Bit_)
1093
PMOVZXBW(argColorReg, R(argColorReg));
1094
PMOVZXBW(dstReg, R(dstReg));
1095
} else {
1096
X64Reg zeroReg = GetZeroVec();
1097
if (!colorIs16Bit_)
1098
PUNPCKLBW(argColorReg, R(zeroReg));
1099
PUNPCKLBW(dstReg, R(zeroReg));
1100
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1101
}
1102
colorIs16Bit_ = true;
1103
1104
// Skip multiplying by factors if we can.
1105
bool multiplySrc = id.AlphaBlendSrc() != PixelBlendFactor::ZERO && id.AlphaBlendSrc() != PixelBlendFactor::ONE;
1106
bool multiplyDst = id.AlphaBlendDst() != PixelBlendFactor::ZERO && id.AlphaBlendDst() != PixelBlendFactor::ONE;
1107
// We also shift left by 4, so mulhi gives us a free shift
1108
// We also need to add a half bit later, so this gives us space.
1109
if (multiplySrc || blendState.srcColorAsFactor)
1110
PSLLW(argColorReg, 4);
1111
if (multiplyDst || blendState.dstColorAsFactor || blendState.usesDstAlpha)
1112
PSLLW(dstReg, 4);
1113
1114
// Okay, now grab our factors. Don't bother if they're known values.
1115
if (id.AlphaBlendSrc() < PixelBlendFactor::ZERO)
1116
success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());
1117
if (id.AlphaBlendDst() < PixelBlendFactor::ZERO)
1118
success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);
1119
1120
X64Reg halfReg = INVALID_REG;
1121
if (multiplySrc || multiplyDst) {
1122
halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1123
// We'll use this several times, so load into a reg.
1124
MOVDQA(halfReg, M(constBlendHalf_11_4s_));
1125
}
1126
1127
// Add in the half bit to the factors and color values, then multiply.
1128
// We take the high 16 bits to get a free right shift by 16.
1129
if (multiplySrc) {
1130
POR(srcFactorReg, R(halfReg));
1131
POR(argColorReg, R(halfReg));
1132
PMULHUW(argColorReg, R(srcFactorReg));
1133
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ZERO) {
1134
PXOR(argColorReg, R(argColorReg));
1135
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ONE) {
1136
if (blendState.srcColorAsFactor)
1137
PSRLW(argColorReg, 4);
1138
}
1139
1140
if (multiplyDst) {
1141
POR(dstFactorReg, R(halfReg));
1142
POR(dstReg, R(halfReg));
1143
PMULHUW(dstReg, R(dstFactorReg));
1144
} else if (id.AlphaBlendDst() == PixelBlendFactor::ZERO) {
1145
// No need to add or subtract zero, unless we're negating.
1146
// This is common for bloom preparation.
1147
if (id.AlphaBlendEq() == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
1148
PXOR(dstReg, R(dstReg));
1149
} else if (id.AlphaBlendDst() == PixelBlendFactor::ONE) {
1150
if (blendState.dstColorAsFactor || blendState.usesDstAlpha)
1151
PSRLW(dstReg, 4);
1152
}
1153
1154
regCache_.Release(srcFactorReg, RegCache::VEC_TEMP1);
1155
regCache_.Release(dstFactorReg, RegCache::VEC_TEMP2);
1156
if (halfReg != INVALID_REG)
1157
regCache_.Release(halfReg, RegCache::VEC_TEMP3);
1158
} else if (colorIs16Bit_) {
1159
// If it's expanded, shrink and clamp for our min/max/absdiff handling.
1160
PACKUSWB(argColorReg, R(argColorReg));
1161
colorIs16Bit_ = false;
1162
}
1163
1164
// Step 3: Apply equation.
1165
// Note: below, we completely ignore what happens to the alpha bits.
1166
// It won't matter, since we'll replace those with stencil anyway.
1167
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1168
switch (id.AlphaBlendEq()) {
1169
case GE_BLENDMODE_MUL_AND_ADD:
1170
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
1171
PADDUSW(argColorReg, R(dstReg));
1172
break;
1173
1174
case GE_BLENDMODE_MUL_AND_SUBTRACT:
1175
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
1176
PSUBUSW(argColorReg, R(dstReg));
1177
break;
1178
1179
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
1180
if (cpu_info.bAVX) {
1181
VPSUBUSW(128, argColorReg, dstReg, R(argColorReg));
1182
} else {
1183
MOVDQA(tempReg, R(argColorReg));
1184
MOVDQA(argColorReg, R(dstReg));
1185
PSUBUSW(argColorReg, R(tempReg));
1186
}
1187
break;
1188
1189
case GE_BLENDMODE_MIN:
1190
PMINUB(argColorReg, R(dstReg));
1191
break;
1192
1193
case GE_BLENDMODE_MAX:
1194
PMAXUB(argColorReg, R(dstReg));
1195
break;
1196
1197
case GE_BLENDMODE_ABSDIFF:
1198
// Calculate A=(dst-src < 0 ? 0 : dst-src) and B=(src-dst < 0 ? 0 : src-dst)...
1199
MOVDQA(tempReg, R(dstReg));
1200
PSUBUSB(tempReg, R(argColorReg));
1201
PSUBUSB(argColorReg, R(dstReg));
1202
1203
// Now, one of those must be zero, and the other one is the result (could also be zero.)
1204
POR(argColorReg, R(tempReg));
1205
break;
1206
}
1207
1208
regCache_.Release(dstReg, RegCache::VEC_TEMP0);
1209
regCache_.Release(tempReg, RegCache::VEC_TEMP1);
1210
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1211
1212
return success;
1213
}
1214
1215
bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor) {
1216
X64Reg idReg = INVALID_REG;
1217
X64Reg tempReg = INVALID_REG;
1218
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1219
1220
// Everything below expects an expanded 16-bit color
1221
_assert_(colorIs16Bit_);
1222
1223
// Between source and dest factors, only DSTCOLOR, INVDSTCOLOR, and FIXA differ.
1224
// In those cases, it uses SRCCOLOR, INVSRCCOLOR, and FIXB respectively.
1225
1226
// Load the invert constant first off, if needed.
1227
switch (factor) {
1228
case PixelBlendFactor::INVOTHERCOLOR:
1229
case PixelBlendFactor::INVSRCALPHA:
1230
case PixelBlendFactor::INVDSTALPHA:
1231
case PixelBlendFactor::DOUBLEINVSRCALPHA:
1232
case PixelBlendFactor::DOUBLEINVDSTALPHA:
1233
MOVDQA(factorReg, M(constBlendInvert_11_4s_));
1234
break;
1235
1236
default:
1237
break;
1238
}
1239
1240
switch (factor) {
1241
case PixelBlendFactor::OTHERCOLOR:
1242
MOVDQA(factorReg, R(dstReg));
1243
break;
1244
1245
case PixelBlendFactor::INVOTHERCOLOR:
1246
PSUBUSW(factorReg, R(dstReg));
1247
break;
1248
1249
case PixelBlendFactor::SRCALPHA:
1250
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1251
break;
1252
1253
case PixelBlendFactor::INVSRCALPHA:
1254
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1255
1256
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1257
PSUBUSW(factorReg, R(tempReg));
1258
break;
1259
1260
case PixelBlendFactor::DSTALPHA:
1261
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1262
break;
1263
1264
case PixelBlendFactor::INVDSTALPHA:
1265
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1266
1267
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1268
PSUBUSW(factorReg, R(tempReg));
1269
break;
1270
1271
case PixelBlendFactor::DOUBLESRCALPHA:
1272
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1273
PSLLW(factorReg, 1);
1274
break;
1275
1276
case PixelBlendFactor::DOUBLEINVSRCALPHA:
1277
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1278
1279
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1280
PSLLW(tempReg, 1);
1281
PSUBUSW(factorReg, R(tempReg));
1282
break;
1283
1284
case PixelBlendFactor::DOUBLEDSTALPHA:
1285
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1286
PSLLW(factorReg, 1);
1287
break;
1288
1289
case PixelBlendFactor::DOUBLEINVDSTALPHA:
1290
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1291
1292
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1293
PSLLW(tempReg, 1);
1294
PSUBUSW(factorReg, R(tempReg));
1295
break;
1296
1297
case PixelBlendFactor::ZERO:
1298
// Special value meaning zero.
1299
PXOR(factorReg, R(factorReg));
1300
break;
1301
1302
case PixelBlendFactor::ONE:
1303
// Special value meaning all 255s.
1304
PCMPEQD(factorReg, R(factorReg));
1305
PSLLW(factorReg, 8);
1306
PSRLW(factorReg, 4);
1307
break;
1308
1309
case PixelBlendFactor::FIX:
1310
default:
1311
idReg = GetPixelID();
1312
if (cpu_info.bSSE4_1) {
1313
PMOVZXBW(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
1314
} else {
1315
X64Reg zeroReg = GetZeroVec();
1316
MOVD_xmm(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
1317
PUNPCKLBW(factorReg, R(zeroReg));
1318
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1319
}
1320
// Round it out by shifting into place.
1321
PSLLW(factorReg, 4);
1322
break;
1323
}
1324
1325
if (idReg != INVALID_REG)
1326
UnlockPixelID(idReg);
1327
if (tempReg != INVALID_REG)
1328
regCache_.Release(tempReg, RegCache::VEC_TEMP3);
1329
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1330
1331
return true;
1332
}
1333
1334
bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, RegCache::Reg srcFactorReg, RegCache::Reg dstFactorReg, RegCache::Reg dstReg) {
1335
bool success = true;
1336
X64Reg idReg = INVALID_REG;
1337
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1338
1339
// Everything below expects an expanded 16-bit color
1340
_assert_(colorIs16Bit_);
1341
1342
PixelBlendState blendState;
1343
ComputePixelBlendState(blendState, id);
1344
1345
// We might be able to reuse srcFactorReg for dst, in some cases.
1346
switch (id.AlphaBlendDst()) {
1347
case PixelBlendFactor::OTHERCOLOR:
1348
MOVDQA(dstFactorReg, R(argColorReg));
1349
break;
1350
1351
case PixelBlendFactor::INVOTHERCOLOR:
1352
MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
1353
PSUBUSW(dstFactorReg, R(argColorReg));
1354
break;
1355
1356
case PixelBlendFactor::SRCALPHA:
1357
case PixelBlendFactor::INVSRCALPHA:
1358
case PixelBlendFactor::DSTALPHA:
1359
case PixelBlendFactor::INVDSTALPHA:
1360
case PixelBlendFactor::DOUBLESRCALPHA:
1361
case PixelBlendFactor::DOUBLEINVSRCALPHA:
1362
case PixelBlendFactor::DOUBLEDSTALPHA:
1363
case PixelBlendFactor::DOUBLEINVDSTALPHA:
1364
case PixelBlendFactor::ZERO:
1365
case PixelBlendFactor::ONE:
1366
// These are all equivalent for src factor, so reuse that logic.
1367
if (id.AlphaBlendSrc() == id.AlphaBlendDst()) {
1368
MOVDQA(dstFactorReg, R(srcFactorReg));
1369
} else if (blendState.dstFactorIsInverse) {
1370
MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
1371
PSUBUSW(dstFactorReg, R(srcFactorReg));
1372
} else {
1373
success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, id.AlphaBlendDst());
1374
}
1375
break;
1376
1377
case PixelBlendFactor::FIX:
1378
default:
1379
idReg = GetPixelID();
1380
if (cpu_info.bSSE4_1) {
1381
PMOVZXBW(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
1382
} else {
1383
X64Reg zeroReg = GetZeroVec();
1384
MOVD_xmm(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
1385
PUNPCKLBW(dstFactorReg, R(zeroReg));
1386
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1387
}
1388
// Round it out by shifting into place.
1389
PSLLW(dstFactorReg, 4);
1390
break;
1391
}
1392
1393
if (idReg != INVALID_REG)
1394
UnlockPixelID(idReg);
1395
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1396
1397
return success;
1398
}
1399
1400
bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
1401
if (!id.dithering)
1402
return true;
1403
1404
Describe("Dither");
1405
X64Reg valueReg = regCache_.Alloc(RegCache::GEN_TEMP0);
1406
1407
// Load the row dither matrix entry (will still need to get the X.)
1408
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
1409
MOV(32, R(valueReg), R(argYReg));
1410
AND(32, R(valueReg), Imm8(3));
1411
1412
// At this point, we're done with depth and y, so let's grab GEN_COLOR_OFF and retain it.
1413
// Then we can modify x and throw it away too, which is our actual goal.
1414
X64Reg colorOffReg = GetColorOff(id);
1415
Describe("Dither");
1416
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
1417
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
1418
// And get rid of y, we can use for other regs.
1419
regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
1420
regCache_.ForceRelease(RegCache::GEN_ARG_Y);
1421
1422
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
1423
AND(32, R(argXReg), Imm32(3));
1424
1425
// Sum up (x + y * 4) + ditherMatrix offset to valueReg.
1426
LEA(32, valueReg, MComplex(argXReg, valueReg, 4, offsetof(PixelFuncID, cached.ditherMatrix)));
1427
1428
// Okay, now abuse argXReg to read the PixelFuncID pointer on the stack.
1429
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
1430
X64Reg idReg = GetPixelID();
1431
MOVSX(32, 8, valueReg, MRegSum(idReg, valueReg));
1432
UnlockPixelID(idReg);
1433
} else {
1434
_assert_(stackIDOffset_ != -1);
1435
MOV(PTRBITS, R(argXReg), MDisp(RSP, stackIDOffset_));
1436
MOVSX(32, 8, valueReg, MRegSum(argXReg, valueReg));
1437
}
1438
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
1439
regCache_.ForceRelease(RegCache::GEN_ARG_X);
1440
1441
// Copy that value into a vec to add to the color.
1442
X64Reg vecValueReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1443
MOVD_xmm(vecValueReg, R(valueReg));
1444
regCache_.Release(valueReg, RegCache::GEN_TEMP0);
1445
1446
// Now we want to broadcast RGB in 16-bit, but keep A as 0.
1447
// Luckily, we know that third lane (in 16-bit) is zero from MOVD clearing it.
1448
// We use 16-bit because we need a signed add, but we also want to saturate.
1449
PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(2, 0, 0, 0));
1450
1451
// With that, now let's convert the color to 16 bit...
1452
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1453
if (!colorIs16Bit_) {
1454
if (cpu_info.bSSE4_1) {
1455
PMOVZXBW(argColorReg, R(argColorReg));
1456
} else {
1457
X64Reg zeroReg = GetZeroVec();
1458
PUNPCKLBW(argColorReg, R(zeroReg));
1459
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1460
}
1461
colorIs16Bit_ = true;
1462
}
1463
// And simply add the dither values.
1464
PADDSW(argColorReg, R(vecValueReg));
1465
regCache_.Release(vecValueReg, RegCache::VEC_TEMP0);
1466
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1467
1468
return true;
1469
}
1470
1471
bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
1472
X64Reg colorOff = GetColorOff(id);
1473
Describe("WriteColor");
1474
if (regCache_.Has(RegCache::GEN_ARG_X)) {
1475
// We normally toss x and y during dithering or useStandardStride with no dithering.
1476
// Free up the regs now to get more reg space.
1477
regCache_.ForceRelease(RegCache::GEN_ARG_X);
1478
regCache_.ForceRelease(RegCache::GEN_ARG_Y);
1479
1480
// But make sure we don't lose GEN_COLOR_OFF, we'll be lost without that now.
1481
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
1482
}
1483
1484
// Convert back to 8888 and clamp.
1485
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1486
if (colorIs16Bit_) {
1487
PACKUSWB(argColorReg, R(argColorReg));
1488
colorIs16Bit_ = false;
1489
}
1490
1491
if (id.clearMode) {
1492
bool drawingDone = false;
1493
if (!id.ColorClear() && !id.StencilClear())
1494
drawingDone = true;
1495
if (!id.ColorClear() && id.FBFormat() == GE_FORMAT_565)
1496
drawingDone = true;
1497
1498
bool success = true;
1499
if (!id.ColorClear() && !drawingDone) {
1500
// Let's reuse Jit_WriteStencilOnly for this path.
1501
X64Reg alphaReg;
1502
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
1503
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
1504
} else {
1505
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
1506
MOVD_xmm(R(alphaReg), argColorReg);
1507
SHR(32, R(alphaReg), Imm8(24));
1508
}
1509
success = Jit_WriteStencilOnly(id, alphaReg);
1510
regCache_.Release(alphaReg, RegCache::GEN_SRC_ALPHA);
1511
1512
drawingDone = true;
1513
}
1514
1515
if (drawingDone) {
1516
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1517
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
1518
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1519
regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
1520
return success;
1521
}
1522
1523
// In this case, we're clearing only color or only color and stencil. Proceed.
1524
}
1525
1526
X64Reg colorReg = regCache_.Alloc(RegCache::GEN_TEMP0);
1527
MOVD_xmm(R(colorReg), argColorReg);
1528
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1529
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
1530
1531
X64Reg stencilReg = INVALID_REG;
1532
if (regCache_.Has(RegCache::GEN_STENCIL))
1533
stencilReg = regCache_.Find(RegCache::GEN_STENCIL);
1534
1535
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
1536
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1537
bool convertAlpha = id.clearMode && id.StencilClear();
1538
bool writeAlpha = convertAlpha || stencilReg != INVALID_REG;
1539
uint32_t fixedKeepMask = 0x00000000;
1540
1541
bool success = true;
1542
1543
// Step 1: Load the color into colorReg.
1544
switch (id.fbFormat) {
1545
case GE_FORMAT_565:
1546
// In this case, stencil doesn't matter.
1547
success = success && Jit_ConvertTo565(id, colorReg, temp1Reg, temp2Reg);
1548
break;
1549
1550
case GE_FORMAT_5551:
1551
success = success && Jit_ConvertTo5551(id, colorReg, temp1Reg, temp2Reg, convertAlpha);
1552
1553
if (stencilReg != INVALID_REG) {
1554
// Truncate off the top bit of the stencil.
1555
SHR(32, R(stencilReg), Imm8(7));
1556
SHL(32, R(stencilReg), Imm8(15));
1557
} else if (!writeAlpha) {
1558
fixedKeepMask = 0x8000;
1559
}
1560
break;
1561
1562
case GE_FORMAT_4444:
1563
success = success && Jit_ConvertTo4444(id, colorReg, temp1Reg, temp2Reg, convertAlpha);
1564
1565
if (stencilReg != INVALID_REG) {
1566
// Truncate off the top bit of the stencil.
1567
SHR(32, R(stencilReg), Imm8(4));
1568
SHL(32, R(stencilReg), Imm8(12));
1569
} else if (!writeAlpha) {
1570
fixedKeepMask = 0xF000;
1571
}
1572
break;
1573
1574
case GE_FORMAT_8888:
1575
if (stencilReg != INVALID_REG) {
1576
SHL(32, R(stencilReg), Imm8(24));
1577
// Clear out the alpha bits so we can fit the stencil.
1578
AND(32, R(colorReg), Imm32(0x00FFFFFF));
1579
} else if (!writeAlpha) {
1580
fixedKeepMask = 0xFF000000;
1581
}
1582
break;
1583
}
1584
1585
// Step 2: Load write mask if needed.
1586
// Note that we apply the write mask at the destination bit depth.
1587
Describe("WriteColor");
1588
X64Reg maskReg = INVALID_REG;
1589
if (id.applyColorWriteMask) {
1590
maskReg = regCache_.Alloc(RegCache::GEN_TEMP3);
1591
// Load the pre-converted and combined write mask.
1592
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
1593
X64Reg idReg = GetPixelID();
1594
MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask)));
1595
UnlockPixelID(idReg);
1596
} else {
1597
_assert_(stackIDOffset_ != -1);
1598
MOV(PTRBITS, R(maskReg), MDisp(RSP, stackIDOffset_));
1599
MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask)));
1600
}
1601
}
1602
1603
// We've run out of regs, let's live without temp2 from here on.
1604
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1605
1606
// Step 3: Apply logic op, combine stencil.
1607
skipStandardWrites_.clear();
1608
if (id.applyLogicOp) {
1609
// Note: we combine stencil during logic op, because it's a bit complex to retain.
1610
success = success && Jit_ApplyLogicOp(id, colorReg, maskReg);
1611
} else if (stencilReg != INVALID_REG) {
1612
OR(32, R(colorReg), R(stencilReg));
1613
}
1614
1615
// Step 4: Write and apply write mask.
1616
Describe("WriteColor");
1617
switch (id.fbFormat) {
1618
case GE_FORMAT_565:
1619
case GE_FORMAT_5551:
1620
case GE_FORMAT_4444:
1621
if (maskReg != INVALID_REG) {
1622
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
1623
AND(16, MatR(colorOff), R(maskReg));
1624
if (cpu_info.bBMI1) {
1625
ANDN(32, colorReg, maskReg, R(colorReg));
1626
} else {
1627
NOT(32, R(maskReg));
1628
AND(32, R(colorReg), R(maskReg));
1629
}
1630
OR(16, MatR(colorOff), R(colorReg));
1631
} else if (fixedKeepMask == 0) {
1632
MOV(16, MatR(colorOff), R(colorReg));
1633
} else {
1634
// Clear the non-stencil bits and or in the color.
1635
AND(16, MatR(colorOff), Imm16((uint16_t)fixedKeepMask));
1636
OR(16, MatR(colorOff), R(colorReg));
1637
}
1638
break;
1639
1640
case GE_FORMAT_8888:
1641
if (maskReg != INVALID_REG) {
1642
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
1643
AND(32, MatR(colorOff), R(maskReg));
1644
if (cpu_info.bBMI1) {
1645
ANDN(32, colorReg, maskReg, R(colorReg));
1646
} else {
1647
NOT(32, R(maskReg));
1648
AND(32, R(colorReg), R(maskReg));
1649
}
1650
OR(32, MatR(colorOff), R(colorReg));
1651
} else if (fixedKeepMask == 0) {
1652
MOV(32, MatR(colorOff), R(colorReg));
1653
} else if (fixedKeepMask == 0xFF000000) {
1654
// We want to set 24 bits only, since we're not changing stencil.
1655
// For now, let's do two writes rather than reading in the old stencil.
1656
MOV(16, MatR(colorOff), R(colorReg));
1657
SHR(32, R(colorReg), Imm8(16));
1658
MOV(8, MDisp(colorOff, 2), R(colorReg));
1659
} else {
1660
AND(32, MatR(colorOff), Imm32(fixedKeepMask));
1661
OR(32, MatR(colorOff), R(colorReg));
1662
}
1663
break;
1664
}
1665
1666
for (FixupBranch &fixup : skipStandardWrites_)
1667
SetJumpTarget(fixup);
1668
skipStandardWrites_.clear();
1669
1670
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1671
regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
1672
regCache_.Release(colorReg, RegCache::GEN_TEMP0);
1673
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
1674
if (maskReg != INVALID_REG)
1675
regCache_.Release(maskReg, RegCache::GEN_TEMP3);
1676
if (stencilReg != INVALID_REG) {
1677
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
1678
regCache_.ForceRelease(RegCache::GEN_STENCIL);
1679
}
1680
1681
return success;
1682
}
1683
1684
bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg maskReg) {
1685
Describe("LogicOp");
1686
X64Reg logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
1687
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
1688
X64Reg idReg = GetPixelID();
1689
MOVZX(32, 8, logicOpReg, MDisp(idReg, offsetof(PixelFuncID, cached.logicOp)));
1690
UnlockPixelID(idReg);
1691
} else {
1692
_assert_(stackIDOffset_ != -1);
1693
MOV(PTRBITS, R(logicOpReg), MDisp(RSP, stackIDOffset_));
1694
MOVZX(32, 8, logicOpReg, MDisp(logicOpReg, offsetof(PixelFuncID, cached.logicOp)));
1695
}
1696
1697
X64Reg stencilReg = INVALID_REG;
1698
if (regCache_.Has(RegCache::GEN_STENCIL))
1699
stencilReg = regCache_.Find(RegCache::GEN_STENCIL);
1700
1701
// Should already be allocated.
1702
X64Reg colorOff = regCache_.Find(RegCache::GEN_COLOR_OFF);
1703
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP5);
1704
1705
// We'll use these in several cases, so prepare.
1706
int bits = id.fbFormat == GE_FORMAT_8888 ? 32 : 16;
1707
OpArg stencilMask, notStencilMask;
1708
switch (id.fbFormat) {
1709
case GE_FORMAT_565:
1710
stencilMask = Imm16(0);
1711
notStencilMask = Imm16(0xFFFF);
1712
break;
1713
case GE_FORMAT_5551:
1714
stencilMask = Imm16(0x8000);
1715
notStencilMask = Imm16(0x7FFF);
1716
break;
1717
case GE_FORMAT_4444:
1718
stencilMask = Imm16(0xF000);
1719
notStencilMask = Imm16(0x0FFF);
1720
break;
1721
case GE_FORMAT_8888:
1722
stencilMask = Imm32(0xFF000000);
1723
notStencilMask = Imm32(0x00FFFFFF);
1724
break;
1725
}
1726
1727
std::vector<FixupBranch> finishes;
1728
finishes.reserve(11);
1729
FixupBranch skipTable = J(true);
1730
const u8 *tableValues[16]{};
1731
1732
tableValues[GE_LOGIC_CLEAR] = GetCodePointer();
1733
if (stencilReg != INVALID_REG) {
1734
// If clearing and setting the stencil, that's easy - stencilReg has it.
1735
MOV(32, R(colorReg), R(stencilReg));
1736
finishes.push_back(J(true));
1737
} else if (maskReg != INVALID_REG) {
1738
// Just and out the unmasked bits (stencil already included in maskReg.)
1739
AND(bits, MatR(colorOff), R(maskReg));
1740
skipStandardWrites_.push_back(J(true));
1741
} else {
1742
// Otherwise, no mask, just AND the stencil bits to zero the rest.
1743
AND(bits, MatR(colorOff), stencilMask);
1744
skipStandardWrites_.push_back(J(true));
1745
}
1746
1747
tableValues[GE_LOGIC_AND] = GetCodePointer();
1748
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1749
// Since we're ANDing, set the mask bits (AND will keep them as-is.)
1750
OR(32, R(colorReg), R(maskReg));
1751
OR(32, R(colorReg), R(stencilReg));
1752
1753
// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
1754
NOT(32, R(maskReg));
1755
AND(bits, R(maskReg), stencilMask);
1756
OR(bits, MatR(colorOff), R(maskReg));
1757
} else if (stencilReg != INVALID_REG) {
1758
OR(32, R(colorReg), R(stencilReg));
1759
// No mask, so just or in the stencil bits so our AND can set any we want.
1760
OR(bits, MatR(colorOff), stencilMask);
1761
} else if (maskReg != INVALID_REG) {
1762
// Force in the mask (which includes all stencil bits) so both are kept as-is.
1763
OR(32, R(colorReg), R(maskReg));
1764
} else {
1765
// Force on the stencil bits so they AND and keep the existing value.
1766
if (stencilMask.GetImmValue() != 0)
1767
OR(bits, R(colorReg), stencilMask);
1768
}
1769
// Now the AND, which applies stencil and the logic op.
1770
AND(bits, MatR(colorOff), R(colorReg));
1771
skipStandardWrites_.push_back(J(true));
1772
1773
tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();
1774
// Reverse memory in a temp reg so we can apply the write mask easily.
1775
MOV(bits, R(temp1Reg), MatR(colorOff));
1776
if (cpu_info.bBMI1) {
1777
ANDN(32, colorReg, temp1Reg, R(colorReg));
1778
} else {
1779
NOT(32, R(temp1Reg));
1780
AND(32, R(colorReg), R(temp1Reg));
1781
}
1782
// Now add in the stencil bits (must be zero before, since we used AND.)
1783
if (stencilReg != INVALID_REG) {
1784
OR(32, R(colorReg), R(stencilReg));
1785
}
1786
finishes.push_back(J(true));
1787
1788
tableValues[GE_LOGIC_COPY] = GetCodePointer();
1789
// This is just a standard write, nothing complex.
1790
if (stencilReg != INVALID_REG) {
1791
OR(32, R(colorReg), R(stencilReg));
1792
}
1793
finishes.push_back(J(true));
1794
1795
tableValues[GE_LOGIC_AND_INVERTED] = GetCodePointer();
1796
if (stencilReg != INVALID_REG) {
1797
// Set the stencil bits, so they're zero when we invert.
1798
OR(bits, R(colorReg), stencilMask);
1799
NOT(32, R(colorReg));
1800
OR(32, R(colorReg), R(stencilReg));
1801
1802
if (maskReg != INVALID_REG) {
1803
// This way our AND will keep all those bits.
1804
OR(32, R(colorReg), R(maskReg));
1805
1806
// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
1807
NOT(32, R(maskReg));
1808
AND(bits, R(maskReg), stencilMask);
1809
OR(bits, MatR(colorOff), R(maskReg));
1810
} else {
1811
// Force memory to take our stencil bits by ORing for the AND.
1812
OR(bits, MatR(colorOff), stencilMask);
1813
}
1814
} else if (maskReg != INVALID_REG) {
1815
NOT(32, R(colorReg));
1816
// This way our AND will keep all those bits.
1817
OR(32, R(colorReg), R(maskReg));
1818
} else {
1819
// Invert our color, but then add in stencil bits so the AND keeps them.
1820
NOT(32, R(colorReg));
1821
// We only do this for 8888 since the rest will have had 0 stencil bits (which turned to 1s.)
1822
if (id.FBFormat() == GE_FORMAT_8888)
1823
OR(bits, R(colorReg), stencilMask);
1824
}
1825
AND(bits, MatR(colorOff), R(colorReg));
1826
skipStandardWrites_.push_back(J(true));
1827
1828
tableValues[GE_LOGIC_NOOP] = GetCodePointer();
1829
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1830
// Start by clearing masked bits from stencilReg.
1831
if (cpu_info.bBMI1) {
1832
ANDN(32, stencilReg, maskReg, R(stencilReg));
1833
} else {
1834
NOT(32, R(maskReg));
1835
AND(32, R(stencilReg), R(maskReg));
1836
NOT(32, R(maskReg));
1837
}
1838
1839
// Now mask out the stencil bits we're writing from memory.
1840
OR(bits, R(maskReg), notStencilMask);
1841
AND(bits, MatR(colorOff), R(maskReg));
1842
1843
// Now set those remaining stencil bits.
1844
OR(bits, MatR(colorOff), R(stencilReg));
1845
skipStandardWrites_.push_back(J(true));
1846
} else if (stencilReg != INVALID_REG) {
1847
// Clear and set just the stencil bits.
1848
AND(bits, MatR(colorOff), notStencilMask);
1849
OR(bits, MatR(colorOff), R(stencilReg));
1850
skipStandardWrites_.push_back(J(true));
1851
} else {
1852
Discard();
1853
}
1854
1855
tableValues[GE_LOGIC_XOR] = GetCodePointer();
1856
XOR(bits, R(colorReg), MatR(colorOff));
1857
if (stencilReg != INVALID_REG) {
1858
// Purge out the stencil bits from the XOR and copy ours in.
1859
AND(bits, R(colorReg), notStencilMask);
1860
OR(32, R(colorReg), R(stencilReg));
1861
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1862
// XOR might've set some bits, and without a maskReg we won't clear them.
1863
AND(bits, R(colorReg), notStencilMask);
1864
}
1865
finishes.push_back(J(true));
1866
1867
tableValues[GE_LOGIC_OR] = GetCodePointer();
1868
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1869
OR(32, R(colorReg), R(stencilReg));
1870
1871
// Clear the bits we should be masking out.
1872
if (cpu_info.bBMI1) {
1873
ANDN(32, colorReg, maskReg, R(colorReg));
1874
} else {
1875
NOT(32, R(maskReg));
1876
AND(32, R(colorReg), R(maskReg));
1877
NOT(32, R(maskReg));
1878
}
1879
1880
// Clear all the unmasked stencil bits, so we can set our own.
1881
OR(bits, R(maskReg), notStencilMask);
1882
AND(bits, MatR(colorOff), R(maskReg));
1883
} else if (stencilReg != INVALID_REG) {
1884
OR(32, R(colorReg), R(stencilReg));
1885
// AND out the stencil bits so we set our own.
1886
AND(bits, MatR(colorOff), notStencilMask);
1887
} else if (maskReg != INVALID_REG) {
1888
// Clear the bits we should be masking out.
1889
if (cpu_info.bBMI1) {
1890
ANDN(32, colorReg, maskReg, R(colorReg));
1891
} else {
1892
NOT(32, R(maskReg));
1893
AND(32, R(colorReg), R(maskReg));
1894
}
1895
} else if (id.FBFormat() == GE_FORMAT_8888) {
1896
// We only need to do this for 8888, the others already have 0 stencil.
1897
AND(bits, R(colorReg), notStencilMask);
1898
}
1899
// Now the OR, which applies stencil and the logic op itself.
1900
OR(bits, MatR(colorOff), R(colorReg));
1901
skipStandardWrites_.push_back(J(true));
1902
1903
tableValues[GE_LOGIC_NOR] = GetCodePointer();
1904
OR(bits, R(colorReg), MatR(colorOff));
1905
NOT(32, R(colorReg));
1906
if (stencilReg != INVALID_REG) {
1907
AND(bits, R(colorReg), notStencilMask);
1908
OR(32, R(colorReg), R(stencilReg));
1909
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1910
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1911
AND(bits, R(colorReg), notStencilMask);
1912
}
1913
finishes.push_back(J(true));
1914
1915
tableValues[GE_LOGIC_EQUIV] = GetCodePointer();
1916
XOR(bits, R(colorReg), MatR(colorOff));
1917
NOT(32, R(colorReg));
1918
if (stencilReg != INVALID_REG) {
1919
AND(bits, R(colorReg), notStencilMask);
1920
OR(32, R(colorReg), R(stencilReg));
1921
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1922
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1923
AND(bits, R(colorReg), notStencilMask);
1924
}
1925
finishes.push_back(J(true));
1926
1927
tableValues[GE_LOGIC_INVERTED] = GetCodePointer();
1928
// We just toss our color entirely.
1929
MOV(bits, R(colorReg), MatR(colorOff));
1930
NOT(32, R(colorReg));
1931
if (stencilReg != INVALID_REG) {
1932
AND(bits, R(colorReg), notStencilMask);
1933
OR(32, R(colorReg), R(stencilReg));
1934
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1935
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1936
AND(bits, R(colorReg), notStencilMask);
1937
}
1938
finishes.push_back(J(true));
1939
1940
tableValues[GE_LOGIC_OR_REVERSE] = GetCodePointer();
1941
// Reverse in a temp reg so we can mask properly.
1942
MOV(bits, R(temp1Reg), MatR(colorOff));
1943
NOT(32, R(temp1Reg));
1944
OR(32, R(colorReg), R(temp1Reg));
1945
if (stencilReg != INVALID_REG) {
1946
AND(bits, R(colorReg), notStencilMask);
1947
OR(32, R(colorReg), R(stencilReg));
1948
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1949
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1950
AND(bits, R(colorReg), notStencilMask);
1951
}
1952
finishes.push_back(J(true));
1953
1954
tableValues[GE_LOGIC_COPY_INVERTED] = GetCodePointer();
1955
NOT(32, R(colorReg));
1956
if (stencilReg != INVALID_REG) {
1957
AND(bits, R(colorReg), notStencilMask);
1958
OR(32, R(colorReg), R(stencilReg));
1959
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1960
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1961
AND(bits, R(colorReg), notStencilMask);
1962
}
1963
finishes.push_back(J(true));
1964
1965
tableValues[GE_LOGIC_OR_INVERTED] = GetCodePointer();
1966
NOT(32, R(colorReg));
1967
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1968
AND(bits, R(colorReg), notStencilMask);
1969
OR(32, R(colorReg), R(stencilReg));
1970
1971
// Clear the bits we should be masking out.
1972
if (cpu_info.bBMI1) {
1973
ANDN(32, colorReg, maskReg, R(colorReg));
1974
} else {
1975
NOT(32, R(maskReg));
1976
AND(32, R(colorReg), R(maskReg));
1977
NOT(32, R(maskReg));
1978
}
1979
1980
// Clear all the unmasked stencil bits, so we can set our own.
1981
OR(bits, R(maskReg), notStencilMask);
1982
AND(bits, MatR(colorOff), R(maskReg));
1983
} else if (stencilReg != INVALID_REG) {
1984
AND(bits, R(colorReg), notStencilMask);
1985
OR(32, R(colorReg), R(stencilReg));
1986
// AND out the stencil bits so we set our own.
1987
AND(bits, MatR(colorOff), notStencilMask);
1988
} else if (maskReg != INVALID_REG) {
1989
// Clear the bits we should be masking out.
1990
NOT(32, R(maskReg));
1991
AND(32, R(colorReg), R(maskReg));
1992
} else if (id.FBFormat() == GE_FORMAT_8888) {
1993
// We only need to do this for 8888, the others already have 0 stencil.
1994
AND(bits, R(colorReg), notStencilMask);
1995
}
1996
OR(bits, MatR(colorOff), R(colorReg));
1997
skipStandardWrites_.push_back(J(true));
1998
1999
tableValues[GE_LOGIC_NAND] = GetCodePointer();
2000
AND(bits, R(temp1Reg), MatR(colorOff));
2001
NOT(32, R(colorReg));
2002
if (stencilReg != INVALID_REG) {
2003
AND(bits, R(colorReg), notStencilMask);
2004
OR(32, R(colorReg), R(stencilReg));
2005
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
2006
// We need to clear the stencil bits since the standard write logic assumes they're zero.
2007
AND(bits, R(colorReg), notStencilMask);
2008
}
2009
finishes.push_back(J(true));
2010
2011
tableValues[GE_LOGIC_SET] = GetCodePointer();
2012
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
2013
OR(32, R(colorReg), R(stencilReg));
2014
OR(bits, R(colorReg), notStencilMask);
2015
finishes.push_back(J(true));
2016
} else if (stencilReg != INVALID_REG) {
2017
// Set bits directly in stencilReg, and then put in memory.
2018
OR(bits, R(stencilReg), notStencilMask);
2019
MOV(bits, MatR(colorOff), R(stencilReg));
2020
skipStandardWrites_.push_back(J(true));
2021
} else if (maskReg != INVALID_REG) {
2022
// OR in the bits we're allowed to write (won't be any stencil.)
2023
NOT(32, R(maskReg));
2024
OR(bits, MatR(colorOff), R(maskReg));
2025
skipStandardWrites_.push_back(J(true));
2026
} else {
2027
OR(bits, MatR(colorOff), notStencilMask);
2028
skipStandardWrites_.push_back(J(true));
2029
}
2030
2031
const u8 *tablePtr = GetCodePointer();
2032
for (int i = 0; i < 16; ++i) {
2033
Write64((uintptr_t)tableValues[i]);
2034
}
2035
2036
SetJumpTarget(skipTable);
2037
LEA(64, temp1Reg, M(tablePtr));
2038
JMPptr(MComplex(temp1Reg, logicOpReg, 8, 0));
2039
2040
for (FixupBranch &fixup : finishes)
2041
SetJumpTarget(fixup);
2042
2043
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
2044
regCache_.Release(logicOpReg, RegCache::GEN_TEMP4);
2045
regCache_.Release(temp1Reg, RegCache::GEN_TEMP5);
2046
if (stencilReg != INVALID_REG)
2047
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
2048
2049
return true;
2050
}
2051
2052
bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
2053
Describe("ConvertTo565");
2054
2055
if (cpu_info.bBMI2_fast) {
2056
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
2057
PEXT(32, colorReg, colorReg, R(temp1Reg));
2058
return true;
2059
}
2060
2061
// Assemble the 565 color, starting with R...
2062
MOV(32, R(temp1Reg), R(colorReg));
2063
SHR(32, R(temp1Reg), Imm8(3));
2064
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2065
2066
// For G, move right 5 (because the top 6 are offset by 10.)
2067
MOV(32, R(temp2Reg), R(colorReg));
2068
SHR(32, R(temp2Reg), Imm8(5));
2069
AND(16, R(temp2Reg), Imm16(0x3F << 5));
2070
OR(32, R(temp1Reg), R(temp2Reg));
2071
2072
// And finally B, move right 8 (top 5 are offset by 19.)
2073
SHR(32, R(colorReg), Imm8(8));
2074
AND(16, R(colorReg), Imm16(0x1F << 11));
2075
OR(32, R(colorReg), R(temp1Reg));
2076
2077
return true;
2078
}
2079
2080
bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2081
Describe("ConvertTo5551");
2082
2083
if (cpu_info.bBMI2_fast) {
2084
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x80F8F8F8 : 0x00F8F8F8));
2085
PEXT(32, colorReg, colorReg, R(temp1Reg));
2086
return true;
2087
}
2088
2089
// This is R, pretty simple.
2090
MOV(32, R(temp1Reg), R(colorReg));
2091
SHR(32, R(temp1Reg), Imm8(3));
2092
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2093
2094
// G moves right 6, to match the top 5 at 11.
2095
MOV(32, R(temp2Reg), R(colorReg));
2096
SHR(32, R(temp2Reg), Imm8(6));
2097
AND(16, R(temp2Reg), Imm16(0x1F << 5));
2098
OR(32, R(temp1Reg), R(temp2Reg));
2099
2100
if (keepAlpha) {
2101
// Grab A into tempReg2 before handling B.
2102
MOV(32, R(temp2Reg), R(colorReg));
2103
SHR(32, R(temp2Reg), Imm8(31));
2104
SHL(32, R(temp2Reg), Imm8(15));
2105
}
2106
2107
// B moves right 9, to match the top 5 at 19.
2108
SHR(32, R(colorReg), Imm8(9));
2109
AND(16, R(colorReg), Imm16(0x1F << 10));
2110
OR(32, R(colorReg), R(temp1Reg));
2111
2112
if (keepAlpha)
2113
OR(32, R(colorReg), R(temp2Reg));
2114
2115
return true;
2116
}
2117
2118
bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2119
Describe("ConvertTo4444");
2120
2121
if (cpu_info.bBMI2_fast) {
2122
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
2123
PEXT(32, colorReg, colorReg, R(temp1Reg));
2124
return true;
2125
}
2126
2127
// Shift and mask out R.
2128
MOV(32, R(temp1Reg), R(colorReg));
2129
SHR(32, R(temp1Reg), Imm8(4));
2130
AND(16, R(temp1Reg), Imm16(0xF << 0));
2131
2132
// Shift G into position and mask.
2133
MOV(32, R(temp2Reg), R(colorReg));
2134
SHR(32, R(temp2Reg), Imm8(8));
2135
AND(16, R(temp2Reg), Imm16(0xF << 4));
2136
OR(32, R(temp1Reg), R(temp2Reg));
2137
2138
if (keepAlpha) {
2139
// Grab A into tempReg2 before handling B.
2140
MOV(32, R(temp2Reg), R(colorReg));
2141
SHR(32, R(temp2Reg), Imm8(28));
2142
SHL(32, R(temp2Reg), Imm8(12));
2143
}
2144
2145
// B moves right 12, to match the top 4 at 20.
2146
SHR(32, R(colorReg), Imm8(12));
2147
AND(16, R(colorReg), Imm16(0xF << 8));
2148
OR(32, R(colorReg), R(temp1Reg));
2149
2150
if (keepAlpha)
2151
OR(32, R(colorReg), R(temp2Reg));
2152
2153
return true;
2154
}
2155
2156
bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
2157
Describe("ConvertFrom565");
2158
2159
if (cpu_info.bBMI2_fast) {
2160
// Start off with the high bits.
2161
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
2162
PDEP(32, temp1Reg, colorReg, R(temp1Reg));
2163
2164
// Now grab the low bits (they end up packed.)
2165
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
2166
PEXT(32, colorReg, colorReg, R(temp2Reg));
2167
// And spread them back out.
2168
MOV(32, R(temp2Reg), Imm32(0x00070307));
2169
PDEP(32, colorReg, colorReg, R(temp2Reg));
2170
2171
// Finally put the high bits in, we're done.
2172
OR(32, R(colorReg), R(temp1Reg));
2173
return true;
2174
}
2175
2176
// Filter out red only into temp1.
2177
MOV(32, R(temp1Reg), R(colorReg));
2178
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2179
// Move it left to the top of the 8 bits.
2180
SHL(32, R(temp1Reg), Imm8(3));
2181
2182
// Now we bring in blue, since it's also 5 like red.
2183
MOV(32, R(temp2Reg), R(colorReg));
2184
AND(16, R(temp2Reg), Imm16(0x1F << 11));
2185
// Shift blue into place, 8 left (at 19), and merge back to temp1.
2186
SHL(32, R(temp2Reg), Imm8(8));
2187
OR(32, R(temp1Reg), R(temp2Reg));
2188
2189
// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
2190
OR(32, R(temp2Reg), R(temp1Reg));
2191
SHL(32, R(temp2Reg), Imm8(1));
2192
2193
// We go to green last because it's the different one. Put it in place.
2194
AND(16, R(colorReg), Imm16(0x3F << 5));
2195
SHL(32, R(colorReg), Imm8(5));
2196
// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)
2197
OR(32, R(temp2Reg), R(colorReg));
2198
OR(32, R(colorReg), R(temp1Reg));
2199
2200
// Now shift and mask temp2 for swizzle.
2201
SHR(32, R(temp2Reg), Imm8(6));
2202
AND(32, R(temp2Reg), Imm32(0x00070307));
2203
// And then OR that in too. We're done.
2204
OR(32, R(colorReg), R(temp2Reg));
2205
2206
return true;
2207
}
2208
2209
bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2210
Describe("ConvertFrom5551");
2211
2212
if (cpu_info.bBMI2_fast) {
2213
// First, grab the top bits.
2214
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
2215
PDEP(32, colorReg, colorReg, R(temp1Reg));
2216
2217
// Now make the swizzle bits.
2218
MOV(32, R(temp2Reg), R(colorReg));
2219
SHR(32, R(temp2Reg), Imm8(5));
2220
AND(32, R(temp2Reg), Imm32(0x00070707));
2221
2222
if (keepAlpha) {
2223
// Sign extend the alpha bit to 8 bits.
2224
SHL(32, R(colorReg), Imm8(7));
2225
SAR(32, R(colorReg), Imm8(7));
2226
}
2227
2228
OR(32, R(colorReg), R(temp2Reg));
2229
return true;
2230
}
2231
2232
// Filter out red only into temp1.
2233
MOV(32, R(temp1Reg), R(colorReg));
2234
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2235
// Move it left to the top of the 8 bits.
2236
SHL(32, R(temp1Reg), Imm8(3));
2237
2238
// Add in green and shift into place (top bits.)
2239
MOV(32, R(temp2Reg), R(colorReg));
2240
AND(16, R(temp2Reg), Imm16(0x1F << 5));
2241
SHL(32, R(temp2Reg), Imm8(6));
2242
OR(32, R(temp1Reg), R(temp2Reg));
2243
2244
if (keepAlpha) {
2245
// Now take blue and alpha together.
2246
AND(16, R(colorReg), Imm16(0x8000 | (0x1F << 10)));
2247
// We move all the way left, then sign extend right to expand alpha.
2248
SHL(32, R(colorReg), Imm8(16));
2249
SAR(32, R(colorReg), Imm8(7));
2250
} else {
2251
AND(16, R(colorReg), Imm16(0x1F << 10));
2252
SHL(32, R(colorReg), Imm8(9));
2253
}
2254
2255
// Combine both together, we still need to swizzle.
2256
OR(32, R(colorReg), R(temp1Reg));
2257
OR(32, R(temp1Reg), R(colorReg));
2258
// Now for swizzle, we'll mask carefully to avoid overflow.
2259
SHR(32, R(temp1Reg), Imm8(5));
2260
AND(32, R(temp1Reg), Imm32(0x00070707));
2261
2262
// Then finally merge in the swizzle bits.
2263
OR(32, R(colorReg), R(temp1Reg));
2264
return true;
2265
}
2266
2267
bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2268
Describe("ConvertFrom4444");
2269
2270
if (cpu_info.bBMI2_fast) {
2271
// First, spread the bits out with spaces.
2272
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
2273
PDEP(32, colorReg, colorReg, R(temp1Reg));
2274
2275
// Now swizzle the low bits in.
2276
MOV(32, R(temp1Reg), R(colorReg));
2277
SHR(32, R(temp1Reg), Imm8(4));
2278
OR(32, R(colorReg), R(temp1Reg));
2279
return true;
2280
}
2281
2282
// Move red into position within temp1.
2283
MOV(32, R(temp1Reg), R(colorReg));
2284
AND(16, R(temp1Reg), Imm16(0xF << 0));
2285
SHL(32, R(temp1Reg), Imm8(4));
2286
2287
// Green is just as simple.
2288
MOV(32, R(temp2Reg), R(colorReg));
2289
AND(16, R(temp2Reg), Imm16(0xF << 4));
2290
SHL(32, R(temp2Reg), Imm8(8));
2291
OR(32, R(temp1Reg), R(temp2Reg));
2292
2293
// Blue isn't last this time, but it's next.
2294
MOV(32, R(temp2Reg), R(colorReg));
2295
AND(16, R(temp2Reg), Imm16(0xF << 8));
2296
SHL(32, R(temp2Reg), Imm8(12));
2297
OR(32, R(temp1Reg), R(temp2Reg));
2298
2299
if (keepAlpha) {
2300
// Last but not least, alpha.
2301
AND(16, R(colorReg), Imm16(0xF << 12));
2302
SHL(32, R(colorReg), Imm8(16));
2303
OR(32, R(colorReg), R(temp1Reg));
2304
2305
// Copy to temp1 again for swizzling.
2306
OR(32, R(temp1Reg), R(colorReg));
2307
} else {
2308
// Overwrite colorReg (we need temp1 as a copy anyway.)
2309
MOV(32, R(colorReg), R(temp1Reg));
2310
}
2311
2312
// Masking isn't necessary here since everything is 4 wide.
2313
SHR(32, R(temp1Reg), Imm8(4));
2314
OR(32, R(colorReg), R(temp1Reg));
2315
return true;
2316
}
2317
2318
};
2319
2320
#endif
2321
2322