GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/SamplerX86.cpp
// Copyright (c) 2017- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <emmintrin.h>
#include "Common/x64Emitter.h"
#include "Common/BitScan.h"
#include "Common/CPUDetect.h"
#include "GPU/GPUState.h"
#include "GPU/Software/Sampler.h"
#include "GPU/ge_constants.h"

using namespace Gen;
using namespace Rasterizer;

namespace Sampler {

FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
35
_assert_msg_(id.fetch && !id.linear, "Only fetch should be set on sampler id");
36
regCache_.SetupABI({
37
RegCache::GEN_ARG_U,
38
RegCache::GEN_ARG_V,
39
RegCache::GEN_ARG_TEXPTR,
40
RegCache::GEN_ARG_BUFW,
41
RegCache::GEN_ARG_LEVEL,
42
RegCache::GEN_ARG_ID,
43
});
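// Per the ABI order above, the compiled fetch function receives (u, v, texptr, bufw, level, samplerID).
// The fetched texel is assembled in RAX and then widened into XMM0 before the RET below.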
44
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
45
regCache_.ForceRetain(RegCache::GEN_RESULT);
46
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
47
48
BeginWrite(2048);
49
Describe("Init");
50
const u8 *start = AlignCode16();
51
52
#if PPSSPP_PLATFORM(WINDOWS)
53
// RET and shadow space.
54
stackArgPos_ = 8 + 32;
55
stackIDOffset_ = 8;
56
stackLevelOffset_ = 0;
57
#else
58
stackArgPos_ = 0;
59
stackIDOffset_ = -1;
60
stackLevelOffset_ = -1;
61
#endif
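// On Win x64 the first four integer args arrive in RCX/RDX/R8/R9, so level and the SamplerID pointer
// spill to the stack just past the return address (8 bytes) and the 32-byte shadow space.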
62
63
// Early exit on !srcPtr.
64
FixupBranch zeroSrc;
65
if (id.hasInvalidPtr) {
66
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
67
CMP(PTRBITS, R(srcReg), Imm8(0));
68
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
69
70
FixupBranch nonZeroSrc = J_CC(CC_NZ);
71
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
72
PXOR(vecResultReg, R(vecResultReg));
73
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
74
zeroSrc = J(true);
75
SetJumpTarget(nonZeroSrc);
76
}
77
78
// This reads the pixel data into resultReg from the args.
79
if (!Jit_ReadTextureFormat(id)) {
80
regCache_.Reset(false);
81
EndWrite();
82
ResetCodePtr(GetOffset(start));
83
ERROR_LOG(Log::G3D, "Failed to compile fetch %s", DescribeSamplerID(id).c_str());
84
return nullptr;
85
}
86
87
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
88
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
89
if (regCache_.Has(RegCache::GEN_ARG_ID))
90
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
91
92
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
93
94
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
95
MOVD_xmm(vecResultReg, R(resultReg));
96
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
97
regCache_.ForceRelease(RegCache::GEN_RESULT);
98
99
if (cpu_info.bSSE4_1) {
100
PMOVZXBD(vecResultReg, R(vecResultReg));
101
} else {
102
X64Reg vecTempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
103
PXOR(vecTempReg, R(vecTempReg));
104
PUNPCKLBW(vecResultReg, R(vecTempReg));
105
PUNPCKLWD(vecResultReg, R(vecTempReg));
106
regCache_.Release(vecTempReg, RegCache::VEC_TEMP0);
107
}
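// Either way, the four 8-bit channels now occupy one 32-bit lane each in vecResultReg.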
108
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
109
110
Describe("Init");
111
if (id.hasInvalidPtr) {
112
SetJumpTarget(zeroSrc);
113
}
114
115
RET();
116
117
regCache_.Reset(true);
118
119
EndWrite();
120
return (FetchFunc)start;
121
}
122
123
NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
124
_assert_msg_(!id.fetch && !id.linear, "Fetch and linear should be cleared on sampler id");
125
BeginWrite(2048);
126
Describe("Init");
127
128
// Let's drop some helpful constants here.
129
WriteConstantPool(id);
130
131
const u8 *start = AlignCode16();
132
133
regCache_.SetupABI({
134
RegCache::VEC_ARG_S,
135
RegCache::VEC_ARG_T,
136
RegCache::VEC_ARG_COLOR,
137
RegCache::GEN_ARG_TEXPTR_PTR,
138
RegCache::GEN_ARG_BUFW_PTR,
139
RegCache::GEN_ARG_LEVEL,
140
RegCache::GEN_ARG_LEVELFRAC,
141
RegCache::GEN_ARG_ID,
142
});
143
144
#if PPSSPP_PLATFORM(WINDOWS)
145
// RET + shadow space.
146
stackArgPos_ = 8 + 32;
147
148
// Positions: stackArgPos_+0=bufwptr, stackArgPos_+8=level, stackArgPos_+16=levelFrac
149
stackIDOffset_ = 24;
150
stackLevelOffset_ = 8;
151
#else
152
stackArgPos_ = 0;
153
// No args on the stack.
154
stackIDOffset_ = -1;
155
stackLevelOffset_ = -1;
156
#endif
157
158
// Start out by saving some registers, since we'll need more.
159
PUSH(R15);
160
PUSH(R14);
161
PUSH(R13);
162
PUSH(R12);
163
regCache_.Add(R15, RegCache::GEN_INVALID);
164
regCache_.Add(R14, RegCache::GEN_INVALID);
165
regCache_.Add(R13, RegCache::GEN_INVALID);
166
regCache_.Add(R12, RegCache::GEN_INVALID);
167
stackArgPos_ += 32;
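// The four pushes above moved RSP down by 32 bytes, so the stack-relative argument offsets shift with it.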
168
169
#if PPSSPP_PLATFORM(WINDOWS)
170
// Use the shadow space to save U1/V1.
171
stackUV1Offset_ = -8;
172
#else
173
// Use the red zone, but account for the R15-R12 we push just below.
174
stackUV1Offset_ = -stackArgPos_ - 8;
175
#endif
176
177
// We can throw these away right off if there are no mips.
178
if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVEL) && id.useSharedClut)
179
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
180
if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVELFRAC))
181
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
182
183
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
184
// On Linux, RCX is currently levelFrac, but we'll need it for other things.
185
if (!cpu_info.bBMI2) {
186
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
187
MOV(64, R(R15), R(levelFracReg));
188
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
189
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
190
regCache_.ChangeReg(R15, RegCache::GEN_ARG_LEVELFRAC);
191
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
192
}
193
} else if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
194
// Let's load bufwptr into regs. RDX is free.
195
MOV(64, R(RDX), MDisp(RSP, stackArgPos_ + 0));
196
regCache_.ChangeReg(RDX, RegCache::GEN_ARG_BUFW_PTR);
197
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
198
}
199
// Okay, now lock RCX as a shifting reg.
200
if (!cpu_info.bBMI2) {
201
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
202
regCache_.ForceRetain(RegCache::GEN_SHIFTVAL);
203
}
204
205
bool success = true;
206
207
// Convert S/T + X/Y to U/V (and U1/V1 if appropriate.)
208
success = success && Jit_GetTexelCoords(id);
209
210
// At this point, XMM0 should be free. Swap it to the result.
211
success = success && regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
212
// Let's also pick a reg for GEN_RESULT - doesn't matter which.
213
X64Reg resultReg = regCache_.Alloc(RegCache::GEN_RESULT);
214
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
215
regCache_.ForceRetain(RegCache::GEN_RESULT);
216
217
// Early exit on !srcPtr (either one.)
218
FixupBranch zeroSrc;
219
if (id.hasInvalidPtr) {
220
Describe("NullCheck");
221
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
222
223
if (id.hasAnyMips) {
224
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
225
MOV(64, R(tempReg), MDisp(srcReg, 0));
226
AND(64, R(tempReg), MDisp(srcReg, 8));
227
228
CMP(PTRBITS, R(tempReg), Imm8(0));
229
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
230
} else {
231
CMP(PTRBITS, MatR(srcReg), Imm8(0));
232
}
233
FixupBranch nonZeroSrc = J_CC(CC_NZ);
234
PXOR(XMM0, R(XMM0));
235
zeroSrc = J(true);
236
SetJumpTarget(nonZeroSrc);
237
238
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
239
}
240
241
auto loadPtrs = [&](bool level1) {
242
X64Reg bufwReg = regCache_.Alloc(RegCache::GEN_ARG_BUFW);
243
X64Reg bufwPtrReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
244
MOVZX(32, 16, bufwReg, MDisp(bufwPtrReg, level1 ? 2 : 0));
245
regCache_.Unlock(bufwPtrReg, RegCache::GEN_ARG_BUFW_PTR);
246
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
247
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW);
248
249
X64Reg srcReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
250
X64Reg srcPtrReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
251
MOV(64, R(srcReg), MDisp(srcPtrReg, level1 ? 8 : 0));
252
regCache_.Unlock(srcPtrReg, RegCache::GEN_ARG_TEXPTR_PTR);
253
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
254
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR);
255
};
256
257
loadPtrs(false);
258
success = success && Jit_ReadTextureFormat(id);
259
260
// Convert that to 16-bit from 8-bit channels.
261
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
262
resultReg = regCache_.Find(RegCache::GEN_RESULT);
263
MOVD_xmm(vecResultReg, R(resultReg));
264
if (cpu_info.bSSE4_1) {
265
PMOVZXBW(vecResultReg, R(vecResultReg));
266
} else {
267
X64Reg zeroReg = GetZeroVec();
268
PUNPCKLBW(vecResultReg, R(zeroReg));
269
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
270
}
271
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
272
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
273
274
if (id.hasAnyMips) {
275
X64Reg vecResultReg = regCache_.Alloc(RegCache::VEC_RESULT1);
276
277
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
278
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
279
CMP(8, R(levelFracReg), Imm8(0));
280
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
281
} else {
282
CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
283
}
284
FixupBranch skip = J_CC(CC_Z, true);
285
286
// Modify the level, so the new level value is used. We don't need the old.
287
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
288
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
289
ADD(32, R(levelReg), Imm8(1));
290
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
291
} else {
292
// It's fine to just modify this in place.
293
ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
294
}
295
296
// This is inside the conditional, but it's okay because we throw it away after.
297
loadPtrs(true);
298
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
299
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
300
301
X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
302
MOV(32, R(uReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0));
303
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
304
regCache_.ForceRetain(RegCache::GEN_ARG_U);
305
306
X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
307
MOV(32, R(vReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4));
308
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
309
regCache_.ForceRetain(RegCache::GEN_ARG_V);
310
311
bool hadId = regCache_.Has(RegCache::GEN_ID);
312
bool hadZero = regCache_.Has(RegCache::VEC_ZERO);
313
success = success && Jit_ReadTextureFormat(id);
314
315
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
316
MOVD_xmm(vecResultReg, R(resultReg));
317
if (cpu_info.bSSE4_1) {
318
PMOVZXBW(vecResultReg, R(vecResultReg));
319
} else {
320
X64Reg zeroReg = GetZeroVec();
321
PUNPCKLBW(vecResultReg, R(zeroReg));
322
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
323
}
324
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
325
326
// Since we're inside a conditional, make sure these go away if we allocated them.
327
if (!hadId && regCache_.Has(RegCache::GEN_ID))
328
regCache_.ForceRelease(RegCache::GEN_ID);
329
if (!hadZero && regCache_.Has(RegCache::VEC_ZERO))
330
regCache_.ForceRelease(RegCache::VEC_ZERO);
331
332
SetJumpTarget(skip);
333
334
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT1);
335
} else {
336
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
337
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
338
}
339
340
// We're done with these now.
341
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR))
342
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
343
if (regCache_.Has(RegCache::GEN_ARG_BUFW_PTR))
344
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
345
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
346
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
347
if (regCache_.Has(RegCache::GEN_SHIFTVAL))
348
regCache_.ForceRelease(RegCache::GEN_SHIFTVAL);
349
regCache_.ForceRelease(RegCache::GEN_RESULT);
350
351
if (id.hasAnyMips) {
352
Describe("BlendMips");
353
if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
354
X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
355
MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
356
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
357
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
358
}
359
360
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
361
CMP(8, R(levelFracReg), Imm8(0));
362
FixupBranch skip = J_CC(CC_Z, true);
363
364
// TODO: PMADDWD? Refactor shared?
365
// First, broadcast the levelFrac value into an XMM.
366
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
367
MOVD_xmm(fracReg, R(levelFracReg));
368
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
369
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
370
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
371
372
// Multiply level1 color by the fraction.
373
X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
374
PMULLW(color1Reg, R(fracReg));
375
376
// Okay, next we need an inverse for color 0.
377
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
378
MOVDQA(invFracReg, M(const10All16_));
379
PSUBW(invFracReg, R(fracReg));
380
381
// And multiply.
382
PMULLW(XMM0, R(invFracReg));
383
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
384
regCache_.Release(invFracReg, RegCache::VEC_TEMP1);
385
386
// Okay, now sum and divide by 16 (which is what the fraction maxed at.)
387
PADDW(XMM0, R(color1Reg));
388
PSRLW(XMM0, 4);
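// Net effect per 16-bit channel: result = (level0 * (0x10 - frac) + level1 * frac) >> 4.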
389
390
// And now we're done with color1Reg/VEC_RESULT1.
391
regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
392
regCache_.ForceRelease(RegCache::VEC_RESULT1);
393
394
SetJumpTarget(skip);
395
}
396
397
// Finally, it's time to apply the texture function.
398
success = success && Jit_ApplyTextureFunc(id);
399
400
// Last of all, convert to 32-bit channels.
401
Describe("Init");
402
if (cpu_info.bSSE4_1) {
403
PMOVZXWD(XMM0, R(XMM0));
404
} else {
405
X64Reg zeroReg = GetZeroVec();
406
PUNPCKLWD(XMM0, R(zeroReg));
407
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
408
}
409
410
regCache_.ForceRelease(RegCache::VEC_RESULT);
411
if (regCache_.Has(RegCache::GEN_ARG_ID))
412
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
413
414
if (!success) {
415
regCache_.Reset(false);
416
EndWrite();
417
ResetCodePtr(GetOffset(start));
418
ERROR_LOG(Log::G3D, "Failed to compile nearest %s", DescribeSamplerID(id).c_str());
419
return nullptr;
420
}
421
422
if (id.hasInvalidPtr) {
423
SetJumpTarget(zeroSrc);
424
}
425
426
POP(R12);
427
POP(R13);
428
POP(R14);
429
POP(R15);
430
431
RET();
432
433
regCache_.Reset(true);
434
435
EndWrite();
436
return (NearestFunc)start;
437
}
438
439
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
440
_assert_msg_(id.linear && !id.fetch, "Only linear should be set on sampler id");
441
BeginWrite(2048);
442
Describe("Init");
443
444
// We don't use stackArgPos_ here, this is just for DXT.
445
stackArgPos_ = -1;
446
447
// Let's drop some helpful constants here.
448
WriteConstantPool(id);
449
450
const u8 *nearest = nullptr;
451
if (id.TexFmt() >= GE_TFMT_DXT1) {
452
regCache_.SetupABI({
453
RegCache::GEN_ARG_U,
454
RegCache::GEN_ARG_V,
455
RegCache::GEN_ARG_TEXPTR,
456
RegCache::GEN_ARG_BUFW,
457
RegCache::GEN_ARG_LEVEL,
458
// Avoid clobber.
459
RegCache::GEN_ARG_LEVELFRAC,
460
});
461
auto lockReg = [&](X64Reg r, RegCache::Purpose p) {
462
regCache_.ChangeReg(r, p);
463
regCache_.ForceRetain(p);
464
};
465
lockReg(RAX, RegCache::GEN_RESULT);
466
lockReg(XMM0, RegCache::VEC_ARG_U);
467
lockReg(XMM1, RegCache::VEC_ARG_V);
468
lockReg(XMM5, RegCache::VEC_RESULT);
469
#if !PPSSPP_PLATFORM(WINDOWS)
470
if (id.hasAnyMips) {
471
lockReg(XMM6, RegCache::VEC_U1);
472
lockReg(XMM7, RegCache::VEC_V1);
473
lockReg(XMM8, RegCache::VEC_RESULT1);
474
lockReg(XMM12, RegCache::VEC_INDEX1);
475
}
476
lockReg(XMM9, RegCache::VEC_ARG_COLOR);
477
lockReg(XMM10, RegCache::VEC_FRAC);
478
lockReg(XMM11, RegCache::VEC_INDEX);
479
#endif
480
481
// We'll first write the nearest sampler, which we will CALL.
482
// This may differ slightly based on the "linear" flag.
483
nearest = AlignCode16();
484
485
if (!Jit_ReadTextureFormat(id)) {
486
regCache_.Reset(false);
487
EndWrite();
488
ResetCodePtr(GetOffset(nearest));
489
ERROR_LOG(Log::G3D, "Failed to compile linear nearest %s", DescribeSamplerID(id).c_str());
490
return nullptr;
491
}
492
493
Describe("Init");
494
RET();
495
496
regCache_.ForceRelease(RegCache::GEN_RESULT);
497
regCache_.ForceRelease(RegCache::VEC_ARG_U);
498
regCache_.ForceRelease(RegCache::VEC_ARG_V);
499
regCache_.ForceRelease(RegCache::VEC_RESULT);
500
501
auto unlockOptReg = [&](RegCache::Purpose p) {
502
if (regCache_.Has(p))
503
regCache_.ForceRelease(p);
504
};
505
unlockOptReg(RegCache::GEN_ARG_LEVEL);
506
unlockOptReg(RegCache::GEN_ARG_LEVELFRAC);
507
unlockOptReg(RegCache::VEC_U1);
508
unlockOptReg(RegCache::VEC_V1);
509
unlockOptReg(RegCache::VEC_RESULT1);
510
unlockOptReg(RegCache::VEC_ARG_COLOR);
511
unlockOptReg(RegCache::VEC_FRAC);
512
unlockOptReg(RegCache::VEC_INDEX);
513
unlockOptReg(RegCache::VEC_INDEX1);
514
regCache_.Reset(true);
515
}
516
EndWrite();
517
518
// Now the actual linear func, which is exposed externally.
519
const u8 *linearResetPos = GetCodePointer();
520
Describe("Init");
521
522
regCache_.SetupABI({
523
RegCache::VEC_ARG_S,
524
RegCache::VEC_ARG_T,
525
RegCache::VEC_ARG_COLOR,
526
RegCache::GEN_ARG_TEXPTR_PTR,
527
RegCache::GEN_ARG_BUFW_PTR,
528
RegCache::GEN_ARG_LEVEL,
529
RegCache::GEN_ARG_LEVELFRAC,
530
RegCache::GEN_ARG_ID,
531
});
532
533
#if PPSSPP_PLATFORM(WINDOWS)
534
// RET + shadow space.
535
stackArgPos_ = 8 + 32;
536
// Free up some more vector regs on Windows too, where we're a bit tight.
537
stackArgPos_ += WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12 }, { R15, R14, R13, R12 });
538
539
// Positions: stackArgPos_+0=bufwptr, stackArgPos_+8=level, stackArgPos_+16=levelFrac
540
stackIDOffset_ = 24;
541
stackLevelOffset_ = 8;
542
543
// If needed, we could store UV1 data in shadow space, but we no longer do.
544
stackUV1Offset_ = -8;
545
#else
546
stackArgPos_ = 0;
547
stackArgPos_ += WriteProlog(0, {}, { R15, R14, R13, R12 });
548
stackIDOffset_ = -1;
549
stackLevelOffset_ = -1;
550
551
// Use the red zone.
552
stackUV1Offset_ = -stackArgPos_ - 8;
553
#endif
554
555
// This is what we'll put in them, anyway...
556
if (nearest != nullptr) {
557
regCache_.ChangeReg(XMM10, RegCache::VEC_FRAC);
558
regCache_.ForceRetain(RegCache::VEC_FRAC);
559
regCache_.ChangeReg(XMM11, RegCache::VEC_INDEX);
560
regCache_.ForceRetain(RegCache::VEC_INDEX);
561
if (id.hasAnyMips) {
562
regCache_.ChangeReg(XMM12, RegCache::VEC_INDEX1);
563
regCache_.ForceRetain(RegCache::VEC_INDEX1);
564
}
565
}
566
567
// Reserve a couple regs that the nearest CALL won't use.
568
if (id.hasAnyMips) {
569
regCache_.ChangeReg(XMM6, RegCache::VEC_U1);
570
regCache_.ChangeReg(XMM7, RegCache::VEC_V1);
571
regCache_.ForceRetain(RegCache::VEC_U1);
572
regCache_.ForceRetain(RegCache::VEC_V1);
573
} else if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
574
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
575
}
576
577
// Save prim color for later in a different XMM too if we're using the nearest helper.
578
if (nearest != nullptr) {
579
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
580
MOVDQA(XMM9, R(primColorReg));
581
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
582
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
583
regCache_.ChangeReg(XMM9, RegCache::VEC_ARG_COLOR);
584
regCache_.ForceRetain(RegCache::VEC_ARG_COLOR);
585
}
586
587
// We also want to save src and bufw for later. Might be in a reg already.
588
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR) && regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
589
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
590
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
591
MOV(64, R(R14), R(srcReg));
592
MOV(64, R(R15), R(bufwReg));
593
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
594
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
595
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
596
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
597
} else if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR)) {
598
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
599
MOV(64, R(R14), R(srcReg));
600
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
601
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
602
MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 0));
603
} else {
604
MOV(64, R(R14), MDisp(RSP, stackArgPos_ + 0));
605
MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 8));
606
}
607
608
// Okay, and now remember we moved to R14/R15.
609
regCache_.ChangeReg(R14, RegCache::GEN_ARG_TEXPTR_PTR);
610
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR_PTR);
611
if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
612
regCache_.ChangeReg(R15, RegCache::GEN_ARG_BUFW_PTR);
613
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
614
}
615
616
bool success = true;
617
618
// Our first goal is to convert S/T and X/Y into U/V and frac_u/frac_v.
619
success = success && Jit_GetTexelCoordsQuad(id);
620
621
// Early exit on !srcPtr (either one.)
622
FixupBranch zeroSrc;
623
if (id.hasInvalidPtr) {
624
Describe("NullCheck");
625
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
626
627
if (id.hasAnyMips) {
628
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
629
MOV(64, R(tempReg), MDisp(srcReg, 0));
630
AND(64, R(tempReg), MDisp(srcReg, 8));
631
632
CMP(PTRBITS, R(tempReg), Imm8(0));
633
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
634
} else {
635
CMP(PTRBITS, MatR(srcReg), Imm8(0));
636
}
637
FixupBranch nonZeroSrc = J_CC(CC_NZ);
638
PXOR(XMM0, R(XMM0));
639
zeroSrc = J(true);
640
SetJumpTarget(nonZeroSrc);
641
642
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
643
}
644
645
auto prepareDataOffsets = [&](RegCache::Purpose uPurpose, RegCache::Purpose vPurpose, bool level1) {
646
X64Reg uReg = regCache_.Find(uPurpose);
647
X64Reg vReg = regCache_.Find(vPurpose);
648
success = success && Jit_PrepareDataOffsets(id, uReg, vReg, level1);
649
regCache_.Unlock(uReg, uPurpose);
650
regCache_.Unlock(vReg, vPurpose);
651
};
652
653
Describe("DataOffsets");
654
prepareDataOffsets(RegCache::VEC_ARG_U, RegCache::VEC_ARG_V, false);
655
if (id.hasAnyMips)
656
prepareDataOffsets(RegCache::VEC_U1, RegCache::VEC_V1, true);
657
658
// The data offset goes into V, except in the CLUT4 case and DXT (nearest func) cases.
659
if (nearest == nullptr && id.TexFmt() != GE_TFMT_CLUT4)
660
regCache_.ForceRelease(RegCache::VEC_ARG_U);
661
662
// Hard allocate results if we're using the func method.
663
if (nearest != nullptr) {
664
regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT);
665
regCache_.ForceRetain(RegCache::VEC_RESULT);
666
if (id.hasAnyMips) {
667
regCache_.ChangeReg(XMM8, RegCache::VEC_RESULT1);
668
regCache_.ForceRetain(RegCache::VEC_RESULT1);
669
}
670
}
671
672
// This stores the result in an XMM for later processing.
673
// We map lookups to nearest CALLs, with arg order: u, v, src, bufw, level
674
auto doNearestCall = [&](int off, bool level1) {
675
#if PPSSPP_PLATFORM(WINDOWS)
676
static const X64Reg uArgReg = RCX;
677
static const X64Reg vArgReg = RDX;
678
static const X64Reg srcArgReg = R8;
679
static const X64Reg bufwArgReg = R9;
680
#else
681
static const X64Reg uArgReg = RDI;
682
static const X64Reg vArgReg = RSI;
683
static const X64Reg srcArgReg = RDX;
684
static const X64Reg bufwArgReg = RCX;
685
#endif
686
static const X64Reg resultReg = RAX;
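// These are just the first four integer argument registers of the Win64 and System V x86-64 calling
// conventions; the nearest helper returns its texel in RAX.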
687
688
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
689
X64Reg vReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
690
// Otherwise, we'll overwrite them...
691
_assert_(level1 || (uReg == XMM0 && vReg == XMM1));
692
693
if (cpu_info.bSSE4_1) {
694
PEXTRD(R(uArgReg), uReg, off / 4);
695
PEXTRD(R(vArgReg), vReg, off / 4);
696
} else {
697
MOVD_xmm(R(uArgReg), uReg);
698
MOVD_xmm(R(vArgReg), vReg);
699
PSRLDQ(uReg, 4);
700
PSRLDQ(vReg, 4);
701
}
702
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
703
regCache_.Unlock(vReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
704
705
X64Reg indexReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
706
if (cpu_info.bSSE4_1) {
707
PEXTRD(R(srcArgReg), indexReg, off / 4);
708
} else {
709
MOVD_xmm(R(srcArgReg), indexReg);
710
PSRLDQ(indexReg, 4);
711
}
712
regCache_.Unlock(indexReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
713
714
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
715
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
716
ADD(64, R(srcArgReg), MDisp(srcReg, level1 ? 8 : 0));
717
MOVZX(32, 16, bufwArgReg, MDisp(bufwReg, level1 ? 2 : 0));
718
// Leave level/levelFrac, we just always load from RAM on Windows and lock on POSIX.
719
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
720
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
721
722
CALL(nearest);
723
724
X64Reg vecResultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
725
if (cpu_info.bSSE4_1) {
726
PINSRD(vecResultReg, R(resultReg), off / 4);
727
} else if (off == 0) {
728
MOVD_xmm(vecResultReg, R(resultReg));
729
} else {
730
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
731
MOVD_xmm(tempReg, R(resultReg));
732
PSLLDQ(tempReg, off);
733
POR(vecResultReg, R(tempReg));
734
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
735
}
736
regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
737
};
738
739
if (nearest != nullptr) {
740
Describe("Calls");
741
doNearestCall(0, false);
742
doNearestCall(4, false);
743
doNearestCall(8, false);
744
doNearestCall(12, false);
745
746
// After doing the calls, certain cached things aren't safe.
747
if (regCache_.Has(RegCache::GEN_ID))
748
regCache_.ForceRelease(RegCache::GEN_ID);
749
if (regCache_.Has(RegCache::VEC_ZERO))
750
regCache_.ForceRelease(RegCache::VEC_ZERO);
751
} else {
752
success = success && Jit_FetchQuad(id, false);
753
}
754
755
if (id.hasAnyMips) {
756
Describe("MipsCalls");
757
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
758
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
759
CMP(8, R(levelFracReg), Imm8(0));
760
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
761
} else {
762
CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
763
}
764
FixupBranch skip = J_CC(CC_Z, true);
765
766
// Modify the level, so the new level value is used. We don't need the old.
767
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
768
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
769
ADD(32, R(levelReg), Imm8(1));
770
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
771
} else {
772
// It's fine to just modify this in place.
773
ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
774
}
775
776
if (nearest != nullptr) {
777
Describe("MipsCalls");
778
doNearestCall(0, true);
779
doNearestCall(4, true);
780
doNearestCall(8, true);
781
doNearestCall(12, true);
782
} else {
783
success = success && Jit_FetchQuad(id, true);
784
}
785
786
SetJumpTarget(skip);
787
}
788
789
// We're done with these now.
790
if (nearest != nullptr) {
791
regCache_.ForceRelease(RegCache::VEC_ARG_U);
792
regCache_.ForceRelease(RegCache::VEC_ARG_V);
793
regCache_.ForceRelease(RegCache::VEC_INDEX);
794
}
795
if (regCache_.Has(RegCache::VEC_INDEX1))
796
regCache_.ForceRelease(RegCache::VEC_INDEX1);
797
if (regCache_.Has(RegCache::VEC_U1))
798
regCache_.ForceRelease(RegCache::VEC_U1);
799
if (regCache_.Has(RegCache::VEC_V1))
800
regCache_.ForceRelease(RegCache::VEC_V1);
801
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
802
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
803
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
804
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
805
806
success = success && Jit_DecodeQuad(id, false);
807
success = success && Jit_BlendQuad(id, false);
808
if (id.hasAnyMips) {
809
Describe("BlendMips");
810
if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
811
X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
812
MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
813
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
814
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
815
}
816
817
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
818
CMP(8, R(levelFracReg), Imm8(0));
819
FixupBranch skip = J_CC(CC_Z, true);
820
821
success = success && Jit_DecodeQuad(id, true);
822
success = success && Jit_BlendQuad(id, true);
823
824
Describe("BlendMips");
825
// First, broadcast the levelFrac value into an XMM.
826
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
827
MOVD_xmm(fracReg, R(levelFracReg));
828
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
829
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
830
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
831
832
// Multiply level1 color by the fraction.
833
X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
834
PMULLW(color1Reg, R(fracReg));
835
836
// Okay, next we need an inverse for color 0.
837
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
838
MOVDQA(invFracReg, M(const10All16_));
839
PSUBW(invFracReg, R(fracReg));
840
841
// And multiply.
842
PMULLW(XMM0, R(invFracReg));
843
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
844
regCache_.Release(invFracReg, RegCache::VEC_TEMP1);
845
846
// Okay, now sum and divide by 16 (which is what the fraction maxed at.)
847
PADDW(XMM0, R(color1Reg));
848
PSRLW(XMM0, 4);
849
850
// And now we're done with color1Reg/VEC_RESULT1.
851
regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
852
regCache_.ForceRelease(RegCache::VEC_RESULT1);
853
854
SetJumpTarget(skip);
855
}
856
857
if (regCache_.Has(RegCache::VEC_FRAC))
858
regCache_.ForceRelease(RegCache::VEC_FRAC);
859
860
// Finally, it's time to apply the texture function.
861
success = success && Jit_ApplyTextureFunc(id);
862
863
// Last of all, convert to 32-bit channels.
864
Describe("Init");
865
if (cpu_info.bSSE4_1) {
866
PMOVZXWD(XMM0, R(XMM0));
867
} else {
868
X64Reg zeroReg = GetZeroVec();
869
PUNPCKLWD(XMM0, R(zeroReg));
870
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
871
}
872
873
regCache_.ForceRelease(RegCache::VEC_RESULT);
874
if (regCache_.Has(RegCache::GEN_ARG_ID))
875
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
876
877
if (!success) {
878
regCache_.Reset(false);
879
EndWrite();
880
ResetCodePtr(GetOffset(nearest ? nearest : linearResetPos));
881
ERROR_LOG(Log::G3D, "Failed to compile linear %s", DescribeSamplerID(id).c_str());
882
return nullptr;
883
}
884
885
if (id.hasInvalidPtr) {
886
SetJumpTarget(zeroSrc);
887
}
888
889
const u8 *start = WriteFinalizedEpilog();
890
regCache_.Reset(true);
891
return (LinearFunc)start;
892
}
893
894
void SamplerJitCache::WriteConstantPool(const SamplerID &id) {
895
// We reuse constants in any pool, because our code space is small.
896
WriteSimpleConst8x16(const10All16_, 0x10);
897
WriteSimpleConst16x8(const10All8_, 0x10);
898
899
if (const10Low_ == nullptr) {
900
const10Low_ = AlignCode16();
901
for (int i = 0; i < 4; ++i)
902
Write16(0x10);
903
for (int i = 0; i < 4; ++i)
904
Write16(0);
905
}
906
907
WriteSimpleConst4x32(constOnes32_, 1);
908
WriteSimpleConst8x16(constOnes16_, 1);
909
// This is the mask for clamp or wrap, the max texel in the S or T direction.
910
WriteSimpleConst4x32(constMaxTexel32_, 511);
911
912
if (constUNext_ == nullptr) {
913
constUNext_ = AlignCode16();
914
Write32(0); Write32(1); Write32(0); Write32(1);
915
}
916
917
if (constVNext_ == nullptr) {
918
constVNext_ = AlignCode16();
919
Write32(0); Write32(0); Write32(1); Write32(1);
920
}
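// constUNext_ is {0,1,0,1} and constVNext_ is {0,0,1,1}: presumably the per-lane offsets added to the base
// texel coordinate so the four lanes cover the 2x2 bilinear quad (right neighbor in U, bottom neighbor in V).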
921
922
WriteSimpleConst4x32(const5551Swizzle_, 0x00070707);
923
WriteSimpleConst4x32(const5650Swizzle_, 0x00070307);
924
925
// These are unique to the sampler ID.
926
if (!id.hasAnyMips) {
927
float w256f = (1 << id.width0Shift) * 256;
928
float h256f = (1 << id.height0Shift) * 256;
929
constWidthHeight256f_ = AlignCode16();
930
Write32(*(uint32_t *)&w256f);
931
Write32(*(uint32_t *)&h256f);
932
Write32(*(uint32_t *)&w256f);
933
Write32(*(uint32_t *)&h256f);
934
935
WriteDynamicConst4x32(constWidthMinus1i_, id.width0Shift > 9 ? 511 : (1 << id.width0Shift) - 1);
936
WriteDynamicConst4x32(constHeightMinus1i_, id.height0Shift > 9 ? 511 : (1 << id.height0Shift) - 1);
937
} else {
938
constWidthHeight256f_ = nullptr;
939
constWidthMinus1i_ = nullptr;
940
constHeightMinus1i_ = nullptr;
941
}
942
}
943
944
RegCache::Reg SamplerJitCache::GetSamplerID() {
945
if (regCache_.Has(RegCache::GEN_ARG_ID))
946
return regCache_.Find(RegCache::GEN_ARG_ID);
947
if (!regCache_.Has(RegCache::GEN_ID)) {
948
X64Reg r = regCache_.Alloc(RegCache::GEN_ID);
949
_assert_(stackIDOffset_ != -1);
950
MOV(PTRBITS, R(r), MDisp(RSP, stackArgPos_ + stackIDOffset_));
951
return r;
952
}
953
return regCache_.Find(RegCache::GEN_ID);
954
}
955
956
void SamplerJitCache::UnlockSamplerID(RegCache::Reg &r) {
957
if (regCache_.Has(RegCache::GEN_ARG_ID))
958
regCache_.Unlock(r, RegCache::GEN_ARG_ID);
959
else
960
regCache_.Unlock(r, RegCache::GEN_ID);
961
}
962
963
bool SamplerJitCache::Jit_FetchQuad(const SamplerID &id, bool level1) {
964
bool success = true;
965
switch (id.TexFmt()) {
966
case GE_TFMT_5650:
967
case GE_TFMT_5551:
968
case GE_TFMT_4444:
969
success = Jit_GetDataQuad(id, level1, 16);
970
// Mask away the high bits, if loaded via AVX2.
971
if (cpu_info.bAVX2) {
972
X64Reg destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
973
PSLLD(destReg, 16);
974
PSRLD(destReg, 16);
975
regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
976
}
977
break;
978
979
case GE_TFMT_8888:
980
success = Jit_GetDataQuad(id, level1, 32);
981
break;
982
983
case GE_TFMT_CLUT32:
984
success = Jit_GetDataQuad(id, level1, 32);
985
if (success)
986
success = Jit_TransformClutIndexQuad(id, 32);
987
if (success)
988
success = Jit_ReadClutQuad(id, level1);
989
break;
990
991
case GE_TFMT_CLUT16:
992
success = Jit_GetDataQuad(id, level1, 16);
993
if (success)
994
success = Jit_TransformClutIndexQuad(id, 16);
995
if (success)
996
success = Jit_ReadClutQuad(id, level1);
997
break;
998
999
case GE_TFMT_CLUT8:
1000
success = Jit_GetDataQuad(id, level1, 8);
1001
if (success)
1002
success = Jit_TransformClutIndexQuad(id, 8);
1003
if (success)
1004
success = Jit_ReadClutQuad(id, level1);
1005
break;
1006
1007
case GE_TFMT_CLUT4:
1008
success = Jit_GetDataQuad(id, level1, 4);
1009
if (success)
1010
success = Jit_TransformClutIndexQuad(id, 4);
1011
if (success)
1012
success = Jit_ReadClutQuad(id, level1);
1013
break;
1014
1015
case GE_TFMT_DXT1:
1016
case GE_TFMT_DXT3:
1017
case GE_TFMT_DXT5:
1018
// No SIMD version currently, should use nearest helper path.
1019
success = false;
1020
break;
1021
1022
default:
1023
success = false;
1024
}
1025
1026
return success;
1027
}
1028
1029
bool SamplerJitCache::Jit_GetDataQuad(const SamplerID &id, bool level1, int bitsPerTexel) {
1030
Describe("DataQuad");
1031
bool success = true;
1032
1033
X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
1034
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
1035
MOV(64, R(baseReg), MDisp(srcReg, level1 ? 8 : 0));
1036
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
1037
1038
X64Reg destReg = INVALID_REG;
1039
if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32)
1040
destReg = regCache_.Alloc(RegCache::VEC_INDEX);
1041
else if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
1042
destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1043
else
1044
destReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1045
1046
X64Reg byteOffsetReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
1047
if (cpu_info.bAVX2 && id.overReadSafe) {
1048
// We have to set a mask for which values to load. Load all 4.
1049
// Note this is overwritten with zeroes by the gather instruction.
1050
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1051
PCMPEQD(maskReg, R(maskReg));
1052
VPGATHERDD(128, destReg, MComplex(baseReg, byteOffsetReg, SCALE_1, 0), maskReg);
1053
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
1054
} else {
1055
if (bitsPerTexel != 32)
1056
PXOR(destReg, R(destReg));
1057
1058
// Grab each value separately... try to use the right memory access size.
1059
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1060
if (cpu_info.bSSE4_1) {
1061
for (int i = 0; i < 4; ++i) {
1062
PEXTRD(R(temp2Reg), byteOffsetReg, i);
1063
if (bitsPerTexel <= 8)
1064
PINSRB(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 4);
1065
else if (bitsPerTexel == 16)
1066
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
1067
else if (bitsPerTexel == 32)
1068
PINSRD(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i);
1069
}
1070
} else {
1071
for (int i = 0; i < 4; ++i) {
1072
MOVD_xmm(R(temp2Reg), byteOffsetReg);
1073
if (i != 3)
1074
PSRLDQ(byteOffsetReg, 4);
1075
if (bitsPerTexel <= 8) {
1076
MOVZX(32, 8, temp2Reg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
1077
PINSRW(destReg, R(temp2Reg), i * 2);
1078
} else if (bitsPerTexel == 16) {
1079
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
1080
} else if (bitsPerTexel == 32) {
1081
if (i == 0) {
1082
MOVD_xmm(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
1083
} else {
1084
// Maybe a temporary would be better, but this path should be rare.
1085
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
1086
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 2), i * 2 + 1);
1087
}
1088
}
1089
}
1090
}
1091
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1092
}
1093
regCache_.Unlock(byteOffsetReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
1094
regCache_.ForceRelease(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
1095
regCache_.Release(baseReg, RegCache::GEN_ARG_TEXPTR);
1096
1097
if (bitsPerTexel == 4) {
1098
// Take only lowest bit, multiply by 4 with shifting.
1099
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
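// Each byte holds two 4-bit texels; an odd U selects the high nibble, so shift right by (u & 1) * 4.
// The 4-bit mask itself is applied later, in the CLUT index transform.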
1100
// Next, shift away based on the odd U bits.
1101
if (cpu_info.bAVX2) {
1102
// This is really convenient with AVX. Just make the bit into a shift amount.
1103
PSLLD(uReg, 31);
1104
PSRLD(uReg, 29);
1105
VPSRLVD(128, destReg, destReg, R(uReg));
1106
} else {
1107
// This creates a mask - FFFFFFFF to shift, zero otherwise.
1108
PSLLD(uReg, 31);
1109
PSRAD(uReg, 31);
1110
1111
X64Reg unshiftedReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1112
MOVDQA(unshiftedReg, R(destReg));
1113
PSRLD(destReg, 4);
1114
// Mask destReg (shifted) and reverse uReg to unshifted masked.
1115
PAND(destReg, R(uReg));
1116
PANDN(uReg, R(unshiftedReg));
1117
// Now combine.
1118
POR(destReg, R(uReg));
1119
regCache_.Release(unshiftedReg, RegCache::VEC_TEMP0);
1120
}
1121
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
1122
regCache_.ForceRelease(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
1123
}
1124
1125
if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32) {
1126
regCache_.Unlock(destReg, RegCache::VEC_INDEX);
1127
} else {
1128
regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1129
regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1130
}
1131
1132
return success;
1133
}
1134
1135
bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) {
1136
Describe("TrCLUTQuad");
1137
GEPaletteFormat fmt = id.ClutFmt();
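// The general transform below mirrors the GE CLUT lookup: index = ((raw >> shift) & mask) | offset,
// with shift, mask, and offset decoded from the cached clutformat word.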
1138
if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
1139
// This is simple - just mask.
1140
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
1141
// Mask to 8 bits for CLUT8/16/32, 4 bits for CLUT4.
1142
PSLLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
1143
PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
1144
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
1145
1146
return true;
1147
}
1148
1149
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
1150
bool maskedIndex = false;
1151
1152
// Okay, first load the actual samplerID clutformat bits we'll use.
1153
X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1154
X64Reg idReg = GetSamplerID();
1155
if (cpu_info.bAVX2 && !id.hasClutShift)
1156
VPBROADCASTD(128, formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
1157
else
1158
MOVD_xmm(formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
1159
UnlockSamplerID(idReg);
1160
1161
// Shift = (clutformat >> 2) & 0x1F
1162
if (id.hasClutShift) {
1163
// Before shifting, let's mask if needed (we always read 32 bits.)
1164
// We have to do this here, because the bits should be zero even if F is used as a mask.
1165
if (bitsPerIndex < 32) {
1166
PSLLD(indexReg, 32 - bitsPerIndex);
1167
PSRLD(indexReg, 32 - bitsPerIndex);
1168
maskedIndex = true;
1169
}
1170
1171
X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1172
// Shift against walls to get 5 bits after the rightmost 2.
1173
PSLLD(shiftReg, formatReg, 32 - 7);
1174
PSRLD(shiftReg, 32 - 5);
1175
// The other lanes are zero, so we can use PSRLD.
1176
PSRLD(indexReg, R(shiftReg));
1177
regCache_.Release(shiftReg, RegCache::VEC_TEMP1);
1178
}
1179
1180
// With shifting done, we need the format in each lane.
1181
if (!cpu_info.bAVX2 || id.hasClutShift)
1182
PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0));
1183
1184
// Mask = (clutformat >> 8) & 0xFF
1185
if (id.hasClutMask) {
1186
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1187
// If it was CLUT4, grab only 4 bits of the mask.
1188
PSLLD(maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
1189
PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
1190
1191
PAND(indexReg, R(maskReg));
1192
regCache_.Release(maskReg, RegCache::VEC_TEMP1);
1193
} else if (!maskedIndex || bitsPerIndex > 8) {
1194
// Apply the fixed 8 bit mask (or the CLUT4 mask if we didn't shift.)
1195
PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
1196
PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
1197
}
1198
1199
// Offset = (clutformat >> 12) & 0x01F0
1200
if (id.hasClutOffset) {
1201
// Use walls to extract the 5 bits at 16, and then put them shifted left by 4.
1202
int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 4 : 5;
1203
PSRLD(formatReg, 16);
1204
PSLLD(formatReg, 32 - offsetBits);
1205
PSRLD(formatReg, 32 - offsetBits - 4);
1206
1207
POR(indexReg, R(formatReg));
1208
}
1209
1210
regCache_.Release(formatReg, RegCache::VEC_TEMP0);
1211
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
1212
return true;
1213
}
1214
1215
bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
1216
Describe("ReadCLUTQuad");
1217
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
1218
1219
if (!id.useSharedClut) {
1220
X64Reg vecLevelReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1221
1222
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
1223
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
1224
MOVD_xmm(vecLevelReg, R(levelReg));
1225
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
1226
} else {
1227
#if PPSSPP_PLATFORM(WINDOWS)
1228
if (cpu_info.bAVX2) {
1229
VPBROADCASTD(128, vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));
1230
} else {
1231
MOVD_xmm(vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));
1232
PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));
1233
}
1234
#else
1235
_assert_(false);
1236
#endif
1237
}
1238
1239
// Now we multiply by 16, and add.
1240
PSLLD(vecLevelReg, 4);
1241
PADDD(indexReg, R(vecLevelReg));
1242
regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
1243
}
1244
1245
X64Reg idReg = GetSamplerID();
1246
X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
1247
MOV(PTRBITS, R(clutBaseReg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
1248
UnlockSamplerID(idReg);
1249
1250
X64Reg resultReg = INVALID_REG;
1251
if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
1252
resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1253
else
1254
resultReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1255
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1256
if (cpu_info.bAVX2 && id.overReadSafe)
1257
PCMPEQD(maskReg, R(maskReg));
1258
1259
switch (id.ClutFmt()) {
1260
case GE_CMODE_16BIT_BGR5650:
1261
case GE_CMODE_16BIT_ABGR5551:
1262
case GE_CMODE_16BIT_ABGR4444:
1263
if (cpu_info.bAVX2 && id.overReadSafe) {
1264
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_2, 0), maskReg);
1265
// Clear out the top 16 bits.
1266
PCMPEQD(maskReg, R(maskReg));
1267
PSRLD(maskReg, 16);
1268
PAND(resultReg, R(maskReg));
1269
} else {
1270
PXOR(resultReg, R(resultReg));
1271
1272
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1273
if (cpu_info.bSSE4_1) {
1274
for (int i = 0; i < 4; ++i) {
1275
PEXTRD(R(temp2Reg), indexReg, i);
1276
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
1277
}
1278
} else {
1279
for (int i = 0; i < 4; ++i) {
1280
MOVD_xmm(R(temp2Reg), indexReg);
1281
if (i != 3)
1282
PSRLDQ(indexReg, 4);
1283
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
1284
}
1285
}
1286
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1287
}
1288
break;
1289
1290
case GE_CMODE_32BIT_ABGR8888:
1291
if (cpu_info.bAVX2 && id.overReadSafe) {
1292
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_4, 0), maskReg);
1293
} else {
1294
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1295
if (cpu_info.bSSE4_1) {
1296
for (int i = 0; i < 4; ++i) {
1297
PEXTRD(R(temp2Reg), indexReg, i);
1298
PINSRD(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0), i);
1299
}
1300
} else {
1301
for (int i = 0; i < 4; ++i) {
1302
MOVD_xmm(R(temp2Reg), indexReg);
1303
if (i != 3)
1304
PSRLDQ(indexReg, 4);
1305
1306
if (i == 0) {
1307
MOVD_xmm(resultReg , MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
1308
} else {
1309
MOVD_xmm(maskReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
1310
PSLLDQ(maskReg, 4 * i);
1311
POR(resultReg, R(maskReg));
1312
}
1313
}
1314
}
1315
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1316
}
1317
break;
1318
}
1319
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
1320
regCache_.Unlock(resultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1321
regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1322
1323
regCache_.Release(clutBaseReg, RegCache::GEN_TEMP1);
1324
regCache_.Release(indexReg, RegCache::VEC_INDEX);
1325
return true;
1326
}
1327
1328
bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
1329
Describe(level1 ? "BlendQuadMips" : "BlendQuad");
1330
1331
if (cpu_info.bSSE4_1 && cpu_info.bSSSE3) {
1332
// Let's start by rearranging from TL TR BL BR like this:
1333
// ABCD EFGH IJKL MNOP -> AI BJ CK DL EM FN GO HP -> AIEM BJFN CKGO DLHP
1334
// This way, all the RGBAs are next to each other, and in order TL BL TR BR.
1335
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1336
X64Reg tempArrangeReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1337
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
1338
PUNPCKLBW(quadReg, R(tempArrangeReg));
1339
// Okay, that's top and bottom interleaved, now for left and right.
1340
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
1341
PUNPCKLWD(quadReg, R(tempArrangeReg));
1342
regCache_.Release(tempArrangeReg, RegCache::VEC_TEMP0);
1343
1344
// Next up, we want to multiply and add using a repeated TB frac pair.
1345
// That's (0x10 - frac_v) in byte 1, frac_v in byte 2, repeating.
1346
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1347
X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1348
X64Reg zeroReg = GetZeroVec();
1349
if (level1) {
1350
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
1351
} else {
1352
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
1353
}
1354
PSHUFB(fracReg, R(zeroReg));
1355
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1356
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1357
1358
// Now, inverse fracReg, then interleave into the actual multiplier.
1359
// This gives us the repeated TB pairs we wanted.
1360
X64Reg multTBReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1361
MOVDQA(multTBReg, M(const10All8_));
1362
PSUBB(multTBReg, R(fracReg));
1363
PUNPCKLBW(multTBReg, R(fracReg));
1364
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
1365
1366
// Now we can multiply and add paired lanes in one go.
1367
// Note that since T+B=0x10, this gives us exactly 12 bits.
1368
PMADDUBSW(quadReg, R(multTBReg));
1369
regCache_.Release(multTBReg, RegCache::VEC_TEMP1);
1370
1371
// With that done, we need to multiply by LR, or rather 0L0R, and sum again.
1372
// Since RRRR was all next to each other, this gives us a clean total R.
1373
fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1374
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1375
if (level1) {
1376
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
1377
} else {
1378
// We can ignore the high bits, since we'll interleave those away anyway.
1379
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));
1380
}
1381
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1382
1383
// Again, we're inversing into an interleaved multiplier. L is the inversed one.
1384
// 0L0R is (0x10 - frac_u), frac_u - 2x16 repeated four times.
1385
X64Reg multLRReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1386
MOVDQA(multLRReg, M(const10All16_));
1387
PSUBW(multLRReg, R(fracReg));
1388
PUNPCKLWD(multLRReg, R(fracReg));
1389
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
1390
1391
// This gives us RGBA as dwords, but they're all shifted left by 8 from the multiplies.
1392
PMADDWD(quadReg, R(multLRReg));
1393
PSRLD(quadReg, 8);
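// Overall, per channel: result = ((TL*(16-fv) + BL*fv)*(16-fu) + (TR*(16-fv) + BR*fv)*fu) >> 8.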
1394
regCache_.Release(multLRReg, RegCache::VEC_TEMP1);
1395
1396
// Shrink to 16-bit, it's more convenient for later.
1397
if (level1) {
1398
PACKSSDW(quadReg, R(quadReg));
1399
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
1400
} else {
1401
if (cpu_info.bAVX) {
1402
VPACKSSDW(128, XMM0, quadReg, R(quadReg));
1403
} else {
1404
PACKSSDW(quadReg, R(quadReg));
1405
MOVDQA(XMM0, R(quadReg));
1406
}
1407
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
1408
1409
regCache_.ForceRelease(RegCache::VEC_RESULT);
1410
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
1411
_assert_msg_(changeSuccess, "Unexpected reg locked as destReg");
1412
}
1413
} else {
1414
X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1415
X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1416
1417
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1418
X64Reg zeroReg = GetZeroVec();
1419
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
1420
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
1421
PUNPCKLBW(topReg, R(zeroReg));
1422
PUNPCKLBW(bottomReg, R(zeroReg));
1423
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1424
if (!level1) {
1425
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
1426
regCache_.ForceRelease(RegCache::VEC_RESULT);
1427
}
1428
1429
// Grab frac_u and spread to lower (L) lanes.
1430
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1431
X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1432
X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1433
if (level1) {
1434
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
1435
} else {
1436
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));
1437
}
1438
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1439
// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
1440
MOVDQA(fracMulReg, M(const10Low_));
1441
PSUBW(fracMulReg, R(fracReg));
1442
// Then we just put the original frac_u in the upper bits.
1443
PUNPCKLQDQ(fracMulReg, R(fracReg));
1444
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
1445
1446
// Okay, we have 8-bits in the top and bottom rows for the color.
1447
// Multiply by frac to get 12, which we keep for the next stage.
1448
PMULLW(topReg, R(fracMulReg));
1449
PMULLW(bottomReg, R(fracMulReg));
1450
regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);
1451
1452
// Time for frac_v. This time, we want it in all 8 lanes.
1453
fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1454
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1455
X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1456
if (level1) {
1457
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
1458
} else {
1459
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
1460
}
1461
PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
1462
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1463
1464
// Now, inverse fracReg into fracTopReg for the top row.
1465
MOVDQA(fracTopReg, M(const10All16_));
1466
PSUBW(fracTopReg, R(fracReg));
1467
1468
// We had 12, plus 4 frac, that gives us 16.
1469
PMULLW(bottomReg, R(fracReg));
1470
PMULLW(topReg, R(fracTopReg));
1471
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
1472
regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);
1473
1474
// Finally, time to sum them all up and divide by 256 to get back to 8 bits.
1475
PADDUSW(bottomReg, R(topReg));
1476
regCache_.Release(topReg, RegCache::VEC_TEMP0);
1477
1478
if (level1) {
1479
PSHUFD(quadReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
1480
PADDUSW(quadReg, R(bottomReg));
1481
PSRLW(quadReg, 8);
1482
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
1483
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
1484
} else {
1485
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
1486
if (!changeSuccess) {
1487
_assert_msg_(XMM0 == bottomReg, "Unexpected other reg locked as destReg");
1488
X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1489
PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
1490
PADDUSW(bottomReg, R(otherReg));
1491
regCache_.Release(otherReg, RegCache::VEC_TEMP0);
1492
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
1493
1494
// Okay, now it can be changed.
1495
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
1496
} else {
1497
PSHUFD(XMM0, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
1498
PADDUSW(XMM0, R(bottomReg));
1499
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
1500
}
1501
1502
PSRLW(XMM0, 8);
1503
}
1504
}
1505
1506
return true;
1507
}
1508
1509
bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
1510
X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT);
1511
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1512
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1513
1514
auto useAlphaFrom = [&](X64Reg alphaColorReg) {
1515
if (cpu_info.bSSE4_1) {
1516
// Copy only alpha.
1517
PBLENDW(resultReg, R(alphaColorReg), 0x08);
1518
} else {
1519
PSRLDQ(alphaColorReg, 6);
1520
PSLLDQ(alphaColorReg, 6);
1521
// Zero out the result alpha and OR them together.
1522
PSLLDQ(resultReg, 10);
1523
PSRLDQ(resultReg, 10);
1524
POR(resultReg, R(alphaColorReg));
1525
}
1526
};
1527
1528
// Note: color is in DWORDs, but result is in WORDs.
1529
switch (id.TexFunc()) {
1530
case GE_TEXFUNC_MODULATE:
1531
Describe("Modulate");
1532
PACKSSDW(primColorReg, R(primColorReg));
1533
if (cpu_info.bAVX) {
1534
VPADDW(128, tempReg, primColorReg, M(constOnes16_));
1535
1536
// Okay, time to multiply. This produces 16 bits, neatly.
1537
VPMULLW(128, resultReg, tempReg, R(resultReg));
1538
} else {
1539
MOVDQA(tempReg, M(constOnes16_));
1540
PADDW(tempReg, R(primColorReg));
1541
1542
PMULLW(resultReg, R(tempReg));
1543
}
1544
1545
if (id.useColorDoubling)
1546
PSRLW(resultReg, 7);
1547
else
1548
PSRLW(resultReg, 8);
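// Net modulate: result = ((prim + 1) * tex) >> 8 per channel (>> 7 with color doubling),
// approximating prim * tex / 255.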
1549
1550
if (!id.useTextureAlpha) {
1551
useAlphaFrom(primColorReg);
1552
} else if (id.useColorDoubling) {
1553
// We still need to finish dividing alpha, it's currently doubled (from the 7 above.)
1554
PSRLW(primColorReg, resultReg, 1);
1555
useAlphaFrom(primColorReg);
1556
}
1557
break;
1558
1559
case GE_TEXFUNC_DECAL:
1560
Describe("Decal");
1561
PACKSSDW(primColorReg, R(primColorReg));
1562
if (id.useTextureAlpha) {
1563
// Get alpha into the tempReg.
1564
PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3));
1565
PADDW(resultReg, M(constOnes16_));
1566
PMULLW(resultReg, R(tempReg));
1567
1568
X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1569
// Materialize some 255s, and subtract out alpha.
1570
PCMPEQD(invAlphaReg, R(invAlphaReg));
1571
PSRLW(invAlphaReg, 8);
1572
PSUBW(invAlphaReg, R(tempReg));
1573
1574
MOVDQA(tempReg, R(primColorReg));
1575
PADDW(tempReg, M(constOnes16_));
1576
PMULLW(tempReg, R(invAlphaReg));
1577
regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1);
1578
1579
// Now sum, and divide.
1580
PADDW(resultReg, R(tempReg));
1581
if (id.useColorDoubling)
1582
PSRLW(resultReg, 7);
1583
else
1584
PSRLW(resultReg, 8);
1585
} else if (id.useColorDoubling) {
1586
PSLLW(resultReg, 1);
1587
}
1588
useAlphaFrom(primColorReg);
1589
break;
1590
1591
case GE_TEXFUNC_BLEND:
1592
{
1593
Describe("EnvBlend");
1594
PACKSSDW(primColorReg, R(primColorReg));
1595
1596
// First off, let's grab the color value.
1597
X64Reg idReg = GetSamplerID();
1598
X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1599
if (cpu_info.bSSE4_1) {
1600
PMOVZXBW(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));
1601
} else {
1602
MOVD_xmm(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));
1603
X64Reg zeroReg = GetZeroVec();
1604
PUNPCKLBW(texEnvReg, R(zeroReg));
1605
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1606
}
1607
UnlockSamplerID(idReg);
1608
1609
// Now merge in the prim color so we have them interleaved, texenv low.
1610
PUNPCKLWD(texEnvReg, R(primColorReg));
1611
1612
// Okay, now materialize 255 for inversing resultReg and rounding.
1613
PCMPEQD(tempReg, R(tempReg));
1614
PSRLW(tempReg, 8);
1615
1616
// If alpha is used, we want the roundup and factor to be zero.
1617
if (id.useTextureAlpha)
1618
PSRLDQ(tempReg, 10);
1619
1620
// We're going to lose tempReg, so save the 255s.
1621
X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1622
MOVDQA(roundValueReg, R(tempReg));
1623
1624
// Okay, now inverse, then merge with resultReg low to match texenv low.
1625
PSUBUSW(tempReg, R(resultReg));
1626
PUNPCKLWD(resultReg, R(tempReg));
1627
1628
if (id.useTextureAlpha) {
1629
// Before we multiply, let's include alpha in that multiply.
1630
PADDW(primColorReg, M(constOnes16_));
1631
// Mask off everything but alpha, and move to the second highest short.
1632
PSRLDQ(primColorReg, 6);
1633
PSLLDQ(primColorReg, 12);
1634
// Now simply merge in with texenv.
1635
POR(texEnvReg, R(primColorReg));
1636
}
1637
1638
// Alright, now to multiply and add all in one go. Note this gives us DWORDs.
1639
PMADDWD(resultReg, R(texEnvReg));
1640
regCache_.Release(texEnvReg, RegCache::VEC_TEMP1);
1641
1642
// Now convert back to 16 bit and add the 255s for rounding.
1643
if (cpu_info.bSSE4_1) {
1644
PACKUSDW(resultReg, R(resultReg));
1645
} else {
1646
PSLLD(resultReg, 16);
1647
PSRAD(resultReg, 16);
1648
PACKSSDW(resultReg, R(resultReg));
1649
}
1650
PADDW(resultReg, R(roundValueReg));
1651
regCache_.Release(roundValueReg, RegCache::VEC_TEMP2);
1652
1653
// Okay, divide by 256 or 128 depending on doubling (we want to preserve the precision.)
1654
if (id.useColorDoubling && id.useTextureAlpha) {
1655
// If doubling, we want to still divide alpha by 256.
1656
PSRLW(resultReg, 7);
1657
PSRLW(primColorReg, resultReg, 1);
1658
useAlphaFrom(primColorReg);
1659
} else if (id.useColorDoubling) {
1660
PSRLW(resultReg, 7);
1661
} else {
1662
PSRLW(resultReg, 8);
1663
}
1664
1665
if (!id.useTextureAlpha)
1666
useAlphaFrom(primColorReg);
1667
break;
1668
}
1669
1670
case GE_TEXFUNC_REPLACE:
1671
Describe("Replace");
1672
if (id.useColorDoubling && id.useTextureAlpha) {
1673
// We can abuse primColorReg as a temp.
1674
MOVDQA(primColorReg, R(resultReg));
1675
// Shift to zero out alpha in resultReg.
1676
PSLLDQ(resultReg, 10);
1677
PSRLDQ(resultReg, 10);
1678
// Now simply add them together, restoring alpha and doubling the colors.
1679
PADDW(resultReg, R(primColorReg));
1680
} else if (!id.useTextureAlpha) {
1681
if (id.useColorDoubling) {
1682
// Let's just double using shifting. Ignore alpha.
1683
PSLLW(resultReg, 1);
1684
}
1685
// Now we want prim_color in W, so convert, then shift-mask away the color.
1686
PACKSSDW(primColorReg, R(primColorReg));
1687
useAlphaFrom(primColorReg);
1688
}
1689
break;
1690
1691
case GE_TEXFUNC_ADD:
1692
case GE_TEXFUNC_UNKNOWN1:
1693
case GE_TEXFUNC_UNKNOWN2:
1694
case GE_TEXFUNC_UNKNOWN3:
1695
Describe("Add");
1696
PACKSSDW(primColorReg, R(primColorReg));
1697
if (id.useTextureAlpha) {
1698
MOVDQA(tempReg, M(constOnes16_));
1699
// Add and multiply the alpha (and others, but we'll mask them.)
1700
PADDW(tempReg, R(primColorReg));
1701
PMULLW(tempReg, R(resultReg));
1702
1703
// Now that we've extracted alpha, sum and double as needed.
1704
PADDW(resultReg, R(primColorReg));
1705
if (id.useColorDoubling)
1706
PSLLW(resultReg, 1);
1707
1708
// Divide by 256 to normalize alpha.
1709
PSRLW(tempReg, 8);
1710
useAlphaFrom(tempReg);
1711
} else {
1712
PADDW(resultReg, R(primColorReg));
1713
if (id.useColorDoubling)
1714
PSLLW(resultReg, 1);
1715
useAlphaFrom(primColorReg);
1716
}
1717
break;
1718
}
1719
1720
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
1721
regCache_.Unlock(resultReg, RegCache::VEC_RESULT);
1722
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
1723
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
1724
return true;
1725
}
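
// Format dispatch: 16/32-bit formats are read and then decoded in place, CLUT
// formats read an index and look it up in the palette, and DXT formats decode a
// block. In each case the goal is an 8888 color in the result register.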

bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
	GETextureFormat fmt = id.TexFmt();
	bool success = true;
	switch (fmt) {
	case GE_TFMT_5650:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_Decode5650(id);
		break;

	case GE_TFMT_5551:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_Decode5551(id);
		break;

	case GE_TFMT_4444:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_Decode4444(id);
		break;

	case GE_TFMT_8888:
		success = Jit_GetTexData(id, 32);
		break;

	case GE_TFMT_CLUT32:
		success = Jit_GetTexData(id, 32);
		if (success)
			success = Jit_TransformClutIndex(id, 32);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_CLUT16:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_TransformClutIndex(id, 16);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_CLUT8:
		success = Jit_GetTexData(id, 8);
		if (success)
			success = Jit_TransformClutIndex(id, 8);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_CLUT4:
		success = Jit_GetTexData(id, 4);
		if (success)
			success = Jit_TransformClutIndex(id, 4);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_DXT1:
		success = Jit_GetDXT1Color(id, 8, 255);
		break;

	case GE_TFMT_DXT3:
		success = Jit_GetDXT1Color(id, 16, 0);
		if (success)
			success = Jit_ApplyDXTAlpha(id);
		break;

	case GE_TFMT_DXT5:
		success = Jit_GetDXT1Color(id, 16, 0);
		if (success)
			success = Jit_ApplyDXTAlpha(id);
		break;

	default:
		success = false;
	}

	return success;
}
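
// Layout notes (as read below): the 32 bits of 2-bit color indices sit at offset 0
// of the block, the two 565 base colors at offsets 4 and 6, and for DXT3/DXT5 the
// 8 bytes of alpha data follow at offset 8. DXT1 blocks are 8 bytes, DXT3/5 are 16.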

// Note: afterward, srcReg points at the block, and uReg/vReg have offset into block.
bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int alpha) {
	Describe("DXT1");
	// Like Jit_GetTexData, this gets the color into resultReg.
	// Note: color low bits are red, high bits are blue.
	_assert_msg_(blockSize == 8 || blockSize == 16, "Invalid DXT block size");

	X64Reg colorIndexReg = INVALID_REG;
	if (!id.linear) {
		// First, we need to get the block's offset, which is:
		// blockPos = src + (v/4 * bufw/4 + u/4) * blockSize
		// We distribute the blockSize constant for convenience:
		// blockPos = src + (blockSize*v/4 * bufw/4 + blockSize*u/4)

		// Copy u (we'll need it later), and round down to the nearest 4 after scaling.
		X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
		X64Reg srcBaseReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		LEA(32, srcBaseReg, MScaled(uReg, blockSize / 4, 0));
		AND(32, R(srcBaseReg), Imm32(blockSize == 8 ? ~7 : ~15));
		// Add in srcReg already, since we'll be multiplying soon.
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		ADD(64, R(srcBaseReg), R(srcReg));

		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
		X64Reg srcOffsetReg = regCache_.Alloc(RegCache::GEN_TEMP1);
		LEA(32, srcOffsetReg, MScaled(vReg, blockSize / 4, 0));
		AND(32, R(srcOffsetReg), Imm32(blockSize == 8 ? ~7 : ~15));
		// Modify bufw in place and then multiply.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
		SHR(32, R(bufwReg), Imm8(2));
		IMUL(32, srcOffsetReg, R(bufwReg));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
		// We no longer need bufwReg.
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

		// And now let's chop off the offset for u and v.
		AND(32, R(uReg), Imm32(3));
		AND(32, R(vReg), Imm32(3));

		// Okay, at this point srcBaseReg + srcOffsetReg = blockPos. To free up regs, put back in srcReg.
		LEA(64, srcReg, MRegSum(srcBaseReg, srcOffsetReg));
		regCache_.Release(srcBaseReg, RegCache::GEN_TEMP0);
		regCache_.Release(srcOffsetReg, RegCache::GEN_TEMP1);

		// Make sure we don't grab this as colorIndexReg.
		if (uReg != ECX && !cpu_info.bBMI2)
			regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);

		// The colorIndex is simply the 2 bits at blockPos + (v & 3), shifted right by (u & 3) twice.
		colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		MOVZX(32, 8, colorIndexReg, MRegSum(srcReg, vReg));
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
		// Only DXT3/5 need this reg later.
		if (id.TexFmt() == GE_TFMT_DXT1)
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

		if (uReg == ECX) {
			SHR(32, R(colorIndexReg), R(CL));
			SHR(32, R(colorIndexReg), R(CL));
		} else if (cpu_info.bBMI2) {
			SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
			SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
		} else {
			bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
			_assert_(hasRCX);
			LEA(32, ECX, MScaled(uReg, SCALE_2, 0));
			SHR(32, R(colorIndexReg), R(CL));
		}
		regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
		// If DXT1, there's no alpha and we can toss this reg.
		if (id.TexFmt() == GE_TFMT_DXT1)
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
	} else {
		// For linear, we already precalculated the block pos into srcReg.
		// uReg is the shift for the color index from the 32 bits of color index data.
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
		// If we don't have alpha, we don't need vReg.
		if (id.TexFmt() == GE_TFMT_DXT1)
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

		// Make sure we don't grab this as colorIndexReg.
		X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
		if (uReg != ECX && !cpu_info.bBMI2)
			regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);

		// Shift and mask out the 2 bits we need into colorIndexReg.
		colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		if (cpu_info.bBMI2) {
			SHRX(32, colorIndexReg, MatR(srcReg), uReg);
		} else {
			MOV(32, R(colorIndexReg), MatR(srcReg));
			if (uReg != RCX) {
				bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(hasRCX);
				MOV(32, R(RCX), R(uReg));
			}
			SHR(32, R(colorIndexReg), R(CL));
		}
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		// We're done with U now.
		regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
		regCache_.ForceRelease(RegCache::GEN_ARG_U);
	}

	// Mask out the value.
	AND(32, R(colorIndexReg), Imm32(3));

	X64Reg color1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg color2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);

	// For colorIndex 0 or 1, we'll simply take the 565 color and convert.
	CMP(32, R(colorIndexReg), Imm32(1));
	FixupBranch handleSimple565 = J_CC(CC_BE);

	// Otherwise, it depends if color1 or color2 is higher, so fetch them.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	MOVZX(32, 16, color1Reg, MDisp(srcReg, 4));
	MOVZX(32, 16, color2Reg, MDisp(srcReg, 6));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);

	CMP(32, R(color1Reg), R(color2Reg));
	FixupBranch handleMix23 = J_CC(CC_A, true);

	// If we're still here, then colorIndex is either 3 for 0 (easy) or 2 for 50% mix.
	XOR(32, R(resultReg), R(resultReg));
	CMP(32, R(colorIndexReg), Imm32(3));
	FixupBranch finishZero = J_CC(CC_E, true);

	// At this point, resultReg, colorIndexReg, and maybe R12/R13 can be used as temps.
	// We'll add, then shift from 565 a bit less to "divide" by 2 for a 50/50 mix.

	if (cpu_info.bBMI2_fast) {
		// Expand everything out to 0BGR at 8888, but halved.
		MOV(32, R(colorIndexReg), Imm32(0x007C7E7C));
		PDEP(32, color1Reg, color1Reg, R(colorIndexReg));
		PDEP(32, color2Reg, color2Reg, R(colorIndexReg));

		// Now let's sum them together (this undoes our halving.)
		LEA(32, resultReg, MRegSum(color1Reg, color2Reg));

		// Time to swap into order. Luckily we can ignore alpha.
		BSWAP(32, resultReg);
		SHR(32, R(resultReg), Imm8(8));
	} else {
		// We'll need more regs. Grab two more.
		PUSH(R12);
		PUSH(R13);

		// Start with summing R, then shift into position.
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x0000F800));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x0000F800));
		LEA(32, R12, MRegSum(resultReg, colorIndexReg));
		// The position is 9, instead of 8, due to doubling.
		SHR(32, R(R12), Imm8(9));

		// For G, summing leaves it 4 right (doubling made it not need more.)
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x000007E0));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x000007E0));
		LEA(32, resultReg, MRegSum(resultReg, colorIndexReg));
		SHL(32, R(resultReg), Imm8(5 - 1));
		// Now add G and R together.
		OR(32, R(resultReg), R(R12));

		// At B, we're free to modify the regs in place, finally.
		AND(32, R(color1Reg), Imm32(0x0000001F));
		AND(32, R(color2Reg), Imm32(0x0000001F));
		LEA(32, colorIndexReg, MRegSum(color1Reg, color2Reg));
		// We shift left 2 into position (not 3 due to doubling), then 16 more into the B slot.
		SHL(32, R(colorIndexReg), Imm8(16 + 2));
		// And combine into the result.
		OR(32, R(resultReg), R(colorIndexReg));

		POP(R13);
		POP(R12);
	}

	FixupBranch finishMix50 = J(true);

	// Simply load the 565 color, and convert to 0888.
	SetJumpTarget(handleSimple565);
	srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	MOVZX(32, 16, colorIndexReg, MComplex(srcReg, colorIndexReg, SCALE_2, 4));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	// DXT1 is done with this reg.
	if (id.TexFmt() == GE_TFMT_DXT1)
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	if (cpu_info.bBMI2_fast) {
		// We're only grabbing the high bits, no swizzle here.
		MOV(32, R(resultReg), Imm32(0x00F8FCF8));
		PDEP(32, resultReg, colorIndexReg, R(resultReg));
		BSWAP(32, resultReg);
		SHR(32, R(resultReg), Imm8(8));
	} else {
		// Start with R, shifting it into place.
		MOV(32, R(resultReg), R(colorIndexReg));
		AND(32, R(resultReg), Imm32(0x0000F800));
		SHR(32, R(resultReg), Imm8(8));

		// Then take G and shift it too.
		MOV(32, R(color2Reg), R(colorIndexReg));
		AND(32, R(color2Reg), Imm32(0x000007E0));
		SHL(32, R(color2Reg), Imm8(5));
		// And now combine with R, shifting that in the process.
		OR(32, R(resultReg), R(color2Reg));

		// Modify B in place and OR in.
		AND(32, R(colorIndexReg), Imm32(0x0000001F));
		SHL(32, R(colorIndexReg), Imm8(16 + 3));
		OR(32, R(resultReg), R(colorIndexReg));
	}
	FixupBranch finish565 = J(true);

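	// A note on the "divide by 3" below: as in many JITs, division is done by
	// multiplying with a rounded-up reciprocal and shifting. 0x0AAB is roughly
	// (1 << 13) / 3 and 0xAAAB is roughly (1 << 17) / 3, so (x * 0xAAAB) >> 17
	// gives x / 3 for the small values involved here.
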
	// Here we'll mix color1 and color2 by 2/3 (which of them gets the 2 depends on colorIndexReg.)
	SetJumpTarget(handleMix23);

	// If colorIndexReg is 2, it's color1Reg * 2 + color2Reg, but if colorIndexReg is 3, it's reversed.
	// Let's swap the regs in that case.
	CMP(32, R(colorIndexReg), Imm32(2));
	FixupBranch skipSwap23 = J_CC(CC_E);
	XCHG(32, R(color2Reg), R(color1Reg));
	SetJumpTarget(skipSwap23);

	if (cpu_info.bBMI2_fast) {
		// Gather B, G, and R and space them apart by 14 or 15 bits.
		MOV(64, R(colorIndexReg), Imm64(0x00001F0003F0001FULL));
		PDEP(64, color1Reg, color1Reg, R(colorIndexReg));
		PDEP(64, color2Reg, color2Reg, R(colorIndexReg));
		LEA(64, resultReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));

		// Now multiply all of them by a special constant to divide by 3.
		// This constant is (1 << 13) / 3, which is importantly less than 14 or 15.
		IMUL(64, resultReg, R(resultReg), Imm32(0x00000AAB));

		// Now extract the BGR values to 8 bits each.
		// We subtract 3 from 13 to get 8 from 5 bits, then 2 from 20 + 13, and 3 from 40 + 13.
		MOV(64, R(colorIndexReg), Imm64((0xFFULL << 10) | (0xFFULL << 31) | (0xFFULL << 50)));
		PEXT(64, resultReg, resultReg, R(colorIndexReg));

		// Finally swap B and R.
		BSWAP(32, resultReg);
		SHR(32, R(resultReg), Imm8(8));
	} else {
		// We'll need more regs. Grab two more to keep the stack aligned.
		PUSH(R12);
		PUSH(R13);

		// Start off with R, adding together first...
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x0000F800));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x0000F800));
		LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
		// We'll overflow if we divide here, so shift into place already.
		SHR(32, R(resultReg), Imm8(8));
		// Now we divide that by 3, by actually multiplying by AAAB and shifting off.
		IMUL(32, R12, R(resultReg), Imm32(0x0000AAAB));
		// Now we SHR off the extra bits we added on.
		SHR(32, R(R12), Imm8(17));

		// Now add up G. We leave this in place and shift right more.
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x000007E0));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x000007E0));
		LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
		// Again, multiply and now we use AAAB, this time masking.
		IMUL(32, resultReg, R(resultReg), Imm32(0x0000AAAB));
		SHR(32, R(resultReg), Imm8(17 - 5));
		AND(32, R(resultReg), Imm32(0x0000FF00));
		// Let's combine R in already.
		OR(32, R(resultReg), R(R12));

		// Now for B, it starts in the lowest place so we'll need to mask.
		AND(32, R(color1Reg), Imm32(0x0000001F));
		AND(32, R(color2Reg), Imm32(0x0000001F));
		LEA(32, colorIndexReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
		// Instead of shifting left, though, we multiply by a bit more.
		IMUL(32, colorIndexReg, R(colorIndexReg), Imm32(0x0002AAAB));
		AND(32, R(colorIndexReg), Imm32(0x00FF0000));
		OR(32, R(resultReg), R(colorIndexReg));

		POP(R13);
		POP(R12);
	}

	regCache_.Release(colorIndexReg, RegCache::GEN_TEMP0);
	regCache_.Release(color1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(color2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	SetJumpTarget(finishMix50);
	SetJumpTarget(finish565);
	// In all these cases, it's time to add in alpha. Zero doesn't get it.
	if (alpha != 0) {
		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		OR(32, R(resultReg), Imm32(alpha << 24));
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	}

	SetJumpTarget(finishZero);

	return true;
}
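
// DXT3 stores 4 explicit alpha bits per texel; DXT5 stores two 8-bit endpoints
// (a1, a2) plus a 3-bit selector per texel that either picks an endpoint, picks
// 0/255, or interpolates between the endpoints. The helper below ORs the decoded
// alpha into the top byte of the result.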

bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
	GETextureFormat fmt = id.TexFmt();

	// At this point, srcReg points at the block, and u/v are offsets inside it.

	bool success = false;
	if (fmt == GE_TFMT_DXT3) {
		Describe("DXT3A");
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

		if (id.linear) {
			// We precalculated the shift for the 64 bits of alpha data in vReg.
			if (!cpu_info.bBMI2) {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			}

			if (cpu_info.bBMI2) {
				SHRX(64, srcReg, MDisp(srcReg, 8), vReg);
			} else {
				MOV(64, R(srcReg), MDisp(srcReg, 8));
				MOV(32, R(RCX), R(vReg));
				SHR(64, R(srcReg), R(CL));
			}
			// This will mask the 4 bits we want using a wall also.
			SHL(32, R(srcReg), Imm8(28));
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			OR(32, R(resultReg), R(srcReg));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

			success = true;
		} else {
			X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);

			if (uReg != RCX && !cpu_info.bBMI2) {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			}

			X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
			MOVZX(32, 16, temp1Reg, MComplex(srcReg, vReg, SCALE_2, 8));
			if (cpu_info.bBMI2) {
				LEA(32, uReg, MScaled(uReg, SCALE_4, 0));
				SHRX(32, temp1Reg, R(temp1Reg), uReg);
			} else {
				// Still depending on it being GEN_SHIFTVAL or GEN_ARG_U above.
				LEA(32, RCX, MScaled(uReg, SCALE_4, 0));
				SHR(32, R(temp1Reg), R(CL));
			}
			SHL(32, R(temp1Reg), Imm8(28));
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			OR(32, R(resultReg), R(temp1Reg));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
			regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);

			success = true;

			regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
		}

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
		regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
		regCache_.ForceRelease(RegCache::GEN_ARG_V);
	} else if (fmt == GE_TFMT_DXT5) {
		Describe("DXT5A");

		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		X64Reg alphaIndexReg = INVALID_REG;
		if (id.linear) {
			// We precalculated the shift for the 64 bits of alpha data in vReg.
			if (cpu_info.bBMI2) {
				alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
				SHRX(64, alphaIndexReg, MDisp(srcReg, 8), vReg);
			} else {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);

				MOV(64, R(alphaIndexReg), MDisp(srcReg, 8));
				MOV(32, R(RCX), R(vReg));
				SHR(64, R(alphaIndexReg), R(CL));
			}
			regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
			regCache_.ForceRelease(RegCache::GEN_ARG_V);
		} else {
			X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
			if (uReg != RCX && !cpu_info.bBMI2)
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
			alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);

			// Let's figure out the alphaIndex bit offset so we can read the right byte.
			// bitOffset = (u + v * 4) * 3;
			LEA(32, uReg, MComplex(uReg, vReg, SCALE_4, 0));
			LEA(32, uReg, MComplex(uReg, uReg, SCALE_2, 0));
			regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

			if (cpu_info.bBMI2) {
				SHRX(64, alphaIndexReg, MDisp(srcReg, 8), uReg);
			} else {
				// And now the byte offset and bit from there, from those.
				MOV(32, R(alphaIndexReg), R(uReg));
				SHR(32, R(alphaIndexReg), Imm8(3));
				AND(32, R(uReg), Imm32(7));

				// Load 16 bits and mask, in case it straddles bytes.
				MOVZX(32, 16, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8));
				// If not, it's in what was bufwReg.
				if (uReg != RCX) {
					_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
					MOV(32, R(RCX), R(uReg));
				}
				SHR(32, R(alphaIndexReg), R(CL));
			}
			regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
		}

		X64Reg alpha1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
		X64Reg alpha2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

		AND(32, R(alphaIndexReg), Imm32(7));

		X64Reg temp3Reg = regCache_.Alloc(RegCache::GEN_TEMP3);

		// Okay, now check for 0 or 1 alphaIndex in alphaIndexReg, those are simple.
		CMP(32, R(alphaIndexReg), Imm32(1));
		FixupBranch handleSimple = J_CC(CC_BE, true);

		// Now load a1 and a2, since the rest depend on those values. Frees up srcReg.
		MOVZX(32, 8, alpha1Reg, MDisp(srcReg, 14));
		MOVZX(32, 8, alpha2Reg, MDisp(srcReg, 15));

		CMP(32, R(alpha1Reg), R(alpha2Reg));
		FixupBranch handleLerp8 = J_CC(CC_A);

		// Okay, check for zero or full alpha, at alphaIndex 6 or 7.
		CMP(32, R(alphaIndexReg), Imm32(6));
		FixupBranch finishZero = J_CC(CC_E, true);
		// Remember, MOV doesn't affect flags.
		MOV(32, R(srcReg), Imm32(0xFF));
		FixupBranch finishFull = J_CC(CC_A, true);

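		// In both lerp paths below, the index weights are scaled by 256 (shifted left
		// 8), and the two weights sum to 5 or 7, so the weighted sum has to be divided
		// by 5*256 or 7*256. As usual that's done with reciprocal multiplies: 0x3334
		// is roughly (1 << 16) / 5, and 0x124A is roughly (1 << 15) / 7, with the
		// extra subtract/add/shift dance correcting the /7 approximation.
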
		// At this point, we're handling a 6-step lerp between alpha1 and alpha2.
		SHL(32, R(alphaIndexReg), Imm8(8));
		// Prepare a multiplier in temp3Reg and multiply alpha1 by it.
		MOV(32, R(temp3Reg), Imm32(6 << 8));
		SUB(32, R(temp3Reg), R(alphaIndexReg));
		IMUL(32, alpha1Reg, R(temp3Reg));
		// And now the same for alpha2, using alphaIndexReg.
		SUB(32, R(alphaIndexReg), Imm32(1 << 8));
		IMUL(32, alpha2Reg, R(alphaIndexReg));

		// Let's skip a step and sum before dividing by 5, also adding the 31.
		LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 5 * 31));
		// To divide by 5, we will actually multiply by 0x3334 and shift.
		IMUL(32, srcReg, Imm32(0x3334));
		SHR(32, R(srcReg), Imm8(24));
		FixupBranch finishLerp6 = J(true);

		// This will be an 8-step lerp between alpha1 and alpha2.
		SetJumpTarget(handleLerp8);
		SHL(32, R(alphaIndexReg), Imm8(8));
		// Prepare a multiplier in temp3Reg and multiply alpha1 by it.
		MOV(32, R(temp3Reg), Imm32(8 << 8));
		SUB(32, R(temp3Reg), R(alphaIndexReg));
		IMUL(32, alpha1Reg, R(temp3Reg));
		// And now the same for alpha2, using alphaIndexReg.
		SUB(32, R(alphaIndexReg), Imm32(1 << 8));
		IMUL(32, alpha2Reg, R(alphaIndexReg));

		// And divide by 7 together here too, also adding the 31.
		LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 7 * 31));
		// Our magic constant here is 0x124A, but it's a bit more complex than just a shift.
		IMUL(32, alpha1Reg, R(srcReg), Imm32(0x124A));
		SHR(32, R(alpha1Reg), Imm8(15));
		SUB(32, R(srcReg), R(alpha1Reg));
		SHR(32, R(srcReg), Imm8(1));
		ADD(32, R(srcReg), R(alpha1Reg));
		SHR(32, R(srcReg), Imm8(10));

		FixupBranch finishLerp8 = J();

		SetJumpTarget(handleSimple);
		// Just load the specified alpha byte.
		MOVZX(32, 8, srcReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 14));

		regCache_.Release(alphaIndexReg, RegCache::GEN_TEMP0);
		regCache_.Release(alpha1Reg, RegCache::GEN_TEMP1);
		regCache_.Release(alpha2Reg, RegCache::GEN_TEMP2);
		regCache_.Release(temp3Reg, RegCache::GEN_TEMP3);

		SetJumpTarget(finishFull);
		SetJumpTarget(finishLerp6);
		SetJumpTarget(finishLerp8);

		SHL(32, R(srcReg), Imm8(24));
		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		OR(32, R(resultReg), R(srcReg));
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
		success = true;

		SetJumpTarget(finishZero);

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
	}

	_dbg_assert_(success);
	return success;
}
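
// For non-swizzled data, the texel address is simply src + (v * bufw + u) scaled
// by the texel size; 4-bit textures address the byte at u/2 and then pick the low
// or high nibble depending on whether u was odd.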

bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
	if (id.swizzle) {
		return Jit_GetTexDataSwizzled(id, bitsPerTexel);
	}

	_assert_msg_(!id.linear, "Should not use this path for linear");
	Describe("TexData");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	// srcReg might be EDX, so let's use it and uReg (folding them into temp1Reg) before we multiply.
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	bool success = true;
	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		LEA(64, temp1Reg, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		if (cpu_info.bBMI2_fast)
			MOV(32, R(temp2Reg), Imm32(0x0F));
		else
			XOR(32, R(temp2Reg), R(temp2Reg));
		SHR(32, R(uReg), Imm8(1));
		FixupBranch skip = J_CC(CC_NC);
		// Track whether we shifted a 1 off or not.
		if (cpu_info.bBMI2_fast)
			SHL(32, R(temp2Reg), Imm8(4));
		else
			MOV(32, R(temp2Reg), Imm32(4));
		SetJumpTarget(skip);
		LEA(64, temp1Reg, MRegSum(srcReg, uReg));
		break;
	}

	default:
		success = false;
		break;
	}
	// All done with u and texptr.
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
	MOV(32, R(resultReg), R(vReg));
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	IMUL(32, resultReg, R(bufwReg));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	if (bitsPerTexel == 4 && !cpu_info.bBMI2) {
		bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
		_assert_(hasRCX);
	}

	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		MOVZX(32, bitsPerTexel, resultReg, MComplex(temp1Reg, resultReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		SHR(32, R(resultReg), Imm8(1));
		if (cpu_info.bBMI2_fast) {
			MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
			PEXT(32, resultReg, resultReg, R(temp2Reg));
		} else if (cpu_info.bBMI2) {
			SHRX(32, resultReg, MRegSum(temp1Reg, resultReg), temp2Reg);
			AND(32, R(resultReg), Imm8(0x0F));
		} else {
			MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
			// RCX is now free.
			MOV(8, R(RCX), R(temp2Reg));
			SHR(8, R(resultReg), R(RCX));
			// Zero out any bits not shifted off.
			AND(32, R(resultReg), Imm8(0x0F));
		}
		break;
	}

	default:
		success = false;
		break;
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return success;
}
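
// Swizzled textures are stored as tiles of 16 bytes by 8 rows laid out linearly,
// so the offset math below first finds the tile and then the byte within it.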

bool SamplerJitCache::Jit_GetTexDataSwizzled4(const SamplerID &id) {
	Describe("TexDataS4");
	_assert_msg_(!id.linear, "Should not use this path for linear");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

	// Get the horizontal tile pos into temp1Reg.
	LEA(32, temp1Reg, MScaled(uReg, SCALE_4, 0));
	// Note: imm8 sign extends negative.
	AND(32, R(temp1Reg), Imm8(~127));

	// Add vertical offset inside tile to temp1Reg.
	LEA(32, temp2Reg, MScaled(vReg, SCALE_4, 0));
	AND(32, R(temp2Reg), Imm8(31));
	LEA(32, temp1Reg, MComplex(temp1Reg, temp2Reg, SCALE_4, 0));
	// Add srcReg, since we'll need it at some point.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	ADD(64, R(temp1Reg), R(srcReg));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	// Now find the vertical tile pos, and add to temp1Reg.
	SHR(32, R(vReg), Imm8(3));
	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	LEA(32, temp2Reg, MScaled(bufwReg, SCALE_4, 0));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	IMUL(32, temp2Reg, R(vReg));
	ADD(64, R(temp1Reg), R(temp2Reg));
	// We no longer have a good value in vReg.
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	// Last and possibly also least, the horizontal offset inside the tile.
	AND(32, R(uReg), Imm8(31));
	SHR(32, R(uReg), Imm8(1));
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	MOV(8, R(resultReg), MRegSum(temp1Reg, uReg));
	FixupBranch skipNonZero = J_CC(CC_NC);
	// If the horizontal offset was odd, take the upper 4.
	SHR(8, R(resultReg), Imm8(4));
	SetJumpTarget(skipNonZero);
	// Zero out the rest of the bits.
	AND(32, R(resultReg), Imm8(0x0F));
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	// This destroyed u as well.
	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	return true;
}

bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4(id);
	}

	bool success = true;
	_assert_msg_(!id.linear, "Should not use this path for linear");

	Describe("TexDataS");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

	LEA(32, temp1Reg, MScaled(vReg, SCALE_4, 0));
	AND(32, R(temp1Reg), Imm8(31));
	AND(32, R(vReg), Imm8(~7));

	MOV(32, R(temp2Reg), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(temp2Reg), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(temp2Reg), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		success = false;
		break;
	}
	AND(32, R(temp2Reg), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(temp1Reg), R(temp2Reg));
	ADD(32, R(temp1Reg), R(resultReg));

	// We may clobber srcReg in the multiply, so let's grab it now.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	LEA(64, temp1Reg, MComplex(srcReg, temp1Reg, SCALE_4, 0));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	LEA(32, resultReg, MScaled(bufwReg, SCALE_4, 0));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	IMUL(32, resultReg, R(vReg));
	// We no longer have a good value in vReg.
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(temp1Reg, resultReg));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		LEA(32, resultReg, MComplex(resultReg, uReg, SCALE_2, 0));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(resultReg), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
		break;
	default:
		success = false;
		break;
	}

	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return success;
}
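
// Texel coordinates are computed in 8.8 fixed point: s and t are scaled by
// width*256 and height*256, truncated, then shifted down by 8, and finally
// clamped or wrapped to the level's size.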

bool SamplerJitCache::Jit_GetTexelCoords(const SamplerID &id) {
	Describe("Texel");

	X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
	X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
	X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
	if (id.hasAnyMips) {
		// We have to figure out levels and the proper width, ugh.
		X64Reg idReg = GetSamplerID();
		X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);

		X64Reg levelReg = INVALID_REG;
		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
		} else {
			levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
			MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
		}

		// We'll multiply these at the same time, so it's nice to put together.
		UNPCKLPS(sReg, R(tReg));
		SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));

		X64Reg sizesReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
		} else {
			MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLWD(sizesReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}

		// We just want this value as a float, times 256.
		PSLLD(sizesReg, 8);
		CVTDQ2PS(sizesReg, R(sizesReg));

		// Okay, we can multiply now, and convert back to integer.
		MULPS(sReg, R(sizesReg));
		CVTTPS2DQ(sReg, R(sReg));
		regCache_.Release(sizesReg, RegCache::VEC_TEMP0);

		PSRAD(sReg, 8);

		// Reuse tReg for the level1 values.
		if (!cpu_info.bSSE4_1)
			PSHUFD(tReg, R(sReg), _MM_SHUFFLE(3, 2, 3, 2));

		auto applyClampWrap = [&](X64Reg dest, bool clamp, bool isY, bool isLevel1) {
			int offset = offsetof(SamplerID, cached.sizes[0].w) + (isY ? 2 : 0) + (isLevel1 ? 4 : 0);
			// Grab the size, already pre-shifted for us.
			MOVZX(32, 16, tempReg, MComplex(idReg, levelReg, SCALE_4, offset));

			// Grab the size from the multiply.
			if (cpu_info.bSSE4_1) {
				if (isY || isLevel1)
					PEXTRD(R(dest), sReg, (isY ? 1 : 0) + (isLevel1 ? 2 : 0));
				else
					MOVD_xmm(R(dest), sReg);
			} else {
				X64Reg srcReg = isLevel1 ? tReg : sReg;
				MOVD_xmm(R(dest), srcReg);
				if (!isY)
					PSRLDQ(srcReg, 4);
			}

			SUB(32, R(tempReg), Imm8(1));
			AND(32, R(tempReg), Imm32(0x000001FF));
			if (clamp) {
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_G);
				XOR(32, R(tempReg), R(tempReg));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_L);
			} else {
				AND(32, R(dest), R(tempReg));
			}
		};

		// Do the next level first, so we can save them and reuse the regs.
		// Note: for non-SSE4, this must be in S/T order.
		applyClampWrap(uReg, id.clampS, false, true);
		applyClampWrap(vReg, id.clampT, true, true);

		// Okay, now stuff them on the stack - we'll load them again later.
		MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0), R(uReg));
		MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4), R(vReg));

		// And then the given level.
		// Note: for non-SSE4, this must be in S/T order.
		applyClampWrap(uReg, id.clampS, false, false);
		applyClampWrap(vReg, id.clampT, true, false);

		UnlockSamplerID(idReg);
		regCache_.Release(tempReg, RegCache::GEN_TEMP0);
		regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
	} else {
		// Multiply, then convert to integer...
		UNPCKLPS(sReg, R(tReg));
		MULPS(sReg, M(constWidthHeight256f_));
		CVTTPS2DQ(sReg, R(sReg));
		// Great, shift out the fraction.
		PSRAD(sReg, 8);

		// Square textures are kinda common.
		bool clampApplied = false;
		if (id.width0Shift == id.height0Shift) {
			if (!id.clampS && !id.clampT) {
				PAND(sReg, M(constWidthMinus1i_));
				clampApplied = true;
			} else if (id.clampS && id.clampT && cpu_info.bSSE4_1) {
				X64Reg zeroReg = GetZeroVec();
				PMINSD(sReg, M(constWidthMinus1i_));
				PMAXSD(sReg, R(zeroReg));
				regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
				clampApplied = true;
			}
		}

		// Now extract to do the clamping (unless we already did it.)
		MOVQ_xmm(R(uReg), sReg);
		MOV(64, R(vReg), R(uReg));
		SHR(64, R(vReg), Imm8(32));
		// Strip off the top bits (the 32-bit op zero-extends, clearing the high half.)
		AND(32, R(uReg), R(uReg));

		auto applyClampWrap = [this](X64Reg dest, bool clamp, uint8_t shift) {
			// Clamp and wrap both max out at 512.
			if (shift > 9)
				shift = 9;

			if (clamp) {
				X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
				MOV(32, R(tempReg), Imm32((1 << shift) - 1));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_G);
				XOR(32, R(tempReg), R(tempReg));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_L);
				regCache_.Release(tempReg, RegCache::GEN_TEMP0);
			} else {
				AND(32, R(dest), Imm32((1 << shift) - 1));
			}
		};

		// Now apply clamp/wrap.
		if (!clampApplied) {
			applyClampWrap(uReg, id.clampS, id.width0Shift);
			applyClampWrap(vReg, id.clampT, id.height0Shift);
		}
	}

	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRetain(RegCache::GEN_ARG_U);
	regCache_.ForceRetain(RegCache::GEN_ARG_V);

	// And get rid of S and T, we're done with them now.
	regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
	regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
	regCache_.ForceRelease(RegCache::VEC_ARG_S);
	regCache_.ForceRelease(RegCache::VEC_ARG_T);

	return true;
}
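
// For bilinear filtering we need two texels per axis, so this produces the X and Y
// coordinates for the four samples at once (plus the next mip level's set when
// mipmapping), along with the 4-bit fraction used to weight them, extracted before
// the coordinates are shifted down.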

bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
	Describe("TexelQuad");

	X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
	X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);

	// We use this if there are mips later, to apply wrap/clamp.
	X64Reg sizesReg = INVALID_REG;

	// Start by multiplying with the width/height... which might be complex with mips.
	if (id.hasAnyMips) {
		// We have to figure out levels and the proper width, ugh.
		X64Reg idReg = GetSamplerID();

		X64Reg levelReg = INVALID_REG;
		// To avoid ABI problems, we don't hold onto level.
		bool releaseLevelReg = !regCache_.Has(RegCache::GEN_ARG_LEVEL);
		if (!releaseLevelReg) {
			levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
		} else {
			releaseLevelReg = true;
			levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
			MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
		}

		// This will load the current and next level's sizes, 16x4.
		sizesReg = regCache_.Alloc(RegCache::VEC_TEMP5);
		// We actually want this in 32-bit, though, so extend.
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
		} else {
			MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLWD(sizesReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}

		if (releaseLevelReg)
			regCache_.Release(levelReg, RegCache::GEN_ARG_LEVEL);
		else
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
		UnlockSamplerID(idReg);

		// Now make a float version of sizesReg, times 256.
		X64Reg sizes256Reg = regCache_.Alloc(RegCache::VEC_TEMP0);
		PSLLD(sizes256Reg, sizesReg, 8);
		CVTDQ2PS(sizes256Reg, R(sizes256Reg));

		// Next off, move S and T into a single reg, which will become U0 V0 U1 V1.
		UNPCKLPS(sReg, R(tReg));
		SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));
		// And multiply by the sizes, all lined up already.
		MULPS(sReg, R(sizes256Reg));
		regCache_.Release(sizes256Reg, RegCache::VEC_TEMP0);

		// For wrap/clamp purposes, we want width or height minus one. Do that now.
		PSUBD(sizesReg, M(constOnes32_));
		PAND(sizesReg, M(constMaxTexel32_));
	} else {
		// Easy mode.
		UNPCKLPS(sReg, R(tReg));
		MULPS(sReg, M(constWidthHeight256f_));
	}

	// And now, convert to integers for all later processing.
	CVTPS2DQ(sReg, R(sReg));

	// Now adjust X and Y...
	X64Reg tempXYReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	// Produce a -128 constant.
	PCMPEQD(tempXYReg, R(tempXYReg));
	PSLLD(tempXYReg, 7);
	PADDD(sReg, R(tempXYReg));
	regCache_.Release(tempXYReg, RegCache::VEC_TEMP0);

	// We do want the fraction, though, so extract that to an XMM for later.
	X64Reg allFracReg = INVALID_REG;
	if (regCache_.Has(RegCache::VEC_FRAC))
		allFracReg = regCache_.Find(RegCache::VEC_FRAC);
	else
		allFracReg = regCache_.Alloc(RegCache::VEC_FRAC);
	// We only want the four bits after the first four, though.
	PSLLD(allFracReg, sReg, 24);
	PSRLD(allFracReg, 28);
	// It's convenient later if this is in the low words only.
	PACKSSDW(allFracReg, R(allFracReg));
	regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
	regCache_.ForceRetain(RegCache::VEC_FRAC);

	// With those extracted, we can now get rid of the fractional bits.
	PSRAD(sReg, 8);

	// Now it's time to separate the lanes into separate registers and add next UV offsets.
	if (id.hasAnyMips) {
		X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
		X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
		PSHUFD(u1Reg, R(sReg), _MM_SHUFFLE(2, 2, 2, 2));
		PSHUFD(v1Reg, R(sReg), _MM_SHUFFLE(3, 3, 3, 3));
		PADDD(u1Reg, M(constUNext_));
		PADDD(v1Reg, M(constVNext_));
		regCache_.Unlock(u1Reg, RegCache::VEC_U1);
		regCache_.Unlock(v1Reg, RegCache::VEC_V1);
	}

	PSHUFD(tReg, R(sReg), _MM_SHUFFLE(1, 1, 1, 1));
	PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
	PADDD(tReg, M(constVNext_));
	PADDD(sReg, M(constUNext_));

	X64Reg temp0ClampReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	bool temp0ClampZero = false;

	auto doClamp = [&](bool clamp, X64Reg stReg, const OpArg &bound) {
		if (!clamp) {
			// Wrapping is easy.
			PAND(stReg, bound);
			return;
		}

		if (!temp0ClampZero)
			PXOR(temp0ClampReg, R(temp0ClampReg));
		temp0ClampZero = true;

		if (cpu_info.bSSE4_1) {
			PMINSD(stReg, bound);
			PMAXSD(stReg, R(temp0ClampReg));
		} else {
			temp0ClampZero = false;
			// Set temp to max(0, stReg) = AND(NOT(0 > stReg), stReg).
			PCMPGTD(temp0ClampReg, R(stReg));
			PANDN(temp0ClampReg, R(stReg));

			// Now make a mask where bound is greater than the ST value in temp0ClampReg.
			if (cpu_info.bAVX && bound.IsSimpleReg()) {
				VPCMPGTD(128, stReg, bound.GetSimpleReg(), R(temp0ClampReg));
			} else {
				MOVDQA(stReg, bound);
				PCMPGTD(stReg, R(temp0ClampReg));
			}
			// Throw away the values that are greater in our temp0ClampReg in progress result.
			PAND(temp0ClampReg, R(stReg));

			// Now, set bound only where ST was too high.
			PANDN(stReg, bound);
			// And put in the values that were fine.
			POR(stReg, R(temp0ClampReg));
		}
	};

	if (id.hasAnyMips) {
		// We'll spread sizes out into a temp.
		X64Reg spreadSizeReg = regCache_.Alloc(RegCache::VEC_TEMP1);

		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(0, 0, 0, 0));
		doClamp(id.clampS, sReg, R(spreadSizeReg));
		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(1, 1, 1, 1));
		doClamp(id.clampT, tReg, R(spreadSizeReg));
		X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
		X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(2, 2, 2, 2));
		doClamp(id.clampS, u1Reg, R(spreadSizeReg));
		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(3, 3, 3, 3));
		doClamp(id.clampT, v1Reg, R(spreadSizeReg));
		regCache_.Unlock(u1Reg, RegCache::VEC_U1);
		regCache_.Unlock(v1Reg, RegCache::VEC_V1);

		regCache_.Release(spreadSizeReg, RegCache::VEC_TEMP1);
	} else {
		doClamp(id.clampS, sReg, M(constWidthMinus1i_));
		doClamp(id.clampT, tReg, M(constHeightMinus1i_));
	}

	if (sizesReg != INVALID_REG)
		regCache_.Release(sizesReg, RegCache::VEC_TEMP5);
	regCache_.Release(temp0ClampReg, RegCache::VEC_TEMP0);

	regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
	regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
	regCache_.Change(RegCache::VEC_ARG_S, RegCache::VEC_ARG_U);
	regCache_.Change(RegCache::VEC_ARG_T, RegCache::VEC_ARG_V);
	return true;
}
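
// A negative bit count below is used to pass the DXT block size in bytes instead
// of bits per texel.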

bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1) {
	_assert_(id.linear);

	bool success = true;
	int bits = 0;
	switch (id.TexFmt()) {
	case GE_TFMT_5650:
	case GE_TFMT_5551:
	case GE_TFMT_4444:
	case GE_TFMT_CLUT16:
		bits = 16;
		break;

	case GE_TFMT_8888:
	case GE_TFMT_CLUT32:
		bits = 32;
		break;

	case GE_TFMT_CLUT8:
		bits = 8;
		break;

	case GE_TFMT_CLUT4:
		bits = 4;
		break;

	case GE_TFMT_DXT1:
		bits = -8;
		break;

	case GE_TFMT_DXT3:
	case GE_TFMT_DXT5:
		bits = -16;
		break;

	default:
		success = false;
	}

	if (success && bits != 0) {
		if (bits < 0) {
			success = Jit_PrepareDataDXTOffsets(id, uReg, vReg, level1, -bits);
		} else if (id.swizzle) {
			success = Jit_PrepareDataSwizzledOffsets(id, uReg, vReg, level1, bits);
		} else {
			success = Jit_PrepareDataDirectOffsets(id, uReg, vReg, level1, bits);
		}
	}

	return success;
}
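
// Computes, per SIMD lane, the byte offset (v * bufw + u) * bytesPerTexel for the
// four samples. Without SSE4.1's PMULLD, the 32-bit multiply is emulated with two
// PMULUDQ ops on the even and odd lanes.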

bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {
	Describe("DataOff");
	X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!id.useStandardBufw || id.hasAnyMips) {
		// Spread bufw into each lane.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
		} else {
			PXOR(bufwVecReg, R(bufwVecReg));
			PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
		}
		PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);

		if (bitsPerTexel == 4)
			PSRLD(bufwVecReg, 1);
		else if (bitsPerTexel == 16)
			PSLLD(bufwVecReg, 1);
		else if (bitsPerTexel == 32)
			PSLLD(bufwVecReg, 2);
	}

	if (id.useStandardBufw && !id.hasAnyMips) {
		int amt = id.width0Shift;
		if (bitsPerTexel == 4)
			amt -= 1;
		else if (bitsPerTexel == 16)
			amt += 1;
		else if (bitsPerTexel == 32)
			amt += 2;
		// It's aligned to 16 bytes, so must at least be 16.
		PSLLD(vReg, std::max(4, amt));
	} else if (cpu_info.bSSE4_1) {
		// And now multiply. This is slow, but not worse than the SSE2 version...
		PMULLD(vReg, R(bufwVecReg));
	} else {
		// Copy that into another temp for multiply.
		X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		MOVDQA(vOddLaneReg, R(vReg));

		// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
		PMULUDQ(vReg, R(bufwVecReg));
		PSRLDQ(vOddLaneReg, 4);
		PSRLDQ(bufwVecReg, 4);
		// And now get XXXX DDDD XXXX BBBB.
		PMULUDQ(vOddLaneReg, R(bufwVecReg));

		// We know everything is positive, so XXXX must be zero. Let's combine.
		PSLLDQ(vOddLaneReg, 4);
		POR(vReg, R(vOddLaneReg));
		regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
	}
	regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);

	if (bitsPerTexel == 4) {
		// Need to keep uvec for the odd bit.
		X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		MOVDQA(uCopyReg, R(uReg));
		PSRLD(uCopyReg, 1);
		PADDD(vReg, R(uCopyReg));
		regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);
	} else {
		// Destroy uvec, we won't use it again.
		if (bitsPerTexel == 16)
			PSLLD(uReg, 1);
		else if (bitsPerTexel == 32)
			PSLLD(uReg, 2);
		PADDD(vReg, R(uReg));
	}

	return true;
}
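
// Same idea as above, but for swizzled data: the byte offset is built from the tile
// row ((v / 8) * bufw * bitsPerTexel), the row within the tile ((v & 7) * 16), the
// tile column ((u / texels_per_tile) * 128), and the byte within the tile row.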

bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {
	Describe("DataOffS");
	// See Jit_GetTexDataSwizzled() for usage of this offset.

	X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!id.useStandardBufw || id.hasAnyMips) {
		// Spread bufw into each lane.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
		} else {
			PXOR(bufwVecReg, R(bufwVecReg));
			PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
		}
		PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
	}

	// Divide vvec by 8 in a temp.
	X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);
	PSRLD(vMultReg, vReg, 3);

	// And now multiply by bufw. May be able to use a shift in a common case.
	int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);
	if (id.useStandardBufw && !id.hasAnyMips) {
		int amt = id.width0Shift;
		// Account for 16 byte minimum.
		amt = std::max(7 - shiftAmount, amt);
		shiftAmount += amt;
	} else if (cpu_info.bSSE4_1) {
		// And now multiply. This is slow, but not worse than the SSE2 version...
		PMULLD(vMultReg, R(bufwVecReg));
	} else {
		// Copy that into another temp for multiply.
		X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP2);
		MOVDQA(vOddLaneReg, R(vMultReg));

		// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
		PMULUDQ(vMultReg, R(bufwVecReg));
		PSRLDQ(vOddLaneReg, 4);
		PSRLDQ(bufwVecReg, 4);
		// And now get XXXX DDDD XXXX BBBB.
		PMULUDQ(vOddLaneReg, R(bufwVecReg));

		// We know everything is positive, so XXXX must be zero. Let's combine.
		PSLLDQ(vOddLaneReg, 4);
		POR(vMultReg, R(vOddLaneReg));
		regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP2);
	}
	regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);

	// Multiply the result by bitsPerTexel using a shift.
	PSLLD(vMultReg, shiftAmount);

	// Now we're adding (v & 7) * 16. Use a 16-bit wall.
	PSLLW(vReg, 13);
	PSRLD(vReg, 9);
	PADDD(vReg, R(vMultReg));
	regCache_.Release(vMultReg, RegCache::VEC_TEMP1);

	// Now get ((uvec / texels_per_tile) / 4) * 32 * 4 aka (uvec / (128 / bitsPerTexel)) << 7.
	X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	PSRLD(uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
	PSLLD(uCopyReg, 7);
	// Add it in to our running total.
	PADDD(vReg, R(uCopyReg));

	if (bitsPerTexel == 4) {
		// Finally, we want (uvec & 31) / 2. Use a 16-bit wall.
		PSLLW(uCopyReg, uReg, 11);
		PSRLD(uCopyReg, 12);
		// With that, this is our byte offset. uvec & 1 has which half.
		PADDD(vReg, R(uCopyReg));
	} else {
		// We can destroy uvec in this path. Clear all but 2 bits for 32, 3 for 16, or 4 for 8.
		PSLLW(uReg, 32 - clz32_nonzero(bitsPerTexel - 1) + 9);
		// Now that it's at the top of the 16 bits, we always shift that to the top of 4 bits.
		PSRLD(uReg, 12);
		PADDD(vReg, R(uReg));
	}
	regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);

	return true;
}
3146
3147
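// The function above produces, per lane, a byte offset into the PSP's swizzled texture
// layout, where the texture is stored as 16-byte-wide, 8-row tiles laid out left to
// right, top to bottom. A scalar sketch of the offset it computes, for illustration
// only; this helper is not used by the JIT and its name is made up.
static inline u32 RefSwizzledByteOffset(u32 u, u32 v, u32 bufw, int bitsPerTexel) {
	u32 rowBytes = bufw * bitsPerTexel / 8;            // Bytes per row of texels.
	u32 byteU = u * bitsPerTexel / 8;                  // Horizontal position in bytes (rounds down for 4-bit.)
	u32 tileRowOffset = (v / 8) * rowBytes * 8;        // Skip whole rows of 8-line tiles.
	u32 tileColOffset = (byteU / 16) * 128;            // Skip whole 16x8 = 128-byte tiles within this row.
	return tileRowOffset + tileColOffset + (v & 7) * 16 + (byteU & 15);
}
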
bool SamplerJitCache::Jit_PrepareDataDXTOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int blockSize) {
	Describe("DataOffDXT");
	// We need to get the block's offset, which is:
	// blockPos = src + (v/4 * bufw/4 + u/4) * blockSize
	// We distribute the blockSize constant for convenience:
	// blockPos = src + (blockSize*v/4 * bufw/4 + blockSize*u/4)

	X64Reg baseVReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
	// This gives us the V factor for the block, which we multiply by bufw.
	PSRLD(baseVReg, vReg, 2);
	PSLLD(baseVReg, blockSize == 16 ? 4 : 3);

	X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!id.useStandardBufw || id.hasAnyMips) {
		// Spread bufw into each lane.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
		} else {
			PXOR(bufwVecReg, R(bufwVecReg));
			PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
		}
		PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);

		// Divide by 4 before the multiply.
		PSRLD(bufwVecReg, 2);
	}

	if (id.useStandardBufw && !id.hasAnyMips) {
		int amt = id.width0Shift - 2;
		if (amt < 0)
			PSRLD(baseVReg, -amt);
		else if (amt > 0)
			PSLLD(baseVReg, amt);
	} else if (cpu_info.bSSE4_1) {
		// And now multiply. This is slow, but not worse than the SSE2 version...
		PMULLD(baseVReg, R(bufwVecReg));
	} else {
		// Copy that into another temp for multiply.
		X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		MOVDQA(vOddLaneReg, R(baseVReg));

		// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
		PMULUDQ(baseVReg, R(bufwVecReg));
		PSRLDQ(vOddLaneReg, 4);
		PSRLDQ(bufwVecReg, 4);
		// And now get XXXX DDDD XXXX BBBB.
		PMULUDQ(vOddLaneReg, R(bufwVecReg));

		// We know everything is positive, so XXXX must be zero. Let's combine.
		PSLLDQ(vOddLaneReg, 4);
		POR(baseVReg, R(vOddLaneReg));
		regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
	}
	regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);

	// Now add in the U factor for the block.
	X64Reg baseUReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	PSRLD(baseUReg, uReg, 2);
	PSLLD(baseUReg, blockSize == 16 ? 4 : 3);
	PADDD(baseVReg, R(baseUReg));
	regCache_.Release(baseUReg, RegCache::VEC_TEMP0);

	// Okay, the base index (block byte offset from src) is ready.
	regCache_.Unlock(baseVReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
	regCache_.ForceRetain(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);

	// For everything else, we only want the low two bits of U and V.
	PSLLD(uReg, 30);
	PSLLD(vReg, 30);

	X64Reg alphaTempRegU = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5)
		PSRLD(alphaTempRegU, uReg, 30);

	PSRLD(uReg, 30 - 1);
	PSRLD(vReg, 30 - 3);
	// At this point, uReg is now the bit offset of the color index.
	PADDD(uReg, R(vReg));

	// Grab the alpha index into vReg next.
	if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5) {
		PSRLD(vReg, 1);
		PADDD(vReg, R(alphaTempRegU));

		if (id.TexFmt() == GE_TFMT_DXT3) {
			PSLLD(vReg, 2);
		} else if (id.TexFmt() == GE_TFMT_DXT5) {
			// Multiply by 3.
			PSLLD(alphaTempRegU, vReg, 1);
			PADDD(vReg, R(alphaTempRegU));
		}
	}
	regCache_.Release(alphaTempRegU, RegCache::VEC_TEMP0);

	return true;
}

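// The function above leaves three values per lane: the block's byte offset from the
// texture pointer, the bit offset of the texel's 2-bit color index inside the block's
// 32-bit index word, and (for DXT3/DXT5) the bit offset of its alpha index. A scalar
// sketch of those formulas, for illustration only; this is a hypothetical helper, not
// used by the JIT.
static inline void RefDXTOffsets(u32 u, u32 v, u32 bufw, int blockSize, int alphaBitsPerTexel,
		u32 *blockByteOffset, u32 *colorIndexBitOffset, u32 *alphaIndexBitOffset) {
	// Blocks are 4x4 texels; a row of blocks is bufw / 4 blocks wide.
	*blockByteOffset = ((v / 4) * (bufw / 4) + (u / 4)) * blockSize;
	// Color indices are 2 bits per texel, packed row by row.
	*colorIndexBitOffset = (v & 3) * 8 + (u & 3) * 2;
	// Alpha indices are 4 bits (DXT3) or 3 bits (DXT5) per texel, also row by row.
	*alphaIndexBitOffset = ((v & 3) * 4 + (u & 3)) * alphaBitsPerTexel;
}
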
bool SamplerJitCache::Jit_DecodeQuad(const SamplerID &id, bool level1) {
	GETextureFormat decodeFmt = id.TexFmt();
	switch (id.TexFmt()) {
	case GE_TFMT_CLUT32:
	case GE_TFMT_CLUT16:
	case GE_TFMT_CLUT8:
	case GE_TFMT_CLUT4:
		// The values match, so just use the clut fmt.
		decodeFmt = (GETextureFormat)id.ClutFmt();
		break;

	default:
		// We'll decode below.
		break;
	}

	bool success = true;
	X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);

	switch (decodeFmt) {
	case GE_TFMT_5650:
		success = Jit_Decode5650Quad(id, quadReg);
		break;

	case GE_TFMT_5551:
		success = Jit_Decode5551Quad(id, quadReg);
		break;

	case GE_TFMT_4444:
		success = Jit_Decode4444Quad(id, quadReg);
		break;

	default:
		// Doesn't need decoding.
		break;
	}

	regCache_.Unlock(quadReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
	return success;
}

bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
	Describe("5650Quad");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);

	// Filter out red only into temp1. We do this by shifting into a wall.
	PSLLD(temp1Reg, quadReg, 32 - 5);
	// Move it right to the top of the 8 bits.
	PSRLD(temp1Reg, 24);

	// Now we bring in blue, since it's also 5 like red.
	// Luckily, we know the top 16 bits are zero. Shift right into a wall.
	PSRLD(temp2Reg, quadReg, 11);
	// Shift blue into place at 19, and merge back to temp1.
	PSLLD(temp2Reg, 19);
	POR(temp1Reg, R(temp2Reg));

	// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
	PSLLD(temp2Reg, temp1Reg, 1);

	// We go to green last because it's the different one. Shift off red and blue.
	PSRLD(quadReg, 5);
	// Use a word shift to put a wall just at the right place, top 6 bits of second byte.
	PSLLW(quadReg, 10);
	// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)
	POR(temp2Reg, R(quadReg));
	POR(quadReg, R(temp1Reg));

	// Now shift and mask temp2 for swizzle.
	PSRLD(temp2Reg, 6);
	PAND(temp2Reg, M(const5650Swizzle_));
	// And then OR that in too. Only alpha left now.
	POR(quadReg, R(temp2Reg));

	if (id.useTextureAlpha) {
		// Just put a fixed FF in. Maybe we could even avoid this and act like it's FF later...
		PCMPEQD(temp2Reg, R(temp2Reg));
		PSLLD(temp2Reg, 24);
		POR(quadReg, R(temp2Reg));
	}

	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
	return true;
}

bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {
	Describe("5650");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	if (cpu_info.bBMI2_fast) {
		// Start off with the high bits.
		MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
		PDEP(32, temp1Reg, resultReg, R(temp1Reg));
		if (id.useTextureAlpha || id.fetch)
			OR(32, R(temp1Reg), Imm32(0xFF000000));

		// Now grab the low bits (they end up packed.)
		MOV(32, R(temp2Reg), Imm32(0x0000E61C));
		PEXT(32, resultReg, resultReg, R(temp2Reg));
		// And spread them back out.
		MOV(32, R(temp2Reg), Imm32(0x00070307));
		PDEP(32, resultReg, resultReg, R(temp2Reg));

		// Finally put the high bits in, we're done.
		OR(32, R(resultReg), R(temp1Reg));
	} else {
		MOV(32, R(temp2Reg), R(resultReg));
		AND(32, R(temp2Reg), Imm32(0x0000001F));

		// B (we do R and B at the same time, they're both 5.)
		MOV(32, R(temp1Reg), R(resultReg));
		AND(32, R(temp1Reg), Imm32(0x0000F800));
		SHL(32, R(temp1Reg), Imm8(5));
		OR(32, R(temp2Reg), R(temp1Reg));

		// Expand 5 -> 8. At this point we have 00BB00RR.
		MOV(32, R(temp1Reg), R(temp2Reg));
		SHL(32, R(temp2Reg), Imm8(3));
		SHR(32, R(temp1Reg), Imm8(2));
		OR(32, R(temp2Reg), R(temp1Reg));
		AND(32, R(temp2Reg), Imm32(0x00FF00FF));

		// Now's as good a time to put in A as any.
		if (id.useTextureAlpha || id.fetch)
			OR(32, R(temp2Reg), Imm32(0xFF000000));

		// Last, we need to align, extract, and expand G.
		// 3 to align to G, and then 2 to expand to 8.
		SHL(32, R(resultReg), Imm8(3 + 2));
		AND(32, R(resultReg), Imm32(0x0000FC00));
		MOV(32, R(temp1Reg), R(resultReg));
		// 2 to account for resultReg being preshifted, 4 for expansion.
		SHR(32, R(temp1Reg), Imm8(2 + 4));
		OR(32, R(resultReg), R(temp1Reg));
		AND(32, R(resultReg), Imm32(0x0000FF00));
		OR(32, R(resultReg), R(temp2Reg));
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

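// Both 5650 decoders above implement the usual 565 -> 8888 expansion: each channel's
// top bits are replicated into the new low bits so 0x1F and 0x3F both map to 0xFF.
// A plain scalar equivalent, for reference only; this hypothetical helper is not part
// of the JIT.
static inline u32 RefDecode5650(u16 c) {
	u32 r = (c >> 0) & 0x1F;
	u32 g = (c >> 5) & 0x3F;
	u32 b = (c >> 11) & 0x1F;
	r = (r << 3) | (r >> 2);
	g = (g << 2) | (g >> 4);
	b = (b << 3) | (b >> 2);
	// 5650 has no alpha; the JIT only writes the FF byte when alpha is actually used.
	return 0xFF000000 | (b << 16) | (g << 8) | r;
}
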
bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
	Describe("5551Quad");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);

	// Filter out red only into temp1. We do this by shifting into a wall.
	PSLLD(temp1Reg, quadReg, 32 - 5);
	// Move it right to the top of the 8 bits.
	PSRLD(temp1Reg, 24);

	// Add in green and shift into place (top 5 bits of byte 2.)
	PSRLD(temp2Reg, quadReg, 5);
	PSLLW(temp2Reg, 11);
	POR(temp1Reg, R(temp2Reg));

	// First, extend alpha using an arithmetic shift.
	// We use 10 to meanwhile get rid of green too. The extra alpha bits are fine.
	PSRAW(quadReg, 10);
	// This gets rid of those extra alpha bits and puts blue in place too.
	PSLLD(quadReg, 19);

	// Combine both together, we still need to swizzle.
	POR(quadReg, R(temp1Reg));
	PSRLD(temp1Reg, quadReg, 5);

	// Now for swizzle, we'll mask carefully to avoid overflow.
	PAND(temp1Reg, M(const5551Swizzle_));
	// Then finally merge in the swizzle bits.
	POR(quadReg, R(temp1Reg));

	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
	return true;
}

bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {
	Describe("5551");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	if (cpu_info.bBMI2_fast) {
		// First, grab the top bits.
		bool keepAlpha = id.useTextureAlpha || id.fetch;
		MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
		PDEP(32, resultReg, resultReg, R(temp1Reg));

		// Now make the swizzle bits.
		MOV(32, R(temp2Reg), R(resultReg));
		SHR(32, R(temp2Reg), Imm8(5));
		AND(32, R(temp2Reg), Imm32(0x00070707));

		if (keepAlpha) {
			// Sign extend the alpha bit to 8 bits.
			SHL(32, R(resultReg), Imm8(7));
			SAR(32, R(resultReg), Imm8(7));
		}

		OR(32, R(resultReg), R(temp2Reg));
	} else {
		MOV(32, R(temp2Reg), R(resultReg));
		MOV(32, R(temp1Reg), R(resultReg));
		AND(32, R(temp2Reg), Imm32(0x0000001F));
		AND(32, R(temp1Reg), Imm32(0x000003E0));
		SHL(32, R(temp1Reg), Imm8(3));
		OR(32, R(temp2Reg), R(temp1Reg));

		MOV(32, R(temp1Reg), R(resultReg));
		AND(32, R(temp1Reg), Imm32(0x00007C00));
		SHL(32, R(temp1Reg), Imm8(6));
		OR(32, R(temp2Reg), R(temp1Reg));

		// Expand 5 -> 8. After this is just A.
		MOV(32, R(temp1Reg), R(temp2Reg));
		SHL(32, R(temp2Reg), Imm8(3));
		SHR(32, R(temp1Reg), Imm8(2));
		// Chop off the bits that were shifted out.
		AND(32, R(temp1Reg), Imm32(0x00070707));
		OR(32, R(temp2Reg), R(temp1Reg));

		if (id.useTextureAlpha || id.fetch) {
			// For A, we sign extend to get either 16 1s or 0s of alpha.
			SAR(16, R(resultReg), Imm8(15));
			// Now, shift left by 24 to get the lowest 8 of those at the top.
			SHL(32, R(resultReg), Imm8(24));
			OR(32, R(resultReg), R(temp2Reg));
		} else {
			MOV(32, R(resultReg), R(temp2Reg));
		}
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

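// A scalar equivalent of the 5551 decoders above, for reference only; hypothetical
// helper, not part of the JIT. The 5-bit channels expand by bit replication and the
// single alpha bit becomes either 00 or FF.
static inline u32 RefDecode5551(u16 c) {
	u32 r = (c >> 0) & 0x1F;
	u32 g = (c >> 5) & 0x1F;
	u32 b = (c >> 10) & 0x1F;
	r = (r << 3) | (r >> 2);
	g = (g << 3) | (g >> 2);
	b = (b << 3) | (b >> 2);
	u32 a = (c & 0x8000) ? 0xFF : 0x00;
	return (a << 24) | (b << 16) | (g << 8) | r;
}
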
bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
	Describe("4444Quad");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);

	// Mask and move red into position within temp1.
	PSLLD(temp1Reg, quadReg, 28);
	PSRLD(temp1Reg, 24);

	// Green is easy too, we use a word shift to get a free wall.
	PSRLD(temp2Reg, quadReg, 4);
	PSLLW(temp2Reg, 12);
	POR(temp1Reg, R(temp2Reg));

	// Blue isn't last this time, but it's next.
	PSRLD(temp2Reg, quadReg, 8);
	PSLLD(temp2Reg, 28);
	PSRLD(temp2Reg, 8);
	POR(temp1Reg, R(temp2Reg));

	if (id.useTextureAlpha) {
		// Last but not least, alpha.
		PSRLW(quadReg, 12);
		PSLLD(quadReg, 28);
		POR(quadReg, R(temp1Reg));

		// Masking isn't necessary here since everything is 4 wide.
		PSRLD(temp1Reg, quadReg, 4);
		POR(quadReg, R(temp1Reg));
	} else {
		// Overwrite quadReg (we need temp1 as a copy anyway.)
		PSRLD(quadReg, temp1Reg, 4);
		POR(quadReg, R(temp1Reg));
	}

	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
	return true;
}

alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };

bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {
	Describe("4444");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);

	if (cpu_info.bBMI2_fast) {
		X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
		// First, spread the bits out with spaces.
		MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));
		PDEP(32, resultReg, resultReg, R(temp1Reg));

		// Now swizzle the low bits in.
		MOV(32, R(temp1Reg), R(resultReg));
		SHR(32, R(temp1Reg), Imm8(4));
		OR(32, R(resultReg), R(temp1Reg));

		regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	} else {
		X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
		X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
		X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);

		MOVD_xmm(vecTemp1Reg, R(resultReg));
		PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));
		if (RipAccessible(color4444mask)) {
			PAND(vecTemp1Reg, M(color4444mask));
		} else {
			X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
			MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
			PAND(vecTemp1Reg, MatR(temp1Reg));
			regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
		}
		MOVSS(vecTemp2Reg, R(vecTemp1Reg));
		MOVSS(vecTemp3Reg, R(vecTemp1Reg));
		PSRLW(vecTemp2Reg, 4);
		PSLLW(vecTemp3Reg, 4);
		POR(vecTemp1Reg, R(vecTemp2Reg));
		POR(vecTemp1Reg, R(vecTemp3Reg));
		MOVD_xmm(R(resultReg), vecTemp1Reg);

		regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
		regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
		regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
	}
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

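// A scalar equivalent of the 4444 decoders above, for reference only; hypothetical
// helper, not part of the JIT. Expanding 4 bits to 8 by duplicating the nibble is the
// same as multiplying by 0x11, which is what the PDEP-plus-OR and the SSE paths do.
static inline u32 RefDecode4444(u16 c) {
	u32 r = (c >> 0) & 0xF;
	u32 g = (c >> 4) & 0xF;
	u32 b = (c >> 8) & 0xF;
	u32 a = (c >> 12) & 0xF;
	return ((a * 0x11) << 24) | ((b * 0x11) << 16) | ((g * 0x11) << 8) | (r * 0x11);
}
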
bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerIndex) {
	Describe("TrCLUT");
	GEPaletteFormat fmt = id.ClutFmt();
	if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
		// This is simple - just mask if necessary.
		if (bitsPerIndex > 8) {
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			AND(32, R(resultReg), Imm32(0x000000FF));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
		}
		return true;
	}

	if (!cpu_info.bBMI2) {
		bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
		_assert_msg_(hasRCX, "Could not obtain RCX, locked?");
	}

	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg idReg = GetSamplerID();
	MOV(32, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
	UnlockSamplerID(idReg);

	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	int shiftedToSoFar = 0;

	// Shift = (clutformat >> 2) & 0x1F
	if (id.hasClutShift) {
		SHR(32, R(temp1Reg), Imm8(2 - shiftedToSoFar));
		shiftedToSoFar = 2;

		if (cpu_info.bBMI2) {
			SHRX(32, resultReg, R(resultReg), temp1Reg);
		} else {
			_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			MOV(32, R(RCX), R(temp1Reg));
			SHR(32, R(resultReg), R(RCX));
		}
	}

	// Mask = (clutformat >> 8) & 0xFF
	if (id.hasClutMask) {
		SHR(32, R(temp1Reg), Imm8(8 - shiftedToSoFar));
		shiftedToSoFar = 8;

		AND(32, R(resultReg), R(temp1Reg));
	}

	// We need to wrap any entries beyond the first 1024 bytes.
	u32 offsetMask = fmt == GE_CMODE_32BIT_ABGR8888 ? 0x00FF : 0x01FF;

	// We must mask to 0xFF before ORing 0x100 in 16 bit CMODEs.
	// But skip if we'll mask 0xFF after offset anyway.
	if (bitsPerIndex > 8 && (!id.hasClutOffset || offsetMask != 0x00FF)) {
		AND(32, R(resultReg), Imm32(0x000000FF));
	}

	// Offset = (clutformat >> 12) & 0x01F0
	if (id.hasClutOffset) {
		SHR(32, R(temp1Reg), Imm8(16 - shiftedToSoFar));
		SHL(32, R(temp1Reg), Imm8(4));
		OR(32, R(resultReg), R(temp1Reg));
		AND(32, R(resultReg), Imm32(offsetMask));
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

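// The transform above mirrors how the GE turns a raw palette index into a CLUT entry:
// shift, mask, then OR in the offset, with the result wrapped to the CLUT's 1024-byte
// window (256 entries for 32-bit CLUTs, 512 for 16-bit ones). A scalar sketch of the
// same steps, for illustration only (hypothetical helper; the field extraction follows
// the comments in the code above, and the index is assumed to already fit in 8 bits):
static inline u32 RefTransformClutIndex(u32 index, u32 clutformat, bool clut32) {
	u32 shift = (clutformat >> 2) & 0x1F;
	u32 mask = (clutformat >> 8) & 0xFF;
	u32 offset = (clutformat >> 12) & 0x01F0;
	u32 result = ((index >> shift) & mask) | offset;
	// Wrap anything beyond the first 1024 bytes of CLUT.
	return result & (clut32 ? 0x00FF : 0x01FF);
}
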
bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
	Describe("ReadCLUT");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	_assert_msg_(!id.linear, "Should not use this path for linear");

	if (!id.useSharedClut) {
		X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
			// We need to multiply by 16 and add, LEA allows us to copy too.
			LEA(32, temp2Reg, MScaled(levelReg, SCALE_4, 0));
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
			if (id.fetch)
				regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
		} else {
			_assert_(stackLevelOffset_ != -1);
			// The argument was saved on the stack.
			MOV(32, R(temp2Reg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
			LEA(32, temp2Reg, MScaled(temp2Reg, SCALE_4, 0));
		}

		// Second step of the multiply by 16 (since we only multiplied by 4 before.)
		LEA(64, resultReg, MComplex(resultReg, temp2Reg, SCALE_4, 0));
		regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	}

	X64Reg idReg = GetSamplerID();
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	MOV(PTRBITS, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
	UnlockSamplerID(idReg);

	switch (id.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
	case GE_CMODE_16BIT_ABGR5551:
	case GE_CMODE_16BIT_ABGR4444:
		MOVZX(32, 16, resultReg, MComplex(temp1Reg, resultReg, SCALE_2, 0));
		break;

	case GE_CMODE_32BIT_ABGR8888:
		MOV(32, R(resultReg), MComplex(temp1Reg, resultReg, SCALE_4, 0));
		break;
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	switch (id.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
		return Jit_Decode5650(id);

	case GE_CMODE_16BIT_ABGR5551:
		return Jit_Decode5551(id);

	case GE_CMODE_16BIT_ABGR4444:
		return Jit_Decode4444(id);

	case GE_CMODE_32BIT_ABGR8888:
		return true;

	default:
		return false;
	}
}

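// The lookup above boils down to: bias the index when CLUTs aren't shared (each mip
// level then owns its own 16 entries), fetch a 2- or 4-byte entry from the cached CLUT
// pointer, and run 16-bit formats through the decoders above. A scalar sketch of the
// index bias and entry size only, for illustration; hypothetical helper, not used by
// the JIT.
static inline u32 RefClutEntryByteOffset(u32 index, int level, bool sharedClut, bool clut32) {
	if (!sharedClut)
		index += level * 16;  // Matches the two LEA-by-4 steps (level * 16) above.
	return index * (clut32 ? 4 : 2);
}
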
};

#endif