GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/SamplerX86.cpp
// Copyright (c) 2017- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <emmintrin.h>
#include "Common/x64Emitter.h"
#include "Common/BitScan.h"
#include "Common/CPUDetect.h"
#include "GPU/GPUState.h"
#include "GPU/Software/Sampler.h"
#include "GPU/ge_constants.h"

using namespace Gen;
using namespace Rasterizer;

namespace Sampler {

FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
35
_assert_msg_(id.fetch && !id.linear, "Only fetch should be set on sampler id");
36
regCache_.SetupABI({
37
RegCache::GEN_ARG_U,
38
RegCache::GEN_ARG_V,
39
RegCache::GEN_ARG_TEXPTR,
40
RegCache::GEN_ARG_BUFW,
41
RegCache::GEN_ARG_LEVEL,
42
RegCache::GEN_ARG_ID,
43
});
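// Per the ABI order above, the compiled fetch function receives (u, v, texptr, bufw, level, samplerID).
// The fetched texel is assembled in RAX and then widened into XMM0 before the RET below.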
44
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
45
regCache_.ForceRetain(RegCache::GEN_RESULT);
46
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
47
48
BeginWrite(2048);
49
Describe("Init");
50
const u8 *start = AlignCode16();
51
52
#if PPSSPP_PLATFORM(WINDOWS)
53
// RET and shadow space.
54
stackArgPos_ = 8 + 32;
55
stackIDOffset_ = 8;
56
stackLevelOffset_ = 0;
57
#else
58
stackArgPos_ = 0;
59
stackIDOffset_ = -1;
60
stackLevelOffset_ = -1;
61
#endif
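// On Win x64 the first four integer args arrive in RCX/RDX/R8/R9, so level and the SamplerID pointer
// spill to the stack just past the return address (8 bytes) and the 32-byte shadow space.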
62
63
// Early exit on !srcPtr.
64
FixupBranch zeroSrc;
65
if (id.hasInvalidPtr) {
66
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
67
CMP(PTRBITS, R(srcReg), Imm8(0));
68
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
69
70
FixupBranch nonZeroSrc = J_CC(CC_NZ);
71
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
72
PXOR(vecResultReg, R(vecResultReg));
73
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
74
zeroSrc = J(true);
75
SetJumpTarget(nonZeroSrc);
76
}
77
78
// This reads the pixel data into resultReg from the args.
79
if (!Jit_ReadTextureFormat(id)) {
80
regCache_.Reset(false);
81
EndWrite();
82
ResetCodePtr(GetOffset(start));
83
ERROR_LOG(Log::G3D, "Failed to compile fetch %s", DescribeSamplerID(id).c_str());
84
return nullptr;
85
}
86
87
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
88
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
89
if (regCache_.Has(RegCache::GEN_ARG_ID))
90
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
91
92
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
93
94
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
95
MOVD_xmm(vecResultReg, R(resultReg));
96
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
97
regCache_.ForceRelease(RegCache::GEN_RESULT);
98
99
if (cpu_info.bSSE4_1) {
100
PMOVZXBD(vecResultReg, R(vecResultReg));
101
} else {
102
X64Reg vecTempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
103
PXOR(vecTempReg, R(vecTempReg));
104
PUNPCKLBW(vecResultReg, R(vecTempReg));
105
PUNPCKLWD(vecResultReg, R(vecTempReg));
106
regCache_.Release(vecTempReg, RegCache::VEC_TEMP0);
107
}
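// Either way, the four 8-bit channels now occupy one 32-bit lane each in vecResultReg.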
108
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
109
110
Describe("Init");
111
if (id.hasInvalidPtr) {
112
SetJumpTarget(zeroSrc);
113
}
114
115
RET();
116
117
regCache_.Reset(true);
118
119
EndWrite();
120
return (FetchFunc)start;
121
}
122
123
NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
124
_assert_msg_(!id.fetch && !id.linear, "Fetch and linear should be cleared on sampler id");
125
BeginWrite(2048);
126
Describe("Init");
127
128
// Let's drop some helpful constants here.
129
WriteConstantPool(id);
130
131
const u8 *start = AlignCode16();
132
133
regCache_.SetupABI({
134
RegCache::VEC_ARG_S,
135
RegCache::VEC_ARG_T,
136
RegCache::VEC_ARG_COLOR,
137
RegCache::GEN_ARG_TEXPTR_PTR,
138
RegCache::GEN_ARG_BUFW_PTR,
139
RegCache::GEN_ARG_LEVEL,
140
RegCache::GEN_ARG_LEVELFRAC,
141
RegCache::GEN_ARG_ID,
142
});
143
144
#if PPSSPP_PLATFORM(WINDOWS)
145
// RET + shadow space.
146
stackArgPos_ = 8 + 32;
147
148
// Positions: stackArgPos_+0=bufwptr, stackArgPos_+8=level, stackArgPos_+16=levelFrac
149
stackIDOffset_ = 24;
150
stackLevelOffset_ = 8;
151
#else
152
stackArgPos_ = 0;
153
// No args on the stack.
154
stackIDOffset_ = -1;
155
stackLevelOffset_ = -1;
156
#endif
157
158
// Start out by saving some registers, since we'll need more.
159
PUSH(R15);
160
PUSH(R14);
161
PUSH(R13);
162
PUSH(R12);
163
regCache_.Add(R15, RegCache::GEN_INVALID);
164
regCache_.Add(R14, RegCache::GEN_INVALID);
165
regCache_.Add(R13, RegCache::GEN_INVALID);
166
regCache_.Add(R12, RegCache::GEN_INVALID);
167
stackArgPos_ += 32;
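// The four pushes above moved RSP down by 32 bytes, so the stack-relative argument offsets shift with it.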
168
169
#if PPSSPP_PLATFORM(WINDOWS)
170
// Use the shadow space to save U1/V1.
171
stackUV1Offset_ = -8;
172
#else
173
// Use the red zone, but account for the R15-R12 we push just below.
174
stackUV1Offset_ = -stackArgPos_ - 8;
175
#endif
176
177
// We can throw these away right off if there are no mips.
178
if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVEL) && id.useSharedClut)
179
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
180
if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVELFRAC))
181
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
182
183
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
184
// On Linux, RCX is currently levelFrac, but we'll need it for other things.
185
if (!cpu_info.bBMI2) {
186
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
187
MOV(64, R(R15), R(levelFracReg));
188
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
189
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
190
regCache_.ChangeReg(R15, RegCache::GEN_ARG_LEVELFRAC);
191
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
192
}
193
} else if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
194
// Let's load bufwptr into regs. RDX is free.
195
MOV(64, R(RDX), MDisp(RSP, stackArgPos_ + 0));
196
regCache_.ChangeReg(RDX, RegCache::GEN_ARG_BUFW_PTR);
197
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
198
}
199
// Okay, now lock RCX as a shifting reg.
200
if (!cpu_info.bBMI2) {
201
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
202
regCache_.ForceRetain(RegCache::GEN_SHIFTVAL);
203
}
204
205
bool success = true;
206
207
// Convert S/T + X/Y to U/V (and U1/V1 if appropriate.)
208
success = success && Jit_GetTexelCoords(id);
209
210
// At this point, XMM0 should be free. Swap it to the result.
211
success = success && regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
212
// Let's also pick a reg for GEN_RESULT - doesn't matter which.
213
X64Reg resultReg = regCache_.Alloc(RegCache::GEN_RESULT);
214
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
215
regCache_.ForceRetain(RegCache::GEN_RESULT);
216
217
// Early exit on !srcPtr (either one.)
218
FixupBranch zeroSrc;
219
if (id.hasInvalidPtr) {
220
Describe("NullCheck");
221
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
222
223
if (id.hasAnyMips) {
224
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
225
MOV(64, R(tempReg), MDisp(srcReg, 0));
226
AND(64, R(tempReg), MDisp(srcReg, 8));
227
228
CMP(PTRBITS, R(tempReg), Imm8(0));
229
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
230
} else {
231
CMP(PTRBITS, MatR(srcReg), Imm8(0));
232
}
233
FixupBranch nonZeroSrc = J_CC(CC_NZ);
234
PXOR(XMM0, R(XMM0));
235
zeroSrc = J(true);
236
SetJumpTarget(nonZeroSrc);
237
238
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
239
}
240
241
auto loadPtrs = [&](bool level1) {
242
X64Reg bufwReg = regCache_.Alloc(RegCache::GEN_ARG_BUFW);
243
X64Reg bufwPtrReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
244
MOVZX(32, 16, bufwReg, MDisp(bufwPtrReg, level1 ? 2 : 0));
245
regCache_.Unlock(bufwPtrReg, RegCache::GEN_ARG_BUFW_PTR);
246
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
247
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW);
248
249
X64Reg srcReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
250
X64Reg srcPtrReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
251
MOV(64, R(srcReg), MDisp(srcPtrReg, level1 ? 8 : 0));
252
regCache_.Unlock(srcPtrReg, RegCache::GEN_ARG_TEXPTR_PTR);
253
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
254
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR);
255
};
256
257
loadPtrs(false);
258
success = success && Jit_ReadTextureFormat(id);
259
260
// Convert that to 16-bit from 8-bit channels.
261
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
262
resultReg = regCache_.Find(RegCache::GEN_RESULT);
263
MOVD_xmm(vecResultReg, R(resultReg));
264
if (cpu_info.bSSE4_1) {
265
PMOVZXBW(vecResultReg, R(vecResultReg));
266
} else {
267
X64Reg zeroReg = GetZeroVec();
268
PUNPCKLBW(vecResultReg, R(zeroReg));
269
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
270
}
271
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
272
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
273
274
if (id.hasAnyMips) {
275
X64Reg vecResultReg = regCache_.Alloc(RegCache::VEC_RESULT1);
276
277
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
278
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
279
CMP(8, R(levelFracReg), Imm8(0));
280
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
281
} else {
282
CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
283
}
284
FixupBranch skip = J_CC(CC_Z, true);
285
286
// Modify the level, so the new level value is used. We don't need the old.
287
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
288
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
289
ADD(32, R(levelReg), Imm8(1));
290
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
291
} else {
292
// It's fine to just modify this in place.
293
ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
294
}
295
296
// This is inside the conditional, but it's okay because we throw it away after.
297
loadPtrs(true);
298
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
299
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
300
301
X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
302
MOV(32, R(uReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0));
303
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
304
regCache_.ForceRetain(RegCache::GEN_ARG_U);
305
306
X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
307
MOV(32, R(vReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4));
308
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
309
regCache_.ForceRetain(RegCache::GEN_ARG_V);
310
311
bool hadId = regCache_.Has(RegCache::GEN_ID);
312
bool hadZero = regCache_.Has(RegCache::VEC_ZERO);
313
success = success && Jit_ReadTextureFormat(id);
314
315
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
316
MOVD_xmm(vecResultReg, R(resultReg));
317
if (cpu_info.bSSE4_1) {
318
PMOVZXBW(vecResultReg, R(vecResultReg));
319
} else {
320
X64Reg zeroReg = GetZeroVec();
321
PUNPCKLBW(vecResultReg, R(zeroReg));
322
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
323
}
324
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
325
326
// Since we're inside a conditional, make sure these go away if we allocated them.
327
if (!hadId && regCache_.Has(RegCache::GEN_ID))
328
regCache_.ForceRelease(RegCache::GEN_ID);
329
if (!hadZero && regCache_.Has(RegCache::VEC_ZERO))
330
regCache_.ForceRelease(RegCache::VEC_ZERO);
331
332
SetJumpTarget(skip);
333
334
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT1);
335
} else {
336
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
337
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
338
}
339
340
// We're done with these now.
341
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR))
342
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
343
if (regCache_.Has(RegCache::GEN_ARG_BUFW_PTR))
344
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
345
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
346
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
347
if (regCache_.Has(RegCache::GEN_SHIFTVAL))
348
regCache_.ForceRelease(RegCache::GEN_SHIFTVAL);
349
regCache_.ForceRelease(RegCache::GEN_RESULT);
350
351
if (id.hasAnyMips) {
352
Describe("BlendMips");
353
if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
354
X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
355
MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
356
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
357
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
358
}
359
360
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
361
CMP(8, R(levelFracReg), Imm8(0));
362
FixupBranch skip = J_CC(CC_Z, true);
363
364
// TODO: PMADDWD? Refactor shared?
365
// First, broadcast the levelFrac value into an XMM.
366
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
367
MOVD_xmm(fracReg, R(levelFracReg));
368
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
369
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
370
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
371
372
// Multiply level1 color by the fraction.
373
X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
374
PMULLW(color1Reg, R(fracReg));
375
376
// Okay, next we need an inverse for color 0.
377
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
378
MOVDQA(invFracReg, M(const10All16_));
379
PSUBW(invFracReg, R(fracReg));
380
381
// And multiply.
382
PMULLW(XMM0, R(invFracReg));
383
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
384
regCache_.Release(invFracReg, RegCache::VEC_TEMP1);
385
386
// Okay, now sum and divide by 16 (which is what the fraction maxed at.)
387
PADDW(XMM0, R(color1Reg));
388
PSRLW(XMM0, 4);
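// Net effect per 16-bit channel: result = (level0 * (0x10 - frac) + level1 * frac) >> 4.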
389
390
// And now we're done with color1Reg/VEC_RESULT1.
391
regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
392
regCache_.ForceRelease(RegCache::VEC_RESULT1);
393
394
SetJumpTarget(skip);
395
}
396
397
// Finally, it's time to apply the texture function.
398
success = success && Jit_ApplyTextureFunc(id);
399
400
// Last of all, convert to 32-bit channels.
401
Describe("Init");
402
if (cpu_info.bSSE4_1) {
403
PMOVZXWD(XMM0, R(XMM0));
404
} else {
405
X64Reg zeroReg = GetZeroVec();
406
PUNPCKLWD(XMM0, R(zeroReg));
407
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
408
}
409
410
regCache_.ForceRelease(RegCache::VEC_RESULT);
411
if (regCache_.Has(RegCache::GEN_ARG_ID))
412
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
413
414
if (!success) {
415
regCache_.Reset(false);
416
EndWrite();
417
ResetCodePtr(GetOffset(start));
418
ERROR_LOG(Log::G3D, "Failed to compile nearest %s", DescribeSamplerID(id).c_str());
419
return nullptr;
420
}
421
422
if (id.hasInvalidPtr) {
423
SetJumpTarget(zeroSrc);
424
}
425
426
POP(R12);
427
POP(R13);
428
POP(R14);
429
POP(R15);
430
431
RET();
432
433
regCache_.Reset(true);
434
435
EndWrite();
436
return (NearestFunc)start;
437
}
438
439
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
440
_assert_msg_(id.linear && !id.fetch, "Only linear should be set on sampler id");
441
BeginWrite(2048);
442
Describe("Init");
443
444
// We don't use stackArgPos_ here, this is just for DXT.
445
stackArgPos_ = -1;
446
447
// Let's drop some helpful constants here.
448
WriteConstantPool(id);
449
450
const u8 *nearest = nullptr;
451
if (id.TexFmt() >= GE_TFMT_DXT1) {
452
regCache_.SetupABI({
453
RegCache::GEN_ARG_U,
454
RegCache::GEN_ARG_V,
455
RegCache::GEN_ARG_TEXPTR,
456
RegCache::GEN_ARG_BUFW,
457
RegCache::GEN_ARG_LEVEL,
458
// Avoid clobber.
459
RegCache::GEN_ARG_LEVELFRAC,
460
});
461
auto lockReg = [&](X64Reg r, RegCache::Purpose p) {
462
regCache_.ChangeReg(r, p);
463
regCache_.ForceRetain(p);
464
};
465
lockReg(RAX, RegCache::GEN_RESULT);
466
lockReg(XMM0, RegCache::VEC_ARG_U);
467
lockReg(XMM1, RegCache::VEC_ARG_V);
468
lockReg(XMM5, RegCache::VEC_RESULT);
469
#if !PPSSPP_PLATFORM(WINDOWS)
470
if (id.hasAnyMips) {
471
lockReg(XMM6, RegCache::VEC_U1);
472
lockReg(XMM7, RegCache::VEC_V1);
473
lockReg(XMM8, RegCache::VEC_RESULT1);
474
lockReg(XMM12, RegCache::VEC_INDEX1);
475
}
476
lockReg(XMM9, RegCache::VEC_ARG_COLOR);
477
lockReg(XMM10, RegCache::VEC_FRAC);
478
lockReg(XMM11, RegCache::VEC_INDEX);
479
#endif
480
481
// We'll first write the nearest sampler, which we will CALL.
482
// This may differ slightly based on the "linear" flag.
483
nearest = AlignCode16();
484
485
if (!Jit_ReadTextureFormat(id)) {
486
regCache_.Reset(false);
487
EndWrite();
488
ResetCodePtr(GetOffset(nearest));
489
ERROR_LOG(Log::G3D, "Failed to compile linear nearest %s", DescribeSamplerID(id).c_str());
490
return nullptr;
491
}
492
493
Describe("Init");
494
RET();
495
496
regCache_.ForceRelease(RegCache::GEN_RESULT);
497
regCache_.ForceRelease(RegCache::VEC_ARG_U);
498
regCache_.ForceRelease(RegCache::VEC_ARG_V);
499
regCache_.ForceRelease(RegCache::VEC_RESULT);
500
501
auto unlockOptReg = [&](RegCache::Purpose p) {
502
if (regCache_.Has(p))
503
regCache_.ForceRelease(p);
504
};
505
unlockOptReg(RegCache::GEN_ARG_LEVEL);
506
unlockOptReg(RegCache::GEN_ARG_LEVELFRAC);
507
unlockOptReg(RegCache::VEC_U1);
508
unlockOptReg(RegCache::VEC_V1);
509
unlockOptReg(RegCache::VEC_RESULT1);
510
unlockOptReg(RegCache::VEC_ARG_COLOR);
511
unlockOptReg(RegCache::VEC_FRAC);
512
unlockOptReg(RegCache::VEC_INDEX);
513
unlockOptReg(RegCache::VEC_INDEX1);
514
regCache_.Reset(true);
515
}
516
EndWrite();
517
518
// Now the actual linear func, which is exposed externally.
519
const u8 *linearResetPos = GetCodePointer();
520
Describe("Init");
521
522
regCache_.SetupABI({
523
RegCache::VEC_ARG_S,
524
RegCache::VEC_ARG_T,
525
RegCache::VEC_ARG_COLOR,
526
RegCache::GEN_ARG_TEXPTR_PTR,
527
RegCache::GEN_ARG_BUFW_PTR,
528
RegCache::GEN_ARG_LEVEL,
529
RegCache::GEN_ARG_LEVELFRAC,
530
RegCache::GEN_ARG_ID,
531
});
532
533
#if PPSSPP_PLATFORM(WINDOWS)
534
// RET + shadow space.
535
stackArgPos_ = 8 + 32;
536
// Free up some more vector regs on Windows too, where we're a bit tight.
537
stackArgPos_ += WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12 }, { R15, R14, R13, R12 });
538
539
// Positions: stackArgPos_+0=bufwptr, stackArgPos_+8=level, stackArgPos_+16=levelFrac
540
stackIDOffset_ = 24;
541
stackLevelOffset_ = 8;
542
543
// If needed, we could store UV1 data in shadow space, but we no longer do.
544
stackUV1Offset_ = -8;
545
#else
546
stackArgPos_ = 0;
547
stackArgPos_ += WriteProlog(0, {}, { R15, R14, R13, R12 });
548
stackIDOffset_ = -1;
549
stackLevelOffset_ = -1;
550
551
// Use the red zone.
552
stackUV1Offset_ = -stackArgPos_ - 8;
553
#endif
554
555
// This is what we'll put in them, anyway...
556
if (nearest != nullptr) {
557
regCache_.ChangeReg(XMM10, RegCache::VEC_FRAC);
558
regCache_.ForceRetain(RegCache::VEC_FRAC);
559
regCache_.ChangeReg(XMM11, RegCache::VEC_INDEX);
560
regCache_.ForceRetain(RegCache::VEC_INDEX);
561
if (id.hasAnyMips) {
562
regCache_.ChangeReg(XMM12, RegCache::VEC_INDEX1);
563
regCache_.ForceRetain(RegCache::VEC_INDEX1);
564
}
565
}
566
567
// Reserve a couple regs that the nearest CALL won't use.
568
if (id.hasAnyMips) {
569
regCache_.ChangeReg(XMM6, RegCache::VEC_U1);
570
regCache_.ChangeReg(XMM7, RegCache::VEC_V1);
571
regCache_.ForceRetain(RegCache::VEC_U1);
572
regCache_.ForceRetain(RegCache::VEC_V1);
573
} else if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
574
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
575
}
576
577
// Save prim color for later in a different XMM too if we're using the nearest helper.
578
if (nearest != nullptr) {
579
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
580
MOVDQA(XMM9, R(primColorReg));
581
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
582
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
583
regCache_.ChangeReg(XMM9, RegCache::VEC_ARG_COLOR);
584
regCache_.ForceRetain(RegCache::VEC_ARG_COLOR);
585
}
586
587
// We also want to save src and bufw for later. Might be in a reg already.
588
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR) && regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
589
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
590
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
591
MOV(64, R(R14), R(srcReg));
592
MOV(64, R(R15), R(bufwReg));
593
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
594
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
595
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
596
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
597
} else if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR)) {
598
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
599
MOV(64, R(R14), R(srcReg));
600
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
601
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
602
MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 0));
603
} else {
604
MOV(64, R(R14), MDisp(RSP, stackArgPos_ + 0));
605
MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 8));
606
}
607
608
// Okay, and now remember we moved to R14/R15.
609
regCache_.ChangeReg(R14, RegCache::GEN_ARG_TEXPTR_PTR);
610
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR_PTR);
611
if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
612
regCache_.ChangeReg(R15, RegCache::GEN_ARG_BUFW_PTR);
613
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
614
}
615
616
bool success = true;
617
618
// Our first goal is to convert S/T and X/Y into U/V and frac_u/frac_v.
619
success = success && Jit_GetTexelCoordsQuad(id);
620
621
// Early exit on !srcPtr (either one.)
622
FixupBranch zeroSrc;
623
if (id.hasInvalidPtr) {
624
Describe("NullCheck");
625
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
626
627
if (id.hasAnyMips) {
628
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
629
MOV(64, R(tempReg), MDisp(srcReg, 0));
630
AND(64, R(tempReg), MDisp(srcReg, 8));
631
632
CMP(PTRBITS, R(tempReg), Imm8(0));
633
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
634
} else {
635
CMP(PTRBITS, MatR(srcReg), Imm8(0));
636
}
637
FixupBranch nonZeroSrc = J_CC(CC_NZ);
638
PXOR(XMM0, R(XMM0));
639
zeroSrc = J(true);
640
SetJumpTarget(nonZeroSrc);
641
642
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
643
}
644
645
auto prepareDataOffsets = [&](RegCache::Purpose uPurpose, RegCache::Purpose vPurpose, bool level1) {
646
X64Reg uReg = regCache_.Find(uPurpose);
647
X64Reg vReg = regCache_.Find(vPurpose);
648
success = success && Jit_PrepareDataOffsets(id, uReg, vReg, level1);
649
regCache_.Unlock(uReg, uPurpose);
650
regCache_.Unlock(vReg, vPurpose);
651
};
652
653
Describe("DataOffsets");
654
prepareDataOffsets(RegCache::VEC_ARG_U, RegCache::VEC_ARG_V, false);
655
if (id.hasAnyMips)
656
prepareDataOffsets(RegCache::VEC_U1, RegCache::VEC_V1, true);
657
658
// The data offset goes into V, except in the CLUT4 case and DXT (nearest func) cases.
659
if (nearest == nullptr && id.TexFmt() != GE_TFMT_CLUT4)
660
regCache_.ForceRelease(RegCache::VEC_ARG_U);
661
662
// Hard allocate results if we're using the func method.
663
if (nearest != nullptr) {
664
regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT);
665
regCache_.ForceRetain(RegCache::VEC_RESULT);
666
if (id.hasAnyMips) {
667
regCache_.ChangeReg(XMM8, RegCache::VEC_RESULT1);
668
regCache_.ForceRetain(RegCache::VEC_RESULT1);
669
}
670
}
671
672
// This stores the result in an XMM for later processing.
673
// We map lookups to nearest CALLs, with arg order: u, v, src, bufw, level
674
auto doNearestCall = [&](int off, bool level1) {
675
#if PPSSPP_PLATFORM(WINDOWS)
676
static const X64Reg uArgReg = RCX;
677
static const X64Reg vArgReg = RDX;
678
static const X64Reg srcArgReg = R8;
679
static const X64Reg bufwArgReg = R9;
680
#else
681
static const X64Reg uArgReg = RDI;
682
static const X64Reg vArgReg = RSI;
683
static const X64Reg srcArgReg = RDX;
684
static const X64Reg bufwArgReg = RCX;
685
#endif
686
static const X64Reg resultReg = RAX;
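// These are just the first four integer argument registers of the Win64 and System V x86-64 calling
// conventions; the nearest helper returns its texel in RAX.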
687
688
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
689
X64Reg vReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
690
// Otherwise, we'll overwrite them...
691
_assert_(level1 || (uReg == XMM0 && vReg == XMM1));
692
693
if (cpu_info.bSSE4_1) {
694
PEXTRD(R(uArgReg), uReg, off / 4);
695
PEXTRD(R(vArgReg), vReg, off / 4);
696
} else {
697
MOVD_xmm(R(uArgReg), uReg);
698
MOVD_xmm(R(vArgReg), vReg);
699
PSRLDQ(uReg, 4);
700
PSRLDQ(vReg, 4);
701
}
702
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
703
regCache_.Unlock(vReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
704
705
X64Reg indexReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
706
if (cpu_info.bSSE4_1) {
707
PEXTRD(R(srcArgReg), indexReg, off / 4);
708
} else {
709
MOVD_xmm(R(srcArgReg), indexReg);
710
PSRLDQ(indexReg, 4);
711
}
712
regCache_.Unlock(indexReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
713
714
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
715
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
716
ADD(64, R(srcArgReg), MDisp(srcReg, level1 ? 8 : 0));
717
MOVZX(32, 16, bufwArgReg, MDisp(bufwReg, level1 ? 2 : 0));
718
// Leave level/levelFrac, we just always load from RAM on Windows and lock on POSIX.
719
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
720
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
721
722
CALL(nearest);
723
724
X64Reg vecResultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
725
if (cpu_info.bSSE4_1) {
726
PINSRD(vecResultReg, R(resultReg), off / 4);
727
} else if (off == 0) {
728
MOVD_xmm(vecResultReg, R(resultReg));
729
} else {
730
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
731
MOVD_xmm(tempReg, R(resultReg));
732
PSLLDQ(tempReg, off);
733
POR(vecResultReg, R(tempReg));
734
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
735
}
736
regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
737
};
738
739
if (nearest != nullptr) {
740
Describe("Calls");
741
doNearestCall(0, false);
742
doNearestCall(4, false);
743
doNearestCall(8, false);
744
doNearestCall(12, false);
745
746
// After doing the calls, certain cached things aren't safe.
747
if (regCache_.Has(RegCache::GEN_ID))
748
regCache_.ForceRelease(RegCache::GEN_ID);
749
if (regCache_.Has(RegCache::VEC_ZERO))
750
regCache_.ForceRelease(RegCache::VEC_ZERO);
751
} else {
752
success = success && Jit_FetchQuad(id, false);
753
}
754
755
if (id.hasAnyMips) {
756
Describe("MipsCalls");
757
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
758
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
759
CMP(8, R(levelFracReg), Imm8(0));
760
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
761
} else {
762
CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
763
}
764
FixupBranch skip = J_CC(CC_Z, true);
765
766
// Modify the level, so the new level value is used. We don't need the old.
767
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
768
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
769
ADD(32, R(levelReg), Imm8(1));
770
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
771
} else {
772
// It's fine to just modify this in place.
773
ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
774
}
775
776
if (nearest != nullptr) {
777
Describe("MipsCalls");
778
doNearestCall(0, true);
779
doNearestCall(4, true);
780
doNearestCall(8, true);
781
doNearestCall(12, true);
782
} else {
783
success = success && Jit_FetchQuad(id, true);
784
}
785
786
SetJumpTarget(skip);
787
}
788
789
// We're done with these now.
790
if (nearest != nullptr) {
791
regCache_.ForceRelease(RegCache::VEC_ARG_U);
792
regCache_.ForceRelease(RegCache::VEC_ARG_V);
793
regCache_.ForceRelease(RegCache::VEC_INDEX);
794
}
795
if (regCache_.Has(RegCache::VEC_INDEX1))
796
regCache_.ForceRelease(RegCache::VEC_INDEX1);
797
if (regCache_.Has(RegCache::VEC_U1))
798
regCache_.ForceRelease(RegCache::VEC_U1);
799
if (regCache_.Has(RegCache::VEC_V1))
800
regCache_.ForceRelease(RegCache::VEC_V1);
801
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
802
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
803
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
804
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
805
806
success = success && Jit_DecodeQuad(id, false);
807
success = success && Jit_BlendQuad(id, false);
808
if (id.hasAnyMips) {
809
Describe("BlendMips");
810
if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
811
X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
812
MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
813
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
814
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
815
}
816
817
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
818
CMP(8, R(levelFracReg), Imm8(0));
819
FixupBranch skip = J_CC(CC_Z, true);
820
821
success = success && Jit_DecodeQuad(id, true);
822
success = success && Jit_BlendQuad(id, true);
823
824
Describe("BlendMips");
825
// First, broadcast the levelFrac value into an XMM.
826
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
827
MOVD_xmm(fracReg, R(levelFracReg));
828
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
829
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
830
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
831
832
// Multiply level1 color by the fraction.
833
X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
834
PMULLW(color1Reg, R(fracReg));
835
836
// Okay, next we need an inverse for color 0.
837
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
838
MOVDQA(invFracReg, M(const10All16_));
839
PSUBW(invFracReg, R(fracReg));
840
841
// And multiply.
842
PMULLW(XMM0, R(invFracReg));
843
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
844
regCache_.Release(invFracReg, RegCache::VEC_TEMP1);
845
846
// Okay, now sum and divide by 16 (which is what the fraction maxed at.)
847
PADDW(XMM0, R(color1Reg));
848
PSRLW(XMM0, 4);
849
850
// And now we're done with color1Reg/VEC_RESULT1.
851
regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
852
regCache_.ForceRelease(RegCache::VEC_RESULT1);
853
854
SetJumpTarget(skip);
855
}
856
857
if (regCache_.Has(RegCache::VEC_FRAC))
858
regCache_.ForceRelease(RegCache::VEC_FRAC);
859
860
// Finally, it's time to apply the texture function.
861
success = success && Jit_ApplyTextureFunc(id);
862
863
// Last of all, convert to 32-bit channels.
864
Describe("Init");
865
if (cpu_info.bSSE4_1) {
866
PMOVZXWD(XMM0, R(XMM0));
867
} else {
868
X64Reg zeroReg = GetZeroVec();
869
PUNPCKLWD(XMM0, R(zeroReg));
870
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
871
}
872
873
regCache_.ForceRelease(RegCache::VEC_RESULT);
874
if (regCache_.Has(RegCache::GEN_ARG_ID))
875
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
876
877
if (!success) {
878
regCache_.Reset(false);
879
EndWrite();
880
ResetCodePtr(GetOffset(nearest ? nearest : linearResetPos));
881
ERROR_LOG(Log::G3D, "Failed to compile linear %s", DescribeSamplerID(id).c_str());
882
return nullptr;
883
}
884
885
if (id.hasInvalidPtr) {
886
SetJumpTarget(zeroSrc);
887
}
888
889
const u8 *start = WriteFinalizedEpilog();
890
regCache_.Reset(true);
891
return (LinearFunc)start;
892
}
893
894
void SamplerJitCache::WriteConstantPool(const SamplerID &id) {
895
// We reuse constants in any pool, because our code space is small.
896
WriteSimpleConst8x16(const10All16_, 0x10);
897
WriteSimpleConst16x8(const10All8_, 0x10);
898
899
if (const10Low_ == nullptr) {
900
const10Low_ = AlignCode16();
901
for (int i = 0; i < 4; ++i)
902
Write16(0x10);
903
for (int i = 0; i < 4; ++i)
904
Write16(0);
905
}
906
907
WriteSimpleConst4x32(constOnes32_, 1);
908
WriteSimpleConst8x16(constOnes16_, 1);
909
// This is the mask for clamp or wrap, the max texel in the S or T direction.
910
WriteSimpleConst4x32(constMaxTexel32_, 511);
911
912
if (constUNext_ == nullptr) {
913
constUNext_ = AlignCode16();
914
Write32(0); Write32(1); Write32(0); Write32(1);
915
}
916
917
if (constVNext_ == nullptr) {
918
constVNext_ = AlignCode16();
919
Write32(0); Write32(0); Write32(1); Write32(1);
920
}
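// constUNext_ is {0,1,0,1} and constVNext_ is {0,0,1,1}: presumably the per-lane offsets added to the base
// texel coordinate so the four lanes cover the 2x2 bilinear quad (right neighbor in U, bottom neighbor in V).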
921
922
WriteSimpleConst4x32(const5551Swizzle_, 0x00070707);
923
WriteSimpleConst4x32(const5650Swizzle_, 0x00070307);
924
925
// These are unique to the sampler ID.
926
if (!id.hasAnyMips) {
927
float w256f = (1 << id.width0Shift) * 256;
928
float h256f = (1 << id.height0Shift) * 256;
929
constWidthHeight256f_ = AlignCode16();
930
Write32(*(uint32_t *)&w256f);
931
Write32(*(uint32_t *)&h256f);
932
Write32(*(uint32_t *)&w256f);
933
Write32(*(uint32_t *)&h256f);
934
935
WriteDynamicConst4x32(constWidthMinus1i_, id.width0Shift > 9 ? 511 : (1 << id.width0Shift) - 1);
936
WriteDynamicConst4x32(constHeightMinus1i_, id.height0Shift > 9 ? 511 : (1 << id.height0Shift) - 1);
937
} else {
938
constWidthHeight256f_ = nullptr;
939
constWidthMinus1i_ = nullptr;
940
constHeightMinus1i_ = nullptr;
941
}
942
}
943
944
RegCache::Reg SamplerJitCache::GetSamplerID() {
945
if (regCache_.Has(RegCache::GEN_ARG_ID))
946
return regCache_.Find(RegCache::GEN_ARG_ID);
947
if (!regCache_.Has(RegCache::GEN_ID)) {
948
X64Reg r = regCache_.Alloc(RegCache::GEN_ID);
949
_assert_(stackIDOffset_ != -1);
950
MOV(PTRBITS, R(r), MDisp(RSP, stackArgPos_ + stackIDOffset_));
951
return r;
952
}
953
return regCache_.Find(RegCache::GEN_ID);
954
}
955
956
void SamplerJitCache::UnlockSamplerID(RegCache::Reg &r) {
957
if (regCache_.Has(RegCache::GEN_ARG_ID))
958
regCache_.Unlock(r, RegCache::GEN_ARG_ID);
959
else
960
regCache_.Unlock(r, RegCache::GEN_ID);
961
}
962
963
bool SamplerJitCache::Jit_FetchQuad(const SamplerID &id, bool level1) {
964
bool success = true;
965
switch (id.TexFmt()) {
966
case GE_TFMT_5650:
967
case GE_TFMT_5551:
968
case GE_TFMT_4444:
969
success = Jit_GetDataQuad(id, level1, 16);
970
// Mask away the high bits, if loaded via AVX2.
971
if (cpu_info.bAVX2) {
972
X64Reg destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
973
PSLLD(destReg, 16);
974
PSRLD(destReg, 16);
975
regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
976
}
977
break;
978
979
case GE_TFMT_8888:
980
success = Jit_GetDataQuad(id, level1, 32);
981
break;
982
983
case GE_TFMT_CLUT32:
984
success = Jit_GetDataQuad(id, level1, 32);
985
if (success)
986
success = Jit_TransformClutIndexQuad(id, 32);
987
if (success)
988
success = Jit_ReadClutQuad(id, level1);
989
break;
990
991
case GE_TFMT_CLUT16:
992
success = Jit_GetDataQuad(id, level1, 16);
993
if (success)
994
success = Jit_TransformClutIndexQuad(id, 16);
995
if (success)
996
success = Jit_ReadClutQuad(id, level1);
997
break;
998
999
case GE_TFMT_CLUT8:
1000
success = Jit_GetDataQuad(id, level1, 8);
1001
if (success)
1002
success = Jit_TransformClutIndexQuad(id, 8);
1003
if (success)
1004
success = Jit_ReadClutQuad(id, level1);
1005
break;
1006
1007
case GE_TFMT_CLUT4:
1008
success = Jit_GetDataQuad(id, level1, 4);
1009
if (success)
1010
success = Jit_TransformClutIndexQuad(id, 4);
1011
if (success)
1012
success = Jit_ReadClutQuad(id, level1);
1013
break;
1014
1015
case GE_TFMT_DXT1:
1016
case GE_TFMT_DXT3:
1017
case GE_TFMT_DXT5:
1018
// No SIMD version currently, should use nearest helper path.
1019
success = false;
1020
break;
1021
1022
default:
1023
success = false;
1024
}
1025
1026
return success;
1027
}
1028
1029
bool SamplerJitCache::Jit_GetDataQuad(const SamplerID &id, bool level1, int bitsPerTexel) {
1030
Describe("DataQuad");
1031
bool success = true;
1032
1033
X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
1034
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
1035
MOV(64, R(baseReg), MDisp(srcReg, level1 ? 8 : 0));
1036
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
1037
1038
X64Reg destReg = INVALID_REG;
1039
if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32)
1040
destReg = regCache_.Alloc(RegCache::VEC_INDEX);
1041
else if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
1042
destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1043
else
1044
destReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1045
1046
X64Reg byteOffsetReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
1047
if (cpu_info.bAVX2 && id.overReadSafe) {
1048
// We have to set a mask for which values to load. Load all 4.
1049
// Note this is overwritten with zeroes by the gather instruction.
1050
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1051
PCMPEQD(maskReg, R(maskReg));
1052
VPGATHERDD(128, destReg, MComplex(baseReg, byteOffsetReg, SCALE_1, 0), maskReg);
1053
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
1054
} else {
1055
if (bitsPerTexel != 32)
1056
PXOR(destReg, R(destReg));
1057
1058
// Grab each value separately... try to use the right memory access size.
1059
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1060
if (cpu_info.bSSE4_1) {
1061
for (int i = 0; i < 4; ++i) {
1062
PEXTRD(R(temp2Reg), byteOffsetReg, i);
1063
if (bitsPerTexel <= 8)
1064
PINSRB(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 4);
1065
else if (bitsPerTexel == 16)
1066
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
1067
else if (bitsPerTexel == 32)
1068
PINSRD(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i);
1069
}
1070
} else {
1071
for (int i = 0; i < 4; ++i) {
1072
MOVD_xmm(R(temp2Reg), byteOffsetReg);
1073
if (i != 3)
1074
PSRLDQ(byteOffsetReg, 4);
1075
if (bitsPerTexel <= 8) {
1076
MOVZX(32, 8, temp2Reg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
1077
PINSRW(destReg, R(temp2Reg), i * 2);
1078
} else if (bitsPerTexel == 16) {
1079
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
1080
} else if (bitsPerTexel == 32) {
1081
if (i == 0) {
1082
MOVD_xmm(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
1083
} else {
1084
// Maybe a temporary would be better, but this path should be rare.
1085
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
1086
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 2), i * 2 + 1);
1087
}
1088
}
1089
}
1090
}
1091
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1092
}
1093
regCache_.Unlock(byteOffsetReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
1094
regCache_.ForceRelease(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
1095
regCache_.Release(baseReg, RegCache::GEN_ARG_TEXPTR);
1096
1097
if (bitsPerTexel == 4) {
1098
// Take only lowest bit, multiply by 4 with shifting.
1099
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
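// Each byte holds two 4-bit texels; an odd U selects the high nibble, so shift right by (u & 1) * 4.
// The 4-bit mask itself is applied later, in the CLUT index transform.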
1100
// Next, shift away based on the odd U bits.
1101
if (cpu_info.bAVX2) {
1102
// This is really convenient with AVX. Just make the bit into a shift amount.
1103
PSLLD(uReg, 31);
1104
PSRLD(uReg, 29);
1105
VPSRLVD(128, destReg, destReg, R(uReg));
1106
} else {
1107
// This creates a mask - FFFFFFFF to shift, zero otherwise.
1108
PSLLD(uReg, 31);
1109
PSRAD(uReg, 31);
1110
1111
X64Reg unshiftedReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1112
MOVDQA(unshiftedReg, R(destReg));
1113
PSRLD(destReg, 4);
1114
// Mask destReg (shifted) and reverse uReg to unshifted masked.
1115
PAND(destReg, R(uReg));
1116
PANDN(uReg, R(unshiftedReg));
1117
// Now combine.
1118
POR(destReg, R(uReg));
1119
regCache_.Release(unshiftedReg, RegCache::VEC_TEMP0);
1120
}
1121
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
1122
regCache_.ForceRelease(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
1123
}
1124
1125
if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32) {
1126
regCache_.Unlock(destReg, RegCache::VEC_INDEX);
1127
} else {
1128
regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1129
regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1130
}
1131
1132
return success;
1133
}
1134
1135
bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) {
1136
Describe("TrCLUTQuad");
1137
GEPaletteFormat fmt = id.ClutFmt();
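// The general transform below mirrors the GE CLUT lookup: index = ((raw >> shift) & mask) | offset,
// with shift, mask, and offset decoded from the cached clutformat word.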
1138
if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
1139
// This is simple - just mask.
1140
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
1141
// Mask to 8 bits for CLUT8/16/32, 4 bits for CLUT4.
1142
PSLLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
1143
PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
1144
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
1145
1146
return true;
1147
}
1148
1149
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
1150
bool maskedIndex = false;
1151
1152
// Okay, first load the actual samplerID clutformat bits we'll use.
1153
X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1154
X64Reg idReg = GetSamplerID();
1155
if (cpu_info.bAVX2 && !id.hasClutShift)
1156
VPBROADCASTD(128, formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
1157
else
1158
MOVD_xmm(formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
1159
UnlockSamplerID(idReg);
1160
1161
// Shift = (clutformat >> 2) & 0x1F
1162
if (id.hasClutShift) {
1163
// Before shifting, let's mask if needed (we always read 32 bits.)
1164
// We have to do this here, because the bits should be zero even if F is used as a mask.
1165
if (bitsPerIndex < 32) {
1166
PSLLD(indexReg, 32 - bitsPerIndex);
1167
PSRLD(indexReg, 32 - bitsPerIndex);
1168
maskedIndex = true;
1169
}
1170
1171
X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1172
// Shift against walls to get 5 bits after the rightmost 2.
1173
PSLLD(shiftReg, formatReg, 32 - 7);
1174
PSRLD(shiftReg, 32 - 5);
1175
// The other lanes are zero, so we can use PSRLD.
1176
PSRLD(indexReg, R(shiftReg));
1177
regCache_.Release(shiftReg, RegCache::VEC_TEMP1);
1178
}
1179
1180
// With shifting done, we need the format in each lane.
1181
if (!cpu_info.bAVX2 || id.hasClutShift)
1182
PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0));
1183
1184
// Mask = (clutformat >> 8) & 0xFF
1185
if (id.hasClutMask) {
1186
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1187
// If it was CLUT4, grab only 4 bits of the mask.
1188
PSLLD(maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
1189
PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
1190
1191
PAND(indexReg, R(maskReg));
1192
regCache_.Release(maskReg, RegCache::VEC_TEMP1);
1193
} else if (!maskedIndex || bitsPerIndex > 8) {
1194
// Apply the fixed 8 bit mask (or the CLUT4 mask if we didn't shift.)
1195
PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
1196
PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
1197
}
1198
1199
// Offset = (clutformat >> 12) & 0x01F0
1200
if (id.hasClutOffset) {
1201
// Use walls to extract the 5 bits at 16, and then put them shifted left by 4.
1202
int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 4 : 5;
1203
PSRLD(formatReg, 16);
1204
PSLLD(formatReg, 32 - offsetBits);
1205
PSRLD(formatReg, 32 - offsetBits - 4);
1206
1207
POR(indexReg, R(formatReg));
1208
}
1209
1210
regCache_.Release(formatReg, RegCache::VEC_TEMP0);
1211
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
1212
return true;
1213
}
1214
1215
bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
1216
Describe("ReadCLUTQuad");
1217
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
1218
1219
if (!id.useSharedClut) {
1220
X64Reg vecLevelReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1221
1222
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
1223
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
1224
MOVD_xmm(vecLevelReg, R(levelReg));
1225
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
1226
} else {
1227
#if PPSSPP_PLATFORM(WINDOWS)
1228
if (cpu_info.bAVX2) {
1229
VPBROADCASTD(128, vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));
1230
} else {
1231
MOVD_xmm(vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));
1232
PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));
1233
}
1234
#else
1235
_assert_(false);
1236
#endif
1237
}
1238
1239
// Now we multiply by 16, and add.
1240
PSLLD(vecLevelReg, 4);
1241
PADDD(indexReg, R(vecLevelReg));
1242
regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
1243
}
1244
1245
X64Reg idReg = GetSamplerID();
1246
X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
1247
MOV(PTRBITS, R(clutBaseReg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
1248
UnlockSamplerID(idReg);
1249
1250
X64Reg resultReg = INVALID_REG;
1251
if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
1252
resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1253
else
1254
resultReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1255
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1256
if (cpu_info.bAVX2 && id.overReadSafe)
1257
PCMPEQD(maskReg, R(maskReg));
1258
1259
switch (id.ClutFmt()) {
1260
case GE_CMODE_16BIT_BGR5650:
1261
case GE_CMODE_16BIT_ABGR5551:
1262
case GE_CMODE_16BIT_ABGR4444:
1263
if (cpu_info.bAVX2 && id.overReadSafe) {
1264
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_2, 0), maskReg);
1265
// Clear out the top 16 bits.
1266
PCMPEQD(maskReg, R(maskReg));
1267
PSRLD(maskReg, 16);
1268
PAND(resultReg, R(maskReg));
1269
} else {
1270
PXOR(resultReg, R(resultReg));
1271
1272
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1273
if (cpu_info.bSSE4_1) {
1274
for (int i = 0; i < 4; ++i) {
1275
PEXTRD(R(temp2Reg), indexReg, i);
1276
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
1277
}
1278
} else {
1279
for (int i = 0; i < 4; ++i) {
1280
MOVD_xmm(R(temp2Reg), indexReg);
1281
if (i != 3)
1282
PSRLDQ(indexReg, 4);
1283
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
1284
}
1285
}
1286
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1287
}
1288
break;
1289
1290
case GE_CMODE_32BIT_ABGR8888:
1291
if (cpu_info.bAVX2 && id.overReadSafe) {
1292
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_4, 0), maskReg);
1293
} else {
1294
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1295
if (cpu_info.bSSE4_1) {
1296
for (int i = 0; i < 4; ++i) {
1297
PEXTRD(R(temp2Reg), indexReg, i);
1298
PINSRD(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0), i);
1299
}
1300
} else {
1301
for (int i = 0; i < 4; ++i) {
1302
MOVD_xmm(R(temp2Reg), indexReg);
1303
if (i != 3)
1304
PSRLDQ(indexReg, 4);
1305
1306
if (i == 0) {
1307
MOVD_xmm(resultReg , MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
1308
} else {
1309
MOVD_xmm(maskReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
1310
PSLLDQ(maskReg, 4 * i);
1311
POR(resultReg, R(maskReg));
1312
}
1313
}
1314
}
1315
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1316
}
1317
break;
1318
}
1319
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
1320
regCache_.Unlock(resultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1321
regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1322
1323
regCache_.Release(clutBaseReg, RegCache::GEN_TEMP1);
1324
regCache_.Release(indexReg, RegCache::VEC_INDEX);
1325
return true;
1326
}
1327
1328
bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
1329
Describe(level1 ? "BlendQuadMips" : "BlendQuad");
1330
1331
if (cpu_info.bSSE4_1 && cpu_info.bSSSE3) {
1332
// Let's start by rearranging from TL TR BL BR like this:
1333
// ABCD EFGH IJKL MNOP -> AI BJ CK DL EM FN GO HP -> AIEM BJFN CKGO DLHP
1334
// This way, all the RGBAs are next to each other, and in order TL BL TR BR.
1335
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1336
X64Reg tempArrangeReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1337
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
1338
PUNPCKLBW(quadReg, R(tempArrangeReg));
1339
// Okay, that's top and bottom interleaved, now for left and right.
1340
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
1341
PUNPCKLWD(quadReg, R(tempArrangeReg));
1342
regCache_.Release(tempArrangeReg, RegCache::VEC_TEMP0);
1343
1344
// Next up, we want to multiply and add using a repeated TB frac pair.
1345
// That's (0x10 - frac_v) in byte 1, frac_v in byte 2, repeating.
1346
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1347
X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1348
X64Reg zeroReg = GetZeroVec();
1349
if (level1) {
1350
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
1351
} else {
1352
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
1353
}
1354
PSHUFB(fracReg, R(zeroReg));
1355
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1356
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1357
1358
// Now, inverse fracReg, then interleave into the actual multiplier.
1359
// This gives us the repeated TB pairs we wanted.
1360
X64Reg multTBReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1361
MOVDQA(multTBReg, M(const10All8_));
1362
PSUBB(multTBReg, R(fracReg));
1363
PUNPCKLBW(multTBReg, R(fracReg));
1364
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
1365
1366
// Now we can multiply and add paired lanes in one go.
1367
// Note that since T+B=0x10, this gives us exactly 12 bits.
1368
PMADDUBSW(quadReg, R(multTBReg));
1369
regCache_.Release(multTBReg, RegCache::VEC_TEMP1);
1370
1371
// With that done, we need to multiply by LR, or rather 0L0R, and sum again.
1372
// Since RRRR was all next to each other, this gives us a clean total R.
1373
fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1374
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1375
if (level1) {
1376
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
1377
} else {
1378
// We can ignore the high bits, since we'll interleave those away anyway.
1379
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));
1380
}
1381
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1382
1383
// Again, we're inversing into an interleaved multiplier. L is the inversed one.
1384
// 0L0R is (0x10 - frac_u), frac_u - 2x16 repeated four times.
1385
X64Reg multLRReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1386
MOVDQA(multLRReg, M(const10All16_));
1387
PSUBW(multLRReg, R(fracReg));
1388
PUNPCKLWD(multLRReg, R(fracReg));
1389
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
1390
1391
// This gives us RGBA as dwords, but they're all shifted left by 8 from the multiplies.
1392
PMADDWD(quadReg, R(multLRReg));
1393
PSRLD(quadReg, 8);
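// Overall, per channel: result = ((TL*(16-fv) + BL*fv)*(16-fu) + (TR*(16-fv) + BR*fv)*fu) >> 8.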
1394
regCache_.Release(multLRReg, RegCache::VEC_TEMP1);
1395
1396
// Shrink to 16-bit, it's more convenient for later.
1397
if (level1) {
1398
PACKSSDW(quadReg, R(quadReg));
1399
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
1400
} else {
1401
if (cpu_info.bAVX) {
1402
VPACKSSDW(128, XMM0, quadReg, R(quadReg));
1403
} else {
1404
PACKSSDW(quadReg, R(quadReg));
1405
MOVDQA(XMM0, R(quadReg));
1406
}
1407
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
1408
1409
regCache_.ForceRelease(RegCache::VEC_RESULT);
1410
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
1411
_assert_msg_(changeSuccess, "Unexpected reg locked as destReg");
1412
}
1413
} else {
1414
X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1415
X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1416
1417
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
1418
X64Reg zeroReg = GetZeroVec();
1419
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
1420
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
1421
PUNPCKLBW(topReg, R(zeroReg));
1422
PUNPCKLBW(bottomReg, R(zeroReg));
1423
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1424
if (!level1) {
1425
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
1426
regCache_.ForceRelease(RegCache::VEC_RESULT);
1427
}
1428
1429
// Grab frac_u and spread to lower (L) lanes.
1430
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1431
X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1432
X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1433
if (level1) {
1434
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
1435
} else {
1436
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));
1437
}
1438
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1439
// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
1440
MOVDQA(fracMulReg, M(const10Low_));
1441
PSUBW(fracMulReg, R(fracReg));
1442
// Then we just put the original frac_u in the upper bits.
1443
PUNPCKLQDQ(fracMulReg, R(fracReg));
1444
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
1445
1446
// Okay, we have 8-bits in the top and bottom rows for the color.
1447
// Multiply by frac to get 12, which we keep for the next stage.
1448
PMULLW(topReg, R(fracMulReg));
1449
PMULLW(bottomReg, R(fracMulReg));
1450
regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);
1451
1452
// Time for frac_v. This time, we want it in all 8 lanes.
1453
fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1454
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
1455
X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1456
if (level1) {
1457
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
1458
} else {
1459
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
1460
}
1461
PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
1462
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
1463
1464
// Now, inverse fracReg into fracTopReg for the top row.
1465
MOVDQA(fracTopReg, M(const10All16_));
1466
PSUBW(fracTopReg, R(fracReg));
1467
1468
// We had 12, plus 4 frac, that gives us 16.
1469
PMULLW(bottomReg, R(fracReg));
1470
PMULLW(topReg, R(fracTopReg));
1471
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
1472
regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);
1473
1474
// Finally, time to sum them all up and divide by 256 to get back to 8 bits.
1475
PADDUSW(bottomReg, R(topReg));
1476
regCache_.Release(topReg, RegCache::VEC_TEMP0);
1477
1478
if (level1) {
1479
PSHUFD(quadReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
1480
PADDUSW(quadReg, R(bottomReg));
1481
PSRLW(quadReg, 8);
1482
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
1483
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
1484
} else {
1485
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
1486
if (!changeSuccess) {
1487
_assert_msg_(XMM0 == bottomReg, "Unexpected other reg locked as destReg");
1488
X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1489
PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
1490
PADDUSW(bottomReg, R(otherReg));
1491
regCache_.Release(otherReg, RegCache::VEC_TEMP0);
1492
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
1493
1494
// Okay, now it can be changed.
1495
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
1496
} else {
1497
PSHUFD(XMM0, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
1498
PADDUSW(XMM0, R(bottomReg));
1499
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
1500
}
1501
1502
PSRLW(XMM0, 8);
1503
}
1504
}
1505
1506
return true;
1507
}
1508
1509
bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
1510
X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT);
1511
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1512
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1513
1514
auto useAlphaFrom = [&](X64Reg alphaColorReg) {
1515
if (cpu_info.bSSE4_1) {
1516
// Copy only alpha.
1517
PBLENDW(resultReg, R(alphaColorReg), 0x08);
1518
} else {
1519
PSRLDQ(alphaColorReg, 6);
1520
PSLLDQ(alphaColorReg, 6);
1521
// Zero out the result alpha and OR them together.
1522
PSLLDQ(resultReg, 10);
1523
PSRLDQ(resultReg, 10);
1524
POR(resultReg, R(alphaColorReg));
1525
}
1526
};
1527
1528
// Note: color is in DWORDs, but result is in WORDs.
1529
switch (id.TexFunc()) {
1530
case GE_TEXFUNC_MODULATE:
1531
Describe("Modulate");
1532
PACKSSDW(primColorReg, R(primColorReg));
1533
if (cpu_info.bAVX) {
1534
VPADDW(128, tempReg, primColorReg, M(constOnes16_));
1535
1536
// Okay, time to multiply. This produces 16 bits, neatly.
1537
VPMULLW(128, resultReg, tempReg, R(resultReg));
1538
} else {
1539
MOVDQA(tempReg, M(constOnes16_));
1540
PADDW(tempReg, R(primColorReg));
1541
1542
PMULLW(resultReg, R(tempReg));
1543
}
1544
1545
if (id.useColorDoubling)
1546
PSRLW(resultReg, 7);
1547
else
1548
PSRLW(resultReg, 8);
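// Net modulate: result = ((prim + 1) * tex) >> 8 per channel (>> 7 with color doubling),
// approximating prim * tex / 255.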
1549
1550
if (!id.useTextureAlpha) {
1551
useAlphaFrom(primColorReg);
1552
} else if (id.useColorDoubling) {
1553
// We still need to finish dividing alpha, it's currently doubled (from the 7 above.)
1554
PSRLW(primColorReg, resultReg, 1);
1555
useAlphaFrom(primColorReg);
1556
}
1557
break;
1558
1559
case GE_TEXFUNC_DECAL:
1560
Describe("Decal");
1561
PACKSSDW(primColorReg, R(primColorReg));
1562
if (id.useTextureAlpha) {
1563
// Get alpha into the tempReg.
1564
PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3));
1565
PADDW(resultReg, M(constOnes16_));
1566
PMULLW(resultReg, R(tempReg));
1567
1568
X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1569
// Materialize some 255s, and subtract out alpha.
1570
PCMPEQD(invAlphaReg, R(invAlphaReg));
1571
PSRLW(invAlphaReg, 8);
1572
PSUBW(invAlphaReg, R(tempReg));
1573
1574
MOVDQA(tempReg, R(primColorReg));
1575
PADDW(tempReg, M(constOnes16_));
1576
PMULLW(tempReg, R(invAlphaReg));
1577
regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1);
1578
1579
// Now sum, and divide.
1580
PADDW(resultReg, R(tempReg));
1581
if (id.useColorDoubling)
1582
PSRLW(resultReg, 7);
1583
else
1584
PSRLW(resultReg, 8);
1585
} else if (id.useColorDoubling) {
1586
PSLLW(resultReg, 1);
1587
}
1588
useAlphaFrom(primColorReg);
1589
break;
1590
1591
case GE_TEXFUNC_BLEND:
1592
{
1593
Describe("EnvBlend");
1594
PACKSSDW(primColorReg, R(primColorReg));
1595
1596
// First off, let's grab the color value.
1597
X64Reg idReg = GetSamplerID();
1598
X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1599
if (cpu_info.bSSE4_1) {
1600
PMOVZXBW(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));
1601
} else {
1602
MOVD_xmm(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));
1603
X64Reg zeroReg = GetZeroVec();
1604
PUNPCKLBW(texEnvReg, R(zeroReg));
1605
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1606
}
1607
UnlockSamplerID(idReg);
1608
1609
// Now merge in the prim color so we have them interleaved, texenv low.
1610
PUNPCKLWD(texEnvReg, R(primColorReg));
1611
1612
// Okay, now materialize 255 for inversing resultReg and rounding.
1613
PCMPEQD(tempReg, R(tempReg));
1614
PSRLW(tempReg, 8);
1615
1616
// If alpha is used, we want the roundup and factor to be zero.
1617
if (id.useTextureAlpha)
1618
PSRLDQ(tempReg, 10);
1619
1620
// We're going to lose tempReg, so save the 255s.
1621
X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1622
MOVDQA(roundValueReg, R(tempReg));
1623
1624
// Okay, now inverse, then merge with resultReg low to match texenv low.
1625
PSUBUSW(tempReg, R(resultReg));
1626
PUNPCKLWD(resultReg, R(tempReg));
1627
1628
if (id.useTextureAlpha) {
1629
// Before we multiply, let's include alpha in that multiply.
1630
PADDW(primColorReg, M(constOnes16_));
1631
// Mask off everything but alpha, and move to the second highest short.
1632
PSRLDQ(primColorReg, 6);
1633
PSLLDQ(primColorReg, 12);
1634
// Now simply merge in with texenv.
1635
POR(texEnvReg, R(primColorReg));
1636
}
1637
1638
// Alright, now to multiply and add all in one go. Note this gives us DWORDs.
1639
PMADDWD(resultReg, R(texEnvReg));
1640
regCache_.Release(texEnvReg, RegCache::VEC_TEMP1);
1641
1642
// Now convert back to 16 bit and add the 255s for rounding.
1643
if (cpu_info.bSSE4_1) {
1644
PACKUSDW(resultReg, R(resultReg));
1645
} else {
1646
PSLLD(resultReg, 16);
1647
PSRAD(resultReg, 16);
1648
PACKSSDW(resultReg, R(resultReg));
1649
}
1650
PADDW(resultReg, R(roundValueReg));
1651
regCache_.Release(roundValueReg, RegCache::VEC_TEMP2);
1652
1653
// Okay, divide by 256 or 128 depending on doubling (we want to preserve the precision.)
1654
if (id.useColorDoubling && id.useTextureAlpha) {
1655
// If doubling, we want to still divide alpha by 256.
1656
PSRLW(resultReg, 7);
1657
PSRLW(primColorReg, resultReg, 1);
1658
useAlphaFrom(primColorReg);
1659
} else if (id.useColorDoubling) {
1660
PSRLW(resultReg, 7);
1661
} else {
1662
PSRLW(resultReg, 8);
1663
}
1664
1665
if (!id.useTextureAlpha)
1666
useAlphaFrom(primColorReg);
1667
break;
1668
}
1669
1670
case GE_TEXFUNC_REPLACE:
1671
Describe("Replace");
1672
if (id.useColorDoubling && id.useTextureAlpha) {
1673
// We can abuse primColorReg as a temp.
1674
MOVDQA(primColorReg, R(resultReg));
1675
// Shift to zero out alpha in resultReg.
1676
PSLLDQ(resultReg, 10);
1677
PSRLDQ(resultReg, 10);
1678
// Now simply add them together, restoring alpha and doubling the colors.
1679
PADDW(resultReg, R(primColorReg));
1680
} else if (!id.useTextureAlpha) {
1681
if (id.useColorDoubling) {
1682
// Let's just double using shifting. Ignore alpha.
1683
PSLLW(resultReg, 1);
1684
}
1685
// Now we want prim_color in W, so convert, then shift-mask away the color.
1686
PACKSSDW(primColorReg, R(primColorReg));
1687
useAlphaFrom(primColorReg);
1688
}
1689
break;
1690
1691
case GE_TEXFUNC_ADD:
1692
case GE_TEXFUNC_UNKNOWN1:
1693
case GE_TEXFUNC_UNKNOWN2:
1694
case GE_TEXFUNC_UNKNOWN3:
1695
Describe("Add");
1696
PACKSSDW(primColorReg, R(primColorReg));
1697
if (id.useTextureAlpha) {
1698
MOVDQA(tempReg, M(constOnes16_));
1699
// Add and multiply the alpha (and others, but we'll mask them.)
1700
PADDW(tempReg, R(primColorReg));
1701
PMULLW(tempReg, R(resultReg));
1702
1703
// Now that we've extracted alpha, sum and double as needed.
1704
PADDW(resultReg, R(primColorReg));
1705
if (id.useColorDoubling)
1706
PSLLW(resultReg, 1);
1707
1708
// Divide by 256 to normalize alpha.
1709
PSRLW(tempReg, 8);
1710
useAlphaFrom(tempReg);
1711
} else {
1712
PADDW(resultReg, R(primColorReg));
1713
if (id.useColorDoubling)
1714
PSLLW(resultReg, 1);
1715
useAlphaFrom(primColorReg);
1716
}
1717
break;
1718
}
1719
1720
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
1721
regCache_.Unlock(resultReg, RegCache::VEC_RESULT);
1722
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
1723
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
1724
return true;
1725
}
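
// Format dispatch: 16/32-bit formats are read and then decoded in place, CLUT
// formats read an index and look it up in the palette, and DXT formats decode a
// block. In each case the goal is an 8888 color in the result register.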

bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
	GETextureFormat fmt = id.TexFmt();
	bool success = true;
	switch (fmt) {
	case GE_TFMT_5650:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_Decode5650(id);
		break;

	case GE_TFMT_5551:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_Decode5551(id);
		break;

	case GE_TFMT_4444:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_Decode4444(id);
		break;

	case GE_TFMT_8888:
		success = Jit_GetTexData(id, 32);
		break;

	case GE_TFMT_CLUT32:
		success = Jit_GetTexData(id, 32);
		if (success)
			success = Jit_TransformClutIndex(id, 32);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_CLUT16:
		success = Jit_GetTexData(id, 16);
		if (success)
			success = Jit_TransformClutIndex(id, 16);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_CLUT8:
		success = Jit_GetTexData(id, 8);
		if (success)
			success = Jit_TransformClutIndex(id, 8);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_CLUT4:
		success = Jit_GetTexData(id, 4);
		if (success)
			success = Jit_TransformClutIndex(id, 4);
		if (success)
			success = Jit_ReadClutColor(id);
		break;

	case GE_TFMT_DXT1:
		success = Jit_GetDXT1Color(id, 8, 255);
		break;

	case GE_TFMT_DXT3:
		success = Jit_GetDXT1Color(id, 16, 0);
		if (success)
			success = Jit_ApplyDXTAlpha(id);
		break;

	case GE_TFMT_DXT5:
		success = Jit_GetDXT1Color(id, 16, 0);
		if (success)
			success = Jit_ApplyDXTAlpha(id);
		break;

	default:
		success = false;
	}

	return success;
}
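
// Layout notes (as read below): the 32 bits of 2-bit color indices sit at offset 0
// of the block, the two 565 base colors at offsets 4 and 6, and for DXT3/DXT5 the
// 8 bytes of alpha data follow at offset 8. DXT1 blocks are 8 bytes, DXT3/5 are 16.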

// Note: afterward, srcReg points at the block, and uReg/vReg have offset into block.
bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int alpha) {
	Describe("DXT1");
	// Like Jit_GetTexData, this gets the color into resultReg.
	// Note: color low bits are red, high bits are blue.
	_assert_msg_(blockSize == 8 || blockSize == 16, "Invalid DXT block size");

	X64Reg colorIndexReg = INVALID_REG;
	if (!id.linear) {
		// First, we need to get the block's offset, which is:
		// blockPos = src + (v/4 * bufw/4 + u/4) * blockSize
		// We distribute the blockSize constant for convenience:
		// blockPos = src + (blockSize*v/4 * bufw/4 + blockSize*u/4)

		// Copy u (we'll need it later), and round down to the nearest 4 after scaling.
		X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
		X64Reg srcBaseReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		LEA(32, srcBaseReg, MScaled(uReg, blockSize / 4, 0));
		AND(32, R(srcBaseReg), Imm32(blockSize == 8 ? ~7 : ~15));
		// Add in srcReg already, since we'll be multiplying soon.
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		ADD(64, R(srcBaseReg), R(srcReg));

		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
		X64Reg srcOffsetReg = regCache_.Alloc(RegCache::GEN_TEMP1);
		LEA(32, srcOffsetReg, MScaled(vReg, blockSize / 4, 0));
		AND(32, R(srcOffsetReg), Imm32(blockSize == 8 ? ~7 : ~15));
		// Modify bufw in place and then multiply.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
		SHR(32, R(bufwReg), Imm8(2));
		IMUL(32, srcOffsetReg, R(bufwReg));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
		// We no longer need bufwReg.
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

		// And now let's chop off the offset for u and v.
		AND(32, R(uReg), Imm32(3));
		AND(32, R(vReg), Imm32(3));

		// Okay, at this point srcBaseReg + srcOffsetReg = blockPos. To free up regs, put back in srcReg.
		LEA(64, srcReg, MRegSum(srcBaseReg, srcOffsetReg));
		regCache_.Release(srcBaseReg, RegCache::GEN_TEMP0);
		regCache_.Release(srcOffsetReg, RegCache::GEN_TEMP1);

		// Make sure we don't grab this as colorIndexReg.
		if (uReg != ECX && !cpu_info.bBMI2)
			regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);

		// The colorIndex is simply the 2 bits at blockPos + (v & 3), shifted right by (u & 3) twice.
		colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		MOVZX(32, 8, colorIndexReg, MRegSum(srcReg, vReg));
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
		// Only DXT3/5 need this reg later.
		if (id.TexFmt() == GE_TFMT_DXT1)
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

		if (uReg == ECX) {
			SHR(32, R(colorIndexReg), R(CL));
			SHR(32, R(colorIndexReg), R(CL));
		} else if (cpu_info.bBMI2) {
			SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
			SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
		} else {
			bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
			_assert_(hasRCX);
			LEA(32, ECX, MScaled(uReg, SCALE_2, 0));
			SHR(32, R(colorIndexReg), R(CL));
		}
		regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
		// If DXT1, there's no alpha and we can toss this reg.
		if (id.TexFmt() == GE_TFMT_DXT1)
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
	} else {
		// For linear, we already precalculated the block pos into srcReg.
		// uReg is the shift for the color index from the 32 bits of color index data.
		regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
		// If we don't have alpha, we don't need vReg.
		if (id.TexFmt() == GE_TFMT_DXT1)
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

		// Make sure we don't grab this as colorIndexReg.
		X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
		if (uReg != ECX && !cpu_info.bBMI2)
			regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);

		// Shift and mask out the 2 bits we need into colorIndexReg.
		colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		if (cpu_info.bBMI2) {
			SHRX(32, colorIndexReg, MatR(srcReg), uReg);
		} else {
			MOV(32, R(colorIndexReg), MatR(srcReg));
			if (uReg != RCX) {
				bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(hasRCX);
				MOV(32, R(RCX), R(uReg));
			}
			SHR(32, R(colorIndexReg), R(CL));
		}
		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		// We're done with U now.
		regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
		regCache_.ForceRelease(RegCache::GEN_ARG_U);
	}

	// Mask out the value.
	AND(32, R(colorIndexReg), Imm32(3));

	X64Reg color1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg color2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);

	// For colorIndex 0 or 1, we'll simply take the 565 color and convert.
	CMP(32, R(colorIndexReg), Imm32(1));
	FixupBranch handleSimple565 = J_CC(CC_BE);

	// Otherwise, it depends if color1 or color2 is higher, so fetch them.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	MOVZX(32, 16, color1Reg, MDisp(srcReg, 4));
	MOVZX(32, 16, color2Reg, MDisp(srcReg, 6));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);

	CMP(32, R(color1Reg), R(color2Reg));
	FixupBranch handleMix23 = J_CC(CC_A, true);

	// If we're still here, then colorIndex is either 3 for 0 (easy) or 2 for 50% mix.
	XOR(32, R(resultReg), R(resultReg));
	CMP(32, R(colorIndexReg), Imm32(3));
	FixupBranch finishZero = J_CC(CC_E, true);

	// At this point, resultReg, colorIndexReg, and maybe R12/R13 can be used as temps.
	// We'll add, then shift from 565 a bit less to "divide" by 2 for a 50/50 mix.

	if (cpu_info.bBMI2_fast) {
		// Expand everything out to 0BGR at 8888, but halved.
		MOV(32, R(colorIndexReg), Imm32(0x007C7E7C));
		PDEP(32, color1Reg, color1Reg, R(colorIndexReg));
		PDEP(32, color2Reg, color2Reg, R(colorIndexReg));

		// Now let's sum them together (this undoes our halving.)
		LEA(32, resultReg, MRegSum(color1Reg, color2Reg));

		// Time to swap into order. Luckily we can ignore alpha.
		BSWAP(32, resultReg);
		SHR(32, R(resultReg), Imm8(8));
	} else {
		// We'll need more regs. Grab two more.
		PUSH(R12);
		PUSH(R13);

		// Start with summing R, then shift into position.
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x0000F800));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x0000F800));
		LEA(32, R12, MRegSum(resultReg, colorIndexReg));
		// The position is 9, instead of 8, due to doubling.
		SHR(32, R(R12), Imm8(9));

		// For G, summing leaves it 4 right (doubling made it not need more.)
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x000007E0));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x000007E0));
		LEA(32, resultReg, MRegSum(resultReg, colorIndexReg));
		SHL(32, R(resultReg), Imm8(5 - 1));
		// Now add G and R together.
		OR(32, R(resultReg), R(R12));

		// At B, we're free to modify the regs in place, finally.
		AND(32, R(color1Reg), Imm32(0x0000001F));
		AND(32, R(color2Reg), Imm32(0x0000001F));
		LEA(32, colorIndexReg, MRegSum(color1Reg, color2Reg));
		// We shift left 2 into position (not 3 due to doubling), then 16 more into the B slot.
		SHL(32, R(colorIndexReg), Imm8(16 + 2));
		// And combine into the result.
		OR(32, R(resultReg), R(colorIndexReg));

		POP(R13);
		POP(R12);
	}

	FixupBranch finishMix50 = J(true);

	// Simply load the 565 color, and convert to 0888.
	SetJumpTarget(handleSimple565);
	srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	MOVZX(32, 16, colorIndexReg, MComplex(srcReg, colorIndexReg, SCALE_2, 4));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	// DXT1 is done with this reg.
	if (id.TexFmt() == GE_TFMT_DXT1)
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	if (cpu_info.bBMI2_fast) {
		// We're only grabbing the high bits, no swizzle here.
		MOV(32, R(resultReg), Imm32(0x00F8FCF8));
		PDEP(32, resultReg, colorIndexReg, R(resultReg));
		BSWAP(32, resultReg);
		SHR(32, R(resultReg), Imm8(8));
	} else {
		// Start with R, shifting it into place.
		MOV(32, R(resultReg), R(colorIndexReg));
		AND(32, R(resultReg), Imm32(0x0000F800));
		SHR(32, R(resultReg), Imm8(8));

		// Then take G and shift it too.
		MOV(32, R(color2Reg), R(colorIndexReg));
		AND(32, R(color2Reg), Imm32(0x000007E0));
		SHL(32, R(color2Reg), Imm8(5));
		// And now combine with R, shifting that in the process.
		OR(32, R(resultReg), R(color2Reg));

		// Modify B in place and OR in.
		AND(32, R(colorIndexReg), Imm32(0x0000001F));
		SHL(32, R(colorIndexReg), Imm8(16 + 3));
		OR(32, R(resultReg), R(colorIndexReg));
	}
	FixupBranch finish565 = J(true);

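	// A note on the "divide by 3" below: as in many JITs, division is done by
	// multiplying with a rounded-up reciprocal and shifting. 0x0AAB is roughly
	// (1 << 13) / 3 and 0xAAAB is roughly (1 << 17) / 3, so (x * 0xAAAB) >> 17
	// gives x / 3 for the small values involved here.
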
	// Here we'll mix color1 and color2 by 2/3 (which of them gets the 2 depends on colorIndexReg.)
	SetJumpTarget(handleMix23);

	// If colorIndexReg is 2, it's color1Reg * 2 + color2Reg, but if colorIndexReg is 3, it's reversed.
	// Let's swap the regs in that case.
	CMP(32, R(colorIndexReg), Imm32(2));
	FixupBranch skipSwap23 = J_CC(CC_E);
	XCHG(32, R(color2Reg), R(color1Reg));
	SetJumpTarget(skipSwap23);

	if (cpu_info.bBMI2_fast) {
		// Gather B, G, and R and space them apart by 14 or 15 bits.
		MOV(64, R(colorIndexReg), Imm64(0x00001F0003F0001FULL));
		PDEP(64, color1Reg, color1Reg, R(colorIndexReg));
		PDEP(64, color2Reg, color2Reg, R(colorIndexReg));
		LEA(64, resultReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));

		// Now multiply all of them by a special constant to divide by 3.
		// This constant is (1 << 13) / 3, which is importantly less than 14 or 15.
		IMUL(64, resultReg, R(resultReg), Imm32(0x00000AAB));

		// Now extract the BGR values to 8 bits each.
		// We subtract 3 from 13 to get 8 from 5 bits, then 2 from 20 + 13, and 3 from 40 + 13.
		MOV(64, R(colorIndexReg), Imm64((0xFFULL << 10) | (0xFFULL << 31) | (0xFFULL << 50)));
		PEXT(64, resultReg, resultReg, R(colorIndexReg));

		// Finally swap B and R.
		BSWAP(32, resultReg);
		SHR(32, R(resultReg), Imm8(8));
	} else {
		// We'll need more regs. Grab two more to keep the stack aligned.
		PUSH(R12);
		PUSH(R13);

		// Start off with R, adding together first...
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x0000F800));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x0000F800));
		LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
		// We'll overflow if we divide here, so shift into place already.
		SHR(32, R(resultReg), Imm8(8));
		// Now we divide that by 3, by actually multiplying by AAAB and shifting off.
		IMUL(32, R12, R(resultReg), Imm32(0x0000AAAB));
		// Now we SHR off the extra bits we added on.
		SHR(32, R(R12), Imm8(17));

		// Now add up G. We leave this in place and shift right more.
		MOV(32, R(resultReg), R(color1Reg));
		AND(32, R(resultReg), Imm32(0x000007E0));
		MOV(32, R(colorIndexReg), R(color2Reg));
		AND(32, R(colorIndexReg), Imm32(0x000007E0));
		LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
		// Again, multiply and now we use AAAB, this time masking.
		IMUL(32, resultReg, R(resultReg), Imm32(0x0000AAAB));
		SHR(32, R(resultReg), Imm8(17 - 5));
		AND(32, R(resultReg), Imm32(0x0000FF00));
		// Let's combine R in already.
		OR(32, R(resultReg), R(R12));

		// Now for B, it starts in the lowest place so we'll need to mask.
		AND(32, R(color1Reg), Imm32(0x0000001F));
		AND(32, R(color2Reg), Imm32(0x0000001F));
		LEA(32, colorIndexReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
		// Instead of shifting left, though, we multiply by a bit more.
		IMUL(32, colorIndexReg, R(colorIndexReg), Imm32(0x0002AAAB));
		AND(32, R(colorIndexReg), Imm32(0x00FF0000));
		OR(32, R(resultReg), R(colorIndexReg));

		POP(R13);
		POP(R12);
	}

	regCache_.Release(colorIndexReg, RegCache::GEN_TEMP0);
	regCache_.Release(color1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(color2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	SetJumpTarget(finishMix50);
	SetJumpTarget(finish565);
	// In all these cases, it's time to add in alpha. Zero doesn't get it.
	if (alpha != 0) {
		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		OR(32, R(resultReg), Imm32(alpha << 24));
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	}

	SetJumpTarget(finishZero);

	return true;
}
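
// DXT3 stores 4 explicit alpha bits per texel; DXT5 stores two 8-bit endpoints
// (a1, a2) plus a 3-bit selector per texel that either picks an endpoint, picks
// 0/255, or interpolates between the endpoints. The helper below ORs the decoded
// alpha into the top byte of the result.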

bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
	GETextureFormat fmt = id.TexFmt();

	// At this point, srcReg points at the block, and u/v are offsets inside it.

	bool success = false;
	if (fmt == GE_TFMT_DXT3) {
		Describe("DXT3A");
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

		if (id.linear) {
			// We precalculated the shift for the 64 bits of alpha data in vReg.
			if (!cpu_info.bBMI2) {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			}

			if (cpu_info.bBMI2) {
				SHRX(64, srcReg, MDisp(srcReg, 8), vReg);
			} else {
				MOV(64, R(srcReg), MDisp(srcReg, 8));
				MOV(32, R(RCX), R(vReg));
				SHR(64, R(srcReg), R(CL));
			}
			// This will mask the 4 bits we want using a wall also.
			SHL(32, R(srcReg), Imm8(28));
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			OR(32, R(resultReg), R(srcReg));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

			success = true;
		} else {
			X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);

			if (uReg != RCX && !cpu_info.bBMI2) {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			}

			X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
			MOVZX(32, 16, temp1Reg, MComplex(srcReg, vReg, SCALE_2, 8));
			if (cpu_info.bBMI2) {
				LEA(32, uReg, MScaled(uReg, SCALE_4, 0));
				SHRX(32, temp1Reg, R(temp1Reg), uReg);
			} else {
				// Still depending on it being GEN_SHIFTVAL or GEN_ARG_U above.
				LEA(32, RCX, MScaled(uReg, SCALE_4, 0));
				SHR(32, R(temp1Reg), R(CL));
			}
			SHL(32, R(temp1Reg), Imm8(28));
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			OR(32, R(resultReg), R(temp1Reg));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
			regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);

			success = true;

			regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
		}

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
		regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
		regCache_.ForceRelease(RegCache::GEN_ARG_V);
	} else if (fmt == GE_TFMT_DXT5) {
		Describe("DXT5A");

		X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
		X64Reg alphaIndexReg = INVALID_REG;
		if (id.linear) {
			// We precalculated the shift for the 64 bits of alpha data in vReg.
			if (cpu_info.bBMI2) {
				alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
				SHRX(64, alphaIndexReg, MDisp(srcReg, 8), vReg);
			} else {
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
				alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);

				MOV(64, R(alphaIndexReg), MDisp(srcReg, 8));
				MOV(32, R(RCX), R(vReg));
				SHR(64, R(alphaIndexReg), R(CL));
			}
			regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
			regCache_.ForceRelease(RegCache::GEN_ARG_V);
		} else {
			X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
			if (uReg != RCX && !cpu_info.bBMI2)
				regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
			alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);

			// Let's figure out the alphaIndex bit offset so we can read the right byte.
			// bitOffset = (u + v * 4) * 3;
			LEA(32, uReg, MComplex(uReg, vReg, SCALE_4, 0));
			LEA(32, uReg, MComplex(uReg, uReg, SCALE_2, 0));
			regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
			regCache_.ForceRelease(RegCache::GEN_ARG_V);

			if (cpu_info.bBMI2) {
				SHRX(64, alphaIndexReg, MDisp(srcReg, 8), uReg);
			} else {
				// And now the byte offset and bit from there, from those.
				MOV(32, R(alphaIndexReg), R(uReg));
				SHR(32, R(alphaIndexReg), Imm8(3));
				AND(32, R(uReg), Imm32(7));

				// Load 16 bits and mask, in case it straddles bytes.
				MOVZX(32, 16, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8));
				// If not, it's in what was bufwReg.
				if (uReg != RCX) {
					_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
					MOV(32, R(RCX), R(uReg));
				}
				SHR(32, R(alphaIndexReg), R(CL));
			}
			regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
			regCache_.ForceRelease(RegCache::GEN_ARG_U);
		}

		X64Reg alpha1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
		X64Reg alpha2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

		AND(32, R(alphaIndexReg), Imm32(7));

		X64Reg temp3Reg = regCache_.Alloc(RegCache::GEN_TEMP3);

		// Okay, now check for 0 or 1 alphaIndex in alphaIndexReg, those are simple.
		CMP(32, R(alphaIndexReg), Imm32(1));
		FixupBranch handleSimple = J_CC(CC_BE, true);

		// Now load a1 and a2, since the rest depend on those values. Frees up srcReg.
		MOVZX(32, 8, alpha1Reg, MDisp(srcReg, 14));
		MOVZX(32, 8, alpha2Reg, MDisp(srcReg, 15));

		CMP(32, R(alpha1Reg), R(alpha2Reg));
		FixupBranch handleLerp8 = J_CC(CC_A);

		// Okay, check for zero or full alpha, at alphaIndex 6 or 7.
		CMP(32, R(alphaIndexReg), Imm32(6));
		FixupBranch finishZero = J_CC(CC_E, true);
		// Remember, MOV doesn't affect flags.
		MOV(32, R(srcReg), Imm32(0xFF));
		FixupBranch finishFull = J_CC(CC_A, true);

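		// In both lerp paths below, the index weights are scaled by 256 (shifted left
		// 8), and the two weights sum to 5 or 7, so the weighted sum has to be divided
		// by 5*256 or 7*256. As usual that's done with reciprocal multiplies: 0x3334
		// is roughly (1 << 16) / 5, and 0x124A is roughly (1 << 15) / 7, with the
		// extra subtract/add/shift dance correcting the /7 approximation.
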
		// At this point, we're handling a 6-step lerp between alpha1 and alpha2.
		SHL(32, R(alphaIndexReg), Imm8(8));
		// Prepare a multiplier in temp3Reg and multiply alpha1 by it.
		MOV(32, R(temp3Reg), Imm32(6 << 8));
		SUB(32, R(temp3Reg), R(alphaIndexReg));
		IMUL(32, alpha1Reg, R(temp3Reg));
		// And now the same for alpha2, using alphaIndexReg.
		SUB(32, R(alphaIndexReg), Imm32(1 << 8));
		IMUL(32, alpha2Reg, R(alphaIndexReg));

		// Let's skip a step and sum before dividing by 5, also adding the 31.
		LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 5 * 31));
		// To divide by 5, we will actually multiply by 0x3334 and shift.
		IMUL(32, srcReg, Imm32(0x3334));
		SHR(32, R(srcReg), Imm8(24));
		FixupBranch finishLerp6 = J(true);

		// This will be an 8-step lerp between alpha1 and alpha2.
		SetJumpTarget(handleLerp8);
		SHL(32, R(alphaIndexReg), Imm8(8));
		// Prepare a multiplier in temp3Reg and multiply alpha1 by it.
		MOV(32, R(temp3Reg), Imm32(8 << 8));
		SUB(32, R(temp3Reg), R(alphaIndexReg));
		IMUL(32, alpha1Reg, R(temp3Reg));
		// And now the same for alpha2, using alphaIndexReg.
		SUB(32, R(alphaIndexReg), Imm32(1 << 8));
		IMUL(32, alpha2Reg, R(alphaIndexReg));

		// And divide by 7 together here too, also adding the 31.
		LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 7 * 31));
		// Our magic constant here is 0x124A, but it's a bit more complex than just a shift.
		IMUL(32, alpha1Reg, R(srcReg), Imm32(0x124A));
		SHR(32, R(alpha1Reg), Imm8(15));
		SUB(32, R(srcReg), R(alpha1Reg));
		SHR(32, R(srcReg), Imm8(1));
		ADD(32, R(srcReg), R(alpha1Reg));
		SHR(32, R(srcReg), Imm8(10));

		FixupBranch finishLerp8 = J();

		SetJumpTarget(handleSimple);
		// Just load the specified alpha byte.
		MOVZX(32, 8, srcReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 14));

		regCache_.Release(alphaIndexReg, RegCache::GEN_TEMP0);
		regCache_.Release(alpha1Reg, RegCache::GEN_TEMP1);
		regCache_.Release(alpha2Reg, RegCache::GEN_TEMP2);
		regCache_.Release(temp3Reg, RegCache::GEN_TEMP3);

		SetJumpTarget(finishFull);
		SetJumpTarget(finishLerp6);
		SetJumpTarget(finishLerp8);

		SHL(32, R(srcReg), Imm8(24));
		X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
		OR(32, R(resultReg), R(srcReg));
		regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
		success = true;

		SetJumpTarget(finishZero);

		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
		regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
	}

	_dbg_assert_(success);
	return success;
}
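
// For non-swizzled data, the texel address is simply src + (v * bufw + u) scaled
// by the texel size; 4-bit textures address the byte at u/2 and then pick the low
// or high nibble depending on whether u was odd.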

bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
	if (id.swizzle) {
		return Jit_GetTexDataSwizzled(id, bitsPerTexel);
	}

	_assert_msg_(!id.linear, "Should not use this path for linear");
	Describe("TexData");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	// srcReg might be EDX, so let's use it and uReg (folding them into temp1Reg) before we multiply.
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	bool success = true;
	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		LEA(64, temp1Reg, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		if (cpu_info.bBMI2_fast)
			MOV(32, R(temp2Reg), Imm32(0x0F));
		else
			XOR(32, R(temp2Reg), R(temp2Reg));
		SHR(32, R(uReg), Imm8(1));
		FixupBranch skip = J_CC(CC_NC);
		// Track whether we shifted a 1 off or not.
		if (cpu_info.bBMI2_fast)
			SHL(32, R(temp2Reg), Imm8(4));
		else
			MOV(32, R(temp2Reg), Imm32(4));
		SetJumpTarget(skip);
		LEA(64, temp1Reg, MRegSum(srcReg, uReg));
		break;
	}

	default:
		success = false;
		break;
	}
	// All done with u and texptr.
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
	MOV(32, R(resultReg), R(vReg));
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	IMUL(32, resultReg, R(bufwReg));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	if (bitsPerTexel == 4 && !cpu_info.bBMI2) {
		bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
		_assert_(hasRCX);
	}

	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		MOVZX(32, bitsPerTexel, resultReg, MComplex(temp1Reg, resultReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		SHR(32, R(resultReg), Imm8(1));
		if (cpu_info.bBMI2_fast) {
			MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
			PEXT(32, resultReg, resultReg, R(temp2Reg));
		} else if (cpu_info.bBMI2) {
			SHRX(32, resultReg, MRegSum(temp1Reg, resultReg), temp2Reg);
			AND(32, R(resultReg), Imm8(0x0F));
		} else {
			MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
			// RCX is now free.
			MOV(8, R(RCX), R(temp2Reg));
			SHR(8, R(resultReg), R(RCX));
			// Zero out any bits not shifted off.
			AND(32, R(resultReg), Imm8(0x0F));
		}
		break;
	}

	default:
		success = false;
		break;
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return success;
}
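
// Swizzled textures are stored as tiles of 16 bytes by 8 rows laid out linearly,
// so the offset math below first finds the tile and then the byte within it.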

bool SamplerJitCache::Jit_GetTexDataSwizzled4(const SamplerID &id) {
	Describe("TexDataS4");
	_assert_msg_(!id.linear, "Should not use this path for linear");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

	// Get the horizontal tile pos into temp1Reg.
	LEA(32, temp1Reg, MScaled(uReg, SCALE_4, 0));
	// Note: imm8 sign extends negative.
	AND(32, R(temp1Reg), Imm8(~127));

	// Add vertical offset inside tile to temp1Reg.
	LEA(32, temp2Reg, MScaled(vReg, SCALE_4, 0));
	AND(32, R(temp2Reg), Imm8(31));
	LEA(32, temp1Reg, MComplex(temp1Reg, temp2Reg, SCALE_4, 0));
	// Add srcReg, since we'll need it at some point.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	ADD(64, R(temp1Reg), R(srcReg));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	// Now find the vertical tile pos, and add to temp1Reg.
	SHR(32, R(vReg), Imm8(3));
	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	LEA(32, temp2Reg, MScaled(bufwReg, SCALE_4, 0));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	IMUL(32, temp2Reg, R(vReg));
	ADD(64, R(temp1Reg), R(temp2Reg));
	// We no longer have a good value in vReg.
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	// Last and possibly also least, the horizontal offset inside the tile.
	AND(32, R(uReg), Imm8(31));
	SHR(32, R(uReg), Imm8(1));
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	MOV(8, R(resultReg), MRegSum(temp1Reg, uReg));
	FixupBranch skipNonZero = J_CC(CC_NC);
	// If the horizontal offset was odd, take the upper 4.
	SHR(8, R(resultReg), Imm8(4));
	SetJumpTarget(skipNonZero);
	// Zero out the rest of the bits.
	AND(32, R(resultReg), Imm8(0x0F));
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	// This destroyed u as well.
	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	return true;
}

bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4(id);
	}

	bool success = true;
	_assert_msg_(!id.linear, "Should not use this path for linear");

	Describe("TexDataS");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);

	LEA(32, temp1Reg, MScaled(vReg, SCALE_4, 0));
	AND(32, R(temp1Reg), Imm8(31));
	AND(32, R(vReg), Imm8(~7));

	MOV(32, R(temp2Reg), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(temp2Reg), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(temp2Reg), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		success = false;
		break;
	}
	AND(32, R(temp2Reg), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(temp1Reg), R(temp2Reg));
	ADD(32, R(temp1Reg), R(resultReg));

	// We may clobber srcReg in the multiply, so let's grab it now.
	X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
	LEA(64, temp1Reg, MComplex(srcReg, temp1Reg, SCALE_4, 0));
	regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);

	X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
	LEA(32, resultReg, MScaled(bufwReg, SCALE_4, 0));
	regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
	// We can throw bufw away, now.
	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);

	IMUL(32, resultReg, R(vReg));
	// We no longer have a good value in vReg.
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRelease(RegCache::GEN_ARG_V);

	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(temp1Reg, resultReg));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		LEA(32, resultReg, MComplex(resultReg, uReg, SCALE_2, 0));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(resultReg), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
		break;
	default:
		success = false;
		break;
	}

	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.ForceRelease(RegCache::GEN_ARG_U);

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return success;
}
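
// Texel coordinates are computed in 8.8 fixed point: s and t are scaled by
// width*256 and height*256, truncated, then shifted down by 8, and finally
// clamped or wrapped to the level's size.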

bool SamplerJitCache::Jit_GetTexelCoords(const SamplerID &id) {
	Describe("Texel");

	X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
	X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
	X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
	X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
	if (id.hasAnyMips) {
		// We have to figure out levels and the proper width, ugh.
		X64Reg idReg = GetSamplerID();
		X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);

		X64Reg levelReg = INVALID_REG;
		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
		} else {
			levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
			MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
		}

		// We'll multiply these at the same time, so it's nice to put together.
		UNPCKLPS(sReg, R(tReg));
		SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));

		X64Reg sizesReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
		} else {
			MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLWD(sizesReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}

		// We just want this value as a float, times 256.
		PSLLD(sizesReg, 8);
		CVTDQ2PS(sizesReg, R(sizesReg));

		// Okay, we can multiply now, and convert back to integer.
		MULPS(sReg, R(sizesReg));
		CVTTPS2DQ(sReg, R(sReg));
		regCache_.Release(sizesReg, RegCache::VEC_TEMP0);

		PSRAD(sReg, 8);

		// Reuse tReg for the level1 values.
		if (!cpu_info.bSSE4_1)
			PSHUFD(tReg, R(sReg), _MM_SHUFFLE(3, 2, 3, 2));

		auto applyClampWrap = [&](X64Reg dest, bool clamp, bool isY, bool isLevel1) {
			int offset = offsetof(SamplerID, cached.sizes[0].w) + (isY ? 2 : 0) + (isLevel1 ? 4 : 0);
			// Grab the size, already pre-shifted for us.
			MOVZX(32, 16, tempReg, MComplex(idReg, levelReg, SCALE_4, offset));

			// Grab the size from the multiply.
			if (cpu_info.bSSE4_1) {
				if (isY || isLevel1)
					PEXTRD(R(dest), sReg, (isY ? 1 : 0) + (isLevel1 ? 2 : 0));
				else
					MOVD_xmm(R(dest), sReg);
			} else {
				X64Reg srcReg = isLevel1 ? tReg : sReg;
				MOVD_xmm(R(dest), srcReg);
				if (!isY)
					PSRLDQ(srcReg, 4);
			}

			SUB(32, R(tempReg), Imm8(1));
			AND(32, R(tempReg), Imm32(0x000001FF));
			if (clamp) {
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_G);
				XOR(32, R(tempReg), R(tempReg));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_L);
			} else {
				AND(32, R(dest), R(tempReg));
			}
		};

		// Do the next level first, so we can save them and reuse the regs.
		// Note: for non-SSE4, this must be in S/T order.
		applyClampWrap(uReg, id.clampS, false, true);
		applyClampWrap(vReg, id.clampT, true, true);

		// Okay, now stuff them on the stack - we'll load them again later.
		MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0), R(uReg));
		MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4), R(vReg));

		// And then the given level.
		// Note: for non-SSE4, this must be in S/T order.
		applyClampWrap(uReg, id.clampS, false, false);
		applyClampWrap(vReg, id.clampT, true, false);

		UnlockSamplerID(idReg);
		regCache_.Release(tempReg, RegCache::GEN_TEMP0);
		regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
	} else {
		// Multiply, then convert to integer...
		UNPCKLPS(sReg, R(tReg));
		MULPS(sReg, M(constWidthHeight256f_));
		CVTTPS2DQ(sReg, R(sReg));
		// Great, shift out the fraction.
		PSRAD(sReg, 8);

		// Square textures are kinda common.
		bool clampApplied = false;
		if (id.width0Shift == id.height0Shift) {
			if (!id.clampS && !id.clampT) {
				PAND(sReg, M(constWidthMinus1i_));
				clampApplied = true;
			} else if (id.clampS && id.clampT && cpu_info.bSSE4_1) {
				X64Reg zeroReg = GetZeroVec();
				PMINSD(sReg, M(constWidthMinus1i_));
				PMAXSD(sReg, R(zeroReg));
				regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
				clampApplied = true;
			}
		}

		// Now extract to do the clamping (unless we already did it.)
		MOVQ_xmm(R(uReg), sReg);
		MOV(64, R(vReg), R(uReg));
		SHR(64, R(vReg), Imm8(32));
		// Strip off the top bits (the 32-bit op zero-extends, clearing the high half.)
		AND(32, R(uReg), R(uReg));

		auto applyClampWrap = [this](X64Reg dest, bool clamp, uint8_t shift) {
			// Clamp and wrap both max out at 512.
			if (shift > 9)
				shift = 9;

			if (clamp) {
				X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
				MOV(32, R(tempReg), Imm32((1 << shift) - 1));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_G);
				XOR(32, R(tempReg), R(tempReg));
				CMP(32, R(dest), R(tempReg));
				CMOVcc(32, dest, R(tempReg), CC_L);
				regCache_.Release(tempReg, RegCache::GEN_TEMP0);
			} else {
				AND(32, R(dest), Imm32((1 << shift) - 1));
			}
		};

		// Now apply clamp/wrap.
		if (!clampApplied) {
			applyClampWrap(uReg, id.clampS, id.width0Shift);
			applyClampWrap(vReg, id.clampT, id.height0Shift);
		}
	}

	regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
	regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
	regCache_.ForceRetain(RegCache::GEN_ARG_U);
	regCache_.ForceRetain(RegCache::GEN_ARG_V);

	// And get rid of S and T, we're done with them now.
	regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
	regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
	regCache_.ForceRelease(RegCache::VEC_ARG_S);
	regCache_.ForceRelease(RegCache::VEC_ARG_T);

	return true;
}
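
// For bilinear filtering we need two texels per axis, so this produces the X and Y
// coordinates for the four samples at once (plus the next mip level's set when
// mipmapping), along with the 4-bit fraction used to weight them, extracted before
// the coordinates are shifted down.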

bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
	Describe("TexelQuad");

	X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
	X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);

	// We use this if there are mips later, to apply wrap/clamp.
	X64Reg sizesReg = INVALID_REG;

	// Start by multiplying with the width/height... which might be complex with mips.
	if (id.hasAnyMips) {
		// We have to figure out levels and the proper width, ugh.
		X64Reg idReg = GetSamplerID();

		X64Reg levelReg = INVALID_REG;
		// To avoid ABI problems, we don't hold onto level.
		bool releaseLevelReg = !regCache_.Has(RegCache::GEN_ARG_LEVEL);
		if (!releaseLevelReg) {
			levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
		} else {
			releaseLevelReg = true;
			levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
			MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
		}

		// This will load the current and next level's sizes, 16x4.
		sizesReg = regCache_.Alloc(RegCache::VEC_TEMP5);
		// We actually want this in 32-bit, though, so extend.
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
		} else {
			MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLWD(sizesReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}

		if (releaseLevelReg)
			regCache_.Release(levelReg, RegCache::GEN_ARG_LEVEL);
		else
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
		UnlockSamplerID(idReg);

		// Now make a float version of sizesReg, times 256.
		X64Reg sizes256Reg = regCache_.Alloc(RegCache::VEC_TEMP0);
		PSLLD(sizes256Reg, sizesReg, 8);
		CVTDQ2PS(sizes256Reg, R(sizes256Reg));

		// Next off, move S and T into a single reg, which will become U0 V0 U1 V1.
		UNPCKLPS(sReg, R(tReg));
		SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));
		// And multiply by the sizes, all lined up already.
		MULPS(sReg, R(sizes256Reg));
		regCache_.Release(sizes256Reg, RegCache::VEC_TEMP0);

		// For wrap/clamp purposes, we want width or height minus one. Do that now.
		PSUBD(sizesReg, M(constOnes32_));
		PAND(sizesReg, M(constMaxTexel32_));
	} else {
		// Easy mode.
		UNPCKLPS(sReg, R(tReg));
		MULPS(sReg, M(constWidthHeight256f_));
	}

	// And now, convert to integers for all later processing.
	CVTPS2DQ(sReg, R(sReg));

	// Now adjust X and Y...
	X64Reg tempXYReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	// Produce a -128 constant.
	PCMPEQD(tempXYReg, R(tempXYReg));
	PSLLD(tempXYReg, 7);
	PADDD(sReg, R(tempXYReg));
	regCache_.Release(tempXYReg, RegCache::VEC_TEMP0);

	// We do want the fraction, though, so extract that to an XMM for later.
	X64Reg allFracReg = INVALID_REG;
	if (regCache_.Has(RegCache::VEC_FRAC))
		allFracReg = regCache_.Find(RegCache::VEC_FRAC);
	else
		allFracReg = regCache_.Alloc(RegCache::VEC_FRAC);
	// We only want the four bits after the first four, though.
	PSLLD(allFracReg, sReg, 24);
	PSRLD(allFracReg, 28);
	// It's convenient later if this is in the low words only.
	PACKSSDW(allFracReg, R(allFracReg));
	regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
	regCache_.ForceRetain(RegCache::VEC_FRAC);

	// With those extracted, we can now get rid of the fractional bits.
	PSRAD(sReg, 8);

	// Now it's time to separate the lanes into separate registers and add next UV offsets.
	if (id.hasAnyMips) {
		X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
		X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
		PSHUFD(u1Reg, R(sReg), _MM_SHUFFLE(2, 2, 2, 2));
		PSHUFD(v1Reg, R(sReg), _MM_SHUFFLE(3, 3, 3, 3));
		PADDD(u1Reg, M(constUNext_));
		PADDD(v1Reg, M(constVNext_));
		regCache_.Unlock(u1Reg, RegCache::VEC_U1);
		regCache_.Unlock(v1Reg, RegCache::VEC_V1);
	}

	PSHUFD(tReg, R(sReg), _MM_SHUFFLE(1, 1, 1, 1));
	PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
	PADDD(tReg, M(constVNext_));
	PADDD(sReg, M(constUNext_));

	X64Reg temp0ClampReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	bool temp0ClampZero = false;

	auto doClamp = [&](bool clamp, X64Reg stReg, const OpArg &bound) {
		if (!clamp) {
			// Wrapping is easy.
			PAND(stReg, bound);
			return;
		}

		if (!temp0ClampZero)
			PXOR(temp0ClampReg, R(temp0ClampReg));
		temp0ClampZero = true;

		if (cpu_info.bSSE4_1) {
			PMINSD(stReg, bound);
			PMAXSD(stReg, R(temp0ClampReg));
		} else {
			temp0ClampZero = false;
			// Set temp to max(0, stReg) = AND(NOT(0 > stReg), stReg).
			PCMPGTD(temp0ClampReg, R(stReg));
			PANDN(temp0ClampReg, R(stReg));

			// Now make a mask where bound is greater than the ST value in temp0ClampReg.
			if (cpu_info.bAVX && bound.IsSimpleReg()) {
				VPCMPGTD(128, stReg, bound.GetSimpleReg(), R(temp0ClampReg));
			} else {
				MOVDQA(stReg, bound);
				PCMPGTD(stReg, R(temp0ClampReg));
			}
			// Throw away the values that are greater in our temp0ClampReg in progress result.
			PAND(temp0ClampReg, R(stReg));

			// Now, set bound only where ST was too high.
			PANDN(stReg, bound);
			// And put in the values that were fine.
			POR(stReg, R(temp0ClampReg));
		}
	};

	if (id.hasAnyMips) {
		// We'll spread sizes out into a temp.
		X64Reg spreadSizeReg = regCache_.Alloc(RegCache::VEC_TEMP1);

		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(0, 0, 0, 0));
		doClamp(id.clampS, sReg, R(spreadSizeReg));
		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(1, 1, 1, 1));
		doClamp(id.clampT, tReg, R(spreadSizeReg));
		X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
		X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(2, 2, 2, 2));
		doClamp(id.clampS, u1Reg, R(spreadSizeReg));
		PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(3, 3, 3, 3));
		doClamp(id.clampT, v1Reg, R(spreadSizeReg));
		regCache_.Unlock(u1Reg, RegCache::VEC_U1);
		regCache_.Unlock(v1Reg, RegCache::VEC_V1);

		regCache_.Release(spreadSizeReg, RegCache::VEC_TEMP1);
	} else {
		doClamp(id.clampS, sReg, M(constWidthMinus1i_));
		doClamp(id.clampT, tReg, M(constHeightMinus1i_));
	}

	if (sizesReg != INVALID_REG)
		regCache_.Release(sizesReg, RegCache::VEC_TEMP5);
	regCache_.Release(temp0ClampReg, RegCache::VEC_TEMP0);

	regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
	regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
	regCache_.Change(RegCache::VEC_ARG_S, RegCache::VEC_ARG_U);
	regCache_.Change(RegCache::VEC_ARG_T, RegCache::VEC_ARG_V);
	return true;
}
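
// A negative bit count below is used to pass the DXT block size in bytes instead
// of bits per texel.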

bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1) {
	_assert_(id.linear);

	bool success = true;
	int bits = 0;
	switch (id.TexFmt()) {
	case GE_TFMT_5650:
	case GE_TFMT_5551:
	case GE_TFMT_4444:
	case GE_TFMT_CLUT16:
		bits = 16;
		break;

	case GE_TFMT_8888:
	case GE_TFMT_CLUT32:
		bits = 32;
		break;

	case GE_TFMT_CLUT8:
		bits = 8;
		break;

	case GE_TFMT_CLUT4:
		bits = 4;
		break;

	case GE_TFMT_DXT1:
		bits = -8;
		break;

	case GE_TFMT_DXT3:
	case GE_TFMT_DXT5:
		bits = -16;
		break;

	default:
		success = false;
	}

	if (success && bits != 0) {
		if (bits < 0) {
			success = Jit_PrepareDataDXTOffsets(id, uReg, vReg, level1, -bits);
		} else if (id.swizzle) {
			success = Jit_PrepareDataSwizzledOffsets(id, uReg, vReg, level1, bits);
		} else {
			success = Jit_PrepareDataDirectOffsets(id, uReg, vReg, level1, bits);
		}
	}

	return success;
}
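
// Computes, per SIMD lane, the byte offset (v * bufw + u) * bytesPerTexel for the
// four samples. Without SSE4.1's PMULLD, the 32-bit multiply is emulated with two
// PMULUDQ ops on the even and odd lanes.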

bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {
	Describe("DataOff");
	X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!id.useStandardBufw || id.hasAnyMips) {
		// Spread bufw into each lane.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
		} else {
			PXOR(bufwVecReg, R(bufwVecReg));
			PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
		}
		PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);

		if (bitsPerTexel == 4)
			PSRLD(bufwVecReg, 1);
		else if (bitsPerTexel == 16)
			PSLLD(bufwVecReg, 1);
		else if (bitsPerTexel == 32)
			PSLLD(bufwVecReg, 2);
	}

	if (id.useStandardBufw && !id.hasAnyMips) {
		int amt = id.width0Shift;
		if (bitsPerTexel == 4)
			amt -= 1;
		else if (bitsPerTexel == 16)
			amt += 1;
		else if (bitsPerTexel == 32)
			amt += 2;
		// It's aligned to 16 bytes, so must at least be 16.
		PSLLD(vReg, std::max(4, amt));
	} else if (cpu_info.bSSE4_1) {
		// And now multiply. This is slow, but not worse than the SSE2 version...
		PMULLD(vReg, R(bufwVecReg));
	} else {
		// Copy that into another temp for multiply.
		X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		MOVDQA(vOddLaneReg, R(vReg));

		// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
		PMULUDQ(vReg, R(bufwVecReg));
		PSRLDQ(vOddLaneReg, 4);
		PSRLDQ(bufwVecReg, 4);
		// And now get XXXX DDDD XXXX BBBB.
		PMULUDQ(vOddLaneReg, R(bufwVecReg));

		// We know everything is positive, so XXXX must be zero. Let's combine.
		PSLLDQ(vOddLaneReg, 4);
		POR(vReg, R(vOddLaneReg));
		regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
	}
	regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);

	if (bitsPerTexel == 4) {
		// Need to keep uvec for the odd bit.
		X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
		MOVDQA(uCopyReg, R(uReg));
		PSRLD(uCopyReg, 1);
		PADDD(vReg, R(uCopyReg));
		regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);
	} else {
		// Destroy uvec, we won't use it again.
		if (bitsPerTexel == 16)
			PSLLD(uReg, 1);
		else if (bitsPerTexel == 32)
			PSLLD(uReg, 2);
		PADDD(vReg, R(uReg));
	}

	return true;
}
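
// Same idea as above, but for swizzled data: the byte offset is built from the tile
// row ((v / 8) * bufw * bitsPerTexel), the row within the tile ((v & 7) * 16), the
// tile column ((u / texels_per_tile) * 128), and the byte within the tile row.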

bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {
	Describe("DataOffS");
	// See Jit_GetTexDataSwizzled() for usage of this offset.

	X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!id.useStandardBufw || id.hasAnyMips) {
		// Spread bufw into each lane.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
		} else {
			PXOR(bufwVecReg, R(bufwVecReg));
			PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
		}
		PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
	}

	// Divide vvec by 8 in a temp.
	X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);
	PSRLD(vMultReg, vReg, 3);

	// And now multiply by bufw. May be able to use a shift in a common case.
	int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);
	if (id.useStandardBufw && !id.hasAnyMips) {
		int amt = id.width0Shift;
		// Account for 16 byte minimum.
		amt = std::max(7 - shiftAmount, amt);
		shiftAmount += amt;
	} else if (cpu_info.bSSE4_1) {
		// And now multiply. This is slow, but not worse than the SSE2 version...
		PMULLD(vMultReg, R(bufwVecReg));
	} else {
		// Copy that into another temp for multiply.
		X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP2);
		MOVDQA(vOddLaneReg, R(vMultReg));

		// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
		PMULUDQ(vMultReg, R(bufwVecReg));
		PSRLDQ(vOddLaneReg, 4);
		PSRLDQ(bufwVecReg, 4);
		// And now get XXXX DDDD XXXX BBBB.
		PMULUDQ(vOddLaneReg, R(bufwVecReg));

		// We know everything is positive, so XXXX must be zero. Let's combine.
		PSLLDQ(vOddLaneReg, 4);
		POR(vMultReg, R(vOddLaneReg));
		regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP2);
	}
	regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);

	// Multiply the result by bitsPerTexel using a shift.
	PSLLD(vMultReg, shiftAmount);

	// Now we're adding (v & 7) * 16. Use a 16-bit wall.
	PSLLW(vReg, 13);
	PSRLD(vReg, 9);
	PADDD(vReg, R(vMultReg));
	regCache_.Release(vMultReg, RegCache::VEC_TEMP1);

	// Now get ((uvec / texels_per_tile) / 4) * 32 * 4 aka (uvec / (128 / bitsPerTexel)) << 7.
	X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	PSRLD(uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
	PSLLD(uCopyReg, 7);
	// Add it in to our running total.
	PADDD(vReg, R(uCopyReg));

	if (bitsPerTexel == 4) {
		// Finally, we want (uvec & 31) / 2. Use a 16-bit wall.
		PSLLW(uCopyReg, uReg, 11);
		PSRLD(uCopyReg, 12);
		// With that, this is our byte offset. uvec & 1 has which half.
		PADDD(vReg, R(uCopyReg));
	} else {
		// We can destroy uvec in this path. Clear all but 2 bits for 32, 3 for 16, or 4 for 8.
		PSLLW(uReg, 32 - clz32_nonzero(bitsPerTexel - 1) + 9);
		// Now that it's at the top of the 16 bits, we always shift that to the top of 4 bits.
		PSRLD(uReg, 12);
		PADDD(vReg, R(uReg));
	}
	regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);

	return true;
}
3146
3147
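// The function above produces, per lane, a byte offset into the PSP's swizzled texture
// layout, where the texture is stored as 16-byte-wide, 8-row tiles laid out left to
// right, top to bottom. A scalar sketch of the offset it computes, for illustration
// only; this helper is not used by the JIT and its name is made up.
static inline u32 RefSwizzledByteOffset(u32 u, u32 v, u32 bufw, int bitsPerTexel) {
	u32 rowBytes = bufw * bitsPerTexel / 8;            // Bytes per row of texels.
	u32 byteU = u * bitsPerTexel / 8;                  // Horizontal position in bytes (rounds down for 4-bit.)
	u32 tileRowOffset = (v / 8) * rowBytes * 8;        // Skip whole rows of 8-line tiles.
	u32 tileColOffset = (byteU / 16) * 128;            // Skip whole 16x8 = 128-byte tiles within this row.
	return tileRowOffset + tileColOffset + (v & 7) * 16 + (byteU & 15);
}
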
bool SamplerJitCache::Jit_PrepareDataDXTOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int blockSize) {
	Describe("DataOffDXT");
	// We need to get the block's offset, which is:
	// blockPos = src + (v/4 * bufw/4 + u/4) * blockSize
	// We distribute the blockSize constant for convenience:
	// blockPos = src + (blockSize*v/4 * bufw/4 + blockSize*u/4)

	X64Reg baseVReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
	// This gives us the V factor for the block, which we multiply by bufw.
	PSRLD(baseVReg, vReg, 2);
	PSLLD(baseVReg, blockSize == 16 ? 4 : 3);

	X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!id.useStandardBufw || id.hasAnyMips) {
		// Spread bufw into each lane.
		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
		if (cpu_info.bSSE4_1) {
			PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
		} else {
			PXOR(bufwVecReg, R(bufwVecReg));
			PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
		}
		PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);

		// Divide by 4 before the multiply.
		PSRLD(bufwVecReg, 2);
	}

	if (id.useStandardBufw && !id.hasAnyMips) {
		int amt = id.width0Shift - 2;
		if (amt < 0)
			PSRLD(baseVReg, -amt);
		else if (amt > 0)
			PSLLD(baseVReg, amt);
	} else if (cpu_info.bSSE4_1) {
		// And now multiply. This is slow, but not worse than the SSE2 version...
		PMULLD(baseVReg, R(bufwVecReg));
	} else {
		// Copy that into another temp for multiply.
		X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		MOVDQA(vOddLaneReg, R(baseVReg));

		// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
		PMULUDQ(baseVReg, R(bufwVecReg));
		PSRLDQ(vOddLaneReg, 4);
		PSRLDQ(bufwVecReg, 4);
		// And now get XXXX DDDD XXXX BBBB.
		PMULUDQ(vOddLaneReg, R(bufwVecReg));

		// We know everything is positive, so XXXX must be zero. Let's combine.
		PSLLDQ(vOddLaneReg, 4);
		POR(baseVReg, R(vOddLaneReg));
		regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
	}
	regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);

	// Now add in the U factor for the block.
	X64Reg baseUReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	PSRLD(baseUReg, uReg, 2);
	PSLLD(baseUReg, blockSize == 16 ? 4 : 3);
	PADDD(baseVReg, R(baseUReg));
	regCache_.Release(baseUReg, RegCache::VEC_TEMP0);

	// Okay, the base index (block byte offset from src) is ready.
	regCache_.Unlock(baseVReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
	regCache_.ForceRetain(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);

	// For everything else, we only want the low two bits of U and V.
	PSLLD(uReg, 30);
	PSLLD(vReg, 30);

	X64Reg alphaTempRegU = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5)
		PSRLD(alphaTempRegU, uReg, 30);

	PSRLD(uReg, 30 - 1);
	PSRLD(vReg, 30 - 3);
	// At this point, uReg is now the bit offset of the color index.
	PADDD(uReg, R(vReg));

	// Grab the alpha index into vReg next.
	if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5) {
		PSRLD(vReg, 1);
		PADDD(vReg, R(alphaTempRegU));

		if (id.TexFmt() == GE_TFMT_DXT3) {
			PSLLD(vReg, 2);
		} else if (id.TexFmt() == GE_TFMT_DXT5) {
			// Multiply by 3.
			PSLLD(alphaTempRegU, vReg, 1);
			PADDD(vReg, R(alphaTempRegU));
		}
	}
	regCache_.Release(alphaTempRegU, RegCache::VEC_TEMP0);

	return true;
}

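// The function above leaves three values per lane: the block's byte offset from the
// texture pointer, the bit offset of the texel's 2-bit color index inside the block's
// 32-bit index word, and (for DXT3/DXT5) the bit offset of its alpha index. A scalar
// sketch of those formulas, for illustration only; this is a hypothetical helper, not
// used by the JIT.
static inline void RefDXTOffsets(u32 u, u32 v, u32 bufw, int blockSize, int alphaBitsPerTexel,
		u32 *blockByteOffset, u32 *colorIndexBitOffset, u32 *alphaIndexBitOffset) {
	// Blocks are 4x4 texels; a row of blocks is bufw / 4 blocks wide.
	*blockByteOffset = ((v / 4) * (bufw / 4) + (u / 4)) * blockSize;
	// Color indices are 2 bits per texel, packed row by row.
	*colorIndexBitOffset = (v & 3) * 8 + (u & 3) * 2;
	// Alpha indices are 4 bits (DXT3) or 3 bits (DXT5) per texel, also row by row.
	*alphaIndexBitOffset = ((v & 3) * 4 + (u & 3)) * alphaBitsPerTexel;
}
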
bool SamplerJitCache::Jit_DecodeQuad(const SamplerID &id, bool level1) {
	GETextureFormat decodeFmt = id.TexFmt();
	switch (id.TexFmt()) {
	case GE_TFMT_CLUT32:
	case GE_TFMT_CLUT16:
	case GE_TFMT_CLUT8:
	case GE_TFMT_CLUT4:
		// The values match, so just use the clut fmt.
		decodeFmt = (GETextureFormat)id.ClutFmt();
		break;

	default:
		// We'll decode below.
		break;
	}

	bool success = true;
	X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);

	switch (decodeFmt) {
	case GE_TFMT_5650:
		success = Jit_Decode5650Quad(id, quadReg);
		break;

	case GE_TFMT_5551:
		success = Jit_Decode5551Quad(id, quadReg);
		break;

	case GE_TFMT_4444:
		success = Jit_Decode4444Quad(id, quadReg);
		break;

	default:
		// Doesn't need decoding.
		break;
	}

	regCache_.Unlock(quadReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
	return success;
}

bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
	Describe("5650Quad");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);

	// Filter out red only into temp1. We do this by shifting into a wall.
	PSLLD(temp1Reg, quadReg, 32 - 5);
	// Move it right to the top of the 8 bits.
	PSRLD(temp1Reg, 24);

	// Now we bring in blue, since it's also 5 like red.
	// Luckily, we know the top 16 bits are zero. Shift right into a wall.
	PSRLD(temp2Reg, quadReg, 11);
	// Shift blue into place at 19, and merge back to temp1.
	PSLLD(temp2Reg, 19);
	POR(temp1Reg, R(temp2Reg));

	// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
	PSLLD(temp2Reg, temp1Reg, 1);

	// We go to green last because it's the different one. Shift off red and blue.
	PSRLD(quadReg, 5);
	// Use a word shift to put a wall just at the right place, top 6 bits of second byte.
	PSLLW(quadReg, 10);
	// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)
	POR(temp2Reg, R(quadReg));
	POR(quadReg, R(temp1Reg));

	// Now shift and mask temp2 for swizzle.
	PSRLD(temp2Reg, 6);
	PAND(temp2Reg, M(const5650Swizzle_));
	// And then OR that in too. Only alpha left now.
	POR(quadReg, R(temp2Reg));

	if (id.useTextureAlpha) {
		// Just put a fixed FF in. Maybe we could even avoid this and act like it's FF later...
		PCMPEQD(temp2Reg, R(temp2Reg));
		PSLLD(temp2Reg, 24);
		POR(quadReg, R(temp2Reg));
	}

	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
	return true;
}

bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {
	Describe("5650");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	if (cpu_info.bBMI2_fast) {
		// Start off with the high bits.
		MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
		PDEP(32, temp1Reg, resultReg, R(temp1Reg));
		if (id.useTextureAlpha || id.fetch)
			OR(32, R(temp1Reg), Imm32(0xFF000000));

		// Now grab the low bits (they end up packed.)
		MOV(32, R(temp2Reg), Imm32(0x0000E61C));
		PEXT(32, resultReg, resultReg, R(temp2Reg));
		// And spread them back out.
		MOV(32, R(temp2Reg), Imm32(0x00070307));
		PDEP(32, resultReg, resultReg, R(temp2Reg));

		// Finally put the high bits in, we're done.
		OR(32, R(resultReg), R(temp1Reg));
	} else {
		MOV(32, R(temp2Reg), R(resultReg));
		AND(32, R(temp2Reg), Imm32(0x0000001F));

		// B (we do R and B at the same time, they're both 5.)
		MOV(32, R(temp1Reg), R(resultReg));
		AND(32, R(temp1Reg), Imm32(0x0000F800));
		SHL(32, R(temp1Reg), Imm8(5));
		OR(32, R(temp2Reg), R(temp1Reg));

		// Expand 5 -> 8. At this point we have 00BB00RR.
		MOV(32, R(temp1Reg), R(temp2Reg));
		SHL(32, R(temp2Reg), Imm8(3));
		SHR(32, R(temp1Reg), Imm8(2));
		OR(32, R(temp2Reg), R(temp1Reg));
		AND(32, R(temp2Reg), Imm32(0x00FF00FF));

		// Now's as good a time to put in A as any.
		if (id.useTextureAlpha || id.fetch)
			OR(32, R(temp2Reg), Imm32(0xFF000000));

		// Last, we need to align, extract, and expand G.
		// 3 to align to G, and then 2 to expand to 8.
		SHL(32, R(resultReg), Imm8(3 + 2));
		AND(32, R(resultReg), Imm32(0x0000FC00));
		MOV(32, R(temp1Reg), R(resultReg));
		// 2 to account for resultReg being preshifted, 4 for expansion.
		SHR(32, R(temp1Reg), Imm8(2 + 4));
		OR(32, R(resultReg), R(temp1Reg));
		AND(32, R(resultReg), Imm32(0x0000FF00));
		OR(32, R(resultReg), R(temp2Reg));
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

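// Both 5650 decoders above implement the usual 565 -> 8888 expansion: each channel's
// top bits are replicated into the new low bits so 0x1F and 0x3F both map to 0xFF.
// A plain scalar equivalent, for reference only; this hypothetical helper is not part
// of the JIT.
static inline u32 RefDecode5650(u16 c) {
	u32 r = (c >> 0) & 0x1F;
	u32 g = (c >> 5) & 0x3F;
	u32 b = (c >> 11) & 0x1F;
	r = (r << 3) | (r >> 2);
	g = (g << 2) | (g >> 4);
	b = (b << 3) | (b >> 2);
	// 5650 has no alpha; the JIT only writes the FF byte when alpha is actually used.
	return 0xFF000000 | (b << 16) | (g << 8) | r;
}
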
bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
	Describe("5551Quad");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);

	// Filter out red only into temp1. We do this by shifting into a wall.
	PSLLD(temp1Reg, quadReg, 32 - 5);
	// Move it right to the top of the 8 bits.
	PSRLD(temp1Reg, 24);

	// Add in green and shift into place (top 5 bits of byte 2.)
	PSRLD(temp2Reg, quadReg, 5);
	PSLLW(temp2Reg, 11);
	POR(temp1Reg, R(temp2Reg));

	// First, extend alpha using an arithmetic shift.
	// We use 10 to meanwhile get rid of green too. The extra alpha bits are fine.
	PSRAW(quadReg, 10);
	// This gets rid of those extra alpha bits and puts blue in place too.
	PSLLD(quadReg, 19);

	// Combine both together, we still need to swizzle.
	POR(quadReg, R(temp1Reg));
	PSRLD(temp1Reg, quadReg, 5);

	// Now for swizzle, we'll mask carefully to avoid overflow.
	PAND(temp1Reg, M(const5551Swizzle_));
	// Then finally merge in the swizzle bits.
	POR(quadReg, R(temp1Reg));

	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
	return true;
}

bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {
	Describe("5551");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

	if (cpu_info.bBMI2_fast) {
		// First, grab the top bits.
		bool keepAlpha = id.useTextureAlpha || id.fetch;
		MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
		PDEP(32, resultReg, resultReg, R(temp1Reg));

		// Now make the swizzle bits.
		MOV(32, R(temp2Reg), R(resultReg));
		SHR(32, R(temp2Reg), Imm8(5));
		AND(32, R(temp2Reg), Imm32(0x00070707));

		if (keepAlpha) {
			// Sign extend the alpha bit to 8 bits.
			SHL(32, R(resultReg), Imm8(7));
			SAR(32, R(resultReg), Imm8(7));
		}

		OR(32, R(resultReg), R(temp2Reg));
	} else {
		MOV(32, R(temp2Reg), R(resultReg));
		MOV(32, R(temp1Reg), R(resultReg));
		AND(32, R(temp2Reg), Imm32(0x0000001F));
		AND(32, R(temp1Reg), Imm32(0x000003E0));
		SHL(32, R(temp1Reg), Imm8(3));
		OR(32, R(temp2Reg), R(temp1Reg));

		MOV(32, R(temp1Reg), R(resultReg));
		AND(32, R(temp1Reg), Imm32(0x00007C00));
		SHL(32, R(temp1Reg), Imm8(6));
		OR(32, R(temp2Reg), R(temp1Reg));

		// Expand 5 -> 8. After this is just A.
		MOV(32, R(temp1Reg), R(temp2Reg));
		SHL(32, R(temp2Reg), Imm8(3));
		SHR(32, R(temp1Reg), Imm8(2));
		// Chop off the bits that were shifted out.
		AND(32, R(temp1Reg), Imm32(0x00070707));
		OR(32, R(temp2Reg), R(temp1Reg));

		if (id.useTextureAlpha || id.fetch) {
			// For A, we sign extend to get either 16 1s or 0s of alpha.
			SAR(16, R(resultReg), Imm8(15));
			// Now, shift left by 24 to get the lowest 8 of those at the top.
			SHL(32, R(resultReg), Imm8(24));
			OR(32, R(resultReg), R(temp2Reg));
		} else {
			MOV(32, R(resultReg), R(temp2Reg));
		}
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

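// A scalar equivalent of the 5551 decoders above, for reference only; hypothetical
// helper, not part of the JIT. The 5-bit channels expand by bit replication and the
// single alpha bit becomes either 00 or FF.
static inline u32 RefDecode5551(u16 c) {
	u32 r = (c >> 0) & 0x1F;
	u32 g = (c >> 5) & 0x1F;
	u32 b = (c >> 10) & 0x1F;
	r = (r << 3) | (r >> 2);
	g = (g << 3) | (g >> 2);
	b = (b << 3) | (b >> 2);
	u32 a = (c & 0x8000) ? 0xFF : 0x00;
	return (a << 24) | (b << 16) | (g << 8) | r;
}
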
bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
	Describe("4444Quad");
	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);

	// Mask and move red into position within temp1.
	PSLLD(temp1Reg, quadReg, 28);
	PSRLD(temp1Reg, 24);

	// Green is easy too, we use a word shift to get a free wall.
	PSRLD(temp2Reg, quadReg, 4);
	PSLLW(temp2Reg, 12);
	POR(temp1Reg, R(temp2Reg));

	// Blue isn't last this time, but it's next.
	PSRLD(temp2Reg, quadReg, 8);
	PSLLD(temp2Reg, 28);
	PSRLD(temp2Reg, 8);
	POR(temp1Reg, R(temp2Reg));

	if (id.useTextureAlpha) {
		// Last but not least, alpha.
		PSRLW(quadReg, 12);
		PSLLD(quadReg, 28);
		POR(quadReg, R(temp1Reg));

		// Masking isn't necessary here since everything is 4 wide.
		PSRLD(temp1Reg, quadReg, 4);
		POR(quadReg, R(temp1Reg));
	} else {
		// Overwrite quadReg (we need temp1 as a copy anyway.)
		PSRLD(quadReg, temp1Reg, 4);
		POR(quadReg, R(temp1Reg));
	}

	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
	return true;
}

alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };

bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {
	Describe("4444");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);

	if (cpu_info.bBMI2_fast) {
		X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
		// First, spread the bits out with spaces.
		MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));
		PDEP(32, resultReg, resultReg, R(temp1Reg));

		// Now swizzle the low bits in.
		MOV(32, R(temp1Reg), R(resultReg));
		SHR(32, R(temp1Reg), Imm8(4));
		OR(32, R(resultReg), R(temp1Reg));

		regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	} else {
		X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
		X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
		X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);

		MOVD_xmm(vecTemp1Reg, R(resultReg));
		PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));
		if (RipAccessible(color4444mask)) {
			PAND(vecTemp1Reg, M(color4444mask));
		} else {
			X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
			MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
			PAND(vecTemp1Reg, MatR(temp1Reg));
			regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
		}
		MOVSS(vecTemp2Reg, R(vecTemp1Reg));
		MOVSS(vecTemp3Reg, R(vecTemp1Reg));
		PSRLW(vecTemp2Reg, 4);
		PSLLW(vecTemp3Reg, 4);
		POR(vecTemp1Reg, R(vecTemp2Reg));
		POR(vecTemp1Reg, R(vecTemp3Reg));
		MOVD_xmm(R(resultReg), vecTemp1Reg);

		regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
		regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
		regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
	}
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

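// A scalar equivalent of the 4444 decoders above, for reference only; hypothetical
// helper, not part of the JIT. Expanding 4 bits to 8 by duplicating the nibble is the
// same as multiplying by 0x11, which is what the PDEP-plus-OR and the SSE paths do.
static inline u32 RefDecode4444(u16 c) {
	u32 r = (c >> 0) & 0xF;
	u32 g = (c >> 4) & 0xF;
	u32 b = (c >> 8) & 0xF;
	u32 a = (c >> 12) & 0xF;
	return ((a * 0x11) << 24) | ((b * 0x11) << 16) | ((g * 0x11) << 8) | (r * 0x11);
}
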
bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerIndex) {
	Describe("TrCLUT");
	GEPaletteFormat fmt = id.ClutFmt();
	if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
		// This is simple - just mask if necessary.
		if (bitsPerIndex > 8) {
			X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
			AND(32, R(resultReg), Imm32(0x000000FF));
			regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
		}
		return true;
	}

	if (!cpu_info.bBMI2) {
		bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
		_assert_msg_(hasRCX, "Could not obtain RCX, locked?");
	}

	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg idReg = GetSamplerID();
	MOV(32, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
	UnlockSamplerID(idReg);

	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	int shiftedToSoFar = 0;

	// Shift = (clutformat >> 2) & 0x1F
	if (id.hasClutShift) {
		SHR(32, R(temp1Reg), Imm8(2 - shiftedToSoFar));
		shiftedToSoFar = 2;

		if (cpu_info.bBMI2) {
			SHRX(32, resultReg, R(resultReg), temp1Reg);
		} else {
			_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
			MOV(32, R(RCX), R(temp1Reg));
			SHR(32, R(resultReg), R(RCX));
		}
	}

	// Mask = (clutformat >> 8) & 0xFF
	if (id.hasClutMask) {
		SHR(32, R(temp1Reg), Imm8(8 - shiftedToSoFar));
		shiftedToSoFar = 8;

		AND(32, R(resultReg), R(temp1Reg));
	}

	// We need to wrap any entries beyond the first 1024 bytes.
	u32 offsetMask = fmt == GE_CMODE_32BIT_ABGR8888 ? 0x00FF : 0x01FF;

	// We must mask to 0xFF before ORing 0x100 in 16 bit CMODEs.
	// But skip if we'll mask 0xFF after offset anyway.
	if (bitsPerIndex > 8 && (!id.hasClutOffset || offsetMask != 0x00FF)) {
		AND(32, R(resultReg), Imm32(0x000000FF));
	}

	// Offset = (clutformat >> 12) & 0x01F0
	if (id.hasClutOffset) {
		SHR(32, R(temp1Reg), Imm8(16 - shiftedToSoFar));
		SHL(32, R(temp1Reg), Imm8(4));
		OR(32, R(resultReg), R(temp1Reg));
		AND(32, R(resultReg), Imm32(offsetMask));
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
	return true;
}

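// The transform above mirrors how the GE turns a raw palette index into a CLUT entry:
// shift, mask, then OR in the offset, with the result wrapped to the CLUT's 1024-byte
// window (256 entries for 32-bit CLUTs, 512 for 16-bit ones). A scalar sketch of the
// same steps, for illustration only (hypothetical helper; the field extraction follows
// the comments in the code above, and the index is assumed to already fit in 8 bits):
static inline u32 RefTransformClutIndex(u32 index, u32 clutformat, bool clut32) {
	u32 shift = (clutformat >> 2) & 0x1F;
	u32 mask = (clutformat >> 8) & 0xFF;
	u32 offset = (clutformat >> 12) & 0x01F0;
	u32 result = ((index >> shift) & mask) | offset;
	// Wrap anything beyond the first 1024 bytes of CLUT.
	return result & (clut32 ? 0x00FF : 0x01FF);
}
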
bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
	Describe("ReadCLUT");
	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
	_assert_msg_(!id.linear, "Should not use this path for linear");

	if (!id.useSharedClut) {
		X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
			X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
			// We need to multiply by 16 and add, LEA allows us to copy too.
			LEA(32, temp2Reg, MScaled(levelReg, SCALE_4, 0));
			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
			if (id.fetch)
				regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
		} else {
			_assert_(stackLevelOffset_ != -1);
			// The argument was saved on the stack.
			MOV(32, R(temp2Reg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
			LEA(32, temp2Reg, MScaled(temp2Reg, SCALE_4, 0));
		}

		// Second step of the multiply by 16 (since we only multiplied by 4 before.)
		LEA(64, resultReg, MComplex(resultReg, temp2Reg, SCALE_4, 0));
		regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	}

	X64Reg idReg = GetSamplerID();
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	MOV(PTRBITS, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
	UnlockSamplerID(idReg);

	switch (id.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
	case GE_CMODE_16BIT_ABGR5551:
	case GE_CMODE_16BIT_ABGR4444:
		MOVZX(32, 16, resultReg, MComplex(temp1Reg, resultReg, SCALE_2, 0));
		break;

	case GE_CMODE_32BIT_ABGR8888:
		MOV(32, R(resultReg), MComplex(temp1Reg, resultReg, SCALE_4, 0));
		break;
	}

	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);

	switch (id.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
		return Jit_Decode5650(id);

	case GE_CMODE_16BIT_ABGR5551:
		return Jit_Decode5551(id);

	case GE_CMODE_16BIT_ABGR4444:
		return Jit_Decode4444(id);

	case GE_CMODE_32BIT_ABGR8888:
		return true;

	default:
		return false;
	}
}

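// The lookup above boils down to: bias the index when CLUTs aren't shared (each mip
// level then owns its own 16 entries), fetch a 2- or 4-byte entry from the cached CLUT
// pointer, and run 16-bit formats through the decoders above. A scalar sketch of the
// index bias and entry size only, for illustration; hypothetical helper, not used by
// the JIT.
static inline u32 RefClutEntryByteOffset(u32 index, int level, bool sharedClut, bool clut32) {
	if (!sharedClut)
		index += level * 16;  // Matches the two LEA-by-4 steps (level * 16) above.
	return index * (clut32 ? 4 : 2);
}
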
};

#endif