GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file fetch_jit.cpp
*
* @brief Implementation of the fetch jitter
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder_gfx_mem.h"
#include "jit_api.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
#include "functionpasses/passes.h"

//#define FETCH_DUMP_VERTEX 1
using namespace llvm;
using namespace SwrJit;

bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43
enum ConversionType
44
{
45
CONVERT_NONE,
46
CONVERT_NORMALIZED,
47
CONVERT_USCALED,
48
CONVERT_SSCALED,
49
CONVERT_SFIXED,
50
};
51
52
//////////////////////////////////////////////////////////////////////////
53
/// Interface to Jitting a fetch shader
54
//////////////////////////////////////////////////////////////////////////
55
struct FetchJit : public BuilderGfxMem
56
{
57
FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {}
58
59
Function* Create(const FETCH_COMPILE_STATE& fetchState);
60
61
Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62
Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63
Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64
template <typename T>
65
Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66
67
// package up Shuffle*bpcGatherd args into a tuple for convenience
68
typedef std::tuple<Value*&,
69
Value*,
70
const Instruction::CastOps,
71
const ConversionType,
72
uint32_t&,
73
uint32_t&,
74
const ComponentEnable,
75
const ComponentControl (&)[4],
76
Value* (&)[4],
77
const uint32_t (&)[4]>
78
Shuffle8bpcArgs;
79
80
void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81
void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82
83
typedef std::tuple<Value* (&)[2],
84
Value*,
85
const Instruction::CastOps,
86
const ConversionType,
87
uint32_t&,
88
uint32_t&,
89
const ComponentEnable,
90
const ComponentControl (&)[4],
91
Value* (&)[4]>
92
Shuffle16bpcArgs;
93
94
void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95
void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96
97
void StoreVertexElements(Value* pVtxOut,
98
const uint32_t outputElt,
99
const uint32_t numEltsToStore,
100
Value* (&vVertexElements)[4]);
101
102
Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103
104
void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105
Value* streams,
106
Value* vIndices,
107
Value* pVtxOut);
108
109
bool IsOddFormat(SWR_FORMAT format);
110
bool IsUniformFormat(SWR_FORMAT format);
111
void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112
void CreateGatherOddFormats(
113
SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114
void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115
116
Value* mpFetchInfo;
117
};
118
119
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
120
{
121
std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
122
fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
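// the function name embeds a CRC of the fetch compile state, giving each distinct state its own JITted function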
123
124
Function* fetch = Function::Create(
125
JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
126
BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
127
128
fetch->getParent()->setModuleIdentifier(fetch->getName());
129
130
IRB()->SetInsertPoint(entry);
131
132
auto argitr = fetch->arg_begin();
133
134
// Fetch shader arguments
135
Value* privateContext = &*argitr;
136
++argitr;
137
privateContext->setName("privateContext");
138
SetPrivateContext(privateContext);
139
140
mpWorkerData = &*argitr;
141
++argitr;
142
mpWorkerData->setName("pWorkerData");
143
144
mpFetchInfo = &*argitr;
145
++argitr;
146
mpFetchInfo->setName("fetchInfo");
147
Value* pVtxOut = &*argitr;
148
pVtxOut->setName("vtxOutput");
149
150
uint32_t baseWidth = mVWidth;
151
152
SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153
154
// Override builder target width to force 16-wide SIMD
155
#if USE_SIMD16_SHADERS
156
SetTargetWidth(16);
157
#endif
158
159
pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160
161
// SWR_FETCH_CONTEXT::pStreams
162
Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163
streams->setName("pStreams");
164
165
// SWR_FETCH_CONTEXT::pIndices
166
Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167
indices->setName("pIndices");
168
169
// SWR_FETCH_CONTEXT::pLastIndex
170
Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171
pLastIndex->setName("pLastIndex");
172
173
Value* vIndices;
174
switch (fetchState.indexType)
175
{
176
case R8_UINT:
177
indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178
if (fetchState.bDisableIndexOOBCheck)
179
{
180
vIndices = LOAD(
181
BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)),
182
{(uint32_t)0});
183
vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184
}
185
else
186
{
187
vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188
}
189
break;
190
case R16_UINT:
191
if (fetchState.bDisableIndexOOBCheck)
192
{
193
vIndices = LOAD(
194
BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)),
195
{(uint32_t)0});
196
vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197
}
198
else
199
{
200
vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201
}
202
break;
203
case R32_UINT:
204
(fetchState.bDisableIndexOOBCheck)
205
? vIndices = LOAD(indices,
206
"",
207
PointerType::get(mSimdInt32Ty, 0),
208
MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
209
: vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
210
break; // incoming type is already 32bit int
211
default:
212
vIndices = nullptr;
213
assert(false && "Unsupported index type");
214
break;
215
}
216
217
if (fetchState.bForceSequentialAccessEnable)
218
{
219
Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
220
: C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
221
222
// VertexData buffers are accessed sequentially; the index is equal to the vertex number
223
vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
224
vIndices = ADD(vIndices, pOffsets);
225
}
226
227
Value* vVertexId = vIndices;
228
if (fetchState.bVertexIDOffsetEnable)
229
{
230
// Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
231
// correct
232
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
233
Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
234
vVertexId = ADD(vIndices, vBaseVertex);
235
vVertexId = ADD(vVertexId, vStartVertex);
236
}
237
238
// store out vertex IDs
239
if (mVWidth == 16)
240
{
241
// store out in simd8 halves until core supports 16-wide natively
242
auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
243
auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
244
STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
245
STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
246
}
247
else if (mVWidth == 8)
248
{
249
STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
250
}
251
252
// store out cut mask if enabled
253
if (fetchState.bEnableCutIndex)
254
{
255
Value* vCutIndex = VIMMED1(fetchState.cutIndex);
256
Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
257
258
if (mVWidth == 16)
259
{
260
auto cutMaskLo = EXTRACT_16(cutMask, 0);
261
auto cutMaskHi = EXTRACT_16(cutMask, 1);
262
STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
263
STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
264
}
265
else if (mVWidth == 8)
266
{
267
STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
268
}
269
}
270
271
// Fetch attributes from memory and output to a simdvertex struct
272
JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
273
274
RET_VOID();
275
276
JitManager::DumpToFile(fetch, "src");
277
278
#if defined(_DEBUG)
279
verifyFunction(*fetch);
280
#endif
281
282
::FunctionPassManager setupPasses(JM()->mpCurrentModule);
283
284
///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
285
setupPasses.add(createBreakCriticalEdgesPass());
286
setupPasses.add(createCFGSimplificationPass());
287
setupPasses.add(createEarlyCSEPass());
288
setupPasses.add(createPromoteMemoryToRegisterPass());
289
290
setupPasses.run(*fetch);
291
292
JitManager::DumpToFile(fetch, "se");
293
294
::FunctionPassManager optPasses(JM()->mpCurrentModule);
295
296
///@todo Haven't touched these either. Need to remove some of these and add others.
297
optPasses.add(createCFGSimplificationPass());
298
optPasses.add(createEarlyCSEPass());
299
optPasses.add(createInstructionCombiningPass());
300
#if LLVM_VERSION_MAJOR <= 11
301
optPasses.add(createConstantPropagationPass());
302
#endif
303
optPasses.add(createSCCPPass());
304
optPasses.add(createAggressiveDCEPass());
305
306
optPasses.run(*fetch);
307
308
optPasses.add(createLowerX86Pass(this));
309
optPasses.run(*fetch);
310
311
JitManager::DumpToFile(fetch, "opt");
312
313
314
// Revert 16-wide override
315
#if USE_SIMD16_SHADERS
316
SetTargetWidth(baseWidth);
317
#endif
318
319
return fetch;
320
}
321
322
// returns true for odd formats that require special state.gather handling
323
bool FetchJit::IsOddFormat(SWR_FORMAT format)
324
{
325
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
326
if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
327
{
328
return true;
329
}
330
return false;
331
}
332
333
// format is uniform if all components are the same size and type
334
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
335
{
336
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
337
uint32_t bpc0 = info.bpc[0];
338
uint32_t type0 = info.type[0];
339
340
for (uint32_t c = 1; c < info.numComps; ++c)
341
{
342
if (bpc0 != info.bpc[c] || type0 != info.type[c])
343
{
344
return false;
345
}
346
}
347
return true;
348
}
349
350
// unpacks components based on format
351
// foreach component in the pixel
352
// mask off everything but this component
353
// shift component to LSB
354
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
355
{
356
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
357
358
uint32_t bitOffset = 0;
359
for (uint32_t c = 0; c < info.numComps; ++c)
360
{
361
uint32_t swizzledIndex = info.swizzle[c];
362
uint32_t compBits = info.bpc[c];
363
uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
364
Value* comp = AND(vInput, bitmask);
365
comp = LSHR(comp, bitOffset);
366
367
result[swizzledIndex] = comp;
368
bitOffset += compBits;
369
}
370
}
371
372
// gather for odd component size formats
373
// gather SIMD full pixels per lane then shift/mask to move each component to their
374
// own vector
375
void FetchJit::CreateGatherOddFormats(
376
SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
377
{
378
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
379
380
// only works if pixel size is <= 32bits
381
SWR_ASSERT(info.bpp <= 32);
382
383
Value* pGather;
384
if (info.bpp == 32)
385
{
386
pGather =
387
GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
388
}
389
else
390
{
391
// Can't use 32-bit gather for items less than 32-bits, could cause page faults.
392
Value* pMem = ALLOCA(mSimdInt32Ty);
393
STORE(VIMMED1(0u), pMem);
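// stage each lane's pixel into the zeroed SIMD-sized scratch buffer on the stack, then load the whole vector at once below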
394
395
Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
396
397
for (uint32_t lane = 0; lane < mVWidth; ++lane)
398
{
399
// Get index
400
Value* index = VEXTRACT(pOffsets, C(lane));
401
Value* mask = VEXTRACT(pMask, C(lane));
402
403
// use branch around load based on mask
404
// Needed to avoid page-faults on unmasked lanes
405
BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
406
BasicBlock* pMaskedLoadBlock =
407
BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
408
BasicBlock* pEndLoadBB =
409
BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
410
411
COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
412
413
JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
414
415
switch (info.bpp)
416
{
417
case 8:
418
{
419
Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
420
Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
421
STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
422
break;
423
}
424
425
case 16:
426
{
427
Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
428
Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
429
STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
430
break;
431
}
433
434
case 24:
435
{
436
// First 16-bits of data
437
Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
438
Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
439
STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
440
441
// Last 8-bits of data
442
pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
443
xpSrc = ADD(xpSrc, C((int64_t)2));
444
STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
445
break;
446
}
447
448
default:
449
SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
450
break;
451
}
452
453
BR(pEndLoadBB);
454
JM()->mBuilder.SetInsertPoint(pEndLoadBB);
455
}
456
457
pGather = LOAD(pMem);
458
}
459
460
for (uint32_t comp = 0; comp < 4; ++comp)
461
{
462
pResult[comp] = VIMMED1((int)info.defaults[comp]);
463
}
464
465
UnpackComponents(format, pGather, pResult);
466
467
// cast to fp32
468
pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
469
pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
470
pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
471
pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
472
}
473
474
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
475
{
476
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
477
478
for (uint32_t c = 0; c < info.numComps; ++c)
479
{
480
uint32_t compIndex = info.swizzle[c];
481
482
// skip any conversion on UNUSED components
483
if (info.type[c] == SWR_TYPE_UNUSED)
484
{
485
continue;
486
}
487
488
if (info.isNormalized[c])
489
{
490
if (info.type[c] == SWR_TYPE_SNORM)
491
{
492
/// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
493
/// -1.0f.
494
495
/// result = c * (1.0f / (2^(n-1) - 1))
496
uint32_t n = info.bpc[c];
497
uint32_t pow2 = 1 << (n - 1);
498
float scale = 1.0f / (float)(pow2 - 1);
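// e.g. for an 8-bit SNORM component: n = 8, scale = 1.0f / 127.0f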
499
Value* vScale = VIMMED1(scale);
500
texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
501
texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
502
texels[compIndex] = FMUL(texels[compIndex], vScale);
503
}
504
else
505
{
506
SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
507
508
/// result = c * (1.0f / (2^n - 1))
509
uint32_t n = info.bpc[c];
510
uint32_t pow2 = 1 << n;
511
// special case 24bit unorm format, which requires a full divide to meet ULP
512
// requirement
513
if (n == 24)
514
{
515
float scale = (float)(pow2 - 1);
516
Value* vScale = VIMMED1(scale);
517
texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
518
texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
519
texels[compIndex] = FDIV(texels[compIndex], vScale);
520
}
521
else
522
{
523
float scale = 1.0f / (float)(pow2 - 1);
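// e.g. for an 8-bit UNORM component: scale = 1.0f / 255.0f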
524
Value* vScale = VIMMED1(scale);
525
texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
526
texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
527
texels[compIndex] = FMUL(texels[compIndex], vScale);
528
}
529
}
530
continue;
531
}
532
}
533
}
534
535
//////////////////////////////////////////////////////////////////////////
536
/// @brief Loads attributes from memory using AVX2 GATHER(s)
537
/// @param fetchState - info about attributes to be fetched from memory
538
/// @param streams - value pointer to the current vertex stream
539
/// @param vIndices - vector value of indices to gather
540
/// @param pVtxOut - value pointer to output simdvertex struct
541
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
542
Value* streams,
543
Value* vIndices,
544
Value* pVtxOut)
545
{
546
uint32_t currentVertexElement = 0;
547
uint32_t outputElt = 0;
548
Value* vVertexElements[4];
549
550
Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
551
Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
552
Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
553
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
554
curInstance->setName("curInstance");
555
556
for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
557
{
558
const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
559
560
// skip element if all components are disabled
561
if (ied.ComponentPacking == ComponentEnable::NONE)
562
{
563
continue;
564
}
565
566
const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
567
SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
568
uint32_t bpc =
569
info.bpp /
570
info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
571
572
Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
573
574
Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
575
Value* vStride = VBROADCAST(stride);
576
577
// max vertex index that is fully in bounds
578
Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
579
maxVertex = LOAD(maxVertex);
580
581
Value* minVertex = NULL;
582
if (fetchState.bPartialVertexBuffer)
583
{
584
// min vertex index for low bounds OOB checking
585
minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
586
minVertex = LOAD(minVertex);
587
}
588
589
if (fetchState.bInstanceIDOffsetEnable)
590
{
591
// the InstanceID (curInstance) value is offset by StartInstanceLocation
592
curInstance = ADD(curInstance, startInstance);
593
}
594
595
Value* vCurIndices;
596
Value* startOffset;
597
Value* vInstanceStride = VIMMED1(0);
598
599
if (ied.InstanceEnable)
600
{
601
Value* stepRate = C(ied.InstanceAdvancementState);
602
603
// prevent a div by 0 for 0 step rate
604
Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
605
stepRate = SELECT(isNonZeroStep, stepRate, C(1));
606
607
// calc the current offset into instanced data buffer
608
Value* calcInstance = UDIV(curInstance, stepRate);
609
610
// if step rate is 0, every instance gets instance 0
611
calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
612
613
vCurIndices = VBROADCAST(calcInstance);
614
startOffset = startInstance;
615
}
616
else if (ied.InstanceStrideEnable)
617
{
618
// grab the instance advancement state, which determines the stride in bytes from one
// instance to the next
620
Value* stepRate = C(ied.InstanceAdvancementState);
621
vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
622
623
// offset indices by baseVertex
624
vCurIndices = ADD(vIndices, vBaseVertex);
625
626
startOffset = startVertex;
627
SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
628
}
629
else
630
{
631
// offset indices by baseVertex
632
vCurIndices = ADD(vIndices, vBaseVertex);
633
startOffset = startVertex;
634
}
635
636
// All of the OOB calculations are in vertices, not VB offsets, to prevent having to
637
// do 64bit address offset calculations.
638
639
// calculate byte offset to the start of the VB
640
Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
641
642
// VGATHER* takes an *i8 src pointer so that's what stream is
643
Value* pStreamBaseGFX = ADD(stream, baseOffset);
644
645
// if we have a start offset, subtract from max vertex. Used for OOB check
646
maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
647
Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
648
// if we have a negative value, we're already OOB. clamp at 0.
649
maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
650
651
if (fetchState.bPartialVertexBuffer)
652
{
653
// similarly for min vertex
654
minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
655
Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
656
minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
657
}
658
659
// Load the in bounds size of a partially valid vertex
660
Value* partialInboundsSize =
661
GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
662
partialInboundsSize = LOAD(partialInboundsSize);
663
Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
664
Value* vBpp = VBROADCAST(C(info.bpp));
665
Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
666
667
// is the element <= the partially valid size
668
Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
669
670
// override cur indices with 0 if pitch is 0
671
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
672
vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
673
674
// are vertices partially OOB?
675
Value* vMaxVertex = VBROADCAST(maxVertex);
676
Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
677
678
// are vertices fully in bounds?
679
Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
680
681
Value* vGatherMask;
682
if (fetchState.bPartialVertexBuffer)
683
{
684
// are vertices below minVertex limit?
685
Value* vMinVertex = VBROADCAST(minVertex);
686
Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
687
688
// only fetch lanes that pass both tests
689
vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
690
}
691
else
692
{
693
vGatherMask = vMaxGatherMask;
694
}
695
696
// blend in any partially OOB indices that have valid elements
697
vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
698
699
// calculate the actual offsets into the VB
700
Value* vOffsets = MUL(vCurIndices, vStride);
701
vOffsets = ADD(vOffsets, vAlignmentOffsets);
702
703
// if instance stride enable is:
704
// true - add product of the instanceID and advancement state to the offset into the VB
705
// false - value of vInstanceStride has been initialized to zero
706
vOffsets = ADD(vOffsets, vInstanceStride);
707
708
// Packing and component control
709
ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
710
const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
711
(ComponentControl)ied.ComponentControl1,
712
(ComponentControl)ied.ComponentControl2,
713
(ComponentControl)ied.ComponentControl3};
714
715
// Special gather/conversion for formats without equal component sizes
716
if (IsOddFormat((SWR_FORMAT)ied.Format))
717
{
718
Value* pResults[4];
719
CreateGatherOddFormats(
720
(SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
721
ConvertFormat((SWR_FORMAT)ied.Format, pResults);
722
723
for (uint32_t c = 0; c < 4; c += 1)
724
{
725
if (isComponentEnabled(compMask, c))
726
{
727
vVertexElements[currentVertexElement++] = pResults[c];
728
if (currentVertexElement > 3)
729
{
730
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
731
// reset to the next vVertexElement to output
732
currentVertexElement = 0;
733
}
734
}
735
}
736
}
737
else if (info.type[0] == SWR_TYPE_FLOAT)
738
{
739
///@todo: support 64 bit vb accesses
740
Value* gatherSrc = VIMMED1(0.0f);
741
742
SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
743
"Unsupported format for standard gather fetch.");
744
745
// Gather components from memory to store in a simdvertex structure
746
switch (bpc)
747
{
748
case 16:
749
{
750
Value* vGatherResult[2];
751
752
// if we have at least one component out of x or y to fetch
753
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
754
{
755
vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
756
// e.g. result of first 8x32bit integer gather for 16bit components
757
// 256i - 0 1 2 3 4 5 6 7
758
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
759
//
760
}
761
762
// if we have at least one component out of z or w to fetch
763
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
764
{
765
// offset base to the next components(zw) in the vertex to gather
766
pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
767
768
vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
769
// e.g. result of second 8x32bit integer gather for 16bit components
770
// 256i - 0 1 2 3 4 5 6 7
771
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
772
//
773
}
774
775
// if we have at least one component to shuffle into place
776
if (compMask)
777
{
778
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
779
pVtxOut,
780
Instruction::CastOps::FPExt,
781
CONVERT_NONE,
782
currentVertexElement,
783
outputElt,
784
compMask,
785
compCtrl,
786
vVertexElements);
787
788
// Shuffle gathered components into place in simdvertex struct
789
mVWidth == 16 ? Shuffle16bpcGather16(args)
790
: Shuffle16bpcGather(args); // outputs to vVertexElements ref
791
}
792
}
793
break;
794
case 32:
795
{
796
for (uint32_t i = 0; i < 4; i += 1)
797
{
798
if (isComponentEnabled(compMask, i))
799
{
800
// if we need to gather the component
801
if (compCtrl[i] == StoreSrc)
802
{
803
// Gather a SIMD of vertices
804
// APIs allow a 4GB range for offsets
805
// However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
806
// Add 2GB to the base pointer and 2GB to the offsets. This makes
807
// "negative" (large) offsets into positive offsets and small offsets
808
// into negative offsets.
809
Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
810
vVertexElements[currentVertexElement++] =
811
GATHERPS(gatherSrc,
812
ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
813
vNewOffsets,
814
vGatherMask,
815
1,
816
MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
817
}
818
else
819
{
820
vVertexElements[currentVertexElement++] =
821
GenerateCompCtrlVector(compCtrl[i]);
822
}
823
824
if (currentVertexElement > 3)
825
{
826
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
827
// reset to the next vVertexElement to output
828
currentVertexElement = 0;
829
}
830
}
831
832
// offset base to the next component in the vertex to gather
833
pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
834
}
835
}
836
break;
837
case 64:
838
{
839
for (uint32_t i = 0; i < 4; i += 1)
840
{
841
if (isComponentEnabled(compMask, i))
842
{
843
// if we need to gather the component
844
if (compCtrl[i] == StoreSrc)
845
{
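// 64-bit floats: gather the low and high halves of the SIMD as doubles, then merge and truncate down to 32-bit floats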
846
Value* vShufLo;
847
Value* vShufHi;
848
Value* vShufAll;
849
850
if (mVWidth == 8)
851
{
852
vShufLo = C({0, 1, 2, 3});
853
vShufHi = C({4, 5, 6, 7});
854
vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
855
}
856
else
857
{
858
SWR_ASSERT(mVWidth == 16);
859
vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
860
vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
861
vShufAll =
862
C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
863
}
864
865
Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
866
Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
867
868
Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
869
Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
870
871
Value* vZeroDouble = VECTOR_SPLAT(
872
mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
873
874
Value* pGatherLo =
875
GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
876
Value* pGatherHi =
877
GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
878
879
Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
880
pGather = FP_TRUNC(pGather, mSimdFP32Ty);
881
882
vVertexElements[currentVertexElement++] = pGather;
883
}
884
else
885
{
886
vVertexElements[currentVertexElement++] =
887
GenerateCompCtrlVector(compCtrl[i]);
888
}
889
890
if (currentVertexElement > 3)
891
{
892
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
893
// reset to the next vVertexElement to output
894
currentVertexElement = 0;
895
}
896
}
897
898
// offset base to the next component in the vertex to gather
899
pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
900
}
901
}
902
break;
903
default:
904
SWR_INVALID("Tried to fetch invalid FP format");
905
break;
906
}
907
}
908
else
909
{
910
Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
911
ConversionType conversionType = CONVERT_NONE;
912
913
SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
914
"Unsupported format for standard gather fetch.");
915
916
switch (info.type[0])
917
{
918
case SWR_TYPE_UNORM:
919
conversionType = CONVERT_NORMALIZED;
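// deliberate fall-through: UNORM components are zero extended like UINT before normalization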
920
case SWR_TYPE_UINT:
921
extendCastType = Instruction::CastOps::ZExt;
922
break;
923
case SWR_TYPE_SNORM:
924
conversionType = CONVERT_NORMALIZED;
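// deliberate fall-through: SNORM components are sign extended like SINT before normalization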
925
case SWR_TYPE_SINT:
926
extendCastType = Instruction::CastOps::SExt;
927
break;
928
case SWR_TYPE_USCALED:
929
conversionType = CONVERT_USCALED;
930
extendCastType = Instruction::CastOps::UIToFP;
931
break;
932
case SWR_TYPE_SSCALED:
933
conversionType = CONVERT_SSCALED;
934
extendCastType = Instruction::CastOps::SIToFP;
935
break;
936
case SWR_TYPE_SFIXED:
937
conversionType = CONVERT_SFIXED;
938
extendCastType = Instruction::CastOps::SExt;
939
break;
940
default:
941
break;
942
}
943
944
// value substituted when component of gather is masked
945
Value* gatherSrc = VIMMED1(0);
946
947
// Gather components from memory to store in a simdvertex structure
948
switch (bpc)
949
{
950
case 8:
951
{
952
// if we have at least one component to fetch
953
if (compMask)
954
{
955
Value* vGatherResult = GATHERDD(gatherSrc,
956
pStreamBaseGFX,
957
vOffsets,
958
vGatherMask,
959
1,
960
MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
961
// e.g. result of an 8x32bit integer gather for 8bit components
962
// 256i - 0 1 2 3 4 5 6 7
963
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
964
965
Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
966
pVtxOut,
967
extendCastType,
968
conversionType,
969
currentVertexElement,
970
outputElt,
971
compMask,
972
compCtrl,
973
vVertexElements,
974
info.swizzle);
975
976
// Shuffle gathered components into place in simdvertex struct
977
mVWidth == 16 ? Shuffle8bpcGatherd16(args)
978
: Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
979
}
980
}
981
break;
982
case 16:
983
{
984
Value* vGatherResult[2];
985
986
// if we have at least one component out of x or y to fetch
987
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
988
{
989
vGatherResult[0] = GATHERDD(gatherSrc,
990
pStreamBaseGFX,
991
vOffsets,
992
vGatherMask,
993
1,
994
MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
995
// e.g. result of first 8x32bit integer gather for 16bit components
996
// 256i - 0 1 2 3 4 5 6 7
997
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
998
//
999
}
1000
1001
// if we have at least one component out of z or w to fetch
1002
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1003
{
1004
// offset base to the next components(zw) in the vertex to gather
1005
pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1006
1007
vGatherResult[1] = GATHERDD(gatherSrc,
1008
pStreamBaseGFX,
1009
vOffsets,
1010
vGatherMask,
1011
1,
1012
MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1013
// e.g. result of second 8x32bit integer gather for 16bit components
1014
// 256i - 0 1 2 3 4 5 6 7
1015
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016
//
1017
}
1018
1019
// if we have at least one component to shuffle into place
1020
if (compMask)
1021
{
1022
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
1023
pVtxOut,
1024
extendCastType,
1025
conversionType,
1026
currentVertexElement,
1027
outputElt,
1028
compMask,
1029
compCtrl,
1030
vVertexElements);
1031
1032
// Shuffle gathered components into place in simdvertex struct
1033
mVWidth == 16 ? Shuffle16bpcGather16(args)
1034
: Shuffle16bpcGather(args); // outputs to vVertexElements ref
1035
}
1036
}
1037
break;
1038
case 32:
1039
{
1040
// Gathered components into place in simdvertex struct
1041
for (uint32_t i = 0; i < 4; i++)
1042
{
1043
if (isComponentEnabled(compMask, i))
1044
{
1045
// if we need to gather the component
1046
if (compCtrl[i] == StoreSrc)
1047
{
1048
Value* pGather = GATHERDD(gatherSrc,
1049
pStreamBaseGFX,
1050
vOffsets,
1051
vGatherMask,
1052
1,
1053
MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1054
1055
if (conversionType == CONVERT_USCALED)
1056
{
1057
pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1058
}
1059
else if (conversionType == CONVERT_SSCALED)
1060
{
1061
pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1062
}
1063
else if (conversionType == CONVERT_SFIXED)
1064
{
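// SFIXED is 16.16 fixed point; scaling by 1/65536 recovers the float value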
1065
pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1066
VBROADCAST(C(1 / 65536.0f)));
1067
}
1068
1069
vVertexElements[currentVertexElement++] = pGather;
1070
1071
// e.g. result of a single 8x32bit integer gather for 32bit components
1072
// 256i - 0 1 2 3 4 5 6 7
1073
// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1074
}
1075
else
1076
{
1077
vVertexElements[currentVertexElement++] =
1078
GenerateCompCtrlVector(compCtrl[i]);
1079
}
1080
1081
if (currentVertexElement > 3)
1082
{
1083
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1084
1085
// reset to the next vVertexElement to output
1086
currentVertexElement = 0;
1087
}
1088
}
1089
1090
// offset base to the next component in the vertex to gather
1091
pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1092
}
1093
}
1094
break;
1095
}
1096
}
1097
}
1098
1099
// if we have a partially filled vVertexElement struct, output it
1100
if (currentVertexElement > 0)
1101
{
1102
StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1103
}
1104
}
1105
1106
1107
typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
1108
1109
template <typename T>
1110
void GetSimdValidIndicesGfx(gfxptr_t indices,
1111
gfxptr_t lastIndex,
1112
uint32_t vWidth,
1113
PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1114
void* pdc,
1115
uint32_t* outIndices,
1116
void* pWorkerData)
1117
{
1118
SWR_ASSERT(outIndices != nullptr);
1119
1120
gfxptr_t indexPtr = indices;
1121
for (int64_t lane = 0; lane < vWidth; lane++)
1122
{
1123
uint32_t index = 0;
1124
1125
if (indexPtr < lastIndex)
1126
{
1127
// translate indexPtr and load from it
1128
T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
1129
SWR_ASSERT(addr != nullptr);
1130
index = *addr;
1131
}
1132
1133
// store the 32-bit index into the corresponding SIMD lane of the output
1134
outIndices[lane] = index;
1135
1136
indexPtr += sizeof(T);
1137
}
1138
}
1139
1140
void GetSimdValid8bitIndicesGfx(gfxptr_t indices,
1141
gfxptr_t lastIndex,
1142
uint32_t vWidth,
1143
PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1144
void* pdc,
1145
uint32_t* outIndices,
1146
void* pWorkerData)
1147
{
1148
GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1149
}
1150
1151
void GetSimdValid16bitIndicesGfx(gfxptr_t indices,
1152
gfxptr_t lastIndex,
1153
uint32_t vWidth,
1154
PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1155
void* pdc,
1156
uint32_t* outIndices,
1157
void* pWorkerData)
1158
{
1159
GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1160
}
1161
1162
1163
template <typename T>
1164
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1165
{
1166
SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1167
"Function expects gfxptr_t for both input parameters.");
1168
1169
Type* Ty = nullptr;
1170
1171
static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1172
"Unsupported type for use with GetSimdValidIndicesHelper<T>");
1173
constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1174
if (bSize)
1175
{
1176
Ty = mInt16PtrTy;
1177
}
1178
else if (sizeof(T) == sizeof(uint8_t))
1179
{
1180
Ty = mInt8PtrTy;
1181
}
1182
else
1183
{
1184
SWR_ASSERT(false, "This should never happen as per static_assert above.");
1185
}
1186
1187
Value* vIndices = VUNDEF_I();
1188
1189
{
1190
// store 0 index on stack to be used to conditionally load from if index address is OOB
1191
Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1192
STORE(C((T)0), pZeroIndex);
1193
1194
// Load a SIMD of index pointers
1195
for (int64_t lane = 0; lane < mVWidth; lane++)
1196
{
1197
// Calculate the address of the requested index
1198
Value* pIndex = GEP(pIndices, C(lane), Ty);
1199
1200
pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1201
1202
// check if the address is less than the max index,
1203
Value* mask = ICMP_ULT(pIndex, pLastIndex);
1204
1205
// if valid, load the index. if not, load 0 from the stack
1206
Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1207
Value* index = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1208
1209
// zero extended index to 32 bits and insert into the correct simd lane
1210
index = Z_EXT(index, mInt32Ty);
1211
vIndices = VINSERT(vIndices, index, lane);
1212
}
1213
}
1214
1215
return vIndices;
1216
}
1217
1218
//////////////////////////////////////////////////////////////////////////
1219
/// @brief Loads a simd of valid indices. OOB indices are set to 0
1220
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
1221
/// support
1222
/// @param pIndices - pointer to 8 bit indices
1223
/// @param pLastIndex - pointer to last valid index
1224
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1225
{
1226
return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1227
}
1228
1229
//////////////////////////////////////////////////////////////////////////
1230
/// @brief Loads a simd of valid indices. OOB indices are set to 0
1231
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
1232
/// support
1233
/// @param pIndices - pointer to 16 bit indices
1234
/// @param pLastIndex - pointer to last valid index
1235
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1236
{
1237
return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1238
}
1239
1240
//////////////////////////////////////////////////////////////////////////
1241
/// @brief Loads a simd of valid indices. OOB indices are set to 0
1242
/// @param pIndices - pointer to 32 bit indices
1243
/// @param pLastIndex - pointer to last valid index
1244
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1245
{
1246
DataLayout dL(JM()->mpCurrentModule);
1247
Value* iLastIndex = pLastIndex;
1248
Value* iIndices = pIndices;
1249
1250
// get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1251
Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1252
numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1253
numIndicesLeft = SDIV(numIndicesLeft, C(4));
1254
1255
// create a vector of index counts from the base index ptr passed into the fetch
1256
Constant* vIndexOffsets;
1257
if (mVWidth == 8)
1258
{
1259
vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1260
}
1261
else
1262
{
1263
vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1264
}
1265
1266
// compare index count to the max valid index
1267
// e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1268
// vIndexOffsets 0 1 2 3 4 5 6 7
1269
// ------------------------------
1270
// vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1271
// vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1272
Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1273
Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1274
1275
// Load the indices; OOB loads 0
1276
return MASKED_LOAD(pIndices,
1277
4,
1278
vIndexMask,
1279
VIMMED1(0),
1280
"vIndices",
1281
PointerType::get(mSimdInt32Ty, 0),
1282
MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1283
}
1284
1285
//////////////////////////////////////////////////////////////////////////
1286
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1287
/// denormalizes if needed, converts to F32 if needed, and positions in
1288
/// the proper SIMD rows to be output to the simdvertex structure
1289
/// @param args: (tuple of args, listed below)
1290
/// @param vGatherResult - 8 gathered 8bpc vertices
1291
/// @param pVtxOut - base pointer to output simdvertex struct
1292
/// @param extendType - sign extend or zero extend
1293
/// @param bNormalized - do we need to denormalize?
1294
/// @param currentVertexElement - reference to the current vVertexElement
1295
/// @param outputElt - reference to the current offset from simdvertex we're outputting to
1296
/// @param compMask - component packing mask
1297
/// @param compCtrl - component control val
1298
/// @param vVertexElements[4] - vertex components to output
1299
/// @param swizzle[4] - component swizzle location
1300
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1301
{
1302
// Unpack tuple args
1303
Value*& vGatherResult = std::get<0>(args);
1304
Value* pVtxOut = std::get<1>(args);
1305
const Instruction::CastOps extendType = std::get<2>(args);
1306
const ConversionType conversionType = std::get<3>(args);
1307
uint32_t& currentVertexElement = std::get<4>(args);
1308
uint32_t& outputElt = std::get<5>(args);
1309
const ComponentEnable compMask = std::get<6>(args);
1310
const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1311
Value*(&vVertexElements)[4] = std::get<8>(args);
1312
const uint32_t(&swizzle)[4] = std::get<9>(args);
1313
1314
// cast types
1315
Type* vGatherTy = getVectorType(mInt32Ty, 8);
1316
Type* v32x8Ty = getVectorType(mInt8Ty, 32);
1317
1318
// have to do extra work for sign extending
1319
if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1320
{
1321
Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
1322
Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1323
1324
// shuffle mask, including any swizzling
1325
const char x = (char)swizzle[0];
1326
const char y = (char)swizzle[1];
1327
const char z = (char)swizzle[2];
1328
const char w = (char)swizzle[3];
1329
Value* vConstMask = C<char>(
1330
{char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),
1331
char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),
1332
char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),
1333
char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),
1334
char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),
1335
char(w + 8), char(w + 12)});
1336
1337
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
1338
1339
Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1340
Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1341
1342
Value* vShufResult_lo =
1343
BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1344
Value* vShufResult_hi =
1345
BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1346
1347
// after pshufb: group components together in each 128bit lane
1348
// 256i - 0 1 2 3 4 5 6 7
1349
// xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1350
1351
Value* vi128XY_lo = nullptr;
1352
Value* vi128XY_hi = nullptr;
1353
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1354
{
1355
vi128XY_lo = BITCAST(
1356
VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1357
v128Ty);
1358
vi128XY_hi = BITCAST(
1359
VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1360
v128Ty);
1361
1362
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1363
// 256i - 0 1 2 3 4 5 6 7
1364
// xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1365
}
1366
1367
// do the same for zw components
1368
Value* vi128ZW_lo = nullptr;
1369
Value* vi128ZW_hi = nullptr;
1370
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1371
{
1372
vi128ZW_lo = BITCAST(
1373
VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1374
v128Ty);
1375
vi128ZW_hi = BITCAST(
1376
VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1377
v128Ty);
1378
}
1379
1380
// init denormalize variables if needed
1381
Instruction::CastOps fpCast;
1382
Value* conversionFactor;
1383
1384
switch (conversionType)
1385
{
1386
case CONVERT_NORMALIZED:
1387
fpCast = Instruction::CastOps::SIToFP;
1388
conversionFactor = VIMMED1((float)(1.0 / 127.0));
1389
break;
1390
case CONVERT_SSCALED:
1391
fpCast = Instruction::CastOps::SIToFP;
1392
conversionFactor = VIMMED1((float)(1.0));
1393
break;
1394
case CONVERT_USCALED:
1395
assert(false && "Type should not be sign extended!");
1396
conversionFactor = nullptr;
1397
break;
1398
default:
1399
assert(conversionType == CONVERT_NONE);
1400
conversionFactor = nullptr;
1401
break;
1402
}
1403
1404
// sign extend all enabled components. If we have a full vVertexElements, output to the
// current simdvertex
1406
for (uint32_t i = 0; i < 4; i++)
1407
{
1408
if (isComponentEnabled(compMask, i))
1409
{
1410
if (compCtrl[i] == ComponentControl::StoreSrc)
1411
{
1412
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1413
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1414
// if x or y, use vi128XY permute result, else use vi128ZW
1415
Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1416
Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1417
1418
// sign extend
1419
Value* temp_lo =
1420
PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1421
Value* temp_hi =
1422
PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1423
1424
Value* temp = JOIN_16(temp_lo, temp_hi);
1425
1426
// denormalize if needed
1427
if (conversionType != CONVERT_NONE)
1428
{
1429
temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1430
}
1431
1432
vVertexElements[currentVertexElement] = temp;
1433
1434
currentVertexElement += 1;
1435
}
1436
else
1437
{
1438
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1439
}
1440
1441
if (currentVertexElement > 3)
1442
{
1443
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1444
// reset to the next vVertexElement to output
1445
currentVertexElement = 0;
1446
}
1447
}
1448
}
1449
}
1450
// else zero extend
1451
else if ((extendType == Instruction::CastOps::ZExt) ||
1452
(extendType == Instruction::CastOps::UIToFP))
1453
{
1454
// init denormalize variables if needed
1455
Instruction::CastOps fpCast;
1456
Value* conversionFactor;
1457
1458
switch (conversionType)
1459
{
1460
case CONVERT_NORMALIZED:
1461
fpCast = Instruction::CastOps::UIToFP;
1462
conversionFactor = VIMMED1((float)(1.0 / 255.0));
1463
break;
1464
case CONVERT_USCALED:
1465
fpCast = Instruction::CastOps::UIToFP;
1466
conversionFactor = VIMMED1((float)(1.0));
1467
break;
1468
case CONVERT_SSCALED:
1469
assert(false && "Type should not be zero extended!");
1470
conversionFactor = nullptr;
1471
break;
1472
default:
1473
assert(conversionType == CONVERT_NONE);
1474
conversionFactor = nullptr;
1475
break;
1476
}
1477
1478
// shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1479
for (uint32_t i = 0; i < 4; i++)
1480
{
1481
if (isComponentEnabled(compMask, i))
1482
{
1483
if (compCtrl[i] == ComponentControl::StoreSrc)
1484
{
1485
// pshufb masks for each component
1486
Value* vConstMask;
1487
switch (swizzle[i])
1488
{
1489
case 0:
1490
// x shuffle mask
1491
vConstMask =
1492
C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1493
0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1494
break;
1495
case 1:
1496
// y shuffle mask
1497
vConstMask =
1498
C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1499
1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1500
break;
1501
case 2:
1502
// z shuffle mask
1503
vConstMask =
1504
C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1505
2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1506
break;
1507
case 3:
1508
// w shuffle mask
1509
vConstMask =
1510
C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1511
3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1512
break;
1513
default:
1514
assert(false && "Invalid component");
1515
vConstMask = nullptr;
1516
break;
1517
}
1518
1519
Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1520
Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1521
1522
Value* temp_lo =
1523
BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1524
Value* temp_hi =
1525
BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1526
1527
// after pshufb for x channel
1528
// 256i - 0 1 2 3 4 5 6 7
1529
// x000 x000 x000 x000 x000 x000 x000 x000
1530
1531
Value* temp = JOIN_16(temp_lo, temp_hi);
1532
1533
// denormalize if needed
1534
if (conversionType != CONVERT_NONE)
1535
{
1536
temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1537
}
1538
1539
vVertexElements[currentVertexElement] = temp;
1540
1541
currentVertexElement += 1;
1542
}
1543
else
1544
{
1545
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1546
}
1547
1548
if (currentVertexElement > 3)
1549
{
1550
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1551
// reset to the next vVertexElement to output
1552
currentVertexElement = 0;
1553
}
1554
}
1555
}
1556
}
1557
else
1558
{
1559
SWR_INVALID("Unsupported conversion type");
1560
}
1561
}
1562
1563
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1564
{
1565
// Unpack tuple args
1566
Value*& vGatherResult = std::get<0>(args);
1567
Value* pVtxOut = std::get<1>(args);
1568
const Instruction::CastOps extendType = std::get<2>(args);
1569
const ConversionType conversionType = std::get<3>(args);
1570
uint32_t& currentVertexElement = std::get<4>(args);
1571
uint32_t& outputElt = std::get<5>(args);
1572
const ComponentEnable compMask = std::get<6>(args);
1573
const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1574
Value*(&vVertexElements)[4] = std::get<8>(args);
1575
const uint32_t(&swizzle)[4] = std::get<9>(args);
1576
1577
// cast types
1578
Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1579
1580
for (uint32_t i = 0; i < 4; i++)
1581
{
1582
if (!isComponentEnabled(compMask, i))
1583
continue;
1584
1585
if (compCtrl[i] == ComponentControl::StoreSrc)
1586
{
1587
#if LLVM_VERSION_MAJOR >= 11
1588
using MaskType = int32_t;
1589
#else
1590
using MaskType = uint32_t;
1591
#endif
1592
std::vector<MaskType> vShuffleMasks[4] = {
1593
{0, 4, 8, 12, 16, 20, 24, 28}, // x
1594
{1, 5, 9, 13, 17, 21, 25, 29}, // y
1595
{2, 6, 10, 14, 18, 22, 26, 30}, // z
1596
{3, 7, 11, 15, 19, 23, 27, 31}, // w
1597
};
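// pick this component's byte out of each gathered 32-bit element, one 8-bit value per SIMD lane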
1598
1599
Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1600
UndefValue::get(v32x8Ty),
1601
vShuffleMasks[swizzle[i]]);
1602
1603
if ((extendType == Instruction::CastOps::SExt) ||
1604
(extendType == Instruction::CastOps::SIToFP))
1605
{
1606
switch (conversionType)
1607
{
1608
case CONVERT_NORMALIZED:
1609
val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1610
break;
1611
case CONVERT_SSCALED:
1612
val = SI_TO_FP(val, mSimdFP32Ty);
1613
break;
1614
case CONVERT_USCALED:
1615
SWR_INVALID("Type should not be sign extended!");
1616
break;
1617
default:
1618
SWR_ASSERT(conversionType == CONVERT_NONE);
1619
val = S_EXT(val, mSimdInt32Ty);
1620
break;
1621
}
1622
}
1623
else if ((extendType == Instruction::CastOps::ZExt) ||
1624
(extendType == Instruction::CastOps::UIToFP))
1625
{
1626
switch (conversionType)
1627
{
1628
case CONVERT_NORMALIZED:
1629
val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1630
break;
1631
case CONVERT_SSCALED:
1632
SWR_INVALID("Type should not be zero extended!");
1633
break;
1634
case CONVERT_USCALED:
1635
val = UI_TO_FP(val, mSimdFP32Ty);
1636
break;
1637
default:
1638
SWR_ASSERT(conversionType == CONVERT_NONE);
1639
val = Z_EXT(val, mSimdInt32Ty);
1640
break;
1641
}
1642
}
1643
else
1644
{
1645
SWR_INVALID("Unsupported conversion type");
1646
}
1647
1648
vVertexElements[currentVertexElement++] = val;
1649
}
1650
else
1651
{
1652
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1653
}
1654
1655
if (currentVertexElement > 3)
1656
{
1657
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1658
// reset to the next vVertexElement to output
1659
currentVertexElement = 0;
1660
}
1661
}
1662
}
1663
1664
//////////////////////////////////////////////////////////////////////////
1665
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1666
/// denormalizes if needed, converts to F32 if needed, and positions in
1667
/// the proper SIMD rows to be output to the simdvertex structure
1668
/// @param args: (tuple of args, listed below)
1669
/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1670
/// @param pVtxOut - base pointer to output simdvertex struct
1671
/// @param extendType - sign extend or zero extend
1672
/// @param bNormalized - do we need to denormalize?
1673
/// @param currentVertexElement - reference to the current vVertexElement
1674
/// @param outputElt - reference to the current offset from simdvertex we're outputting to
1675
/// @param compMask - component packing mask
1676
/// @param compCtrl - component control val
1677
/// @param vVertexElements[4] - vertex components to output
1678
void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1679
{
1680
// Unpack tuple args
1681
Value*(&vGatherResult)[2] = std::get<0>(args);
1682
Value* pVtxOut = std::get<1>(args);
1683
const Instruction::CastOps extendType = std::get<2>(args);
1684
const ConversionType conversionType = std::get<3>(args);
1685
uint32_t& currentVertexElement = std::get<4>(args);
1686
uint32_t& outputElt = std::get<5>(args);
1687
const ComponentEnable compMask = std::get<6>(args);
1688
const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1689
Value*(&vVertexElements)[4] = std::get<8>(args);
1690
1691
// cast types
1692
Type* vGatherTy = getVectorType(mInt32Ty, 8);
1693
Type* v32x8Ty = getVectorType(mInt8Ty, 32);
1694
1695
// have to do extra work for sign extending
1696
if ((extendType == Instruction::CastOps::SExt) ||
1697
(extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1698
{
1699
// is this a half-precision (FP16) float?
1700
bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1701
1702
Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
1703
Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1704
1705
// shuffle mask
1706
Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1707
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1708
Value* vi128XY_lo = nullptr;
1709
Value* vi128XY_hi = nullptr;
1710
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1711
{
1712
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1713
// now..
1714
1715
Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1716
Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1717
1718
Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1719
Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1720
1721
// after pshufb: group components together in each 128bit lane
1722
// 256i - 0 1 2 3 4 5 6 7
1723
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1724
1725
vi128XY_lo = BITCAST(
1726
VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1727
v128bitTy);
1728
vi128XY_hi = BITCAST(
1729
VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1730
v128bitTy);
1731
1732
// after PERMD: move and pack xy components into each 128bit lane
1733
// 256i - 0 1 2 3 4 5 6 7
1734
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1735
}
1736
1737
// do the same for zw components
1738
Value* vi128ZW_lo = nullptr;
1739
Value* vi128ZW_hi = nullptr;
1740
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1741
{
1742
Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1743
Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1744
1745
Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1746
Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1747
1748
vi128ZW_lo = BITCAST(
1749
VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1750
v128bitTy);
1751
vi128ZW_hi = BITCAST(
1752
VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1753
v128bitTy);
1754
}
1755
1756
// init denormalize variables if needed
1757
Instruction::CastOps IntToFpCast;
1758
Value* conversionFactor;
1759
1760
switch (conversionType)
1761
{
1762
case CONVERT_NORMALIZED:
1763
IntToFpCast = Instruction::CastOps::SIToFP;
1764
conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1765
break;
1766
case CONVERT_SSCALED:
1767
IntToFpCast = Instruction::CastOps::SIToFP;
1768
conversionFactor = VIMMED1((float)(1.0));
1769
break;
1770
case CONVERT_USCALED:
1771
assert(false && "Type should not be sign extended!");
1772
conversionFactor = nullptr;
1773
break;
1774
default:
1775
assert(conversionType == CONVERT_NONE);
1776
conversionFactor = nullptr;
1777
break;
1778
}
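        // For reference, with CONVERT_NORMALIZED this sign-extended 16bpc path scales by
        // 1/32767, the SIMD equivalent of the scalar sketch below (illustrative only;
        // Snorm16ToFloat is a hypothetical helper, not code emitted by the jitter):
        //
        //     float Snorm16ToFloat(int16_t v) { return (float)v * (1.0f / 32767.0f); }
        //     // e.g. 32767 -> 1.0f, -32767 -> -1.0f (-32768 lands just below -1.0f)
        //
        // CONVERT_SSCALED converts to float with a factor of 1.0 (no rescaling). When
        // extendType is FPExt (bFP below), the gathered values are half floats and CVTPH2PS
        // widens them directly (e.g. 0x3C00 -> 1.0f), so no conversionFactor is applied.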

        // sign extend all enabled components. If we have a full vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    if (bFP)
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        Value* temp = JOIN_16(temp_lo, temp_hi);

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
                        }

                        vVertexElements[currentVertexElement] = temp;
                    }

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else use the second
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
                    // for now..

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);
                    Value* temp_hi = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0    1    2    3    4    5    6    7
                    //               xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
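// Implementation note (informational): in the zero-extend path above, the -1 bytes in the
// pshufb masks have their high bit set, which PSHUFB turns into zero bytes, so each 16-bit
// component lands zero-extended in the low word of its 32-bit lane. With CONVERT_NORMALIZED
// the result is then scaled by 1/65535, the SIMD equivalent of the scalar sketch below
// (illustrative only; Unorm16ToFloat is a hypothetical helper):
//
//     float Unorm16ToFloat(uint16_t v) { return (float)v * (1.0f / 65535.0f); }
//     // e.g. 0 -> 0.0f, 65535 -> 1.0f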

void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value*(&vGatherResult)[2] = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value*(&vVertexElements)[4] = std::get<8>(args);

    // cast types
    Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float?
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                        mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }
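        // For reference, VPERMD with the index vector {0, 1, 4, 5, 2, 3, 6, 7} moves whole
        // 32-bit dwords, which is what packs the x (or z) group into the low 128 bits and
        // the y (or w) group into the high 128 bits, matching the diagrams above.
        // Illustrative scalar model (not jitter code):
        //
        //     // out[i] = in[idx[i]], 8 dwords per 256-bit register
        //     // in  = X0 X1 Y0 Y1 | X2 X3 Y2 Y3
        //     // out = X0 X1 X2 X3 | Y0 Y1 Y2 Y3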

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] =
                                FMUL(CAST(IntToFpCast,
                                          vVertexElements[currentVertexElement],
                                          mSimdFP32Ty),
                                     conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else use the second
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] =
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
                                       vConstMask[selectedMask]),
                                vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0    1    2    3    4    5    6    7
                    //               xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] =
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
                                 conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
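// Usage note (informational): both 16bpc shuffle helpers accumulate converted components
// into vVertexElements and flush once four are ready, roughly:
//
//     // vVertexElements[currentVertexElement++] = converted component or comp-ctrl fill;
//     // if (currentVertexElement > 3) { StoreVertexElements(...); currentVertexElement = 0; }
//
// Because currentVertexElement and outputElt are taken by reference, the partial-fill state
// carries across calls for consecutive vertex elements.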

//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
void FetchJit::StoreVertexElements(Value* pVtxOut,
                                   const uint32_t outputElt,
                                   const uint32_t numEltsToStore,
                                   Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
        STORE(vVertexElements[c], dest);
    }
}
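// Addressing note (informational, assuming pVtxOut points to rows of <vWidth x float>): the
// GEP above indexes whole SIMD rows, so component c of simdvertex outputElt is written at
//
//     rowIndex = outputElt * 4 + c; // 4 rows (x, y, z, w) per simdvertex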

//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore:
        return VUNDEF_I();
    case Store0:
        return VIMMED1(0);
    case Store1Fp:
        return VIMMED1(1.0f);
    case Store1Int:
        return VIMMED1(1);
    case StoreVertexId:
    {
        if (mVWidth == 16)
        {
            Type* pSimd8FPTy = getVectorType(mFP32Ty, 8);
            Value* pIdLo =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
            Value* pIdHi =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
            return JOIN_16(pIdLo, pIdHi);
        }
        else
        {
            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
        }
    }
    case StoreInstanceId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
        return VBROADCAST(pId);
    }

    case StoreSrc:
    default:
        SWR_INVALID("Invalid component control");
        return VUNDEF_I();
    }
}
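// Usage sketch (hypothetical values, illustrative only): a compCtrl of
// { StoreSrc, StoreSrc, Store0, Store1Fp } makes the shuffle helpers store the fetched x/y
// components and fill z/w with the constants generated here:
//
//     vVertexElements[2] = GenerateCompCtrlVector(Store0);   // splat of integer 0
//     vVertexElements[3] = GenerateCompCtrlVector(Store1Fp); // splat of 1.0f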

//////////////////////////////////////////////////////////////////////////
/// @brief Returns true if the specified component is enabled in the mask.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0:
        return (enableMask & ComponentEnable::X);
    // Y
    case 1:
        return (enableMask & ComponentEnable::Y);
    // Z
    case 2:
        return (enableMask & ComponentEnable::Z);
    // W
    case 3:
        return (enableMask & ComponentEnable::W);

    default:
        return false;
    }
}
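// Usage sketch (illustrative only):
//
//     ComponentEnable mask = (ComponentEnable)(ComponentEnable::X | ComponentEnable::Z);
//     isComponentEnabled(mask, 0); // true  (X)
//     isComponentEnabled(mask, 1); // false (Y)
//     isComponentEnabled(mask, 3); // false (W)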

// Don't allow two threads to compile the same fetch shader simultaneously;
// it causes problems in the JIT cache implementation.
// This is only a problem for fetch right now.
static std::mutex gFetchCodegenMutex;

//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
/// @param hJitMgr - JitManager handle
/// @param func - LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func = (const llvm::Function*)hFunc;
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC pfnFetch;

    gFetchCodegenMutex.lock();
    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot
    // add new IR to the module.
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char fName[1024];
    const char* funcName = func->getName().data();
    sprintf(fName, "%s.bin", funcName);
    FILE* fd = fopen(fName, "wb");
    fwrite((void*)pfnFetch, 1, 2048, fd);
    fclose(fd);
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
    gFetchCodegenMutex.unlock();

    return pfnFetch;
}
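// Locking sketch (illustrative alternative only, not the code above): the same critical
// section could be expressed with RAII so the mutex is released on every path:
//
//     std::lock_guard<std::mutex> lock(gFetchCodegenMutex);
//     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
//     ...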

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}
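// Usage sketch (illustrative only; fetchState contents depend on the caller's vertex layout):
//
//     FETCH_COMPILE_STATE fetchState = {};
//     // ... populate fetchState ...
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//     // pfnFetch is the JIT-compiled fetch shader entry point; its signature is given by
//     // the PFN_FETCH_FUNC typedef in the fetch JIT API headers.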