CoCalc -- fetch

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
⁴⁵⁷⁴ views
1
/****************************************************************************
2
 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 *
23
 * @file fetch_jit.cpp
24
 *
25
 * @brief Implementation of the fetch jitter
26
 *
27
 * Notes:
28
 *
29
 ******************************************************************************/
30
#include "jit_pch.hpp"
31
#include "builder_gfx_mem.h"
32
#include "jit_api.h"
33
#include "fetch_jit.h"
34
#include "gen_state_llvm.h"
35
#include "functionpasses/passes.h"
36

37
//#define FETCH_DUMP_VERTEX 1
38
using namespace llvm;
39
using namespace SwrJit;
40

41
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42

43
enum ConversionType
44
{
45
    CONVERT_NONE,
46
    CONVERT_NORMALIZED,
47
    CONVERT_USCALED,
48
    CONVERT_SSCALED,
49
    CONVERT_SFIXED,
50
};
51

52
//////////////////////////////////////////////////////////////////////////
53
/// Interface to Jitting a fetch shader
54
//////////////////////////////////////////////////////////////////////////
55
struct FetchJit : public BuilderGfxMem
56
{
57
    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {}
58

59
    Function* Create(const FETCH_COMPILE_STATE& fetchState);
60

61
    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64
    template <typename T>
65
    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66

67
    // package up Shuffle*bpcGatherd args into a tuple for convenience
68
    typedef std::tuple<Value*&,
69
                       Value*,
70
                       const Instruction::CastOps,
71
                       const ConversionType,
72
                       uint32_t&,
73
                       uint32_t&,
74
                       const ComponentEnable,
75
                       const ComponentControl (&)[4],
76
                       Value* (&)[4],
77
                       const uint32_t (&)[4]>
78
        Shuffle8bpcArgs;
79

80
    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82

83
    typedef std::tuple<Value* (&)[2],
84
                       Value*,
85
                       const Instruction::CastOps,
86
                       const ConversionType,
87
                       uint32_t&,
88
                       uint32_t&,
89
                       const ComponentEnable,
90
                       const ComponentControl (&)[4],
91
                       Value* (&)[4]>
92
        Shuffle16bpcArgs;
93

94
    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96

97
    void StoreVertexElements(Value*         pVtxOut,
98
                             const uint32_t outputElt,
99
                             const uint32_t numEltsToStore,
100
                             Value* (&vVertexElements)[4]);
101

102
    Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103

104
    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105
                           Value*                     streams,
106
                           Value*                     vIndices,
107
                           Value*                     pVtxOut);
108

109
    bool IsOddFormat(SWR_FORMAT format);
110
    bool IsUniformFormat(SWR_FORMAT format);
111
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112
    void CreateGatherOddFormats(
113
        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115

116
    Value* mpFetchInfo;
117
};
118

119
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
120
{
121
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
122
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
123

124
    Function* fetch = Function::Create(
125
        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
126
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
127

128
    fetch->getParent()->setModuleIdentifier(fetch->getName());
129

130
    IRB()->SetInsertPoint(entry);
131

132
    auto argitr = fetch->arg_begin();
133

134
    // Fetch shader arguments
135
    Value* privateContext = &*argitr;
136
    ++argitr;
137
    privateContext->setName("privateContext");
138
    SetPrivateContext(privateContext);
139

140
    mpWorkerData = &*argitr;
141
    ++argitr;
142
    mpWorkerData->setName("pWorkerData");
143

144
    mpFetchInfo = &*argitr;
145
    ++argitr;
146
    mpFetchInfo->setName("fetchInfo");
147
    Value* pVtxOut = &*argitr;
148
    pVtxOut->setName("vtxOutput");
149

150
    uint32_t baseWidth = mVWidth;
151

152
    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153

154
    // Override builder target width to force 16-wide SIMD
155
#if USE_SIMD16_SHADERS
156
    SetTargetWidth(16);
157
#endif
158

159
    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160

161
    // SWR_FETCH_CONTEXT::pStreams
162
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163
    streams->setName("pStreams");
164

165
    // SWR_FETCH_CONTEXT::pIndices
166
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167
    indices->setName("pIndices");
168

169
    // SWR_FETCH_CONTEXT::pLastIndex
170
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171
    pLastIndex->setName("pLastIndex");
172

173
    Value* vIndices;
174
    switch (fetchState.indexType)
175
    {
176
    case R8_UINT:
177
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178
        if (fetchState.bDisableIndexOOBCheck)
179
        {
180
            vIndices = LOAD(
181
                BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)),
182
                {(uint32_t)0});
183
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184
        }
185
        else
186
        {
187
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188
        }
189
        break;
190
    case R16_UINT:
191
        if (fetchState.bDisableIndexOOBCheck)
192
        {
193
            vIndices = LOAD(
194
                BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)),
195
                {(uint32_t)0});
196
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197
        }
198
        else
199
        {
200
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201
        }
202
        break;
203
    case R32_UINT:
204
        (fetchState.bDisableIndexOOBCheck)
205
            ? vIndices = LOAD(indices,
206
                              "",
207
                              PointerType::get(mSimdInt32Ty, 0),
208
                              MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
209
            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
210
        break; // incoming type is already 32bit int
211
    default:
212
        vIndices = nullptr;
213
        assert(false && "Unsupported index type");
214
        break;
215
    }
216

217
    if (fetchState.bForceSequentialAccessEnable)
218
    {
219
        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
220
                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
221

222
        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
223
        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
224
        vIndices = ADD(vIndices, pOffsets);
225
    }
226

227
    Value* vVertexId = vIndices;
228
    if (fetchState.bVertexIDOffsetEnable)
229
    {
230
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
231
        // correct
232
        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
233
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
234
        vVertexId           = ADD(vIndices, vBaseVertex);
235
        vVertexId           = ADD(vVertexId, vStartVertex);
236
    }
237

238
    // store out vertex IDs
239
    if (mVWidth == 16)
240
    {
241
        // store out in simd8 halves until core supports 16-wide natively
242
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
243
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
244
        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
245
        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
246
    }
247
    else if (mVWidth == 8)
248
    {
249
        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
250
    }
251

252
    // store out cut mask if enabled
253
    if (fetchState.bEnableCutIndex)
254
    {
255
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
256
        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));
257

258
        if (mVWidth == 16)
259
        {
260
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
261
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
262
            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
263
            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
264
        }
265
        else if (mVWidth == 8)
266
        {
267
            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
268
        }
269
    }
270

271
    // Fetch attributes from memory and output to a simdvertex struct
272
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
273

274
    RET_VOID();
275

276
    JitManager::DumpToFile(fetch, "src");
277

278
#if defined(_DEBUG)
279
    verifyFunction(*fetch);
280
#endif
281

282
    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
283

284
    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
285
    setupPasses.add(createBreakCriticalEdgesPass());
286
    setupPasses.add(createCFGSimplificationPass());
287
    setupPasses.add(createEarlyCSEPass());
288
    setupPasses.add(createPromoteMemoryToRegisterPass());
289

290
    setupPasses.run(*fetch);
291

292
    JitManager::DumpToFile(fetch, "se");
293

294
    ::FunctionPassManager optPasses(JM()->mpCurrentModule);
295

296
    ///@todo Haven't touched these either. Need to remove some of these and add others.
297
    optPasses.add(createCFGSimplificationPass());
298
    optPasses.add(createEarlyCSEPass());
299
    optPasses.add(createInstructionCombiningPass());
300
#if LLVM_VERSION_MAJOR <= 11
301
    optPasses.add(createConstantPropagationPass());
302
#endif
303
    optPasses.add(createSCCPPass());
304
    optPasses.add(createAggressiveDCEPass());
305

306
    optPasses.run(*fetch);
307

308
    optPasses.add(createLowerX86Pass(this));
309
    optPasses.run(*fetch);
310

311
    JitManager::DumpToFile(fetch, "opt");
312

313

314
    // Revert 16-wide override
315
#if USE_SIMD16_SHADERS
316
    SetTargetWidth(baseWidth);
317
#endif
318

319
    return fetch;
320
}
321

322
// returns true for odd formats that require special state.gather handling
323
bool FetchJit::IsOddFormat(SWR_FORMAT format)
324
{
325
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
326
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
327
    {
328
        return true;
329
    }
330
    return false;
331
}
332

333
// format is uniform if all components are the same size and type
334
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
335
{
336
    const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
337
    uint32_t               bpc0  = info.bpc[0];
338
    uint32_t               type0 = info.type[0];
339

340
    for (uint32_t c = 1; c < info.numComps; ++c)
341
    {
342
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
343
        {
344
            return false;
345
        }
346
    }
347
    return true;
348
}
349

350
// unpacks components based on format
351
// foreach component in the pixel
352
//   mask off everything but this component
353
//   shift component to LSB
354
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
355
{
356
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
357

358
    uint32_t bitOffset = 0;
359
    for (uint32_t c = 0; c < info.numComps; ++c)
360
    {
361
        uint32_t swizzledIndex = info.swizzle[c];
362
        uint32_t compBits      = info.bpc[c];
363
        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
364
        Value*   comp          = AND(vInput, bitmask);
365
        comp                   = LSHR(comp, bitOffset);
366

367
        result[swizzledIndex] = comp;
368
        bitOffset += compBits;
369
    }
370
}
371

372
// gather for odd component size formats
373
// gather SIMD full pixels per lane then shift/mask to move each component to their
374
// own vector
375
void FetchJit::CreateGatherOddFormats(
376
    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
377
{
378
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
379

380
    // only works if pixel size is <= 32bits
381
    SWR_ASSERT(info.bpp <= 32);
382

383
    Value* pGather;
384
    if (info.bpp == 32)
385
    {
386
        pGather =
387
            GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
388
    }
389
    else
390
    {
391
        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
392
        Value* pMem = ALLOCA(mSimdInt32Ty);
393
        STORE(VIMMED1(0u), pMem);
394

395
        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
396

397
        for (uint32_t lane = 0; lane < mVWidth; ++lane)
398
        {
399
            // Get index
400
            Value* index = VEXTRACT(pOffsets, C(lane));
401
            Value* mask  = VEXTRACT(pMask, C(lane));
402

403
            // use branch around load based on mask
404
            // Needed to avoid page-faults on unmasked lanes
405
            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
406
            BasicBlock* pMaskedLoadBlock =
407
                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
408
            BasicBlock* pEndLoadBB =
409
                BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
410

411
            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
412

413
            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
414

415
            switch (info.bpp)
416
            {
417
            case 8:
418
            {
419
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
420
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
421
                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
422
                break;
423
            }
424

425
            case 16:
426
            {
427
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
428
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
429
                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
430
                break;
431
            }
432
            break;
433

434
            case 24:
435
            {
436
                // First 16-bits of data
437
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
438
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
439
                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
440

441
                // Last 8-bits of data
442
                pDst  = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
443
                xpSrc = ADD(xpSrc, C((int64_t)2));
444
                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
445
                break;
446
            }
447

448
            default:
449
                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
450
                break;
451
            }
452

453
            BR(pEndLoadBB);
454
            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
455
        }
456

457
        pGather = LOAD(pMem);
458
    }
459

460
    for (uint32_t comp = 0; comp < 4; ++comp)
461
    {
462
        pResult[comp] = VIMMED1((int)info.defaults[comp]);
463
    }
464

465
    UnpackComponents(format, pGather, pResult);
466

467
    // cast to fp32
468
    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
469
    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
470
    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
471
    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
472
}
473

474
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
475
{
476
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
477

478
    for (uint32_t c = 0; c < info.numComps; ++c)
479
    {
480
        uint32_t compIndex = info.swizzle[c];
481

482
        // skip any conversion on UNUSED components
483
        if (info.type[c] == SWR_TYPE_UNUSED)
484
        {
485
            continue;
486
        }
487

488
        if (info.isNormalized[c])
489
        {
490
            if (info.type[c] == SWR_TYPE_SNORM)
491
            {
492
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
493
                /// -1.0f.
494

495
                /// result = c * (1.0f / (2^(n-1) - 1);
496
                uint32_t n        = info.bpc[c];
497
                uint32_t pow2     = 1 << (n - 1);
498
                float    scale    = 1.0f / (float)(pow2 - 1);
499
                Value*   vScale   = VIMMED1(scale);
500
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
501
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
502
                texels[compIndex] = FMUL(texels[compIndex], vScale);
503
            }
504
            else
505
            {
506
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
507

508
                /// result = c * (1.0f / (2^n - 1))
509
                uint32_t n    = info.bpc[c];
510
                uint32_t pow2 = 1 << n;
511
                // special case 24bit unorm format, which requires a full divide to meet ULP
512
                // requirement
513
                if (n == 24)
514
                {
515
                    float  scale      = (float)(pow2 - 1);
516
                    Value* vScale     = VIMMED1(scale);
517
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
518
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
519
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
520
                }
521
                else
522
                {
523
                    float  scale      = 1.0f / (float)(pow2 - 1);
524
                    Value* vScale     = VIMMED1(scale);
525
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
526
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
527
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
528
                }
529
            }
530
            continue;
531
        }
532
    }
533
}
534

535
//////////////////////////////////////////////////////////////////////////
536
/// @brief Loads attributes from memory using AVX2 GATHER(s)
537
/// @param fetchState - info about attributes to be fetched from memory
538
/// @param streams - value pointer to the current vertex stream
539
/// @param vIndices - vector value of indices to gather
540
/// @param pVtxOut - value pointer to output simdvertex struct
541
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
542
                                 Value*                     streams,
543
                                 Value*                     vIndices,
544
                                 Value*                     pVtxOut)
545
{
546
    uint32_t currentVertexElement = 0;
547
    uint32_t outputElt            = 0;
548
    Value*   vVertexElements[4];
549

550
    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
551
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
552
    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
553
    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
554
    curInstance->setName("curInstance");
555

556
    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
557
    {
558
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
559

560
        // skip element if all components are disabled
561
        if (ied.ComponentPacking == ComponentEnable::NONE)
562
        {
563
            continue;
564
        }
565

566
        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
567
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
568
        uint32_t bpc =
569
            info.bpp /
570
            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
571

572
        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
573

574
        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
575
        Value* vStride = VBROADCAST(stride);
576

577
        // max vertex index that is fully in bounds
578
        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
579
        maxVertex        = LOAD(maxVertex);
580

581
        Value* minVertex = NULL;
582
        if (fetchState.bPartialVertexBuffer)
583
        {
584
            // min vertex index for low bounds OOB checking
585
            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
586
            minVertex = LOAD(minVertex);
587
        }
588

589
        if (fetchState.bInstanceIDOffsetEnable)
590
        {
591
            // the InstanceID (curInstance) value is offset by StartInstanceLocation
592
            curInstance = ADD(curInstance, startInstance);
593
        }
594

595
        Value* vCurIndices;
596
        Value* startOffset;
597
        Value* vInstanceStride = VIMMED1(0);
598

599
        if (ied.InstanceEnable)
600
        {
601
            Value* stepRate = C(ied.InstanceAdvancementState);
602

603
            // prevent a div by 0 for 0 step rate
604
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
605
            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
606

607
            // calc the current offset into instanced data buffer
608
            Value* calcInstance = UDIV(curInstance, stepRate);
609

610
            // if step rate is 0, every instance gets instance 0
611
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
612

613
            vCurIndices = VBROADCAST(calcInstance);
614
            startOffset = startInstance;
615
        }
616
        else if (ied.InstanceStrideEnable)
617
        {
618
            // grab the instance advancement state, determines stride in bytes from one instance to
619
            // the next
620
            Value* stepRate = C(ied.InstanceAdvancementState);
621
            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
622

623
            // offset indices by baseVertex
624
            vCurIndices = ADD(vIndices, vBaseVertex);
625

626
            startOffset = startVertex;
627
            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
628
        }
629
        else
630
        {
631
            // offset indices by baseVertex
632
            vCurIndices = ADD(vIndices, vBaseVertex);
633
            startOffset = startVertex;
634
        }
635

636
        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
637
        // do 64bit address offset calculations.
638

639
        // calculate byte offset to the start of the VB
640
        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
641

642
        // VGATHER* takes an *i8 src pointer so that's what stream is
643
        Value* pStreamBaseGFX = ADD(stream, baseOffset);
644

645
        // if we have a start offset, subtract from max vertex. Used for OOB check
646
        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
647
        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
648
        // if we have a negative value, we're already OOB. clamp at 0.
649
        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
650

651
        if (fetchState.bPartialVertexBuffer)
652
        {
653
            // similary for min vertex
654
            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
655
            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
656
            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
657
        }
658

659
        // Load the in bounds size of a partially valid vertex
660
        Value* partialInboundsSize =
661
            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
662
        partialInboundsSize       = LOAD(partialInboundsSize);
663
        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
664
        Value* vBpp               = VBROADCAST(C(info.Bpp));
665
        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
666

667
        // is the element is <= the partially valid size
668
        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
669

670
        // override cur indices with 0 if pitch is 0
671
        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
672
        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
673

674
        // are vertices partially OOB?
675
        Value* vMaxVertex      = VBROADCAST(maxVertex);
676
        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
677

678
        // are vertices fully in bounds?
679
        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
680

681
        Value* vGatherMask;
682
        if (fetchState.bPartialVertexBuffer)
683
        {
684
            // are vertices below minVertex limit?
685
            Value* vMinVertex     = VBROADCAST(minVertex);
686
            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
687

688
            // only fetch lanes that pass both tests
689
            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
690
        }
691
        else
692
        {
693
            vGatherMask = vMaxGatherMask;
694
        }
695

696
        // blend in any partially OOB indices that have valid elements
697
        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
698

699
        // calculate the actual offsets into the VB
700
        Value* vOffsets = MUL(vCurIndices, vStride);
701
        vOffsets        = ADD(vOffsets, vAlignmentOffsets);
702

703
        // if instance stride enable is:
704
        //  true  - add product of the instanceID and advancement state to the offset into the VB
705
        //  false - value of vInstanceStride has been initialized to zero
706
        vOffsets = ADD(vOffsets, vInstanceStride);
707

708
        // Packing and component control
709
        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
710
        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
711
                                           (ComponentControl)ied.ComponentControl1,
712
                                           (ComponentControl)ied.ComponentControl2,
713
                                           (ComponentControl)ied.ComponentControl3};
714

715
        // Special gather/conversion for formats without equal component sizes
716
        if (IsOddFormat((SWR_FORMAT)ied.Format))
717
        {
718
            Value* pResults[4];
719
            CreateGatherOddFormats(
720
                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
721
            ConvertFormat((SWR_FORMAT)ied.Format, pResults);
722

723
            for (uint32_t c = 0; c < 4; c += 1)
724
            {
725
                if (isComponentEnabled(compMask, c))
726
                {
727
                    vVertexElements[currentVertexElement++] = pResults[c];
728
                    if (currentVertexElement > 3)
729
                    {
730
                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
731
                        // reset to the next vVertexElement to output
732
                        currentVertexElement = 0;
733
                    }
734
                }
735
            }
736
        }
737
        else if (info.type[0] == SWR_TYPE_FLOAT)
738
        {
739
            ///@todo: support 64 bit vb accesses
740
            Value* gatherSrc = VIMMED1(0.0f);
741

742
            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
743
                       "Unsupported format for standard gather fetch.");
744

745
            // Gather components from memory to store in a simdvertex structure
746
            switch (bpc)
747
            {
748
            case 16:
749
            {
750
                Value* vGatherResult[2];
751

752
                // if we have at least one component out of x or y to fetch
753
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
754
                {
755
                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
756
                    // e.g. result of first 8x32bit integer gather for 16bit components
757
                    // 256i - 0    1    2    3    4    5    6    7
758
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
759
                    //
760
                }
761

762
                // if we have at least one component out of z or w to fetch
763
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
764
                {
765
                    // offset base to the next components(zw) in the vertex to gather
766
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
767

768
                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
769
                    // e.g. result of second 8x32bit integer gather for 16bit components
770
                    // 256i - 0    1    2    3    4    5    6    7
771
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
772
                    //
773
                }
774

775
                // if we have at least one component to shuffle into place
776
                if (compMask)
777
                {
778
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
779
                                                                  pVtxOut,
780
                                                                  Instruction::CastOps::FPExt,
781
                                                                  CONVERT_NONE,
782
                                                                  currentVertexElement,
783
                                                                  outputElt,
784
                                                                  compMask,
785
                                                                  compCtrl,
786
                                                                  vVertexElements);
787

788
                    // Shuffle gathered components into place in simdvertex struct
789
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
790
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
791
                }
792
            }
793
            break;
794
            case 32:
795
            {
796
                for (uint32_t i = 0; i < 4; i += 1)
797
                {
798
                    if (isComponentEnabled(compMask, i))
799
                    {
800
                        // if we need to gather the component
801
                        if (compCtrl[i] == StoreSrc)
802
                        {
803
                            // Gather a SIMD of vertices
804
                            // APIs allow a 4GB range for offsets
805
                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
806
                            // Add 2GB to the base pointer and 2GB to the offsets.  This makes
807
                            // "negative" (large) offsets into positive offsets and small offsets
808
                            // into negative offsets.
809
                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
810
                            vVertexElements[currentVertexElement++] =
811
                                GATHERPS(gatherSrc,
812
                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
813
                                         vNewOffsets,
814
                                         vGatherMask,
815
                                         1,
816
                                         MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
817
                        }
818
                        else
819
                        {
820
                            vVertexElements[currentVertexElement++] =
821
                                GenerateCompCtrlVector(compCtrl[i]);
822
                        }
823

824
                        if (currentVertexElement > 3)
825
                        {
826
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
827
                            // reset to the next vVertexElement to output
828
                            currentVertexElement = 0;
829
                        }
830
                    }
831

832
                    // offset base to the next component in the vertex to gather
833
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
834
                }
835
            }
836
            break;
837
            case 64:
838
            {
839
                for (uint32_t i = 0; i < 4; i += 1)
840
                {
841
                    if (isComponentEnabled(compMask, i))
842
                    {
843
                        // if we need to gather the component
844
                        if (compCtrl[i] == StoreSrc)
845
                        {
846
                            Value* vShufLo;
847
                            Value* vShufHi;
848
                            Value* vShufAll;
849

850
                            if (mVWidth == 8)
851
                            {
852
                                vShufLo  = C({0, 1, 2, 3});
853
                                vShufHi  = C({4, 5, 6, 7});
854
                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
855
                            }
856
                            else
857
                            {
858
                                SWR_ASSERT(mVWidth == 16);
859
                                vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
860
                                vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
861
                                vShufAll =
862
                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
863
                            }
864

865
                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
866
                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
867

868
                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
869
                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
870

871
                            Value* vZeroDouble = VECTOR_SPLAT(
872
                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
873

874
                            Value* pGatherLo =
875
                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
876
                            Value* pGatherHi =
877
                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
878

879
                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
880
                            pGather        = FP_TRUNC(pGather, mSimdFP32Ty);
881

882
                            vVertexElements[currentVertexElement++] = pGather;
883
                        }
884
                        else
885
                        {
886
                            vVertexElements[currentVertexElement++] =
887
                                GenerateCompCtrlVector(compCtrl[i]);
888
                        }
889

890
                        if (currentVertexElement > 3)
891
                        {
892
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
893
                            // reset to the next vVertexElement to output
894
                            currentVertexElement = 0;
895
                        }
896
                    }
897

898
                    // offset base to the next component  in the vertex to gather
899
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
900
                }
901
            }
902
            break;
903
            default:
904
                SWR_INVALID("Tried to fetch invalid FP format");
905
                break;
906
            }
907
        }
908
        else
909
        {
910
            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
911
            ConversionType       conversionType = CONVERT_NONE;
912

913
            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
914
                       "Unsupported format for standard gather fetch.");
915

916
            switch (info.type[0])
917
            {
918
            case SWR_TYPE_UNORM:
919
                conversionType = CONVERT_NORMALIZED;
920
            case SWR_TYPE_UINT:
921
                extendCastType = Instruction::CastOps::ZExt;
922
                break;
923
            case SWR_TYPE_SNORM:
924
                conversionType = CONVERT_NORMALIZED;
925
            case SWR_TYPE_SINT:
926
                extendCastType = Instruction::CastOps::SExt;
927
                break;
928
            case SWR_TYPE_USCALED:
929
                conversionType = CONVERT_USCALED;
930
                extendCastType = Instruction::CastOps::UIToFP;
931
                break;
932
            case SWR_TYPE_SSCALED:
933
                conversionType = CONVERT_SSCALED;
934
                extendCastType = Instruction::CastOps::SIToFP;
935
                break;
936
            case SWR_TYPE_SFIXED:
937
                conversionType = CONVERT_SFIXED;
938
                extendCastType = Instruction::CastOps::SExt;
939
                break;
940
            default:
941
                break;
942
            }
943

944
            // value substituted when component of gather is masked
945
            Value* gatherSrc = VIMMED1(0);
946

947
            // Gather components from memory to store in a simdvertex structure
948
            switch (bpc)
949
            {
950
            case 8:
951
            {
952
                // if we have at least one component to fetch
953
                if (compMask)
954
                {
955
                    Value* vGatherResult = GATHERDD(gatherSrc,
956
                                                    pStreamBaseGFX,
957
                                                    vOffsets,
958
                                                    vGatherMask,
959
                                                    1,
960
                                                    MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
961
                    // e.g. result of an 8x32bit integer gather for 8bit components
962
                    // 256i - 0    1    2    3    4    5    6    7
963
                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
964

965
                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
966
                                                                 pVtxOut,
967
                                                                 extendCastType,
968
                                                                 conversionType,
969
                                                                 currentVertexElement,
970
                                                                 outputElt,
971
                                                                 compMask,
972
                                                                 compCtrl,
973
                                                                 vVertexElements,
974
                                                                 info.swizzle);
975

976
                    // Shuffle gathered components into place in simdvertex struct
977
                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
978
                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
979
                }
980
            }
981
            break;
982
            case 16:
983
            {
984
                Value* vGatherResult[2];
985

986
                // if we have at least one component out of x or y to fetch
987
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
988
                {
989
                    vGatherResult[0] = GATHERDD(gatherSrc,
990
                                                pStreamBaseGFX,
991
                                                vOffsets,
992
                                                vGatherMask,
993
                                                1,
994
                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
995
                    // e.g. result of first 8x32bit integer gather for 16bit components
996
                    // 256i - 0    1    2    3    4    5    6    7
997
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
998
                    //
999
                }
1000

1001
                // if we have at least one component out of z or w to fetch
1002
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1003
                {
1004
                    // offset base to the next components(zw) in the vertex to gather
1005
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1006

1007
                    vGatherResult[1] = GATHERDD(gatherSrc,
1008
                                                pStreamBaseGFX,
1009
                                                vOffsets,
1010
                                                vGatherMask,
1011
                                                1,
1012
                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1013
                    // e.g. result of second 8x32bit integer gather for 16bit components
1014
                    // 256i - 0    1    2    3    4    5    6    7
1015
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016
                    //
1017
                }
1018

1019
                // if we have at least one component to shuffle into place
1020
                if (compMask)
1021
                {
1022
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
1023
                                                                  pVtxOut,
1024
                                                                  extendCastType,
1025
                                                                  conversionType,
1026
                                                                  currentVertexElement,
1027
                                                                  outputElt,
1028
                                                                  compMask,
1029
                                                                  compCtrl,
1030
                                                                  vVertexElements);
1031

1032
                    // Shuffle gathered components into place in simdvertex struct
1033
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
1034
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1035
                }
1036
            }
1037
            break;
1038
            case 32:
1039
            {
1040
                // Gathered components into place in simdvertex struct
1041
                for (uint32_t i = 0; i < 4; i++)
1042
                {
1043
                    if (isComponentEnabled(compMask, i))
1044
                    {
1045
                        // if we need to gather the component
1046
                        if (compCtrl[i] == StoreSrc)
1047
                        {
1048
                            Value* pGather = GATHERDD(gatherSrc,
1049
                                                      pStreamBaseGFX,
1050
                                                      vOffsets,
1051
                                                      vGatherMask,
1052
                                                      1,
1053
                                                      MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1054

1055
                            if (conversionType == CONVERT_USCALED)
1056
                            {
1057
                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1058
                            }
1059
                            else if (conversionType == CONVERT_SSCALED)
1060
                            {
1061
                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1062
                            }
1063
                            else if (conversionType == CONVERT_SFIXED)
1064
                            {
1065
                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1066
                                               VBROADCAST(C(1 / 65536.0f)));
1067
                            }
1068

1069
                            vVertexElements[currentVertexElement++] = pGather;
1070

1071
                            // e.g. result of a single 8x32bit integer gather for 32bit components
1072
                            // 256i - 0    1    2    3    4    5    6    7
1073
                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1074
                        }
1075
                        else
1076
                        {
1077
                            vVertexElements[currentVertexElement++] =
1078
                                GenerateCompCtrlVector(compCtrl[i]);
1079
                        }
1080

1081
                        if (currentVertexElement > 3)
1082
                        {
1083
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1084

1085
                            // reset to the next vVertexElement to output
1086
                            currentVertexElement = 0;
1087
                        }
1088
                    }
1089

1090
                    // offset base to the next component  in the vertex to gather
1091
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1092
                }
1093
            }
1094
            break;
1095
            }
1096
        }
1097
    }
1098

1099
    // if we have a partially filled vVertexElement struct, output it
1100
    if (currentVertexElement > 0)
1101
    {
1102
        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1103
    }
1104
}
1105

1106

1107
typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
1108

1109
template <typename T>
1110
void GetSimdValidIndicesGfx(gfxptr_t                     indices,
1111
                            gfxptr_t                     lastIndex,
1112
                            uint32_t                     vWidth,
1113
                            PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1114
                            void*                        pdc,
1115
                            uint32_t*                    outIndices,
1116
                            void*                        pWorkerData)
1117
{
1118
    SWR_ASSERT(outIndices != nullptr);
1119

1120
    gfxptr_t indexPtr = indices;
1121
    for (int64_t lane = 0; lane < vWidth; lane++)
1122
    {
1123
        uint32_t index = 0;
1124

1125
        if (indexPtr < lastIndex)
1126
        {
1127
            // translate indexPtr and load from it
1128
            T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
1129
            SWR_ASSERT(addr != nullptr);
1130
            index = *addr;
1131
        }
1132

1133
        // index to 32 bits and insert into the correct simd lane
1134
        outIndices[lane] = index;
1135

1136
        indexPtr += sizeof(T);
1137
    }
1138
}
1139

1140
void GetSimdValid8bitIndicesGfx(gfxptr_t                     indices,
1141
                                gfxptr_t                     lastIndex,
1142
                                uint32_t                     vWidth,
1143
                                PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1144
                                void*                        pdc,
1145
                                uint32_t*                    outIndices,
1146
                                void*                        pWorkerData)
1147
{
1148
    GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1149
}
1150

1151
void GetSimdValid16bitIndicesGfx(gfxptr_t                     indices,
1152
                                 gfxptr_t                     lastIndex,
1153
                                 uint32_t                     vWidth,
1154
                                 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1155
                                 void*                        pdc,
1156
                                 uint32_t*                    outIndices,
1157
                                 void*                        pWorkerData)
1158
{
1159
    GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1160
}
1161

1162

1163
template <typename T>
1164
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1165
{
1166
    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1167
               "Function expects gfxptr_t for both input parameters.");
1168

1169
    Type* Ty = nullptr;
1170

1171
    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1172
                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1173
    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1174
    if (bSize)
1175
    {
1176
        Ty = mInt16PtrTy;
1177
    }
1178
    else if (sizeof(T) == sizeof(uint8_t))
1179
    {
1180
        Ty = mInt8PtrTy;
1181
    }
1182
    else
1183
    {
1184
        SWR_ASSERT(false, "This should never happen as per static_assert above.");
1185
    }
1186

1187
    Value* vIndices = VUNDEF_I();
1188

1189
    {
1190
        // store 0 index on stack to be used to conditionally load from if index address is OOB
1191
        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1192
        STORE(C((T)0), pZeroIndex);
1193

1194
        // Load a SIMD of index pointers
1195
        for (int64_t lane = 0; lane < mVWidth; lane++)
1196
        {
1197
            // Calculate the address of the requested index
1198
            Value* pIndex = GEP(pIndices, C(lane), Ty);
1199

1200
            pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1201

1202
            // check if the address is less than the max index,
1203
            Value* mask = ICMP_ULT(pIndex, pLastIndex);
1204

1205
            // if valid, load the index. if not, load 0 from the stack
1206
            Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1207
            Value* index  = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1208

1209
            // zero extended index to 32 bits and insert into the correct simd lane
1210
            index    = Z_EXT(index, mInt32Ty);
1211
            vIndices = VINSERT(vIndices, index, lane);
1212
        }
1213
    }
1214

1215
    return vIndices;
1216
}
1217

1218
//////////////////////////////////////////////////////////////////////////
1219
/// @brief Loads a simd of valid indices. OOB indices are set to 0
1220
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
1221
/// support
1222
/// @param pIndices - pointer to 8 bit indices
1223
/// @param pLastIndex - pointer to last valid index
1224
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1225
{
1226
    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1227
}
1228

1229
//////////////////////////////////////////////////////////////////////////
1230
/// @brief Loads a simd of valid indices. OOB indices are set to 0
1231
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
1232
/// support
1233
/// @param pIndices - pointer to 16 bit indices
1234
/// @param pLastIndex - pointer to last valid index
1235
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1236
{
1237
    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1238
}
1239

1240
//////////////////////////////////////////////////////////////////////////
1241
/// @brief Loads a simd of valid indices. OOB indices are set to 0
1242
/// @param pIndices - pointer to 32 bit indices
1243
/// @param pLastIndex - pointer to last valid index
1244
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1245
{
1246
    DataLayout dL(JM()->mpCurrentModule);
1247
    Value*     iLastIndex = pLastIndex;
1248
    Value*     iIndices   = pIndices;
1249

1250
    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1251
    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1252
    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
1253
    numIndicesLeft        = SDIV(numIndicesLeft, C(4));
1254

1255
    // create a vector of index counts from the base index ptr passed into the fetch
1256
    Constant* vIndexOffsets;
1257
    if (mVWidth == 8)
1258
    {
1259
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1260
    }
1261
    else
1262
    {
1263
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1264
    }
1265

1266
    // compare index count to the max valid index
1267
    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1268
    //     vIndexOffsets  0 1 2 3 4 5 6 7
1269
    //     ------------------------------
1270
    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1271
    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1272
    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
1273
    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1274

1275
    // Load the indices; OOB loads 0
1276
    return MASKED_LOAD(pIndices,
1277
                       4,
1278
                       vIndexMask,
1279
                       VIMMED1(0),
1280
                       "vIndices",
1281
                       PointerType::get(mSimdInt32Ty, 0),
1282
                       MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1283
}
1284

1285
//////////////////////////////////////////////////////////////////////////
1286
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1287
/// denormalizes if needed, converts to F32 if needed, and positions in
1288
//  the proper SIMD rows to be output to the simdvertex structure
1289
/// @param args: (tuple of args, listed below)
1290
///   @param vGatherResult - 8 gathered 8bpc vertices
1291
///   @param pVtxOut - base pointer to output simdvertex struct
1292
///   @param extendType - sign extend or zero extend
1293
///   @param bNormalized - do we need to denormalize?
1294
///   @param currentVertexElement - reference to the current vVertexElement
1295
///   @param outputElt - reference to the current offset from simdvertex we're o
1296
///   @param compMask - component packing mask
1297
///   @param compCtrl - component control val
1298
///   @param vVertexElements[4] - vertex components to output
1299
///   @param swizzle[4] - component swizzle location
1300
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1301
{
1302
    // Unpack tuple args
1303
    Value*&                    vGatherResult        = std::get<0>(args);
1304
    Value*                     pVtxOut              = std::get<1>(args);
1305
    const Instruction::CastOps extendType           = std::get<2>(args);
1306
    const ConversionType       conversionType       = std::get<3>(args);
1307
    uint32_t&                  currentVertexElement = std::get<4>(args);
1308
    uint32_t&                  outputElt            = std::get<5>(args);
1309
    const ComponentEnable      compMask             = std::get<6>(args);
1310
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1311
    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1312
    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1313

1314
    // cast types
1315
    Type* vGatherTy = getVectorType(mInt32Ty, 8);
1316
    Type* v32x8Ty   = getVectorType(mInt8Ty, 32);
1317

1318
    // have to do extra work for sign extending
1319
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1320
    {
1321
        Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
1322
        Type* v128Ty  = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1323

1324
        // shuffle mask, including any swizzling
1325
        const char x          = (char)swizzle[0];
1326
        const char y          = (char)swizzle[1];
1327
        const char z          = (char)swizzle[2];
1328
        const char w          = (char)swizzle[3];
1329
        Value*     vConstMask = C<char>(
1330
            {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
1331
             char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
1332
             char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
1333
             char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
1334
             char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
1335
             char(w + 8), char(w + 12)});
1336

1337
        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
1338

1339
        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1340
        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1341

1342
        Value* vShufResult_lo =
1343
            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1344
        Value* vShufResult_hi =
1345
            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1346

1347
        // after pshufb: group components together in each 128bit lane
1348
        // 256i - 0    1    2    3    4    5    6    7
1349
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1350

1351
        Value* vi128XY_lo = nullptr;
1352
        Value* vi128XY_hi = nullptr;
1353
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1354
        {
1355
            vi128XY_lo = BITCAST(
1356
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1357
                v128Ty);
1358
            vi128XY_hi = BITCAST(
1359
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1360
                v128Ty);
1361

1362
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1363
            // 256i - 0    1    2    3    4    5    6    7
1364
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1365
        }
1366

1367
        // do the same for zw components
1368
        Value* vi128ZW_lo = nullptr;
1369
        Value* vi128ZW_hi = nullptr;
1370
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1371
        {
1372
            vi128ZW_lo = BITCAST(
1373
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1374
                v128Ty);
1375
            vi128ZW_hi = BITCAST(
1376
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1377
                v128Ty);
1378
        }
1379

1380
        // init denormalize variables if needed
1381
        Instruction::CastOps fpCast;
1382
        Value*               conversionFactor;
1383

1384
        switch (conversionType)
1385
        {
1386
        case CONVERT_NORMALIZED:
1387
            fpCast           = Instruction::CastOps::SIToFP;
1388
            conversionFactor = VIMMED1((float)(1.0 / 127.0));
1389
            break;
1390
        case CONVERT_SSCALED:
1391
            fpCast           = Instruction::CastOps::SIToFP;
1392
            conversionFactor = VIMMED1((float)(1.0));
1393
            break;
1394
        case CONVERT_USCALED:
1395
            assert(false && "Type should not be sign extended!");
1396
            conversionFactor = nullptr;
1397
            break;
1398
        default:
1399
            assert(conversionType == CONVERT_NONE);
1400
            conversionFactor = nullptr;
1401
            break;
1402
        }
1403

1404
        // sign extend all enabled components. If we have a fill vVertexElements, output to current
1405
        // simdvertex
1406
        for (uint32_t i = 0; i < 4; i++)
1407
        {
1408
            if (isComponentEnabled(compMask, i))
1409
            {
1410
                if (compCtrl[i] == ComponentControl::StoreSrc)
1411
                {
1412
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1413
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1414
                    // if x or y, use vi128XY permute result, else use vi128ZW
1415
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1416
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1417

1418
                    // sign extend
1419
                    Value* temp_lo =
1420
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1421
                    Value* temp_hi =
1422
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1423

1424
                    Value* temp = JOIN_16(temp_lo, temp_hi);
1425

1426
                    // denormalize if needed
1427
                    if (conversionType != CONVERT_NONE)
1428
                    {
1429
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1430
                    }
1431

1432
                    vVertexElements[currentVertexElement] = temp;
1433

1434
                    currentVertexElement += 1;
1435
                }
1436
                else
1437
                {
1438
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1439
                }
1440

1441
                if (currentVertexElement > 3)
1442
                {
1443
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1444
                    // reset to the next vVertexElement to output
1445
                    currentVertexElement = 0;
1446
                }
1447
            }
1448
        }
1449
    }
1450
    // else zero extend
1451
    else if ((extendType == Instruction::CastOps::ZExt) ||
1452
             (extendType == Instruction::CastOps::UIToFP))
1453
    {
1454
        // init denormalize variables if needed
1455
        Instruction::CastOps fpCast;
1456
        Value*               conversionFactor;
1457

1458
        switch (conversionType)
1459
        {
1460
        case CONVERT_NORMALIZED:
1461
            fpCast           = Instruction::CastOps::UIToFP;
1462
            conversionFactor = VIMMED1((float)(1.0 / 255.0));
1463
            break;
1464
        case CONVERT_USCALED:
1465
            fpCast           = Instruction::CastOps::UIToFP;
1466
            conversionFactor = VIMMED1((float)(1.0));
1467
            break;
1468
        case CONVERT_SSCALED:
1469
            assert(false && "Type should not be zero extended!");
1470
            conversionFactor = nullptr;
1471
            break;
1472
        default:
1473
            assert(conversionType == CONVERT_NONE);
1474
            conversionFactor = nullptr;
1475
            break;
1476
        }
1477

1478
        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1479
        for (uint32_t i = 0; i < 4; i++)
1480
        {
1481
            if (isComponentEnabled(compMask, i))
1482
            {
1483
                if (compCtrl[i] == ComponentControl::StoreSrc)
1484
                {
1485
                    // pshufb masks for each component
1486
                    Value* vConstMask;
1487
                    switch (swizzle[i])
1488
                    {
1489
                    case 0:
1490
                        // x shuffle mask
1491
                        vConstMask =
1492
                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1493
                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1494
                        break;
1495
                    case 1:
1496
                        // y shuffle mask
1497
                        vConstMask =
1498
                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1499
                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1500
                        break;
1501
                    case 2:
1502
                        // z shuffle mask
1503
                        vConstMask =
1504
                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1505
                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1506
                        break;
1507
                    case 3:
1508
                        // w shuffle mask
1509
                        vConstMask =
1510
                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1511
                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1512
                        break;
1513
                    default:
1514
                        assert(false && "Invalid component");
1515
                        vConstMask = nullptr;
1516
                        break;
1517
                    }
1518

1519
                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1520
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1521

1522
                    Value* temp_lo =
1523
                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1524
                    Value* temp_hi =
1525
                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1526

1527
                    // after pshufb for x channel
1528
                    // 256i - 0    1    2    3    4    5    6    7
1529
                    //        x000 x000 x000 x000 x000 x000 x000 x000
1530

1531
                    Value* temp = JOIN_16(temp_lo, temp_hi);
1532

1533
                    // denormalize if needed
1534
                    if (conversionType != CONVERT_NONE)
1535
                    {
1536
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1537
                    }
1538

1539
                    vVertexElements[currentVertexElement] = temp;
1540

1541
                    currentVertexElement += 1;
1542
                }
1543
                else
1544
                {
1545
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1546
                }
1547

1548
                if (currentVertexElement > 3)
1549
                {
1550
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1551
                    // reset to the next vVertexElement to output
1552
                    currentVertexElement = 0;
1553
                }
1554
            }
1555
        }
1556
    }
1557
    else
1558
    {
1559
        SWR_INVALID("Unsupported conversion type");
1560
    }
1561
}
1562

1563
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1564
{
1565
    // Unpack tuple args
1566
    Value*&                    vGatherResult        = std::get<0>(args);
1567
    Value*                     pVtxOut              = std::get<1>(args);
1568
    const Instruction::CastOps extendType           = std::get<2>(args);
1569
    const ConversionType       conversionType       = std::get<3>(args);
1570
    uint32_t&                  currentVertexElement = std::get<4>(args);
1571
    uint32_t&                  outputElt            = std::get<5>(args);
1572
    const ComponentEnable      compMask             = std::get<6>(args);
1573
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1574
    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1575
    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1576

1577
    // cast types
1578
    Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1579

1580
    for (uint32_t i = 0; i < 4; i++)
1581
    {
1582
        if (!isComponentEnabled(compMask, i))
1583
            continue;
1584

1585
        if (compCtrl[i] == ComponentControl::StoreSrc)
1586
        {
1587
#if LLVM_VERSION_MAJOR >= 11
1588
            using MaskType = int32_t;
1589
#else
1590
            using MaskType = uint32_t;
1591
#endif
1592
            std::vector<MaskType> vShuffleMasks[4] = {
1593
                {0, 4, 8, 12, 16, 20, 24, 28},  // x
1594
                {1, 5, 9, 13, 17, 21, 25, 29},  // y
1595
                {2, 6, 10, 14, 18, 22, 26, 30}, // z
1596
                {3, 7, 11, 15, 19, 23, 27, 31}, // w
1597
            };
1598

1599
            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1600
                                  UndefValue::get(v32x8Ty),
1601
                                  vShuffleMasks[swizzle[i]]);
1602

1603
            if ((extendType == Instruction::CastOps::SExt) ||
1604
                (extendType == Instruction::CastOps::SIToFP))
1605
            {
1606
                switch (conversionType)
1607
                {
1608
                case CONVERT_NORMALIZED:
1609
                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1610
                    break;
1611
                case CONVERT_SSCALED:
1612
                    val = SI_TO_FP(val, mSimdFP32Ty);
1613
                    break;
1614
                case CONVERT_USCALED:
1615
                    SWR_INVALID("Type should not be sign extended!");
1616
                    break;
1617
                default:
1618
                    SWR_ASSERT(conversionType == CONVERT_NONE);
1619
                    val = S_EXT(val, mSimdInt32Ty);
1620
                    break;
1621
                }
1622
            }
1623
            else if ((extendType == Instruction::CastOps::ZExt) ||
1624
                     (extendType == Instruction::CastOps::UIToFP))
1625
            {
1626
                switch (conversionType)
1627
                {
1628
                case CONVERT_NORMALIZED:
1629
                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1630
                    break;
1631
                case CONVERT_SSCALED:
1632
                    SWR_INVALID("Type should not be zero extended!");
1633
                    break;
1634
                case CONVERT_USCALED:
1635
                    val = UI_TO_FP(val, mSimdFP32Ty);
1636
                    break;
1637
                default:
1638
                    SWR_ASSERT(conversionType == CONVERT_NONE);
1639
                    val = Z_EXT(val, mSimdInt32Ty);
1640
                    break;
1641
                }
1642
            }
1643
            else
1644
            {
1645
                SWR_INVALID("Unsupported conversion type");
1646
            }
1647

1648
            vVertexElements[currentVertexElement++] = val;
1649
        }
1650
        else
1651
        {
1652
            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1653
        }
1654

1655
        if (currentVertexElement > 3)
1656
        {
1657
            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1658
            // reset to the next vVertexElement to output
1659
            currentVertexElement = 0;
1660
        }
1661
    }
1662
}
1663

1664
//////////////////////////////////////////////////////////////////////////
1665
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1666
/// denormalizes if needed, converts to F32 if needed, and positions in
1667
//  the proper SIMD rows to be output to the simdvertex structure
1668
/// @param args: (tuple of args, listed below)
1669
///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1670
///   @param pVtxOut - base pointer to output simdvertex struct
1671
///   @param extendType - sign extend or zero extend
1672
///   @param bNormalized - do we need to denormalize?
1673
///   @param currentVertexElement - reference to the current vVertexElement
1674
///   @param outputElt - reference to the current offset from simdvertex we're o
1675
///   @param compMask - component packing mask
1676
///   @param compCtrl - component control val
1677
///   @param vVertexElements[4] - vertex components to output
1678
void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1679
{
1680
    // Unpack tuple args
1681
    Value*(&vGatherResult)[2]                       = std::get<0>(args);
1682
    Value*                     pVtxOut              = std::get<1>(args);
1683
    const Instruction::CastOps extendType           = std::get<2>(args);
1684
    const ConversionType       conversionType       = std::get<3>(args);
1685
    uint32_t&                  currentVertexElement = std::get<4>(args);
1686
    uint32_t&                  outputElt            = std::get<5>(args);
1687
    const ComponentEnable      compMask             = std::get<6>(args);
1688
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1689
    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1690

1691
    // cast types
1692
    Type* vGatherTy = getVectorType(mInt32Ty, 8);
1693
    Type* v32x8Ty   = getVectorType(mInt8Ty, 32);
1694

1695
    // have to do extra work for sign extending
1696
    if ((extendType == Instruction::CastOps::SExt) ||
1697
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1698
    {
1699
        // is this PP float?
1700
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1701

1702
        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
1703
        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1704

1705
        // shuffle mask
1706
        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1707
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1708
        Value* vi128XY_lo = nullptr;
1709
        Value* vi128XY_hi = nullptr;
1710
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1711
        {
1712
            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1713
            // now..
1714

1715
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1716
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1717

1718
            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1719
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1720

1721
            // after pshufb: group components together in each 128bit lane
1722
            // 256i - 0    1    2    3    4    5    6    7
1723
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1724

1725
            vi128XY_lo = BITCAST(
1726
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1727
                v128bitTy);
1728
            vi128XY_hi = BITCAST(
1729
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1730
                v128bitTy);
1731

1732
            // after PERMD: move and pack xy components into each 128bit lane
1733
            // 256i - 0    1    2    3    4    5    6    7
1734
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1735
        }
1736

1737
        // do the same for zw components
1738
        Value* vi128ZW_lo = nullptr;
1739
        Value* vi128ZW_hi = nullptr;
1740
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1741
        {
1742
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1743
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1744

1745
            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1746
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1747

1748
            vi128ZW_lo = BITCAST(
1749
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1750
                v128bitTy);
1751
            vi128ZW_hi = BITCAST(
1752
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1753
                v128bitTy);
1754
        }
1755

1756
        // init denormalize variables if needed
1757
        Instruction::CastOps IntToFpCast;
1758
        Value*               conversionFactor;
1759

1760
        switch (conversionType)
1761
        {
1762
        case CONVERT_NORMALIZED:
1763
            IntToFpCast      = Instruction::CastOps::SIToFP;
1764
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1765
            break;
1766
        case CONVERT_SSCALED:
1767
            IntToFpCast      = Instruction::CastOps::SIToFP;
1768
            conversionFactor = VIMMED1((float)(1.0));
1769
            break;
1770
        case CONVERT_USCALED:
1771
            assert(false && "Type should not be sign extended!");
1772
            conversionFactor = nullptr;
1773
            break;
1774
        default:
1775
            assert(conversionType == CONVERT_NONE);
1776
            conversionFactor = nullptr;
1777
            break;
1778
        }
1779

1780
        // sign extend all enabled components. If we have a fill vVertexElements, output to current
1781
        // simdvertex
1782
        for (uint32_t i = 0; i < 4; i++)
1783
        {
1784
            if (isComponentEnabled(compMask, i))
1785
            {
1786
                if (compCtrl[i] == ComponentControl::StoreSrc)
1787
                {
1788
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1789
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1790
                    // if x or y, use vi128XY permute result, else use vi128ZW
1791
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1792
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1793

1794
                    if (bFP)
1795
                    {
1796
                        // extract 128 bit lanes to sign extend each component
1797
                        Value* temp_lo =
1798
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1799
                        Value* temp_hi =
1800
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1801

1802
                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1803
                    }
1804
                    else
1805
                    {
1806
                        // extract 128 bit lanes to sign extend each component
1807
                        Value* temp_lo =
1808
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1809
                        Value* temp_hi =
1810
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1811

1812
                        Value* temp = JOIN_16(temp_lo, temp_hi);
1813

1814
                        // denormalize if needed
1815
                        if (conversionType != CONVERT_NONE)
1816
                        {
1817
                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1818
                        }
1819

1820
                        vVertexElements[currentVertexElement] = temp;
1821
                    }
1822

1823
                    currentVertexElement += 1;
1824
                }
1825
                else
1826
                {
1827
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1828
                }
1829

1830
                if (currentVertexElement > 3)
1831
                {
1832
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1833
                    // reset to the next vVertexElement to output
1834
                    currentVertexElement = 0;
1835
                }
1836
            }
1837
        }
1838
    }
1839
    // else zero extend
1840
    else if ((extendType == Instruction::CastOps::ZExt) ||
1841
             (extendType == Instruction::CastOps::UIToFP))
1842
    {
1843
        // pshufb masks for each component
1844
        Value* vConstMask[2];
1845

1846
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1847
        {
1848
            // x/z shuffle mask
1849
            vConstMask[0] = C<char>({
1850
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1851
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1852
            });
1853
        }
1854

1855
        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1856
        {
1857
            // y/w shuffle mask
1858
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1859
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1860
        }
1861

1862
        // init denormalize variables if needed
1863
        Instruction::CastOps fpCast;
1864
        Value*               conversionFactor;
1865

1866
        switch (conversionType)
1867
        {
1868
        case CONVERT_NORMALIZED:
1869
            fpCast           = Instruction::CastOps::UIToFP;
1870
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1871
            break;
1872
        case CONVERT_USCALED:
1873
            fpCast           = Instruction::CastOps::UIToFP;
1874
            conversionFactor = VIMMED1((float)(1.0f));
1875
            break;
1876
        case CONVERT_SSCALED:
1877
            SWR_INVALID("Type should not be zero extended!");
1878
            conversionFactor = nullptr;
1879
            break;
1880
        default:
1881
            SWR_ASSERT(conversionType == CONVERT_NONE);
1882
            conversionFactor = nullptr;
1883
            break;
1884
        }
1885

1886
        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1887
        for (uint32_t i = 0; i < 4; i++)
1888
        {
1889
            if (isComponentEnabled(compMask, i))
1890
            {
1891
                if (compCtrl[i] == ComponentControl::StoreSrc)
1892
                {
1893
                    // select correct constMask for x/z or y/w pshufb
1894
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1895
                    // if x or y, use vi128XY permute result, else use vi128ZW
1896
                    uint32_t selectedGather = (i < 2) ? 0 : 1;
1897

1898
                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
1899
                    // for now..
1900

1901
                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1902
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1903

1904
                    Value* temp_lo = BITCAST(
1905
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
1906
                        vGatherTy);
1907
                    Value* temp_hi = BITCAST(
1908
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
1909
                        vGatherTy);
1910

1911
                    // after pshufb mask for x channel; z uses the same shuffle from the second
1912
                    // gather 256i - 0    1    2    3    4    5    6    7
1913
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1914

1915
                    Value* temp = JOIN_16(temp_lo, temp_hi);
1916

1917
                    // denormalize if needed
1918
                    if (conversionType != CONVERT_NONE)
1919
                    {
1920
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1921
                    }
1922

1923
                    vVertexElements[currentVertexElement] = temp;
1924

1925
                    currentVertexElement += 1;
1926
                }
1927
                else
1928
                {
1929
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1930
                }
1931

1932
                if (currentVertexElement > 3)
1933
                {
1934
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1935
                    // reset to the next vVertexElement to output
1936
                    currentVertexElement = 0;
1937
                }
1938
            }
1939
        }
1940
    }
1941
    else
1942
    {
1943
        SWR_INVALID("Unsupported conversion type");
1944
    }
1945
}
1946

1947
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
1948
{
1949
    // Unpack tuple args
1950
    Value*(&vGatherResult)[2]                       = std::get<0>(args);
1951
    Value*                     pVtxOut              = std::get<1>(args);
1952
    const Instruction::CastOps extendType           = std::get<2>(args);
1953
    const ConversionType       conversionType       = std::get<3>(args);
1954
    uint32_t&                  currentVertexElement = std::get<4>(args);
1955
    uint32_t&                  outputElt            = std::get<5>(args);
1956
    const ComponentEnable      compMask             = std::get<6>(args);
1957
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1958
    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1959

1960
    // cast types
1961
    Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1962
    Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1963

1964
    // have to do extra work for sign extending
1965
    if ((extendType == Instruction::CastOps::SExt) ||
1966
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1967
    {
1968
        // is this PP float?
1969
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1970

1971
        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
1972
        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
1973
                                          mVWidth / 4); // vwidth is units of 32 bits
1974

1975
        // shuffle mask
1976
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1977
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1978
        Value* vi128XY    = nullptr;
1979
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1980
        {
1981
            Value* vShufResult =
1982
                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1983
            // after pshufb: group components together in each 128bit lane
1984
            // 256i - 0    1    2    3    4    5    6    7
1985
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1986

1987
            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1988
            // after PERMD: move and pack xy components into each 128bit lane
1989
            // 256i - 0    1    2    3    4    5    6    7
1990
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1991
        }
1992

1993
        // do the same for zw components
1994
        Value* vi128ZW = nullptr;
1995
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1996
        {
1997
            Value* vShufResult =
1998
                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1999
            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2000
        }
2001

2002
        // init denormalize variables if needed
2003
        Instruction::CastOps IntToFpCast;
2004
        Value*               conversionFactor;
2005

2006
        switch (conversionType)
2007
        {
2008
        case CONVERT_NORMALIZED:
2009
            IntToFpCast      = Instruction::CastOps::SIToFP;
2010
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2011
            break;
2012
        case CONVERT_SSCALED:
2013
            IntToFpCast      = Instruction::CastOps::SIToFP;
2014
            conversionFactor = VIMMED1((float)(1.0));
2015
            break;
2016
        case CONVERT_USCALED:
2017
            SWR_INVALID("Type should not be sign extended!");
2018
            conversionFactor = nullptr;
2019
            break;
2020
        default:
2021
            SWR_ASSERT(conversionType == CONVERT_NONE);
2022
            conversionFactor = nullptr;
2023
            break;
2024
        }
2025

2026
        // sign extend all enabled components. If we have a fill vVertexElements, output to current
2027
        // simdvertex
2028
        for (uint32_t i = 0; i < 4; i++)
2029
        {
2030
            if (isComponentEnabled(compMask, i))
2031
            {
2032
                if (compCtrl[i] == ComponentControl::StoreSrc)
2033
                {
2034
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2035
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2036
                    // if x or y, use vi128XY permute result, else use vi128ZW
2037
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2038

2039
                    if (bFP)
2040
                    {
2041
                        // extract 128 bit lanes to sign extend each component
2042
                        vVertexElements[currentVertexElement] =
2043
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2044
                    }
2045
                    else
2046
                    {
2047
                        // extract 128 bit lanes to sign extend each component
2048
                        vVertexElements[currentVertexElement] =
2049
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2050

2051
                        // denormalize if needed
2052
                        if (conversionType != CONVERT_NONE)
2053
                        {
2054
                            vVertexElements[currentVertexElement] =
2055
                                FMUL(CAST(IntToFpCast,
2056
                                          vVertexElements[currentVertexElement],
2057
                                          mSimdFP32Ty),
2058
                                     conversionFactor);
2059
                        }
2060
                    }
2061
                    currentVertexElement++;
2062
                }
2063
                else
2064
                {
2065
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2066
                }
2067

2068
                if (currentVertexElement > 3)
2069
                {
2070
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2071
                    // reset to the next vVertexElement to output
2072
                    currentVertexElement = 0;
2073
                }
2074
            }
2075
        }
2076
    }
2077
    // else zero extend
2078
    else if ((extendType == Instruction::CastOps::ZExt) ||
2079
             (extendType == Instruction::CastOps::UIToFP))
2080
    {
2081
        // pshufb masks for each component
2082
        Value* vConstMask[2];
2083
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2084
        {
2085
            // x/z shuffle mask
2086
            vConstMask[0] = C<char>({
2087
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2088
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2089
            });
2090
        }
2091

2092
        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2093
        {
2094
            // y/w shuffle mask
2095
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2096
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2097
        }
2098

2099
        // init denormalize variables if needed
2100
        Instruction::CastOps fpCast;
2101
        Value*               conversionFactor;
2102

2103
        switch (conversionType)
2104
        {
2105
        case CONVERT_NORMALIZED:
2106
            fpCast           = Instruction::CastOps::UIToFP;
2107
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2108
            break;
2109
        case CONVERT_USCALED:
2110
            fpCast           = Instruction::CastOps::UIToFP;
2111
            conversionFactor = VIMMED1((float)(1.0f));
2112
            break;
2113
        case CONVERT_SSCALED:
2114
            SWR_INVALID("Type should not be zero extended!");
2115
            conversionFactor = nullptr;
2116
            break;
2117
        default:
2118
            SWR_ASSERT(conversionType == CONVERT_NONE);
2119
            conversionFactor = nullptr;
2120
            break;
2121
        }
2122

2123
        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2124
        for (uint32_t i = 0; i < 4; i++)
2125
        {
2126
            if (isComponentEnabled(compMask, i))
2127
            {
2128
                if (compCtrl[i] == ComponentControl::StoreSrc)
2129
                {
2130
                    // select correct constMask for x/z or y/w pshufb
2131
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2132
                    // if x or y, use vi128XY permute result, else use vi128ZW
2133
                    uint32_t selectedGather = (i < 2) ? 0 : 1;
2134

2135
                    vVertexElements[currentVertexElement] =
2136
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
2137
                                       vConstMask[selectedMask]),
2138
                                vGatherTy);
2139
                    // after pshufb mask for x channel; z uses the same shuffle from the second
2140
                    // gather 256i - 0    1    2    3    4    5    6    7
2141
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2142

2143
                    // denormalize if needed
2144
                    if (conversionType != CONVERT_NONE)
2145
                    {
2146
                        vVertexElements[currentVertexElement] =
2147
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
2148
                                 conversionFactor);
2149
                    }
2150
                    currentVertexElement++;
2151
                }
2152
                else
2153
                {
2154
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2155
                }
2156

2157
                if (currentVertexElement > 3)
2158
                {
2159
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2160
                    // reset to the next vVertexElement to output
2161
                    currentVertexElement = 0;
2162
                }
2163
            }
2164
        }
2165
    }
2166
    else
2167
    {
2168
        SWR_INVALID("Unsupported conversion type");
2169
    }
2170
}
2171

2172
//////////////////////////////////////////////////////////////////////////
2173
/// @brief Output a simdvertex worth of elements to the current outputElt
2174
/// @param pVtxOut - base address of VIN output struct
2175
/// @param outputElt - simdvertex offset in VIN to write to
2176
/// @param numEltsToStore - number of simdvertex rows to write out
2177
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
2178
void FetchJit::StoreVertexElements(Value*         pVtxOut,
2179
                                   const uint32_t outputElt,
2180
                                   const uint32_t numEltsToStore,
2181
                                   Value* (&vVertexElements)[4])
2182
{
2183
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2184

2185
    for (uint32_t c = 0; c < numEltsToStore; ++c)
2186
    {
2187
        // STORE expects FP32 x vWidth type, just bitcast if needed
2188
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2189
        {
2190
#if FETCH_DUMP_VERTEX
2191
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2192
#endif
2193
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2194
        }
2195
#if FETCH_DUMP_VERTEX
2196
        else
2197
        {
2198
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2199
        }
2200
#endif
2201
        // outputElt * 4 = offsetting by the size of a simdvertex
2202
        // + c offsets to a 32bit x vWidth row within the current vertex
2203
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2204
        STORE(vVertexElements[c], dest);
2205
    }
2206
}
2207

2208
//////////////////////////////////////////////////////////////////////////
2209
/// @brief Generates a constant vector of values based on the
2210
/// ComponentControl value
2211
/// @param ctrl - ComponentControl value
2212
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2213
{
2214
    switch (ctrl)
2215
    {
2216
    case NoStore:
2217
        return VUNDEF_I();
2218
    case Store0:
2219
        return VIMMED1(0);
2220
    case Store1Fp:
2221
        return VIMMED1(1.0f);
2222
    case Store1Int:
2223
        return VIMMED1(1);
2224
    case StoreVertexId:
2225
    {
2226
        if (mVWidth == 16)
2227
        {
2228
            Type*  pSimd8FPTy = getVectorType(mFP32Ty, 8);
2229
            Value* pIdLo =
2230
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2231
            Value* pIdHi =
2232
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2233
            return JOIN_16(pIdLo, pIdHi);
2234
        }
2235
        else
2236
        {
2237
            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2238
        }
2239
    }
2240
    case StoreInstanceId:
2241
    {
2242
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2243
        return VBROADCAST(pId);
2244
    }
2245

2246

2247
    case StoreSrc:
2248
    default:
2249
        SWR_INVALID("Invalid component control");
2250
        return VUNDEF_I();
2251
    }
2252
}
2253

2254
//////////////////////////////////////////////////////////////////////////
2255
/// @brief Returns the enable mask for the specified component.
2256
/// @param enableMask - enable bits
2257
/// @param component - component to check if enabled.
2258
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2259
{
2260
    switch (component)
2261
    {
2262
        // X
2263
    case 0:
2264
        return (enableMask & ComponentEnable::X);
2265
        // Y
2266
    case 1:
2267
        return (enableMask & ComponentEnable::Y);
2268
        // Z
2269
    case 2:
2270
        return (enableMask & ComponentEnable::Z);
2271
        // W
2272
    case 3:
2273
        return (enableMask & ComponentEnable::W);
2274

2275
    default:
2276
        return false;
2277
    }
2278
}
2279

2280
// Don't want two threads compiling the same fetch shader simultaneously
2281
// Has problems in the JIT cache implementation
2282
// This is only a problem for fetch right now.
2283
static std::mutex gFetchCodegenMutex;
2284

2285
//////////////////////////////////////////////////////////////////////////
2286
/// @brief JITs from fetch shader IR
2287
/// @param hJitMgr - JitManager handle
2288
/// @param func   - LLVM function IR
2289
/// @return PFN_FETCH_FUNC - pointer to fetch code
2290
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2291
{
2292
    const llvm::Function* func    = (const llvm::Function*)hFunc;
2293
    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2294
    PFN_FETCH_FUNC        pfnFetch;
2295

2296
    gFetchCodegenMutex.lock();
2297
    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2298
    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
2299
    // add new IR to the module
2300
    pJitMgr->mIsModuleFinalized = true;
2301

2302
#if defined(KNOB_SWRC_TRACING)
2303
    char        fName[1024];
2304
    const char* funcName = func->getName().data();
2305
    sprintf(fName, "%s.bin", funcName);
2306
    FILE* fd = fopen(fName, "wb");
2307
    fwrite((void*)pfnFetch, 1, 2048, fd);
2308
    fclose(fd);
2309
#endif
2310

2311
    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2312
    gFetchCodegenMutex.unlock();
2313

2314

2315
    return pfnFetch;
2316
}
2317

2318
//////////////////////////////////////////////////////////////////////////
2319
/// @brief JIT compiles fetch shader
2320
/// @param hJitMgr - JitManager handle
2321
/// @param state   - fetch state to build function from
2322
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2323
{
2324
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2325

2326
    pJitMgr->SetupNewModule();
2327

2328
    FetchJit theJit(pJitMgr);
2329
    HANDLE   hFunc = theJit.Create(state);
2330

2331
    return JitFetchFunc(hJitMgr, hFunc);
2332
}
2333

2334
Product

Resources

Company