Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file fetch_jit.cpp
 *
 * @brief Implementation of the fetch jitter
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder_gfx_mem.h"
#include "jit_api.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
#include "functionpasses/passes.h"

//#define FETCH_DUMP_VERTEX 1
using namespace llvm;
using namespace SwrJit;

bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);

enum ConversionType
{
    CONVERT_NONE,
    CONVERT_NORMALIZED,
    CONVERT_USCALED,
    CONVERT_SSCALED,
    CONVERT_SFIXED,
};

//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public BuilderGfxMem
{
    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {}

    Function* Create(const FETCH_COMPILE_STATE& fetchState);

    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
    template <typename T>
    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&,
                       Value*,
                       const Instruction::CastOps,
                       const ConversionType,
                       uint32_t&,
                       uint32_t&,
                       const ComponentEnable,
                       const ComponentControl (&)[4],
                       Value* (&)[4],
                       const uint32_t (&)[4]>
        Shuffle8bpcArgs;

    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);

    typedef std::tuple<Value* (&)[2],
                       Value*,
                       const Instruction::CastOps,
                       const ConversionType,
                       uint32_t&,
                       uint32_t&,
                       const ComponentEnable,
                       const ComponentControl (&)[4],
                       Value* (&)[4]>
        Shuffle16bpcArgs;

    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);

    void StoreVertexElements(Value* pVtxOut,
                             const uint32_t outputElt,
                             const uint32_t numEltsToStore,
                             Value* (&vVertexElements)[4]);

    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                           Value* streams,
                           Value* vIndices,
                           Value* pVtxOut);

    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(
        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

    Value* mpFetchInfo;
};
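// Note (illustrative sketch, not part of the jitter): the Shuffle8bpcArgs/Shuffle16bpcArgs
// typedefs above are tuples of references. They are built with std::forward_as_tuple so the
// Shuffle*bpcGather* helpers can update the caller's currentVertexElement and vVertexElements
// in place. A minimal sketch of that idiom, using hypothetical names:
#if 0
#include <cstdint>
#include <tuple>

static void exampleShuffle(std::tuple<uint32_t&, int (&)[4]>& args)
{
    uint32_t& currentVertexElement = std::get<0>(args); // reference into the caller
    int(&vertexElements)[4]        = std::get<1>(args); // caller's array

    vertexElements[currentVertexElement++] = 42;        // both writes are visible to the caller
}

static void exampleCaller()
{
    uint32_t currentVertexElement = 0;
    int      vertexElements[4]    = {};

    auto args = std::forward_as_tuple(currentVertexElement, vertexElements);
    exampleShuffle(args);
    // currentVertexElement == 1, vertexElements[0] == 42
}
#endif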
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));

    Function* fetch = Function::Create(
        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    fetch->getParent()->setModuleIdentifier(fetch->getName());

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments
    Value* privateContext = &*argitr;
    ++argitr;
    privateContext->setName("privateContext");
    SetPrivateContext(privateContext);

    mpWorkerData = &*argitr;
    ++argitr;
    mpWorkerData->setName("pWorkerData");

    mpFetchInfo = &*argitr;
    ++argitr;
    mpFetchInfo->setName("fetchInfo");
    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");

    uint32_t baseWidth = mVWidth;

    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);

    // Override builder target width to force 16-wide SIMD
#if USE_SIMD16_SHADERS
    SetTargetWidth(16);
#endif

    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
    pLastIndex->setName("pLastIndex");

    Value* vIndices;
    switch (fetchState.indexType)
    {
    case R8_UINT:
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
        }
        break;
    case R16_UINT:
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
        }
        break;
    case R32_UINT:
        (fetchState.bDisableIndexOOBCheck)
            ? vIndices = LOAD(indices,
                              "",
                              PointerType::get(mSimdInt32Ty, 0),
                              MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
        break; // incoming type is already 32bit int
    default:
        vIndices = nullptr;
        assert(false && "Unsupported index type");
        break;
    }

    if (fetchState.bForceSequentialAccessEnable)
    {
        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});

        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vIndices = ADD(vIndices, pOffsets);
    }

    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
        // correct
        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vVertexId           = ADD(vIndices, vBaseVertex);
        vVertexId           = ADD(vVertexId, vStartVertex);
    }

    // store out vertex IDs
    if (mVWidth == 16)
    {
        // store out in simd8 halves until core supports 16-wide natively
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
    }
    else if (mVWidth == 8)
    {
        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
    }

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));

        if (mVWidth == 16)
        {
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
        }
        else if (mVWidth == 8)
        {
            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
        }
    }

    // Fetch attributes from memory and output to a simdvertex struct
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();

    JitManager::DumpToFile(fetch, "src");

#if defined(_DEBUG)
    verifyFunction(*fetch);
#endif

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
#if LLVM_VERSION_MAJOR <= 11
    optPasses.add(createConstantPropagationPass());
#endif
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);

    optPasses.add(createLowerX86Pass(this));
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");

    // Revert 16-wide override
#if USE_SIMD16_SHADERS
    SetTargetWidth(baseWidth);
#endif

    return fetch;
}
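// Note (illustrative sketch, not part of the jitter): the GetSimdValid*bitIndices paths
// selected in the index-type switch above clamp out-of-bounds index reads rather than
// faulting. Per SIMD lane the emitted code is equivalent to this scalar sketch (the
// helper name is hypothetical):
#if 0
#include <cstdint>

template <typename T>
static uint32_t LoadIndexOrZero(const T* pIndex, const T* pLastIndex)
{
    // Lanes whose index address is at or past pLastIndex contribute index 0.
    return (pIndex < pLastIndex) ? static_cast<uint32_t>(*pIndex) : 0u;
}
#endif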
// returns true for odd formats that require special state.gather handling
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    {
        return true;
    }
    return false;
}

// format is uniform if all components are the same size and type
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    uint32_t bpc0  = info.bpc[0];
    uint32_t type0 = info.type[0];

    for (uint32_t c = 1; c < info.numComps; ++c)
    {
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
        {
            return false;
        }
    }
    return true;
}
// unpacks components based on format
// foreach component in the pixel
//   mask off everything but this component
//   shift component to LSB
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t swizzledIndex = info.swizzle[c];
        uint32_t compBits      = info.bpc[c];
        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
        Value*   comp          = AND(vInput, bitmask);
        comp                   = LSHR(comp, bitOffset);

        result[swizzledIndex] = comp;
        bitOffset += compBits;
    }
}
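// Note (illustrative sketch, not part of the jitter): scalar equivalent of the mask/shift
// loop in UnpackComponents above, written out for a hypothetical 10:10:10:2 packed pixel
// (bpc = {10, 10, 10, 2}, identity swizzle).
#if 0
#include <cstdint>

static void UnpackExample_10_10_10_2(uint32_t pixel, uint32_t result[4])
{
    result[0] = (pixel >> 0) & 0x3FF;  // bits  0..9
    result[1] = (pixel >> 10) & 0x3FF; // bits 10..19
    result[2] = (pixel >> 20) & 0x3FF; // bits 20..29
    result[3] = (pixel >> 30) & 0x3;   // bits 30..31
}
#endif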
// gather for odd component size formats
// gather SIMD full pixels per lane then shift/mask to move each component to their
// own vector
void FetchJit::CreateGatherOddFormats(
    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    // only works if pixel size is <= 32bits
    SWR_ASSERT(info.bpp <= 32);

    Value* pGather;
    if (info.bpp == 32)
    {
        pGather =
            GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
    }
    else
    {
        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
        Value* pMem = ALLOCA(mSimdInt32Ty);
        STORE(VIMMED1(0u), pMem);

        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);

        for (uint32_t lane = 0; lane < mVWidth; ++lane)
        {
            // Get index
            Value* index = VEXTRACT(pOffsets, C(lane));
            Value* mask  = VEXTRACT(pMask, C(lane));

            // use branch around load based on mask
            // Needed to avoid page-faults on unmasked lanes
            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
            BasicBlock* pMaskedLoadBlock =
                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
            BasicBlock* pEndLoadBB =
                BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());

            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);

            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);

            switch (info.bpp)
            {
            case 8:
            {
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            case 16:
            {
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }
            break;

            case 24:
            {
                // First 16-bits of data
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);

                // Last 8-bits of data
                pDst  = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
                xpSrc = ADD(xpSrc, C((int64_t)2));
                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            default:
                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
                break;
            }

            BR(pEndLoadBB);
            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
        }

        pGather = LOAD(pMem);
    }

    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        pResult[comp] = VIMMED1((int)info.defaults[comp]);
    }

    UnpackComponents(format, pGather, pResult);

    // cast to fp32
    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
}

void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000
                /// maps to -1.0f.

                /// result = c * (1.0f / (2^(n-1) - 1);
                uint32_t n      = info.bpc[c];
                uint32_t pow2   = 1 << (n - 1);
                float    scale  = 1.0f / (float)(pow2 - 1);
                Value*   vScale = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n    = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP
                // requirement
                if (n == 24)
                {
                    float  scale  = (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    float  scale  = 1.0f / (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
            continue;
        }
    }
}
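// Note (illustrative sketch, not part of the jitter): the scalar math ConvertFormat emits
// for normalized formats, shown for 8-bit components. UNORM maps 255 -> 1.0f; SNORM maps
// 127 -> 1.0f, and as the @todo above points out, the most-negative SNORM encoding (-128)
// lands slightly below -1.0f with this scale.
#if 0
#include <cstdint>

static float ConvertUnorm8(uint8_t v)
{
    return (float)v * (1.0f / 255.0f); // 1.0f / (2^8 - 1)
}

static float ConvertSnorm8(int8_t v)
{
    return (float)v * (1.0f / 127.0f); // 1.0f / (2^(8-1) - 1)
}
#endif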
//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using AVX2 GATHER(s)
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                                 Value*                     streams,
                                 Value*                     vIndices,
                                 Value*                     pVtxOut)
{
    uint32_t currentVertexElement = 0;
    uint32_t outputElt            = 0;
    Value*   vVertexElements[4];

    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    curInstance->setName("curInstance");

    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
    {
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];

        // skip element if all components are disabled
        if (ied.ComponentPacking == ComponentEnable::NONE)
        {
            continue;
        }

        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
        uint32_t bpc =
            info.bpp /
            info.numComps; ///@todo Code below assumes all components are same size.
Need to fix.570571Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});572573Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});574Value* vStride = VBROADCAST(stride);575576// max vertex index that is fully in bounds577Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});578maxVertex = LOAD(maxVertex);579580Value* minVertex = NULL;581if (fetchState.bPartialVertexBuffer)582{583// min vertex index for low bounds OOB checking584minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});585minVertex = LOAD(minVertex);586}587588if (fetchState.bInstanceIDOffsetEnable)589{590// the InstanceID (curInstance) value is offset by StartInstanceLocation591curInstance = ADD(curInstance, startInstance);592}593594Value* vCurIndices;595Value* startOffset;596Value* vInstanceStride = VIMMED1(0);597598if (ied.InstanceEnable)599{600Value* stepRate = C(ied.InstanceAdvancementState);601602// prevent a div by 0 for 0 step rate603Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));604stepRate = SELECT(isNonZeroStep, stepRate, C(1));605606// calc the current offset into instanced data buffer607Value* calcInstance = UDIV(curInstance, stepRate);608609// if step rate is 0, every instance gets instance 0610calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));611612vCurIndices = VBROADCAST(calcInstance);613startOffset = startInstance;614}615else if (ied.InstanceStrideEnable)616{617// grab the instance advancement state, determines stride in bytes from one instance to618// the next619Value* stepRate = C(ied.InstanceAdvancementState);620vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));621622// offset indices by baseVertex623vCurIndices = ADD(vIndices, vBaseVertex);624625startOffset = startVertex;626SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");627}628else629{630// offset indices by baseVertex631vCurIndices = ADD(vIndices, vBaseVertex);632startOffset = startVertex;633}634635// All of the OOB calculations are in vertices, not VB offsets, to prevent having to636// do 64bit address offset calculations.637638// calculate byte offset to the start of the VB639Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));640641// VGATHER* takes an *i8 src pointer so that's what stream is642Value* pStreamBaseGFX = ADD(stream, baseOffset);643644// if we have a start offset, subtract from max vertex. Used for OOB check645maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));646Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));647// if we have a negative value, we're already OOB. 
clamp at 0.648maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));649650if (fetchState.bPartialVertexBuffer)651{652// similary for min vertex653minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));654Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));655minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));656}657658// Load the in bounds size of a partially valid vertex659Value* partialInboundsSize =660GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});661partialInboundsSize = LOAD(partialInboundsSize);662Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);663Value* vBpp = VBROADCAST(C(info.Bpp));664Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));665666// is the element is <= the partially valid size667Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));668669// override cur indices with 0 if pitch is 0670Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));671vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);672673// are vertices partially OOB?674Value* vMaxVertex = VBROADCAST(maxVertex);675Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);676677// are vertices fully in bounds?678Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);679680Value* vGatherMask;681if (fetchState.bPartialVertexBuffer)682{683// are vertices below minVertex limit?684Value* vMinVertex = VBROADCAST(minVertex);685Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);686687// only fetch lanes that pass both tests688vGatherMask = AND(vMaxGatherMask, vMinGatherMask);689}690else691{692vGatherMask = vMaxGatherMask;693}694695// blend in any partially OOB indices that have valid elements696vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);697698// calculate the actual offsets into the VB699Value* vOffsets = MUL(vCurIndices, vStride);700vOffsets = ADD(vOffsets, vAlignmentOffsets);701702// if instance stride enable is:703// true - add product of the instanceID and advancement state to the offset into the VB704// false - value of vInstanceStride has been initialized to zero705vOffsets = ADD(vOffsets, vInstanceStride);706707// Packing and component control708ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;709const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,710(ComponentControl)ied.ComponentControl1,711(ComponentControl)ied.ComponentControl2,712(ComponentControl)ied.ComponentControl3};713714// Special gather/conversion for formats without equal component sizes715if (IsOddFormat((SWR_FORMAT)ied.Format))716{717Value* pResults[4];718CreateGatherOddFormats(719(SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);720ConvertFormat((SWR_FORMAT)ied.Format, pResults);721722for (uint32_t c = 0; c < 4; c += 1)723{724if (isComponentEnabled(compMask, c))725{726vVertexElements[currentVertexElement++] = pResults[c];727if (currentVertexElement > 3)728{729StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);730// reset to the next vVertexElement to output731currentVertexElement = 0;732}733}734}735}736else if (info.type[0] == SWR_TYPE_FLOAT)737{738///@todo: support 64 bit vb accesses739Value* gatherSrc = VIMMED1(0.0f);740741SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),742"Unsupported format for standard gather fetch.");743744// Gather components from memory to store in a simdvertex structure745switch (bpc)746{747case 16:748{749Value* vGatherResult[2];750751// if we have at least one 
component out of x or y to fetch752if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))753{754vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);755// e.g. result of first 8x32bit integer gather for 16bit components756// 256i - 0 1 2 3 4 5 6 7757// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy758//759}760761// if we have at least one component out of z or w to fetch762if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))763{764// offset base to the next components(zw) in the vertex to gather765pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));766767vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);768// e.g. result of second 8x32bit integer gather for 16bit components769// 256i - 0 1 2 3 4 5 6 7770// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw771//772}773774// if we have at least one component to shuffle into place775if (compMask)776{777Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,778pVtxOut,779Instruction::CastOps::FPExt,780CONVERT_NONE,781currentVertexElement,782outputElt,783compMask,784compCtrl,785vVertexElements);786787// Shuffle gathered components into place in simdvertex struct788mVWidth == 16 ? Shuffle16bpcGather16(args)789: Shuffle16bpcGather(args); // outputs to vVertexElements ref790}791}792break;793case 32:794{795for (uint32_t i = 0; i < 4; i += 1)796{797if (isComponentEnabled(compMask, i))798{799// if we need to gather the component800if (compCtrl[i] == StoreSrc)801{802// Gather a SIMD of vertices803// APIs allow a 4GB range for offsets804// However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(805// Add 2GB to the base pointer and 2GB to the offsets. This makes806// "negative" (large) offsets into positive offsets and small offsets807// into negative offsets.808Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));809vVertexElements[currentVertexElement++] =810GATHERPS(gatherSrc,811ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),812vNewOffsets,813vGatherMask,8141,815MEM_CLIENT::GFX_MEM_CLIENT_FETCH);816}817else818{819vVertexElements[currentVertexElement++] =820GenerateCompCtrlVector(compCtrl[i]);821}822823if (currentVertexElement > 3)824{825StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);826// reset to the next vVertexElement to output827currentVertexElement = 0;828}829}830831// offset base to the next component in the vertex to gather832pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));833}834}835break;836case 64:837{838for (uint32_t i = 0; i < 4; i += 1)839{840if (isComponentEnabled(compMask, i))841{842// if we need to gather the component843if (compCtrl[i] == StoreSrc)844{845Value* vShufLo;846Value* vShufHi;847Value* vShufAll;848849if (mVWidth == 8)850{851vShufLo = C({0, 1, 2, 3});852vShufHi = C({4, 5, 6, 7});853vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});854}855else856{857SWR_ASSERT(mVWidth == 16);858vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});859vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});860vShufAll =861C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});862}863864Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);865Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);866867Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);868Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);869870Value* vZeroDouble = VECTOR_SPLAT(871mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));872873Value* pGatherLo =874GATHERPD(vZeroDouble, pStreamBaseGFX, 
vOffsetsLo, vMaskLo);875Value* pGatherHi =876GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);877878Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);879pGather = FP_TRUNC(pGather, mSimdFP32Ty);880881vVertexElements[currentVertexElement++] = pGather;882}883else884{885vVertexElements[currentVertexElement++] =886GenerateCompCtrlVector(compCtrl[i]);887}888889if (currentVertexElement > 3)890{891StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);892// reset to the next vVertexElement to output893currentVertexElement = 0;894}895}896897// offset base to the next component in the vertex to gather898pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));899}900}901break;902default:903SWR_INVALID("Tried to fetch invalid FP format");904break;905}906}907else908{909Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;910ConversionType conversionType = CONVERT_NONE;911912SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),913"Unsupported format for standard gather fetch.");914915switch (info.type[0])916{917case SWR_TYPE_UNORM:918conversionType = CONVERT_NORMALIZED;919case SWR_TYPE_UINT:920extendCastType = Instruction::CastOps::ZExt;921break;922case SWR_TYPE_SNORM:923conversionType = CONVERT_NORMALIZED;924case SWR_TYPE_SINT:925extendCastType = Instruction::CastOps::SExt;926break;927case SWR_TYPE_USCALED:928conversionType = CONVERT_USCALED;929extendCastType = Instruction::CastOps::UIToFP;930break;931case SWR_TYPE_SSCALED:932conversionType = CONVERT_SSCALED;933extendCastType = Instruction::CastOps::SIToFP;934break;935case SWR_TYPE_SFIXED:936conversionType = CONVERT_SFIXED;937extendCastType = Instruction::CastOps::SExt;938break;939default:940break;941}942943// value substituted when component of gather is masked944Value* gatherSrc = VIMMED1(0);945946// Gather components from memory to store in a simdvertex structure947switch (bpc)948{949case 8:950{951// if we have at least one component to fetch952if (compMask)953{954Value* vGatherResult = GATHERDD(gatherSrc,955pStreamBaseGFX,956vOffsets,957vGatherMask,9581,959MEM_CLIENT::GFX_MEM_CLIENT_FETCH);960// e.g. result of an 8x32bit integer gather for 8bit components961// 256i - 0 1 2 3 4 5 6 7962// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw963964Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,965pVtxOut,966extendCastType,967conversionType,968currentVertexElement,969outputElt,970compMask,971compCtrl,972vVertexElements,973info.swizzle);974975// Shuffle gathered components into place in simdvertex struct976mVWidth == 16 ? Shuffle8bpcGatherd16(args)977: Shuffle8bpcGatherd(args); // outputs to vVertexElements ref978}979}980break;981case 16:982{983Value* vGatherResult[2];984985// if we have at least one component out of x or y to fetch986if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))987{988vGatherResult[0] = GATHERDD(gatherSrc,989pStreamBaseGFX,990vOffsets,991vGatherMask,9921,993MEM_CLIENT::GFX_MEM_CLIENT_FETCH);994// e.g. result of first 8x32bit integer gather for 16bit components995// 256i - 0 1 2 3 4 5 6 7996// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy997//998}9991000// if we have at least one component out of z or w to fetch1001if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))1002{1003// offset base to the next components(zw) in the vertex to gather1004pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));10051006vGatherResult[1] = GATHERDD(gatherSrc,1007pStreamBaseGFX,1008vOffsets,1009vGatherMask,10101,1011MEM_CLIENT::GFX_MEM_CLIENT_FETCH);1012// e.g. 
result of second 8x32bit integer gather for 16bit components1013// 256i - 0 1 2 3 4 5 6 71014// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw1015//1016}10171018// if we have at least one component to shuffle into place1019if (compMask)1020{1021Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,1022pVtxOut,1023extendCastType,1024conversionType,1025currentVertexElement,1026outputElt,1027compMask,1028compCtrl,1029vVertexElements);10301031// Shuffle gathered components into place in simdvertex struct1032mVWidth == 16 ? Shuffle16bpcGather16(args)1033: Shuffle16bpcGather(args); // outputs to vVertexElements ref1034}1035}1036break;1037case 32:1038{1039// Gathered components into place in simdvertex struct1040for (uint32_t i = 0; i < 4; i++)1041{1042if (isComponentEnabled(compMask, i))1043{1044// if we need to gather the component1045if (compCtrl[i] == StoreSrc)1046{1047Value* pGather = GATHERDD(gatherSrc,1048pStreamBaseGFX,1049vOffsets,1050vGatherMask,10511,1052MEM_CLIENT::GFX_MEM_CLIENT_FETCH);10531054if (conversionType == CONVERT_USCALED)1055{1056pGather = UI_TO_FP(pGather, mSimdFP32Ty);1057}1058else if (conversionType == CONVERT_SSCALED)1059{1060pGather = SI_TO_FP(pGather, mSimdFP32Ty);1061}1062else if (conversionType == CONVERT_SFIXED)1063{1064pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),1065VBROADCAST(C(1 / 65536.0f)));1066}10671068vVertexElements[currentVertexElement++] = pGather;10691070// e.g. result of a single 8x32bit integer gather for 32bit components1071// 256i - 0 1 2 3 4 5 6 71072// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx1073}1074else1075{1076vVertexElements[currentVertexElement++] =1077GenerateCompCtrlVector(compCtrl[i]);1078}10791080if (currentVertexElement > 3)1081{1082StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);10831084// reset to the next vVertexElement to output1085currentVertexElement = 0;1086}1087}10881089// offset base to the next component in the vertex to gather1090pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));1091}1092}1093break;1094}1095}1096}10971098// if we have a partially filled vVertexElement struct, output it1099if (currentVertexElement > 0)1100{1101StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);1102}1103}110411051106typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);11071108template <typename T>1109void GetSimdValidIndicesGfx(gfxptr_t indices,1110gfxptr_t lastIndex,1111uint32_t vWidth,1112PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,1113void* pdc,1114uint32_t* outIndices,1115void* pWorkerData)1116{1117SWR_ASSERT(outIndices != nullptr);11181119gfxptr_t indexPtr = indices;1120for (int64_t lane = 0; lane < vWidth; lane++)1121{1122uint32_t index = 0;11231124if (indexPtr < lastIndex)1125{1126// translate indexPtr and load from it1127T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);1128SWR_ASSERT(addr != nullptr);1129index = *addr;1130}11311132// index to 32 bits and insert into the correct simd lane1133outIndices[lane] = index;11341135indexPtr += sizeof(T);1136}1137}11381139void GetSimdValid8bitIndicesGfx(gfxptr_t indices,1140gfxptr_t lastIndex,1141uint32_t vWidth,1142PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,1143void* pdc,1144uint32_t* outIndices,1145void* pWorkerData)1146{1147GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);1148}11491150void GetSimdValid16bitIndicesGfx(gfxptr_t indices,1151gfxptr_t lastIndex,1152uint32_t vWidth,1153PFN_TRANSLATEGFXADDRESS_FUNC 
pfnTranslate,1154void* pdc,1155uint32_t* outIndices,1156void* pWorkerData)1157{1158GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);1159}116011611162template <typename T>1163Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)1164{1165SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,1166"Function expects gfxptr_t for both input parameters.");11671168Type* Ty = nullptr;11691170static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),1171"Unsupported type for use with GetSimdValidIndicesHelper<T>");1172constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));1173if (bSize)1174{1175Ty = mInt16PtrTy;1176}1177else if (sizeof(T) == sizeof(uint8_t))1178{1179Ty = mInt8PtrTy;1180}1181else1182{1183SWR_ASSERT(false, "This should never happen as per static_assert above.");1184}11851186Value* vIndices = VUNDEF_I();11871188{1189// store 0 index on stack to be used to conditionally load from if index address is OOB1190Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());1191STORE(C((T)0), pZeroIndex);11921193// Load a SIMD of index pointers1194for (int64_t lane = 0; lane < mVWidth; lane++)1195{1196// Calculate the address of the requested index1197Value* pIndex = GEP(pIndices, C(lane), Ty);11981199pLastIndex = INT_TO_PTR(pLastIndex, Ty);12001201// check if the address is less than the max index,1202Value* mask = ICMP_ULT(pIndex, pLastIndex);12031204// if valid, load the index. if not, load 0 from the stack1205Value* pValid = SELECT(mask, pIndex, pZeroIndex);1206Value* index = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);12071208// zero extended index to 32 bits and insert into the correct simd lane1209index = Z_EXT(index, mInt32Ty);1210vIndices = VINSERT(vIndices, index, lane);1211}1212}12131214return vIndices;1215}12161217//////////////////////////////////////////////////////////////////////////1218/// @brief Loads a simd of valid indices. OOB indices are set to 01219/// *Note* have to do 8bit index checking in scalar until we have AVX-5121220/// support1221/// @param pIndices - pointer to 8 bit indices1222/// @param pLastIndex - pointer to last valid index1223Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)1224{1225return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);1226}12271228//////////////////////////////////////////////////////////////////////////1229/// @brief Loads a simd of valid indices. OOB indices are set to 01230/// *Note* have to do 16bit index checking in scalar until we have AVX-5121231/// support1232/// @param pIndices - pointer to 16 bit indices1233/// @param pLastIndex - pointer to last valid index1234Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)1235{1236return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);1237}12381239//////////////////////////////////////////////////////////////////////////1240/// @brief Loads a simd of valid indices. 
OOB indices are set to 01241/// @param pIndices - pointer to 32 bit indices1242/// @param pLastIndex - pointer to last valid index1243Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)1244{1245DataLayout dL(JM()->mpCurrentModule);1246Value* iLastIndex = pLastIndex;1247Value* iIndices = pIndices;12481249// get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)1250Value* numIndicesLeft = SUB(iLastIndex, iIndices);1251numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);1252numIndicesLeft = SDIV(numIndicesLeft, C(4));12531254// create a vector of index counts from the base index ptr passed into the fetch1255Constant* vIndexOffsets;1256if (mVWidth == 8)1257{1258vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});1259}1260else1261{1262vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});1263}12641265// compare index count to the max valid index1266// e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load1267// vIndexOffsets 0 1 2 3 4 5 6 71268// ------------------------------1269// vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass1270// vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 01271Value* vMaxIndex = VBROADCAST(numIndicesLeft);1272Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);12731274// Load the indices; OOB loads 01275return MASKED_LOAD(pIndices,12764,1277vIndexMask,1278VIMMED1(0),1279"vIndices",1280PointerType::get(mSimdInt32Ty, 0),1281MEM_CLIENT::GFX_MEM_CLIENT_FETCH);1282}12831284//////////////////////////////////////////////////////////////////////////1285/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,1286/// denormalizes if needed, converts to F32 if needed, and positions in1287// the proper SIMD rows to be output to the simdvertex structure1288/// @param args: (tuple of args, listed below)1289/// @param vGatherResult - 8 gathered 8bpc vertices1290/// @param pVtxOut - base pointer to output simdvertex struct1291/// @param extendType - sign extend or zero extend1292/// @param bNormalized - do we need to denormalize?1293/// @param currentVertexElement - reference to the current vVertexElement1294/// @param outputElt - reference to the current offset from simdvertex we're o1295/// @param compMask - component packing mask1296/// @param compCtrl - component control val1297/// @param vVertexElements[4] - vertex components to output1298/// @param swizzle[4] - component swizzle location1299void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)1300{1301// Unpack tuple args1302Value*& vGatherResult = std::get<0>(args);1303Value* pVtxOut = std::get<1>(args);1304const Instruction::CastOps extendType = std::get<2>(args);1305const ConversionType conversionType = std::get<3>(args);1306uint32_t& currentVertexElement = std::get<4>(args);1307uint32_t& outputElt = std::get<5>(args);1308const ComponentEnable compMask = std::get<6>(args);1309const ComponentControl(&compCtrl)[4] = std::get<7>(args);1310Value*(&vVertexElements)[4] = std::get<8>(args);1311const uint32_t(&swizzle)[4] = std::get<9>(args);13121313// cast types1314Type* vGatherTy = getVectorType(mInt32Ty, 8);1315Type* v32x8Ty = getVectorType(mInt8Ty, 32);13161317// have to do extra work for sign extending1318if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))1319{1320Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane1321Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);13221323// shuffle mask, including any swizzling1324const char x = 
(char)swizzle[0];1325const char y = (char)swizzle[1];1326const char z = (char)swizzle[2];1327const char w = (char)swizzle[3];1328Value* vConstMask = C<char>(1329{char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),1330char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),1331char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),1332char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),1333char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),1334char(w + 8), char(w + 12)});13351336// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..13371338Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);1339Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);13401341Value* vShufResult_lo =1342BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);1343Value* vShufResult_hi =1344BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);13451346// after pshufb: group components together in each 128bit lane1347// 256i - 0 1 2 3 4 5 6 71348// xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww13491350Value* vi128XY_lo = nullptr;1351Value* vi128XY_hi = nullptr;1352if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))1353{1354vi128XY_lo = BITCAST(1355VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),1356v128Ty);1357vi128XY_hi = BITCAST(1358VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),1359v128Ty);13601361// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane1362// 256i - 0 1 2 3 4 5 6 71363// xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)1364}13651366// do the same for zw components1367Value* vi128ZW_lo = nullptr;1368Value* vi128ZW_hi = nullptr;1369if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))1370{1371vi128ZW_lo = BITCAST(1372VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),1373v128Ty);1374vi128ZW_hi = BITCAST(1375VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),1376v128Ty);1377}13781379// init denormalize variables if needed1380Instruction::CastOps fpCast;1381Value* conversionFactor;13821383switch (conversionType)1384{1385case CONVERT_NORMALIZED:1386fpCast = Instruction::CastOps::SIToFP;1387conversionFactor = VIMMED1((float)(1.0 / 127.0));1388break;1389case CONVERT_SSCALED:1390fpCast = Instruction::CastOps::SIToFP;1391conversionFactor = VIMMED1((float)(1.0));1392break;1393case CONVERT_USCALED:1394assert(false && "Type should not be sign extended!");1395conversionFactor = nullptr;1396break;1397default:1398assert(conversionType == CONVERT_NONE);1399conversionFactor = nullptr;1400break;1401}14021403// sign extend all enabled components. If we have a fill vVertexElements, output to current1404// simdvertex1405for (uint32_t i = 0; i < 4; i++)1406{1407if (isComponentEnabled(compMask, i))1408{1409if (compCtrl[i] == ComponentControl::StoreSrc)1410{1411// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 11412uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;1413// if x or y, use vi128XY permute result, else use vi128ZW1414Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;1415Value* selectedPermute_hi = (i < 2) ? 
vi128XY_hi : vi128ZW_hi;14161417// sign extend1418Value* temp_lo =1419PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));1420Value* temp_hi =1421PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));14221423Value* temp = JOIN_16(temp_lo, temp_hi);14241425// denormalize if needed1426if (conversionType != CONVERT_NONE)1427{1428temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);1429}14301431vVertexElements[currentVertexElement] = temp;14321433currentVertexElement += 1;1434}1435else1436{1437vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);1438}14391440if (currentVertexElement > 3)1441{1442StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);1443// reset to the next vVertexElement to output1444currentVertexElement = 0;1445}1446}1447}1448}1449// else zero extend1450else if ((extendType == Instruction::CastOps::ZExt) ||1451(extendType == Instruction::CastOps::UIToFP))1452{1453// init denormalize variables if needed1454Instruction::CastOps fpCast;1455Value* conversionFactor;14561457switch (conversionType)1458{1459case CONVERT_NORMALIZED:1460fpCast = Instruction::CastOps::UIToFP;1461conversionFactor = VIMMED1((float)(1.0 / 255.0));1462break;1463case CONVERT_USCALED:1464fpCast = Instruction::CastOps::UIToFP;1465conversionFactor = VIMMED1((float)(1.0));1466break;1467case CONVERT_SSCALED:1468assert(false && "Type should not be zero extended!");1469conversionFactor = nullptr;1470break;1471default:1472assert(conversionType == CONVERT_NONE);1473conversionFactor = nullptr;1474break;1475}14761477// shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits1478for (uint32_t i = 0; i < 4; i++)1479{1480if (isComponentEnabled(compMask, i))1481{1482if (compCtrl[i] == ComponentControl::StoreSrc)1483{1484// pshufb masks for each component1485Value* vConstMask;1486switch (swizzle[i])1487{1488case 0:1489// x shuffle mask1490vConstMask =1491C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,14920, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});1493break;1494case 1:1495// y shuffle mask1496vConstMask =1497C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,14981, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});1499break;1500case 2:1501// z shuffle mask1502vConstMask =1503C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,15042, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});1505break;1506case 3:1507// w shuffle mask1508vConstMask =1509C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,15103, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});1511break;1512default:1513assert(false && "Invalid component");1514vConstMask = nullptr;1515break;1516}15171518Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);1519Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);15201521Value* temp_lo =1522BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);1523Value* temp_hi =1524BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);15251526// after pshufb for x channel1527// 256i - 0 1 2 3 4 5 6 71528// x000 x000 x000 x000 x000 x000 x000 x00015291530Value* temp = JOIN_16(temp_lo, temp_hi);15311532// denormalize if needed1533if (conversionType != CONVERT_NONE)1534{1535temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);1536}15371538vVertexElements[currentVertexElement] = temp;15391540currentVertexElement += 
1;1541}1542else1543{1544vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);1545}15461547if (currentVertexElement > 3)1548{1549StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);1550// reset to the next vVertexElement to output1551currentVertexElement = 0;1552}1553}1554}1555}1556else1557{1558SWR_INVALID("Unsupported conversion type");1559}1560}15611562void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)1563{1564// Unpack tuple args1565Value*& vGatherResult = std::get<0>(args);1566Value* pVtxOut = std::get<1>(args);1567const Instruction::CastOps extendType = std::get<2>(args);1568const ConversionType conversionType = std::get<3>(args);1569uint32_t& currentVertexElement = std::get<4>(args);1570uint32_t& outputElt = std::get<5>(args);1571const ComponentEnable compMask = std::get<6>(args);1572const ComponentControl(&compCtrl)[4] = std::get<7>(args);1573Value*(&vVertexElements)[4] = std::get<8>(args);1574const uint32_t(&swizzle)[4] = std::get<9>(args);15751576// cast types1577Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits15781579for (uint32_t i = 0; i < 4; i++)1580{1581if (!isComponentEnabled(compMask, i))1582continue;15831584if (compCtrl[i] == ComponentControl::StoreSrc)1585{1586#if LLVM_VERSION_MAJOR >= 111587using MaskType = int32_t;1588#else1589using MaskType = uint32_t;1590#endif1591std::vector<MaskType> vShuffleMasks[4] = {1592{0, 4, 8, 12, 16, 20, 24, 28}, // x1593{1, 5, 9, 13, 17, 21, 25, 29}, // y1594{2, 6, 10, 14, 18, 22, 26, 30}, // z1595{3, 7, 11, 15, 19, 23, 27, 31}, // w1596};15971598Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),1599UndefValue::get(v32x8Ty),1600vShuffleMasks[swizzle[i]]);16011602if ((extendType == Instruction::CastOps::SExt) ||1603(extendType == Instruction::CastOps::SIToFP))1604{1605switch (conversionType)1606{1607case CONVERT_NORMALIZED:1608val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));1609break;1610case CONVERT_SSCALED:1611val = SI_TO_FP(val, mSimdFP32Ty);1612break;1613case CONVERT_USCALED:1614SWR_INVALID("Type should not be sign extended!");1615break;1616default:1617SWR_ASSERT(conversionType == CONVERT_NONE);1618val = S_EXT(val, mSimdInt32Ty);1619break;1620}1621}1622else if ((extendType == Instruction::CastOps::ZExt) ||1623(extendType == Instruction::CastOps::UIToFP))1624{1625switch (conversionType)1626{1627case CONVERT_NORMALIZED:1628val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));1629break;1630case CONVERT_SSCALED:1631SWR_INVALID("Type should not be zero extended!");1632break;1633case CONVERT_USCALED:1634val = UI_TO_FP(val, mSimdFP32Ty);1635break;1636default:1637SWR_ASSERT(conversionType == CONVERT_NONE);1638val = Z_EXT(val, mSimdInt32Ty);1639break;1640}1641}1642else1643{1644SWR_INVALID("Unsupported conversion type");1645}16461647vVertexElements[currentVertexElement++] = val;1648}1649else1650{1651vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);1652}16531654if (currentVertexElement > 3)1655{1656StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);1657// reset to the next vVertexElement to output1658currentVertexElement = 0;1659}1660}1661}16621663//////////////////////////////////////////////////////////////////////////1664/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,1665/// denormalizes if needed, converts to F32 if needed, and positions in1666// the proper SIMD rows to be output to the simdvertex structure1667/// @param args: (tuple of args, listed below)1668/// 
@param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index1669/// @param pVtxOut - base pointer to output simdvertex struct1670/// @param extendType - sign extend or zero extend1671/// @param bNormalized - do we need to denormalize?1672/// @param currentVertexElement - reference to the current vVertexElement1673/// @param outputElt - reference to the current offset from simdvertex we're o1674/// @param compMask - component packing mask1675/// @param compCtrl - component control val1676/// @param vVertexElements[4] - vertex components to output1677void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)1678{1679// Unpack tuple args1680Value*(&vGatherResult)[2] = std::get<0>(args);1681Value* pVtxOut = std::get<1>(args);1682const Instruction::CastOps extendType = std::get<2>(args);1683const ConversionType conversionType = std::get<3>(args);1684uint32_t& currentVertexElement = std::get<4>(args);1685uint32_t& outputElt = std::get<5>(args);1686const ComponentEnable compMask = std::get<6>(args);1687const ComponentControl(&compCtrl)[4] = std::get<7>(args);1688Value*(&vVertexElements)[4] = std::get<8>(args);16891690// cast types1691Type* vGatherTy = getVectorType(mInt32Ty, 8);1692Type* v32x8Ty = getVectorType(mInt8Ty, 32);16931694// have to do extra work for sign extending1695if ((extendType == Instruction::CastOps::SExt) ||1696(extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))1697{1698// is this PP float?1699bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;17001701Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane1702Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);17031704// shuffle mask1705Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,17060, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});1707Value* vi128XY_lo = nullptr;1708Value* vi128XY_hi = nullptr;1709if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))1710{1711// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for1712// now..17131714Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);1715Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);17161717Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);1718Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);17191720// after pshufb: group components together in each 128bit lane1721// 256i - 0 1 2 3 4 5 6 71722// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy17231724vi128XY_lo = BITCAST(1725VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),1726v128bitTy);1727vi128XY_hi = BITCAST(1728VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),1729v128bitTy);17301731// after PERMD: move and pack xy components into each 128bit lane1732// 256i - 0 1 2 3 4 5 6 71733// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy1734}17351736// do the same for zw components1737Value* vi128ZW_lo = nullptr;1738Value* vi128ZW_hi = nullptr;1739if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))1740{1741Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);1742Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);17431744Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);1745Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);17461747vi128ZW_lo 
= BITCAST(1748VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),1749v128bitTy);1750vi128ZW_hi = BITCAST(1751VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),1752v128bitTy);1753}17541755// init denormalize variables if needed1756Instruction::CastOps IntToFpCast;1757Value* conversionFactor;17581759switch (conversionType)1760{1761case CONVERT_NORMALIZED:1762IntToFpCast = Instruction::CastOps::SIToFP;1763conversionFactor = VIMMED1((float)(1.0 / 32767.0));1764break;1765case CONVERT_SSCALED:1766IntToFpCast = Instruction::CastOps::SIToFP;1767conversionFactor = VIMMED1((float)(1.0));1768break;1769case CONVERT_USCALED:1770assert(false && "Type should not be sign extended!");1771conversionFactor = nullptr;1772break;1773default:1774assert(conversionType == CONVERT_NONE);1775conversionFactor = nullptr;1776break;1777}17781779// sign extend all enabled components. If we have a fill vVertexElements, output to current1780// simdvertex1781for (uint32_t i = 0; i < 4; i++)1782{1783if (isComponentEnabled(compMask, i))1784{1785if (compCtrl[i] == ComponentControl::StoreSrc)1786{1787// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 11788uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;1789// if x or y, use vi128XY permute result, else use vi128ZW1790Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;1791Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;17921793if (bFP)1794{1795// extract 128 bit lanes to sign extend each component1796Value* temp_lo =1797CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));1798Value* temp_hi =1799CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));18001801vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);1802}1803else1804{1805// extract 128 bit lanes to sign extend each component1806Value* temp_lo =1807PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));1808Value* temp_hi =1809PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));18101811Value* temp = JOIN_16(temp_lo, temp_hi);18121813// denormalize if needed1814if (conversionType != CONVERT_NONE)1815{1816temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);1817}18181819vVertexElements[currentVertexElement] = temp;1820}18211822currentVertexElement += 1;1823}1824else1825{1826vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);1827}18281829if (currentVertexElement > 3)1830{1831StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);1832// reset to the next vVertexElement to output1833currentVertexElement = 0;1834}1835}1836}1837}1838// else zero extend1839else if ((extendType == Instruction::CastOps::ZExt) ||1840(extendType == Instruction::CastOps::UIToFP))1841{1842// pshufb masks for each component1843Value* vConstMask[2];18441845if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))1846{1847// x/z shuffle mask1848vConstMask[0] = C<char>({18490, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,18500, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,1851});1852}18531854if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))1855{1856// y/w shuffle mask1857vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,18582, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});1859}18601861// init denormalize variables if needed1862Instruction::CastOps fpCast;1863Value* conversionFactor;18641865switch (conversionType)1866{1867case 

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else use the second
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of
                    // KNL, for now..

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);
                    Value* temp_hi = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
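
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD8 variant of Shuffle16bpcGather16 above: takes two SIMDs of
/// gathered 16bpc components, sign or zero extends (or converts from half
/// float), optionally denormalizes, and writes the results into vVertexElements.
/// @param args - Shuffle16bpcArgs tuple; fields have the same meaning as in
/// Shuffle16bpcGather16.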
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value* (&vGatherResult)[2]            = std::get<0>(args);
    Value* pVtxOut                        = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType   = std::get<3>(args);
    uint32_t& currentVertexElement        = std::get<4>(args);
    uint32_t& outputElt                   = std::get<5>(args);
    const ComponentEnable compMask        = std::get<6>(args);
    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4]          = std::get<8>(args);

    // cast types
    Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this a half-precision (PP) float?
        bool bFP = (extendType == Instruction::CastOps::FPExt);

        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                        mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }
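
        // The permute index {0, 1, 4, 5, 2, 3, 6, 7} selects 32-bit lanes so that the four
        // x-carrying dwords land in the low 128-bit lane and the four y-carrying dwords in
        // the high 128-bit lane (likewise z/w for the second gather), which is the layout
        // the per-component extraction below relies on.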

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes to convert each half-float component
                        vVertexElements[currentVertexElement] =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] =
                                FMUL(CAST(IntToFpCast,
                                          vVertexElements[currentVertexElement],
                                          mSimdFP32Ty),
                                     conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
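
        // Illustrative arithmetic (comment only, no IR is generated here): with
        // CONVERT_NORMALIZED an unsigned 16-bit value v becomes float(v) * (1.0f / 65535.0f),
        // so 65535 maps to 1.0 and 32768 to ~0.5; with CONVERT_USCALED the value is simply
        // converted to float.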

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else use the second
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] =
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
                                       vConstMask[selectedMask]),
                                vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] =
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
                                 conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
void FetchJit::StoreVertexElements(Value*         pVtxOut,
                                   const uint32_t outputElt,
                                   const uint32_t numEltsToStore,
                                   Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
        STORE(vVertexElements[c], dest);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore:
        return VUNDEF_I();
    case Store0:
        return VIMMED1(0);
    case Store1Fp:
        return VIMMED1(1.0f);
    case Store1Int:
        return VIMMED1(1);
    case StoreVertexId:
    {
        if (mVWidth == 16)
        {
            Type*  pSimd8FPTy = getVectorType(mFP32Ty, 8);
            Value* pIdLo =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
            Value* pIdHi =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
            return JOIN_16(pIdLo, pIdHi);
        }
        else
        {
            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
        }
    }
    case StoreInstanceId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
        return VBROADCAST(pId);
    }

    case StoreSrc:
    default:
        SWR_INVALID("Invalid component control");
        return VUNDEF_I();
    }
}
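
// These constant vectors are what the Shuffle*Gather loops above store into a
// vVertexElements slot whenever compCtrl[i] is not StoreSrc. When a vertex format
// supplies fewer than four components, the fetch state typically fills the missing
// ones with Store0/Store1Fp, so a two-component position, for example, ends up as
// (x, y, 0.0f, 1.0f) in the output simdvertex.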

//////////////////////////////////////////////////////////////////////////
/// @brief Returns the enable mask for the specified component.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0:
        return (enableMask & ComponentEnable::X);
    // Y
    case 1:
        return (enableMask & ComponentEnable::Y);
    // Z
    case 2:
        return (enableMask & ComponentEnable::Z);
    // W
    case 3:
        return (enableMask & ComponentEnable::W);

    default:
        return false;
    }
}

// Don't want two threads compiling the same fetch shader simultaneously;
// the JIT cache implementation has problems with that. This is only a
// problem for fetch right now.
static std::mutex gFetchCodegenMutex;

//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
/// @param hJitMgr - JitManager handle
/// @param func - LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func    = (const llvm::Function*)hFunc;
    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC        pfnFetch;

    gFetchCodegenMutex.lock();
    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time you JIT code from them. After finalization, you
    // cannot add new IR to the module.
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char        fName[1024];
    const char* funcName = func->getName().data();
    snprintf(fName, sizeof(fName), "%s.bin", funcName);
    FILE* fd = fopen(fName, "wb");
    fwrite((void*)pfnFetch, 1, 2048, fd);
    fclose(fd);
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
    gFetchCodegenMutex.unlock();

    return pfnFetch;
}

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE   hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}