Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
4574 views
/****************************************************************************1* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22* @file streamout_jit.cpp23*24* @brief Implementation of the streamout jitter25*26* Notes:27*28******************************************************************************/29#include "jit_pch.hpp"30#include "builder_gfx_mem.h"31#include "jit_api.h"32#include "streamout_jit.h"33#include "gen_state_llvm.h"34#include "functionpasses/passes.h"3536using namespace llvm;37using namespace SwrJit;3839//////////////////////////////////////////////////////////////////////////40/// Interface to Jitting a fetch shader41//////////////////////////////////////////////////////////////////////////42struct StreamOutJit : public BuilderGfxMem43{44StreamOutJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr){};4546// returns pointer to SWR_STREAMOUT_BUFFER47Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)48{49return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer});50}5152//////////////////////////////////////////////////////////////////////////53// @brief checks if streamout buffer is oob54// @return <i1> true/false55Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)56{57Value* returnMask = C(false);5859Value* pBuf = getSOBuffer(pSoCtx, buffer);6061// load enable62// @todo bool data types should generate <i1> llvm type63Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty());6465// load buffer size66Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize});6768// load current streamOffset69Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});7071// load buffer pitch72Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});7374// buffer is considered oob if in use in a decl but not enabled75returnMask = OR(returnMask, NOT(enabled));7677// buffer is oob if cannot fit a prims worth of verts78Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));79returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));8081return returnMask;82}8384//////////////////////////////////////////////////////////////////////////85// @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,86// packing the active mask bits87// ex. bitmask 0011 -> (0, 1, 0, 0)88// bitmask 1000 -> (3, 0, 0, 0)89// bitmask 1100 -> (2, 3, 0, 0)90Value* PackMask(uint32_t bitmask)91{92std::vector<Constant*> indices(4, C(0));93unsigned long index;94uint32_t elem = 0;95while (_BitScanForward(&index, bitmask))96{97indices[elem++] = C((int)index);98bitmask &= ~(1 << index);99}100101return ConstantVector::get(indices);102}103104//////////////////////////////////////////////////////////////////////////105// @brief convert scalar bitmask to <4xfloat> bitmask106Value* ToMask(uint32_t bitmask)107{108std::vector<Constant*> indices;109for (uint32_t i = 0; i < 4; ++i)110{111if (bitmask & (1 << i))112{113indices.push_back(C(true));114}115else116{117indices.push_back(C(false));118}119}120return ConstantVector::get(indices);121}122123//////////////////////////////////////////////////////////////////////////124// @brief processes a single decl from the streamout stream. Reads 4 components from the input125// stream and writes N components to the output buffer given the componentMask or if126// a hole, just increments the buffer pointer127// @param pStream - pointer to current attribute128// @param pOutBuffers - pointers to the current location of each output buffer129// @param decl - input decl130void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)131{132uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);133uint32_t packedMask = (1 << numComponents) - 1;134if (!decl.hole)135{136// increment stream pointer to correct slot137Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));138139// load 4 components from stream140Type* simd4Ty = getVectorType(IRB()->getFloatTy(), 4);141Type* simd4PtrTy = PointerType::get(simd4Ty, 0);142pAttrib = BITCAST(pAttrib, simd4PtrTy);143Value* vattrib = LOAD(pAttrib);144145// shuffle/pack enabled components146Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));147148// store to output buffer149// cast SO buffer to i8*, needed by maskstore150Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0));151152// cast input to <4xfloat>153Value* src = BITCAST(vpackedAttrib, simd4Ty);154155// cast mask to <4xi1>156Value* mask = ToMask(packedMask);157MASKED_STORE(src, pOut, 4, mask, PointerType::get(simd4Ty, 0), MEM_CLIENT::GFX_MEM_CLIENT_STREAMOUT);158}159160// increment SO buffer161pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));162}163164//////////////////////////////////////////////////////////////////////////165// @brief builds a single vertex worth of data for the given stream166// @param streamState - state for this stream167// @param pCurVertex - pointer to src stream vertex data168// @param pOutBuffer - pointers to up to 4 SO buffers169void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])170{171for (uint32_t d = 0; d < streamState.numDecls; ++d)172{173const STREAMOUT_DECL& decl = streamState.decl[d];174buildDecl(pCurVertex, pOutBuffer, decl);175}176}177178void buildStream(const STREAMOUT_COMPILE_STATE& state,179const STREAMOUT_STREAM& streamState,180Value* pSoCtx,181BasicBlock* returnBB,182Function* soFunc)183{184// get list of active SO buffers185std::unordered_set<uint32_t> activeSOBuffers;186for (uint32_t d = 0; d < streamState.numDecls; ++d)187{188const STREAMOUT_DECL& decl = streamState.decl[d];189activeSOBuffers.insert(decl.bufferIndex);190}191192// always increment numPrimStorageNeeded193Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});194numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));195STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});196197// check OOB on active SO buffers. If any buffer is out of bound, don't write198// the primitive to any buffer199Value* oobMask = C(false);200for (uint32_t buffer : activeSOBuffers)201{202oobMask = OR(oobMask, oob(state, pSoCtx, buffer));203}204205BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);206207// early out if OOB208COND_BR(oobMask, returnBB, validBB);209210IRB()->SetInsertPoint(validBB);211212Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});213numPrimsWritten = ADD(numPrimsWritten, C(1));214STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});215216// compute start pointer for each output buffer217Value* pOutBuffer[4];218Value* pOutBufferStartVertex[4];219Value* outBufferPitch[4];220for (uint32_t b : activeSOBuffers)221{222Value* pBuf = getSOBuffer(pSoCtx, b);223Value* pData = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer});224Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});225pOutBuffer[b] = GEP(pData, streamOffset, PointerType::get(IRB()->getInt32Ty(), 0));226pOutBufferStartVertex[b] = pOutBuffer[b];227228outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});229}230231// loop over the vertices of the prim232Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData});233for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)234{235buildVertex(streamState, pStreamData, pOutBuffer);236237// increment stream and output buffer pointers238// stream verts are always 32*4 dwords apart239pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));240241// output buffers offset using pitch in buffer state242for (uint32_t b : activeSOBuffers)243{244pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);245pOutBuffer[b] = pOutBufferStartVertex[b];246}247}248249// update each active buffer's streamOffset250for (uint32_t b : activeSOBuffers)251{252Value* pBuf = getSOBuffer(pSoCtx, b);253Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});254streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));255STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});256}257}258259Function* Create(const STREAMOUT_COMPILE_STATE& state)260{261std::stringstream fnName("SO_",262std::ios_base::in | std::ios_base::out | std::ios_base::ate);263fnName << ComputeCRC(0, &state, sizeof(state));264265std::vector<Type*> args{266mInt8PtrTy,267mInt8PtrTy,268PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*269};270271FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);272Function* soFunc = Function::Create(273fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);274275soFunc->getParent()->setModuleIdentifier(soFunc->getName());276277// create return basic block278BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);279BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);280281IRB()->SetInsertPoint(entry);282283// arguments284auto argitr = soFunc->arg_begin();285286Value* privateContext = &*argitr++;287privateContext->setName("privateContext");288SetPrivateContext(privateContext);289290mpWorkerData = &*argitr;291++argitr;292mpWorkerData->setName("pWorkerData");293294Value* pSoCtx = &*argitr++;295pSoCtx->setName("pSoCtx");296297const STREAMOUT_STREAM& streamState = state.stream;298buildStream(state, streamState, pSoCtx, returnBB, soFunc);299300BR(returnBB);301302IRB()->SetInsertPoint(returnBB);303RET_VOID();304305JitManager::DumpToFile(soFunc, "SoFunc");306307::FunctionPassManager passes(JM()->mpCurrentModule);308309passes.add(createBreakCriticalEdgesPass());310passes.add(createCFGSimplificationPass());311passes.add(createEarlyCSEPass());312passes.add(createPromoteMemoryToRegisterPass());313passes.add(createCFGSimplificationPass());314passes.add(createEarlyCSEPass());315passes.add(createInstructionCombiningPass());316#if LLVM_VERSION_MAJOR <= 11317passes.add(createConstantPropagationPass());318#endif319passes.add(createSCCPPass());320passes.add(createAggressiveDCEPass());321322passes.add(createLowerX86Pass(this));323324passes.run(*soFunc);325326JitManager::DumpToFile(soFunc, "SoFunc_optimized");327328329return soFunc;330}331};332333//////////////////////////////////////////////////////////////////////////334/// @brief JITs from streamout shader IR335/// @param hJitMgr - JitManager handle336/// @param func - LLVM function IR337/// @return PFN_SO_FUNC - pointer to SOS function338PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)339{340llvm::Function* func = (llvm::Function*)hFunc;341JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);342PFN_SO_FUNC pfnStreamOut;343pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));344// MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot345// add new IR to the module346pJitMgr->mIsModuleFinalized = true;347348pJitMgr->DumpAsm(func, "SoFunc_optimized");349350351return pfnStreamOut;352}353354//////////////////////////////////////////////////////////////////////////355/// @brief JIT compiles streamout shader356/// @param hJitMgr - JitManager handle357/// @param state - SO state to build function from358extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr,359const STREAMOUT_COMPILE_STATE& state)360{361JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);362363STREAMOUT_COMPILE_STATE soState = state;364if (soState.offsetAttribs)365{366for (uint32_t i = 0; i < soState.stream.numDecls; ++i)367{368soState.stream.decl[i].attribSlot -= soState.offsetAttribs;369}370}371372pJitMgr->SetupNewModule();373374StreamOutJit theJit(pJitMgr);375HANDLE hFunc = theJit.Create(soState);376377return JitStreamoutFunc(hJitMgr, hFunc);378}379380381