Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for memory builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

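    // Note (editorial, hedged): many of the GEP/LOAD/STORE wrappers in this file
    // accept a Type* and MEM_CLIENT parameter that the base Builder ignores. They
    // are presumably kept in the signatures so that derived builders (e.g.
    // BuilderGfxMem, referenced in the assert above) can override the same entry
    // points and use the type and usage information for translated GFX accesses.
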
    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
        Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name,
                            Type*                                  Ty,
                            MEM_CLIENT                             usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value*                               basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine&                   name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst*
        Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
        Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

    Value* Builder::MEM_ADD(Value*                                 i32Incr,
                            Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

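    // Per-lane behavior of the gathers above, as described by the parameter docs
    // (illustrative sketch, not part of the original source):
    //
    //     for each SIMD lane i:
    //         dst[i] = vMask[i] ? *(T*)(pBase + vIndices[i] * scale) : vSrc[i]
    //
    // where T is float for GATHERPS and a 32-bit integer for GATHERDD.
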
    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
        Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values used for inactive lanes
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }

    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }

    void Builder::Gather4(const SWR_FORMAT format,
                          Value*           pSrcBase,
                          Value*           byteOffsets,
                          Value*           mask,
                          Value*           vGatherComponents[],
                          bool             bPackedOutput,
                          MEM_CLIENT       usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

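    // For example (format names illustrative): R32G32B32A32_FLOAT has
    // type[0] == SWR_TYPE_FLOAT and bpc[0] == 32, so it takes the GATHER4PS path
    // above, while an integer format such as R8G8B8A8_UINT falls through to
    // GATHER4DD.
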
    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

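    // Note on the 16bpc case above: each 32-bit gather lane picks up two packed
    // 16-bit components, so one gather at pSrcBase fetches xy and, when
    // numComps > 2, a second gather at pSrcBase + 4 bytes fetches zw;
    // Shuffle16bpcGather4 then separates the components into per-component rows.
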
    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

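    // In the 32bpc cases above, all four output rows are seeded with the format's
    // default values first, so components the format lacks (e.g. alpha in a
    // 3-component format) still hold defined data; the gather loop then overwrites
    // only the info.numComps components, placed via info.swizzle.
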
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                            mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw)
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

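    // PSHUFB note for the shuffles above and below: a -1 control byte (high bit
    // set) zeroes the corresponding destination byte, which is what zero-extends
    // each selected 16-bit (above) or 8-bit (below) component to 32 bits in the
    // unpacked paths.
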
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                         mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

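    // Per-lane behavior of the scatter below, as described by its parameter docs
    // (illustrative sketch, not part of the original source):
    //
    //     for each SIMD lane i:
    //         if (vMask[i]) *(float*)(pDst + vOffsets[i]) = vSrc[i]
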
    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem    = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))
        */

        /*
            // Reference implementation kept around for reference

            BasicBlock* pCurBB = IRB()->GetInsertBlock();
            Function*   pFunc  = pCurBB->getParent();
            Type*       pSrcTy = vSrc->getType()->getVectorElementType();

            // Store vectors on stack
            if (pScatterStackSrc == nullptr)
            {
                // Save off stack allocations and reuse per scatter. Significantly reduces stack
                // requirements for shaders with a lot of scatters.
                pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
                pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
            }

            Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
            Value* pOffsetsArrayPtr = pScatterStackOffsets;
            STORE(vSrc, pSrcArrayPtr);
            STORE(vOffsets, pOffsetsArrayPtr);

            // Cast to pointers for random access
            pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
            pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

            Value* pMask = VMOVMSK(vMask);

            // Setup loop basic block
            BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

            // compute first set bit
            Value* pIndex = CTTZ(pMask, C(false));

            Value* pIsUndef = ICMP_EQ(pIndex, C(32));

            // Split current block or create new one if building inline
            BasicBlock* pPostLoop;
            if (pCurBB->getTerminator())
            {
                pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

                // Remove unconditional jump created by splitBasicBlock
                pCurBB->getTerminator()->eraseFromParent();

                // Add terminator to end of original block
                IRB()->SetInsertPoint(pCurBB);

                // Add conditional branch
                COND_BR(pIsUndef, pPostLoop, pLoop);
            }
            else
            {
                pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

                // Add conditional branch
                COND_BR(pIsUndef, pPostLoop, pLoop);
            }

            // Add loop basic block contents
            IRB()->SetInsertPoint(pLoop);
            PHINode* pIndexPhi = PHI(mInt32Ty, 2);
            PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

            pIndexPhi->addIncoming(pIndex, pCurBB);
            pMaskPhi->addIncoming(pMask, pCurBB);

            // Extract elements for this index
            Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
            Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

            // GEP to this offset in dst
            Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
            pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
            STORE(pSrcElem, pCurDst);

            // Update the mask
            Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

            // Terminator
            Value* pNewIndex = CTTZ(pNewMask, C(false));

            pIsUndef = ICMP_EQ(pNewIndex, C(32));
            COND_BR(pIsUndef, pPostLoop, pLoop);

            // Update phi edges
            pIndexPhi->addIncoming(pNewIndex, pLoop);
            pMaskPhi->addIncoming(pNewMask, pLoop);

            // Move builder to beginning of post loop
            IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit