Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file lower_x86.cpp
*
* @brief llvm pass to lower meta code to x86
*
* Notes:
*
******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);

namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
} // namespace llvm

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

    struct X86Intrinsic
    {
        IntrinsicID intrin[NUM_WIDTHS];
        EmuFunc     emuFunc;
    };
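
    // Two lowering mechanisms follow: a legacy table (getIntrinsicMap) that maps a meta
    // intrinsic name directly to a single x86 intrinsic, and a per-architecture table
    // (getIntrinsicMapAdvanced) that selects a native intrinsic per target width, double
    // pumps a narrower intrinsic (DOUBLE), or falls back to an emulation callback.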

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get
    // the previous behavior of mapping directly to avx/avx2 intrinsics.
    using intrinsicMap_t = std::map<std::string, IntrinsicID>;
    static intrinsicMap_t& getIntrinsicMap()
    {
        static std::map<std::string, IntrinsicID> intrinsicMap = {
            {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
            {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
            {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
            {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
            {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
            {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
            {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
            {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc}
        };
        return intrinsicMap;
    }

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin);

    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;

    using intrinsicMapAdvanced_t = std::vector<std::map<std::string, X86Intrinsic>>;

    static intrinsicMapAdvanced_t& getIntrinsicMapAdvanced()
    {
        // clang-format off
        static intrinsicMapAdvanced_t intrinsicMapAdvanced = {
            //                                256 wide                              512 wide
            {
                // AVX
                {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256,      DOUBLE},                    NO_EMU}},
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VPERM_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VPERM_EMU}},
                {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VSCATTER_EMU}},
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256,  Intrinsic::not_intrinsic},  NO_EMU}},
                {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256,    DOUBLE},                    NO_EMU}},
                {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256,     DOUBLE},                    NO_EMU}},
            },
            {
                // AVX2
                {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256,      DOUBLE},                    NO_EMU}},
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx2_permps,         Intrinsic::not_intrinsic},  VPERM_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx2_permd,          Intrinsic::not_intrinsic},  VPERM_EMU}},
                {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VSCATTER_EMU}},
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256,  DOUBLE},                    NO_EMU}},
                {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256,    DOUBLE},                    NO_EMU}},
                {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256,     DOUBLE},                    NO_EMU}},
            },
            {
                // AVX512
                {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VPERM_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VPERM_EMU}},
#endif
                {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VGATHER_EMU}},
                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VCONVERT_EMU}},
#endif
                {"meta.intrinsic.VROUND",     {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VROUND_EMU}},
                {"meta.intrinsic.VHSUBPS",    {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic},  VHSUB_EMU}}
            }};
        // clang-format on
        return intrinsicMapAdvanced;
    }

    static uint32_t getBitWidth(VectorType* pVTy)
    {
#if LLVM_VERSION_MAJOR >= 12
        return cast<FixedVectorType>(pVTy)->getNumElements() *
               pVTy->getElementType()->getPrimitiveSizeInBits();
#elif LLVM_VERSION_MAJOR >= 11
        return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
#else
        return pVTy->getBitWidth();
#endif
    }

    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }

            // Setup scatter function for 256 wide
            uint32_t curWidth = B->mVWidth;
            B->SetTargetWidth(8);
            std::vector<Type*> args = {
                B->mInt8PtrTy,   // pBase
                B->mSimdInt32Ty, // vIndices
                B->mSimdFP32Ty,  // vSrc
                B->mInt8Ty,      // mask
                B->mInt32Ty      // scale
            };

            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
            mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
            }

            B->SetTargetWidth(curWidth);
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            assert(pCallInst);
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                Value* pOp = pCallInst->getOperand(0);
                assert(pOp);
                pVecTy = pOp->getType();
            }

            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(getVectorType(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }

        // Convert <N x i1> mask to <N x i32> x86 mask
        Value* VectorMask(Value* vi1Mask)
        {
#if LLVM_VERSION_MAJOR >= 12
            uint32_t numElem = cast<FixedVectorType>(vi1Mask->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
            uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
#else
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
#endif
            return B->S_EXT(vi1Mask, getVectorType(B->mInt32Ty, numElem));
        }
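
        // Lower a single meta intrinsic call using the per-architecture table: use the
        // native intrinsic for the requested width when one exists, double pump the next
        // smaller width when the table entry is DOUBLE, and otherwise invoke the entry's
        // emulation callback. The AVX512 masked intrinsics also take a passthrough src and
        // mask, so a zero vector and an all-ones mask are appended for them.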

        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            auto& intrinsic = getIntrinsicMapAdvanced()[mTarget][pFunc->getName().str()];
            TargetWidth vecWidth;
            Type* pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            IntrinsicID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                           "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src
                // and full mask for now, assuming the intrinsics are consistent and place the
                // src operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }

            SWR_ASSERT(false);
            return nullptr;
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            // Forward to the advanced support if found
            if (getIntrinsicMapAdvanced()[mTarget].find(pFunc->getName().str()) !=
                getIntrinsicMapAdvanced()[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(getIntrinsicMap().find(pFunc->getName().str()) != getIntrinsicMap().end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName().str().c_str());

            Intrinsic::ID x86Intrinsic = getIntrinsicMap()[pFunc->getName().str()];
            Function* pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param f - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
            std::vector<BasicBlock*>  bbs;

            // Make temp copy of the basic blocks and instructions, as the intrinsic
            // replacement code might invalidate the iterators
            for (auto& b : F.getBasicBlockList())
            {
                bbs.push_back(&b);
            }

            for (auto* BB : bbs)
            {
                std::vector<Instruction*> insts;
                for (auto& i : BB->getInstList())
                {
                    insts.push_back(&i);
                }

                for (auto* I : insts)
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                toRemove.push_back(pCallInst);
                                if (pReplace)
                                {
                                    pCallInst->replaceAllUsesWith(pReplace);
                                }
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }
        Builder*    B;
        TargetArch  mTarget;
        Function*   mPfnScatter256;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }
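
    // Emulates VPERMPS/VPERMD. A constant index vector lowers directly to an LLVM
    // shufflevector; otherwise each lane is permuted with an extract/insert pair.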

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B = pThis->B;
        auto v32A = pCallInst->getArgOperand(0);
        auto vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
            v32Result = UndefValue::get(v32A->getType());
#if LLVM_VERSION_MAJOR >= 12
            uint32_t numElem = cast<FixedVectorType>(v32A->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
            uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
#else
            uint32_t numElem = v32A->getType()->getVectorNumElements();
#endif
            for (uint32_t l = 0; l < numElem; ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val = B->VEXTRACT(v32A, i32Index);
                v32Result = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }
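
    // Emulates the VGATHER* meta intrinsics (args: src, base pointer, 32-bit indices,
    // per-lane i1 mask, scale). AVX has no gather instruction, so lanes are loaded one at a
    // time, with inactive lanes reading their value back from a stack copy of src. AVX2 and
    // 256-wide AVX512 use the avx2 gather intrinsics (double pumped for 512-wide vectors),
    // while 512-wide AVX512 uses the native masked gathers.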

    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto vSrc = pCallInst->getArgOperand(0);
        auto pBase = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto vi1Mask = pCallInst->getArgOperand(3);
        auto i8Scale = pCallInst->getArgOperand(4);

        pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
#if LLVM_VERSION_MAJOR >= 11
#if LLVM_VERSION_MAJOR >= 12
        FixedVectorType* pVectorType = cast<FixedVectorType>(vSrc->getType());
#else
        VectorType* pVectorType = cast<VectorType>(vSrc->getType());
#endif
        uint32_t numElem = pVectorType->getNumElements();
        auto srcTy = pVectorType->getElementType();
#else
        uint32_t numElem = vSrc->getType()->getVectorNumElements();
        auto srcTy = vSrc->getType()->getVectorElementType();
#endif
        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);

        Value* v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather = UndefValue::get(vSrc->getType());
#if LLVM_VERSION_MAJOR <= 10
            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
#elif LLVM_VERSION_MAJOR == 11
            auto vi32Scale =
                ConstantVector::getSplat(ElementCount(numElem, false), cast<ConstantInt>(i32Scale));
#else
            auto vi32Scale = ConstantVector::getSplat(ElementCount::get(numElem, false),
                                                      cast<ConstantInt>(i32Scale));
#endif
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress = B->GEP(pBase, i32Offset);
                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val = B->LOAD(pValidAddress);
                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
#if LLVM_VERSION_MAJOR >= 12
                if (cast<FixedVectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
#elif LLVM_VERSION_MAJOR >= 11
                if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
#else
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
#endif
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
#if LLVM_VERSION_MAJOR >= 12
                    uint32_t numElem = cast<FixedVectorType>(v64Mask->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
                    uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements();
#else
                    uint32_t numElem = v64Mask->getType()->getVectorNumElements();
#endif
                    v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numElem));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

#if LLVM_VERSION_MAJOR >= 12
                    uint32_t numElemSrc0 = cast<FixedVectorType>(src0->getType())->getNumElements();
                    uint32_t numElemMask0 = cast<FixedVectorType>(mask0->getType())->getNumElements();
                    uint32_t numElemSrc1 = cast<FixedVectorType>(src1->getType())->getNumElements();
                    uint32_t numElemMask1 = cast<FixedVectorType>(mask1->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
                    uint32_t numElemSrc0 = cast<VectorType>(src0->getType())->getNumElements();
                    uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
                    uint32_t numElemSrc1 = cast<VectorType>(src1->getType())->getNumElements();
                    uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
#else
                    uint32_t numElemSrc0 = src0->getType()->getVectorNumElements();
                    uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
                    uint32_t numElemSrc1 = src1->getType()->getVectorNumElements();
                    uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
#endif
                    src0 = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0));
                    mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0));
                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1 = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1));
                    mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1));
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0 = B->EXTRACT_16(vSrc, 0);
                    Value* src1 = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value* iMask = nullptr;
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }
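
    // Emulates the VSCATTERPS meta intrinsic (args: base pointer, i1 mask, 32-bit indices,
    // src, scale). Pre-AVX512 targets call out to the ScatterPS_256 C helper (a 512-wide
    // scatter is split into two 256-wide calls); AVX512 uses the native scatter intrinsics,
    // zero-extending 256-wide indices to 64-bit so the qps form can scatter 8 elements.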

    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto pBase = pCallInst->getArgOperand(0);
        auto vi1Mask = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto v32Src = pCallInst->getArgOperand(3);
        auto i32Scale = pCallInst->getArgOperand(4);

        if (arch != AVX512)
        {
            // Call into C function to do the scatter. This has significantly better compile perf
            // compared to jitting scatter loops for every scatter
            if (width == W256)
            {
                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
            }
            else
            {
                // Need to break up 512 wide scatter to two 256 wide
                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto indicesLo =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto indicesHi =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

                mask = B->BITCAST(maskHi, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
            }
            return nullptr;
        }

        Value* iMask;
        Function* pX86IntrinFunc;
        if (width == W256)
        {
            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
            // can use the scatter of 8 elements with 64bit indices
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_qps_512);

            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
            iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
        }
        else if (width == W512)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_dps_512);
            iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
        }
        return nullptr;
    }

    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        assert(vf32Src);
        auto i8Round = pCallInst->getOperand(1);
        assert(i8Round);
        auto pfnFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                          Intrinsic::x86_avx_round_ps_256);
            return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
        }
        else if (width == W512)
        {
            // 512 can use intrinsic
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    // No support for hsub in AVX512
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
            return nullptr;
        }
    }

    // Double pump input using Intrin template arg. This blindly extracts lower and upper 256
    // from each vector argument and calls the 256 wide intrinsic, then merges the results to
    // 512 wide
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);
        Value* result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
#if LLVM_VERSION_MAJOR >= 12
                    uint32_t vecWidth = cast<FixedVectorType>(argType)->getNumElements();
                    auto elemTy = cast<FixedVectorType>(argType)->getElementType();
#elif LLVM_VERSION_MAJOR >= 11
                    uint32_t vecWidth = cast<VectorType>(argType)->getNumElements();
                    auto elemTy = cast<VectorType>(argType)->getElementType();
#else
                    uint32_t vecWidth = argType->getVectorNumElements();
                    auto elemTy = argType->getVectorElementType();
#endif
                    Value* lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value* argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }
        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
#if LLVM_VERSION_MAJOR >= 12
            vecWidth = cast<FixedVectorType>(result[0]->getType())->getNumElements() +
                       cast<FixedVectorType>(result[1]->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
            vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
                       cast<VectorType>(result[1]->getType())->getNumElements();
#else
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
#endif
        }
        else
        {
            vecWidth = 2;
        }
        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }

} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)