Path: contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {

static cl::opt<bool> WidenLoads(
    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(false));

static cl::opt<bool> Widen16BitOps(
    "amdgpu-codegenprepare-widen-16-bit-ops",
    cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(true));

static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(true));

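// Illustrative sketch (editorial note, not part of the upstream file): with
// UseMul24Intrin enabled, replaceMulWithMul24() below rewrites a divergent
// 32-bit multiply whose operands are known to need at most 24 bits, e.g.
//
//   %r = mul i32 %a, %b
//     -->
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)  ; type mangling elided
//
// (the signed @llvm.amdgcn.mul.i24 form is used when the signed bit counts
// fit), so instruction selection can use the 24-bit multiply hardware.
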
// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(false));

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  const GCNSubtarget *ST = nullptr;
  const AMDGPUTargetMachine *TM = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  UniformityInfo *UA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = Mod->getContext();
    SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
                                        {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = Mod->getContext();
    LdexpF32 = Intrinsic::getDeclaration(
        Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);

  /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

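  // Illustrative sketch (editorial note, not part of the upstream file): the
  // promoteUniform* helpers documented below widen a uniform, narrow
  // (wider than i1, at most 16-bit) operation to 32 bits and truncate the
  // result back, roughly:
  //
  //   %r = add i16 %a, %b
  //     -->
  //   %a32 = zext i16 %a to i32
  //   %b32 = zext i16 %b to i32
  //   %r32 = add nuw nsw i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16
  //
  // Sign extension is used instead when isSigned() above reports the operation
  // as signed (e.g. ashr); udiv/sdiv/urem/srem are not promoted at all.
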
  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
                                     DT);
  }

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// \returns The minimum number of bits needed to store the value of \Op as an
  /// unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \Op as a
  /// signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform same function as equivalently named function in DAGCombiner.
Since238/// we expand some divisions here, we need to perform this before obscuring.239bool foldBinOpIntoSelect(BinaryOperator &I) const;240241bool divHasSpecialOptimization(BinaryOperator &I,242Value *Num, Value *Den) const;243int getDivNumBits(BinaryOperator &I,244Value *Num, Value *Den,245unsigned AtLeast, bool Signed) const;246247/// Expands 24 bit div or rem.248Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,249Value *Num, Value *Den,250bool IsDiv, bool IsSigned) const;251252Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,253Value *Num, Value *Den, unsigned NumBits,254bool IsDiv, bool IsSigned) const;255256/// Expands 32 bit div or rem.257Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,258Value *Num, Value *Den) const;259260Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,261Value *Num, Value *Den) const;262void expandDivRem64(BinaryOperator &I) const;263264/// Widen a scalar load.265///266/// \details \p Widen scalar load for uniform, small type loads from constant267// memory / to a full 32-bits and then truncate the input to allow a scalar268// load instead of a vector load.269//270/// \returns True.271272bool canWidenScalarExtLoad(LoadInst &I) const;273274Value *matchFractPat(IntrinsicInst &I);275Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);276277bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,278FastMathFlags SqrtFMF) const;279280Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,281FastMathFlags DivFMF, FastMathFlags SqrtFMF,282const Instruction *CtxI) const;283284Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,285FastMathFlags FMF, const Instruction *CtxI) const;286Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,287float ReqdAccuracy) const;288289Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,290FastMathFlags DivFMF, FastMathFlags SqrtFMF,291Value *RsqOp, const Instruction *FDiv,292float ReqdAccuracy) const;293294std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,295Value *Src) const;296297Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,298bool IsNegative) const;299Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,300FastMathFlags FMF) const;301Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,302FastMathFlags FMF) const;303304public:305bool visitFDiv(BinaryOperator &I);306307bool visitInstruction(Instruction &I) { return false; }308bool visitBinaryOperator(BinaryOperator &I);309bool visitLoadInst(LoadInst &I);310bool visitICmpInst(ICmpInst &I);311bool visitSelectInst(SelectInst &I);312bool visitPHINode(PHINode &I);313bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);314315bool visitIntrinsicInst(IntrinsicInst &I);316bool visitBitreverseIntrinsicInst(IntrinsicInst &I);317bool visitMinNum(IntrinsicInst &I);318bool visitSqrt(IntrinsicInst &I);319bool run(Function &F);320};321322class AMDGPUCodeGenPrepare : public FunctionPass {323private:324AMDGPUCodeGenPrepareImpl Impl;325326public:327static char ID;328AMDGPUCodeGenPrepare() : FunctionPass(ID) {329initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());330}331void getAnalysisUsage(AnalysisUsage &AU) const override {332AU.addRequired<AssumptionCacheTracker>();333AU.addRequired<UniformityInfoWrapperPass>();334AU.addRequired<TargetLibraryInfoWrapperPass>();335336// FIXME: Division expansion needs to preserve the dominator tree.337if (!ExpandDiv64InIR)338AU.setPreservesAll();339}340bool 
runOnFunction(Function &F) override;341bool doInitialization(Module &M) override;342StringRef getPassName() const override { return "AMDGPU IR optimizations"; }343};344345} // end anonymous namespace346347bool AMDGPUCodeGenPrepareImpl::run(Function &F) {348BreakPhiNodesCache.clear();349bool MadeChange = false;350351Function::iterator NextBB;352for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {353BasicBlock *BB = &*FI;354NextBB = std::next(FI);355356BasicBlock::iterator Next;357for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;358I = Next) {359Next = std::next(I);360361MadeChange |= visit(*I);362363if (Next != E) { // Control flow changed364BasicBlock *NextInstBB = Next->getParent();365if (NextInstBB != BB) {366BB = NextInstBB;367E = BB->end();368FE = F.end();369}370}371}372}373return MadeChange;374}375376unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {377assert(needsPromotionToI32(T) && "T does not need promotion to i32");378379if (T->isIntegerTy())380return T->getIntegerBitWidth();381return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();382}383384Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {385assert(needsPromotionToI32(T) && "T does not need promotion to i32");386387if (T->isIntegerTy())388return B.getInt32Ty();389return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));390}391392bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {393return I.getOpcode() == Instruction::AShr ||394I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;395}396397bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {398return isa<ICmpInst>(I.getOperand(0)) ?399cast<ICmpInst>(I.getOperand(0))->isSigned() : false;400}401402bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {403if (!Widen16BitOps)404return false;405406const IntegerType *IntTy = dyn_cast<IntegerType>(T);407if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)408return true;409410if (const VectorType *VT = dyn_cast<VectorType>(T)) {411// TODO: The set of packed operations is more limited, so may want to412// promote some anyway.413if (ST->hasVOP3PInsts())414return false;415416return needsPromotionToI32(VT->getElementType());417}418419return false;420}421422bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {423return Ty->isFloatTy() || Ty->isDoubleTy() ||424(Ty->isHalfTy() && ST->has16BitInsts());425}426427// Return true if the op promoted to i32 should have nsw set.428static bool promotedOpIsNSW(const Instruction &I) {429switch (I.getOpcode()) {430case Instruction::Shl:431case Instruction::Add:432case Instruction::Sub:433return true;434case Instruction::Mul:435return I.hasNoUnsignedWrap();436default:437return false;438}439}440441// Return true if the op promoted to i32 should have nuw set.442static bool promotedOpIsNUW(const Instruction &I) {443switch (I.getOpcode()) {444case Instruction::Shl:445case Instruction::Add:446case Instruction::Mul:447return true;448case Instruction::Sub:449return I.hasNoUnsignedWrap();450default:451return false;452}453}454455bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {456Type *Ty = I.getType();457const DataLayout &DL = Mod->getDataLayout();458int TySize = DL.getTypeSizeInBits(Ty);459Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);460461return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);462}463464bool 
AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {465assert(needsPromotionToI32(I.getType()) &&466"I does not need promotion to i32");467468if (I.getOpcode() == Instruction::SDiv ||469I.getOpcode() == Instruction::UDiv ||470I.getOpcode() == Instruction::SRem ||471I.getOpcode() == Instruction::URem)472return false;473474IRBuilder<> Builder(&I);475Builder.SetCurrentDebugLocation(I.getDebugLoc());476477Type *I32Ty = getI32Ty(Builder, I.getType());478Value *ExtOp0 = nullptr;479Value *ExtOp1 = nullptr;480Value *ExtRes = nullptr;481Value *TruncRes = nullptr;482483if (isSigned(I)) {484ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);485ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);486} else {487ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);488ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);489}490491ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);492if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {493if (promotedOpIsNSW(cast<Instruction>(I)))494Inst->setHasNoSignedWrap();495496if (promotedOpIsNUW(cast<Instruction>(I)))497Inst->setHasNoUnsignedWrap();498499if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))500Inst->setIsExact(ExactOp->isExact());501}502503TruncRes = Builder.CreateTrunc(ExtRes, I.getType());504505I.replaceAllUsesWith(TruncRes);506I.eraseFromParent();507508return true;509}510511bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {512assert(needsPromotionToI32(I.getOperand(0)->getType()) &&513"I does not need promotion to i32");514515IRBuilder<> Builder(&I);516Builder.SetCurrentDebugLocation(I.getDebugLoc());517518Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());519Value *ExtOp0 = nullptr;520Value *ExtOp1 = nullptr;521Value *NewICmp = nullptr;522523if (I.isSigned()) {524ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);525ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);526} else {527ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);528ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);529}530NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);531532I.replaceAllUsesWith(NewICmp);533I.eraseFromParent();534535return true;536}537538bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {539assert(needsPromotionToI32(I.getType()) &&540"I does not need promotion to i32");541542IRBuilder<> Builder(&I);543Builder.SetCurrentDebugLocation(I.getDebugLoc());544545Type *I32Ty = getI32Ty(Builder, I.getType());546Value *ExtOp1 = nullptr;547Value *ExtOp2 = nullptr;548Value *ExtRes = nullptr;549Value *TruncRes = nullptr;550551if (isSigned(I)) {552ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);553ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);554} else {555ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);556ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);557}558ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);559TruncRes = Builder.CreateTrunc(ExtRes, I.getType());560561I.replaceAllUsesWith(TruncRes);562I.eraseFromParent();563564return true;565}566567bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(568IntrinsicInst &I) const {569assert(I.getIntrinsicID() == Intrinsic::bitreverse &&570"I must be bitreverse intrinsic");571assert(needsPromotionToI32(I.getType()) &&572"I does not need promotion to i32");573574IRBuilder<> Builder(&I);575Builder.SetCurrentDebugLocation(I.getDebugLoc());576577Type *I32Ty = getI32Ty(Builder, I.getType());578Function *I32 =579Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty 
});580Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);581Value *ExtRes = Builder.CreateCall(I32, { ExtOp });582Value *LShrOp =583Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));584Value *TruncRes =585Builder.CreateTrunc(LShrOp, I.getType());586587I.replaceAllUsesWith(TruncRes);588I.eraseFromParent();589590return true;591}592593unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {594return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();595}596597unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {598return ComputeMaxSignificantBits(Op, *DL, 0, AC);599}600601static void extractValues(IRBuilder<> &Builder,602SmallVectorImpl<Value *> &Values, Value *V) {603auto *VT = dyn_cast<FixedVectorType>(V->getType());604if (!VT) {605Values.push_back(V);606return;607}608609for (int I = 0, E = VT->getNumElements(); I != E; ++I)610Values.push_back(Builder.CreateExtractElement(V, I));611}612613static Value *insertValues(IRBuilder<> &Builder,614Type *Ty,615SmallVectorImpl<Value *> &Values) {616if (!Ty->isVectorTy()) {617assert(Values.size() == 1);618return Values[0];619}620621Value *NewVal = PoisonValue::get(Ty);622for (int I = 0, E = Values.size(); I != E; ++I)623NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);624625return NewVal;626}627628bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {629if (I.getOpcode() != Instruction::Mul)630return false;631632Type *Ty = I.getType();633unsigned Size = Ty->getScalarSizeInBits();634if (Size <= 16 && ST->has16BitInsts())635return false;636637// Prefer scalar if this could be s_mul_i32638if (UA->isUniform(&I))639return false;640641Value *LHS = I.getOperand(0);642Value *RHS = I.getOperand(1);643IRBuilder<> Builder(&I);644Builder.SetCurrentDebugLocation(I.getDebugLoc());645646unsigned LHSBits = 0, RHSBits = 0;647bool IsSigned = false;648649if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&650(RHSBits = numBitsUnsigned(RHS)) <= 24) {651IsSigned = false;652653} else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&654(RHSBits = numBitsSigned(RHS)) <= 24) {655IsSigned = true;656657} else658return false;659660SmallVector<Value *, 4> LHSVals;661SmallVector<Value *, 4> RHSVals;662SmallVector<Value *, 4> ResultVals;663extractValues(Builder, LHSVals, LHS);664extractValues(Builder, RHSVals, RHS);665666IntegerType *I32Ty = Builder.getInt32Ty();667IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;668Type *DstTy = LHSVals[0]->getType();669670for (int I = 0, E = LHSVals.size(); I != E; ++I) {671Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)672: Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);673Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)674: Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);675Intrinsic::ID ID =676IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;677Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});678Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)679: Builder.CreateZExtOrTrunc(Result, DstTy);680ResultVals.push_back(Result);681}682683Value *NewVal = insertValues(Builder, Ty, ResultVals);684NewVal->takeName(&I);685I.replaceAllUsesWith(NewVal);686I.eraseFromParent();687688return true;689}690691// Find a select instruction, which may have been casted. 
This is mostly to deal692// with cases where i16 selects were promoted here to i32.693static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {694Cast = nullptr;695if (SelectInst *Sel = dyn_cast<SelectInst>(V))696return Sel;697698if ((Cast = dyn_cast<CastInst>(V))) {699if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))700return Sel;701}702703return nullptr;704}705706bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {707// Don't do this unless the old select is going away. We want to eliminate the708// binary operator, not replace a binop with a select.709int SelOpNo = 0;710711CastInst *CastOp;712713// TODO: Should probably try to handle some cases with multiple714// users. Duplicating the select may be profitable for division.715SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);716if (!Sel || !Sel->hasOneUse()) {717SelOpNo = 1;718Sel = findSelectThroughCast(BO.getOperand(1), CastOp);719}720721if (!Sel || !Sel->hasOneUse())722return false;723724Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());725Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());726Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));727if (!CBO || !CT || !CF)728return false;729730if (CastOp) {731if (!CastOp->hasOneUse())732return false;733CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);734CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);735}736737// TODO: Handle special 0/-1 cases DAG combine does, although we only really738// need to handle divisions here.739Constant *FoldedT = SelOpNo ?740ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :741ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);742if (!FoldedT || isa<ConstantExpr>(FoldedT))743return false;744745Constant *FoldedF = SelOpNo ?746ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :747ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);748if (!FoldedF || isa<ConstantExpr>(FoldedF))749return false;750751IRBuilder<> Builder(&BO);752Builder.SetCurrentDebugLocation(BO.getDebugLoc());753if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))754Builder.setFastMathFlags(FPOp->getFastMathFlags());755756Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),757FoldedT, FoldedF);758NewSelect->takeName(&BO);759BO.replaceAllUsesWith(NewSelect);760BO.eraseFromParent();761if (CastOp)762CastOp->eraseFromParent();763Sel->eraseFromParent();764return true;765}766767std::pair<Value *, Value *>768AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,769Value *Src) const {770Type *Ty = Src->getType();771Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,772{Ty, Builder.getInt32Ty()}, Src);773Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});774775// Bypass the bug workaround for the exponent result since it doesn't matter.776// TODO: Does the bug workaround even really need to consider the exponent777// result? It's unspecified by the spec.778779Value *FrexpExp =780ST->hasFractBug()781? 
Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,782{Builder.getInt32Ty(), Ty}, Src)783: Builder.CreateExtractValue(Frexp, {1});784return {FrexpMant, FrexpExp};785}786787/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.788Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,789Value *Src,790bool IsNegative) const {791// Same as for 1.0, but expand the sign out of the constant.792// -1.0 / x -> rcp (fneg x)793if (IsNegative)794Src = Builder.CreateFNeg(Src);795796// The rcp instruction doesn't support denormals, so scale the input797// out of the denormal range and convert at the end.798//799// Expand as 2^-n * (1.0 / (x * 2^n))800801// TODO: Skip scaling if input is known never denormal and the input802// range won't underflow to denormal. The hard part is knowing the803// result. We need a range check, the result could be denormal for804// 0x1p+126 < den <= 0x1p+127.805auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);806Value *ScaleFactor = Builder.CreateNeg(FrexpExp);807Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);808return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});809}810811/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.812Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,813Value *RHS,814FastMathFlags FMF) const {815// If we have have to work around the fract/frexp bug, we're worse off than816// using the fdiv.fast expansion. The full safe expansion is faster if we have817// fast FMA.818if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&819(!FMF.noNaNs() || !FMF.noInfs()))820return nullptr;821822// We're scaling the LHS to avoid a denormal input, and scale the denominator823// to avoid large values underflowing the result.824auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);825826Value *Rcp =827Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);828829auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);830Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);831832// We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the833// result.834Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);835return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});836}837838/// Emit a sqrt that handles denormals and is accurate to 2ulp.839Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,840Value *Src,841FastMathFlags FMF) const {842Type *Ty = Src->getType();843APFloat SmallestNormal =844APFloat::getSmallestNormalized(Ty->getFltSemantics());845Value *NeedScale =846Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));847848ConstantInt *Zero = Builder.getInt32(0);849Value *InputScaleFactor =850Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);851852Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});853854Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);855856Value *OutputScaleFactor =857Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);858return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});859}860861/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.862static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,863bool IsNegative) {864// bool need_scale = x < 0x1p-126f;865// float input_scale = need_scale ? 0x1.0p+24f : 1.0f;866// float output_scale = need_scale ? 
0x1.0p+12f : 1.0f;867// rsq(x * input_scale) * output_scale;868869Type *Ty = Src->getType();870APFloat SmallestNormal =871APFloat::getSmallestNormalized(Ty->getFltSemantics());872Value *NeedScale =873Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));874Constant *One = ConstantFP::get(Ty, 1.0);875Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);876Constant *OutputScale =877ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);878879Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);880881Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);882Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);883Value *OutputScaleFactor = Builder.CreateSelect(884NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);885886return Builder.CreateFMul(Rsq, OutputScaleFactor);887}888889bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,890FastMathFlags DivFMF,891FastMathFlags SqrtFMF) const {892// The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.893if (!DivFMF.allowContract() || !SqrtFMF.allowContract())894return false;895896// v_rsq_f32 gives 1ulp897return SqrtFMF.approxFunc() || HasUnsafeFPMath ||898SqrtOp->getFPAccuracy() >= 1.0f;899}900901Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(902IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,903const FastMathFlags SqrtFMF, const Instruction *CtxI) const {904// The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.905assert(DivFMF.allowContract() && SqrtFMF.allowContract());906907// rsq_f16 is accurate to 0.51 ulp.908// rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.909// rsq_f64 is never accurate.910const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);911if (!CLHS)912return nullptr;913914assert(Den->getType()->isFloatTy());915916bool IsNegative = false;917918// TODO: Handle other numerator values with arcp.919if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {920// Add in the sqrt flags.921IRBuilder<>::FastMathFlagGuard Guard(Builder);922Builder.setFastMathFlags(DivFMF | SqrtFMF);923924if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||925canIgnoreDenormalInput(Den, CtxI)) {926Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);927// -1.0 / sqrt(x) -> fneg(rsq(x))928return IsNegative ? 
Builder.CreateFNeg(Result) : Result;929}930931return emitRsqIEEE1ULP(Builder, Den, IsNegative);932}933934return nullptr;935}936937// Optimize fdiv with rcp:938//939// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is940// allowed with unsafe-fp-math or afn.941//942// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0943Value *944AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,945Value *Den, FastMathFlags FMF,946const Instruction *CtxI) const {947// rcp_f16 is accurate to 0.51 ulp.948// rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.949// rcp_f64 is never accurate.950assert(Den->getType()->isFloatTy());951952if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {953bool IsNegative = false;954if (CLHS->isExactlyValue(1.0) ||955(IsNegative = CLHS->isExactlyValue(-1.0))) {956Value *Src = Den;957958if (HasFP32DenormalFlush || FMF.approxFunc()) {959// -1.0 / x -> 1.0 / fneg(x)960if (IsNegative)961Src = Builder.CreateFNeg(Src);962963// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to964// the CI documentation has a worst case error of 1 ulp.965// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK966// to use it as long as we aren't trying to use denormals.967//968// v_rcp_f16 and v_rsq_f16 DO support denormals.969970// NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't971// insert rsq intrinsic here.972973// 1.0 / x -> rcp(x)974return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);975}976977// TODO: If the input isn't denormal, and we know the input exponent isn't978// big enough to introduce a denormal we can avoid the scaling.979return emitRcpIEEE1ULP(Builder, Src, IsNegative);980}981}982983if (FMF.allowReciprocal()) {984// x / y -> x * (1.0 / y)985986// TODO: Could avoid denormal scaling and use raw rcp if we knew the output987// will never underflow.988if (HasFP32DenormalFlush || FMF.approxFunc()) {989Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);990return Builder.CreateFMul(Num, Recip);991}992993Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);994return Builder.CreateFMul(Num, Recip);995}996997return nullptr;998}9991000// optimize with fdiv.fast:1001//1002// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.1003//1004// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.1005//1006// NOTE: optimizeWithRcp should be tried first because rcp is the preference.1007Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(1008IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {1009// fdiv.fast can achieve 2.5 ULP accuracy.1010if (ReqdAccuracy < 2.5f)1011return nullptr;10121013// Only have fdiv.fast for f32.1014assert(Den->getType()->isFloatTy());10151016bool NumIsOne = false;1017if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {1018if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))1019NumIsOne = true;1020}10211022// fdiv does not support denormals. 
But 1.0/x is always fine to use it.1023//1024// TODO: This works for any value with a specific known exponent range, don't1025// just limit to constant 1.1026if (!HasFP32DenormalFlush && !NumIsOne)1027return nullptr;10281029return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});1030}10311032Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(1033IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,1034FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,1035float ReqdDivAccuracy) const {1036if (RsqOp) {1037Value *Rsq =1038optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);1039if (Rsq)1040return Rsq;1041}10421043Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);1044if (Rcp)1045return Rcp;10461047// In the basic case fdiv_fast has the same instruction count as the frexp div1048// expansion. Slightly prefer fdiv_fast since it ends in an fmul that can1049// potentially be fused into a user. Also, materialization of the constants1050// can be reused for multiple instances.1051Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);1052if (FDivFast)1053return FDivFast;10541055return emitFrexpDiv(Builder, Num, Den, DivFMF);1056}10571058// Optimizations is performed based on fpmath, fast math flags as well as1059// denormals to optimize fdiv with either rcp or fdiv.fast.1060//1061// With rcp:1062// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is1063// allowed with unsafe-fp-math or afn.1064//1065// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.1066//1067// With fdiv.fast:1068// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.1069//1070// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.1071//1072// NOTE: rcp is the preference in cases that both are legal.1073bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {1074if (DisableFDivExpand)1075return false;10761077Type *Ty = FDiv.getType()->getScalarType();1078if (!Ty->isFloatTy())1079return false;10801081// The f64 rcp/rsq approximations are pretty inaccurate. We can do an1082// expansion around them in codegen. f16 is good enough to always use.10831084const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);1085const FastMathFlags DivFMF = FPOp->getFastMathFlags();1086const float ReqdAccuracy = FPOp->getFPAccuracy();10871088FastMathFlags SqrtFMF;10891090Value *Num = FDiv.getOperand(0);1091Value *Den = FDiv.getOperand(1);10921093Value *RsqOp = nullptr;1094auto *DenII = dyn_cast<IntrinsicInst>(Den);1095if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&1096DenII->hasOneUse()) {1097const auto *SqrtOp = cast<FPMathOperator>(DenII);1098SqrtFMF = SqrtOp->getFastMathFlags();1099if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))1100RsqOp = SqrtOp->getOperand(0);1101}11021103// Inaccurate rcp is allowed with unsafe-fp-math or afn.1104//1105// Defer to codegen to handle this.1106//1107// TODO: Decide on an interpretation for interactions between afn + arcp +1108// !fpmath, and make it consistent between here and codegen. For now, defer1109// expansion of afn to codegen. The current interpretation is so aggressive we1110// don't need any pre-consideration here when we have better information. 
A1111// more conservative interpretation could use handling here.1112const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();1113if (!RsqOp && AllowInaccurateRcp)1114return false;11151116// Defer the correct implementations to codegen.1117if (ReqdAccuracy < 1.0f)1118return false;11191120IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));1121Builder.setFastMathFlags(DivFMF);1122Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());11231124SmallVector<Value *, 4> NumVals;1125SmallVector<Value *, 4> DenVals;1126SmallVector<Value *, 4> RsqDenVals;1127extractValues(Builder, NumVals, Num);1128extractValues(Builder, DenVals, Den);11291130if (RsqOp)1131extractValues(Builder, RsqDenVals, RsqOp);11321133SmallVector<Value *, 4> ResultVals(NumVals.size());1134for (int I = 0, E = NumVals.size(); I != E; ++I) {1135Value *NumElt = NumVals[I];1136Value *DenElt = DenVals[I];1137Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;11381139Value *NewElt =1140visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,1141cast<Instruction>(FPOp), ReqdAccuracy);1142if (!NewElt) {1143// Keep the original, but scalarized.11441145// This has the unfortunate side effect of sometimes scalarizing when1146// we're not going to do anything.1147NewElt = Builder.CreateFDiv(NumElt, DenElt);1148if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))1149NewEltInst->copyMetadata(FDiv);1150}11511152ResultVals[I] = NewElt;1153}11541155Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);11561157if (NewVal) {1158FDiv.replaceAllUsesWith(NewVal);1159NewVal->takeName(&FDiv);1160RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);1161}11621163return true;1164}11651166static bool hasUnsafeFPMath(const Function &F) {1167Attribute Attr = F.getFnAttribute("unsafe-fp-math");1168return Attr.getValueAsBool();1169}11701171static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,1172Value *LHS, Value *RHS) {1173Type *I32Ty = Builder.getInt32Ty();1174Type *I64Ty = Builder.getInt64Ty();11751176Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);1177Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);1178Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);1179Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);1180Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));1181Hi = Builder.CreateTrunc(Hi, I32Ty);1182return std::pair(Lo, Hi);1183}11841185static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {1186return getMul64(Builder, LHS, RHS).second;1187}11881189/// Figure out how many bits are really needed for this division. \p AtLeast is1190/// an optimization hint to bypass the second ComputeNumSignBits call if we the1191/// first one is insufficient. 
Returns -1 on failure.1192int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,1193Value *Den, unsigned AtLeast,1194bool IsSigned) const {1195const DataLayout &DL = Mod->getDataLayout();1196unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);1197if (LHSSignBits < AtLeast)1198return -1;11991200unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);1201if (RHSSignBits < AtLeast)1202return -1;12031204unsigned SignBits = std::min(LHSSignBits, RHSSignBits);1205unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;1206if (IsSigned)1207++DivBits;1208return DivBits;1209}12101211// The fractional part of a float is enough to accurately represent up to1212// a 24-bit signed integer.1213Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,1214BinaryOperator &I, Value *Num,1215Value *Den, bool IsDiv,1216bool IsSigned) const {1217unsigned SSBits = Num->getType()->getScalarSizeInBits();1218// If Num bits <= 24, assume 0 signbits.1219unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);1220int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);1221if (DivBits == -1)1222return nullptr;1223return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);1224}12251226Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(1227IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,1228unsigned DivBits, bool IsDiv, bool IsSigned) const {1229Type *I32Ty = Builder.getInt32Ty();1230Num = Builder.CreateTrunc(Num, I32Ty);1231Den = Builder.CreateTrunc(Den, I32Ty);12321233Type *F32Ty = Builder.getFloatTy();1234ConstantInt *One = Builder.getInt32(1);1235Value *JQ = One;12361237if (IsSigned) {1238// char|short jq = ia ^ ib;1239JQ = Builder.CreateXor(Num, Den);12401241// jq = jq >> (bitsize - 2)1242JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));12431244// jq = jq | 0x11245JQ = Builder.CreateOr(JQ, One);1246}12471248// int ia = (int)LHS;1249Value *IA = Num;12501251// int ib, (int)RHS;1252Value *IB = Den;12531254// float fa = (float)ia;1255Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)1256: Builder.CreateUIToFP(IA, F32Ty);12571258// float fb = (float)ib;1259Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)1260: Builder.CreateUIToFP(IB,F32Ty);12611262Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,1263Builder.getFloatTy());1264Value *RCP = Builder.CreateCall(RcpDecl, { FB });1265Value *FQM = Builder.CreateFMul(FA, RCP);12661267// fq = trunc(fqm);1268CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);1269FQ->copyFastMathFlags(Builder.getFastMathFlags());12701271// float fqneg = -fq;1272Value *FQNeg = Builder.CreateFNeg(FQ);12731274// float fr = mad(fqneg, fb, fa);1275auto FMAD = !ST->hasMadMacF32Insts()1276? Intrinsic::fma1277: (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;1278Value *FR = Builder.CreateIntrinsic(FMAD,1279{FQNeg->getType()}, {FQNeg, FB, FA}, FQ);12801281// int iq = (int)fq;1282Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)1283: Builder.CreateFPToUI(FQ, I32Ty);12841285// fr = fabs(fr);1286FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);12871288// fb = fabs(fb);1289FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);12901291// int cv = fr >= fb;1292Value *CV = Builder.CreateFCmpOGE(FR, FB);12931294// jq = (cv ? 
jq : 0);1295JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));12961297// dst = iq + jq;1298Value *Div = Builder.CreateAdd(IQ, JQ);12991300Value *Res = Div;1301if (!IsDiv) {1302// Rem needs compensation, it's easier to recompute it1303Value *Rem = Builder.CreateMul(Div, Den);1304Res = Builder.CreateSub(Num, Rem);1305}13061307if (DivBits != 0 && DivBits < 32) {1308// Extend in register from the number of bits this divide really is.1309if (IsSigned) {1310int InRegBits = 32 - DivBits;13111312Res = Builder.CreateShl(Res, InRegBits);1313Res = Builder.CreateAShr(Res, InRegBits);1314} else {1315ConstantInt *TruncMask1316= Builder.getInt32((UINT64_C(1) << DivBits) - 1);1317Res = Builder.CreateAnd(Res, TruncMask);1318}1319}13201321return Res;1322}13231324// Try to recognize special cases the DAG will emit special, better expansions1325// than the general expansion we do here.13261327// TODO: It would be better to just directly handle those optimizations here.1328bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,1329Value *Num,1330Value *Den) const {1331if (Constant *C = dyn_cast<Constant>(Den)) {1332// Arbitrary constants get a better expansion as long as a wider mulhi is1333// legal.1334if (C->getType()->getScalarSizeInBits() <= 32)1335return true;13361337// TODO: Sdiv check for not exact for some reason.13381339// If there's no wider mulhi, there's only a better expansion for powers of1340// two.1341// TODO: Should really know for each vector element.1342if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))1343return true;13441345return false;1346}13471348if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {1349// fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 21350if (BinOpDen->getOpcode() == Instruction::Shl &&1351isa<Constant>(BinOpDen->getOperand(0)) &&1352isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,13530, AC, &I, DT)) {1354return true;1355}1356}13571358return false;1359}13601361static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {1362// Check whether the sign can be determined statically.1363KnownBits Known = computeKnownBits(V, *DL);1364if (Known.isNegative())1365return Constant::getAllOnesValue(V->getType());1366if (Known.isNonNegative())1367return Constant::getNullValue(V->getType());1368return Builder.CreateAShr(V, Builder.getInt32(31));1369}13701371Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,1372BinaryOperator &I, Value *X,1373Value *Y) const {1374Instruction::BinaryOps Opc = I.getOpcode();1375assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||1376Opc == Instruction::SRem || Opc == Instruction::SDiv);13771378FastMathFlags FMF;1379FMF.setFast();1380Builder.setFastMathFlags(FMF);13811382if (divHasSpecialOptimization(I, X, Y))1383return nullptr; // Keep it for later optimization.13841385bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;1386bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;13871388Type *Ty = X->getType();1389Type *I32Ty = Builder.getInt32Ty();1390Type *F32Ty = Builder.getFloatTy();13911392if (Ty->getScalarSizeInBits() != 32) {1393if (IsSigned) {1394X = Builder.CreateSExtOrTrunc(X, I32Ty);1395Y = Builder.CreateSExtOrTrunc(Y, I32Ty);1396} else {1397X = Builder.CreateZExtOrTrunc(X, I32Ty);1398Y = Builder.CreateZExtOrTrunc(Y, I32Ty);1399}1400}14011402if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {1403return IsSigned ? 
Builder.CreateSExtOrTrunc(Res, Ty) :1404Builder.CreateZExtOrTrunc(Res, Ty);1405}14061407ConstantInt *Zero = Builder.getInt32(0);1408ConstantInt *One = Builder.getInt32(1);14091410Value *Sign = nullptr;1411if (IsSigned) {1412Value *SignX = getSign32(X, Builder, DL);1413Value *SignY = getSign32(Y, Builder, DL);1414// Remainder sign is the same as LHS1415Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;14161417X = Builder.CreateAdd(X, SignX);1418Y = Builder.CreateAdd(Y, SignY);14191420X = Builder.CreateXor(X, SignX);1421Y = Builder.CreateXor(Y, SignY);1422}14231424// The algorithm here is based on ideas from "Software Integer Division", Tom1425// Rodeheffer, August 2008.1426//1427// unsigned udiv(unsigned x, unsigned y) {1428// // Initial estimate of inv(y). The constant is less than 2^32 to ensure1429// // that this is a lower bound on inv(y), even if some of the calculations1430// // round up.1431// unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));1432//1433// // One round of UNR (Unsigned integer Newton-Raphson) to improve z.1434// // Empirically this is guaranteed to give a "two-y" lower bound on1435// // inv(y).1436// z += umulh(z, -y * z);1437//1438// // Quotient/remainder estimate.1439// unsigned q = umulh(x, z);1440// unsigned r = x - q * y;1441//1442// // Two rounds of quotient/remainder refinement.1443// if (r >= y) {1444// ++q;1445// r -= y;1446// }1447// if (r >= y) {1448// ++q;1449// r -= y;1450// }1451//1452// return q;1453// }14541455// Initial estimate of inv(y).1456Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);1457Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);1458Value *RcpY = Builder.CreateCall(Rcp, {FloatY});1459Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));1460Value *ScaledY = Builder.CreateFMul(RcpY, Scale);1461Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);14621463// One round of UNR.1464Value *NegY = Builder.CreateSub(Zero, Y);1465Value *NegYZ = Builder.CreateMul(NegY, Z);1466Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));14671468// Quotient/remainder estimate.1469Value *Q = getMulHu(Builder, X, Z);1470Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));14711472// First quotient/remainder refinement.1473Value *Cond = Builder.CreateICmpUGE(R, Y);1474if (IsDiv)1475Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);1476R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);14771478// Second quotient/remainder refinement.1479Cond = Builder.CreateICmpUGE(R, Y);1480Value *Res;1481if (IsDiv)1482Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);1483else1484Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);14851486if (IsSigned) {1487Res = Builder.CreateXor(Res, Sign);1488Res = Builder.CreateSub(Res, Sign);1489Res = Builder.CreateSExtOrTrunc(Res, Ty);1490} else {1491Res = Builder.CreateZExtOrTrunc(Res, Ty);1492}1493return Res;1494}14951496Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,1497BinaryOperator &I, Value *Num,1498Value *Den) const {1499if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))1500return nullptr; // Keep it for later optimization.15011502Instruction::BinaryOps Opc = I.getOpcode();15031504bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;1505bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;15061507int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);1508if (NumDivBits == -1)1509return nullptr;15101511Value *Narrowed = nullptr;1512if (NumDivBits <= 24) 
{1513Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,1514IsDiv, IsSigned);1515} else if (NumDivBits <= 32) {1516Narrowed = expandDivRem32(Builder, I, Num, Den);1517}15181519if (Narrowed) {1520return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :1521Builder.CreateZExt(Narrowed, Num->getType());1522}15231524return nullptr;1525}15261527void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {1528Instruction::BinaryOps Opc = I.getOpcode();1529// Do the general expansion.1530if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {1531expandDivisionUpTo64Bits(&I);1532return;1533}15341535if (Opc == Instruction::URem || Opc == Instruction::SRem) {1536expandRemainderUpTo64Bits(&I);1537return;1538}15391540llvm_unreachable("not a division");1541}15421543bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {1544if (foldBinOpIntoSelect(I))1545return true;15461547if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&1548UA->isUniform(&I) && promoteUniformOpToI32(I))1549return true;15501551if (UseMul24Intrin && replaceMulWithMul24(I))1552return true;15531554bool Changed = false;1555Instruction::BinaryOps Opc = I.getOpcode();1556Type *Ty = I.getType();1557Value *NewDiv = nullptr;1558unsigned ScalarSize = Ty->getScalarSizeInBits();15591560SmallVector<BinaryOperator *, 8> Div64ToExpand;15611562if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||1563Opc == Instruction::SRem || Opc == Instruction::SDiv) &&1564ScalarSize <= 64 &&1565!DisableIDivExpand) {1566Value *Num = I.getOperand(0);1567Value *Den = I.getOperand(1);1568IRBuilder<> Builder(&I);1569Builder.SetCurrentDebugLocation(I.getDebugLoc());15701571if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {1572NewDiv = PoisonValue::get(VT);15731574for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {1575Value *NumEltN = Builder.CreateExtractElement(Num, N);1576Value *DenEltN = Builder.CreateExtractElement(Den, N);15771578Value *NewElt;1579if (ScalarSize <= 32) {1580NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);1581if (!NewElt)1582NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);1583} else {1584// See if this 64-bit division can be shrunk to 32/24-bits before1585// producing the general expansion.1586NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);1587if (!NewElt) {1588// The general 64-bit expansion introduces control flow and doesn't1589// return the new value. 
Just insert a scalar copy and defer1590// expanding it.1591NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);1592Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));1593}1594}15951596if (auto *NewEltI = dyn_cast<Instruction>(NewElt))1597NewEltI->copyIRFlags(&I);15981599NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);1600}1601} else {1602if (ScalarSize <= 32)1603NewDiv = expandDivRem32(Builder, I, Num, Den);1604else {1605NewDiv = shrinkDivRem64(Builder, I, Num, Den);1606if (!NewDiv)1607Div64ToExpand.push_back(&I);1608}1609}16101611if (NewDiv) {1612I.replaceAllUsesWith(NewDiv);1613I.eraseFromParent();1614Changed = true;1615}1616}16171618if (ExpandDiv64InIR) {1619// TODO: We get much worse code in specially handled constant cases.1620for (BinaryOperator *Div : Div64ToExpand) {1621expandDivRem64(*Div);1622FlowChanged = true;1623Changed = true;1624}1625}16261627return Changed;1628}16291630bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {1631if (!WidenLoads)1632return false;16331634if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||1635I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&1636canWidenScalarExtLoad(I)) {1637IRBuilder<> Builder(&I);1638Builder.SetCurrentDebugLocation(I.getDebugLoc());16391640Type *I32Ty = Builder.getInt32Ty();1641LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());1642WidenLoad->copyMetadata(I);16431644// If we have range metadata, we need to convert the type, and not make1645// assumptions about the high bits.1646if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {1647ConstantInt *Lower =1648mdconst::extract<ConstantInt>(Range->getOperand(0));16491650if (Lower->isNullValue()) {1651WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);1652} else {1653Metadata *LowAndHigh[] = {1654ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),1655// Don't make assumptions about the high bits.1656ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))1657};16581659WidenLoad->setMetadata(LLVMContext::MD_range,1660MDNode::get(Mod->getContext(), LowAndHigh));1661}1662}16631664int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());1665Type *IntNTy = Builder.getIntNTy(TySize);1666Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);1667Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());1668I.replaceAllUsesWith(ValOrig);1669I.eraseFromParent();1670return true;1671}16721673return false;1674}16751676bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {1677bool Changed = false;16781679if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&1680UA->isUniform(&I))1681Changed |= promoteUniformOpToI32(I);16821683return Changed;1684}16851686bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {1687Value *Cond = I.getCondition();1688Value *TrueVal = I.getTrueValue();1689Value *FalseVal = I.getFalseValue();1690Value *CmpVal;1691FCmpInst::Predicate Pred;16921693if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {1694if (UA->isUniform(&I))1695return promoteUniformOpToI32(I);1696return false;1697}16981699// Match fract pattern with nan check.1700if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))1701return false;17021703FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);1704if (!FPOp)1705return false;17061707IRBuilder<> Builder(&I);1708Builder.setFastMathFlags(FPOp->getFastMathFlags());17091710auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);1711auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);17121713Value *Fract = 
bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  Value *Cond = I.getCondition();
  Value *TrueVal = I.getTrueValue();
  Value *FalseVal = I.getFalseValue();
  Value *CmpVal;
  FCmpInst::Predicate Pred;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA->isUniform(&I))
      return promoteUniformOpToI32(I);
    return false;
  }

  // Match fract pattern with nan check.
  if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
    return false;

  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  IRBuilder<> Builder(&I);
  Builder.setFastMathFlags(FPOp->getFastMathFlags());

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    // isnan(x) ? x : fract(x)
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    // !isnan(x) ? fract(x) : x
    Fract = applyFractPat(Builder, CmpVal);
  } else
    return false;

  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

static bool areInSameBB(const Value *A, const Value *B) {
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();
}

// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
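//
// For example, an incoming value built up in a single block as
//   %v0 = insertelement <2 x i32> poison, i32 %a, i64 0
//   %v1 = insertelement <2 x i32> %v0, i32 %b, i64 1
// covers every element, so the extractelements created when the PHI is broken
// up are expected to fold away.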
static bool isInterestingPHIIncomingValue(const Value *V) {
  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
  if (!FVT)
    return false;

  const Value *CurVal = V;

  // Check for insertelements, keeping track of the elements covered.
  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    // Non constant index/out of bounds index -> folding is unlikely.
    // The latter is more of a sanity check because canonical IR should just
    // have replaced those with poison.
    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
      return false;

    const auto *VecSrc = IE->getOperand(0);

    // If the vector source is another instruction, it must be in the same basic
    // block. Otherwise, the DAGCombiner won't see the whole thing and is
    // unlikely to be able to do anything interesting here.
    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
      return false;

    CurVal = VecSrc;
    EltsCovered.set(Idx->getZExtValue());

    // All elements covered.
    if (EltsCovered.all())
      return true;
  }

  // We either didn't find a single insertelement, or the insertelement chain
  // ended before all elements were covered. Check for other interesting values.

  // Constants are always interesting because we can just constant fold the
  // extractelements.
  if (isa<Constant>(CurVal))
    return true;

  // shufflevector is likely to be profitable if either operand is a constant,
  // or if either source is in the same block.
  // This is because shufflevector is most often lowered as a series of
  // insert/extract elements anyway.
  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
           areInSameBB(SV, SV->getOperand(0)) ||
           areInSameBB(SV, SV->getOperand(1));
  }

  return false;
}

static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
  const auto [It, Inserted] = SeenPHIs.insert(&I);
  if (!Inserted)
    return;

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
      collectPHINodes(*PhiInc, SeenPHIs);
  }

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
      collectPHINodes(*PhiU, SeenPHIs);
  }
}

bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //
  //   (unsigned)ceil((K / 3.0) * 2)
  //
  // It's simply written this way to avoid mixing integral/FP arithmetic.
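  //
  // For example, a worklist of 5 PHIs gives alignTo(5 * 2, 3) / 3 = 12 / 3 = 4,
  // which matches (unsigned)ceil((5 / 3.0) * 2) = 4.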
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}

/// Helper class for "break large PHIs" (visitPHINode).
///
/// This represents a slice of a PHI's incoming value, which is made up of:
///   - The type of the slice (Ty)
///   - The index in the incoming value's vector where the slice starts (Idx)
///   - The number of elements in the slice (NumElts).
/// It also keeps track of the NewPHI node inserted for this particular slice.
///
/// Slice examples:
///   <4 x i64> -> Split into four i64 slices.
///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
class VectorSlice {
public:
  VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
      : Ty(Ty), Idx(Idx), NumElts(NumElts) {}

  Type *Ty = nullptr;
  unsigned Idx = 0;
  unsigned NumElts = 0;
  PHINode *NewPHI = nullptr;

  /// Slice \p Inc according to the information contained within this slice.
  /// This is cached, so if called multiple times for the same \p BB & \p Inc
  /// pair, it returns the same Sliced value as well.
  ///
  /// Note this *intentionally* does not return the same value for, say,
  /// [%bb.0, %0] & [%bb.1, %0] as:
  ///   - It could cause issues with dominance (e.g. if bb.1 is seen first, then
  ///     the value in bb.1 may not be reachable from bb.0 if it's its
  ///     predecessor.)
  ///   - We also want to make our extract instructions as local as possible so
  ///     the DAG has better chances of folding them out. Duplicating them like
  ///     that is beneficial in that regard.
  ///
  /// This is both a minor optimization to avoid creating duplicate
  /// instructions and a requirement for correctness. It is not forbidden
  /// for a PHI node to have the same [BB, Val] pair multiple times. If we
  /// returned a new value each time, those previously identical pairs would all
  /// have different incoming values (from the same block) and it'd cause a "PHI
  /// node has multiple entries for the same basic block with different incoming
  /// values!" verifier error.
  Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
    Value *&Res = SlicedVals[{BB, Inc}];
    if (Res)
      return Res;

    IRBuilder<> B(BB->getTerminator());
    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    if (NumElts > 1) {
      SmallVector<int, 4> Mask;
      for (unsigned K = Idx; K < (Idx + NumElts); ++K)
        Mask.push_back(K);
      Res = B.CreateShuffleVector(Inc, Mask, NewValName);
    } else
      Res = B.CreateExtractElement(Inc, Idx, NewValName);

    return Res;
  }

private:
  SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
};

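/// Break a large fixed-vector PHI (wider than BreakLargePHIsThreshold bits)
/// into one PHI per VectorSlice, then reassemble the original vector from the
/// slice PHIs and replace all uses of the original PHI.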
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instruction to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the builder on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}
/// \param V Value to check
/// \param DL DataLayout
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {
  // Pointer cannot be null if it's a block address, GV or alloca.
  // NOTE: We don't support extern_weak, but if we did, we'd need to check for
  // it as the symbol could be null in such cases.
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
    return true;

  // Check nonnull arguments.
  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
    return true;

  // getUnderlyingObject may have looked through another addrspacecast, although
  // the optimizable situations are most likely folded out by now.
  if (AS != cast<PointerType>(V->getType())->getAddressSpace())
    return false;

  // TODO: Calls that return nonnull?

  // For all other things, use KnownBits.
  // We either use 0 or all bits set to indicate null, so check whether the
  // value can be zero or all ones.
  //
  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
  // address spaces have non-zero null values.
  auto SrcPtrKB = computeKnownBits(V, DL);
  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
}

bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // The intrinsic doesn't support vectors; it also seems difficult to prove
  // that a vector cannot have any null elements, so it's unclear whether it's
  // worth supporting.
  if (I.getType()->isVectorTy())
    return false;

  // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
  // This is only worthwhile for casts from/to priv/local to flat.
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
  if (!CanLower)
    return false;

  SmallVector<const Value *, 4> WorkList;
  getUnderlyingObjects(I.getOperand(0), WorkList);
  if (!all_of(WorkList, [&](const Value *V) {
        return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
      }))
    return false;

  IRBuilder<> B(&I);
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

/// Match non-nan fract pattern.
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// Only matches when fract is a useful instruction for the subtarget. Does not
/// account for the nan handling; the instruction has a nan check on the input
/// value.
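///
/// A minimal IR sketch of the matched pattern (values are illustrative):
///   %floor = call float @llvm.floor.f32(float %x)
///   %sub = fsub float %x, %floor
///   %min = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
/// where 0x3FEFFFFFE0000000 is 1.0 - 2^-24, i.e. nextafter(1.0f, -1.0f).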
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  if (ST->hasFractBug())
    return nullptr;

  if (I.getIntrinsicID() != Intrinsic::minnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  const APFloat *C;
  if (!match(Arg1, m_APFloat(C)))
    return nullptr;

  APFloat One(1.0);
  bool LosesInfo;
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

  // Match nextafter(1.0, -1)
  One.next(true);
  if (One != *C)
    return nullptr;

  Value *FloorSrc;
  if (match(Arg0, m_FSub(m_Value(FloorSrc),
                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
    return FloorSrc;
  return nullptr;
}

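/// Emit the fract replacement for \p FractArg: one llvm.amdgcn.fract call per
/// scalar element, reassembled into \p FractArg's original (possibly vector)
/// type.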
Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {
  SmallVector<Value *, 4> FractVals;
  extractValues(Builder, FractVals, FractArg);

  SmallVector<Value *, 4> ResultVals(FractVals.size());

  Type *Ty = FractArg->getType()->getScalarType();
  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
    ResultVals[I] =
        Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
  }

  return insertValues(Builder, FractArg->getType(), ResultVals);
}

bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  if (!FractArg)
    return false;

  // Match pattern for fract intrinsic in contexts where the nan check has been
  // optimized out (and hope the knowledge the source can't be nan wasn't lost).
  if (!I.hasNoNaNs() &&
      !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
    return false;

  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);

  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

static bool isOneOrNegOne(const Value *Val) {
  const APFloat *C;
  return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
}

// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // We're trying to handle the fast-but-not-that-fast case only. The lowering
  // of fast llvm.sqrt will give the raw instruction anyway.
  if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
    return false;

  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Defer correctly rounded expansion to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  // FIXME: This is an ugly hack for this pass using forward iteration instead
  // of reverse. If it worked like a normal combiner, the rsq would form before
  // we saw a sqrt call.
  auto *FDiv =
      dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&
      canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
      // TODO: We should also handle the arcp case for the fdiv with non-1 value
      isOneOrNegOne(FDiv->getOperand(0)))
    return false;

  Value *SrcVal = Sqrt.getOperand(0);
  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, SrcVals, SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  Sqrt.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Impl.Mod = &M;
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.SqrtF32 = nullptr;
  Impl.LdexpF32 = nullptr;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  Impl.TM = &TM;
  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  return Impl.run(F);
}

PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  AMDGPUCodeGenPrepareImpl Impl;
  Impl.Mod = F.getParent();
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
  Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
  Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
  Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  PreservedAnalyses PA = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
    PA.preserveSet<CFGAnalyses>();
  return Impl.run(F) ? PA : PreservedAnalyses::all();
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}