Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
//===- AggressiveInstCombine.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the aggressive expression pattern combiner classes.
// Currently, it handles expression patterns for:
//  * Truncate instruction
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "AggressiveInstCombineInternal.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "aggressive-instcombine"

STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
STATISTIC(NumGuardedRotates,
          "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
          "Number of guarded funnel shifts transformed into funnel shifts");
STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");

static cl::opt<unsigned> MaxInstrsToScan(
    "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
    cl::desc("Max number of instructions to scan for aggressive instcombine."));

static cl::opt<unsigned> StrNCmpInlineThreshold(
    "strncmp-inline-threshold", cl::init(3), cl::Hidden,
    cl::desc("The maximum length of a constant string for a builtin string cmp "
             "call eligible for inlining. The default value is 3."));

static cl::opt<unsigned>
    MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
                          cl::desc("The maximum length of a constant string to "
                                   "inline a memchr call."));

/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
  if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
    return false;

  // As with the one-use checks below, this is not strictly necessary, but we
  // are being cautious to avoid potential perf regressions on targets that
  // do not actually have a funnel/rotate instruction (where the funnel shift
  // would be expanded back into math/shift/logic ops).
  if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
    return false;

  // Match V to funnel shift left/right and capture the source operands and
  // shift amount.
  auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
                             Value *&ShAmt) {
    unsigned Width = V->getType()->getScalarSizeInBits();

    // fshl(ShVal0, ShVal1, ShAmt)
    //  == (ShVal0 << ShAmt) | (ShVal1 >> (Width - ShAmt))
    if (match(V, m_OneUse(m_c_Or(
                     m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
                     m_LShr(m_Value(ShVal1),
                            m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) {
      return Intrinsic::fshl;
    }

    // fshr(ShVal0, ShVal1, ShAmt)
    //  == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
    if (match(V,
              m_OneUse(m_c_Or(m_Shl(m_Value(ShVal0), m_Sub(m_SpecificInt(Width),
                                                           m_Value(ShAmt))),
                              m_LShr(m_Value(ShVal1), m_Deferred(ShAmt)))))) {
      return Intrinsic::fshr;
    }

    return Intrinsic::not_intrinsic;
  };

  // One phi operand must be a funnel/rotate operation, and the other phi
  // operand must be the source value of that funnel/rotate operation:
  // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
  // phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
  // phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
  PHINode &Phi = cast<PHINode>(I);
  unsigned FunnelOp = 0, GuardOp = 1;
  Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
  Value *ShVal0, *ShVal1, *ShAmt;
  Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
  if (IID == Intrinsic::not_intrinsic ||
      (IID == Intrinsic::fshl && ShVal0 != P1) ||
      (IID == Intrinsic::fshr && ShVal1 != P1)) {
    IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
    if (IID == Intrinsic::not_intrinsic ||
        (IID == Intrinsic::fshl && ShVal0 != P0) ||
        (IID == Intrinsic::fshr && ShVal1 != P0))
      return false;
    assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
           "Pattern must match funnel shift left or right");
    std::swap(FunnelOp, GuardOp);
  }

  // The incoming block with our source operand must be the "guard" block.
  // That must contain a cmp+branch to avoid the funnel/rotate when the shift
  // amount is equal to 0. The other incoming block is the block with the
  // funnel/rotate.
  BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
  BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
  Instruction *TermI = GuardBB->getTerminator();

  // Ensure that the shift values dominate each block.
  if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
    return false;

  ICmpInst::Predicate Pred;
  BasicBlock *PhiBB = Phi.getParent();
  if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()),
                         m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
    return false;

  if (Pred != CmpInst::ICMP_EQ)
    return false;

  IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());

  if (ShVal0 == ShVal1)
    ++NumGuardedRotates;
  else
    ++NumGuardedFunnelShifts;

  // If this is not a rotate then the select was blocking poison from the
  // 'shift-by-zero' non-TVal, but a funnel shift won't - so freeze it.
  bool IsFshl = IID == Intrinsic::fshl;
  if (ShVal0 != ShVal1) {
    if (IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal1))
      ShVal1 = Builder.CreateFreeze(ShVal1);
    else if (!IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal0))
      ShVal0 = Builder.CreateFreeze(ShVal0);
  }

  // We matched a variation of this IR pattern:
  // GuardBB:
  //   %cmp = icmp eq i32 %ShAmt, 0
  //   br i1 %cmp, label %PhiBB, label %FunnelBB
  // FunnelBB:
  //   %sub = sub i32 32, %ShAmt
  //   %shr = lshr i32 %ShVal1, %sub
  //   %shl = shl i32 %ShVal0, %ShAmt
  //   %fsh = or i32 %shr, %shl
  //   br label %PhiBB
  // PhiBB:
  //   %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
  // -->
  // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
  Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
  Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
  return true;
}

/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
/// of 'and' ops, then we also need to capture the fact that we saw an
/// "and X, 1", so that's an extra return value for that case.
struct MaskOps {
  Value *Root = nullptr;
  APInt Mask;
  bool MatchAndChain;
  bool FoundAnd1 = false;

  MaskOps(unsigned BitWidth, bool MatchAnds)
      : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
};

/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
/// chain of 'and' or 'or' instructions looking for shift ops of a common source
/// value. Examples:
///   or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
/// returns { X, 0x129 }
///   and (and (X >> 1), 1), (X >> 4)
/// returns { X, 0x12 }
static bool matchAndOrChain(Value *V, MaskOps &MOps) {
  Value *Op0, *Op1;
  if (MOps.MatchAndChain) {
    // Recurse through a chain of 'and' operands. This requires an extra check
    // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
    // in the chain to know that all of the high bits are cleared.
    if (match(V, m_And(m_Value(Op0), m_One()))) {
      MOps.FoundAnd1 = true;
      return matchAndOrChain(Op0, MOps);
    }
    if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
  } else {
    // Recurse through a chain of 'or' operands.
    if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
  }

  // We need a shift-right or a bare value representing a compare of bit 0 of
  // the original source operand.
  Value *Candidate;
  const APInt *BitIndex = nullptr;
  if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
    Candidate = V;

  // Initialize result source operand.
  if (!MOps.Root)
    MOps.Root = Candidate;

  // The shift constant is out-of-range? This code hasn't been simplified.
  if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
    return false;

  // Fill in the mask bit derived from the shift constant.
  MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
  return MOps.Root == Candidate;
}

/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
/// These will include a chain of 'or' or 'and'-shifted bits from a
/// common source value:
///   and (or  (lshr X, C), ...), 1 --> (X & CMask) != 0
///   and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
/// that differ only with a final 'not' of the result. We expect that final
/// 'not' to be folded with the compare that we create here (invert predicate).
static bool foldAnyOrAllBitsSet(Instruction &I) {
  // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
  // final "and X, 1" instruction must be the final op in the sequence.
  bool MatchAllBitsSet;
  if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
    MatchAllBitsSet = true;
  else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
    MatchAllBitsSet = false;
  else
    return false;

  MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
  if (MatchAllBitsSet) {
    if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
      return false;
  } else {
    if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
      return false;
  }

  // The pattern was found. Create a masked compare that replaces all of the
  // shift and logic ops.
  IRBuilder<> Builder(&I);
  Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
  Value *And = Builder.CreateAnd(MOps.Root, Mask);
  Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
                               : Builder.CreateIsNotNull(And);
  Value *Zext = Builder.CreateZExt(Cmp, I.getType());
  I.replaceAllUsesWith(Zext);
  ++NumAnyOrAllBitsSet;
  return true;
}

// Try to recognize below function as popcount intrinsic.
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// Also used in TargetLowering::expandCTPOP().
//
// int popcount(unsigned int i) {
//   i = i - ((i >> 1) & 0x55555555);
//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
//   return (i * 0x01010101) >> 24;
// }
static bool tryToRecognizePopCount(Instruction &I) {
  if (I.getOpcode() != Instruction::LShr)
    return false;

  Type *Ty = I.getType();
  if (!Ty->isIntOrIntVectorTy())
    return false;

  unsigned Len = Ty->getScalarSizeInBits();
  // FIXME: fix Len == 8 and other irregular type lengths.
  if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
    return false;

  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
  APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
  APInt MaskShift = APInt(Len, Len - 8);

  Value *Op0 = I.getOperand(0);
  Value *Op1 = I.getOperand(1);
  Value *MulOp0;
  // Matching "(i * 0x01010101...) >> 24".
  if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
      match(Op1, m_SpecificInt(MaskShift))) {
    Value *ShiftOp0;
    // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
    if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
                                    m_Deferred(ShiftOp0)),
                            m_SpecificInt(Mask0F)))) {
      Value *AndOp0;
      // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
      if (match(ShiftOp0,
                m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
                        m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
                              m_SpecificInt(Mask33))))) {
        Value *Root, *SubOp1;
        // Matching "i - ((i >> 1) & 0x55555555...)".
        if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
            match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
                                m_SpecificInt(Mask55)))) {
          LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
          IRBuilder<> Builder(&I);
          Function *Func = Intrinsic::getDeclaration(
              I.getModule(), Intrinsic::ctpop, I.getType());
          I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
          ++NumPopCountRecognized;
          return true;
        }
      }
    }
  }

  return false;
}

/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and
/// C2 saturate the value of the fp conversion. The transform is not reversible
/// as the fptosi.sat is more defined than the input - all values produce a
/// valid value for the fptosi.sat, whereas some produce poison for original
/// that were out of range of the integer conversion. The reversed pattern may
/// use fmax and fmin instead. As we cannot directly reverse the transform, and
/// it is not always profitable, we make it conditional on the cost being
/// reported as lower by TTI.
static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
  // Look for min(max(fptosi, converting to fptosi_sat.
  Value *In;
  const APInt *MinC, *MaxC;
  if (!match(&I, m_SMax(m_OneUse(m_SMin(m_OneUse(m_FPToSI(m_Value(In))),
                                        m_APInt(MinC))),
                        m_APInt(MaxC))) &&
      !match(&I, m_SMin(m_OneUse(m_SMax(m_OneUse(m_FPToSI(m_Value(In))),
                                        m_APInt(MaxC))),
                        m_APInt(MinC))))
    return false;

  // Check that the constants clamp a saturate.
  if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1)
    return false;

  Type *IntTy = I.getType();
  Type *FpTy = In->getType();
  Type *SatTy =
      IntegerType::get(IntTy->getContext(), (*MinC + 1).exactLogBase2() + 1);
  if (auto *VecTy = dyn_cast<VectorType>(IntTy))
    SatTy = VectorType::get(SatTy, VecTy->getElementCount());

  // Get the cost of the intrinsic, and check that against the cost of
  // fptosi+smin+smax
  InstructionCost SatCost = TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}),
      TTI::TCK_RecipThroughput);
  SatCost += TTI.getCastInstrCost(Instruction::SExt, IntTy, SatTy,
                                  TTI::CastContextHint::None,
                                  TTI::TCK_RecipThroughput);

  InstructionCost MinMaxCost = TTI.getCastInstrCost(
      Instruction::FPToSI, IntTy, FpTy, TTI::CastContextHint::None,
      TTI::TCK_RecipThroughput);
  MinMaxCost += TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}),
      TTI::TCK_RecipThroughput);
  MinMaxCost += TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}),
      TTI::TCK_RecipThroughput);

  if (SatCost >= MinMaxCost)
    return false;

  IRBuilder<> Builder(&I);
  Function *Fn = Intrinsic::getDeclaration(I.getModule(), Intrinsic::fptosi_sat,
                                           {SatTy, FpTy});
  Value *Sat = Builder.CreateCall(Fn, In);
  I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy));
  return true;
}

/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
/// pessimistic codegen that has to account for setting errno and can enable
/// vectorization.
static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
                     TargetLibraryInfo &TLI, AssumptionCache &AC,
                     DominatorTree &DT) {

  Module *M = Call->getModule();

  // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
  // (because NNAN or the operand arg must not be less than -0.0) and (3) we
  // would not end up lowering to a libcall anyway (which could change the value
  // of errno), then:
  // (1) errno won't be set.
  // (2) it is safe to convert this to an intrinsic call.
  Type *Ty = Call->getType();
  Value *Arg = Call->getArgOperand(0);
  if (TTI.haveFastSqrt(Ty) &&
      (Call->hasNoNaNs() ||
       cannotBeOrderedLessThanZero(
           Arg, 0,
           SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) {
    IRBuilder<> Builder(Call);
    IRBuilderBase::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(Call->getFastMathFlags());

    Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty);
    Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt");
    Call->replaceAllUsesWith(NewSqrt);

    // Explicitly erase the old call because a call with side effects is not
    // trivially dead.
    Call->eraseFromParent();
    return true;
  }

  return false;
}

// Check if this array of constants represents a cttz table.
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent cttz results.
static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
                        uint64_t Shift, uint64_t InputBits) {
  unsigned Length = Table.getNumElements();
  if (Length < InputBits || Length > InputBits * 2)
    return false;

  APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
  unsigned Matched = 0;

  for (unsigned i = 0; i < Length; i++) {
    uint64_t Element = Table.getElementAsInteger(i);
    if (Element >= InputBits)
      continue;

    // Check if \p Element matches a concrete answer. It could fail for some
    // elements that are never accessed, so we keep iterating over each element
    // from the table. The number of matched elements should be equal to the
    // number of potential right answers which is \p InputBits actually.
    if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
      Matched++;
  }

  return Matched == InputBits;
}

// Try to recognize table-based ctz implementation.
// E.g., an example in C (for more cases please see the llvm/tests):
// int f(unsigned x) {
//    static const char table[32] =
//      {0, 1, 28, 2, 29, 14, 24, 3, 30,
//       22, 20, 15, 25, 17, 4, 8, 31, 27,
//       13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
//    return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
// }
// This can be lowered to the `cttz` instruction.
// There is also a special case when the element is 0.
//
// Here are some examples of LLVM IR for a 64-bit target:
//
// CASE 1:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 125613361
// %shr = lshr i32 %mul, 27
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table, i64 0,
//     i64 %idxprom
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// CASE 2:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 72416175
// %shr = lshr i32 %mul, 26
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
//     i64 0, i64 %idxprom
// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
//
// CASE 3:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 81224991
// %shr = lshr i32 %mul, 27
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
//     i64 0, i64 %idxprom
// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
//
// CASE 4:
// %sub = sub i64 0, %x
// %and = and i64 %sub, %x
// %mul = mul i64 %and, 283881067100198605
// %shr = lshr i64 %mul, 58
// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
//     i64 %shr
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// All this can be lowered to @llvm.cttz.i32/64 intrinsic.
static bool tryToRecognizeTableBasedCttz(Instruction &I) {
  LoadInst *LI = dyn_cast<LoadInst>(&I);
  if (!LI)
    return false;

  Type *AccessType = LI->getType();
  if (!AccessType->isIntegerTy())
    return false;

  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
  if (!GEP || !GEP->isInBounds() || GEP->getNumIndices() != 2)
    return false;

  if (!GEP->getSourceElementType()->isArrayTy())
    return false;

  uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
  if (ArraySize != 32 && ArraySize != 64)
    return false;

  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
    return false;

  ConstantDataArray *ConstData =
      dyn_cast<ConstantDataArray>(GVTable->getInitializer());
  if (!ConstData)
    return false;

  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
    return false;

  Value *Idx2 = std::next(GEP->idx_begin())->get();
  Value *X1;
  uint64_t MulConst, ShiftConst;
  // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
  // probably fail for other (e.g. 32-bit) targets.
  if (!match(Idx2, m_ZExtOrSelf(
                       m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
                                    m_ConstantInt(MulConst)),
                              m_ConstantInt(ShiftConst)))))
    return false;

  unsigned InputBits = X1->getType()->getScalarSizeInBits();
  if (InputBits != 32 && InputBits != 64)
    return false;

  // Shift should extract top 5..7 bits.
  if (InputBits - Log2_32(InputBits) != ShiftConst &&
      InputBits - Log2_32(InputBits) - 1 != ShiftConst)
    return false;

  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
    return false;

  auto ZeroTableElem = ConstData->getElementAsInteger(0);
  bool DefinedForZero = ZeroTableElem == InputBits;

  IRBuilder<> B(LI);
  ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
  Type *XType = X1->getType();
  auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
  Value *ZExtOrTrunc = nullptr;

  if (DefinedForZero) {
    ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, AccessType);
  } else {
    // If the value in elem 0 isn't the same as InputBits, we still want to
    // produce the value from the table.
    auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
    auto Select =
        B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);

    // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
    // it should be handled as: `cttz(x) & (typeSize - 1)`.

    ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
  }

  LI->replaceAllUsesWith(ZExtOrTrunc);

  return true;
}

/// This is used by foldLoadsRecursive() to capture a Root Load node which is
/// of type or(load, load) and recursively build the wide load. Also capture the
/// shift amount, zero extend type and loadSize.
struct LoadOps {
  LoadInst *Root = nullptr;
  LoadInst *RootInsert = nullptr;
  bool FoundRoot = false;
  uint64_t LoadSize = 0;
  const APInt *Shift = nullptr;
  Type *ZextType;
  AAMDNodes AATags;
};

// Identify and Merge consecutive loads recursively which is of the form
// (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
// (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
                               AliasAnalysis &AA) {
  const APInt *ShAmt2 = nullptr;
  Value *X;
  Instruction *L1, *L2;

  // Go to the last node with loads.
  if (match(V, m_OneUse(m_c_Or(
                   m_Value(X),
                   m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))),
                                  m_APInt(ShAmt2)))))) ||
      match(V, m_OneUse(m_Or(m_Value(X),
                             m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) {
    if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot)
      // Avoid Partial chain merge.
      return false;
  } else
    return false;

  // Check if the pattern has loads
  LoadInst *LI1 = LOps.Root;
  const APInt *ShAmt1 = LOps.Shift;
  if (LOps.FoundRoot == false &&
      (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) ||
       match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))),
                               m_APInt(ShAmt1)))))) {
    LI1 = dyn_cast<LoadInst>(L1);
  }
  LoadInst *LI2 = dyn_cast<LoadInst>(L2);

  // Check if loads are same, atomic, volatile and having same address space.
  if (LI1 == LI2 || !LI1 || !LI2 || !LI1->isSimple() || !LI2->isSimple() ||
      LI1->getPointerAddressSpace() != LI2->getPointerAddressSpace())
    return false;

  // Check if Loads come from same BB.
  if (LI1->getParent() != LI2->getParent())
    return false;

  // Find the data layout
  bool IsBigEndian = DL.isBigEndian();

  // Check if loads are consecutive and same size.
  Value *Load1Ptr = LI1->getPointerOperand();
  APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
  Load1Ptr =
      Load1Ptr->stripAndAccumulateConstantOffsets(DL, Offset1,
                                                  /* AllowNonInbounds */ true);

  Value *Load2Ptr = LI2->getPointerOperand();
  APInt Offset2(DL.getIndexTypeSizeInBits(Load2Ptr->getType()), 0);
  Load2Ptr =
      Load2Ptr->stripAndAccumulateConstantOffsets(DL, Offset2,
                                                  /* AllowNonInbounds */ true);

  // Verify if both loads have same base pointers and load sizes are same.
  uint64_t LoadSize1 = LI1->getType()->getPrimitiveSizeInBits();
  uint64_t LoadSize2 = LI2->getType()->getPrimitiveSizeInBits();
  if (Load1Ptr != Load2Ptr || LoadSize1 != LoadSize2)
    return false;

  // Support Loadsizes greater or equal to 8bits and only power of 2.
  if (LoadSize1 < 8 || !isPowerOf2_64(LoadSize1))
    return false;

  // Alias Analysis to check for stores b/w the loads.
  LoadInst *Start = LOps.FoundRoot ? LOps.RootInsert : LI1, *End = LI2;
  MemoryLocation Loc;
  if (!Start->comesBefore(End)) {
    std::swap(Start, End);
    Loc = MemoryLocation::get(End);
    if (LOps.FoundRoot)
      Loc = Loc.getWithNewSize(LOps.LoadSize);
  } else
    Loc = MemoryLocation::get(End);
  unsigned NumScanned = 0;
  for (Instruction &Inst :
       make_range(Start->getIterator(), End->getIterator())) {
    if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
      return false;

    // Ignore debug info so that's not counted against MaxInstrsToScan.
    // Otherwise debug info could affect codegen.
    if (!isa<DbgInfoIntrinsic>(Inst) && ++NumScanned > MaxInstrsToScan)
      return false;
  }

  // Make sure Load with lower Offset is at LI1
  bool Reverse = false;
  if (Offset2.slt(Offset1)) {
    std::swap(LI1, LI2);
    std::swap(ShAmt1, ShAmt2);
    std::swap(Offset1, Offset2);
    std::swap(Load1Ptr, Load2Ptr);
    std::swap(LoadSize1, LoadSize2);
    Reverse = true;
  }

  // Big endian swap the shifts
  if (IsBigEndian)
    std::swap(ShAmt1, ShAmt2);

  // Find Shifts values.
  uint64_t Shift1 = 0, Shift2 = 0;
  if (ShAmt1)
    Shift1 = ShAmt1->getZExtValue();
  if (ShAmt2)
    Shift2 = ShAmt2->getZExtValue();

  // First load is always LI1. This is where we put the new load.
  // Use the merged load size available from LI1 for forward loads.
  if (LOps.FoundRoot) {
    if (!Reverse)
      LoadSize1 = LOps.LoadSize;
    else
      LoadSize2 = LOps.LoadSize;
  }

  // Verify if shift amount and load index aligns and verifies that loads
  // are consecutive.
  uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
  uint64_t PrevSize =
      DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
  if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
    return false;

  // Update LOps
  AAMDNodes AATags1 = LOps.AATags;
  AAMDNodes AATags2 = LI2->getAAMetadata();
  if (LOps.FoundRoot == false) {
    LOps.FoundRoot = true;
    AATags1 = LI1->getAAMetadata();
  }
  LOps.LoadSize = LoadSize1 + LoadSize2;
  LOps.RootInsert = Start;

  // Concatenate the AATags of the Merged Loads.
  LOps.AATags = AATags1.concat(AATags2);

  LOps.Root = LI1;
  LOps.Shift = ShAmt1;
  LOps.ZextType = X->getType();
  return true;
}

// For a given BB instruction, evaluate all loads in the chain that form a
// pattern which suggests that the loads can be combined. The one and only use
// of the loads is to form a wider load.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
                                 TargetTransformInfo &TTI, AliasAnalysis &AA,
                                 const DominatorTree &DT) {
  // Only consider load chains of scalar values.
  if (isa<VectorType>(I.getType()))
    return false;

  LoadOps LOps;
  if (!foldLoadsRecursive(&I, LOps, DL, AA) || !LOps.FoundRoot)
    return false;

  IRBuilder<> Builder(&I);
  LoadInst *NewLoad = nullptr, *LI1 = LOps.Root;

  IntegerType *WiderType = IntegerType::get(I.getContext(), LOps.LoadSize);
  // TTI based checks if we want to proceed with wider load
  bool Allowed = TTI.isTypeLegal(WiderType);
  if (!Allowed)
    return false;

  unsigned AS = LI1->getPointerAddressSpace();
  unsigned Fast = 0;
  Allowed = TTI.allowsMisalignedMemoryAccesses(I.getContext(), LOps.LoadSize,
                                               AS, LI1->getAlign(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Get the Index and Ptr for the new GEP.
  Value *Load1Ptr = LI1->getPointerOperand();
  Builder.SetInsertPoint(LOps.RootInsert);
  if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
    APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
    Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
        DL, Offset1, /* AllowNonInbounds */ true);
    Load1Ptr = Builder.CreatePtrAdd(Load1Ptr, Builder.getInt(Offset1));
  }
  // Generate wider load.
  NewLoad = Builder.CreateAlignedLoad(WiderType, Load1Ptr, LI1->getAlign(),
                                      LI1->isVolatile(), "");
  NewLoad->takeName(LI1);
  // Set the New Load AATags Metadata.
  if (LOps.AATags)
    NewLoad->setAAMetadata(LOps.AATags);

  Value *NewOp = NewLoad;
  // Check if zero extend needed.
  if (LOps.ZextType)
    NewOp = Builder.CreateZExt(NewOp, LOps.ZextType);

  // Check if shift needed. We need to shift with the amount of load1
  // shift if not zero.
  if (LOps.Shift)
    NewOp =
        Builder.CreateShl(NewOp, ConstantInt::get(I.getContext(), *LOps.Shift));
  I.replaceAllUsesWith(NewOp);

  return true;
}

// Calculate GEP Stride and accumulated const ModOffset. Return Stride and
// ModOffset
static std::pair<APInt, APInt>
getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL) {
  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
  std::optional<APInt> Stride;
  APInt ModOffset(BW, 0);
  // Return a minimum gep stride, greatest common divisor of consecutive gep
  // index scales (c.f. Bézout's identity).
  while (auto *GEP = dyn_cast<GEPOperator>(PtrOp)) {
    MapVector<Value *, APInt> VarOffsets;
    if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
      break;

    for (auto [V, Scale] : VarOffsets) {
      // Only keep a power of two factor for non-inbounds
      if (!GEP->isInBounds())
        Scale = APInt::getOneBitSet(Scale.getBitWidth(), Scale.countr_zero());

      if (!Stride)
        Stride = Scale;
      else
        Stride = APIntOps::GreatestCommonDivisor(*Stride, Scale);
    }

    PtrOp = GEP->getPointerOperand();
  }

  // Check whether pointer arrives back at Global Variable via at least one GEP.
  // Even if it doesn't, we can check by alignment.
  if (!isa<GlobalVariable>(PtrOp) || !Stride)
    return {APInt(BW, 1), APInt(BW, 0)};

  // In consideration of signed GEP indices, non-negligible offset become
  // remainder of division by minimum GEP stride.
  ModOffset = ModOffset.srem(*Stride);
  if (ModOffset.isNegative())
    ModOffset += *Stride;

  return {*Stride, ModOffset};
}

/// If C is a constant patterned array and all valid loaded results for given
/// alignment are same to a constant, return that constant.
static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
  auto *LI = dyn_cast<LoadInst>(&I);
  if (!LI || LI->isVolatile())
    return false;

  // We can only fold the load if it is from a constant global with definitive
  // initializer. Skip expensive logic if this is not the case.
  auto *PtrOp = LI->getPointerOperand();
  auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(PtrOp));
  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
    return false;

  // Bail for large initializers in excess of 4K to avoid too many scans.
  Constant *C = GV->getInitializer();
  uint64_t GVSize = DL.getTypeAllocSize(C->getType());
  if (!GVSize || 4096 < GVSize)
    return false;

  Type *LoadTy = LI->getType();
  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
  auto [Stride, ConstOffset] = getStrideAndModOffsetOfGEP(PtrOp, DL);

  // Any possible offset could be multiple of GEP stride. And any valid
  // offset is multiple of load alignment, so checking only multiples of bigger
  // one is sufficient to say results' equality.
  if (auto LA = LI->getAlign();
      LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) {
    ConstOffset = APInt(BW, 0);
    Stride = APInt(BW, LA.value());
  }

  Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
  if (!Ca)
    return false;

  unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
  for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
    if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
      return false;

  I.replaceAllUsesWith(Ca);

  return true;
}

namespace {
class StrNCmpInliner {
public:
  StrNCmpInliner(CallInst *CI, LibFunc Func, DomTreeUpdater *DTU,
                 const DataLayout &DL)
      : CI(CI), Func(Func), DTU(DTU), DL(DL) {}

  bool optimizeStrNCmp();

private:
  void inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped);

  CallInst *CI;
  LibFunc Func;
  DomTreeUpdater *DTU;
  const DataLayout &DL;
};

} // namespace

/// First we normalize calls to strncmp/strcmp to the form of
/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
/// (without considering '\0').
///
/// Examples:
///
/// \code
///   strncmp(s, "a", 3) -> compare(s, "a", 2)
///   strncmp(s, "abc", 3) -> compare(s, "abc", 3)
///   strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
///   strcmp(s, "a") -> compare(s, "a", 2)
///
///   char s2[] = {'a'}
///   strncmp(s, s2, 3) -> compare(s, s2, 3)
///
///   char s2[] = {'a', 'b', 'c', 'd'}
///   strncmp(s, s2, 3) -> compare(s, s2, 3)
/// \endcode
///
/// We only handle cases where N and exactly one of s1 and s2 are constant.
/// Cases where s1 and s2 are both constant are already handled by the
/// instcombine pass.
///
/// We do not handle cases where N > StrNCmpInlineThreshold.
///
/// We also do not handle cases where N < 2, which are already
/// handled by the instcombine pass.
///
bool StrNCmpInliner::optimizeStrNCmp() {
  if (StrNCmpInlineThreshold < 2)
    return false;

  if (!isOnlyUsedInZeroComparison(CI))
    return false;

  Value *Str1P = CI->getArgOperand(0);
  Value *Str2P = CI->getArgOperand(1);
  // Should be handled elsewhere.
  if (Str1P == Str2P)
    return false;

  StringRef Str1, Str2;
  bool HasStr1 = getConstantStringInfo(Str1P, Str1, /*TrimAtNul=*/false);
  bool HasStr2 = getConstantStringInfo(Str2P, Str2, /*TrimAtNul=*/false);
  if (HasStr1 == HasStr2)
    return false;

  // Note that '\0' and characters after it are not trimmed.
  StringRef Str = HasStr1 ? Str1 : Str2;
  Value *StrP = HasStr1 ? Str2P : Str1P;

  size_t Idx = Str.find('\0');
  uint64_t N = Idx == StringRef::npos ? UINT64_MAX : Idx + 1;
  if (Func == LibFunc_strncmp) {
    if (auto *ConstInt = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
      N = std::min(N, ConstInt->getZExtValue());
    else
      return false;
  }
  // Now N means how many bytes we need to compare at most.
  if (N > Str.size() || N < 2 || N > StrNCmpInlineThreshold)
    return false;

  // Cases where StrP has two or more dereferenceable bytes might be better
  // optimized elsewhere.
  bool CanBeNull = false, CanBeFreed = false;
  if (StrP->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed) > 1)
    return false;
  inlineCompare(StrP, Str, N, HasStr1);
  return true;
}

/// Convert
///
/// \code
///   ret = compare(s1, s2, N)
/// \endcode
///
/// into
///
/// \code
///   ret = (int)s1[0] - (int)s2[0]
///   if (ret != 0)
///     goto NE
///   ...
///   ret = (int)s1[N-2] - (int)s2[N-2]
///   if (ret != 0)
///     goto NE
///   ret = (int)s1[N-1] - (int)s2[N-1]
///   NE:
/// \endcode
///
/// CFG before and after the transformation:
///
/// (before)
/// BBCI
///
/// (after)
/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
///                 |                    ^
///                 E                    |
///                 |                    |
///        BBSubs[1] (sub,icmp) --NE-----+
///                ...                   |
///        BBSubs[N-1]    (sub) ---------+
///
void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
                                   bool Swapped) {
  auto &Ctx = CI->getContext();
  IRBuilder<> B(Ctx);

  BasicBlock *BBCI = CI->getParent();
  BasicBlock *BBTail =
      SplitBlock(BBCI, CI, DTU, nullptr, nullptr, BBCI->getName() + ".tail");

  SmallVector<BasicBlock *> BBSubs;
  for (uint64_t I = 0; I < N; ++I)
    BBSubs.push_back(
        BasicBlock::Create(Ctx, "sub_" + Twine(I), BBCI->getParent(), BBTail));
  BasicBlock *BBNE = BasicBlock::Create(Ctx, "ne", BBCI->getParent(), BBTail);

  cast<BranchInst>(BBCI->getTerminator())->setSuccessor(0, BBSubs[0]);

  B.SetInsertPoint(BBNE);
  PHINode *Phi = B.CreatePHI(CI->getType(), N);
  B.CreateBr(BBTail);

  Value *Base = LHS;
  for (uint64_t i = 0; i < N; ++i) {
    B.SetInsertPoint(BBSubs[i]);
    Value *VL =
        B.CreateZExt(B.CreateLoad(B.getInt8Ty(),
                                  B.CreateInBoundsPtrAdd(Base, B.getInt64(i))),
                     CI->getType());
    Value *VR =
        ConstantInt::get(CI->getType(), static_cast<unsigned char>(RHS[i]));
    Value *Sub = Swapped ? B.CreateSub(VR, VL) : B.CreateSub(VL, VR);
    if (i < N - 1)
      B.CreateCondBr(B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)),
                     BBNE, BBSubs[i + 1]);
    else
      B.CreateBr(BBNE);

    Phi->addIncoming(Sub, BBSubs[i]);
  }

  CI->replaceAllUsesWith(Phi);
  CI->eraseFromParent();

  if (DTU) {
    SmallVector<DominatorTree::UpdateType, 8> Updates;
    Updates.push_back({DominatorTree::Insert, BBCI, BBSubs[0]});
    for (uint64_t i = 0; i < N; ++i) {
      if (i < N - 1)
        Updates.push_back({DominatorTree::Insert, BBSubs[i], BBSubs[i + 1]});
      Updates.push_back({DominatorTree::Insert, BBSubs[i], BBNE});
    }
    Updates.push_back({DominatorTree::Insert, BBNE, BBTail});
    Updates.push_back({DominatorTree::Delete, BBCI, BBTail});
    DTU->applyUpdates(Updates);
  }
}

/// Convert memchr with a small constant string into a switch
static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
                       const DataLayout &DL) {
  if (isa<Constant>(Call->getArgOperand(1)))
    return false;

  StringRef Str;
  Value *Base = Call->getArgOperand(0);
  if (!getConstantStringInfo(Base, Str, /*TrimAtNul=*/false))
    return false;

  uint64_t N = Str.size();
  if (auto *ConstInt = dyn_cast<ConstantInt>(Call->getArgOperand(2))) {
    uint64_t Val = ConstInt->getZExtValue();
    // Ignore the case that n is larger than the size of string.
    if (Val > N)
      return false;
    N = Val;
  } else
    return false;

  if (N > MemChrInlineThreshold)
    return false;

  BasicBlock *BB = Call->getParent();
  BasicBlock *BBNext = SplitBlock(BB, Call, DTU);
  IRBuilder<> IRB(BB);
  IntegerType *ByteTy = IRB.getInt8Ty();
  BB->getTerminator()->eraseFromParent();
  SwitchInst *SI = IRB.CreateSwitch(
      IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
  Type *IndexTy = DL.getIndexType(Call->getType());
  SmallVector<DominatorTree::UpdateType, 8> Updates;

  BasicBlock *BBSuccess = BasicBlock::Create(
      Call->getContext(), "memchr.success", BB->getParent(), BBNext);
  IRB.SetInsertPoint(BBSuccess);
  PHINode *IndexPHI = IRB.CreatePHI(IndexTy, N, "memchr.idx");
  Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Base, IndexPHI);
  IRB.CreateBr(BBNext);
  if (DTU)
    Updates.push_back({DominatorTree::Insert, BBSuccess, BBNext});

  SmallPtrSet<ConstantInt *, 4> Cases;
  for (uint64_t I = 0; I < N; ++I) {
    ConstantInt *CaseVal = ConstantInt::get(ByteTy, Str[I]);
    if (!Cases.insert(CaseVal).second)
      continue;

    BasicBlock *BBCase = BasicBlock::Create(Call->getContext(), "memchr.case",
                                            BB->getParent(), BBSuccess);
    SI->addCase(CaseVal, BBCase);
    IRB.SetInsertPoint(BBCase);
    IndexPHI->addIncoming(ConstantInt::get(IndexTy, I), BBCase);
    IRB.CreateBr(BBSuccess);
    if (DTU) {
      Updates.push_back({DominatorTree::Insert, BB, BBCase});
      Updates.push_back({DominatorTree::Insert, BBCase, BBSuccess});
    }
  }

  PHINode *PHI =
      PHINode::Create(Call->getType(), 2, Call->getName(), BBNext->begin());
  PHI->addIncoming(Constant::getNullValue(Call->getType()), BB);
  PHI->addIncoming(FirstOccursLocation, BBSuccess);

  Call->replaceAllUsesWith(PHI);
  Call->eraseFromParent();

  if (DTU)
    DTU->applyUpdates(Updates);

  return true;
}

static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
                         TargetLibraryInfo &TLI, AssumptionCache &AC,
                         DominatorTree &DT, const DataLayout &DL,
                         bool &MadeCFGChange) {

  auto *CI = dyn_cast<CallInst>(&I);
  if (!CI || CI->isNoBuiltin())
    return false;

  Function *CalledFunc = CI->getCalledFunction();
  if (!CalledFunc)
    return false;

  LibFunc LF;
  if (!TLI.getLibFunc(*CalledFunc, LF) ||
      !isLibFuncEmittable(CI->getModule(), &TLI, LF))
    return false;

  DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);

  switch (LF) {
  case LibFunc_sqrt:
  case LibFunc_sqrtf:
  case LibFunc_sqrtl:
    return foldSqrt(CI, LF, TTI, TLI, AC, DT);
  case LibFunc_strcmp:
  case LibFunc_strncmp:
    if (StrNCmpInliner(CI, LF, &DTU, DL).optimizeStrNCmp()) {
      MadeCFGChange = true;
      return true;
    }
    break;
  case LibFunc_memchr:
    if (foldMemChr(CI, &DTU, DL)) {
      MadeCFGChange = true;
      return true;
    }
    break;
  default:;
  }
  return false;
}

/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
                                TargetTransformInfo &TTI,
                                TargetLibraryInfo &TLI, AliasAnalysis &AA,
                                AssumptionCache &AC, bool &MadeCFGChange) {
  bool MadeChange = false;
  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;

    const DataLayout &DL = F.getDataLayout();

    // Walk the block backwards for efficiency. We're matching a chain of
    // use->defs, so we're more likely to succeed by starting from the bottom.
    // Also, we want to avoid matching partial patterns.
    // TODO: It would be more efficient if we removed dead instructions
    // iteratively in this loop rather than waiting until the end.
    for (Instruction &I : make_early_inc_range(llvm::reverse(BB))) {
      MadeChange |= foldAnyOrAllBitsSet(I);
      MadeChange |= foldGuardedFunnelShift(I, DT);
      MadeChange |= tryToRecognizePopCount(I);
      MadeChange |= tryToFPToSat(I, TTI);
      MadeChange |= tryToRecognizeTableBasedCttz(I);
      MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
      MadeChange |= foldPatternedLoads(I, DL);
      // NOTE: This function may erase the instruction `I`, so it needs to be
      // called at the end of this sequence, otherwise we may introduce bugs.
      MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
    }
  }

  // We're done with transforms, so remove dead instructions.
  if (MadeChange)
    for (BasicBlock &BB : F)
      SimplifyInstructionsInBlock(&BB);

  return MadeChange;
}

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI,
                    TargetLibraryInfo &TLI, DominatorTree &DT,
                    AliasAnalysis &AA, bool &MadeCFGChange) {
  bool MadeChange = false;
  const DataLayout &DL = F.getDataLayout();
  TruncInstCombine TIC(AC, TLI, DL, DT);
  MadeChange |= TIC.run(F);
  MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, MadeCFGChange);
  return MadeChange;
}

PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  bool MadeCFGChange = false;
  if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
    // No changes, all analyses are preserved.
    return PreservedAnalyses::all();
  }
  // Mark all the analyses that instcombine updates as preserved.
  PreservedAnalyses PA;
  if (MadeCFGChange)
    PA.preserve<DominatorTreeAnalysis>();
  else
    PA.preserveSet<CFGAnalyses>();
  return PA;
}