Path: blob/main/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp
//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach. It should eventually be removed.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/CodeGenPrepare.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
"llvm/Transforms/Utils/BypassSlowDivision.h"92#include "llvm/Transforms/Utils/Local.h"93#include "llvm/Transforms/Utils/SimplifyLibCalls.h"94#include "llvm/Transforms/Utils/SizeOpts.h"95#include <algorithm>96#include <cassert>97#include <cstdint>98#include <iterator>99#include <limits>100#include <memory>101#include <optional>102#include <utility>103#include <vector>104105using namespace llvm;106using namespace llvm::PatternMatch;107108#define DEBUG_TYPE "codegenprepare"109110STATISTIC(NumBlocksElim, "Number of blocks eliminated");111STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");112STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");113STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "114"sunken Cmps");115STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "116"of sunken Casts");117STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "118"computations were sunk");119STATISTIC(NumMemoryInstsPhiCreated,120"Number of phis created when address "121"computations were sunk to memory instructions");122STATISTIC(NumMemoryInstsSelectCreated,123"Number of select created when address "124"computations were sunk to memory instructions");125STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");126STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");127STATISTIC(NumAndsAdded,128"Number of and mask instructions added to form ext loads");129STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");130STATISTIC(NumRetsDup, "Number of return instructions duplicated");131STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");132STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");133STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");134135static cl::opt<bool> DisableBranchOpts(136"disable-cgp-branch-opts", cl::Hidden, cl::init(false),137cl::desc("Disable branch optimizations in CodeGenPrepare"));138139static cl::opt<bool>140DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),141cl::desc("Disable GC optimizations in CodeGenPrepare"));142143static cl::opt<bool>144DisableSelectToBranch("disable-cgp-select2branch", cl::Hidden,145cl::init(false),146cl::desc("Disable select to branch conversion."));147148static cl::opt<bool>149AddrSinkUsingGEPs("addr-sink-using-gep", cl::Hidden, cl::init(true),150cl::desc("Address sinking in CGP using GEPs."));151152static cl::opt<bool>153EnableAndCmpSinking("enable-andcmp-sinking", cl::Hidden, cl::init(true),154cl::desc("Enable sinkinig and/cmp into branches."));155156static cl::opt<bool> DisableStoreExtract(157"disable-cgp-store-extract", cl::Hidden, cl::init(false),158cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));159160static cl::opt<bool> StressStoreExtract(161"stress-cgp-store-extract", cl::Hidden, cl::init(false),162cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));163164static cl::opt<bool> DisableExtLdPromotion(165"disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),166cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "167"CodeGenPrepare"));168169static cl::opt<bool> StressExtLdPromotion(170"stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),171cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "172"optimization in CodeGenPrepare"));173174static cl::opt<bool> DisablePreheaderProtect(175"disable-preheader-prot", cl::Hidden, 
    cl::desc("Disable protection against removing loop preheaders"));

static cl::opt<bool> ProfileGuidedSectionPrefix(
    "profile-guided-section-prefix", cl::Hidden, cl::init(true),
    cl::desc("Use profile info to add section prefix for hot/cold functions"));

static cl::opt<bool> ProfileUnknownInSpecialSection(
    "profile-unknown-in-special-section", cl::Hidden,
    cl::desc("In profiling mode like sampleFDO, if a function doesn't have "
             "profile, we cannot tell the function is cold for sure because "
             "it may be a function newly added without ever being sampled. "
             "With the flag enabled, compiler can put such profile unknown "
             "functions into a special section, so runtime system can choose "
             "to handle it in a different way than .text section, to save "
             "RAM for example. "));

static cl::opt<bool> BBSectionsGuidedSectionPrefix(
    "bbsections-guided-section-prefix", cl::Hidden, cl::init(true),
    cl::desc("Use the basic-block-sections profile to determine the text "
             "section prefix for hot functions. Functions with "
             "basic-block-sections profile will be placed in `.text.hot` "
             "regardless of their FDO profile info. Other functions won't be "
             "impacted, i.e., their prefixes will be decided by FDO/sampleFDO "
             "profiles."));

static cl::opt<uint64_t> FreqRatioToSkipMerge(
    "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
    cl::desc("Skip merging empty blocks if (frequency of empty block) / "
             "(frequency of destination block) is greater than this ratio"));

static cl::opt<bool> ForceSplitStore(
    "force-split-store", cl::Hidden, cl::init(false),
    cl::desc("Force store splitting no matter what the target query says."));

static cl::opt<bool> EnableTypePromotionMerge(
    "cgp-type-promotion-merge", cl::Hidden,
    cl::desc("Enable merging of redundant sexts when one is dominating"
             " the other."),
    cl::init(true));

static cl::opt<bool> DisableComplexAddrModes(
    "disable-complex-addr-modes", cl::Hidden, cl::init(false),
    cl::desc("Disables combining addressing modes with different parts "
             "in optimizeMemoryInst."));

static cl::opt<bool>
    AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
                    cl::desc("Allow creation of Phis in Address sinking."));

static cl::opt<bool> AddrSinkNewSelects(
    "addr-sink-new-select", cl::Hidden, cl::init(true),
    cl::desc("Allow creation of selects in Address sinking."));

static cl::opt<bool> AddrSinkCombineBaseReg(
    "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of BaseReg field in Address sinking."));

static cl::opt<bool> AddrSinkCombineBaseGV(
    "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of BaseGV field in Address sinking."));

static cl::opt<bool> AddrSinkCombineBaseOffs(
    "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of BaseOffs field in Address sinking."));

static cl::opt<bool> AddrSinkCombineScaledReg(
    "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of ScaledReg field in Address sinking."));

static cl::opt<bool>
    EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
                         cl::init(true),
                         cl::desc("Enable splitting large offset of GEP."));

static cl::opt<bool> EnableICMP_EQToICMP_ST(
    "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
    cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));

static cl::opt<bool>
    VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
                     cl::desc("Enable BFI update verification for "
                              "CodeGenPrepare."));

static cl::opt<bool>
    OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(true),
                     cl::desc("Enable converting phi types in CodeGenPrepare"));

static cl::opt<unsigned>
    HugeFuncThresholdInCGPP("cgpp-huge-func", cl::init(10000), cl::Hidden,
                            cl::desc("Least BB number of huge function."));

static cl::opt<unsigned>
    MaxAddressUsersToScan("cgp-max-address-users-to-scan", cl::init(100),
                          cl::Hidden,
                          cl::desc("Max number of address users to look at"));

static cl::opt<bool>
    DisableDeletePHIs("disable-cgp-delete-phis", cl::Hidden, cl::init(false),
                      cl::desc("Disable elimination of dead PHI nodes."));

namespace {

enum ExtType {
  ZeroExtension, // Zero extension has been seen.
  SignExtension, // Sign extension has been seen.
  BothExtension  // This extension type is used if we saw sext after
                 // ZeroExtension had been set, or if we saw zext after
                 // SignExtension had been set. It makes the type
                 // information of a promoted instruction invalid.
};

enum ModifyDT {
  NotModifyDT,  // Not Modify any DT.
  ModifyBBDT,   // Modify the Basic Block Dominator Tree.
  ModifyInstDT  // Modify the Instruction Dominator in a Basic Block,
                // This usually means we move/delete/insert instruction
                // in a Basic Block. So we should re-iterate instructions
                // in such Basic Block.
};

using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
using SExts = SmallVector<Instruction *, 16>;
using ValueToSExts = MapVector<Value *, SExts>;

class TypePromotionTransaction;

class CodeGenPrepare {
  friend class CodeGenPrepareLegacyPass;
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *SubtargetInfo = nullptr;
  const TargetLowering *TLI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  LoopInfo *LI = nullptr;
  std::unique_ptr<BlockFrequencyInfo> BFI;
  std::unique_ptr<BranchProbabilityInfo> BPI;
  ProfileSummaryInfo *PSI = nullptr;

  /// As we scan instructions optimizing them, this is the next instruction
  /// to optimize. Transforms that can invalidate this should update it.
  BasicBlock::iterator CurInstIterator;

  /// Keeps track of non-local addresses that have been sunk into a block.
  /// This allows us to avoid inserting duplicate code for blocks with
  /// multiple load/stores of the same address. The usage of WeakTrackingVH
  /// enables SunkAddrs to be treated as a cache whose entries can be
  /// invalidated if a sunken address computation has been erased.
  ValueMap<Value *, WeakTrackingVH> SunkAddrs;

  /// Keeps track of all instructions inserted for the current function.
  SetOfInstrs InsertedInsts;

  /// Keeps track of the types of the related instructions before their
  /// promotion for the current function.
  InstrToOrigTy PromotedInsts;

  /// Keep track of instructions removed during promotion.
  SetOfInstrs RemovedInsts;

  /// Keep track of sext chains based on their initial value.
  DenseMap<Value *, Instruction *> SeenChainsForSExt;

  /// Keep track of GEPs accessing the same data structures such as structs or
  /// arrays that are candidates to be split later because of their large
  /// size.
  MapVector<AssertingVH<Value>,
            SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
      LargeOffsetGEPMap;

  /// Keep track of new GEP base after splitting the GEPs having large offset.
  SmallSet<AssertingVH<Value>, 2> NewGEPBases;

  /// Map serial numbers to Large offset GEPs.
  DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;

  /// Keep track of promoted SExts.
  ValueToSExts ValToSExtendedUses;

  /// True if the function has the OptSize attribute.
  bool OptSize;

  /// DataLayout for the Function being processed.
  const DataLayout *DL = nullptr;

  /// Building the dominator tree can be expensive, so we only build it
  /// lazily and update it when required.
  std::unique_ptr<DominatorTree> DT;

public:
  CodeGenPrepare(){};
  CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
  /// If we encounter a huge function, we need to limit the build time.
  bool IsHugeFunc = false;

  /// FreshBBs is like a worklist; it collects the updated BBs which need
  /// to be optimized again.
  /// Note: To limit build time in this pass, when a BB is updated we need
  /// to insert it into FreshBBs for a huge function.
  SmallSet<BasicBlock *, 32> FreshBBs;

  void releaseMemory() {
    // Clear per function information.
    InsertedInsts.clear();
    PromotedInsts.clear();
    FreshBBs.clear();
    BPI.reset();
    BFI.reset();
  }

  bool run(Function &F, FunctionAnalysisManager &AM);

private:
  template <typename F>
  void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
    // Substituting can cause recursive simplifications, which can invalidate
    // our iterator. Use a WeakTrackingVH to hold onto it in case this
    // happens.
    Value *CurValue = &*CurInstIterator;
    WeakTrackingVH IterHandle(CurValue);

    f();

    // If the iterator instruction was recursively deleted, start over at the
    // start of the block.
    if (IterHandle != CurValue) {
      CurInstIterator = BB->begin();
      SunkAddrs.clear();
    }
  }

  // Get the DominatorTree, building if necessary.
  DominatorTree &getDT(Function &F) {
    if (!DT)
      DT = std::make_unique<DominatorTree>(F);
    return *DT;
  }

  void removeAllAssertingVHReferences(Value *V);
  bool eliminateAssumptions(Function &F);
  bool eliminateFallThrough(Function &F, DominatorTree *DT = nullptr);
  bool eliminateMostlyEmptyBlocks(Function &F);
  BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
  bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
  void eliminateMostlyEmptyBlock(BasicBlock *BB);
  bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
                                     bool isPreheader);
  bool makeBitReverse(Instruction &I);
  bool optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT);
  bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT);
  bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
                          unsigned AddrSpace);
  bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
  bool optimizeInlineAsmInst(CallInst *CS);
  bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
  bool optimizeExt(Instruction *&I);
  bool optimizeExtUses(Instruction *I);
  bool optimizeLoadExt(LoadInst *Load);
  bool optimizeShiftInst(BinaryOperator *BO);
  bool optimizeFunnelShift(IntrinsicInst *Fsh);
  bool optimizeSelectInst(SelectInst *SI);
  bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
  bool optimizeSwitchType(SwitchInst *SI);
  bool optimizeSwitchPhiConstants(SwitchInst *SI);
  bool optimizeSwitchInst(SwitchInst *SI);
  bool optimizeExtractElementInst(Instruction *Inst);
  bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
  bool fixupDbgValue(Instruction *I);
  bool fixupDbgVariableRecord(DbgVariableRecord &I);
  bool fixupDbgVariableRecordsOnInst(Instruction &I);
  bool placeDbgValues(Function &F);
  bool placePseudoProbes(Function &F);
  bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
                    LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
  bool tryToPromoteExts(TypePromotionTransaction &TPT,
                        const SmallVectorImpl<Instruction *> &Exts,
                        SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
                        unsigned CreatedInstsCost = 0);
  bool mergeSExts(Function &F);
  bool splitLargeGEPOffsets();
  bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
                       SmallPtrSetImpl<Instruction *> &DeletedInstrs);
  bool optimizePhiTypes(Function &F);
  bool performAddressTypePromotion(
      Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
      bool HasPromoted, TypePromotionTransaction &TPT,
      SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
  bool splitBranchCondition(Function &F, ModifyDT &ModifiedDT);
  bool simplifyOffsetableRelocate(GCStatepointInst &I);

  bool tryToSinkFreeOperands(Instruction *I);
  bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
                                   CmpInst *Cmp, Intrinsic::ID IID);
  bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
  bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
  bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
  void verifyBFIUpdates(Function &F);
  bool _run(Function &F);
};

class CodeGenPrepareLegacyPass : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  CodeGenPrepareLegacyPass() : FunctionPass(ID) {
    initializeCodeGenPrepareLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "CodeGen Prepare"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // FIXME: When we can selectively preserve passes, preserve the domtree.
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addUsedIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
  }
};

} // end anonymous namespace

char CodeGenPrepareLegacyPass::ID = 0;

bool CodeGenPrepareLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  auto TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  CodeGenPrepare CGP(TM);
  CGP.DL = &F.getDataLayout();
  CGP.SubtargetInfo = TM->getSubtargetImpl(F);
  CGP.TLI = CGP.SubtargetInfo->getTargetLowering();
  CGP.TRI = CGP.SubtargetInfo->getRegisterInfo();
  CGP.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  CGP.TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  CGP.LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  CGP.BPI.reset(new BranchProbabilityInfo(F, *CGP.LI));
  CGP.BFI.reset(new BlockFrequencyInfo(F, *CGP.BPI, *CGP.LI));
  CGP.PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  auto BBSPRWP =
      getAnalysisIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
  CGP.BBSectionsProfileReader = BBSPRWP ? &BBSPRWP->getBBSPR() : nullptr;

  return CGP._run(F);
}

INITIALIZE_PASS_BEGIN(CodeGenPrepareLegacyPass, DEBUG_TYPE,
                      "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(CodeGenPrepareLegacyPass, DEBUG_TYPE,
                    "Optimize for code generation", false, false)

FunctionPass *llvm::createCodeGenPrepareLegacyPass() {
  return new CodeGenPrepareLegacyPass();
}

PreservedAnalyses CodeGenPreparePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  CodeGenPrepare CGP(TM);

  bool Changed = CGP.run(F, AM);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserve<TargetLibraryAnalysis>();
  PA.preserve<TargetIRAnalysis>();
  PA.preserve<LoopAnalysis>();
  return PA;
}

bool CodeGenPrepare::run(Function &F, FunctionAnalysisManager &AM) {
  DL = &F.getDataLayout();
  SubtargetInfo = TM->getSubtargetImpl(F);
  TLI = SubtargetInfo->getTargetLowering();
  TRI = SubtargetInfo->getRegisterInfo();
  TLInfo = &AM.getResult<TargetLibraryAnalysis>(F);
  TTI = &AM.getResult<TargetIRAnalysis>(F);
  LI = &AM.getResult<LoopAnalysis>(F);
  BPI.reset(new BranchProbabilityInfo(F, *LI));
  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BBSectionsProfileReader =
      AM.getCachedResult<BasicBlockSectionsProfileReaderAnalysis>(F);
  return _run(F);
}

bool CodeGenPrepare::_run(Function &F) {
  bool EverMadeChange = false;

  OptSize = F.hasOptSize();
  // Use the basic-block-sections profile to promote hot functions to .text.hot
  // if requested.
  if (BBSectionsGuidedSectionPrefix && BBSectionsProfileReader &&
      BBSectionsProfileReader->isFunctionHot(F.getName())) {
    F.setSectionPrefix("hot");
  } else if (ProfileGuidedSectionPrefix) {
    // The hot attribute overwrites profile-count-based hotness, while
    // profile-count-based hotness overwrites the cold attribute.
    // This is a conservative behavior.
    if (F.hasFnAttribute(Attribute::Hot) ||
        PSI->isFunctionHotInCallGraph(&F, *BFI))
      F.setSectionPrefix("hot");
    // If PSI shows this function is not hot, we will place the function
    // into the unlikely section if (1) PSI shows this is a cold function, or
    // (2) the function has an attribute of cold.
    else if (PSI->isFunctionColdInCallGraph(&F, *BFI) ||
             F.hasFnAttribute(Attribute::Cold))
      F.setSectionPrefix("unlikely");
    else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
             PSI->isFunctionHotnessUnknown(F))
      F.setSectionPrefix("unknown");
  }

  /// This optimization identifies DIV instructions that can be
  /// profitably bypassed and carried out with a shorter, faster divide.
  if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
    const DenseMap<unsigned int, unsigned int> &BypassWidths =
        TLI->getBypassSlowDivWidths();
    BasicBlock *BB = &*F.begin();
    while (BB != nullptr) {
      // bypassSlowDivision may create new BBs, but we don't want to reapply the
      // optimization to those blocks.
      BasicBlock *Next = BB->getNextNode();
      // F.hasOptSize is already checked in the outer if statement.
      if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
        EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
      BB = Next;
    }
  }

  // Get rid of @llvm.assume builtins before attempting to eliminate empty
  // blocks, since there might be blocks that only contain @llvm.assume calls
  // (plus arguments that we can get rid of).
  EverMadeChange |= eliminateAssumptions(F);

  // Eliminate blocks that contain only PHI nodes and an
  // unconditional branch.
  EverMadeChange |= eliminateMostlyEmptyBlocks(F);

  ModifyDT ModifiedDT = ModifyDT::NotModifyDT;
  if (!DisableBranchOpts)
    EverMadeChange |= splitBranchCondition(F, ModifiedDT);

  // Split some critical edges where one of the sources is an indirect branch,
  // to help generate sane code for PHIs involving such edges.
  EverMadeChange |=
      SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true);

  // If we are optimizing a huge function, we need to consider the build time,
  // because the basic algorithm's complexity is near O(N!).
  IsHugeFunc = F.size() > HugeFuncThresholdInCGPP;

  // Transformations above may invalidate dominator tree and/or loop info.
  DT.reset();
  LI->releaseMemory();
  LI->analyze(getDT(F));

  bool MadeChange = true;
  bool FuncIterated = false;
  while (MadeChange) {
    MadeChange = false;

    for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
      if (FuncIterated && !FreshBBs.contains(&BB))
        continue;

      ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT;
      bool Changed = optimizeBlock(BB, ModifiedDTOnIteration);

      if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT)
        DT.reset();

      MadeChange |= Changed;
      if (IsHugeFunc) {
        // If the BB is updated, it may still have a chance to be optimized.
        // This usually happens during sink optimization.
        // For example:
        //
        //     bb0:
        //     %and = and i32 %a, 4
        //     %cmp = icmp eq i32 %and, 0
        //
        // If the %cmp is sunk to another BB, the %and will have a chance to sink.
        if (Changed)
          FreshBBs.insert(&BB);
        else if (FuncIterated)
          FreshBBs.erase(&BB);
      } else {
        // For small/normal functions, we restart BB iteration if the dominator
        // tree of the Function was changed.
        if (ModifiedDTOnIteration != ModifyDT::NotModifyDT)
          break;
      }
    }
    // We have iterated all the BBs in the function (only relevant for huge
    // functions).
    FuncIterated = IsHugeFunc;

    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
      MadeChange |= mergeSExts(F);
    if (!LargeOffsetGEPMap.empty())
      MadeChange |= splitLargeGEPOffsets();
    MadeChange |= optimizePhiTypes(F);

    if (MadeChange)
      eliminateFallThrough(F, DT.get());

#ifndef NDEBUG
    if (MadeChange && VerifyLoopInfo)
      LI->verify(getDT(F));
#endif

    // Really free removed instructions during promotion.
    for (Instruction *I : RemovedInsts)
      I->deleteValue();

    EverMadeChange |= MadeChange;
    SeenChainsForSExt.clear();
    ValToSExtendedUses.clear();
    RemovedInsts.clear();
    LargeOffsetGEPMap.clear();
    LargeOffsetGEPID.clear();
  }

  NewGEPBases.clear();
  SunkAddrs.clear();

  if (!DisableBranchOpts) {
    MadeChange = false;
    // Use a set vector to get deterministic iteration order.
The order the718// blocks are removed may affect whether or not PHI nodes in successors719// are removed.720SmallSetVector<BasicBlock *, 8> WorkList;721for (BasicBlock &BB : F) {722SmallVector<BasicBlock *, 2> Successors(successors(&BB));723MadeChange |= ConstantFoldTerminator(&BB, true);724if (!MadeChange)725continue;726727for (BasicBlock *Succ : Successors)728if (pred_empty(Succ))729WorkList.insert(Succ);730}731732// Delete the dead blocks and any of their dead successors.733MadeChange |= !WorkList.empty();734while (!WorkList.empty()) {735BasicBlock *BB = WorkList.pop_back_val();736SmallVector<BasicBlock *, 2> Successors(successors(BB));737738DeleteDeadBlock(BB);739740for (BasicBlock *Succ : Successors)741if (pred_empty(Succ))742WorkList.insert(Succ);743}744745// Merge pairs of basic blocks with unconditional branches, connected by746// a single edge.747if (EverMadeChange || MadeChange)748MadeChange |= eliminateFallThrough(F);749750EverMadeChange |= MadeChange;751}752753if (!DisableGCOpts) {754SmallVector<GCStatepointInst *, 2> Statepoints;755for (BasicBlock &BB : F)756for (Instruction &I : BB)757if (auto *SP = dyn_cast<GCStatepointInst>(&I))758Statepoints.push_back(SP);759for (auto &I : Statepoints)760EverMadeChange |= simplifyOffsetableRelocate(*I);761}762763// Do this last to clean up use-before-def scenarios introduced by other764// preparatory transforms.765EverMadeChange |= placeDbgValues(F);766EverMadeChange |= placePseudoProbes(F);767768#ifndef NDEBUG769if (VerifyBFIUpdates)770verifyBFIUpdates(F);771#endif772773return EverMadeChange;774}775776bool CodeGenPrepare::eliminateAssumptions(Function &F) {777bool MadeChange = false;778for (BasicBlock &BB : F) {779CurInstIterator = BB.begin();780while (CurInstIterator != BB.end()) {781Instruction *I = &*(CurInstIterator++);782if (auto *Assume = dyn_cast<AssumeInst>(I)) {783MadeChange = true;784Value *Operand = Assume->getOperand(0);785Assume->eraseFromParent();786787resetIteratorIfInvalidatedWhileCalling(&BB, [&]() {788RecursivelyDeleteTriviallyDeadInstructions(Operand, TLInfo, nullptr);789});790}791}792}793return MadeChange;794}795796/// An instruction is about to be deleted, so remove all references to it in our797/// GEP-tracking data strcutures.798void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {799LargeOffsetGEPMap.erase(V);800NewGEPBases.erase(V);801802auto GEP = dyn_cast<GetElementPtrInst>(V);803if (!GEP)804return;805806LargeOffsetGEPID.erase(GEP);807808auto VecI = LargeOffsetGEPMap.find(GEP->getPointerOperand());809if (VecI == LargeOffsetGEPMap.end())810return;811812auto &GEPVector = VecI->second;813llvm::erase_if(GEPVector, [=](auto &Elt) { return Elt.first == GEP; });814815if (GEPVector.empty())816LargeOffsetGEPMap.erase(VecI);817}818819// Verify BFI has been updated correctly by recomputing BFI and comparing them.820void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {821DominatorTree NewDT(F);822LoopInfo NewLI(NewDT);823BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);824BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);825NewBFI.verifyMatch(*BFI);826}827828/// Merge basic blocks which are connected by a single edge, where one of the829/// basic blocks has a single successor pointing to the other basic block,830/// which has a single predecessor.831bool CodeGenPrepare::eliminateFallThrough(Function &F, DominatorTree *DT) {832bool Changed = false;833// Scan all of the blocks in the function, except for the entry block.834// Use a temporary array to avoid iterator being invalidated when835// deleting 
blocks.836SmallVector<WeakTrackingVH, 16> Blocks;837for (auto &Block : llvm::drop_begin(F))838Blocks.push_back(&Block);839840SmallSet<WeakTrackingVH, 16> Preds;841for (auto &Block : Blocks) {842auto *BB = cast_or_null<BasicBlock>(Block);843if (!BB)844continue;845// If the destination block has a single pred, then this is a trivial846// edge, just collapse it.847BasicBlock *SinglePred = BB->getSinglePredecessor();848849// Don't merge if BB's address is taken.850if (!SinglePred || SinglePred == BB || BB->hasAddressTaken())851continue;852853// Make an effort to skip unreachable blocks.854if (DT && !DT->isReachableFromEntry(BB))855continue;856857BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());858if (Term && !Term->isConditional()) {859Changed = true;860LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");861862// Merge BB into SinglePred and delete it.863MergeBlockIntoPredecessor(BB, /* DTU */ nullptr, LI, /* MSSAU */ nullptr,864/* MemDep */ nullptr,865/* PredecessorWithTwoSuccessors */ false, DT);866Preds.insert(SinglePred);867868if (IsHugeFunc) {869// Update FreshBBs to optimize the merged BB.870FreshBBs.insert(SinglePred);871FreshBBs.erase(BB);872}873}874}875876// (Repeatedly) merging blocks into their predecessors can create redundant877// debug intrinsics.878for (const auto &Pred : Preds)879if (auto *BB = cast_or_null<BasicBlock>(Pred))880RemoveRedundantDbgInstrs(BB);881882return Changed;883}884885/// Find a destination block from BB if BB is mergeable empty block.886BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {887// If this block doesn't end with an uncond branch, ignore it.888BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());889if (!BI || !BI->isUnconditional())890return nullptr;891892// If the instruction before the branch (skipping debug info) isn't a phi893// node, then other stuff is happening here.894BasicBlock::iterator BBI = BI->getIterator();895if (BBI != BB->begin()) {896--BBI;897while (isa<DbgInfoIntrinsic>(BBI)) {898if (BBI == BB->begin())899break;900--BBI;901}902if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))903return nullptr;904}905906// Do not break infinite loops.907BasicBlock *DestBB = BI->getSuccessor(0);908if (DestBB == BB)909return nullptr;910911if (!canMergeBlocks(BB, DestBB))912DestBB = nullptr;913914return DestBB;915}916917/// Eliminate blocks that contain only PHI nodes, debug info directives, and an918/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split919/// edges in ways that are non-optimal for isel. 
Start by eliminating these920/// blocks so we can split them the way we want them.921bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {922SmallPtrSet<BasicBlock *, 16> Preheaders;923SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());924while (!LoopList.empty()) {925Loop *L = LoopList.pop_back_val();926llvm::append_range(LoopList, *L);927if (BasicBlock *Preheader = L->getLoopPreheader())928Preheaders.insert(Preheader);929}930931bool MadeChange = false;932// Copy blocks into a temporary array to avoid iterator invalidation issues933// as we remove them.934// Note that this intentionally skips the entry block.935SmallVector<WeakTrackingVH, 16> Blocks;936for (auto &Block : llvm::drop_begin(F)) {937// Delete phi nodes that could block deleting other empty blocks.938if (!DisableDeletePHIs)939MadeChange |= DeleteDeadPHIs(&Block, TLInfo);940Blocks.push_back(&Block);941}942943for (auto &Block : Blocks) {944BasicBlock *BB = cast_or_null<BasicBlock>(Block);945if (!BB)946continue;947BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);948if (!DestBB ||949!isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))950continue;951952eliminateMostlyEmptyBlock(BB);953MadeChange = true;954}955return MadeChange;956}957958bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,959BasicBlock *DestBB,960bool isPreheader) {961// Do not delete loop preheaders if doing so would create a critical edge.962// Loop preheaders can be good locations to spill registers. If the963// preheader is deleted and we create a critical edge, registers may be964// spilled in the loop body instead.965if (!DisablePreheaderProtect && isPreheader &&966!(BB->getSinglePredecessor() &&967BB->getSinglePredecessor()->getSingleSuccessor()))968return false;969970// Skip merging if the block's successor is also a successor to any callbr971// that leads to this block.972// FIXME: Is this really needed? Is this a correctness issue?973for (BasicBlock *Pred : predecessors(BB)) {974if (isa<CallBrInst>(Pred->getTerminator()) &&975llvm::is_contained(successors(Pred), DestBB))976return false;977}978979// Try to skip merging if the unique predecessor of BB is terminated by a980// switch or indirect branch instruction, and BB is used as an incoming block981// of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to982// add COPY instructions in the predecessor of BB instead of BB (if it is not983// merged). Note that the critical edge created by merging such blocks wont be984// split in MachineSink because the jump table is not analyzable. By keeping985// such empty block (BB), ISel will place COPY instructions in BB, not in the986// predecessor of BB.987BasicBlock *Pred = BB->getUniquePredecessor();988if (!Pred || !(isa<SwitchInst>(Pred->getTerminator()) ||989isa<IndirectBrInst>(Pred->getTerminator())))990return true;991992if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())993return true;994995// We use a simple cost heuristic which determine skipping merging is996// profitable if the cost of skipping merging is less than the cost of997// merging : Cost(skipping merging) < Cost(merging BB), where the998// Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and999// the Cost(merging BB) is Freq(Pred) * Cost(Copy).1000// Assuming Cost(Copy) == Cost(Branch), we could simplify it to :1001// Freq(Pred) / Freq(BB) > 2.1002// Note that if there are multiple empty blocks sharing the same incoming1003// value for the PHIs in the DestBB, we consider them together. 
In such1004// case, Cost(merging BB) will be the sum of their frequencies.10051006if (!isa<PHINode>(DestBB->begin()))1007return true;10081009SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;10101011// Find all other incoming blocks from which incoming values of all PHIs in1012// DestBB are the same as the ones from BB.1013for (BasicBlock *DestBBPred : predecessors(DestBB)) {1014if (DestBBPred == BB)1015continue;10161017if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {1018return DestPN.getIncomingValueForBlock(BB) ==1019DestPN.getIncomingValueForBlock(DestBBPred);1020}))1021SameIncomingValueBBs.insert(DestBBPred);1022}10231024// See if all BB's incoming values are same as the value from Pred. In this1025// case, no reason to skip merging because COPYs are expected to be place in1026// Pred already.1027if (SameIncomingValueBBs.count(Pred))1028return true;10291030BlockFrequency PredFreq = BFI->getBlockFreq(Pred);1031BlockFrequency BBFreq = BFI->getBlockFreq(BB);10321033for (auto *SameValueBB : SameIncomingValueBBs)1034if (SameValueBB->getUniquePredecessor() == Pred &&1035DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))1036BBFreq += BFI->getBlockFreq(SameValueBB);10371038std::optional<BlockFrequency> Limit = BBFreq.mul(FreqRatioToSkipMerge);1039return !Limit || PredFreq <= *Limit;1040}10411042/// Return true if we can merge BB into DestBB if there is a single1043/// unconditional branch between them, and BB contains no other non-phi1044/// instructions.1045bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,1046const BasicBlock *DestBB) const {1047// We only want to eliminate blocks whose phi nodes are used by phi nodes in1048// the successor. If there are more complex condition (e.g. preheaders),1049// don't mess around with them.1050for (const PHINode &PN : BB->phis()) {1051for (const User *U : PN.users()) {1052const Instruction *UI = cast<Instruction>(U);1053if (UI->getParent() != DestBB || !isa<PHINode>(UI))1054return false;1055// If User is inside DestBB block and it is a PHINode then check1056// incoming value. If incoming value is not from BB then this is1057// a complex condition (e.g. preheaders) we want to avoid here.1058if (UI->getParent() == DestBB) {1059if (const PHINode *UPN = dyn_cast<PHINode>(UI))1060for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {1061Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));1062if (Insn && Insn->getParent() == BB &&1063Insn->getParent() != UPN->getIncomingBlock(I))1064return false;1065}1066}1067}1068}10691070// If BB and DestBB contain any common predecessors, then the phi nodes in BB1071// and DestBB may have conflicting incoming values for the block. 
If so, we1072// can't merge the block.1073const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());1074if (!DestBBPN)1075return true; // no conflict.10761077// Collect the preds of BB.1078SmallPtrSet<const BasicBlock *, 16> BBPreds;1079if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {1080// It is faster to get preds from a PHI than with pred_iterator.1081for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)1082BBPreds.insert(BBPN->getIncomingBlock(i));1083} else {1084BBPreds.insert(pred_begin(BB), pred_end(BB));1085}10861087// Walk the preds of DestBB.1088for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {1089BasicBlock *Pred = DestBBPN->getIncomingBlock(i);1090if (BBPreds.count(Pred)) { // Common predecessor?1091for (const PHINode &PN : DestBB->phis()) {1092const Value *V1 = PN.getIncomingValueForBlock(Pred);1093const Value *V2 = PN.getIncomingValueForBlock(BB);10941095// If V2 is a phi node in BB, look up what the mapped value will be.1096if (const PHINode *V2PN = dyn_cast<PHINode>(V2))1097if (V2PN->getParent() == BB)1098V2 = V2PN->getIncomingValueForBlock(Pred);10991100// If there is a conflict, bail out.1101if (V1 != V2)1102return false;1103}1104}1105}11061107return true;1108}11091110/// Replace all old uses with new ones, and push the updated BBs into FreshBBs.1111static void replaceAllUsesWith(Value *Old, Value *New,1112SmallSet<BasicBlock *, 32> &FreshBBs,1113bool IsHuge) {1114auto *OldI = dyn_cast<Instruction>(Old);1115if (OldI) {1116for (Value::user_iterator UI = OldI->user_begin(), E = OldI->user_end();1117UI != E; ++UI) {1118Instruction *User = cast<Instruction>(*UI);1119if (IsHuge)1120FreshBBs.insert(User->getParent());1121}1122}1123Old->replaceAllUsesWith(New);1124}11251126/// Eliminate a basic block that has only phi's and an unconditional branch in1127/// it.1128void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {1129BranchInst *BI = cast<BranchInst>(BB->getTerminator());1130BasicBlock *DestBB = BI->getSuccessor(0);11311132LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"1133<< *BB << *DestBB);11341135// If the destination block has a single pred, then this is a trivial edge,1136// just collapse it.1137if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {1138if (SinglePred != DestBB) {1139assert(SinglePred == BB &&1140"Single predecessor not the same as predecessor");1141// Merge DestBB into SinglePred/BB and delete it.1142MergeBlockIntoPredecessor(DestBB);1143// Note: BB(=SinglePred) will not be deleted on this path.1144// DestBB(=its single successor) is the one that was deleted.1145LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");11461147if (IsHugeFunc) {1148// Update FreshBBs to optimize the merged BB.1149FreshBBs.insert(SinglePred);1150FreshBBs.erase(DestBB);1151}1152return;1153}1154}11551156// Otherwise, we have multiple predecessors of BB. 
Update the PHIs in DestBB1157// to handle the new incoming edges it is about to have.1158for (PHINode &PN : DestBB->phis()) {1159// Remove the incoming value for BB, and remember it.1160Value *InVal = PN.removeIncomingValue(BB, false);11611162// Two options: either the InVal is a phi node defined in BB or it is some1163// value that dominates BB.1164PHINode *InValPhi = dyn_cast<PHINode>(InVal);1165if (InValPhi && InValPhi->getParent() == BB) {1166// Add all of the input values of the input PHI as inputs of this phi.1167for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)1168PN.addIncoming(InValPhi->getIncomingValue(i),1169InValPhi->getIncomingBlock(i));1170} else {1171// Otherwise, add one instance of the dominating value for each edge that1172// we will be adding.1173if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {1174for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)1175PN.addIncoming(InVal, BBPN->getIncomingBlock(i));1176} else {1177for (BasicBlock *Pred : predecessors(BB))1178PN.addIncoming(InVal, Pred);1179}1180}1181}11821183// The PHIs are now updated, change everything that refers to BB to use1184// DestBB and remove BB.1185BB->replaceAllUsesWith(DestBB);1186BB->eraseFromParent();1187++NumBlocksElim;11881189LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");1190}11911192// Computes a map of base pointer relocation instructions to corresponding1193// derived pointer relocation instructions given a vector of all relocate calls1194static void computeBaseDerivedRelocateMap(1195const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,1196MapVector<GCRelocateInst *, SmallVector<GCRelocateInst *, 0>>1197&RelocateInstMap) {1198// Collect information in two maps: one primarily for locating the base object1199// while filling the second map; the second map is the final structure holding1200// a mapping between Base and corresponding Derived relocate calls1201MapVector<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;1202for (auto *ThisRelocate : AllRelocateCalls) {1203auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),1204ThisRelocate->getDerivedPtrIndex());1205RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));1206}1207for (auto &Item : RelocateIdxMap) {1208std::pair<unsigned, unsigned> Key = Item.first;1209if (Key.first == Key.second)1210// Base relocation: nothing to insert1211continue;12121213GCRelocateInst *I = Item.second;1214auto BaseKey = std::make_pair(Key.first, Key.first);12151216// We're iterating over RelocateIdxMap so we cannot modify it.1217auto MaybeBase = RelocateIdxMap.find(BaseKey);1218if (MaybeBase == RelocateIdxMap.end())1219// TODO: We might want to insert a new base object relocate and gep off1220// that, if there are enough derived object relocates.1221continue;12221223RelocateInstMap[MaybeBase->second].push_back(I);1224}1225}12261227// Accepts a GEP and extracts the operands into a vector provided they're all1228// small integer constants1229static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,1230SmallVectorImpl<Value *> &OffsetV) {1231for (unsigned i = 1; i < GEP->getNumOperands(); i++) {1232// Only accept small constant integer operands1233auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));1234if (!Op || Op->getZExtValue() > 20)1235return false;1236}12371238for (unsigned i = 1; i < GEP->getNumOperands(); i++)1239OffsetV.push_back(GEP->getOperand(i));1240return true;1241}12421243// Takes a RelocatedBase (base pointer relocation instruction) and Targets to1244// replace, computes a 
replacement, and affects it.1245static bool1246simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,1247const SmallVectorImpl<GCRelocateInst *> &Targets) {1248bool MadeChange = false;1249// We must ensure the relocation of derived pointer is defined after1250// relocation of base pointer. If we find a relocation corresponding to base1251// defined earlier than relocation of base then we move relocation of base1252// right before found relocation. We consider only relocation in the same1253// basic block as relocation of base. Relocations from other basic block will1254// be skipped by optimization and we do not care about them.1255for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();1256&*R != RelocatedBase; ++R)1257if (auto *RI = dyn_cast<GCRelocateInst>(R))1258if (RI->getStatepoint() == RelocatedBase->getStatepoint())1259if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {1260RelocatedBase->moveBefore(RI);1261MadeChange = true;1262break;1263}12641265for (GCRelocateInst *ToReplace : Targets) {1266assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&1267"Not relocating a derived object of the original base object");1268if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {1269// A duplicate relocate call. TODO: coalesce duplicates.1270continue;1271}12721273if (RelocatedBase->getParent() != ToReplace->getParent()) {1274// Base and derived relocates are in different basic blocks.1275// In this case transform is only valid when base dominates derived1276// relocate. However it would be too expensive to check dominance1277// for each such relocate, so we skip the whole transformation.1278continue;1279}12801281Value *Base = ToReplace->getBasePtr();1282auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());1283if (!Derived || Derived->getPointerOperand() != Base)1284continue;12851286SmallVector<Value *, 2> OffsetV;1287if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))1288continue;12891290// Create a Builder and replace the target callsite with a gep1291assert(RelocatedBase->getNextNode() &&1292"Should always have one since it's not a terminator");12931294// Insert after RelocatedBase1295IRBuilder<> Builder(RelocatedBase->getNextNode());1296Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());12971298// If gc_relocate does not match the actual type, cast it to the right type.1299// In theory, there must be a bitcast after gc_relocate if the type does not1300// match, and we should reuse it to get the derived pointer. But it could be1301// cases like this:1302// bb1:1303// ...1304// %g1 = call coldcc i8 addrspace(1)*1305// @llvm.experimental.gc.relocate.p1i8(...) br label %merge1306//1307// bb2:1308// ...1309// %g2 = call coldcc i8 addrspace(1)*1310// @llvm.experimental.gc.relocate.p1i8(...) br label %merge1311//1312// merge:1313// %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]1314// %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)*1315//1316// In this case, we can not find the bitcast any more. So we insert a new1317// bitcast no matter there is already one or not. 
In this way, we can handle1318// all cases, and the extra bitcast should be optimized away in later1319// passes.1320Value *ActualRelocatedBase = RelocatedBase;1321if (RelocatedBase->getType() != Base->getType()) {1322ActualRelocatedBase =1323Builder.CreateBitCast(RelocatedBase, Base->getType());1324}1325Value *Replacement =1326Builder.CreateGEP(Derived->getSourceElementType(), ActualRelocatedBase,1327ArrayRef(OffsetV));1328Replacement->takeName(ToReplace);1329// If the newly generated derived pointer's type does not match the original1330// derived pointer's type, cast the new derived pointer to match it. Same1331// reasoning as above.1332Value *ActualReplacement = Replacement;1333if (Replacement->getType() != ToReplace->getType()) {1334ActualReplacement =1335Builder.CreateBitCast(Replacement, ToReplace->getType());1336}1337ToReplace->replaceAllUsesWith(ActualReplacement);1338ToReplace->eraseFromParent();13391340MadeChange = true;1341}1342return MadeChange;1343}13441345// Turns this:1346//1347// %base = ...1348// %ptr = gep %base + 151349// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)1350// %base' = relocate(%tok, i32 4, i32 4)1351// %ptr' = relocate(%tok, i32 4, i32 5)1352// %val = load %ptr'1353//1354// into this:1355//1356// %base = ...1357// %ptr = gep %base + 151358// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)1359// %base' = gc.relocate(%tok, i32 4, i32 4)1360// %ptr' = gep %base' + 151361// %val = load %ptr'1362bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {1363bool MadeChange = false;1364SmallVector<GCRelocateInst *, 2> AllRelocateCalls;1365for (auto *U : I.users())1366if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))1367// Collect all the relocate calls associated with a statepoint1368AllRelocateCalls.push_back(Relocate);13691370// We need at least one base pointer relocation + one derived pointer1371// relocation to mangle1372if (AllRelocateCalls.size() < 2)1373return false;13741375// RelocateInstMap is a mapping from the base relocate instruction to the1376// corresponding derived relocate instructions1377MapVector<GCRelocateInst *, SmallVector<GCRelocateInst *, 0>> RelocateInstMap;1378computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);1379if (RelocateInstMap.empty())1380return false;13811382for (auto &Item : RelocateInstMap)1383// Item.first is the RelocatedBase to offset against1384// Item.second is the vector of Targets to replace1385MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);1386return MadeChange;1387}13881389/// Sink the specified cast instruction into its user blocks.1390static bool SinkCast(CastInst *CI) {1391BasicBlock *DefBB = CI->getParent();13921393/// InsertedCasts - Only insert a cast in each block once.1394DenseMap<BasicBlock *, CastInst *> InsertedCasts;13951396bool MadeChange = false;1397for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();1398UI != E;) {1399Use &TheUse = UI.getUse();1400Instruction *User = cast<Instruction>(*UI);14011402// Figure out which BB this cast is used in. For PHI's this is the1403// appropriate predecessor block.1404BasicBlock *UserBB = User->getParent();1405if (PHINode *PN = dyn_cast<PHINode>(User)) {1406UserBB = PN->getIncomingBlock(TheUse);1407}14081409// Preincrement use iterator so we don't invalidate it.1410++UI;14111412// The first insertion point of a block containing an EH pad is after the1413// pad. 
If the pad is the user, we cannot sink the cast past the pad.1414if (User->isEHPad())1415continue;14161417// If the block selected to receive the cast is an EH pad that does not1418// allow non-PHI instructions before the terminator, we can't sink the1419// cast.1420if (UserBB->getTerminator()->isEHPad())1421continue;14221423// If this user is in the same block as the cast, don't change the cast.1424if (UserBB == DefBB)1425continue;14261427// If we have already inserted a cast into this block, use it.1428CastInst *&InsertedCast = InsertedCasts[UserBB];14291430if (!InsertedCast) {1431BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();1432assert(InsertPt != UserBB->end());1433InsertedCast = cast<CastInst>(CI->clone());1434InsertedCast->insertBefore(*UserBB, InsertPt);1435}14361437// Replace a use of the cast with a use of the new cast.1438TheUse = InsertedCast;1439MadeChange = true;1440++NumCastUses;1441}14421443// If we removed all uses, nuke the cast.1444if (CI->use_empty()) {1445salvageDebugInfo(*CI);1446CI->eraseFromParent();1447MadeChange = true;1448}14491450return MadeChange;1451}14521453/// If the specified cast instruction is a noop copy (e.g. it's casting from1454/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to1455/// reduce the number of virtual registers that must be created and coalesced.1456///1457/// Return true if any changes are made.1458static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,1459const DataLayout &DL) {1460// Sink only "cheap" (or nop) address-space casts. This is a weaker condition1461// than sinking only nop casts, but is helpful on some platforms.1462if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {1463if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),1464ASC->getDestAddressSpace()))1465return false;1466}14671468// If this is a noop copy,1469EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());1470EVT DstVT = TLI.getValueType(DL, CI->getType());14711472// This is an fp<->int conversion?1473if (SrcVT.isInteger() != DstVT.isInteger())1474return false;14751476// If this is an extension, it will be a zero or sign extension, which1477// isn't a noop.1478if (SrcVT.bitsLT(DstVT))1479return false;14801481// If these values will be promoted, find out what they will be promoted1482// to. This helps us consider truncates on PPC as noop copies when they1483// are.1484if (TLI.getTypeAction(CI->getContext(), SrcVT) ==1485TargetLowering::TypePromoteInteger)1486SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);1487if (TLI.getTypeAction(CI->getContext(), DstVT) ==1488TargetLowering::TypePromoteInteger)1489DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);14901491// If, after promotion, these are the same types, this is a noop copy.1492if (SrcVT != DstVT)1493return false;14941495return SinkCast(CI);1496}14971498// Match a simple increment by constant operation. 
Note that if a sub is1499// matched, the step is negated (as if the step had been canonicalized to1500// an add, even though we leave the instruction alone.)1501static bool matchIncrement(const Instruction *IVInc, Instruction *&LHS,1502Constant *&Step) {1503if (match(IVInc, m_Add(m_Instruction(LHS), m_Constant(Step))) ||1504match(IVInc, m_ExtractValue<0>(m_Intrinsic<Intrinsic::uadd_with_overflow>(1505m_Instruction(LHS), m_Constant(Step)))))1506return true;1507if (match(IVInc, m_Sub(m_Instruction(LHS), m_Constant(Step))) ||1508match(IVInc, m_ExtractValue<0>(m_Intrinsic<Intrinsic::usub_with_overflow>(1509m_Instruction(LHS), m_Constant(Step))))) {1510Step = ConstantExpr::getNeg(Step);1511return true;1512}1513return false;1514}15151516/// If given \p PN is an inductive variable with value IVInc coming from the1517/// backedge, and on each iteration it gets increased by Step, return pair1518/// <IVInc, Step>. Otherwise, return std::nullopt.1519static std::optional<std::pair<Instruction *, Constant *>>1520getIVIncrement(const PHINode *PN, const LoopInfo *LI) {1521const Loop *L = LI->getLoopFor(PN->getParent());1522if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())1523return std::nullopt;1524auto *IVInc =1525dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));1526if (!IVInc || LI->getLoopFor(IVInc->getParent()) != L)1527return std::nullopt;1528Instruction *LHS = nullptr;1529Constant *Step = nullptr;1530if (matchIncrement(IVInc, LHS, Step) && LHS == PN)1531return std::make_pair(IVInc, Step);1532return std::nullopt;1533}15341535static bool isIVIncrement(const Value *V, const LoopInfo *LI) {1536auto *I = dyn_cast<Instruction>(V);1537if (!I)1538return false;1539Instruction *LHS = nullptr;1540Constant *Step = nullptr;1541if (!matchIncrement(I, LHS, Step))1542return false;1543if (auto *PN = dyn_cast<PHINode>(LHS))1544if (auto IVInc = getIVIncrement(PN, LI))1545return IVInc->first == I;1546return false;1547}15481549bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,1550Value *Arg0, Value *Arg1,1551CmpInst *Cmp,1552Intrinsic::ID IID) {1553auto IsReplacableIVIncrement = [this, &Cmp](BinaryOperator *BO) {1554if (!isIVIncrement(BO, LI))1555return false;1556const Loop *L = LI->getLoopFor(BO->getParent());1557assert(L && "L should not be null after isIVIncrement()");1558// Do not risk on moving increment into a child loop.1559if (LI->getLoopFor(Cmp->getParent()) != L)1560return false;15611562// Finally, we need to ensure that the insert point will dominate all1563// existing uses of the increment.15641565auto &DT = getDT(*BO->getParent()->getParent());1566if (DT.dominates(Cmp->getParent(), BO->getParent()))1567// If we're moving up the dom tree, all uses are trivially dominated.1568// (This is the common case for code produced by LSR.)1569return true;15701571// Otherwise, special case the single use in the phi recurrence.1572return BO->hasOneUse() && DT.dominates(Cmp->getParent(), L->getLoopLatch());1573};1574if (BO->getParent() != Cmp->getParent() && !IsReplacableIVIncrement(BO)) {1575// We used to use a dominator tree here to allow multi-block optimization.1576// But that was problematic because:1577// 1. It could cause a perf regression by hoisting the math op into the1578// critical path.1579// 2. It could cause a perf regression by creating a value that was live1580// across multiple blocks and increasing register pressure.1581// 3. 
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                 Value *Arg0, Value *Arg1,
                                                 CmpInst *Cmp,
                                                 Intrinsic::ID IID) {
  auto IsReplacableIVIncrement = [this, &Cmp](BinaryOperator *BO) {
    if (!isIVIncrement(BO, LI))
      return false;
    const Loop *L = LI->getLoopFor(BO->getParent());
    assert(L && "L should not be null after isIVIncrement()");
    // Do not risk moving the increment into a child loop.
    if (LI->getLoopFor(Cmp->getParent()) != L)
      return false;

    // Finally, we need to ensure that the insert point will dominate all
    // existing uses of the increment.

    auto &DT = getDT(*BO->getParent()->getParent());
    if (DT.dominates(Cmp->getParent(), BO->getParent()))
      // If we're moving up the dom tree, all uses are trivially dominated.
      // (This is the common case for code produced by LSR.)
      return true;

    // Otherwise, special case the single use in the phi recurrence.
    return BO->hasOneUse() && DT.dominates(Cmp->getParent(), L->getLoopLatch());
  };
  if (BO->getParent() != Cmp->getParent() && !IsReplacableIVIncrement(BO)) {
    // We used to use a dominator tree here to allow multi-block optimization.
    // But that was problematic because:
    // 1. It could cause a perf regression by hoisting the math op into the
    //    critical path.
    // 2. It could cause a perf regression by creating a value that was live
    //    across multiple blocks and increasing register pressure.
    // 3. Use of a dominator tree could cause large compile-time regressions.
    //    This is because we recompute the DT on every change in the main CGP
    //    run-loop. The recomputation is probably unnecessary in many cases, so
    //    if that were fixed, using a DT here would be fine.
    //
    // There is one important special case we still want to handle: if BO is
    // the IV increment. Important properties that make it profitable:
    // - We can speculate the IV increment anywhere in the loop (as long as the
    //   indvar Phi is its only user);
    // - Upon computing Cmp, we effectively compute something equivalent to the
    //   IV increment (even though it is spelled differently in the IR). So
    //   moving it up to the cmp point does not really increase register
    //   pressure.
    return false;
  }

  // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
  if (BO->getOpcode() == Instruction::Add &&
      IID == Intrinsic::usub_with_overflow) {
    assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
    Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
  }

  // Insert at the first instruction of the pair.
  Instruction *InsertPt = nullptr;
  for (Instruction &Iter : *Cmp->getParent()) {
    // If BO is an XOR, it is not guaranteed that it comes after both inputs to
    // the overflow intrinsic are defined.
    if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
      InsertPt = &Iter;
      break;
    }
  }
  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");

  IRBuilder<> Builder(InsertPt);
  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
  if (BO->getOpcode() != Instruction::Xor) {
    Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
    replaceAllUsesWith(BO, Math, FreshBBs, IsHugeFunc);
  } else
    assert(BO->hasOneUse() &&
           "Patterns with XOr should use the BO only in the compare");
  Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
  replaceAllUsesWith(Cmp, OV, FreshBBs, IsHugeFunc);
  Cmp->eraseFromParent();
  BO->eraseFromParent();
  return true;
}

/// Match special-case patterns that check for unsigned add overflow.
static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
                                                   BinaryOperator *&Add) {
  // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
  // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);

  // We are not expecting non-canonical/degenerate code. Just bail out.
  if (isa<Constant>(A))
    return false;

  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
    B = ConstantInt::get(B->getType(), 1);
  else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
    B = ConstantInt::get(B->getType(), -1);
  else
    return false;

  // Check the users of the variable operand of the compare looking for an add
  // with the adjusted constant.
  for (User *U : A->users()) {
    if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
      Add = cast<BinaryOperator>(U);
      return true;
    }
  }
  return false;
}

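// Illustrative sketch (hypothetical IR, not taken from the original source):
// one of the constant edge cases recognized above, and the form the code below
// rewrites it into (via replaceMathCmpWithIntrinsic) when the target wants the
// overflow intrinsic:
//   %add = add i64 %a, 1
//   %ov  = icmp eq i64 %a, -1     ; overflows iff %a is the maximum value
//   ==>
//   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 1)
//   %math = extractvalue { i64, i1 } %uadd, 0   ; replaces %add
//   %ov   = extractvalue { i64, i1 } %uadd, 1   ; replaces the icmp
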
/// Try to combine the compare into a call to the llvm.uadd.with.overflow
/// intrinsic. Return true if any changes were made.
bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
                                               ModifyDT &ModifiedDT) {
  bool EdgeCase = false;
  Value *A, *B;
  BinaryOperator *Add;
  if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
    if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
      return false;
    // Set A and B in case we matched the constant edge case above.
    A = Add->getOperand(0);
    B = Add->getOperand(1);
    EdgeCase = true;
  }

  if (!TLI->shouldFormOverflowOp(ISD::UADDO,
                                 TLI->getValueType(*DL, Add->getType()),
                                 Add->hasNUsesOrMore(EdgeCase ? 1 : 2)))
    return false;

  // We don't want to move around uses of condition values this late, so we
  // check if it is legal to create the call to the intrinsic in the basic
  // block containing the icmp.
  if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
    return false;

  if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
                                   Intrinsic::uadd_with_overflow))
    return false;

  // Reset callers - do not crash by iterating over a dead instruction.
  ModifiedDT = ModifyDT::ModifyInstDT;
  return true;
}

bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
                                               ModifyDT &ModifiedDT) {
  // We are not expecting non-canonical/degenerate code. Just bail out.
  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
  if (isa<Constant>(A) && isa<Constant>(B))
    return false;

  // Convert (A u> B) to (A u< B) to simplify pattern matching.
  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred == ICmpInst::ICMP_UGT) {
    std::swap(A, B);
    Pred = ICmpInst::ICMP_ULT;
  }
  // Convert special-case: (A == 0) is the same as (A u< 1).
  if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
    B = ConstantInt::get(B->getType(), 1);
    Pred = ICmpInst::ICMP_ULT;
  }
  // Convert special-case: (A != 0) is the same as (0 u< A).
  if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
    std::swap(A, B);
    Pred = ICmpInst::ICMP_ULT;
  }
  if (Pred != ICmpInst::ICMP_ULT)
    return false;

  // Walk the users of the variable operand of the compare looking for a
  // subtract or add with that same operand. Also match the 2nd operand of the
  // compare to the add/sub, but that may be a negated constant operand of an
  // add.
  Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
  BinaryOperator *Sub = nullptr;
  for (User *U : CmpVariableOperand->users()) {
    // A - B, A u< B --> usubo(A, B)
    if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
      Sub = cast<BinaryOperator>(U);
      break;
    }

    // A + (-C), A u< C (canonicalized form of (sub A, C))
    const APInt *CmpC, *AddC;
    if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
        match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
      Sub = cast<BinaryOperator>(U);
      break;
    }
  }
  if (!Sub)
    return false;

  if (!TLI->shouldFormOverflowOp(ISD::USUBO,
                                 TLI->getValueType(*DL, Sub->getType()),
                                 Sub->hasNUsesOrMore(1)))
    return false;

  if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
                                   Cmp, Intrinsic::usub_with_overflow))
    return false;

  // Reset callers - do not crash by iterating over a dead instruction.
  ModifiedDT = ModifyDT::ModifyInstDT;
  return true;
}

/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced.
This is a clear win except on1761/// targets with multiple condition code registers (PowerPC), where it might1762/// lose; some adjustment may be wanted there.1763///1764/// Return true if any changes are made.1765static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {1766if (TLI.hasMultipleConditionRegisters())1767return false;17681769// Avoid sinking soft-FP comparisons, since this can move them into a loop.1770if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))1771return false;17721773// Only insert a cmp in each block once.1774DenseMap<BasicBlock *, CmpInst *> InsertedCmps;17751776bool MadeChange = false;1777for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();1778UI != E;) {1779Use &TheUse = UI.getUse();1780Instruction *User = cast<Instruction>(*UI);17811782// Preincrement use iterator so we don't invalidate it.1783++UI;17841785// Don't bother for PHI nodes.1786if (isa<PHINode>(User))1787continue;17881789// Figure out which BB this cmp is used in.1790BasicBlock *UserBB = User->getParent();1791BasicBlock *DefBB = Cmp->getParent();17921793// If this user is in the same block as the cmp, don't change the cmp.1794if (UserBB == DefBB)1795continue;17961797// If we have already inserted a cmp into this block, use it.1798CmpInst *&InsertedCmp = InsertedCmps[UserBB];17991800if (!InsertedCmp) {1801BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();1802assert(InsertPt != UserBB->end());1803InsertedCmp = CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),1804Cmp->getOperand(0), Cmp->getOperand(1), "");1805InsertedCmp->insertBefore(*UserBB, InsertPt);1806// Propagate the debug info.1807InsertedCmp->setDebugLoc(Cmp->getDebugLoc());1808}18091810// Replace a use of the cmp with a use of the new cmp.1811TheUse = InsertedCmp;1812MadeChange = true;1813++NumCmpUses;1814}18151816// If we removed all uses, nuke the cmp.1817if (Cmp->use_empty()) {1818Cmp->eraseFromParent();1819MadeChange = true;1820}18211822return MadeChange;1823}18241825/// For pattern like:1826///1827/// DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)1828/// ...1829/// DomBB:1830/// ...1831/// br DomCond, TrueBB, CmpBB1832/// CmpBB: (with DomBB being the single predecessor)1833/// ...1834/// Cmp = icmp eq CmpOp0, CmpOp11835/// ...1836///1837/// It would use two comparison on targets that lowering of icmp sgt/slt is1838/// different from lowering of icmp eq (PowerPC). 
This function try to convert1839/// 'Cmp = icmp eq CmpOp0, CmpOp1' to ' Cmp = icmp slt/sgt CmpOp0, CmpOp1'.1840/// After that, DomCond and Cmp can use the same comparison so reduce one1841/// comparison.1842///1843/// Return true if any changes are made.1844static bool foldICmpWithDominatingICmp(CmpInst *Cmp,1845const TargetLowering &TLI) {1846if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())1847return false;18481849ICmpInst::Predicate Pred = Cmp->getPredicate();1850if (Pred != ICmpInst::ICMP_EQ)1851return false;18521853// If icmp eq has users other than BranchInst and SelectInst, converting it to1854// icmp slt/sgt would introduce more redundant LLVM IR.1855for (User *U : Cmp->users()) {1856if (isa<BranchInst>(U))1857continue;1858if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == Cmp)1859continue;1860return false;1861}18621863// This is a cheap/incomplete check for dominance - just match a single1864// predecessor with a conditional branch.1865BasicBlock *CmpBB = Cmp->getParent();1866BasicBlock *DomBB = CmpBB->getSinglePredecessor();1867if (!DomBB)1868return false;18691870// We want to ensure that the only way control gets to the comparison of1871// interest is that a less/greater than comparison on the same operands is1872// false.1873Value *DomCond;1874BasicBlock *TrueBB, *FalseBB;1875if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))1876return false;1877if (CmpBB != FalseBB)1878return false;18791880Value *CmpOp0 = Cmp->getOperand(0), *CmpOp1 = Cmp->getOperand(1);1881ICmpInst::Predicate DomPred;1882if (!match(DomCond, m_ICmp(DomPred, m_Specific(CmpOp0), m_Specific(CmpOp1))))1883return false;1884if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT)1885return false;18861887// Convert the equality comparison to the opposite of the dominating1888// comparison and swap the direction for all branch/select users.1889// We have conceptually converted:1890// Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>;1891// to1892// Res = (a < b) ? <LT_RES> : (a > b) ? <GT_RES> : <EQ_RES>;1893// And similarly for branches.1894for (User *U : Cmp->users()) {1895if (auto *BI = dyn_cast<BranchInst>(U)) {1896assert(BI->isConditional() && "Must be conditional");1897BI->swapSuccessors();1898continue;1899}1900if (auto *SI = dyn_cast<SelectInst>(U)) {1901// Swap operands1902SI->swapValues();1903SI->swapProfMetadata();1904continue;1905}1906llvm_unreachable("Must be a branch or a select");1907}1908Cmp->setPredicate(CmpInst::getSwappedPredicate(DomPred));1909return true;1910}19111912/// Many architectures use the same instruction for both subtract and cmp. Try1913/// to swap cmp operands to match subtract operations to allow for CSE.1914static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {1915Value *Op0 = Cmp->getOperand(0);1916Value *Op1 = Cmp->getOperand(1);1917if (!Op0->getType()->isIntegerTy() || isa<Constant>(Op0) ||1918isa<Constant>(Op1) || Op0 == Op1)1919return false;19201921// If a subtract already has the same operands as a compare, swapping would be1922// bad. 
If a subtract has the same operands as a compare but in reverse order,1923// then swapping is good.1924int GoodToSwap = 0;1925unsigned NumInspected = 0;1926for (const User *U : Op0->users()) {1927// Avoid walking many users.1928if (++NumInspected > 128)1929return false;1930if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))1931GoodToSwap++;1932else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))1933GoodToSwap--;1934}19351936if (GoodToSwap > 0) {1937Cmp->swapOperands();1938return true;1939}1940return false;1941}19421943static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,1944const DataLayout &DL) {1945FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);1946if (!FCmp)1947return false;19481949// Don't fold if the target offers free fabs and the predicate is legal.1950EVT VT = TLI.getValueType(DL, Cmp->getOperand(0)->getType());1951if (TLI.isFAbsFree(VT) &&1952TLI.isCondCodeLegal(getFCmpCondCode(FCmp->getPredicate()),1953VT.getSimpleVT()))1954return false;19551956// Reverse the canonicalization if it is a FP class test1957auto ShouldReverseTransform = [](FPClassTest ClassTest) {1958return ClassTest == fcInf || ClassTest == (fcInf | fcNan);1959};1960auto [ClassVal, ClassTest] =1961fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),1962FCmp->getOperand(0), FCmp->getOperand(1));1963if (!ClassVal)1964return false;19651966if (!ShouldReverseTransform(ClassTest) && !ShouldReverseTransform(~ClassTest))1967return false;19681969IRBuilder<> Builder(Cmp);1970Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);1971Cmp->replaceAllUsesWith(IsFPClass);1972RecursivelyDeleteTriviallyDeadInstructions(Cmp);1973return true;1974}19751976bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {1977if (sinkCmpExpression(Cmp, *TLI))1978return true;19791980if (combineToUAddWithOverflow(Cmp, ModifiedDT))1981return true;19821983if (combineToUSubWithOverflow(Cmp, ModifiedDT))1984return true;19851986if (foldICmpWithDominatingICmp(Cmp, *TLI))1987return true;19881989if (swapICmpOperandsToExposeCSEOpportunities(Cmp))1990return true;19911992if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))1993return true;19941995return false;1996}19971998/// Duplicate and sink the given 'and' instruction into user blocks where it is1999/// used in a compare to allow isel to generate better code for targets where2000/// this operation can be combined.2001///2002/// Return true if any changes are made.2003static bool sinkAndCmp0Expression(Instruction *AndI, const TargetLowering &TLI,2004SetOfInstrs &InsertedInsts) {2005// Double-check that we're not trying to optimize an instruction that was2006// already optimized by some other part of this pass.2007assert(!InsertedInsts.count(AndI) &&2008"Attempting to optimize already optimized and instruction");2009(void)InsertedInsts;20102011// Nothing to do for single use in same basic block.2012if (AndI->hasOneUse() &&2013AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())2014return false;20152016// Try to avoid cases where sinking/duplicating is likely to increase register2017// pressure.2018if (!isa<ConstantInt>(AndI->getOperand(0)) &&2019!isa<ConstantInt>(AndI->getOperand(1)) &&2020AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())2021return false;20222023for (auto *U : AndI->users()) {2024Instruction *User = cast<Instruction>(U);20252026// Only sink 'and' feeding icmp with 0.2027if (!isa<ICmpInst>(User))2028return false;20292030auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));2031if (!CmpC || 
!CmpC->isZero())2032return false;2033}20342035if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))2036return false;20372038LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");2039LLVM_DEBUG(AndI->getParent()->dump());20402041// Push the 'and' into the same block as the icmp 0. There should only be2042// one (icmp (and, 0)) in each block, since CSE/GVN should have removed any2043// others, so we don't need to keep track of which BBs we insert into.2044for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();2045UI != E;) {2046Use &TheUse = UI.getUse();2047Instruction *User = cast<Instruction>(*UI);20482049// Preincrement use iterator so we don't invalidate it.2050++UI;20512052LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");20532054// Keep the 'and' in the same place if the use is already in the same block.2055Instruction *InsertPt =2056User->getParent() == AndI->getParent() ? AndI : User;2057Instruction *InsertedAnd = BinaryOperator::Create(2058Instruction::And, AndI->getOperand(0), AndI->getOperand(1), "",2059InsertPt->getIterator());2060// Propagate the debug info.2061InsertedAnd->setDebugLoc(AndI->getDebugLoc());20622063// Replace a use of the 'and' with a use of the new 'and'.2064TheUse = InsertedAnd;2065++NumAndUses;2066LLVM_DEBUG(User->getParent()->dump());2067}20682069// We removed all uses, nuke the and.2070AndI->eraseFromParent();2071return true;2072}20732074/// Check if the candidates could be combined with a shift instruction, which2075/// includes:2076/// 1. Truncate instruction2077/// 2. And instruction and the imm is a mask of the low bits:2078/// imm & (imm+1) == 02079static bool isExtractBitsCandidateUse(Instruction *User) {2080if (!isa<TruncInst>(User)) {2081if (User->getOpcode() != Instruction::And ||2082!isa<ConstantInt>(User->getOperand(1)))2083return false;20842085const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();20862087if ((Cimm & (Cimm + 1)).getBoolValue())2088return false;2089}2090return true;2091}20922093/// Sink both shift and truncate instruction to the use of truncate's BB.2094static bool2095SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,2096DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,2097const TargetLowering &TLI, const DataLayout &DL) {2098BasicBlock *UserBB = User->getParent();2099DenseMap<BasicBlock *, CastInst *> InsertedTruncs;2100auto *TruncI = cast<TruncInst>(User);2101bool MadeChange = false;21022103for (Value::user_iterator TruncUI = TruncI->user_begin(),2104TruncE = TruncI->user_end();2105TruncUI != TruncE;) {21062107Use &TruncTheUse = TruncUI.getUse();2108Instruction *TruncUser = cast<Instruction>(*TruncUI);2109// Preincrement use iterator so we don't invalidate it.21102111++TruncUI;21122113int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());2114if (!ISDOpcode)2115continue;21162117// If the use is actually a legal node, there will not be an2118// implicit truncate.2119// FIXME: always querying the result type is just an2120// approximation; some nodes' legality is determined by the2121// operand or other means. 
There's no good way to find out though.2122if (TLI.isOperationLegalOrCustom(2123ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))2124continue;21252126// Don't bother for PHI nodes.2127if (isa<PHINode>(TruncUser))2128continue;21292130BasicBlock *TruncUserBB = TruncUser->getParent();21312132if (UserBB == TruncUserBB)2133continue;21342135BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];2136CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];21372138if (!InsertedShift && !InsertedTrunc) {2139BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();2140assert(InsertPt != TruncUserBB->end());2141// Sink the shift2142if (ShiftI->getOpcode() == Instruction::AShr)2143InsertedShift =2144BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "");2145else2146InsertedShift =2147BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "");2148InsertedShift->setDebugLoc(ShiftI->getDebugLoc());2149InsertedShift->insertBefore(*TruncUserBB, InsertPt);21502151// Sink the trunc2152BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();2153TruncInsertPt++;2154// It will go ahead of any debug-info.2155TruncInsertPt.setHeadBit(true);2156assert(TruncInsertPt != TruncUserBB->end());21572158InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,2159TruncI->getType(), "");2160InsertedTrunc->insertBefore(*TruncUserBB, TruncInsertPt);2161InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());21622163MadeChange = true;21642165TruncTheUse = InsertedTrunc;2166}2167}2168return MadeChange;2169}21702171/// Sink the shift *right* instruction into user blocks if the uses could2172/// potentially be combined with this shift instruction and generate BitExtract2173/// instruction. It will only be applied if the architecture supports BitExtract2174/// instruction. Here is an example:2175/// BB1:2176/// %x.extract.shift = lshr i64 %arg1, 322177/// BB2:2178/// %x.extract.trunc = trunc i64 %x.extract.shift to i162179/// ==>2180///2181/// BB2:2182/// %x.extract.shift.1 = lshr i64 %arg1, 322183/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i162184///2185/// CodeGen will recognize the pattern in BB2 and generate BitExtract2186/// instruction.2187/// Return true if any changes are made.2188static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,2189const TargetLowering &TLI,2190const DataLayout &DL) {2191BasicBlock *DefBB = ShiftI->getParent();21922193/// Only insert instructions in each block once.2194DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;21952196bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));21972198bool MadeChange = false;2199for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();2200UI != E;) {2201Use &TheUse = UI.getUse();2202Instruction *User = cast<Instruction>(*UI);2203// Preincrement use iterator so we don't invalidate it.2204++UI;22052206// Don't bother for PHI nodes.2207if (isa<PHINode>(User))2208continue;22092210if (!isExtractBitsCandidateUse(User))2211continue;22122213BasicBlock *UserBB = User->getParent();22142215if (UserBB == DefBB) {2216// If the shift and truncate instruction are in the same BB. The use of2217// the truncate(TruncUse) may still introduce another truncate if not2218// legal. 
In this case, we would like to sink both shift and truncate2219// instruction to the BB of TruncUse.2220// for example:2221// BB1:2222// i64 shift.result = lshr i64 opnd, imm2223// trunc.result = trunc shift.result to i162224//2225// BB2:2226// ----> We will have an implicit truncate here if the architecture does2227// not have i16 compare.2228// cmp i16 trunc.result, opnd22229//2230if (isa<TruncInst>(User) &&2231shiftIsLegal2232// If the type of the truncate is legal, no truncate will be2233// introduced in other basic blocks.2234&& (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))2235MadeChange =2236SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);22372238continue;2239}2240// If we have already inserted a shift into this block, use it.2241BinaryOperator *&InsertedShift = InsertedShifts[UserBB];22422243if (!InsertedShift) {2244BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();2245assert(InsertPt != UserBB->end());22462247if (ShiftI->getOpcode() == Instruction::AShr)2248InsertedShift =2249BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "");2250else2251InsertedShift =2252BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "");2253InsertedShift->insertBefore(*UserBB, InsertPt);2254InsertedShift->setDebugLoc(ShiftI->getDebugLoc());22552256MadeChange = true;2257}22582259// Replace a use of the shift with a use of the new shift.2260TheUse = InsertedShift;2261}22622263// If we removed all uses, or there are none, nuke the shift.2264if (ShiftI->use_empty()) {2265salvageDebugInfo(*ShiftI);2266ShiftI->eraseFromParent();2267MadeChange = true;2268}22692270return MadeChange;2271}22722273/// If counting leading or trailing zeros is an expensive operation and a zero2274/// input is defined, add a check for zero to avoid calling the intrinsic.2275///2276/// We want to transform:2277/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)2278///2279/// into:2280/// entry:2281/// %cmpz = icmp eq i64 %A, 02282/// br i1 %cmpz, label %cond.end, label %cond.false2283/// cond.false:2284/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)2285/// br label %cond.end2286/// cond.end:2287/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]2288///2289/// If the transform is performed, return true and set ModifiedDT to true.2290static bool despeculateCountZeros(IntrinsicInst *CountZeros,2291LoopInfo &LI,2292const TargetLowering *TLI,2293const DataLayout *DL, ModifyDT &ModifiedDT,2294SmallSet<BasicBlock *, 32> &FreshBBs,2295bool IsHugeFunc) {2296// If a zero input is undefined, it doesn't make sense to despeculate that.2297if (match(CountZeros->getOperand(1), m_One()))2298return false;22992300// If it's cheap to speculate, there's nothing to do.2301Type *Ty = CountZeros->getType();2302auto IntrinsicID = CountZeros->getIntrinsicID();2303if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||2304(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))2305return false;23062307// Only handle legal scalar cases. 
Anything else requires too much work.2308unsigned SizeInBits = Ty->getScalarSizeInBits();2309if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())2310return false;23112312// Bail if the value is never zero.2313Use &Op = CountZeros->getOperandUse(0);2314if (isKnownNonZero(Op, *DL))2315return false;23162317// The intrinsic will be sunk behind a compare against zero and branch.2318BasicBlock *StartBlock = CountZeros->getParent();2319BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");2320if (IsHugeFunc)2321FreshBBs.insert(CallBlock);23222323// Create another block after the count zero intrinsic. A PHI will be added2324// in this block to select the result of the intrinsic or the bit-width2325// constant if the input to the intrinsic is zero.2326BasicBlock::iterator SplitPt = std::next(BasicBlock::iterator(CountZeros));2327// Any debug-info after CountZeros should not be included.2328SplitPt.setHeadBit(true);2329BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");2330if (IsHugeFunc)2331FreshBBs.insert(EndBlock);23322333// Update the LoopInfo. The new blocks are in the same loop as the start2334// block.2335if (Loop *L = LI.getLoopFor(StartBlock)) {2336L->addBasicBlockToLoop(CallBlock, LI);2337L->addBasicBlockToLoop(EndBlock, LI);2338}23392340// Set up a builder to create a compare, conditional branch, and PHI.2341IRBuilder<> Builder(CountZeros->getContext());2342Builder.SetInsertPoint(StartBlock->getTerminator());2343Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());23442345// Replace the unconditional branch that was created by the first split with2346// a compare against zero and a conditional branch.2347Value *Zero = Constant::getNullValue(Ty);2348// Avoid introducing branch on poison. This also replaces the ctz operand.2349if (!isGuaranteedNotToBeUndefOrPoison(Op))2350Op = Builder.CreateFreeze(Op, Op->getName() + ".fr");2351Value *Cmp = Builder.CreateICmpEQ(Op, Zero, "cmpz");2352Builder.CreateCondBr(Cmp, EndBlock, CallBlock);2353StartBlock->getTerminator()->eraseFromParent();23542355// Create a PHI in the end block to select either the output of the intrinsic2356// or the bit width of the operand.2357Builder.SetInsertPoint(EndBlock, EndBlock->begin());2358PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");2359replaceAllUsesWith(CountZeros, PN, FreshBBs, IsHugeFunc);2360Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));2361PN->addIncoming(BitWidth, StartBlock);2362PN->addIncoming(CountZeros, CallBlock);23632364// We are explicitly handling the zero case, so we can set the intrinsic's2365// undefined zero argument to 'true'. 
This will also prevent reprocessing the2366// intrinsic; we only despeculate when a zero input is defined.2367CountZeros->setArgOperand(1, Builder.getTrue());2368ModifiedDT = ModifyDT::ModifyBBDT;2369return true;2370}23712372bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {2373BasicBlock *BB = CI->getParent();23742375// Lower inline assembly if we can.2376// If we found an inline asm expession, and if the target knows how to2377// lower it to normal LLVM code, do so now.2378if (CI->isInlineAsm()) {2379if (TLI->ExpandInlineAsm(CI)) {2380// Avoid invalidating the iterator.2381CurInstIterator = BB->begin();2382// Avoid processing instructions out of order, which could cause2383// reuse before a value is defined.2384SunkAddrs.clear();2385return true;2386}2387// Sink address computing for memory operands into the block.2388if (optimizeInlineAsmInst(CI))2389return true;2390}23912392// Align the pointer arguments to this call if the target thinks it's a good2393// idea2394unsigned MinSize;2395Align PrefAlign;2396if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {2397for (auto &Arg : CI->args()) {2398// We want to align both objects whose address is used directly and2399// objects whose address is used in casts and GEPs, though it only makes2400// sense for GEPs if the offset is a multiple of the desired alignment and2401// if size - offset meets the size threshold.2402if (!Arg->getType()->isPointerTy())2403continue;2404APInt Offset(DL->getIndexSizeInBits(2405cast<PointerType>(Arg->getType())->getAddressSpace()),24060);2407Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);2408uint64_t Offset2 = Offset.getLimitedValue();2409if (!isAligned(PrefAlign, Offset2))2410continue;2411AllocaInst *AI;2412if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlign() < PrefAlign &&2413DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)2414AI->setAlignment(PrefAlign);2415// Global variables can only be aligned if they are defined in this2416// object (i.e. they are uniquely initialized in this object), and2417// over-aligning global variables that have an explicit section is2418// forbidden.2419GlobalVariable *GV;2420if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&2421GV->getPointerAlignment(*DL) < PrefAlign &&2422DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2)2423GV->setAlignment(PrefAlign);2424}2425}2426// If this is a memcpy (or similar) then we may be able to improve the2427// alignment.2428if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {2429Align DestAlign = getKnownAlignment(MI->getDest(), *DL);2430MaybeAlign MIDestAlign = MI->getDestAlign();2431if (!MIDestAlign || DestAlign > *MIDestAlign)2432MI->setDestAlignment(DestAlign);2433if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {2434MaybeAlign MTISrcAlign = MTI->getSourceAlign();2435Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);2436if (!MTISrcAlign || SrcAlign > *MTISrcAlign)2437MTI->setSourceAlignment(SrcAlign);2438}2439}24402441// If we have a cold call site, try to sink addressing computation into the2442// cold block. This interacts with our handling for loads and stores to2443// ensure that we can fold all uses of a potential addressing computation2444// into their uses. 
TODO: generalize this to work over profiling data2445if (CI->hasFnAttr(Attribute::Cold) && !OptSize &&2446!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))2447for (auto &Arg : CI->args()) {2448if (!Arg->getType()->isPointerTy())2449continue;2450unsigned AS = Arg->getType()->getPointerAddressSpace();2451if (optimizeMemoryInst(CI, Arg, Arg->getType(), AS))2452return true;2453}24542455IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);2456if (II) {2457switch (II->getIntrinsicID()) {2458default:2459break;2460case Intrinsic::assume:2461llvm_unreachable("llvm.assume should have been removed already");2462case Intrinsic::allow_runtime_check:2463case Intrinsic::allow_ubsan_check:2464case Intrinsic::experimental_widenable_condition: {2465// Give up on future widening opportunities so that we can fold away dead2466// paths and merge blocks before going into block-local instruction2467// selection.2468if (II->use_empty()) {2469II->eraseFromParent();2470return true;2471}2472Constant *RetVal = ConstantInt::getTrue(II->getContext());2473resetIteratorIfInvalidatedWhileCalling(BB, [&]() {2474replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);2475});2476return true;2477}2478case Intrinsic::objectsize:2479llvm_unreachable("llvm.objectsize.* should have been lowered already");2480case Intrinsic::is_constant:2481llvm_unreachable("llvm.is.constant.* should have been lowered already");2482case Intrinsic::aarch64_stlxr:2483case Intrinsic::aarch64_stxr: {2484ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));2485if (!ExtVal || !ExtVal->hasOneUse() ||2486ExtVal->getParent() == CI->getParent())2487return false;2488// Sink a zext feeding stlxr/stxr before it, so it can be folded into it.2489ExtVal->moveBefore(CI);2490// Mark this instruction as "inserted by CGP", so that other2491// optimizations don't touch it.2492InsertedInsts.insert(ExtVal);2493return true;2494}24952496case Intrinsic::launder_invariant_group:2497case Intrinsic::strip_invariant_group: {2498Value *ArgVal = II->getArgOperand(0);2499auto it = LargeOffsetGEPMap.find(II);2500if (it != LargeOffsetGEPMap.end()) {2501// Merge entries in LargeOffsetGEPMap to reflect the RAUW.2502// Make sure not to have to deal with iterator invalidation2503// after possibly adding ArgVal to LargeOffsetGEPMap.2504auto GEPs = std::move(it->second);2505LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());2506LargeOffsetGEPMap.erase(II);2507}25082509replaceAllUsesWith(II, ArgVal, FreshBBs, IsHugeFunc);2510II->eraseFromParent();2511return true;2512}2513case Intrinsic::cttz:2514case Intrinsic::ctlz:2515// If counting zeros is expensive, try to avoid it.2516return despeculateCountZeros(II, *LI, TLI, DL, ModifiedDT, FreshBBs,2517IsHugeFunc);2518case Intrinsic::fshl:2519case Intrinsic::fshr:2520return optimizeFunnelShift(II);2521case Intrinsic::dbg_assign:2522case Intrinsic::dbg_value:2523return fixupDbgValue(II);2524case Intrinsic::masked_gather:2525return optimizeGatherScatterInst(II, II->getArgOperand(0));2526case Intrinsic::masked_scatter:2527return optimizeGatherScatterInst(II, II->getArgOperand(1));2528}25292530SmallVector<Value *, 2> PtrOps;2531Type *AccessTy;2532if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))2533while (!PtrOps.empty()) {2534Value *PtrVal = PtrOps.pop_back_val();2535unsigned AS = PtrVal->getType()->getPointerAddressSpace();2536if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))2537return true;2538}2539}25402541// From here on out we're working with named functions.2542if (!CI->getCalledFunction())2543return false;25442545// Lower all 
default uses of _chk calls. This is very similar2546// to what InstCombineCalls does, but here we are only lowering calls2547// to fortified library functions (e.g. __memcpy_chk) that have the default2548// "don't know" as the objectsize. Anything else should be left alone.2549FortifiedLibCallSimplifier Simplifier(TLInfo, true);2550IRBuilder<> Builder(CI);2551if (Value *V = Simplifier.optimizeCall(CI, Builder)) {2552replaceAllUsesWith(CI, V, FreshBBs, IsHugeFunc);2553CI->eraseFromParent();2554return true;2555}25562557return false;2558}25592560static bool isIntrinsicOrLFToBeTailCalled(const TargetLibraryInfo *TLInfo,2561const CallInst *CI) {2562assert(CI && CI->use_empty());25632564if (const auto *II = dyn_cast<IntrinsicInst>(CI))2565switch (II->getIntrinsicID()) {2566case Intrinsic::memset:2567case Intrinsic::memcpy:2568case Intrinsic::memmove:2569return true;2570default:2571return false;2572}25732574LibFunc LF;2575Function *Callee = CI->getCalledFunction();2576if (Callee && TLInfo && TLInfo->getLibFunc(*Callee, LF))2577switch (LF) {2578case LibFunc_strcpy:2579case LibFunc_strncpy:2580case LibFunc_strcat:2581case LibFunc_strncat:2582return true;2583default:2584return false;2585}25862587return false;2588}25892590/// Look for opportunities to duplicate return instructions to the predecessor2591/// to enable tail call optimizations. The case it is currently looking for is2592/// the following one. Known intrinsics or library function that may be tail2593/// called are taken into account as well.2594/// @code2595/// bb0:2596/// %tmp0 = tail call i32 @f0()2597/// br label %return2598/// bb1:2599/// %tmp1 = tail call i32 @f1()2600/// br label %return2601/// bb2:2602/// %tmp2 = tail call i32 @f2()2603/// br label %return2604/// return:2605/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]2606/// ret i32 %retval2607/// @endcode2608///2609/// =>2610///2611/// @code2612/// bb0:2613/// %tmp0 = tail call i32 @f0()2614/// ret i32 %tmp02615/// bb1:2616/// %tmp1 = tail call i32 @f1()2617/// ret i32 %tmp12618/// bb2:2619/// %tmp2 = tail call i32 @f2()2620/// ret i32 %tmp22621/// @endcode2622bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,2623ModifyDT &ModifiedDT) {2624if (!BB->getTerminator())2625return false;26262627ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());2628if (!RetI)2629return false;26302631assert(LI->getLoopFor(BB) == nullptr && "A return block cannot be in a loop");26322633PHINode *PN = nullptr;2634ExtractValueInst *EVI = nullptr;2635BitCastInst *BCI = nullptr;2636Value *V = RetI->getReturnValue();2637if (V) {2638BCI = dyn_cast<BitCastInst>(V);2639if (BCI)2640V = BCI->getOperand(0);26412642EVI = dyn_cast<ExtractValueInst>(V);2643if (EVI) {2644V = EVI->getOperand(0);2645if (!llvm::all_of(EVI->indices(), [](unsigned idx) { return idx == 0; }))2646return false;2647}26482649PN = dyn_cast<PHINode>(V);2650}26512652if (PN && PN->getParent() != BB)2653return false;26542655auto isLifetimeEndOrBitCastFor = [](const Instruction *Inst) {2656const BitCastInst *BC = dyn_cast<BitCastInst>(Inst);2657if (BC && BC->hasOneUse())2658Inst = BC->user_back();26592660if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))2661return II->getIntrinsicID() == Intrinsic::lifetime_end;2662return false;2663};26642665// Make sure there are no instructions between the first instruction2666// and return.2667const Instruction *BI = BB->getFirstNonPHI();2668// Skip over debug and the bitcast.2669while (isa<DbgInfoIntrinsic>(BI) || BI == BCI || BI == EVI 
||2670isa<PseudoProbeInst>(BI) || isLifetimeEndOrBitCastFor(BI))2671BI = BI->getNextNode();2672if (BI != RetI)2673return false;26742675/// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail2676/// call.2677const Function *F = BB->getParent();2678SmallVector<BasicBlock *, 4> TailCallBBs;2679if (PN) {2680for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {2681// Look through bitcasts.2682Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();2683CallInst *CI = dyn_cast<CallInst>(IncomingVal);2684BasicBlock *PredBB = PN->getIncomingBlock(I);2685// Make sure the phi value is indeed produced by the tail call.2686if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&2687TLI->mayBeEmittedAsTailCall(CI) &&2688attributesPermitTailCall(F, CI, RetI, *TLI)) {2689TailCallBBs.push_back(PredBB);2690} else {2691// Consider the cases in which the phi value is indirectly produced by2692// the tail call, for example when encountering memset(), memmove(),2693// strcpy(), whose return value may have been optimized out. In such2694// cases, the value needs to be the first function argument.2695//2696// bb0:2697// tail call void @llvm.memset.p0.i64(ptr %0, i8 0, i64 %1)2698// br label %return2699// return:2700// %phi = phi ptr [ %0, %bb0 ], [ %2, %entry ]2701if (PredBB && PredBB->getSingleSuccessor() == BB)2702CI = dyn_cast_or_null<CallInst>(2703PredBB->getTerminator()->getPrevNonDebugInstruction(true));27042705if (CI && CI->use_empty() &&2706isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&2707IncomingVal == CI->getArgOperand(0) &&2708TLI->mayBeEmittedAsTailCall(CI) &&2709attributesPermitTailCall(F, CI, RetI, *TLI))2710TailCallBBs.push_back(PredBB);2711}2712}2713} else {2714SmallPtrSet<BasicBlock *, 4> VisitedBBs;2715for (BasicBlock *Pred : predecessors(BB)) {2716if (!VisitedBBs.insert(Pred).second)2717continue;2718if (Instruction *I = Pred->rbegin()->getPrevNonDebugInstruction(true)) {2719CallInst *CI = dyn_cast<CallInst>(I);2720if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&2721attributesPermitTailCall(F, CI, RetI, *TLI)) {2722// Either we return void or the return value must be the first2723// argument of a known intrinsic or library function.2724if (!V || isa<UndefValue>(V) ||2725(isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&2726V == CI->getArgOperand(0))) {2727TailCallBBs.push_back(Pred);2728}2729}2730}2731}2732}27332734bool Changed = false;2735for (auto const &TailCallBB : TailCallBBs) {2736// Make sure the call instruction is followed by an unconditional branch to2737// the return block.2738BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator());2739if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)2740continue;27412742// Duplicate the return into TailCallBB.2743(void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);2744assert(!VerifyBFIUpdates ||2745BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));2746BFI->setBlockFreq(BB,2747(BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)));2748ModifiedDT = ModifyDT::ModifyBBDT;2749Changed = true;2750++NumRetsDup;2751}27522753// If we eliminated all predecessors of the block, delete the block now.2754if (Changed && !BB->hasAddressTaken() && pred_empty(BB))2755BB->eraseFromParent();27562757return Changed;2758}27592760//===----------------------------------------------------------------------===//2761// Memory Optimization2762//===----------------------------------------------------------------------===//27632764namespace {27652766/// This is an extended version of 
TargetLowering::AddrMode2767/// which holds actual Value*'s for register values.2768struct ExtAddrMode : public TargetLowering::AddrMode {2769Value *BaseReg = nullptr;2770Value *ScaledReg = nullptr;2771Value *OriginalValue = nullptr;2772bool InBounds = true;27732774enum FieldName {2775NoField = 0x00,2776BaseRegField = 0x01,2777BaseGVField = 0x02,2778BaseOffsField = 0x04,2779ScaledRegField = 0x08,2780ScaleField = 0x10,2781MultipleFields = 0xff2782};27832784ExtAddrMode() = default;27852786void print(raw_ostream &OS) const;2787void dump() const;27882789FieldName compare(const ExtAddrMode &other) {2790// First check that the types are the same on each field, as differing types2791// is something we can't cope with later on.2792if (BaseReg && other.BaseReg &&2793BaseReg->getType() != other.BaseReg->getType())2794return MultipleFields;2795if (BaseGV && other.BaseGV && BaseGV->getType() != other.BaseGV->getType())2796return MultipleFields;2797if (ScaledReg && other.ScaledReg &&2798ScaledReg->getType() != other.ScaledReg->getType())2799return MultipleFields;28002801// Conservatively reject 'inbounds' mismatches.2802if (InBounds != other.InBounds)2803return MultipleFields;28042805// Check each field to see if it differs.2806unsigned Result = NoField;2807if (BaseReg != other.BaseReg)2808Result |= BaseRegField;2809if (BaseGV != other.BaseGV)2810Result |= BaseGVField;2811if (BaseOffs != other.BaseOffs)2812Result |= BaseOffsField;2813if (ScaledReg != other.ScaledReg)2814Result |= ScaledRegField;2815// Don't count 0 as being a different scale, because that actually means2816// unscaled (which will already be counted by having no ScaledReg).2817if (Scale && other.Scale && Scale != other.Scale)2818Result |= ScaleField;28192820if (llvm::popcount(Result) > 1)2821return MultipleFields;2822else2823return static_cast<FieldName>(Result);2824}28252826// An AddrMode is trivial if it involves no calculation i.e. 
it is just a base2827// with no offset.2828bool isTrivial() {2829// An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is2830// trivial if at most one of these terms is nonzero, except that BaseGV and2831// BaseReg both being zero actually means a null pointer value, which we2832// consider to be 'non-zero' here.2833return !BaseOffs && !Scale && !(BaseGV && BaseReg);2834}28352836Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {2837switch (Field) {2838default:2839return nullptr;2840case BaseRegField:2841return BaseReg;2842case BaseGVField:2843return BaseGV;2844case ScaledRegField:2845return ScaledReg;2846case BaseOffsField:2847return ConstantInt::get(IntPtrTy, BaseOffs);2848}2849}28502851void SetCombinedField(FieldName Field, Value *V,2852const SmallVectorImpl<ExtAddrMode> &AddrModes) {2853switch (Field) {2854default:2855llvm_unreachable("Unhandled fields are expected to be rejected earlier");2856break;2857case ExtAddrMode::BaseRegField:2858BaseReg = V;2859break;2860case ExtAddrMode::BaseGVField:2861// A combined BaseGV is an Instruction, not a GlobalValue, so it goes2862// in the BaseReg field.2863assert(BaseReg == nullptr);2864BaseReg = V;2865BaseGV = nullptr;2866break;2867case ExtAddrMode::ScaledRegField:2868ScaledReg = V;2869// If we have a mix of scaled and unscaled addrmodes then we want scale2870// to be the scale and not zero.2871if (!Scale)2872for (const ExtAddrMode &AM : AddrModes)2873if (AM.Scale) {2874Scale = AM.Scale;2875break;2876}2877break;2878case ExtAddrMode::BaseOffsField:2879// The offset is no longer a constant, so it goes in ScaledReg with a2880// scale of 1.2881assert(ScaledReg == nullptr);2882ScaledReg = V;2883Scale = 1;2884BaseOffs = 0;2885break;2886}2887}2888};28892890#ifndef NDEBUG2891static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {2892AM.print(OS);2893return OS;2894}2895#endif28962897#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2898void ExtAddrMode::print(raw_ostream &OS) const {2899bool NeedPlus = false;2900OS << "[";2901if (InBounds)2902OS << "inbounds ";2903if (BaseGV) {2904OS << "GV:";2905BaseGV->printAsOperand(OS, /*PrintType=*/false);2906NeedPlus = true;2907}29082909if (BaseOffs) {2910OS << (NeedPlus ? " + " : "") << BaseOffs;2911NeedPlus = true;2912}29132914if (BaseReg) {2915OS << (NeedPlus ? " + " : "") << "Base:";2916BaseReg->printAsOperand(OS, /*PrintType=*/false);2917NeedPlus = true;2918}2919if (Scale) {2920OS << (NeedPlus ? " + " : "") << Scale << "*";2921ScaledReg->printAsOperand(OS, /*PrintType=*/false);2922}29232924OS << ']';2925}29262927LLVM_DUMP_METHOD void ExtAddrMode::dump() const {2928print(dbgs());2929dbgs() << '\n';2930}2931#endif29322933} // end anonymous namespace29342935namespace {29362937/// This class provides transaction based operation on the IR.2938/// Every change made through this class is recorded in the internal state and2939/// can be undone (rollback) until commit is called.2940/// CGP does not check if instructions could be speculatively executed when2941/// moved. 
Preserving the original location would pessimize the debugging2942/// experience, as well as negatively impact the quality of sample PGO.2943class TypePromotionTransaction {2944/// This represents the common interface of the individual transaction.2945/// Each class implements the logic for doing one specific modification on2946/// the IR via the TypePromotionTransaction.2947class TypePromotionAction {2948protected:2949/// The Instruction modified.2950Instruction *Inst;29512952public:2953/// Constructor of the action.2954/// The constructor performs the related action on the IR.2955TypePromotionAction(Instruction *Inst) : Inst(Inst) {}29562957virtual ~TypePromotionAction() = default;29582959/// Undo the modification done by this action.2960/// When this method is called, the IR must be in the same state as it was2961/// before this action was applied.2962/// \pre Undoing the action works if and only if the IR is in the exact same2963/// state as it was directly after this action was applied.2964virtual void undo() = 0;29652966/// Advocate every change made by this action.2967/// When the results on the IR of the action are to be kept, it is important2968/// to call this function, otherwise hidden information may be kept forever.2969virtual void commit() {2970// Nothing to be done, this action is not doing anything.2971}2972};29732974/// Utility to remember the position of an instruction.2975class InsertionHandler {2976/// Position of an instruction.2977/// Either an instruction:2978/// - Is the first in a basic block: BB is used.2979/// - Has a previous instruction: PrevInst is used.2980union {2981Instruction *PrevInst;2982BasicBlock *BB;2983} Point;2984std::optional<DbgRecord::self_iterator> BeforeDbgRecord = std::nullopt;29852986/// Remember whether or not the instruction had a previous instruction.2987bool HasPrevInstruction;29882989public:2990/// Record the position of \p Inst.2991InsertionHandler(Instruction *Inst) {2992HasPrevInstruction = (Inst != &*(Inst->getParent()->begin()));2993BasicBlock *BB = Inst->getParent();29942995// Record where we would have to re-insert the instruction in the sequence2996// of DbgRecords, if we ended up reinserting.2997if (BB->IsNewDbgInfoFormat)2998BeforeDbgRecord = Inst->getDbgReinsertionPosition();29993000if (HasPrevInstruction) {3001Point.PrevInst = &*std::prev(Inst->getIterator());3002} else {3003Point.BB = BB;3004}3005}30063007/// Insert \p Inst at the recorded position.3008void insert(Instruction *Inst) {3009if (HasPrevInstruction) {3010if (Inst->getParent())3011Inst->removeFromParent();3012Inst->insertAfter(&*Point.PrevInst);3013} else {3014BasicBlock::iterator Position = Point.BB->getFirstInsertionPt();3015if (Inst->getParent())3016Inst->moveBefore(*Point.BB, Position);3017else3018Inst->insertBefore(*Point.BB, Position);3019}30203021Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDbgRecord);3022}3023};30243025/// Move an instruction before another.3026class InstructionMoveBefore : public TypePromotionAction {3027/// Original position of the instruction.3028InsertionHandler Position;30293030public:3031/// Move \p Inst before \p Before.3032InstructionMoveBefore(Instruction *Inst, Instruction *Before)3033: TypePromotionAction(Inst), Position(Inst) {3034LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before3035<< "\n");3036Inst->moveBefore(Before);3037}30383039/// Move the instruction back to its original position.3040void undo() override {3041LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << 
"\n");3042Position.insert(Inst);3043}3044};30453046/// Set the operand of an instruction with a new value.3047class OperandSetter : public TypePromotionAction {3048/// Original operand of the instruction.3049Value *Origin;30503051/// Index of the modified instruction.3052unsigned Idx;30533054public:3055/// Set \p Idx operand of \p Inst with \p NewVal.3056OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)3057: TypePromotionAction(Inst), Idx(Idx) {3058LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"3059<< "for:" << *Inst << "\n"3060<< "with:" << *NewVal << "\n");3061Origin = Inst->getOperand(Idx);3062Inst->setOperand(Idx, NewVal);3063}30643065/// Restore the original value of the instruction.3066void undo() override {3067LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"3068<< "for: " << *Inst << "\n"3069<< "with: " << *Origin << "\n");3070Inst->setOperand(Idx, Origin);3071}3072};30733074/// Hide the operands of an instruction.3075/// Do as if this instruction was not using any of its operands.3076class OperandsHider : public TypePromotionAction {3077/// The list of original operands.3078SmallVector<Value *, 4> OriginalValues;30793080public:3081/// Remove \p Inst from the uses of the operands of \p Inst.3082OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {3083LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");3084unsigned NumOpnds = Inst->getNumOperands();3085OriginalValues.reserve(NumOpnds);3086for (unsigned It = 0; It < NumOpnds; ++It) {3087// Save the current operand.3088Value *Val = Inst->getOperand(It);3089OriginalValues.push_back(Val);3090// Set a dummy one.3091// We could use OperandSetter here, but that would imply an overhead3092// that we are not willing to pay.3093Inst->setOperand(It, UndefValue::get(Val->getType()));3094}3095}30963097/// Restore the original list of uses.3098void undo() override {3099LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");3100for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)3101Inst->setOperand(It, OriginalValues[It]);3102}3103};31043105/// Build a truncate instruction.3106class TruncBuilder : public TypePromotionAction {3107Value *Val;31083109public:3110/// Build a truncate instruction of \p Opnd producing a \p Ty3111/// result.3112/// trunc Opnd to Ty.3113TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {3114IRBuilder<> Builder(Opnd);3115Builder.SetCurrentDebugLocation(DebugLoc());3116Val = Builder.CreateTrunc(Opnd, Ty, "promoted");3117LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");3118}31193120/// Get the built value.3121Value *getBuiltValue() { return Val; }31223123/// Remove the built instruction.3124void undo() override {3125LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");3126if (Instruction *IVal = dyn_cast<Instruction>(Val))3127IVal->eraseFromParent();3128}3129};31303131/// Build a sign extension instruction.3132class SExtBuilder : public TypePromotionAction {3133Value *Val;31343135public:3136/// Build a sign extension instruction of \p Opnd producing a \p Ty3137/// result.3138/// sext Opnd to Ty.3139SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)3140: TypePromotionAction(InsertPt) {3141IRBuilder<> Builder(InsertPt);3142Val = Builder.CreateSExt(Opnd, Ty, "promoted");3143LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");3144}31453146/// Get the built value.3147Value *getBuiltValue() { return Val; }31483149/// Remove the built instruction.3150void undo() override {3151LLVM_DEBUG(dbgs() << "Undo: 
SExtBuilder: " << *Val << "\n");3152if (Instruction *IVal = dyn_cast<Instruction>(Val))3153IVal->eraseFromParent();3154}3155};31563157/// Build a zero extension instruction.3158class ZExtBuilder : public TypePromotionAction {3159Value *Val;31603161public:3162/// Build a zero extension instruction of \p Opnd producing a \p Ty3163/// result.3164/// zext Opnd to Ty.3165ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)3166: TypePromotionAction(InsertPt) {3167IRBuilder<> Builder(InsertPt);3168Builder.SetCurrentDebugLocation(DebugLoc());3169Val = Builder.CreateZExt(Opnd, Ty, "promoted");3170LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");3171}31723173/// Get the built value.3174Value *getBuiltValue() { return Val; }31753176/// Remove the built instruction.3177void undo() override {3178LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");3179if (Instruction *IVal = dyn_cast<Instruction>(Val))3180IVal->eraseFromParent();3181}3182};31833184/// Mutate an instruction to another type.3185class TypeMutator : public TypePromotionAction {3186/// Record the original type.3187Type *OrigTy;31883189public:3190/// Mutate the type of \p Inst into \p NewTy.3191TypeMutator(Instruction *Inst, Type *NewTy)3192: TypePromotionAction(Inst), OrigTy(Inst->getType()) {3193LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy3194<< "\n");3195Inst->mutateType(NewTy);3196}31973198/// Mutate the instruction back to its original type.3199void undo() override {3200LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy3201<< "\n");3202Inst->mutateType(OrigTy);3203}3204};32053206/// Replace the uses of an instruction by another instruction.3207class UsesReplacer : public TypePromotionAction {3208/// Helper structure to keep track of the replaced uses.3209struct InstructionAndIdx {3210/// The instruction using the instruction.3211Instruction *Inst;32123213/// The index where this instruction is used for Inst.3214unsigned Idx;32153216InstructionAndIdx(Instruction *Inst, unsigned Idx)3217: Inst(Inst), Idx(Idx) {}3218};32193220/// Keep track of the original uses (pair Instruction, Index).3221SmallVector<InstructionAndIdx, 4> OriginalUses;3222/// Keep track of the debug users.3223SmallVector<DbgValueInst *, 1> DbgValues;3224/// And non-instruction debug-users too.3225SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;32263227/// Keep track of the new value so that we can undo it by replacing3228/// instances of the new value with the original value.3229Value *New;32303231using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;32323233public:3234/// Replace all the use of \p Inst by \p New.3235UsesReplacer(Instruction *Inst, Value *New)3236: TypePromotionAction(Inst), New(New) {3237LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New3238<< "\n");3239// Record the original uses.3240for (Use &U : Inst->uses()) {3241Instruction *UserI = cast<Instruction>(U.getUser());3242OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));3243}3244// Record the debug uses separately. 
They are not in the instruction's3245// use list, but they are replaced by RAUW.3246findDbgValues(DbgValues, Inst, &DbgVariableRecords);32473248// Now, we can replace the uses.3249Inst->replaceAllUsesWith(New);3250}32513252/// Reassign the original uses of Inst to Inst.3253void undo() override {3254LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");3255for (InstructionAndIdx &Use : OriginalUses)3256Use.Inst->setOperand(Use.Idx, Inst);3257// RAUW has replaced all original uses with references to the new value,3258// including the debug uses. Since we are undoing the replacements,3259// the original debug uses must also be reinstated to maintain the3260// correctness and utility of debug value instructions.3261for (auto *DVI : DbgValues)3262DVI->replaceVariableLocationOp(New, Inst);3263// Similar story with DbgVariableRecords, the non-instruction3264// representation of dbg.values.3265for (DbgVariableRecord *DVR : DbgVariableRecords)3266DVR->replaceVariableLocationOp(New, Inst);3267}3268};32693270/// Remove an instruction from the IR.3271class InstructionRemover : public TypePromotionAction {3272/// Original position of the instruction.3273InsertionHandler Inserter;32743275/// Helper structure to hide all the link to the instruction. In other3276/// words, this helps to do as if the instruction was removed.3277OperandsHider Hider;32783279/// Keep track of the uses replaced, if any.3280UsesReplacer *Replacer = nullptr;32813282/// Keep track of instructions removed.3283SetOfInstrs &RemovedInsts;32843285public:3286/// Remove all reference of \p Inst and optionally replace all its3287/// uses with New.3288/// \p RemovedInsts Keep track of the instructions removed by this Action.3289/// \pre If !Inst->use_empty(), then New != nullptr3290InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,3291Value *New = nullptr)3292: TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),3293RemovedInsts(RemovedInsts) {3294if (New)3295Replacer = new UsesReplacer(Inst, New);3296LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");3297RemovedInsts.insert(Inst);3298/// The instructions removed here will be freed after completing3299/// optimizeBlock() for all blocks as we need to keep track of the3300/// removed instructions during promotion.3301Inst->removeFromParent();3302}33033304~InstructionRemover() override { delete Replacer; }33053306InstructionRemover &operator=(const InstructionRemover &other) = delete;3307InstructionRemover(const InstructionRemover &other) = delete;33083309/// Resurrect the instruction and reassign it to the proper uses if3310/// new value was provided when build this action.3311void undo() override {3312LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");3313Inserter.insert(Inst);3314if (Replacer)3315Replacer->undo();3316Hider.undo();3317RemovedInsts.erase(Inst);3318}3319};33203321public:3322/// Restoration point.3323/// The restoration point is a pointer to an action instead of an iterator3324/// because the iterator may be invalidated but not the pointer.3325using ConstRestorationPt = const TypePromotionAction *;33263327TypePromotionTransaction(SetOfInstrs &RemovedInsts)3328: RemovedInsts(RemovedInsts) {}33293330/// Advocate every changes made in that transaction. 
Return true if any change3331/// happen.3332bool commit();33333334/// Undo all the changes made after the given point.3335void rollback(ConstRestorationPt Point);33363337/// Get the current restoration point.3338ConstRestorationPt getRestorationPoint() const;33393340/// \name API for IR modification with state keeping to support rollback.3341/// @{3342/// Same as Instruction::setOperand.3343void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);33443345/// Same as Instruction::eraseFromParent.3346void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);33473348/// Same as Value::replaceAllUsesWith.3349void replaceAllUsesWith(Instruction *Inst, Value *New);33503351/// Same as Value::mutateType.3352void mutateType(Instruction *Inst, Type *NewTy);33533354/// Same as IRBuilder::createTrunc.3355Value *createTrunc(Instruction *Opnd, Type *Ty);33563357/// Same as IRBuilder::createSExt.3358Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);33593360/// Same as IRBuilder::createZExt.3361Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);33623363private:3364/// The ordered list of actions made so far.3365SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;33663367using CommitPt =3368SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;33693370SetOfInstrs &RemovedInsts;3371};33723373} // end anonymous namespace33743375void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,3376Value *NewVal) {3377Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>(3378Inst, Idx, NewVal));3379}33803381void TypePromotionTransaction::eraseInstruction(Instruction *Inst,3382Value *NewVal) {3383Actions.push_back(3384std::make_unique<TypePromotionTransaction::InstructionRemover>(3385Inst, RemovedInsts, NewVal));3386}33873388void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,3389Value *New) {3390Actions.push_back(3391std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));3392}33933394void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {3395Actions.push_back(3396std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));3397}33983399Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, Type *Ty) {3400std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));3401Value *Val = Ptr->getBuiltValue();3402Actions.push_back(std::move(Ptr));3403return Val;3404}34053406Value *TypePromotionTransaction::createSExt(Instruction *Inst, Value *Opnd,3407Type *Ty) {3408std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));3409Value *Val = Ptr->getBuiltValue();3410Actions.push_back(std::move(Ptr));3411return Val;3412}34133414Value *TypePromotionTransaction::createZExt(Instruction *Inst, Value *Opnd,3415Type *Ty) {3416std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));3417Value *Val = Ptr->getBuiltValue();3418Actions.push_back(std::move(Ptr));3419return Val;3420}34213422TypePromotionTransaction::ConstRestorationPt3423TypePromotionTransaction::getRestorationPoint() const {3424return !Actions.empty() ? 
Actions.back().get() : nullptr;3425}34263427bool TypePromotionTransaction::commit() {3428for (std::unique_ptr<TypePromotionAction> &Action : Actions)3429Action->commit();3430bool Modified = !Actions.empty();3431Actions.clear();3432return Modified;3433}34343435void TypePromotionTransaction::rollback(3436TypePromotionTransaction::ConstRestorationPt Point) {3437while (!Actions.empty() && Point != Actions.back().get()) {3438std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();3439Curr->undo();3440}3441}34423443namespace {34443445/// A helper class for matching addressing modes.3446///3447/// This encapsulates the logic for matching the target-legal addressing modes.3448class AddressingModeMatcher {3449SmallVectorImpl<Instruction *> &AddrModeInsts;3450const TargetLowering &TLI;3451const TargetRegisterInfo &TRI;3452const DataLayout &DL;3453const LoopInfo &LI;3454const std::function<const DominatorTree &()> getDTFn;34553456/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and3457/// the memory instruction that we're computing this address for.3458Type *AccessTy;3459unsigned AddrSpace;3460Instruction *MemoryInst;34613462/// This is the addressing mode that we're building up. This is3463/// part of the return value of this addressing mode matching stuff.3464ExtAddrMode &AddrMode;34653466/// The instructions inserted by other CodeGenPrepare optimizations.3467const SetOfInstrs &InsertedInsts;34683469/// A map from the instructions to their type before promotion.3470InstrToOrigTy &PromotedInsts;34713472/// The ongoing transaction where every action should be registered.3473TypePromotionTransaction &TPT;34743475// A GEP which has too large offset to be folded into the addressing mode.3476std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;34773478/// This is set to true when we should not do profitability checks.3479/// When true, IsProfitableToFoldIntoAddressingMode always returns true.3480bool IgnoreProfitability;34813482/// True if we are optimizing for size.3483bool OptSize = false;34843485ProfileSummaryInfo *PSI;3486BlockFrequencyInfo *BFI;34873488AddressingModeMatcher(3489SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,3490const TargetRegisterInfo &TRI, const LoopInfo &LI,3491const std::function<const DominatorTree &()> getDTFn, Type *AT,3492unsigned AS, Instruction *MI, ExtAddrMode &AM,3493const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,3494TypePromotionTransaction &TPT,3495std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,3496bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)3497: AddrModeInsts(AMI), TLI(TLI), TRI(TRI),3498DL(MI->getDataLayout()), LI(LI), getDTFn(getDTFn),3499AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM),3500InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),3501LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {3502IgnoreProfitability = false;3503}35043505public:3506/// Find the maximal addressing mode that a load/store of V can fold,3507/// give an access type of AccessTy. 
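  /// For example (an illustrative sketch only; what actually folds depends on
  /// the target's legal addressing modes): for a load whose address is
  ///   %idx.ext = sext i32 %idx to i64
  ///   %addr    = getelementptr i32, ptr %base, i64 %idx.ext
  /// a target with reg+reg*scale+imm addressing might end up with an
  /// ExtAddrMode of roughly { BaseReg = %base, ScaledReg = %idx.ext,
  /// Scale = 4, BaseOffs = 0 }.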
This returns a list of involved3508/// instructions in AddrModeInsts.3509/// \p InsertedInsts The instructions inserted by other CodeGenPrepare3510/// optimizations.3511/// \p PromotedInsts maps the instructions to their type before promotion.3512/// \p The ongoing transaction where every action should be registered.3513static ExtAddrMode3514Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,3515SmallVectorImpl<Instruction *> &AddrModeInsts,3516const TargetLowering &TLI, const LoopInfo &LI,3517const std::function<const DominatorTree &()> getDTFn,3518const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts,3519InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,3520std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,3521bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {3522ExtAddrMode Result;35233524bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, LI, getDTFn,3525AccessTy, AS, MemoryInst, Result,3526InsertedInsts, PromotedInsts, TPT,3527LargeOffsetGEP, OptSize, PSI, BFI)3528.matchAddr(V, 0);3529(void)Success;3530assert(Success && "Couldn't select *anything*?");3531return Result;3532}35333534private:3535bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);3536bool matchAddr(Value *Addr, unsigned Depth);3537bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,3538bool *MovedAway = nullptr);3539bool isProfitableToFoldIntoAddressingMode(Instruction *I,3540ExtAddrMode &AMBefore,3541ExtAddrMode &AMAfter);3542bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);3543bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,3544Value *PromotedOperand) const;3545};35463547class PhiNodeSet;35483549/// An iterator for PhiNodeSet.3550class PhiNodeSetIterator {3551PhiNodeSet *const Set;3552size_t CurrentIndex = 0;35533554public:3555/// The constructor. Start should point to either a valid element, or be equal3556/// to the size of the underlying SmallVector of the PhiNodeSet.3557PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start);3558PHINode *operator*() const;3559PhiNodeSetIterator &operator++();3560bool operator==(const PhiNodeSetIterator &RHS) const;3561bool operator!=(const PhiNodeSetIterator &RHS) const;3562};35633564/// Keeps a set of PHINodes.3565///3566/// This is a minimal set implementation for a specific use case:3567/// It is very fast when there are very few elements, but also provides good3568/// performance when there are many. It is similar to SmallPtrSet, but also3569/// provides iteration by insertion order, which is deterministic and stable3570/// across runs. It is also similar to SmallSetVector, but provides removing3571/// elements in O(1) time. This is achieved by not actually removing the element3572/// from the underlying vector, so comes at the cost of using more memory, but3573/// that is fine, since PhiNodeSets are used as short lived objects.3574class PhiNodeSet {3575friend class PhiNodeSetIterator;35763577using MapType = SmallDenseMap<PHINode *, size_t, 32>;3578using iterator = PhiNodeSetIterator;35793580/// Keeps the elements in the order of their insertion in the underlying3581/// vector. To achieve constant time removal, it never deletes any element.3582SmallVector<PHINode *, 32> NodeList;35833584/// Keeps the elements in the underlying set implementation. 
This (and not the3585/// NodeList defined above) is the source of truth on whether an element3586/// is actually in the collection.3587MapType NodeMap;35883589/// Points to the first valid (not deleted) element when the set is not empty3590/// and the value is not zero. Equals to the size of the underlying vector3591/// when the set is empty. When the value is 0, as in the beginning, the3592/// first element may or may not be valid.3593size_t FirstValidElement = 0;35943595public:3596/// Inserts a new element to the collection.3597/// \returns true if the element is actually added, i.e. was not in the3598/// collection before the operation.3599bool insert(PHINode *Ptr) {3600if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {3601NodeList.push_back(Ptr);3602return true;3603}3604return false;3605}36063607/// Removes the element from the collection.3608/// \returns whether the element is actually removed, i.e. was in the3609/// collection before the operation.3610bool erase(PHINode *Ptr) {3611if (NodeMap.erase(Ptr)) {3612SkipRemovedElements(FirstValidElement);3613return true;3614}3615return false;3616}36173618/// Removes all elements and clears the collection.3619void clear() {3620NodeMap.clear();3621NodeList.clear();3622FirstValidElement = 0;3623}36243625/// \returns an iterator that will iterate the elements in the order of3626/// insertion.3627iterator begin() {3628if (FirstValidElement == 0)3629SkipRemovedElements(FirstValidElement);3630return PhiNodeSetIterator(this, FirstValidElement);3631}36323633/// \returns an iterator that points to the end of the collection.3634iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }36353636/// Returns the number of elements in the collection.3637size_t size() const { return NodeMap.size(); }36383639/// \returns 1 if the given element is in the collection, and 0 if otherwise.3640size_t count(PHINode *Ptr) const { return NodeMap.count(Ptr); }36413642private:3643/// Updates the CurrentIndex so that it will point to a valid element.3644///3645/// If the element of NodeList at CurrentIndex is valid, it does not3646/// change it. 
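  /// (Illustrative sketch of the lazy-removal scheme: after insert(A),
  /// insert(B), insert(C) and erase(B), NodeList is still [A, B, C] but
  /// NodeMap only has entries for A and C, so iteration skips index 1 because
  /// NodeMap no longer maps B back to that index.)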
If there are no more valid elements, it updates CurrentIndex3647/// to point to the end of the NodeList.3648void SkipRemovedElements(size_t &CurrentIndex) {3649while (CurrentIndex < NodeList.size()) {3650auto it = NodeMap.find(NodeList[CurrentIndex]);3651// If the element has been deleted and added again later, NodeMap will3652// point to a different index, so CurrentIndex will still be invalid.3653if (it != NodeMap.end() && it->second == CurrentIndex)3654break;3655++CurrentIndex;3656}3657}3658};36593660PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)3661: Set(Set), CurrentIndex(Start) {}36623663PHINode *PhiNodeSetIterator::operator*() const {3664assert(CurrentIndex < Set->NodeList.size() &&3665"PhiNodeSet access out of range");3666return Set->NodeList[CurrentIndex];3667}36683669PhiNodeSetIterator &PhiNodeSetIterator::operator++() {3670assert(CurrentIndex < Set->NodeList.size() &&3671"PhiNodeSet access out of range");3672++CurrentIndex;3673Set->SkipRemovedElements(CurrentIndex);3674return *this;3675}36763677bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {3678return CurrentIndex == RHS.CurrentIndex;3679}36803681bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {3682return !((*this) == RHS);3683}36843685/// Keep track of simplification of Phi nodes.3686/// Accept the set of all phi nodes and erase phi node from this set3687/// if it is simplified.3688class SimplificationTracker {3689DenseMap<Value *, Value *> Storage;3690const SimplifyQuery &SQ;3691// Tracks newly created Phi nodes. The elements are iterated by insertion3692// order.3693PhiNodeSet AllPhiNodes;3694// Tracks newly created Select nodes.3695SmallPtrSet<SelectInst *, 32> AllSelectNodes;36963697public:3698SimplificationTracker(const SimplifyQuery &sq) : SQ(sq) {}36993700Value *Get(Value *V) {3701do {3702auto SV = Storage.find(V);3703if (SV == Storage.end())3704return V;3705V = SV->second;3706} while (true);3707}37083709Value *Simplify(Value *Val) {3710SmallVector<Value *, 32> WorkList;3711SmallPtrSet<Value *, 32> Visited;3712WorkList.push_back(Val);3713while (!WorkList.empty()) {3714auto *P = WorkList.pop_back_val();3715if (!Visited.insert(P).second)3716continue;3717if (auto *PI = dyn_cast<Instruction>(P))3718if (Value *V = simplifyInstruction(cast<Instruction>(PI), SQ)) {3719for (auto *U : PI->users())3720WorkList.push_back(cast<Value>(U));3721Put(PI, V);3722PI->replaceAllUsesWith(V);3723if (auto *PHI = dyn_cast<PHINode>(PI))3724AllPhiNodes.erase(PHI);3725if (auto *Select = dyn_cast<SelectInst>(PI))3726AllSelectNodes.erase(Select);3727PI->eraseFromParent();3728}3729}3730return Get(Val);3731}37323733void Put(Value *From, Value *To) { Storage.insert({From, To}); }37343735void ReplacePhi(PHINode *From, PHINode *To) {3736Value *OldReplacement = Get(From);3737while (OldReplacement != From) {3738From = To;3739To = dyn_cast<PHINode>(OldReplacement);3740OldReplacement = Get(From);3741}3742assert(To && Get(To) == To && "Replacement PHI node is already replaced.");3743Put(From, To);3744From->replaceAllUsesWith(To);3745AllPhiNodes.erase(From);3746From->eraseFromParent();3747}37483749PhiNodeSet &newPhiNodes() { return AllPhiNodes; }37503751void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }37523753void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }37543755unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }37563757unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }37583759void destroyNewNodes(Type *CommonType) 
{3760// For safe erasing, replace the uses with dummy value first.3761auto *Dummy = PoisonValue::get(CommonType);3762for (auto *I : AllPhiNodes) {3763I->replaceAllUsesWith(Dummy);3764I->eraseFromParent();3765}3766AllPhiNodes.clear();3767for (auto *I : AllSelectNodes) {3768I->replaceAllUsesWith(Dummy);3769I->eraseFromParent();3770}3771AllSelectNodes.clear();3772}3773};37743775/// A helper class for combining addressing modes.3776class AddressingModeCombiner {3777typedef DenseMap<Value *, Value *> FoldAddrToValueMapping;3778typedef std::pair<PHINode *, PHINode *> PHIPair;37793780private:3781/// The addressing modes we've collected.3782SmallVector<ExtAddrMode, 16> AddrModes;37833784/// The field in which the AddrModes differ, when we have more than one.3785ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;37863787/// Are the AddrModes that we have all just equal to their original values?3788bool AllAddrModesTrivial = true;37893790/// Common Type for all different fields in addressing modes.3791Type *CommonType = nullptr;37923793/// SimplifyQuery for simplifyInstruction utility.3794const SimplifyQuery &SQ;37953796/// Original Address.3797Value *Original;37983799/// Common value among addresses3800Value *CommonValue = nullptr;38013802public:3803AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue)3804: SQ(_SQ), Original(OriginalValue) {}38053806~AddressingModeCombiner() { eraseCommonValueIfDead(); }38073808/// Get the combined AddrMode3809const ExtAddrMode &getAddrMode() const { return AddrModes[0]; }38103811/// Add a new AddrMode if it's compatible with the AddrModes we already3812/// have.3813/// \return True iff we succeeded in doing so.3814bool addNewAddrMode(ExtAddrMode &NewAddrMode) {3815// Take note of if we have any non-trivial AddrModes, as we need to detect3816// when all AddrModes are trivial as then we would introduce a phi or select3817// which just duplicates what's already there.3818AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();38193820// If this is the first addrmode then everything is fine.3821if (AddrModes.empty()) {3822AddrModes.emplace_back(NewAddrMode);3823return true;3824}38253826// Figure out how different this is from the other address modes, which we3827// can do just by comparing against the first one given that we only care3828// about the cumulative difference.3829ExtAddrMode::FieldName ThisDifferentField =3830AddrModes[0].compare(NewAddrMode);3831if (DifferentField == ExtAddrMode::NoField)3832DifferentField = ThisDifferentField;3833else if (DifferentField != ThisDifferentField)3834DifferentField = ExtAddrMode::MultipleFields;38353836// If NewAddrMode differs in more than one dimension we cannot handle it.3837bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;38383839// If Scale Field is different then we reject.3840CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;38413842// We also must reject the case when base offset is different and3843// scale reg is not null, we cannot handle this case due to merge of3844// different offsets will be used as ScaleReg.3845CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||3846!NewAddrMode.ScaledReg);38473848// We also must reject the case when GV is different and BaseReg installed3849// due to we want to use base reg as a merge of GV values.3850CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||3851!NewAddrMode.HasBaseReg);38523853// Even if NewAddMode is the same we still need to collect it due to3854// 
original value is different. And later we will need all original values3855// as anchors during finding the common Phi node.3856if (CanHandle)3857AddrModes.emplace_back(NewAddrMode);3858else3859AddrModes.clear();38603861return CanHandle;3862}38633864/// Combine the addressing modes we've collected into a single3865/// addressing mode.3866/// \return True iff we successfully combined them or we only had one so3867/// didn't need to combine them anyway.3868bool combineAddrModes() {3869// If we have no AddrModes then they can't be combined.3870if (AddrModes.size() == 0)3871return false;38723873// A single AddrMode can trivially be combined.3874if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)3875return true;38763877// If the AddrModes we collected are all just equal to the value they are3878// derived from then combining them wouldn't do anything useful.3879if (AllAddrModesTrivial)3880return false;38813882if (!addrModeCombiningAllowed())3883return false;38843885// Build a map between <original value, basic block where we saw it> to3886// value of base register.3887// Bail out if there is no common type.3888FoldAddrToValueMapping Map;3889if (!initializeMap(Map))3890return false;38913892CommonValue = findCommon(Map);3893if (CommonValue)3894AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);3895return CommonValue != nullptr;3896}38973898private:3899/// `CommonValue` may be a placeholder inserted by us.3900/// If the placeholder is not used, we should remove this dead instruction.3901void eraseCommonValueIfDead() {3902if (CommonValue && CommonValue->getNumUses() == 0)3903if (Instruction *CommonInst = dyn_cast<Instruction>(CommonValue))3904CommonInst->eraseFromParent();3905}39063907/// Initialize Map with anchor values. For address seen3908/// we set the value of different field saw in this address.3909/// At the same time we find a common type for different field we will3910/// use to create new Phi/Select nodes. Keep it in CommonType field.3911/// Return false if there is no common type found.3912bool initializeMap(FoldAddrToValueMapping &Map) {3913// Keep track of keys where the value is null. We will need to replace it3914// with constant null when we know the common type.3915SmallVector<Value *, 2> NullValue;3916Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());3917for (auto &AM : AddrModes) {3918Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);3919if (DV) {3920auto *Type = DV->getType();3921if (CommonType && CommonType != Type)3922return false;3923CommonType = Type;3924Map[AM.OriginalValue] = DV;3925} else {3926NullValue.push_back(AM.OriginalValue);3927}3928}3929assert(CommonType && "At least one non-null value must be!");3930for (auto *V : NullValue)3931Map[V] = Constant::getNullValue(CommonType);3932return true;3933}39343935/// We have mapping between value A and other value B where B was a field in3936/// addressing mode represented by A. Also we have an original value C3937/// representing an address we start with. Traversing from C through phi and3938/// selects we ended up with A's in a map. 
This utility function tries to find3939/// a value V which is a field in addressing mode C and traversing through phi3940/// nodes and selects we will end up in corresponded values B in a map.3941/// The utility will create a new Phi/Selects if needed.3942// The simple example looks as follows:3943// BB1:3944// p1 = b1 + 403945// br cond BB2, BB33946// BB2:3947// p2 = b2 + 403948// br BB33949// BB3:3950// p = phi [p1, BB1], [p2, BB2]3951// v = load p3952// Map is3953// p1 -> b13954// p2 -> b23955// Request is3956// p -> ?3957// The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.3958Value *findCommon(FoldAddrToValueMapping &Map) {3959// Tracks the simplification of newly created phi nodes. The reason we use3960// this mapping is because we will add new created Phi nodes in AddrToBase.3961// Simplification of Phi nodes is recursive, so some Phi node may3962// be simplified after we added it to AddrToBase. In reality this3963// simplification is possible only if original phi/selects were not3964// simplified yet.3965// Using this mapping we can find the current value in AddrToBase.3966SimplificationTracker ST(SQ);39673968// First step, DFS to create PHI nodes for all intermediate blocks.3969// Also fill traverse order for the second step.3970SmallVector<Value *, 32> TraverseOrder;3971InsertPlaceholders(Map, TraverseOrder, ST);39723973// Second Step, fill new nodes by merged values and simplify if possible.3974FillPlaceholders(Map, TraverseOrder, ST);39753976if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {3977ST.destroyNewNodes(CommonType);3978return nullptr;3979}39803981// Now we'd like to match New Phi nodes to existed ones.3982unsigned PhiNotMatchedCount = 0;3983if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {3984ST.destroyNewNodes(CommonType);3985return nullptr;3986}39873988auto *Result = ST.Get(Map.find(Original)->second);3989if (Result) {3990NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;3991NumMemoryInstsSelectCreated += ST.countNewSelectNodes();3992}3993return Result;3994}39953996/// Try to match PHI node to Candidate.3997/// Matcher tracks the matched Phi nodes.3998bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,3999SmallSetVector<PHIPair, 8> &Matcher,4000PhiNodeSet &PhiNodesToMatch) {4001SmallVector<PHIPair, 8> WorkList;4002Matcher.insert({PHI, Candidate});4003SmallSet<PHINode *, 8> MatchedPHIs;4004MatchedPHIs.insert(PHI);4005WorkList.push_back({PHI, Candidate});4006SmallSet<PHIPair, 8> Visited;4007while (!WorkList.empty()) {4008auto Item = WorkList.pop_back_val();4009if (!Visited.insert(Item).second)4010continue;4011// We iterate over all incoming values to Phi to compare them.4012// If values are different and both of them Phi and the first one is a4013// Phi we added (subject to match) and both of them is in the same basic4014// block then we can match our pair if values match. 
So we state that
      // these values match and add them to the work list to verify that.
      for (auto *B : Item.first->blocks()) {
        Value *FirstValue = Item.first->getIncomingValueForBlock(B);
        Value *SecondValue = Item.second->getIncomingValueForBlock(B);
        if (FirstValue == SecondValue)
          continue;

        PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
        PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);

        // If one of them is not a Phi, or the first one is not a Phi node from
        // the set we'd like to match, or the Phi nodes come from different
        // basic blocks, then we will not be able to match.
        if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
            FirstPhi->getParent() != SecondPhi->getParent())
          return false;

        // If we already matched them then continue.
        if (Matcher.count({FirstPhi, SecondPhi}))
          continue;
        // So the values are different and do not match. We need them to
        // match. (But we register no more than one match per PHI node, so that
        // we won't later try to replace them twice.)
        if (MatchedPHIs.insert(FirstPhi).second)
          Matcher.insert({FirstPhi, SecondPhi});
        // But we must still check it.
        WorkList.push_back({FirstPhi, SecondPhi});
      }
    }
    return true;
  }

  /// For the given set of PHI nodes (in the SimplificationTracker) try
  /// to find their equivalents.
  /// Returns false if this matching fails and creation of new Phis is
  /// disabled.
  bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
                   unsigned &PhiNotMatchedCount) {
    // Matched and PhiNodesToMatch iterate their elements in a deterministic
    // order, so the replacements (ReplacePhi) are also done in a deterministic
    // order.
    SmallSetVector<PHIPair, 8> Matched;
    SmallPtrSet<PHINode *, 8> WillNotMatch;
    PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
    while (PhiNodesToMatch.size()) {
      PHINode *PHI = *PhiNodesToMatch.begin();

      // Record this Phi: if no Phi node in its basic block matches it, it
      // will not match at all.
      WillNotMatch.clear();
      WillNotMatch.insert(PHI);

      // Traverse all Phis until we find an equivalent one or fail to do so.
      bool IsMatched = false;
      for (auto &P : PHI->getParent()->phis()) {
        // Skip new Phi nodes.
        if (PhiNodesToMatch.count(&P))
          continue;
        if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
          break;
        // If it does not match, collect all Phi nodes from the matcher: if we
        // end up with no match, then none of these Phi nodes will match later
        // either.
        for (auto M : Matched)
          WillNotMatch.insert(M.first);
        Matched.clear();
      }
      if (IsMatched) {
        // Replace all matched values and erase them.
        for (auto MV : Matched)
          ST.ReplacePhi(MV.first, MV.second);
        Matched.clear();
        continue;
      }
      // If we are not allowed to create new nodes then bail out.
      if (!AllowNewPhiNodes)
        return false;
      // Just remove all seen values in the matcher.
They will not match anything.4092PhiNotMatchedCount += WillNotMatch.size();4093for (auto *P : WillNotMatch)4094PhiNodesToMatch.erase(P);4095}4096return true;4097}4098/// Fill the placeholders with values from predecessors and simplify them.4099void FillPlaceholders(FoldAddrToValueMapping &Map,4100SmallVectorImpl<Value *> &TraverseOrder,4101SimplificationTracker &ST) {4102while (!TraverseOrder.empty()) {4103Value *Current = TraverseOrder.pop_back_val();4104assert(Map.contains(Current) && "No node to fill!!!");4105Value *V = Map[Current];41064107if (SelectInst *Select = dyn_cast<SelectInst>(V)) {4108// CurrentValue also must be Select.4109auto *CurrentSelect = cast<SelectInst>(Current);4110auto *TrueValue = CurrentSelect->getTrueValue();4111assert(Map.contains(TrueValue) && "No True Value!");4112Select->setTrueValue(ST.Get(Map[TrueValue]));4113auto *FalseValue = CurrentSelect->getFalseValue();4114assert(Map.contains(FalseValue) && "No False Value!");4115Select->setFalseValue(ST.Get(Map[FalseValue]));4116} else {4117// Must be a Phi node then.4118auto *PHI = cast<PHINode>(V);4119// Fill the Phi node with values from predecessors.4120for (auto *B : predecessors(PHI->getParent())) {4121Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);4122assert(Map.contains(PV) && "No predecessor Value!");4123PHI->addIncoming(ST.Get(Map[PV]), B);4124}4125}4126Map[Current] = ST.Simplify(V);4127}4128}41294130/// Starting from original value recursively iterates over def-use chain up to4131/// known ending values represented in a map. For each traversed phi/select4132/// inserts a placeholder Phi or Select.4133/// Reports all new created Phi/Select nodes by adding them to set.4134/// Also reports and order in what values have been traversed.4135void InsertPlaceholders(FoldAddrToValueMapping &Map,4136SmallVectorImpl<Value *> &TraverseOrder,4137SimplificationTracker &ST) {4138SmallVector<Value *, 32> Worklist;4139assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&4140"Address must be a Phi or Select node");4141auto *Dummy = PoisonValue::get(CommonType);4142Worklist.push_back(Original);4143while (!Worklist.empty()) {4144Value *Current = Worklist.pop_back_val();4145// if it is already visited or it is an ending value then skip it.4146if (Map.contains(Current))4147continue;4148TraverseOrder.push_back(Current);41494150// CurrentValue must be a Phi node or select. 
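      // (Illustrative sketch: for an address  p = phi [p1, BB1], [p2, BB2]
      // this loop creates an empty placeholder  sunk_phi = phi CommonType
      // here; FillPlaceholders later adds the incoming values from the map,
      // e.g. [b1, BB1], [b2, BB2]. Select placeholders start out with poison
      // operands until they are filled in the same way.)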
All others must be covered4151// by anchors.4152if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {4153// Is it OK to get metadata from OrigSelect?!4154// Create a Select placeholder with dummy value.4155SelectInst *Select =4156SelectInst::Create(CurrentSelect->getCondition(), Dummy, Dummy,4157CurrentSelect->getName(),4158CurrentSelect->getIterator(), CurrentSelect);4159Map[Current] = Select;4160ST.insertNewSelect(Select);4161// We are interested in True and False values.4162Worklist.push_back(CurrentSelect->getTrueValue());4163Worklist.push_back(CurrentSelect->getFalseValue());4164} else {4165// It must be a Phi node then.4166PHINode *CurrentPhi = cast<PHINode>(Current);4167unsigned PredCount = CurrentPhi->getNumIncomingValues();4168PHINode *PHI =4169PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi->getIterator());4170Map[Current] = PHI;4171ST.insertNewPhi(PHI);4172append_range(Worklist, CurrentPhi->incoming_values());4173}4174}4175}41764177bool addrModeCombiningAllowed() {4178if (DisableComplexAddrModes)4179return false;4180switch (DifferentField) {4181default:4182return false;4183case ExtAddrMode::BaseRegField:4184return AddrSinkCombineBaseReg;4185case ExtAddrMode::BaseGVField:4186return AddrSinkCombineBaseGV;4187case ExtAddrMode::BaseOffsField:4188return AddrSinkCombineBaseOffs;4189case ExtAddrMode::ScaledRegField:4190return AddrSinkCombineScaledReg;4191}4192}4193};4194} // end anonymous namespace41954196/// Try adding ScaleReg*Scale to the current addressing mode.4197/// Return true and update AddrMode if this addr mode is legal for the target,4198/// false if not.4199bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,4200unsigned Depth) {4201// If Scale is 1, then this is the same as adding ScaleReg to the addressing4202// mode. Just process that directly.4203if (Scale == 1)4204return matchAddr(ScaleReg, Depth);42054206// If the scale is 0, it takes nothing to add this.4207if (Scale == 0)4208return true;42094210// If we already have a scale of this value, we can add to it, otherwise, we4211// need an available scale field.4212if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)4213return false;42144215ExtAddrMode TestAddrMode = AddrMode;42164217// Add scale to turn X*4+X*3 -> X*7. This could also do things like4218// [A+B + A*7] -> [B+A*8].4219TestAddrMode.Scale += Scale;4220TestAddrMode.ScaledReg = ScaleReg;42214222// If the new address isn't legal, bail out.4223if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))4224return false;42254226// It was legal, so commit it.4227AddrMode = TestAddrMode;42284229// Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now4230// to see if ScaleReg is actually X+C. If so, we can turn this into adding4231// X*Scale + C*Scale to addr mode. 
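  // (Illustrative sketch: if ScaleReg is  %x = add i64 %a, 7  and the current
  // Scale is 4, the candidate mode below uses ScaledReg = %a, Scale = 4 and
  // folds 7 * 4 = 28 into BaseOffs, provided the target still considers the
  // resulting mode legal.)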
  // If we find an available IV increment, do not go any further: we can reuse
  // it and cannot eliminate it.
  ConstantInt *CI = nullptr;
  Value *AddLHS = nullptr;
  if (isa<Instruction>(ScaleReg) && // not a constant expr.
      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
      !isIVIncrement(ScaleReg, &LI) && CI->getValue().isSignedIntN(64)) {
    TestAddrMode.InBounds = false;
    TestAddrMode.ScaledReg = AddLHS;
    TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;

    // If this addressing mode is legal, commit it and remember that we folded
    // this instruction.
    if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
      AddrMode = TestAddrMode;
      return true;
    }
    // Restore status quo.
    TestAddrMode = AddrMode;
  }

  // If this is an add recurrence with a constant step, return the increment
  // instruction and the canonicalized step.
  auto GetConstantStep =
      [this](const Value *V) -> std::optional<std::pair<Instruction *, APInt>> {
    auto *PN = dyn_cast<PHINode>(V);
    if (!PN)
      return std::nullopt;
    auto IVInc = getIVIncrement(PN, &LI);
    if (!IVInc)
      return std::nullopt;
    // TODO: The result of the intrinsics above is two's complement. However,
    // when the IV increment is expressed as an add or sub, iv.next is
    // potentially a poison value. If it has nuw or nsw flags, we need to make
    // sure that these flags are inferrable at the point of the memory
    // instruction. Otherwise we would be replacing a well-defined two's
    // complement computation with poison. Currently, to avoid the potentially
    // complex analysis needed to prove this, we reject such cases.
    if (auto *OIVInc = dyn_cast<OverflowingBinaryOperator>(IVInc->first))
      if (OIVInc->hasNoSignedWrap() || OIVInc->hasNoUnsignedWrap())
        return std::nullopt;
    if (auto *ConstantStep = dyn_cast<ConstantInt>(IVInc->second))
      return std::make_pair(IVInc->first, ConstantStep->getValue());
    return std::nullopt;
  };

  // Try to account for the following special case:
  // 1. ScaleReg is an induction variable;
  // 2. We use it with a non-zero offset;
  // 3. The IV's increment is available at the point of the memory instruction.
  //
  // In this case, we may reuse the IV increment instead of the IV Phi to
  // achieve the following advantages:
  // 1. If the IV step matches the offset, we will not need the offset at all;
  // 2.
Even if they don't match, we will reduce the overlap of living IV4286// and IV increment, that will potentially lead to better register4287// assignment.4288if (AddrMode.BaseOffs) {4289if (auto IVStep = GetConstantStep(ScaleReg)) {4290Instruction *IVInc = IVStep->first;4291// The following assert is important to ensure a lack of infinite loops.4292// This transforms is (intentionally) the inverse of the one just above.4293// If they don't agree on the definition of an increment, we'd alternate4294// back and forth indefinitely.4295assert(isIVIncrement(IVInc, &LI) && "implied by GetConstantStep");4296APInt Step = IVStep->second;4297APInt Offset = Step * AddrMode.Scale;4298if (Offset.isSignedIntN(64)) {4299TestAddrMode.InBounds = false;4300TestAddrMode.ScaledReg = IVInc;4301TestAddrMode.BaseOffs -= Offset.getLimitedValue();4302// If this addressing mode is legal, commit it..4303// (Note that we defer the (expensive) domtree base legality check4304// to the very last possible point.)4305if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace) &&4306getDTFn().dominates(IVInc, MemoryInst)) {4307AddrModeInsts.push_back(cast<Instruction>(IVInc));4308AddrMode = TestAddrMode;4309return true;4310}4311// Restore status quo.4312TestAddrMode = AddrMode;4313}4314}4315}43164317// Otherwise, just return what we have.4318return true;4319}43204321/// This is a little filter, which returns true if an addressing computation4322/// involving I might be folded into a load/store accessing it.4323/// This doesn't need to be perfect, but needs to accept at least4324/// the set of instructions that MatchOperationAddr can.4325static bool MightBeFoldableInst(Instruction *I) {4326switch (I->getOpcode()) {4327case Instruction::BitCast:4328case Instruction::AddrSpaceCast:4329// Don't touch identity bitcasts.4330if (I->getType() == I->getOperand(0)->getType())4331return false;4332return I->getType()->isIntOrPtrTy();4333case Instruction::PtrToInt:4334// PtrToInt is always a noop, as we know that the int type is pointer sized.4335return true;4336case Instruction::IntToPtr:4337// We know the input is intptr_t, so this is foldable.4338return true;4339case Instruction::Add:4340return true;4341case Instruction::Mul:4342case Instruction::Shl:4343// Can only handle X*C and X << C.4344return isa<ConstantInt>(I->getOperand(1));4345case Instruction::GetElementPtr:4346return true;4347default:4348return false;4349}4350}43514352/// Check whether or not \p Val is a legal instruction for \p TLI.4353/// \note \p Val is assumed to be the product of some type promotion.4354/// Therefore if \p Val has an undefined state in \p TLI, this is assumed4355/// to be legal, as the non-promoted value would have had the same state.4356static bool isPromotedInstructionLegal(const TargetLowering &TLI,4357const DataLayout &DL, Value *Val) {4358Instruction *PromotedInst = dyn_cast<Instruction>(Val);4359if (!PromotedInst)4360return false;4361int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());4362// If the ISDOpcode is undefined, it was undefined before the promotion.4363if (!ISDOpcode)4364return true;4365// Otherwise, check if the promoted instruction is legal or not.4366return TLI.isOperationLegalOrCustom(4367ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));4368}43694370namespace {43714372/// Hepler class to perform type promotion.4373class TypePromotionHelper {4374/// Utility function to add a promoted instruction \p ExtOpnd to4375/// \p PromotedInsts and record the type of extension we have seen.4376static void 
addPromotedInst(InstrToOrigTy &PromotedInsts,4377Instruction *ExtOpnd, bool IsSExt) {4378ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;4379InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);4380if (It != PromotedInsts.end()) {4381// If the new extension is same as original, the information in4382// PromotedInsts[ExtOpnd] is still correct.4383if (It->second.getInt() == ExtTy)4384return;43854386// Now the new extension is different from old extension, we make4387// the type information invalid by setting extension type to4388// BothExtension.4389ExtTy = BothExtension;4390}4391PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);4392}43934394/// Utility function to query the original type of instruction \p Opnd4395/// with a matched extension type. If the extension doesn't match, we4396/// cannot use the information we had on the original type.4397/// BothExtension doesn't match any extension type.4398static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,4399Instruction *Opnd, bool IsSExt) {4400ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;4401InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);4402if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)4403return It->second.getPointer();4404return nullptr;4405}44064407/// Utility function to check whether or not a sign or zero extension4408/// of \p Inst with \p ConsideredExtType can be moved through \p Inst by4409/// either using the operands of \p Inst or promoting \p Inst.4410/// The type of the extension is defined by \p IsSExt.4411/// In other words, check if:4412/// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.4413/// #1 Promotion applies:4414/// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).4415/// #2 Operand reuses:4416/// ext opnd1 to ConsideredExtType.4417/// \p PromotedInsts maps the instructions to their type before promotion.4418static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,4419const InstrToOrigTy &PromotedInsts, bool IsSExt);44204421/// Utility function to determine if \p OpIdx should be promoted when4422/// promoting \p Inst.4423static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {4424return !(isa<SelectInst>(Inst) && OpIdx == 0);4425}44264427/// Utility function to promote the operand of \p Ext when this4428/// operand is a promotable trunc or sext or zext.4429/// \p PromotedInsts maps the instructions to their type before promotion.4430/// \p CreatedInstsCost[out] contains the cost of all instructions4431/// created to promote the operand of Ext.4432/// Newly added extensions are inserted in \p Exts.4433/// Newly added truncates are inserted in \p Truncs.4434/// Should never be called directly.4435/// \return The promoted value which is used instead of Ext.4436static Value *promoteOperandForTruncAndAnyExt(4437Instruction *Ext, TypePromotionTransaction &TPT,4438InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,4439SmallVectorImpl<Instruction *> *Exts,4440SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);44414442/// Utility function to promote the operand of \p Ext when this4443/// operand is promotable and is not a supported trunc or sext.4444/// \p PromotedInsts maps the instructions to their type before promotion.4445/// \p CreatedInstsCost[out] contains the cost of all the instructions4446/// created to promote the operand of Ext.4447/// Newly added extensions are inserted in \p Exts.4448/// Newly added truncates are inserted in \p Truncs.4449/// Should never be called 
directly.4450/// \return The promoted value which is used instead of Ext.4451static Value *promoteOperandForOther(Instruction *Ext,4452TypePromotionTransaction &TPT,4453InstrToOrigTy &PromotedInsts,4454unsigned &CreatedInstsCost,4455SmallVectorImpl<Instruction *> *Exts,4456SmallVectorImpl<Instruction *> *Truncs,4457const TargetLowering &TLI, bool IsSExt);44584459/// \see promoteOperandForOther.4460static Value *signExtendOperandForOther(4461Instruction *Ext, TypePromotionTransaction &TPT,4462InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,4463SmallVectorImpl<Instruction *> *Exts,4464SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {4465return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,4466Exts, Truncs, TLI, true);4467}44684469/// \see promoteOperandForOther.4470static Value *zeroExtendOperandForOther(4471Instruction *Ext, TypePromotionTransaction &TPT,4472InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,4473SmallVectorImpl<Instruction *> *Exts,4474SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {4475return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,4476Exts, Truncs, TLI, false);4477}44784479public:4480/// Type for the utility function that promotes the operand of Ext.4481using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,4482InstrToOrigTy &PromotedInsts,4483unsigned &CreatedInstsCost,4484SmallVectorImpl<Instruction *> *Exts,4485SmallVectorImpl<Instruction *> *Truncs,4486const TargetLowering &TLI);44874488/// Given a sign/zero extend instruction \p Ext, return the appropriate4489/// action to promote the operand of \p Ext instead of using Ext.4490/// \return NULL if no promotable action is possible with the current4491/// sign extension.4492/// \p InsertedInsts keeps track of all the instructions inserted by the4493/// other CodeGenPrepare optimizations. This information is important4494/// because we do not want to promote these instructions as CodeGenPrepare4495/// will reinsert them later. Thus creating an infinite loop: create/remove.4496/// \p PromotedInsts maps the instructions to their type before promotion.4497static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,4498const TargetLowering &TLI,4499const InstrToOrigTy &PromotedInsts);4500};45014502} // end anonymous namespace45034504bool TypePromotionHelper::canGetThrough(const Instruction *Inst,4505Type *ConsideredExtType,4506const InstrToOrigTy &PromotedInsts,4507bool IsSExt) {4508// The promotion helper does not know how to deal with vector types yet.4509// To be able to fix that, we would need to fix the places where we4510// statically extend, e.g., constants and such.4511if (Inst->getType()->isVectorTy())4512return false;45134514// We can always get through zext.4515if (isa<ZExtInst>(Inst))4516return true;45174518// sext(sext) is ok too.4519if (IsSExt && isa<SExtInst>(Inst))4520return true;45214522// We can get through binary operator, if it is legal. 
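  // (Illustrative sketch: a sign extension can be moved through
  //   %a = add nsw i8 %x, %y ; %s = sext i8 %a to i32
  // by rewriting it as  add nsw i32 (sext %x), (sext %y); without the
  // nsw/nuw flag the high bits of the two forms would not be guaranteed to
  // agree.)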
In other words, the4523// binary operator must have a nuw or nsw flag.4524if (const auto *BinOp = dyn_cast<BinaryOperator>(Inst))4525if (isa<OverflowingBinaryOperator>(BinOp) &&4526((!IsSExt && BinOp->hasNoUnsignedWrap()) ||4527(IsSExt && BinOp->hasNoSignedWrap())))4528return true;45294530// ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))4531if ((Inst->getOpcode() == Instruction::And ||4532Inst->getOpcode() == Instruction::Or))4533return true;45344535// ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))4536if (Inst->getOpcode() == Instruction::Xor) {4537// Make sure it is not a NOT.4538if (const auto *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1)))4539if (!Cst->getValue().isAllOnes())4540return true;4541}45424543// zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst))4544// It may change a poisoned value into a regular value, like4545// zext i32 (shrl i8 %val, 12) --> shrl i32 (zext i8 %val), 124546// poisoned value regular value4547// It should be OK since undef covers valid value.4548if (Inst->getOpcode() == Instruction::LShr && !IsSExt)4549return true;45504551// and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)4552// It may change a poisoned value into a regular value, like4553// zext i32 (shl i8 %val, 12) --> shl i32 (zext i8 %val), 124554// poisoned value regular value4555// It should be OK since undef covers valid value.4556if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {4557const auto *ExtInst = cast<const Instruction>(*Inst->user_begin());4558if (ExtInst->hasOneUse()) {4559const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin());4560if (AndInst && AndInst->getOpcode() == Instruction::And) {4561const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));4562if (Cst &&4563Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))4564return true;4565}4566}4567}45684569// Check if we can do the following simplification.4570// ext(trunc(opnd)) --> ext(opnd)4571if (!isa<TruncInst>(Inst))4572return false;45734574Value *OpndVal = Inst->getOperand(0);4575// Check if we can use this operand in the extension.4576// If the type is larger than the result type of the extension, we cannot.4577if (!OpndVal->getType()->isIntegerTy() ||4578OpndVal->getType()->getIntegerBitWidth() >4579ConsideredExtType->getIntegerBitWidth())4580return false;45814582// If the operand of the truncate is not an instruction, we will not have4583// any information on the dropped bits.4584// (Actually we could for constant but it is not worth the extra logic).4585Instruction *Opnd = dyn_cast<Instruction>(OpndVal);4586if (!Opnd)4587return false;45884589// Check if the source of the type is narrow enough.4590// I.e., check that trunc just drops extended bits of the same kind of4591// the extension.4592// #1 get the type of the operand and check the kind of the extended bits.4593const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);4594if (OpndType)4595;4596else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))4597OpndType = Opnd->getOperand(0)->getType();4598else4599return false;46004601// #2 check that the truncate just drops extended bits.4602return Inst->getType()->getIntegerBitWidth() >=4603OpndType->getIntegerBitWidth();4604}46054606TypePromotionHelper::Action TypePromotionHelper::getAction(4607Instruction *Ext, const SetOfInstrs &InsertedInsts,4608const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {4609assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&4610"Unexpected instruction type");4611Instruction *ExtOpnd 
= dyn_cast<Instruction>(Ext->getOperand(0));4612Type *ExtTy = Ext->getType();4613bool IsSExt = isa<SExtInst>(Ext);4614// If the operand of the extension is not an instruction, we cannot4615// get through.4616// If it, check we can get through.4617if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))4618return nullptr;46194620// Do not promote if the operand has been added by codegenprepare.4621// Otherwise, it means we are undoing an optimization that is likely to be4622// redone, thus causing potential infinite loop.4623if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))4624return nullptr;46254626// SExt or Trunc instructions.4627// Return the related handler.4628if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||4629isa<ZExtInst>(ExtOpnd))4630return promoteOperandForTruncAndAnyExt;46314632// Regular instruction.4633// Abort early if we will have to insert non-free instructions.4634if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))4635return nullptr;4636return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;4637}46384639Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(4640Instruction *SExt, TypePromotionTransaction &TPT,4641InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,4642SmallVectorImpl<Instruction *> *Exts,4643SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {4644// By construction, the operand of SExt is an instruction. Otherwise we cannot4645// get through it and this method should not be called.4646Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));4647Value *ExtVal = SExt;4648bool HasMergedNonFreeExt = false;4649if (isa<ZExtInst>(SExtOpnd)) {4650// Replace s|zext(zext(opnd))4651// => zext(opnd).4652HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);4653Value *ZExt =4654TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());4655TPT.replaceAllUsesWith(SExt, ZExt);4656TPT.eraseInstruction(SExt);4657ExtVal = ZExt;4658} else {4659// Replace z|sext(trunc(opnd)) or sext(sext(opnd))4660// => z|sext(opnd).4661TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));4662}4663CreatedInstsCost = 0;46644665// Remove dead code.4666if (SExtOpnd->use_empty())4667TPT.eraseInstruction(SExtOpnd);46684669// Check if the extension is still needed.4670Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);4671if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {4672if (ExtInst) {4673if (Exts)4674Exts->push_back(ExtInst);4675CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;4676}4677return ExtVal;4678}46794680// At this point we have: ext ty opnd to ty.4681// Reassign the uses of ExtInst to the opnd and remove ExtInst.4682Value *NextVal = ExtInst->getOperand(0);4683TPT.eraseInstruction(ExtInst, NextVal);4684return NextVal;4685}46864687Value *TypePromotionHelper::promoteOperandForOther(4688Instruction *Ext, TypePromotionTransaction &TPT,4689InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,4690SmallVectorImpl<Instruction *> *Exts,4691SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,4692bool IsSExt) {4693// By construction, the operand of Ext is an instruction. 
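  // (Illustrative sketch of the overall rewrite performed below, for IsSExt:
  //    %o = or i8 %a, %b        ; ExtOpnd, also has users other than %e
  //    %e = sext i8 %o to i32   ; Ext
  //  becomes roughly
  //    %a32 = sext i8 %a to i32
  //    %b32 = sext i8 %b to i32
  //    %o   = or i32 %a32, %b32  ; ExtOpnd mutated to i32
  //    %t   = trunc i32 %o to i8 ; feeds the other users of the old %o
  //  with %e erased and its users now using the widened %o directly.)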
Otherwise we cannot4694// get through it and this method should not be called.4695Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));4696CreatedInstsCost = 0;4697if (!ExtOpnd->hasOneUse()) {4698// ExtOpnd will be promoted.4699// All its uses, but Ext, will need to use a truncated value of the4700// promoted version.4701// Create the truncate now.4702Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());4703if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {4704// Insert it just after the definition.4705ITrunc->moveAfter(ExtOpnd);4706if (Truncs)4707Truncs->push_back(ITrunc);4708}47094710TPT.replaceAllUsesWith(ExtOpnd, Trunc);4711// Restore the operand of Ext (which has been replaced by the previous call4712// to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.4713TPT.setOperand(Ext, 0, ExtOpnd);4714}47154716// Get through the Instruction:4717// 1. Update its type.4718// 2. Replace the uses of Ext by Inst.4719// 3. Extend each operand that needs to be extended.47204721// Remember the original type of the instruction before promotion.4722// This is useful to know that the high bits are sign extended bits.4723addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);4724// Step #1.4725TPT.mutateType(ExtOpnd, Ext->getType());4726// Step #2.4727TPT.replaceAllUsesWith(Ext, ExtOpnd);4728// Step #3.4729LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");4730for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;4731++OpIdx) {4732LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');4733if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||4734!shouldExtOperand(ExtOpnd, OpIdx)) {4735LLVM_DEBUG(dbgs() << "No need to propagate\n");4736continue;4737}4738// Check if we can statically extend the operand.4739Value *Opnd = ExtOpnd->getOperand(OpIdx);4740if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {4741LLVM_DEBUG(dbgs() << "Statically extend\n");4742unsigned BitWidth = Ext->getType()->getIntegerBitWidth();4743APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)4744: Cst->getValue().zext(BitWidth);4745TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));4746continue;4747}4748// UndefValue are typed, so we have to statically sign extend them.4749if (isa<UndefValue>(Opnd)) {4750LLVM_DEBUG(dbgs() << "Statically extend\n");4751TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));4752continue;4753}47544755// Otherwise we have to explicitly sign extend the operand.4756Value *ValForExtOpnd = IsSExt4757? 
TPT.createSExt(ExtOpnd, Opnd, Ext->getType())4758: TPT.createZExt(ExtOpnd, Opnd, Ext->getType());4759TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);4760Instruction *InstForExtOpnd = dyn_cast<Instruction>(ValForExtOpnd);4761if (!InstForExtOpnd)4762continue;47634764if (Exts)4765Exts->push_back(InstForExtOpnd);47664767CreatedInstsCost += !TLI.isExtFree(InstForExtOpnd);4768}4769LLVM_DEBUG(dbgs() << "Extension is useless now\n");4770TPT.eraseInstruction(Ext);4771return ExtOpnd;4772}47734774/// Check whether or not promoting an instruction to a wider type is profitable.4775/// \p NewCost gives the cost of extension instructions created by the4776/// promotion.4777/// \p OldCost gives the cost of extension instructions before the promotion4778/// plus the number of instructions that have been4779/// matched in the addressing mode the promotion.4780/// \p PromotedOperand is the value that has been promoted.4781/// \return True if the promotion is profitable, false otherwise.4782bool AddressingModeMatcher::isPromotionProfitable(4783unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {4784LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost4785<< '\n');4786// The cost of the new extensions is greater than the cost of the4787// old extension plus what we folded.4788// This is not profitable.4789if (NewCost > OldCost)4790return false;4791if (NewCost < OldCost)4792return true;4793// The promotion is neutral but it may help folding the sign extension in4794// loads for instance.4795// Check that we did not create an illegal instruction.4796return isPromotedInstructionLegal(TLI, DL, PromotedOperand);4797}47984799/// Given an instruction or constant expr, see if we can fold the operation4800/// into the addressing mode. If so, update the addressing mode and return4801/// true, otherwise return false without modifying AddrMode.4802/// If \p MovedAway is not NULL, it contains the information of whether or4803/// not AddrInst has to be folded into the addressing mode on success.4804/// If \p MovedAway == true, \p AddrInst will not be part of the addressing4805/// because it has been moved away.4806/// Thus AddrInst must not be added in the matched instructions.4807/// This state can happen when AddrInst is a sext, since it may be moved away.4808/// Therefore, AddrInst may not be valid when MovedAway is true and it must4809/// not be referenced anymore.4810bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,4811unsigned Depth,4812bool *MovedAway) {4813// Avoid exponential behavior on extremely deep expression trees.4814if (Depth >= 5)4815return false;48164817// By default, all matched instructions stay in place.4818if (MovedAway)4819*MovedAway = false;48204821switch (Opcode) {4822case Instruction::PtrToInt:4823// PtrToInt is always a noop, as we know that the int type is pointer sized.4824return matchAddr(AddrInst->getOperand(0), Depth);4825case Instruction::IntToPtr: {4826auto AS = AddrInst->getType()->getPointerAddressSpace();4827auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));4828// This inttoptr is a no-op if the integer type is pointer sized.4829if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)4830return matchAddr(AddrInst->getOperand(0), Depth);4831return false;4832}4833case Instruction::BitCast:4834// BitCast is always a noop, and we can handle it as long as it is4835// int->int or pointer->pointer (we don't want int<->fp or something).4836if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&4837// Don't 
touch identity bitcasts. These were probably put here by LSR,4838// and we don't want to mess around with them. Assume it knows what it4839// is doing.4840AddrInst->getOperand(0)->getType() != AddrInst->getType())4841return matchAddr(AddrInst->getOperand(0), Depth);4842return false;4843case Instruction::AddrSpaceCast: {4844unsigned SrcAS =4845AddrInst->getOperand(0)->getType()->getPointerAddressSpace();4846unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();4847if (TLI.getTargetMachine().isNoopAddrSpaceCast(SrcAS, DestAS))4848return matchAddr(AddrInst->getOperand(0), Depth);4849return false;4850}4851case Instruction::Add: {4852// Check to see if we can merge in one operand, then the other. If so, we4853// win.4854ExtAddrMode BackupAddrMode = AddrMode;4855unsigned OldSize = AddrModeInsts.size();4856// Start a transaction at this point.4857// The LHS may match but not the RHS.4858// Therefore, we need a higher level restoration point to undo partially4859// matched operation.4860TypePromotionTransaction::ConstRestorationPt LastKnownGood =4861TPT.getRestorationPoint();48624863// Try to match an integer constant second to increase its chance of ending4864// up in `BaseOffs`, resp. decrease its chance of ending up in `BaseReg`.4865int First = 0, Second = 1;4866if (isa<ConstantInt>(AddrInst->getOperand(First))4867&& !isa<ConstantInt>(AddrInst->getOperand(Second)))4868std::swap(First, Second);4869AddrMode.InBounds = false;4870if (matchAddr(AddrInst->getOperand(First), Depth + 1) &&4871matchAddr(AddrInst->getOperand(Second), Depth + 1))4872return true;48734874// Restore the old addr mode info.4875AddrMode = BackupAddrMode;4876AddrModeInsts.resize(OldSize);4877TPT.rollback(LastKnownGood);48784879// Otherwise this was over-aggressive. Try merging operands in the opposite4880// order.4881if (matchAddr(AddrInst->getOperand(Second), Depth + 1) &&4882matchAddr(AddrInst->getOperand(First), Depth + 1))4883return true;48844885// Otherwise we definitely can't merge the ADD in.4886AddrMode = BackupAddrMode;4887AddrModeInsts.resize(OldSize);4888TPT.rollback(LastKnownGood);4889break;4890}4891// case Instruction::Or:4892// TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.4893// break;4894case Instruction::Mul:4895case Instruction::Shl: {4896// Can only handle X*C and X << C.4897AddrMode.InBounds = false;4898ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));4899if (!RHS || RHS->getBitWidth() > 64)4900return false;4901int64_t Scale = Opcode == Instruction::Shl4902? 1LL << RHS->getLimitedValue(RHS->getBitWidth() - 1)4903: RHS->getSExtValue();49044905return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);4906}4907case Instruction::GetElementPtr: {4908// Scan the GEP. 
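  // Illustrative sketch (the IR below and the register roles are approximate,
  // not taken from a specific test): an address built from a shifted index,
  //
  //   %p.int = ptrtoint ptr %p to i64
  //   %off   = shl i64 %idx, 3              ; handled here: Scale = 1 << 3 = 8
  //   %sum   = add i64 %p.int, %off
  //   %addr  = inttoptr i64 %sum to ptr
  //   %val   = load i64, ptr %addr
  //
  // can be matched as roughly { BaseReg = %p, ScaledReg = %idx, Scale = 8 } on
  // targets whose isLegalAddressingMode accepts a [base + 8*index] form; the
  // ptrtoint/inttoptr pair is looked through by the cases above.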
We check it if it contains constant offsets and at most4909// one variable offset.4910int VariableOperand = -1;4911unsigned VariableScale = 0;49124913int64_t ConstantOffset = 0;4914gep_type_iterator GTI = gep_type_begin(AddrInst);4915for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {4916if (StructType *STy = GTI.getStructTypeOrNull()) {4917const StructLayout *SL = DL.getStructLayout(STy);4918unsigned Idx =4919cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();4920ConstantOffset += SL->getElementOffset(Idx);4921} else {4922TypeSize TS = GTI.getSequentialElementStride(DL);4923if (TS.isNonZero()) {4924// The optimisations below currently only work for fixed offsets.4925if (TS.isScalable())4926return false;4927int64_t TypeSize = TS.getFixedValue();4928if (ConstantInt *CI =4929dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {4930const APInt &CVal = CI->getValue();4931if (CVal.getSignificantBits() <= 64) {4932ConstantOffset += CVal.getSExtValue() * TypeSize;4933continue;4934}4935}4936// We only allow one variable index at the moment.4937if (VariableOperand != -1)4938return false;49394940// Remember the variable index.4941VariableOperand = i;4942VariableScale = TypeSize;4943}4944}4945}49464947// A common case is for the GEP to only do a constant offset. In this case,4948// just add it to the disp field and check validity.4949if (VariableOperand == -1) {4950AddrMode.BaseOffs += ConstantOffset;4951if (matchAddr(AddrInst->getOperand(0), Depth + 1)) {4952if (!cast<GEPOperator>(AddrInst)->isInBounds())4953AddrMode.InBounds = false;4954return true;4955}4956AddrMode.BaseOffs -= ConstantOffset;49574958if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&4959TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&4960ConstantOffset > 0) {4961// Record GEPs with non-zero offsets as candidates for splitting in4962// the event that the offset cannot fit into the r+i addressing mode.4963// Simple and common case that only one GEP is used in calculating the4964// address for the memory access.4965Value *Base = AddrInst->getOperand(0);4966auto *BaseI = dyn_cast<Instruction>(Base);4967auto *GEP = cast<GetElementPtrInst>(AddrInst);4968if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||4969(BaseI && !isa<CastInst>(BaseI) &&4970!isa<GetElementPtrInst>(BaseI))) {4971// Make sure the parent block allows inserting non-PHI instructions4972// before the terminator.4973BasicBlock *Parent = BaseI ? 
BaseI->getParent()4974: &GEP->getFunction()->getEntryBlock();4975if (!Parent->getTerminator()->isEHPad())4976LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);4977}4978}49794980return false;4981}49824983// Save the valid addressing mode in case we can't match.4984ExtAddrMode BackupAddrMode = AddrMode;4985unsigned OldSize = AddrModeInsts.size();49864987// See if the scale and offset amount is valid for this target.4988AddrMode.BaseOffs += ConstantOffset;4989if (!cast<GEPOperator>(AddrInst)->isInBounds())4990AddrMode.InBounds = false;49914992// Match the base operand of the GEP.4993if (!matchAddr(AddrInst->getOperand(0), Depth + 1)) {4994// If it couldn't be matched, just stuff the value in a register.4995if (AddrMode.HasBaseReg) {4996AddrMode = BackupAddrMode;4997AddrModeInsts.resize(OldSize);4998return false;4999}5000AddrMode.HasBaseReg = true;5001AddrMode.BaseReg = AddrInst->getOperand(0);5002}50035004// Match the remaining variable portion of the GEP.5005if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,5006Depth)) {5007// If it couldn't be matched, try stuffing the base into a register5008// instead of matching it, and retrying the match of the scale.5009AddrMode = BackupAddrMode;5010AddrModeInsts.resize(OldSize);5011if (AddrMode.HasBaseReg)5012return false;5013AddrMode.HasBaseReg = true;5014AddrMode.BaseReg = AddrInst->getOperand(0);5015AddrMode.BaseOffs += ConstantOffset;5016if (!matchScaledValue(AddrInst->getOperand(VariableOperand),5017VariableScale, Depth)) {5018// If even that didn't work, bail.5019AddrMode = BackupAddrMode;5020AddrModeInsts.resize(OldSize);5021return false;5022}5023}50245025return true;5026}5027case Instruction::SExt:5028case Instruction::ZExt: {5029Instruction *Ext = dyn_cast<Instruction>(AddrInst);5030if (!Ext)5031return false;50325033// Try to move this ext out of the way of the addressing mode.5034// Ask for a method for doing so.5035TypePromotionHelper::Action TPH =5036TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);5037if (!TPH)5038return false;50395040TypePromotionTransaction::ConstRestorationPt LastKnownGood =5041TPT.getRestorationPoint();5042unsigned CreatedInstsCost = 0;5043unsigned ExtCost = !TLI.isExtFree(Ext);5044Value *PromotedOperand =5045TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);5046// SExt has been moved away.5047// Thus either it will be rematched later in the recursive calls or it is5048// gone. 
Anyway, we must not fold it into the addressing mode at this point.5049// E.g.,5050// op = add opnd, 15051// idx = ext op5052// addr = gep base, idx5053// is now:5054// promotedOpnd = ext opnd <- no match here5055// op = promoted_add promotedOpnd, 1 <- match (later in recursive calls)5056// addr = gep base, op <- match5057if (MovedAway)5058*MovedAway = true;50595060assert(PromotedOperand &&5061"TypePromotionHelper should have filtered out those cases");50625063ExtAddrMode BackupAddrMode = AddrMode;5064unsigned OldSize = AddrModeInsts.size();50655066if (!matchAddr(PromotedOperand, Depth) ||5067// The total of the new cost is equal to the cost of the created5068// instructions.5069// The total of the old cost is equal to the cost of the extension plus5070// what we have saved in the addressing mode.5071!isPromotionProfitable(CreatedInstsCost,5072ExtCost + (AddrModeInsts.size() - OldSize),5073PromotedOperand)) {5074AddrMode = BackupAddrMode;5075AddrModeInsts.resize(OldSize);5076LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");5077TPT.rollback(LastKnownGood);5078return false;5079}5080return true;5081}5082case Instruction::Call:5083if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {5084if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {5085GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));5086if (TLI.addressingModeSupportsTLS(GV))5087return matchAddr(AddrInst->getOperand(0), Depth);5088}5089}5090break;5091}5092return false;5093}50945095/// If we can, try to add the value of 'Addr' into the current addressing mode.5096/// If Addr can't be added to AddrMode this returns false and leaves AddrMode5097/// unmodified. This assumes that Addr is either a pointer type or intptr_t5098/// for the target.5099///5100bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {5101// Start a transaction at this point that we will rollback if the matching5102// fails.5103TypePromotionTransaction::ConstRestorationPt LastKnownGood =5104TPT.getRestorationPoint();5105if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {5106if (CI->getValue().isSignedIntN(64)) {5107// Fold in immediates if legal for the target.5108AddrMode.BaseOffs += CI->getSExtValue();5109if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))5110return true;5111AddrMode.BaseOffs -= CI->getSExtValue();5112}5113} else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {5114// If this is a global variable, try to fold it into the addressing mode.5115if (!AddrMode.BaseGV) {5116AddrMode.BaseGV = GV;5117if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))5118return true;5119AddrMode.BaseGV = nullptr;5120}5121} else if (Instruction *I = dyn_cast<Instruction>(Addr)) {5122ExtAddrMode BackupAddrMode = AddrMode;5123unsigned OldSize = AddrModeInsts.size();51245125// Check to see if it is possible to fold this operation.5126bool MovedAway = false;5127if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {5128// This instruction may have been moved away. If so, there is nothing5129// to check here.5130if (MovedAway)5131return true;5132// Okay, it's possible to fold this. Check to see if it is actually5133// *profitable* to do so. 
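  // A minimal sketch of the probing done above (the constants are made up):
  // for
  //
  //   %v = load i32, ptr getelementptr (i8, ptr @g, i64 16)
  //
  // the constant-GEP path tentatively sets BaseOffs = 16 and then BaseGV = @g,
  // keeping each component only while TLI.isLegalAddressingMode still accepts
  // the combination; if the target rejects the [global + imm] form, the fields
  // are restored and the offset is left to be computed explicitly.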
We use a simple cost model to avoid increasing5134// register pressure too much.5135if (I->hasOneUse() ||5136isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {5137AddrModeInsts.push_back(I);5138return true;5139}51405141// It isn't profitable to do this, roll back.5142AddrMode = BackupAddrMode;5143AddrModeInsts.resize(OldSize);5144TPT.rollback(LastKnownGood);5145}5146} else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {5147if (matchOperationAddr(CE, CE->getOpcode(), Depth))5148return true;5149TPT.rollback(LastKnownGood);5150} else if (isa<ConstantPointerNull>(Addr)) {5151// Null pointer gets folded without affecting the addressing mode.5152return true;5153}51545155// Worse case, the target should support [reg] addressing modes. :)5156if (!AddrMode.HasBaseReg) {5157AddrMode.HasBaseReg = true;5158AddrMode.BaseReg = Addr;5159// Still check for legality in case the target supports [imm] but not [i+r].5160if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))5161return true;5162AddrMode.HasBaseReg = false;5163AddrMode.BaseReg = nullptr;5164}51655166// If the base register is already taken, see if we can do [r+r].5167if (AddrMode.Scale == 0) {5168AddrMode.Scale = 1;5169AddrMode.ScaledReg = Addr;5170if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))5171return true;5172AddrMode.Scale = 0;5173AddrMode.ScaledReg = nullptr;5174}5175// Couldn't match.5176TPT.rollback(LastKnownGood);5177return false;5178}51795180/// Check to see if all uses of OpVal by the specified inline asm call are due5181/// to memory operands. If so, return true, otherwise return false.5182static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,5183const TargetLowering &TLI,5184const TargetRegisterInfo &TRI) {5185const Function *F = CI->getFunction();5186TargetLowering::AsmOperandInfoVector TargetConstraints =5187TLI.ParseConstraints(F->getDataLayout(), &TRI, *CI);51885189for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {5190// Compute the constraint code and ConstraintType to use.5191TLI.ComputeConstraintToUse(OpInfo, SDValue());51925193// If this asm operand is our Value*, and if it isn't an indirect memory5194// operand, we can't fold it! TODO: Also handle C_Address?5195if (OpInfo.CallOperandVal == OpVal &&5196(OpInfo.ConstraintType != TargetLowering::C_Memory ||5197!OpInfo.isIndirect))5198return false;5199}52005201return true;5202}52035204/// Recursively walk all the uses of I until we find a memory use.5205/// If we find an obviously non-foldable instruction, return true.5206/// Add accessed addresses and types to MemoryUses.5207static bool FindAllMemoryUses(5208Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses,5209SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,5210const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,5211BlockFrequencyInfo *BFI, unsigned &SeenInsts) {5212// If we already considered this instruction, we're done.5213if (!ConsideredInsts.insert(I).second)5214return false;52155216// If this is an obviously unfoldable instruction, bail out.5217if (!MightBeFoldableInst(I))5218return true;52195220// Loop over all the uses, recursively processing them.5221for (Use &U : I->uses()) {5222// Conservatively return true if we're seeing a large number or a deep chain5223// of users. 
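  // Rough sketch of the two fallbacks above (values are illustrative): an
  // address %a that matched nothing else is first tried as a plain base,
  //
  //   AddrMode = { BaseReg = %a }                              // [reg]
  //
  // and, if a base register is already occupied (e.g. while matching the
  // second operand of an add), as an unscaled index instead,
  //
  //   AddrMode = { BaseReg = %x, ScaledReg = %a, Scale = 1 }   // [reg + reg]
  //
  // both still gated on TLI.isLegalAddressingMode for the access type.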
This avoids excessive compilation times in pathological cases.5224if (SeenInsts++ >= MaxAddressUsersToScan)5225return true;52265227Instruction *UserI = cast<Instruction>(U.getUser());5228if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {5229MemoryUses.push_back({&U, LI->getType()});5230continue;5231}52325233if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {5234if (U.getOperandNo() != StoreInst::getPointerOperandIndex())5235return true; // Storing addr, not into addr.5236MemoryUses.push_back({&U, SI->getValueOperand()->getType()});5237continue;5238}52395240if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {5241if (U.getOperandNo() != AtomicRMWInst::getPointerOperandIndex())5242return true; // Storing addr, not into addr.5243MemoryUses.push_back({&U, RMW->getValOperand()->getType()});5244continue;5245}52465247if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {5248if (U.getOperandNo() != AtomicCmpXchgInst::getPointerOperandIndex())5249return true; // Storing addr, not into addr.5250MemoryUses.push_back({&U, CmpX->getCompareOperand()->getType()});5251continue;5252}52535254if (CallInst *CI = dyn_cast<CallInst>(UserI)) {5255if (CI->hasFnAttr(Attribute::Cold)) {5256// If this is a cold call, we can sink the addressing calculation into5257// the cold path. See optimizeCallInst5258bool OptForSize =5259OptSize || llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);5260if (!OptForSize)5261continue;5262}52635264InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());5265if (!IA)5266return true;52675268// If this is a memory operand, we're cool, otherwise bail out.5269if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))5270return true;5271continue;5272}52735274if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,5275PSI, BFI, SeenInsts))5276return true;5277}52785279return false;5280}52815282static bool FindAllMemoryUses(5283Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses,5284const TargetLowering &TLI, const TargetRegisterInfo &TRI, bool OptSize,5285ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {5286unsigned SeenInsts = 0;5287SmallPtrSet<Instruction *, 16> ConsideredInsts;5288return FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,5289PSI, BFI, SeenInsts);5290}529152925293/// Return true if Val is already known to be live at the use site that we're5294/// folding it into. If so, there is no cost to include it in the addressing5295/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the5296/// instruction already.5297bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,5298Value *KnownLive1,5299Value *KnownLive2) {5300// If Val is either of the known-live values, we know it is live!5301if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)5302return true;53035304// All values other than instructions and arguments (e.g. constants) are live.5305if (!isa<Instruction>(Val) && !isa<Argument>(Val))5306return true;53075308// If Val is a constant sized alloca in the entry block, it is live, this is5309// true because it is just a reference to the stack/frame pointer, which is5310// live for the whole function.5311if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))5312if (AI->isStaticAlloca())5313return true;53145315// Check to see if this value is already used in the memory instruction's5316// block. 
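  // Small example of the classification FindAllMemoryUses performs above (the
  // IR is illustrative):
  //
  //   %a = getelementptr i8, ptr %p, i64 40
  //   %v = load i32, ptr %a            ; memory use: recorded in MemoryUses
  //   store i32 %v, ptr %a             ; used as the pointer operand: recorded
  //   store ptr %a, ptr %q             ; the address itself is stored: bail
  //
  // Non-cold calls and inline-asm operands that are not indirect memory
  // constraints likewise make the walk report a non-foldable use.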
If so, it's already live into the block at the very least, so we5317// can reasonably fold it.5318return Val->isUsedInBasicBlock(MemoryInst->getParent());5319}53205321/// It is possible for the addressing mode of the machine to fold the specified5322/// instruction into a load or store that ultimately uses it.5323/// However, the specified instruction has multiple uses.5324/// Given this, it may actually increase register pressure to fold it5325/// into the load. For example, consider this code:5326///5327/// X = ...5328/// Y = X+15329/// use(Y) -> nonload/store5330/// Z = Y+15331/// load Z5332///5333/// In this case, Y has multiple uses, and can be folded into the load of Z5334/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to5335/// be live at the use(Y) line. If we don't fold Y into load Z, we use one5336/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the5337/// number of computations either.5338///5339/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If5340/// X was live across 'load Z' for other reasons, we actually *would* want to5341/// fold the addressing mode in the Z case. This would make Y die earlier.5342bool AddressingModeMatcher::isProfitableToFoldIntoAddressingMode(5343Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) {5344if (IgnoreProfitability)5345return true;53465347// AMBefore is the addressing mode before this instruction was folded into it,5348// and AMAfter is the addressing mode after the instruction was folded. Get5349// the set of registers referenced by AMAfter and subtract out those5350// referenced by AMBefore: this is the set of values which folding in this5351// address extends the lifetime of.5352//5353// Note that there are only two potential values being referenced here,5354// BaseReg and ScaleReg (global addresses are always available, as are any5355// folded immediates).5356Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;53575358// If the BaseReg or ScaledReg was referenced by the previous addrmode, their5359// lifetime wasn't extended by adding this instruction.5360if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))5361BaseReg = nullptr;5362if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))5363ScaledReg = nullptr;53645365// If folding this instruction (and it's subexprs) didn't extend any live5366// ranges, we're ok with it.5367if (!BaseReg && !ScaledReg)5368return true;53695370// If all uses of this instruction can have the address mode sunk into them,5371// we can remove the addressing mode and effectively trade one live register5372// for another (at worst.) In this context, folding an addressing mode into5373// the use is just a particularly nice way of sinking it.5374SmallVector<std::pair<Use *, Type *>, 16> MemoryUses;5375if (FindAllMemoryUses(I, MemoryUses, TLI, TRI, OptSize, PSI, BFI))5376return false; // Has a non-memory, non-foldable use!53775378// Now that we know that all uses of this instruction are part of a chain of5379// computation involving only operations that could theoretically be folded5380// into a memory use, loop over each of these memory operation uses and see5381// if they could *actually* fold the instruction. The assumption is that5382// addressing modes are cheap and that duplicating the computation involved5383// many times is worthwhile, even on a fastpath. For sinking candidates5384// (i.e. 
cold call sites), this serves as a way to prevent excessive code5385// growth since most architectures have some reasonable small and fast way to5386// compute an effective address. (i.e LEA on x86)5387SmallVector<Instruction *, 32> MatchedAddrModeInsts;5388for (const std::pair<Use *, Type *> &Pair : MemoryUses) {5389Value *Address = Pair.first->get();5390Instruction *UserI = cast<Instruction>(Pair.first->getUser());5391Type *AddressAccessTy = Pair.second;5392unsigned AS = Address->getType()->getPointerAddressSpace();53935394// Do a match against the root of this address, ignoring profitability. This5395// will tell us if the addressing mode for the memory operation will5396// *actually* cover the shared instruction.5397ExtAddrMode Result;5398std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,53990);5400TypePromotionTransaction::ConstRestorationPt LastKnownGood =5401TPT.getRestorationPoint();5402AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, getDTFn,5403AddressAccessTy, AS, UserI, Result,5404InsertedInsts, PromotedInsts, TPT,5405LargeOffsetGEP, OptSize, PSI, BFI);5406Matcher.IgnoreProfitability = true;5407bool Success = Matcher.matchAddr(Address, 0);5408(void)Success;5409assert(Success && "Couldn't select *anything*?");54105411// The match was to check the profitability, the changes made are not5412// part of the original matcher. Therefore, they should be dropped5413// otherwise the original matcher will not present the right state.5414TPT.rollback(LastKnownGood);54155416// If the match didn't cover I, then it won't be shared by it.5417if (!is_contained(MatchedAddrModeInsts, I))5418return false;54195420MatchedAddrModeInsts.clear();5421}54225423return true;5424}54255426/// Return true if the specified values are defined in a5427/// different basic block than BB.5428static bool IsNonLocalValue(Value *V, BasicBlock *BB) {5429if (Instruction *I = dyn_cast<Instruction>(V))5430return I->getParent() != BB;5431return false;5432}54335434/// Sink addressing mode computation immediate before MemoryInst if doing so5435/// can be done without increasing register pressure. The need for the5436/// register pressure constraint means this can end up being an all or nothing5437/// decision for all uses of the same addressing computation.5438///5439/// Load and Store Instructions often have addressing modes that can do5440/// significant amounts of computation. As such, instruction selection will try5441/// to get the load or store to do as much computation as possible for the5442/// program. The problem is that isel can only see within a single block. As5443/// such, we sink as much legal addressing mode work into the block as possible.5444///5445/// This method is used to optimize both load/store and inline asms with memory5446/// operands. It's also used to sink addressing computations feeding into cold5447/// call sites into their (cold) basic block.5448///5449/// The motivation for handling sinking into cold blocks is that doing so can5450/// both enable other address mode sinking (by satisfying the register pressure5451/// constraint above), and reduce register pressure globally (by removing the5452/// addressing mode computation from the fast path entirely.).5453bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,5454Type *AccessTy, unsigned AddrSpace) {5455Value *Repl = Addr;54565457// Try to collapse single-value PHI nodes. 
This is necessary to undo5458// unprofitable PRE transformations.5459SmallVector<Value *, 8> worklist;5460SmallPtrSet<Value *, 16> Visited;5461worklist.push_back(Addr);54625463// Use a worklist to iteratively look through PHI and select nodes, and5464// ensure that the addressing mode obtained from the non-PHI/select roots of5465// the graph are compatible.5466bool PhiOrSelectSeen = false;5467SmallVector<Instruction *, 16> AddrModeInsts;5468const SimplifyQuery SQ(*DL, TLInfo);5469AddressingModeCombiner AddrModes(SQ, Addr);5470TypePromotionTransaction TPT(RemovedInsts);5471TypePromotionTransaction::ConstRestorationPt LastKnownGood =5472TPT.getRestorationPoint();5473while (!worklist.empty()) {5474Value *V = worklist.pop_back_val();54755476// We allow traversing cyclic Phi nodes.5477// In case of success after this loop we ensure that traversing through5478// Phi nodes ends up with all cases to compute address of the form5479// BaseGV + Base + Scale * Index + Offset5480// where Scale and Offset are constans and BaseGV, Base and Index5481// are exactly the same Values in all cases.5482// It means that BaseGV, Scale and Offset dominate our memory instruction5483// and have the same value as they had in address computation represented5484// as Phi. So we can safely sink address computation to memory instruction.5485if (!Visited.insert(V).second)5486continue;54875488// For a PHI node, push all of its incoming values.5489if (PHINode *P = dyn_cast<PHINode>(V)) {5490append_range(worklist, P->incoming_values());5491PhiOrSelectSeen = true;5492continue;5493}5494// Similar for select.5495if (SelectInst *SI = dyn_cast<SelectInst>(V)) {5496worklist.push_back(SI->getFalseValue());5497worklist.push_back(SI->getTrueValue());5498PhiOrSelectSeen = true;5499continue;5500}55015502// For non-PHIs, determine the addressing mode being computed. Note that5503// the result may differ depending on what other uses our candidate5504// addressing instructions might have.5505AddrModeInsts.clear();5506std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,55070);5508// Defer the query (and possible computation of) the dom tree to point of5509// actual use. It's expected that most address matches don't actually need5510// the domtree.5511auto getDTFn = [MemoryInst, this]() -> const DominatorTree & {5512Function *F = MemoryInst->getParent()->getParent();5513return this->getDT(*F);5514};5515ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(5516V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDTFn,5517*TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,5518BFI.get());55195520GetElementPtrInst *GEP = LargeOffsetGEP.first;5521if (GEP && !NewGEPBases.count(GEP)) {5522// If splitting the underlying data structure can reduce the offset of a5523// GEP, collect the GEP. Skip the GEPs that are the new bases of5524// previously split data structures.5525LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);5526LargeOffsetGEPID.insert(std::make_pair(GEP, LargeOffsetGEPID.size()));5527}55285529NewAddrMode.OriginalValue = V;5530if (!AddrModes.addNewAddrMode(NewAddrMode))5531break;5532}55335534// Try to combine the AddrModes we've collected. 
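  // Sketch of the invariant the worklist above relies on (blocks and values
  // are illustrative): for
  //
  //   %addr = phi ptr [ %g1, %then ], [ %g2, %else ]
  //   %v    = load i32, ptr %addr
  //
  // where %g1 and %g2 are both "getelementptr i8, ptr %base, i64 16" in their
  // predecessors, every root decomposes to { BaseReg = %base, BaseOffs = 16 },
  // so the modes can be combined and the address re-materialized next to the
  // load; if the decompositions disagree in more than one field, the combine
  // step below fails and everything is rolled back.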
  // If we couldn't collect any,
  // or we have multiple but either couldn't combine them or combining them
  // wouldn't do anything useful, bail out now.
  if (!AddrModes.combineAddrModes()) {
    TPT.rollback(LastKnownGood);
    return false;
  }
  bool Modified = TPT.commit();

  // Get the combined AddrMode (or the only AddrMode, if we only had one).
  ExtAddrMode AddrMode = AddrModes.getAddrMode();

  // If all the instructions matched are already in this BB, don't do anything.
  // If we saw a Phi node then it is definitely not local, and if we saw a
  // select then we want to push the address calculation past it even if it's
  // already in this BB.
  if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
        return IsNonLocalValue(V, MemoryInst->getParent());
      })) {
    LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
    return Modified;
  }

  // Insert this computation right after this user. Since our caller is
  // scanning from the top of the BB to the bottom, reuses of the expr are
  // guaranteed to happen later.
  IRBuilder<> Builder(MemoryInst);

  // Now that we have determined the addressing expression we want to use and
  // know that we have to sink it into this block, check to see if we have
  // already done this for some other load/store instr in this block. If so,
  // reuse the computation. Before attempting reuse, check if the address is
  // valid as it may have been erased.

  WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];

  Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
  Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
  if (SunkAddr) {
    LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
                      << " for " << *MemoryInst << "\n");
    if (SunkAddr->getType() != Addr->getType()) {
      if (SunkAddr->getType()->getPointerAddressSpace() !=
              Addr->getType()->getPointerAddressSpace() &&
          !DL->isNonIntegralPointerType(Addr->getType())) {
        // There are two reasons the address spaces might not match: a no-op
        // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
        // ptrtoint/inttoptr pair to ensure we match the original semantics.
        // TODO: allow bitcast between different address space pointers with
        // the same size.
        SunkAddr = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");
        SunkAddr =
            Builder.CreateIntToPtr(SunkAddr, Addr->getType(), "sunkaddr");
      } else
        SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
    }
  } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
                                   SubtargetInfo->addrSinkUsingGEPs())) {
    // By default, we use the GEP-based method when AA is used later.
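    // For contrast (illustrative IR): the GEP-based form sunk in this branch
    // keeps the provenance of the base pointer visible,
    //
    //   %sunkaddr = getelementptr i8, ptr %base, i64 40
    //
    // whereas the integer fallback further down would emit
    //
    //   %0 = ptrtoint ptr %base to i64
    //   %1 = add i64 %0, 40
    //   %sunkaddr = inttoptr i64 %1 to ptr
    //
    // which alias analysis has to treat far more conservatively.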
This5594// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.5595LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode5596<< " for " << *MemoryInst << "\n");5597Value *ResultPtr = nullptr, *ResultIndex = nullptr;55985599// First, find the pointer.5600if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {5601ResultPtr = AddrMode.BaseReg;5602AddrMode.BaseReg = nullptr;5603}56045605if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {5606// We can't add more than one pointer together, nor can we scale a5607// pointer (both of which seem meaningless).5608if (ResultPtr || AddrMode.Scale != 1)5609return Modified;56105611ResultPtr = AddrMode.ScaledReg;5612AddrMode.Scale = 0;5613}56145615// It is only safe to sign extend the BaseReg if we know that the math5616// required to create it did not overflow before we extend it. Since5617// the original IR value was tossed in favor of a constant back when5618// the AddrMode was created we need to bail out gracefully if widths5619// do not match instead of extending it.5620//5621// (See below for code to add the scale.)5622if (AddrMode.Scale) {5623Type *ScaledRegTy = AddrMode.ScaledReg->getType();5624if (cast<IntegerType>(IntPtrTy)->getBitWidth() >5625cast<IntegerType>(ScaledRegTy)->getBitWidth())5626return Modified;5627}56285629GlobalValue *BaseGV = AddrMode.BaseGV;5630if (BaseGV != nullptr) {5631if (ResultPtr)5632return Modified;56335634if (BaseGV->isThreadLocal()) {5635ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);5636} else {5637ResultPtr = BaseGV;5638}5639}56405641// If the real base value actually came from an inttoptr, then the matcher5642// will look through it and provide only the integer value. In that case,5643// use it here.5644if (!DL->isNonIntegralPointerType(Addr->getType())) {5645if (!ResultPtr && AddrMode.BaseReg) {5646ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),5647"sunkaddr");5648AddrMode.BaseReg = nullptr;5649} else if (!ResultPtr && AddrMode.Scale == 1) {5650ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),5651"sunkaddr");5652AddrMode.Scale = 0;5653}5654}56555656if (!ResultPtr && !AddrMode.BaseReg && !AddrMode.Scale &&5657!AddrMode.BaseOffs) {5658SunkAddr = Constant::getNullValue(Addr->getType());5659} else if (!ResultPtr) {5660return Modified;5661} else {5662Type *I8PtrTy =5663Builder.getPtrTy(Addr->getType()->getPointerAddressSpace());56645665// Start with the base register. Do this first so that subsequent address5666// matching finds it last, which will prevent it from trying to match it5667// as the scaled value in case it happens to be a mul. 
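    // Minimal example of the width bailout a few lines above (types are
    // illustrative): with a 64-bit IntPtrTy and
    //
    //   %idx = add i32 %a, %b          ; AddrMode.ScaledReg, only 32 bits wide
    //
    // sinking would require materializing "sext i32 %idx to i64" without
    // knowing whether the original computation was allowed to overflow, so
    // this path returns early rather than widening the scaled register.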
That would be5668// problematic if we've sunk a different mul for the scale, because then5669// we'd end up sinking both muls.5670if (AddrMode.BaseReg) {5671Value *V = AddrMode.BaseReg;5672if (V->getType() != IntPtrTy)5673V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");56745675ResultIndex = V;5676}56775678// Add the scale value.5679if (AddrMode.Scale) {5680Value *V = AddrMode.ScaledReg;5681if (V->getType() == IntPtrTy) {5682// done.5683} else {5684assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <5685cast<IntegerType>(V->getType())->getBitWidth() &&5686"We can't transform if ScaledReg is too narrow");5687V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");5688}56895690if (AddrMode.Scale != 1)5691V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),5692"sunkaddr");5693if (ResultIndex)5694ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");5695else5696ResultIndex = V;5697}56985699// Add in the Base Offset if present.5700if (AddrMode.BaseOffs) {5701Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);5702if (ResultIndex) {5703// We need to add this separately from the scale above to help with5704// SDAG consecutive load/store merging.5705if (ResultPtr->getType() != I8PtrTy)5706ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);5707ResultPtr = Builder.CreatePtrAdd(ResultPtr, ResultIndex, "sunkaddr",5708AddrMode.InBounds);5709}57105711ResultIndex = V;5712}57135714if (!ResultIndex) {5715SunkAddr = ResultPtr;5716} else {5717if (ResultPtr->getType() != I8PtrTy)5718ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);5719SunkAddr = Builder.CreatePtrAdd(ResultPtr, ResultIndex, "sunkaddr",5720AddrMode.InBounds);5721}57225723if (SunkAddr->getType() != Addr->getType()) {5724if (SunkAddr->getType()->getPointerAddressSpace() !=5725Addr->getType()->getPointerAddressSpace() &&5726!DL->isNonIntegralPointerType(Addr->getType())) {5727// There are two reasons the address spaces might not match: a no-op5728// addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a5729// ptrtoint/inttoptr pair to ensure we match the original semantics.5730// TODO: allow bitcast between different address space pointers with5731// the same size.5732SunkAddr = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");5733SunkAddr =5734Builder.CreateIntToPtr(SunkAddr, Addr->getType(), "sunkaddr");5735} else5736SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());5737}5738}5739} else {5740// We'd require a ptrtoint/inttoptr down the line, which we can't do for5741// non-integral pointers, so in that case bail out now.5742Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;5743Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;5744PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);5745PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);5746if (DL->isNonIntegralPointerType(Addr->getType()) ||5747(BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||5748(ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||5749(AddrMode.BaseGV &&5750DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))5751return Modified;57525753LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode5754<< " for " << *MemoryInst << "\n");5755Type *IntPtrTy = DL->getIntPtrType(Addr->getType());5756Value *Result = nullptr;57575758// Start with the base register. 
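    // Putting the GEP-based pieces above together (the "sunkaddr" names and
    // suffixes are approximate), an AddrMode of
    //   { BaseReg = %base (ptr), ScaledReg = %i (i64), Scale = 4, BaseOffs = 8 }
    // is materialized next to the memory instruction roughly as
    //
    //   %sunkaddr  = mul i64 %i, 4
    //   %sunkaddr1 = getelementptr i8, ptr %base, i64 %sunkaddr
    //   %sunkaddr2 = getelementptr i8, ptr %sunkaddr1, i64 8
    //
    // with the constant offset kept in its own GEP so SelectionDAG can still
    // merge consecutive loads and stores.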
Do this first so that subsequent address5759// matching finds it last, which will prevent it from trying to match it5760// as the scaled value in case it happens to be a mul. That would be5761// problematic if we've sunk a different mul for the scale, because then5762// we'd end up sinking both muls.5763if (AddrMode.BaseReg) {5764Value *V = AddrMode.BaseReg;5765if (V->getType()->isPointerTy())5766V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");5767if (V->getType() != IntPtrTy)5768V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");5769Result = V;5770}57715772// Add the scale value.5773if (AddrMode.Scale) {5774Value *V = AddrMode.ScaledReg;5775if (V->getType() == IntPtrTy) {5776// done.5777} else if (V->getType()->isPointerTy()) {5778V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");5779} else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <5780cast<IntegerType>(V->getType())->getBitWidth()) {5781V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");5782} else {5783// It is only safe to sign extend the BaseReg if we know that the math5784// required to create it did not overflow before we extend it. Since5785// the original IR value was tossed in favor of a constant back when5786// the AddrMode was created we need to bail out gracefully if widths5787// do not match instead of extending it.5788Instruction *I = dyn_cast_or_null<Instruction>(Result);5789if (I && (Result != AddrMode.BaseReg))5790I->eraseFromParent();5791return Modified;5792}5793if (AddrMode.Scale != 1)5794V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),5795"sunkaddr");5796if (Result)5797Result = Builder.CreateAdd(Result, V, "sunkaddr");5798else5799Result = V;5800}58015802// Add in the BaseGV if present.5803GlobalValue *BaseGV = AddrMode.BaseGV;5804if (BaseGV != nullptr) {5805Value *BaseGVPtr;5806if (BaseGV->isThreadLocal()) {5807BaseGVPtr = Builder.CreateThreadLocalAddress(BaseGV);5808} else {5809BaseGVPtr = BaseGV;5810}5811Value *V = Builder.CreatePtrToInt(BaseGVPtr, IntPtrTy, "sunkaddr");5812if (Result)5813Result = Builder.CreateAdd(Result, V, "sunkaddr");5814else5815Result = V;5816}58175818// Add in the Base Offset if present.5819if (AddrMode.BaseOffs) {5820Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);5821if (Result)5822Result = Builder.CreateAdd(Result, V, "sunkaddr");5823else5824Result = V;5825}58265827if (!Result)5828SunkAddr = Constant::getNullValue(Addr->getType());5829else5830SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");5831}58325833MemoryInst->replaceUsesOfWith(Repl, SunkAddr);5834// Store the newly computed address into the cache. In the case we reused a5835// value, this should be idempotent.5836SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);58375838// If we have no uses, recursively delete the value and all dead instructions5839// using it.5840if (Repl->use_empty()) {5841resetIteratorIfInvalidatedWhileCalling(CurInstIterator->getParent(), [&]() {5842RecursivelyDeleteTriviallyDeadInstructions(5843Repl, TLInfo, nullptr,5844[&](Value *V) { removeAllAssertingVHReferences(V); });5845});5846}5847++NumMemoryInsts;5848return true;5849}58505851/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find5852/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can5853/// only handle a 2 operand GEP in the same basic block or a splat constant5854/// vector. 
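/// Hypothetical example of the rewrite (types chosen for illustration): a
/// gather whose GEP has a scalar base, zero middle indices and a non-splat
/// vector index,
///
///   %addr = getelementptr [256 x i32], ptr %p, i64 0, <4 x i64> %idx
///
/// is rebuilt as a scalar GEP feeding a two-operand vector GEP,
///
///   %base = getelementptr [256 x i32], ptr %p, i64 0, i64 0
///   %addr = getelementptr i32, ptr %base, <4 x i64> %idx
///
/// so SelectionDAGBuilder can recognize %base as the uniform base.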
The 2 operands to the GEP must have a scalar pointer and a vector5855/// index.5856///5857/// If the existing GEP has a vector base pointer that is splat, we can look5858/// through the splat to find the scalar pointer. If we can't find a scalar5859/// pointer there's nothing we can do.5860///5861/// If we have a GEP with more than 2 indices where the middle indices are all5862/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.5863///5864/// If the final index isn't a vector or is a splat, we can emit a scalar GEP5865/// followed by a GEP with an all zeroes vector index. This will enable5866/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a5867/// zero index.5868bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,5869Value *Ptr) {5870Value *NewAddr;58715872if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {5873// Don't optimize GEPs that don't have indices.5874if (!GEP->hasIndices())5875return false;58765877// If the GEP and the gather/scatter aren't in the same BB, don't optimize.5878// FIXME: We should support this by sinking the GEP.5879if (MemoryInst->getParent() != GEP->getParent())5880return false;58815882SmallVector<Value *, 2> Ops(GEP->operands());58835884bool RewriteGEP = false;58855886if (Ops[0]->getType()->isVectorTy()) {5887Ops[0] = getSplatValue(Ops[0]);5888if (!Ops[0])5889return false;5890RewriteGEP = true;5891}58925893unsigned FinalIndex = Ops.size() - 1;58945895// Ensure all but the last index is 0.5896// FIXME: This isn't strictly required. All that's required is that they are5897// all scalars or splats.5898for (unsigned i = 1; i < FinalIndex; ++i) {5899auto *C = dyn_cast<Constant>(Ops[i]);5900if (!C)5901return false;5902if (isa<VectorType>(C->getType()))5903C = C->getSplatValue();5904auto *CI = dyn_cast_or_null<ConstantInt>(C);5905if (!CI || !CI->isZero())5906return false;5907// Scalarize the index if needed.5908Ops[i] = CI;5909}59105911// Try to scalarize the final index.5912if (Ops[FinalIndex]->getType()->isVectorTy()) {5913if (Value *V = getSplatValue(Ops[FinalIndex])) {5914auto *C = dyn_cast<ConstantInt>(V);5915// Don't scalarize all zeros vector.5916if (!C || !C->isZero()) {5917Ops[FinalIndex] = V;5918RewriteGEP = true;5919}5920}5921}59225923// If we made any changes or the we have extra operands, we need to generate5924// new instructions.5925if (!RewriteGEP && Ops.size() == 2)5926return false;59275928auto NumElts = cast<VectorType>(Ptr->getType())->getElementCount();59295930IRBuilder<> Builder(MemoryInst);59315932Type *SourceTy = GEP->getSourceElementType();5933Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());59345935// If the final index isn't a vector, emit a scalar GEP containing all ops5936// and a vector GEP with all zeroes final index.5937if (!Ops[FinalIndex]->getType()->isVectorTy()) {5938NewAddr = Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front());5939auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);5940auto *SecondTy = GetElementPtrInst::getIndexedType(5941SourceTy, ArrayRef(Ops).drop_front());5942NewAddr =5943Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy));5944} else {5945Value *Base = Ops[0];5946Value *Index = Ops[FinalIndex];59475948// Create a scalar GEP if there are more than 2 operands.5949if (Ops.size() != 2) {5950// Replace the last index with 0.5951Ops[FinalIndex] =5952Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType());5953Base = Builder.CreateGEP(SourceTy, Base, 
ArrayRef(Ops).drop_front());5954SourceTy = GetElementPtrInst::getIndexedType(5955SourceTy, ArrayRef(Ops).drop_front());5956}59575958// Now create the GEP with scalar pointer and vector index.5959NewAddr = Builder.CreateGEP(SourceTy, Base, Index);5960}5961} else if (!isa<Constant>(Ptr)) {5962// Not a GEP, maybe its a splat and we can create a GEP to enable5963// SelectionDAGBuilder to use it as a uniform base.5964Value *V = getSplatValue(Ptr);5965if (!V)5966return false;59675968auto NumElts = cast<VectorType>(Ptr->getType())->getElementCount();59695970IRBuilder<> Builder(MemoryInst);59715972// Emit a vector GEP with a scalar pointer and all 0s vector index.5973Type *ScalarIndexTy = DL->getIndexType(V->getType()->getScalarType());5974auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);5975Type *ScalarTy;5976if (cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() ==5977Intrinsic::masked_gather) {5978ScalarTy = MemoryInst->getType()->getScalarType();5979} else {5980assert(cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() ==5981Intrinsic::masked_scatter);5982ScalarTy = MemoryInst->getOperand(0)->getType()->getScalarType();5983}5984NewAddr = Builder.CreateGEP(ScalarTy, V, Constant::getNullValue(IndexTy));5985} else {5986// Constant, SelectionDAGBuilder knows to check if its a splat.5987return false;5988}59895990MemoryInst->replaceUsesOfWith(Ptr, NewAddr);59915992// If we have no uses, recursively delete the value and all dead instructions5993// using it.5994if (Ptr->use_empty())5995RecursivelyDeleteTriviallyDeadInstructions(5996Ptr, TLInfo, nullptr,5997[&](Value *V) { removeAllAssertingVHReferences(V); });59985999return true;6000}60016002/// If there are any memory operands, use OptimizeMemoryInst to sink their6003/// address computing into the block when possible / profitable.6004bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {6005bool MadeChange = false;60066007const TargetRegisterInfo *TRI =6008TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();6009TargetLowering::AsmOperandInfoVector TargetConstraints =6010TLI->ParseConstraints(*DL, TRI, *CS);6011unsigned ArgNo = 0;6012for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {6013// Compute the constraint code and ConstraintType to use.6014TLI->ComputeConstraintToUse(OpInfo, SDValue());60156016// TODO: Also handle C_Address?6017if (OpInfo.ConstraintType == TargetLowering::C_Memory &&6018OpInfo.isIndirect) {6019Value *OpVal = CS->getArgOperand(ArgNo++);6020MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);6021} else if (OpInfo.Type == InlineAsm::isInput)6022ArgNo++;6023}60246025return MadeChange;6026}60276028/// Check if all the uses of \p Val are equivalent (or free) zero or6029/// sign extensions.6030static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {6031assert(!Val->use_empty() && "Input must have at least one use");6032const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());6033bool IsSExt = isa<SExtInst>(FirstUser);6034Type *ExtTy = FirstUser->getType();6035for (const User *U : Val->users()) {6036const Instruction *UI = cast<Instruction>(U);6037if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))6038return false;6039Type *CurTy = UI->getType();6040// Same input and output types: Same instruction after CSE.6041if (CurTy == ExtTy)6042continue;60436044// If IsSExt is true, we are in this situation:6045// a = Val6046// b = sext ty1 a to ty26047// c = sext ty1 a to ty36048// Assuming ty2 is shorter than ty3, this could be turned into:6049// a = 
Val6050// b = sext ty1 a to ty26051// c = sext ty2 b to ty36052// However, the last sext is not free.6053if (IsSExt)6054return false;60556056// This is a ZExt, maybe this is free to extend from one type to another.6057// In that case, we would not account for a different use.6058Type *NarrowTy;6059Type *LargeTy;6060if (ExtTy->getScalarType()->getIntegerBitWidth() >6061CurTy->getScalarType()->getIntegerBitWidth()) {6062NarrowTy = CurTy;6063LargeTy = ExtTy;6064} else {6065NarrowTy = ExtTy;6066LargeTy = CurTy;6067}60686069if (!TLI.isZExtFree(NarrowTy, LargeTy))6070return false;6071}6072// All uses are the same or can be derived from one another for free.6073return true;6074}60756076/// Try to speculatively promote extensions in \p Exts and continue6077/// promoting through newly promoted operands recursively as far as doing so is6078/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.6079/// When some promotion happened, \p TPT contains the proper state to revert6080/// them.6081///6082/// \return true if some promotion happened, false otherwise.6083bool CodeGenPrepare::tryToPromoteExts(6084TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,6085SmallVectorImpl<Instruction *> &ProfitablyMovedExts,6086unsigned CreatedInstsCost) {6087bool Promoted = false;60886089// Iterate over all the extensions to try to promote them.6090for (auto *I : Exts) {6091// Early check if we directly have ext(load).6092if (isa<LoadInst>(I->getOperand(0))) {6093ProfitablyMovedExts.push_back(I);6094continue;6095}60966097// Check whether or not we want to do any promotion. The reason we have6098// this check inside the for loop is to catch the case where an extension6099// is directly fed by a load because in such case the extension can be moved6100// up without any promotion on its operands.6101if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)6102return false;61036104// Get the action to perform the promotion.6105TypePromotionHelper::Action TPH =6106TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);6107// Check if we can promote.6108if (!TPH) {6109// Save the current extension as we cannot move up through its operand.6110ProfitablyMovedExts.push_back(I);6111continue;6112}61136114// Save the current state.6115TypePromotionTransaction::ConstRestorationPt LastKnownGood =6116TPT.getRestorationPoint();6117SmallVector<Instruction *, 4> NewExts;6118unsigned NewCreatedInstsCost = 0;6119unsigned ExtCost = !TLI->isExtFree(I);6120// Promote.6121Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,6122&NewExts, nullptr, *TLI);6123assert(PromotedVal &&6124"TypePromotionHelper should have filtered out those cases");61256126// We would be able to merge only one extension in a load.6127// Therefore, if we have more than 1 new extension we heuristically6128// cut this search path, because it means we degrade the code quality.6129// With exactly 2, the transformation is neutral, because we will merge6130// one extension but leave one. However, we optimistically keep going,6131// because the new extension may be removed too. 
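    // A minimal before/after sketch of the promotion this drives (types and
    // flags are illustrative):
    //
    //   %ld  = load i16, ptr %p
    //   %add = add nsw i16 %ld, 1
    //   %ext = sext i16 %add to i64
    //
    // is speculatively rewritten into
    //
    //   %ld    = load i16, ptr %p
    //   %ld.e  = sext i16 %ld to i64
    //   %add.p = add nsw i64 %ld.e, 1
    //
    // so the extension sits directly on the load and can become an extending
    // load during isel; the transaction is rolled back if the cost checks
    // around this loop decide the promotion was not worth it.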
Also avoid replacing a6132// single free extension with multiple extensions, as this increases the6133// number of IR instructions while not providing any savings.6134long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;6135// FIXME: It would be possible to propagate a negative value instead of6136// conservatively ceiling it to 0.6137TotalCreatedInstsCost =6138std::max((long long)0, (TotalCreatedInstsCost - ExtCost));6139if (!StressExtLdPromotion &&6140(TotalCreatedInstsCost > 1 ||6141!isPromotedInstructionLegal(*TLI, *DL, PromotedVal) ||6142(ExtCost == 0 && NewExts.size() > 1))) {6143// This promotion is not profitable, rollback to the previous state, and6144// save the current extension in ProfitablyMovedExts as the latest6145// speculative promotion turned out to be unprofitable.6146TPT.rollback(LastKnownGood);6147ProfitablyMovedExts.push_back(I);6148continue;6149}6150// Continue promoting NewExts as far as doing so is profitable.6151SmallVector<Instruction *, 2> NewlyMovedExts;6152(void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);6153bool NewPromoted = false;6154for (auto *ExtInst : NewlyMovedExts) {6155Instruction *MovedExt = cast<Instruction>(ExtInst);6156Value *ExtOperand = MovedExt->getOperand(0);6157// If we have reached to a load, we need this extra profitability check6158// as it could potentially be merged into an ext(load).6159if (isa<LoadInst>(ExtOperand) &&6160!(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||6161(ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))6162continue;61636164ProfitablyMovedExts.push_back(MovedExt);6165NewPromoted = true;6166}61676168// If none of speculative promotions for NewExts is profitable, rollback6169// and save the current extension (I) as the last profitable extension.6170if (!NewPromoted) {6171TPT.rollback(LastKnownGood);6172ProfitablyMovedExts.push_back(I);6173continue;6174}6175// The promotion is profitable.6176Promoted = true;6177}6178return Promoted;6179}61806181/// Merging redundant sexts when one is dominating the other.6182bool CodeGenPrepare::mergeSExts(Function &F) {6183bool Changed = false;6184for (auto &Entry : ValToSExtendedUses) {6185SExts &Insts = Entry.second;6186SExts CurPts;6187for (Instruction *Inst : Insts) {6188if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||6189Inst->getOperand(0) != Entry.first)6190continue;6191bool inserted = false;6192for (auto &Pt : CurPts) {6193if (getDT(F).dominates(Inst, Pt)) {6194replaceAllUsesWith(Pt, Inst, FreshBBs, IsHugeFunc);6195RemovedInsts.insert(Pt);6196Pt->removeFromParent();6197Pt = Inst;6198inserted = true;6199Changed = true;6200break;6201}6202if (!getDT(F).dominates(Pt, Inst))6203// Give up if we need to merge in a common dominator as the6204// experiments show it is not profitable.6205continue;6206replaceAllUsesWith(Inst, Pt, FreshBBs, IsHugeFunc);6207RemovedInsts.insert(Inst);6208Inst->removeFromParent();6209inserted = true;6210Changed = true;6211break;6212}6213if (!inserted)6214CurPts.push_back(Inst);6215}6216}6217return Changed;6218}62196220// Splitting large data structures so that the GEPs accessing them can have6221// smaller offsets so that they can be sunk to the same blocks as their users.6222// For example, a large struct starting from %base is split into two parts6223// where the second part starts from %new_base.6224//6225// Before:6226// BB0:6227// %base =6228//6229// BB1:6230// %gep0 = gep %base, off06231// %gep1 = gep %base, off16232// %gep2 = gep %base, off26233//6234// BB2:6235// %load1 = 
load %gep06236// %load2 = load %gep16237// %load3 = load %gep26238//6239// After:6240// BB0:6241// %base =6242// %new_base = gep %base, off06243//6244// BB1:6245// %new_gep0 = %new_base6246// %new_gep1 = gep %new_base, off1 - off06247// %new_gep2 = gep %new_base, off2 - off06248//6249// BB2:6250// %load1 = load i32, i32* %new_gep06251// %load2 = load i32, i32* %new_gep16252// %load3 = load i32, i32* %new_gep26253//6254// %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because6255// their offsets are smaller enough to fit into the addressing mode.6256bool CodeGenPrepare::splitLargeGEPOffsets() {6257bool Changed = false;6258for (auto &Entry : LargeOffsetGEPMap) {6259Value *OldBase = Entry.first;6260SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>6261&LargeOffsetGEPs = Entry.second;6262auto compareGEPOffset =6263[&](const std::pair<GetElementPtrInst *, int64_t> &LHS,6264const std::pair<GetElementPtrInst *, int64_t> &RHS) {6265if (LHS.first == RHS.first)6266return false;6267if (LHS.second != RHS.second)6268return LHS.second < RHS.second;6269return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];6270};6271// Sorting all the GEPs of the same data structures based on the offsets.6272llvm::sort(LargeOffsetGEPs, compareGEPOffset);6273LargeOffsetGEPs.erase(llvm::unique(LargeOffsetGEPs), LargeOffsetGEPs.end());6274// Skip if all the GEPs have the same offsets.6275if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)6276continue;6277GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;6278int64_t BaseOffset = LargeOffsetGEPs.begin()->second;6279Value *NewBaseGEP = nullptr;62806281auto createNewBase = [&](int64_t BaseOffset, Value *OldBase,6282GetElementPtrInst *GEP) {6283LLVMContext &Ctx = GEP->getContext();6284Type *PtrIdxTy = DL->getIndexType(GEP->getType());6285Type *I8PtrTy =6286PointerType::get(Ctx, GEP->getType()->getPointerAddressSpace());62876288BasicBlock::iterator NewBaseInsertPt;6289BasicBlock *NewBaseInsertBB;6290if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {6291// If the base of the struct is an instruction, the new base will be6292// inserted close to it.6293NewBaseInsertBB = BaseI->getParent();6294if (isa<PHINode>(BaseI))6295NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();6296else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {6297NewBaseInsertBB =6298SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);6299NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();6300} else6301NewBaseInsertPt = std::next(BaseI->getIterator());6302} else {6303// If the current base is an argument or global value, the new base6304// will be inserted to the entry block.6305NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();6306NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();6307}6308IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);6309// Create a new base.6310Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);6311NewBaseGEP = OldBase;6312if (NewBaseGEP->getType() != I8PtrTy)6313NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);6314NewBaseGEP =6315NewBaseBuilder.CreatePtrAdd(NewBaseGEP, BaseIndex, "splitgep");6316NewGEPBases.insert(NewBaseGEP);6317return;6318};63196320// Check whether all the offsets can be encoded with prefered common base.6321if (int64_t PreferBase = TLI->getPreferredLargeGEPBaseOffset(6322LargeOffsetGEPs.front().second, LargeOffsetGEPs.back().second)) {6323BaseOffset = PreferBase;6324// Create a new base if the offset of the BaseGEP 
can be decoded with one6325// instruction.6326createNewBase(BaseOffset, OldBase, BaseGEP);6327}63286329auto *LargeOffsetGEP = LargeOffsetGEPs.begin();6330while (LargeOffsetGEP != LargeOffsetGEPs.end()) {6331GetElementPtrInst *GEP = LargeOffsetGEP->first;6332int64_t Offset = LargeOffsetGEP->second;6333if (Offset != BaseOffset) {6334TargetLowering::AddrMode AddrMode;6335AddrMode.HasBaseReg = true;6336AddrMode.BaseOffs = Offset - BaseOffset;6337// The result type of the GEP might not be the type of the memory6338// access.6339if (!TLI->isLegalAddressingMode(*DL, AddrMode,6340GEP->getResultElementType(),6341GEP->getAddressSpace())) {6342// We need to create a new base if the offset to the current base is6343// too large to fit into the addressing mode. So, a very large struct6344// may be split into several parts.6345BaseGEP = GEP;6346BaseOffset = Offset;6347NewBaseGEP = nullptr;6348}6349}63506351// Generate a new GEP to replace the current one.6352Type *PtrIdxTy = DL->getIndexType(GEP->getType());63536354if (!NewBaseGEP) {6355// Create a new base if we don't have one yet. Find the insertion6356// pointer for the new base first.6357createNewBase(BaseOffset, OldBase, GEP);6358}63596360IRBuilder<> Builder(GEP);6361Value *NewGEP = NewBaseGEP;6362if (Offset != BaseOffset) {6363// Calculate the new offset for the new GEP.6364Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);6365NewGEP = Builder.CreatePtrAdd(NewBaseGEP, Index);6366}6367replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);6368LargeOffsetGEPID.erase(GEP);6369LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);6370GEP->eraseFromParent();6371Changed = true;6372}6373}6374return Changed;6375}63766377bool CodeGenPrepare::optimizePhiType(6378PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,6379SmallPtrSetImpl<Instruction *> &DeletedInstrs) {6380// We are looking for a collection on interconnected phi nodes that together6381// only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts6382// are of the same type. Convert the whole set of nodes to the type of the6383// bitcast.6384Type *PhiTy = I->getType();6385Type *ConvertTy = nullptr;6386if (Visited.count(I) ||6387(!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))6388return false;63896390SmallVector<Instruction *, 4> Worklist;6391Worklist.push_back(cast<Instruction>(I));6392SmallPtrSet<PHINode *, 4> PhiNodes;6393SmallPtrSet<ConstantData *, 4> Constants;6394PhiNodes.insert(I);6395Visited.insert(I);6396SmallPtrSet<Instruction *, 4> Defs;6397SmallPtrSet<Instruction *, 4> Uses;6398// This works by adding extra bitcasts between load/stores and removing6399// existing bicasts. If we have a phi(bitcast(load)) or a store(bitcast(phi))6400// we can get in the situation where we remove a bitcast in one iteration6401// just to add it again in the next. 
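// As an illustration only (a hypothetical sketch with invented value names,
// assuming the target's shouldConvertPhiType(float, i32) returns true), a
// network such as
//   %a   = bitcast i32 %x to float
//   %b   = bitcast i32 %y to float
//   %phi = phi float [ %a, %entry ], [ %b, %loop ]
//   store float %phi, ptr %p
// would be rewritten so the phi carries the integer value directly:
//   %phi.tc = phi i32 [ %x, %entry ], [ %y, %loop ]
//   %bc     = bitcast i32 %phi.tc to float
//   store float %bc, ptr %p
// The defining bitcasts are deleted and a single cast is re-introduced in
// front of the store; %a and %b count as "anchored" here because they are
// not fed by a load or extractelement.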
We need to ensure that at least one6402// bitcast we remove are anchored to something that will not change back.6403bool AnyAnchored = false;64046405while (!Worklist.empty()) {6406Instruction *II = Worklist.pop_back_val();64076408if (auto *Phi = dyn_cast<PHINode>(II)) {6409// Handle Defs, which might also be PHI's6410for (Value *V : Phi->incoming_values()) {6411if (auto *OpPhi = dyn_cast<PHINode>(V)) {6412if (!PhiNodes.count(OpPhi)) {6413if (!Visited.insert(OpPhi).second)6414return false;6415PhiNodes.insert(OpPhi);6416Worklist.push_back(OpPhi);6417}6418} else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {6419if (!OpLoad->isSimple())6420return false;6421if (Defs.insert(OpLoad).second)6422Worklist.push_back(OpLoad);6423} else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {6424if (Defs.insert(OpEx).second)6425Worklist.push_back(OpEx);6426} else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {6427if (!ConvertTy)6428ConvertTy = OpBC->getOperand(0)->getType();6429if (OpBC->getOperand(0)->getType() != ConvertTy)6430return false;6431if (Defs.insert(OpBC).second) {6432Worklist.push_back(OpBC);6433AnyAnchored |= !isa<LoadInst>(OpBC->getOperand(0)) &&6434!isa<ExtractElementInst>(OpBC->getOperand(0));6435}6436} else if (auto *OpC = dyn_cast<ConstantData>(V))6437Constants.insert(OpC);6438else6439return false;6440}6441}64426443// Handle uses which might also be phi's6444for (User *V : II->users()) {6445if (auto *OpPhi = dyn_cast<PHINode>(V)) {6446if (!PhiNodes.count(OpPhi)) {6447if (Visited.count(OpPhi))6448return false;6449PhiNodes.insert(OpPhi);6450Visited.insert(OpPhi);6451Worklist.push_back(OpPhi);6452}6453} else if (auto *OpStore = dyn_cast<StoreInst>(V)) {6454if (!OpStore->isSimple() || OpStore->getOperand(0) != II)6455return false;6456Uses.insert(OpStore);6457} else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {6458if (!ConvertTy)6459ConvertTy = OpBC->getType();6460if (OpBC->getType() != ConvertTy)6461return false;6462Uses.insert(OpBC);6463AnyAnchored |=6464any_of(OpBC->users(), [](User *U) { return !isa<StoreInst>(U); });6465} else {6466return false;6467}6468}6469}64706471if (!ConvertTy || !AnyAnchored ||6472!TLI->shouldConvertPhiType(PhiTy, ConvertTy))6473return false;64746475LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to "6476<< *ConvertTy << "\n");64776478// Create all the new phi nodes of the new type, and bitcast any loads to the6479// correct type.6480ValueToValueMap ValMap;6481for (ConstantData *C : Constants)6482ValMap[C] = ConstantExpr::getBitCast(C, ConvertTy);6483for (Instruction *D : Defs) {6484if (isa<BitCastInst>(D)) {6485ValMap[D] = D->getOperand(0);6486DeletedInstrs.insert(D);6487} else {6488BasicBlock::iterator insertPt = std::next(D->getIterator());6489ValMap[D] = new BitCastInst(D, ConvertTy, D->getName() + ".bc", insertPt);6490}6491}6492for (PHINode *Phi : PhiNodes)6493ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),6494Phi->getName() + ".tc", Phi->getIterator());6495// Pipe together all the PhiNodes.6496for (PHINode *Phi : PhiNodes) {6497PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);6498for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)6499NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],6500Phi->getIncomingBlock(i));6501Visited.insert(NewPhi);6502}6503// And finally pipe up the stores and bitcasts6504for (Instruction *U : Uses) {6505if (isa<BitCastInst>(U)) {6506DeletedInstrs.insert(U);6507replaceAllUsesWith(U, ValMap[U->getOperand(0)], FreshBBs, IsHugeFunc);6508} else {6509U->setOperand(0, new 
BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc",6510U->getIterator()));6511}6512}65136514// Save the removed phis to be deleted later.6515for (PHINode *Phi : PhiNodes)6516DeletedInstrs.insert(Phi);6517return true;6518}65196520bool CodeGenPrepare::optimizePhiTypes(Function &F) {6521if (!OptimizePhiTypes)6522return false;65236524bool Changed = false;6525SmallPtrSet<PHINode *, 4> Visited;6526SmallPtrSet<Instruction *, 4> DeletedInstrs;65276528// Attempt to optimize all the phis in the functions to the correct type.6529for (auto &BB : F)6530for (auto &Phi : BB.phis())6531Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);65326533// Remove any old phi's that have been converted.6534for (auto *I : DeletedInstrs) {6535replaceAllUsesWith(I, PoisonValue::get(I->getType()), FreshBBs, IsHugeFunc);6536I->eraseFromParent();6537}65386539return Changed;6540}65416542/// Return true, if an ext(load) can be formed from an extension in6543/// \p MovedExts.6544bool CodeGenPrepare::canFormExtLd(6545const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,6546Instruction *&Inst, bool HasPromoted) {6547for (auto *MovedExtInst : MovedExts) {6548if (isa<LoadInst>(MovedExtInst->getOperand(0))) {6549LI = cast<LoadInst>(MovedExtInst->getOperand(0));6550Inst = MovedExtInst;6551break;6552}6553}6554if (!LI)6555return false;65566557// If they're already in the same block, there's nothing to do.6558// Make the cheap checks first if we did not promote.6559// If we promoted, we need to check if it is indeed profitable.6560if (!HasPromoted && LI->getParent() == Inst->getParent())6561return false;65626563return TLI->isExtLoad(LI, Inst, *DL);6564}65656566/// Move a zext or sext fed by a load into the same basic block as the load,6567/// unless conditions are unfavorable. This allows SelectionDAG to fold the6568/// extend into the load.6569///6570/// E.g.,6571/// \code6572/// %ld = load i32* %addr6573/// %add = add nuw i32 %ld, 46574/// %zext = zext i32 %add to i646575// \endcode6576/// =>6577/// \code6578/// %ld = load i32* %addr6579/// %zext = zext i32 %ld to i646580/// %add = add nuw i64 %zext, 46581/// \encode6582/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which6583/// allow us to match zext(load i32*) to i64.6584///6585/// Also, try to promote the computations used to obtain a sign extended6586/// value used into memory accesses.6587/// E.g.,6588/// \code6589/// a = add nsw i32 b, 36590/// d = sext i32 a to i646591/// e = getelementptr ..., i64 d6592/// \endcode6593/// =>6594/// \code6595/// f = sext i32 b to i646596/// a = add nsw i64 f, 36597/// e = getelementptr ..., i64 a6598/// \endcode6599///6600/// \p Inst[in/out] the extension may be modified during the process if some6601/// promotions apply.6602bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {6603bool AllowPromotionWithoutCommonHeader = false;6604/// See if it is an interesting sext operations for the address type6605/// promotion before trying to promote it, e.g., the ones with the right6606/// type and used in memory accesses.6607bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(6608*Inst, AllowPromotionWithoutCommonHeader);6609TypePromotionTransaction TPT(RemovedInsts);6610TypePromotionTransaction::ConstRestorationPt LastKnownGood =6611TPT.getRestorationPoint();6612SmallVector<Instruction *, 1> Exts;6613SmallVector<Instruction *, 2> SpeculativelyMovedExts;6614Exts.push_back(Inst);66156616bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);66176618// Look for a load being 
extended.6619LoadInst *LI = nullptr;6620Instruction *ExtFedByLoad;66216622// Try to promote a chain of computation if it allows to form an extended6623// load.6624if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {6625assert(LI && ExtFedByLoad && "Expect a valid load and extension");6626TPT.commit();6627// Move the extend into the same block as the load.6628ExtFedByLoad->moveAfter(LI);6629++NumExtsMoved;6630Inst = ExtFedByLoad;6631return true;6632}66336634// Continue promoting SExts if known as considerable depending on targets.6635if (ATPConsiderable &&6636performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,6637HasPromoted, TPT, SpeculativelyMovedExts))6638return true;66396640TPT.rollback(LastKnownGood);6641return false;6642}66436644// Perform address type promotion if doing so is profitable.6645// If AllowPromotionWithoutCommonHeader == false, we should find other sext6646// instructions that sign extended the same initial value. However, if6647// AllowPromotionWithoutCommonHeader == true, we expect promoting the6648// extension is just profitable.6649bool CodeGenPrepare::performAddressTypePromotion(6650Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,6651bool HasPromoted, TypePromotionTransaction &TPT,6652SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {6653bool Promoted = false;6654SmallPtrSet<Instruction *, 1> UnhandledExts;6655bool AllSeenFirst = true;6656for (auto *I : SpeculativelyMovedExts) {6657Value *HeadOfChain = I->getOperand(0);6658DenseMap<Value *, Instruction *>::iterator AlreadySeen =6659SeenChainsForSExt.find(HeadOfChain);6660// If there is an unhandled SExt which has the same header, try to promote6661// it as well.6662if (AlreadySeen != SeenChainsForSExt.end()) {6663if (AlreadySeen->second != nullptr)6664UnhandledExts.insert(AlreadySeen->second);6665AllSeenFirst = false;6666}6667}66686669if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&6670SpeculativelyMovedExts.size() == 1)) {6671TPT.commit();6672if (HasPromoted)6673Promoted = true;6674for (auto *I : SpeculativelyMovedExts) {6675Value *HeadOfChain = I->getOperand(0);6676SeenChainsForSExt[HeadOfChain] = nullptr;6677ValToSExtendedUses[HeadOfChain].push_back(I);6678}6679// Update Inst as promotion happen.6680Inst = SpeculativelyMovedExts.pop_back_val();6681} else {6682// This is the first chain visited from the header, keep the current chain6683// as unhandled. 
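// (Hypothetical illustration with invented names: given
//    %h  = add nsw i32 %x, 1
//    %s1 = sext i32 %h to i64
//    ...
//    %s2 = sext i32 %h to i64
//  the chain rooted at %s1 is only recorded in SeenChainsForSExt the first
//  time around; once %s2 is visited with the same head %h, AllSeenFirst
//  becomes false, both chains are promoted, and the entry for %h is reset
//  to nullptr to mark it as handled.)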
Defer to promote this until we encounter another SExt6684// chain derived from the same header.6685for (auto *I : SpeculativelyMovedExts) {6686Value *HeadOfChain = I->getOperand(0);6687SeenChainsForSExt[HeadOfChain] = Inst;6688}6689return false;6690}66916692if (!AllSeenFirst && !UnhandledExts.empty())6693for (auto *VisitedSExt : UnhandledExts) {6694if (RemovedInsts.count(VisitedSExt))6695continue;6696TypePromotionTransaction TPT(RemovedInsts);6697SmallVector<Instruction *, 1> Exts;6698SmallVector<Instruction *, 2> Chains;6699Exts.push_back(VisitedSExt);6700bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);6701TPT.commit();6702if (HasPromoted)6703Promoted = true;6704for (auto *I : Chains) {6705Value *HeadOfChain = I->getOperand(0);6706// Mark this as handled.6707SeenChainsForSExt[HeadOfChain] = nullptr;6708ValToSExtendedUses[HeadOfChain].push_back(I);6709}6710}6711return Promoted;6712}67136714bool CodeGenPrepare::optimizeExtUses(Instruction *I) {6715BasicBlock *DefBB = I->getParent();67166717// If the result of a {s|z}ext and its source are both live out, rewrite all6718// other uses of the source with result of extension.6719Value *Src = I->getOperand(0);6720if (Src->hasOneUse())6721return false;67226723// Only do this xform if truncating is free.6724if (!TLI->isTruncateFree(I->getType(), Src->getType()))6725return false;67266727// Only safe to perform the optimization if the source is also defined in6728// this block.6729if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())6730return false;67316732bool DefIsLiveOut = false;6733for (User *U : I->users()) {6734Instruction *UI = cast<Instruction>(U);67356736// Figure out which BB this ext is used in.6737BasicBlock *UserBB = UI->getParent();6738if (UserBB == DefBB)6739continue;6740DefIsLiveOut = true;6741break;6742}6743if (!DefIsLiveOut)6744return false;67456746// Make sure none of the uses are PHI nodes.6747for (User *U : Src->users()) {6748Instruction *UI = cast<Instruction>(U);6749BasicBlock *UserBB = UI->getParent();6750if (UserBB == DefBB)6751continue;6752// Be conservative. We don't want this xform to end up introducing6753// reloads just before load / store instructions.6754if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))6755return false;6756}67576758// InsertedTruncs - Only insert one trunc in each block once.6759DenseMap<BasicBlock *, Instruction *> InsertedTruncs;67606761bool MadeChange = false;6762for (Use &U : Src->uses()) {6763Instruction *User = cast<Instruction>(U.getUser());67646765// Figure out which BB this ext is used in.6766BasicBlock *UserBB = User->getParent();6767if (UserBB == DefBB)6768continue;67696770// Both src and def are live in this block. Rewrite the use.6771Instruction *&InsertedTrunc = InsertedTruncs[UserBB];67726773if (!InsertedTrunc) {6774BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();6775assert(InsertPt != UserBB->end());6776InsertedTrunc = new TruncInst(I, Src->getType(), "");6777InsertedTrunc->insertBefore(*UserBB, InsertPt);6778InsertedInsts.insert(InsertedTrunc);6779}67806781// Replace a use of the {s|z}ext source with a use of the result.6782U = InsertedTrunc;6783++NumExtUses;6784MadeChange = true;6785}67866787return MadeChange;6788}67896790// Find loads whose uses only use some of the loaded value's bits. Add an "and"6791// just after the load if the target can fold this into one extload instruction,6792// with the hope of eliminating some of the other later "and" instructions using6793// the loaded value. 
"and"s that are made trivially redundant by the insertion6794// of the new "and" are removed by this function, while others (e.g. those whose6795// path from the load goes through a phi) are left for isel to potentially6796// remove.6797//6798// For example:6799//6800// b0:6801// x = load i326802// ...6803// b1:6804// y = and x, 0xff6805// z = use y6806//6807// becomes:6808//6809// b0:6810// x = load i326811// x' = and x, 0xff6812// ...6813// b1:6814// z = use x'6815//6816// whereas:6817//6818// b0:6819// x1 = load i326820// ...6821// b1:6822// x2 = load i326823// ...6824// b2:6825// x = phi x1, x26826// y = and x, 0xff6827//6828// becomes (after a call to optimizeLoadExt for each load):6829//6830// b0:6831// x1 = load i326832// x1' = and x1, 0xff6833// ...6834// b1:6835// x2 = load i326836// x2' = and x2, 0xff6837// ...6838// b2:6839// x = phi x1', x2'6840// y = and x, 0xff6841bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {6842if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())6843return false;68446845// Skip loads we've already transformed.6846if (Load->hasOneUse() &&6847InsertedInsts.count(cast<Instruction>(*Load->user_begin())))6848return false;68496850// Look at all uses of Load, looking through phis, to determine how many bits6851// of the loaded value are needed.6852SmallVector<Instruction *, 8> WorkList;6853SmallPtrSet<Instruction *, 16> Visited;6854SmallVector<Instruction *, 8> AndsToMaybeRemove;6855for (auto *U : Load->users())6856WorkList.push_back(cast<Instruction>(U));68576858EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());6859unsigned BitWidth = LoadResultVT.getSizeInBits();6860// If the BitWidth is 0, do not try to optimize the type6861if (BitWidth == 0)6862return false;68636864APInt DemandBits(BitWidth, 0);6865APInt WidestAndBits(BitWidth, 0);68666867while (!WorkList.empty()) {6868Instruction *I = WorkList.pop_back_val();68696870// Break use-def graph loops.6871if (!Visited.insert(I).second)6872continue;68736874// For a PHI node, push all of its users.6875if (auto *Phi = dyn_cast<PHINode>(I)) {6876for (auto *U : Phi->users())6877WorkList.push_back(cast<Instruction>(U));6878continue;6879}68806881switch (I->getOpcode()) {6882case Instruction::And: {6883auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));6884if (!AndC)6885return false;6886APInt AndBits = AndC->getValue();6887DemandBits |= AndBits;6888// Keep track of the widest and mask we see.6889if (AndBits.ugt(WidestAndBits))6890WidestAndBits = AndBits;6891if (AndBits == WidestAndBits && I->getOperand(0) == Load)6892AndsToMaybeRemove.push_back(I);6893break;6894}68956896case Instruction::Shl: {6897auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));6898if (!ShlC)6899return false;6900uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);6901DemandBits.setLowBits(BitWidth - ShiftAmt);6902break;6903}69046905case Instruction::Trunc: {6906EVT TruncVT = TLI->getValueType(*DL, I->getType());6907unsigned TruncBitWidth = TruncVT.getSizeInBits();6908DemandBits.setLowBits(TruncBitWidth);6909break;6910}69116912default:6913return false;6914}6915}69166917uint32_t ActiveBits = DemandBits.getActiveBits();6918// Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the6919// target even if isLoadExtLegal says an i1 EXTLOAD is valid. 
For example,6920// for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but6921// (and (load x) 1) is not matched as a single instruction, rather as a LDR6922// followed by an AND.6923// TODO: Look into removing this restriction by fixing backends to either6924// return false for isLoadExtLegal for i1 or have them select this pattern to6925// a single instruction.6926//6927// Also avoid hoisting if we didn't see any ands with the exact DemandBits6928// mask, since these are the only ands that will be removed by isel.6929if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||6930WidestAndBits != DemandBits)6931return false;69326933LLVMContext &Ctx = Load->getType()->getContext();6934Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);6935EVT TruncVT = TLI->getValueType(*DL, TruncTy);69366937// Reject cases that won't be matched as extloads.6938if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||6939!TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))6940return false;69416942IRBuilder<> Builder(Load->getNextNonDebugInstruction());6943auto *NewAnd = cast<Instruction>(6944Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));6945// Mark this instruction as "inserted by CGP", so that other6946// optimizations don't touch it.6947InsertedInsts.insert(NewAnd);69486949// Replace all uses of load with new and (except for the use of load in the6950// new and itself).6951replaceAllUsesWith(Load, NewAnd, FreshBBs, IsHugeFunc);6952NewAnd->setOperand(0, Load);69536954// Remove any and instructions that are now redundant.6955for (auto *And : AndsToMaybeRemove)6956// Check that the and mask is the same as the one we decided to put on the6957// new and.6958if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {6959replaceAllUsesWith(And, NewAnd, FreshBBs, IsHugeFunc);6960if (&*CurInstIterator == And)6961CurInstIterator = std::next(And->getIterator());6962And->eraseFromParent();6963++NumAndUses;6964}69656966++NumAndsAdded;6967return true;6968}69696970/// Check if V (an operand of a select instruction) is an expensive instruction6971/// that is only used once.6972static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {6973auto *I = dyn_cast<Instruction>(V);6974// If it's safe to speculatively execute, then it should not have side6975// effects; therefore, it's safe to sink and possibly *not* execute.6976return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&6977TTI->isExpensiveToSpeculativelyExecute(I);6978}69796980/// Returns true if a SelectInst should be turned into an explicit branch.6981static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,6982const TargetLowering *TLI,6983SelectInst *SI) {6984// If even a predictable select is cheap, then a branch can't be cheaper.6985if (!TLI->isPredictableSelectExpensive())6986return false;69876988// FIXME: This should use the same heuristics as IfConversion to determine6989// whether a select is better represented as a branch.69906991// If metadata tells us that the select condition is obviously predictable,6992// then we want to replace the select with a branch.6993uint64_t TrueWeight, FalseWeight;6994if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {6995uint64_t Max = std::max(TrueWeight, FalseWeight);6996uint64_t Sum = TrueWeight + FalseWeight;6997if (Sum != 0) {6998auto Probability = BranchProbability::getBranchProbability(Max, Sum);6999if (Probability > TTI->getPredictableBranchThreshold())7000return true;7001}7002}70037004CmpInst *Cmp = 
dyn_cast<CmpInst>(SI->getCondition());70057006// If a branch is predictable, an out-of-order CPU can avoid blocking on its7007// comparison condition. If the compare has more than one use, there's7008// probably another cmov or setcc around, so it's not worth emitting a branch.7009if (!Cmp || !Cmp->hasOneUse())7010return false;70117012// If either operand of the select is expensive and only needed on one side7013// of the select, we should form a branch.7014if (sinkSelectOperand(TTI, SI->getTrueValue()) ||7015sinkSelectOperand(TTI, SI->getFalseValue()))7016return true;70177018return false;7019}70207021/// If \p isTrue is true, return the true value of \p SI, otherwise return7022/// false value of \p SI. If the true/false value of \p SI is defined by any7023/// select instructions in \p Selects, look through the defining select7024/// instruction until the true/false value is not defined in \p Selects.7025static Value *7026getTrueOrFalseValue(SelectInst *SI, bool isTrue,7027const SmallPtrSet<const Instruction *, 2> &Selects) {7028Value *V = nullptr;70297030for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);7031DefSI = dyn_cast<SelectInst>(V)) {7032assert(DefSI->getCondition() == SI->getCondition() &&7033"The condition of DefSI does not match with SI");7034V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());7035}70367037assert(V && "Failed to get select true/false value");7038return V;7039}70407041bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {7042assert(Shift->isShift() && "Expected a shift");70437044// If this is (1) a vector shift, (2) shifts by scalars are cheaper than7045// general vector shifts, and (3) the shift amount is a select-of-splatted7046// values, hoist the shifts before the select:7047// shift Op0, (select Cond, TVal, FVal) -->7048// select Cond, (shift Op0, TVal), (shift Op0, FVal)7049//7050// This is inverting a generic IR transform when we know that the cost of a7051// general vector shift is more than the cost of 2 shift-by-scalars.7052// We can't do this effectively in SDAG because we may not be able to7053// determine if the select operands are splats from within a basic block.7054Type *Ty = Shift->getType();7055if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))7056return false;7057Value *Cond, *TVal, *FVal;7058if (!match(Shift->getOperand(1),7059m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))7060return false;7061if (!isSplatValue(TVal) || !isSplatValue(FVal))7062return false;70637064IRBuilder<> Builder(Shift);7065BinaryOperator::BinaryOps Opcode = Shift->getOpcode();7066Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);7067Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);7068Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);7069replaceAllUsesWith(Shift, NewSel, FreshBBs, IsHugeFunc);7070Shift->eraseFromParent();7071return true;7072}70737074bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {7075Intrinsic::ID Opcode = Fsh->getIntrinsicID();7076assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&7077"Expected a funnel shift");70787079// If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper7080// than general vector shifts, and (3) the shift amount is select-of-splatted7081// values, hoist the funnel shifts before the select:7082// fsh Op0, Op1, (select Cond, TVal, FVal) -->7083// select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)7084//7085// This is inverting a generic IR transform when we 
know that the cost of a7086// general vector shift is more than the cost of 2 shift-by-scalars.7087// We can't do this effectively in SDAG because we may not be able to7088// determine if the select operands are splats from within a basic block.7089Type *Ty = Fsh->getType();7090if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))7091return false;7092Value *Cond, *TVal, *FVal;7093if (!match(Fsh->getOperand(2),7094m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))7095return false;7096if (!isSplatValue(TVal) || !isSplatValue(FVal))7097return false;70987099IRBuilder<> Builder(Fsh);7100Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);7101Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, TVal});7102Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, FVal});7103Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);7104replaceAllUsesWith(Fsh, NewSel, FreshBBs, IsHugeFunc);7105Fsh->eraseFromParent();7106return true;7107}71087109/// If we have a SelectInst that will likely profit from branch prediction,7110/// turn it into a branch.7111bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {7112if (DisableSelectToBranch)7113return false;71147115// If the SelectOptimize pass is enabled, selects have already been optimized.7116if (!getCGPassBuilderOption().DisableSelectOptimize)7117return false;71187119// Find all consecutive select instructions that share the same condition.7120SmallVector<SelectInst *, 2> ASI;7121ASI.push_back(SI);7122for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);7123It != SI->getParent()->end(); ++It) {7124SelectInst *I = dyn_cast<SelectInst>(&*It);7125if (I && SI->getCondition() == I->getCondition()) {7126ASI.push_back(I);7127} else {7128break;7129}7130}71317132SelectInst *LastSI = ASI.back();7133// Increment the current iterator to skip all the rest of select instructions7134// because they will be either "not lowered" or "all lowered" to branch.7135CurInstIterator = std::next(LastSI->getIterator());7136// Examine debug-info attached to the consecutive select instructions. They7137// won't be individually optimised by optimizeInst, so we need to perform7138// DbgVariableRecord maintenence here instead.7139for (SelectInst *SI : ArrayRef(ASI).drop_front())7140fixupDbgVariableRecordsOnInst(*SI);71417142bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);71437144// Can we convert the 'select' to CF ?7145if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))7146return false;71477148TargetLowering::SelectSupportKind SelectKind;7149if (SI->getType()->isVectorTy())7150SelectKind = TargetLowering::ScalarCondVectorVal;7151else7152SelectKind = TargetLowering::ScalarValSelect;71537154if (TLI->isSelectSupported(SelectKind) &&7155(!isFormingBranchFromSelectProfitable(TTI, TLI, SI) || OptSize ||7156llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))7157return false;71587159// The DominatorTree needs to be rebuilt by any consumers after this7160// transformation. 
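// As a hypothetical illustration (invented names), the consecutive selects
// gathered into ASI above all share one condition:
//   %c  = icmp ult i32 %x, %y
//   %s1 = select i1 %c, i32 %a, i32 %b
//   %s2 = select i1 %c, i32 %d, i32 %e
// Both selects become PHIs in the common select.end block, fed from the
// single frozen branch created below; the transformation sketched next shows
// the shape for one select.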
We simply reset here rather than setting the ModifiedDT7161// flag to avoid restarting the function walk in runOnFunction for each7162// select optimized.7163DT.reset();71647165// Transform a sequence like this:7166// start:7167// %cmp = cmp uge i32 %a, %b7168// %sel = select i1 %cmp, i32 %c, i32 %d7169//7170// Into:7171// start:7172// %cmp = cmp uge i32 %a, %b7173// %cmp.frozen = freeze %cmp7174// br i1 %cmp.frozen, label %select.true, label %select.false7175// select.true:7176// br label %select.end7177// select.false:7178// br label %select.end7179// select.end:7180// %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]7181//7182// %cmp should be frozen, otherwise it may introduce undefined behavior.7183// In addition, we may sink instructions that produce %c or %d from7184// the entry block into the destination(s) of the new branch.7185// If the true or false blocks do not contain a sunken instruction, that7186// block and its branch may be optimized away. In that case, one side of the7187// first branch will point directly to select.end, and the corresponding PHI7188// predecessor block will be the start block.71897190// Collect values that go on the true side and the values that go on the false7191// side.7192SmallVector<Instruction *> TrueInstrs, FalseInstrs;7193for (SelectInst *SI : ASI) {7194if (Value *V = SI->getTrueValue(); sinkSelectOperand(TTI, V))7195TrueInstrs.push_back(cast<Instruction>(V));7196if (Value *V = SI->getFalseValue(); sinkSelectOperand(TTI, V))7197FalseInstrs.push_back(cast<Instruction>(V));7198}71997200// Split the select block, according to how many (if any) values go on each7201// side.7202BasicBlock *StartBlock = SI->getParent();7203BasicBlock::iterator SplitPt = std::next(BasicBlock::iterator(LastSI));7204// We should split before any debug-info.7205SplitPt.setHeadBit(true);72067207IRBuilder<> IB(SI);7208auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");72097210BasicBlock *TrueBlock = nullptr;7211BasicBlock *FalseBlock = nullptr;7212BasicBlock *EndBlock = nullptr;7213BranchInst *TrueBranch = nullptr;7214BranchInst *FalseBranch = nullptr;7215if (TrueInstrs.size() == 0) {7216FalseBranch = cast<BranchInst>(SplitBlockAndInsertIfElse(7217CondFr, SplitPt, false, nullptr, nullptr, LI));7218FalseBlock = FalseBranch->getParent();7219EndBlock = cast<BasicBlock>(FalseBranch->getOperand(0));7220} else if (FalseInstrs.size() == 0) {7221TrueBranch = cast<BranchInst>(SplitBlockAndInsertIfThen(7222CondFr, SplitPt, false, nullptr, nullptr, LI));7223TrueBlock = TrueBranch->getParent();7224EndBlock = cast<BasicBlock>(TrueBranch->getOperand(0));7225} else {7226Instruction *ThenTerm = nullptr;7227Instruction *ElseTerm = nullptr;7228SplitBlockAndInsertIfThenElse(CondFr, SplitPt, &ThenTerm, &ElseTerm,7229nullptr, nullptr, LI);7230TrueBranch = cast<BranchInst>(ThenTerm);7231FalseBranch = cast<BranchInst>(ElseTerm);7232TrueBlock = TrueBranch->getParent();7233FalseBlock = FalseBranch->getParent();7234EndBlock = cast<BasicBlock>(TrueBranch->getOperand(0));7235}72367237EndBlock->setName("select.end");7238if (TrueBlock)7239TrueBlock->setName("select.true.sink");7240if (FalseBlock)7241FalseBlock->setName(FalseInstrs.size() == 0 ? 
"select.false"7242: "select.false.sink");72437244if (IsHugeFunc) {7245if (TrueBlock)7246FreshBBs.insert(TrueBlock);7247if (FalseBlock)7248FreshBBs.insert(FalseBlock);7249FreshBBs.insert(EndBlock);7250}72517252BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));72537254static const unsigned MD[] = {7255LLVMContext::MD_prof, LLVMContext::MD_unpredictable,7256LLVMContext::MD_make_implicit, LLVMContext::MD_dbg};7257StartBlock->getTerminator()->copyMetadata(*SI, MD);72587259// Sink expensive instructions into the conditional blocks to avoid executing7260// them speculatively.7261for (Instruction *I : TrueInstrs)7262I->moveBefore(TrueBranch);7263for (Instruction *I : FalseInstrs)7264I->moveBefore(FalseBranch);72657266// If we did not create a new block for one of the 'true' or 'false' paths7267// of the condition, it means that side of the branch goes to the end block7268// directly and the path originates from the start block from the point of7269// view of the new PHI.7270if (TrueBlock == nullptr)7271TrueBlock = StartBlock;7272else if (FalseBlock == nullptr)7273FalseBlock = StartBlock;72747275SmallPtrSet<const Instruction *, 2> INS;7276INS.insert(ASI.begin(), ASI.end());7277// Use reverse iterator because later select may use the value of the7278// earlier select, and we need to propagate value through earlier select7279// to get the PHI operand.7280for (SelectInst *SI : llvm::reverse(ASI)) {7281// The select itself is replaced with a PHI Node.7282PHINode *PN = PHINode::Create(SI->getType(), 2, "");7283PN->insertBefore(EndBlock->begin());7284PN->takeName(SI);7285PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);7286PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);7287PN->setDebugLoc(SI->getDebugLoc());72887289replaceAllUsesWith(SI, PN, FreshBBs, IsHugeFunc);7290SI->eraseFromParent();7291INS.erase(SI);7292++NumSelectsExpanded;7293}72947295// Instruct OptimizeBlock to skip to the next block.7296CurInstIterator = StartBlock->end();7297return true;7298}72997300/// Some targets only accept certain types for splat inputs. 
For example a VDUP7301/// in MVE takes a GPR (integer) register, and the instruction that incorporate7302/// a VDUP (such as a VADD qd, qm, rm) also require a gpr register.7303bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {7304// Accept shuf(insertelem(undef/poison, val, 0), undef/poison, <0,0,..>) only7305if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),7306m_Undef(), m_ZeroMask())))7307return false;7308Type *NewType = TLI->shouldConvertSplatType(SVI);7309if (!NewType)7310return false;73117312auto *SVIVecType = cast<FixedVectorType>(SVI->getType());7313assert(!NewType->isVectorTy() && "Expected a scalar type!");7314assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&7315"Expected a type of the same size!");7316auto *NewVecType =7317FixedVectorType::get(NewType, SVIVecType->getNumElements());73187319// Create a bitcast (shuffle (insert (bitcast(..))))7320IRBuilder<> Builder(SVI->getContext());7321Builder.SetInsertPoint(SVI);7322Value *BC1 = Builder.CreateBitCast(7323cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);7324Value *Shuffle = Builder.CreateVectorSplat(NewVecType->getNumElements(), BC1);7325Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);73267327replaceAllUsesWith(SVI, BC2, FreshBBs, IsHugeFunc);7328RecursivelyDeleteTriviallyDeadInstructions(7329SVI, TLInfo, nullptr,7330[&](Value *V) { removeAllAssertingVHReferences(V); });73317332// Also hoist the bitcast up to its operand if it they are not in the same7333// block.7334if (auto *BCI = dyn_cast<Instruction>(BC1))7335if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))7336if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&7337!Op->isTerminator() && !Op->isEHPad())7338BCI->moveAfter(Op);73397340return true;7341}73427343bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {7344// If the operands of I can be folded into a target instruction together with7345// I, duplicate and sink them.7346SmallVector<Use *, 4> OpsToSink;7347if (!TLI->shouldSinkOperands(I, OpsToSink))7348return false;73497350// OpsToSink can contain multiple uses in a use chain (e.g.7351// (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating7352// uses must come first, so we process the ops in reverse order so as to not7353// create invalid IR.7354BasicBlock *TargetBB = I->getParent();7355bool Changed = false;7356SmallVector<Use *, 4> ToReplace;7357Instruction *InsertPoint = I;7358DenseMap<const Instruction *, unsigned long> InstOrdering;7359unsigned long InstNumber = 0;7360for (const auto &I : *TargetBB)7361InstOrdering[&I] = InstNumber++;73627363for (Use *U : reverse(OpsToSink)) {7364auto *UI = cast<Instruction>(U->get());7365if (isa<PHINode>(UI))7366continue;7367if (UI->getParent() == TargetBB) {7368if (InstOrdering[UI] < InstOrdering[InsertPoint])7369InsertPoint = UI;7370continue;7371}7372ToReplace.push_back(U);7373}73747375SetVector<Instruction *> MaybeDead;7376DenseMap<Instruction *, Instruction *> NewInstructions;7377for (Use *U : ToReplace) {7378auto *UI = cast<Instruction>(U->get());7379Instruction *NI = UI->clone();73807381if (IsHugeFunc) {7382// Now we clone an instruction, its operands' defs may sink to this BB7383// now. 
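// A hypothetical sketch of the overall effect (invented names, assuming the
// target reports both operands of the chain as sinkable):
//   bb0:
//     %s = shufflevector <4 x i16> %v, <4 x i16> poison,
//                        <4 x i32> zeroinitializer
//     %z = zext <4 x i16> %s to <4 x i32>
//   bb1:
//     %m = mul <4 x i32> %a, %z
// Clones of %s and %z are inserted just before %m in bb1, %m is rewired to
// use the clones, and the originals are erased once they become dead, which
// lets isel see the whole splat-extend-multiply chain in a single block.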
So we put the operands defs' BBs into FreshBBs to do optimization.7384for (unsigned I = 0; I < NI->getNumOperands(); ++I) {7385auto *OpDef = dyn_cast<Instruction>(NI->getOperand(I));7386if (!OpDef)7387continue;7388FreshBBs.insert(OpDef->getParent());7389}7390}73917392NewInstructions[UI] = NI;7393MaybeDead.insert(UI);7394LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");7395NI->insertBefore(InsertPoint);7396InsertPoint = NI;7397InsertedInsts.insert(NI);73987399// Update the use for the new instruction, making sure that we update the7400// sunk instruction uses, if it is part of a chain that has already been7401// sunk.7402Instruction *OldI = cast<Instruction>(U->getUser());7403if (NewInstructions.count(OldI))7404NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);7405else7406U->set(NI);7407Changed = true;7408}74097410// Remove instructions that are dead after sinking.7411for (auto *I : MaybeDead) {7412if (!I->hasNUsesOrMore(1)) {7413LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");7414I->eraseFromParent();7415}7416}74177418return Changed;7419}74207421bool CodeGenPrepare::optimizeSwitchType(SwitchInst *SI) {7422Value *Cond = SI->getCondition();7423Type *OldType = Cond->getType();7424LLVMContext &Context = Cond->getContext();7425EVT OldVT = TLI->getValueType(*DL, OldType);7426MVT RegType = TLI->getPreferredSwitchConditionType(Context, OldVT);7427unsigned RegWidth = RegType.getSizeInBits();74287429if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())7430return false;74317432// If the register width is greater than the type width, expand the condition7433// of the switch instruction and each case constant to the width of the7434// register. By widening the type of the switch condition, subsequent7435// comparisons (for case comparisons) will not need to be extended to the7436// preferred register width, so we will potentially eliminate N-1 extends,7437// where N is the number of cases in the switch.7438auto *NewType = Type::getIntNTy(Context, RegWidth);74397440// Extend the switch condition and case constants using the target preferred7441// extend unless the switch condition is a function argument with an extend7442// attribute. In that case, we can avoid an unnecessary mask/extension by7443// matching the argument extension instead.7444Instruction::CastOps ExtType = Instruction::ZExt;7445// Some targets prefer SExt over ZExt.7446if (TLI->isSExtCheaperThanZExt(OldVT, RegType))7447ExtType = Instruction::SExt;74487449if (auto *Arg = dyn_cast<Argument>(Cond)) {7450if (Arg->hasSExtAttr())7451ExtType = Instruction::SExt;7452if (Arg->hasZExtAttr())7453ExtType = Instruction::ZExt;7454}74557456auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);7457ExtInst->insertBefore(SI);7458ExtInst->setDebugLoc(SI->getDebugLoc());7459SI->setCondition(ExtInst);7460for (auto Case : SI->cases()) {7461const APInt &NarrowConst = Case.getCaseValue()->getValue();7462APInt WideConst = (ExtType == Instruction::ZExt)7463? NarrowConst.zext(RegWidth)7464: NarrowConst.sext(RegWidth);7465Case.setValue(ConstantInt::get(Context, WideConst));7466}74677468return true;7469}74707471bool CodeGenPrepare::optimizeSwitchPhiConstants(SwitchInst *SI) {7472// The SCCP optimization tends to produce code like this:7473// switch(x) { case 42: phi(42, ...) }7474// Materializing the constant for the phi-argument needs instructions; So we7475// change the code to:7476// switch(x) { case 42: phi(x, ...) 
}74777478Value *Condition = SI->getCondition();7479// Avoid endless loop in degenerate case.7480if (isa<ConstantInt>(*Condition))7481return false;74827483bool Changed = false;7484BasicBlock *SwitchBB = SI->getParent();7485Type *ConditionType = Condition->getType();74867487for (const SwitchInst::CaseHandle &Case : SI->cases()) {7488ConstantInt *CaseValue = Case.getCaseValue();7489BasicBlock *CaseBB = Case.getCaseSuccessor();7490// Set to true if we previously checked that `CaseBB` is only reached by7491// a single case from this switch.7492bool CheckedForSinglePred = false;7493for (PHINode &PHI : CaseBB->phis()) {7494Type *PHIType = PHI.getType();7495// If ZExt is free then we can also catch patterns like this:7496// switch((i32)x) { case 42: phi((i64)42, ...); }7497// and replace `(i64)42` with `zext i32 %x to i64`.7498bool TryZExt =7499PHIType->isIntegerTy() &&7500PHIType->getIntegerBitWidth() > ConditionType->getIntegerBitWidth() &&7501TLI->isZExtFree(ConditionType, PHIType);7502if (PHIType == ConditionType || TryZExt) {7503// Set to true to skip this case because of multiple preds.7504bool SkipCase = false;7505Value *Replacement = nullptr;7506for (unsigned I = 0, E = PHI.getNumIncomingValues(); I != E; I++) {7507Value *PHIValue = PHI.getIncomingValue(I);7508if (PHIValue != CaseValue) {7509if (!TryZExt)7510continue;7511ConstantInt *PHIValueInt = dyn_cast<ConstantInt>(PHIValue);7512if (!PHIValueInt ||7513PHIValueInt->getValue() !=7514CaseValue->getValue().zext(PHIType->getIntegerBitWidth()))7515continue;7516}7517if (PHI.getIncomingBlock(I) != SwitchBB)7518continue;7519// We cannot optimize if there are multiple case labels jumping to7520// this block. This check may get expensive when there are many7521// case labels so we test for it last.7522if (!CheckedForSinglePred) {7523CheckedForSinglePred = true;7524if (SI->findCaseDest(CaseBB) == nullptr) {7525SkipCase = true;7526break;7527}7528}75297530if (Replacement == nullptr) {7531if (PHIValue == CaseValue) {7532Replacement = Condition;7533} else {7534IRBuilder<> Builder(SI);7535Replacement = Builder.CreateZExt(Condition, PHIType);7536}7537}7538PHI.setIncomingValue(I, Replacement);7539Changed = true;7540}7541if (SkipCase)7542break;7543}7544}7545}7546return Changed;7547}75487549bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {7550bool Changed = optimizeSwitchType(SI);7551Changed |= optimizeSwitchPhiConstants(SI);7552return Changed;7553}75547555namespace {75567557/// Helper class to promote a scalar operation to a vector one.7558/// This class is used to move downward extractelement transition.7559/// E.g.,7560/// a = vector_op <2 x i32>7561/// b = extractelement <2 x i32> a, i32 07562/// c = scalar_op b7563/// store c7564///7565/// =>7566/// a = vector_op <2 x i32>7567/// c = vector_op a (equivalent to scalar_op on the related lane)7568/// * d = extractelement <2 x i32> c, i32 07569/// * store d7570/// Assuming both extractelement and store can be combine, we get rid of the7571/// transition.7572class VectorPromoteHelper {7573/// DataLayout associated with the current module.7574const DataLayout &DL;75757576/// Used to perform some checks on the legality of vector operations.7577const TargetLowering &TLI;75787579/// Used to estimated the cost of the promoted chain.7580const TargetTransformInfo &TTI;75817582/// The transition being moved downwards.7583Instruction *Transition;75847585/// The sequence of instructions to be promoted.7586SmallVector<Instruction *, 4> InstsToBePromoted;75877588/// Cost of combining a store and an 
extract.7589unsigned StoreExtractCombineCost;75907591/// Instruction that will be combined with the transition.7592Instruction *CombineInst = nullptr;75937594/// The instruction that represents the current end of the transition.7595/// Since we are faking the promotion until we reach the end of the chain7596/// of computation, we need a way to get the current end of the transition.7597Instruction *getEndOfTransition() const {7598if (InstsToBePromoted.empty())7599return Transition;7600return InstsToBePromoted.back();7601}76027603/// Return the index of the original value in the transition.7604/// E.g., for "extractelement <2 x i32> c, i32 1" the original value,7605/// c, is at index 0.7606unsigned getTransitionOriginalValueIdx() const {7607assert(isa<ExtractElementInst>(Transition) &&7608"Other kind of transitions are not supported yet");7609return 0;7610}76117612/// Return the index of the index in the transition.7613/// E.g., for "extractelement <2 x i32> c, i32 0" the index7614/// is at index 1.7615unsigned getTransitionIdx() const {7616assert(isa<ExtractElementInst>(Transition) &&7617"Other kind of transitions are not supported yet");7618return 1;7619}76207621/// Get the type of the transition.7622/// This is the type of the original value.7623/// E.g., for "extractelement <2 x i32> c, i32 1" the type of the7624/// transition is <2 x i32>.7625Type *getTransitionType() const {7626return Transition->getOperand(getTransitionOriginalValueIdx())->getType();7627}76287629/// Promote \p ToBePromoted by moving \p Def downward through.7630/// I.e., we have the following sequence:7631/// Def = Transition <ty1> a to <ty2>7632/// b = ToBePromoted <ty2> Def, ...7633/// =>7634/// b = ToBePromoted <ty1> a, ...7635/// Def = Transition <ty1> ToBePromoted to <ty2>7636void promoteImpl(Instruction *ToBePromoted);76377638/// Check whether or not it is profitable to promote all the7639/// instructions enqueued to be promoted.7640bool isProfitableToPromote() {7641Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());7642unsigned Index = isa<ConstantInt>(ValIdx)7643? 
cast<ConstantInt>(ValIdx)->getZExtValue()7644: -1;7645Type *PromotedType = getTransitionType();76467647StoreInst *ST = cast<StoreInst>(CombineInst);7648unsigned AS = ST->getPointerAddressSpace();7649// Check if this store is supported.7650if (!TLI.allowsMisalignedMemoryAccesses(7651TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,7652ST->getAlign())) {7653// If this is not supported, there is no way we can combine7654// the extract with the store.7655return false;7656}76577658// The scalar chain of computation has to pay for the transition7659// scalar to vector.7660// The vector chain has to account for the combining cost.7661enum TargetTransformInfo::TargetCostKind CostKind =7662TargetTransformInfo::TCK_RecipThroughput;7663InstructionCost ScalarCost =7664TTI.getVectorInstrCost(*Transition, PromotedType, CostKind, Index);7665InstructionCost VectorCost = StoreExtractCombineCost;7666for (const auto &Inst : InstsToBePromoted) {7667// Compute the cost.7668// By construction, all instructions being promoted are arithmetic ones.7669// Moreover, one argument is a constant that can be viewed as a splat7670// constant.7671Value *Arg0 = Inst->getOperand(0);7672bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||7673isa<ConstantFP>(Arg0);7674TargetTransformInfo::OperandValueInfo Arg0Info, Arg1Info;7675if (IsArg0Constant)7676Arg0Info.Kind = TargetTransformInfo::OK_UniformConstantValue;7677else7678Arg1Info.Kind = TargetTransformInfo::OK_UniformConstantValue;76797680ScalarCost += TTI.getArithmeticInstrCost(7681Inst->getOpcode(), Inst->getType(), CostKind, Arg0Info, Arg1Info);7682VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,7683CostKind, Arg0Info, Arg1Info);7684}7685LLVM_DEBUG(7686dbgs() << "Estimated cost of computation to be promoted:\nScalar: "7687<< ScalarCost << "\nVector: " << VectorCost << '\n');7688return ScalarCost > VectorCost;7689}76907691/// Generate a constant vector with \p Val with the same7692/// number of elements as the transition.7693/// \p UseSplat defines whether or not \p Val should be replicated7694/// across the whole vector.7695/// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,7696/// otherwise we generate a vector with as many undef as possible:7697/// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only7698/// used at the index of the extract.7699Value *getConstantVector(Constant *Val, bool UseSplat) const {7700unsigned ExtractIdx = std::numeric_limits<unsigned>::max();7701if (!UseSplat) {7702// If we cannot determine where the constant must be, we have to7703// use a splat constant.7704Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());7705if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))7706ExtractIdx = CstVal->getSExtValue();7707else7708UseSplat = true;7709}77107711ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();7712if (UseSplat)7713return ConstantVector::getSplat(EC, Val);77147715if (!EC.isScalable()) {7716SmallVector<Constant *, 4> ConstVec;7717UndefValue *UndefVal = UndefValue::get(Val->getType());7718for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) {7719if (Idx == ExtractIdx)7720ConstVec.push_back(Val);7721else7722ConstVec.push_back(UndefVal);7723}7724return ConstantVector::get(ConstVec);7725} else7726llvm_unreachable(7727"Generate scalable vector for non-splat is unimplemented");7728}77297730/// Check if promoting to a vector type an operand at \p OperandIdx7731/// in \p Use can trigger undefined 
behavior.7732static bool canCauseUndefinedBehavior(const Instruction *Use,7733unsigned OperandIdx) {7734// This is not safe to introduce undef when the operand is on7735// the right hand side of a division-like instruction.7736if (OperandIdx != 1)7737return false;7738switch (Use->getOpcode()) {7739default:7740return false;7741case Instruction::SDiv:7742case Instruction::UDiv:7743case Instruction::SRem:7744case Instruction::URem:7745return true;7746case Instruction::FDiv:7747case Instruction::FRem:7748return !Use->hasNoNaNs();7749}7750llvm_unreachable(nullptr);7751}77527753public:7754VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,7755const TargetTransformInfo &TTI, Instruction *Transition,7756unsigned CombineCost)7757: DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),7758StoreExtractCombineCost(CombineCost) {7759assert(Transition && "Do not know how to promote null");7760}77617762/// Check if we can promote \p ToBePromoted to \p Type.7763bool canPromote(const Instruction *ToBePromoted) const {7764// We could support CastInst too.7765return isa<BinaryOperator>(ToBePromoted);7766}77677768/// Check if it is profitable to promote \p ToBePromoted7769/// by moving downward the transition through.7770bool shouldPromote(const Instruction *ToBePromoted) const {7771// Promote only if all the operands can be statically expanded.7772// Indeed, we do not want to introduce any new kind of transitions.7773for (const Use &U : ToBePromoted->operands()) {7774const Value *Val = U.get();7775if (Val == getEndOfTransition()) {7776// If the use is a division and the transition is on the rhs,7777// we cannot promote the operation, otherwise we may create a7778// division by zero.7779if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))7780return false;7781continue;7782}7783if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&7784!isa<ConstantFP>(Val))7785return false;7786}7787// Check that the resulting operation is legal.7788int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());7789if (!ISDOpcode)7790return false;7791return StressStoreExtract ||7792TLI.isOperationLegalOrCustom(7793ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));7794}77957796/// Check whether or not \p Use can be combined7797/// with the transition.7798/// I.e., is it possible to do Use(Transition) => AnotherUse?7799bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }78007801/// Record \p ToBePromoted as part of the chain to be promoted.7802void enqueueForPromotion(Instruction *ToBePromoted) {7803InstsToBePromoted.push_back(ToBePromoted);7804}78057806/// Set the instruction that will be combined with the transition.7807void recordCombineInstruction(Instruction *ToBeCombined) {7808assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");7809CombineInst = ToBeCombined;7810}78117812/// Promote all the instructions enqueued for promotion if it is7813/// is profitable.7814/// \return True if the promotion happened, false otherwise.7815bool promote() {7816// Check if there is something to promote.7817// Right now, if we do not have anything to combine with,7818// we assume the promotion is not profitable.7819if (InstsToBePromoted.empty() || !CombineInst)7820return false;78217822// Check cost.7823if (!StressStoreExtract && !isProfitableToPromote())7824return false;78257826// Promote.7827for (auto &ToBePromoted : InstsToBePromoted)7828promoteImpl(ToBePromoted);7829InstsToBePromoted.clear();7830return true;7831}7832};78337834} // end anonymous 
namespace78357836void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {7837// At this point, we know that all the operands of ToBePromoted but Def7838// can be statically promoted.7839// For Def, we need to use its parameter in ToBePromoted:7840// b = ToBePromoted ty1 a7841// Def = Transition ty1 b to ty27842// Move the transition down.7843// 1. Replace all uses of the promoted operation by the transition.7844// = ... b => = ... Def.7845assert(ToBePromoted->getType() == Transition->getType() &&7846"The type of the result of the transition does not match "7847"the final type");7848ToBePromoted->replaceAllUsesWith(Transition);7849// 2. Update the type of the uses.7850// b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.7851Type *TransitionTy = getTransitionType();7852ToBePromoted->mutateType(TransitionTy);7853// 3. Update all the operands of the promoted operation with promoted7854// operands.7855// b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.7856for (Use &U : ToBePromoted->operands()) {7857Value *Val = U.get();7858Value *NewVal = nullptr;7859if (Val == Transition)7860NewVal = Transition->getOperand(getTransitionOriginalValueIdx());7861else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||7862isa<ConstantFP>(Val)) {7863// Use a splat constant if it is not safe to use undef.7864NewVal = getConstantVector(7865cast<Constant>(Val),7866isa<UndefValue>(Val) ||7867canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));7868} else7869llvm_unreachable("Did you modified shouldPromote and forgot to update "7870"this?");7871ToBePromoted->setOperand(U.getOperandNo(), NewVal);7872}7873Transition->moveAfter(ToBePromoted);7874Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);7875}78767877/// Some targets can do store(extractelement) with one instruction.7878/// Try to push the extractelement towards the stores when the target7879/// has this feature and this is profitable.7880bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {7881unsigned CombineCost = std::numeric_limits<unsigned>::max();7882if (DisableStoreExtract ||7883(!StressStoreExtract &&7884!TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),7885Inst->getOperand(1), CombineCost)))7886return false;78877888// At this point we know that Inst is a vector to scalar transition.7889// Try to move it down the def-use chain, until:7890// - We can combine the transition with its single use7891// => we got rid of the transition.7892// - We escape the current basic block7893// => we would need to check that we are moving it at a cheaper place and7894// we do not do that for now.7895BasicBlock *Parent = Inst->getParent();7896LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');7897VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);7898// If the transition has more than one use, assume this is not going to be7899// beneficial.7900while (Inst->hasOneUse()) {7901Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());7902LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');79037904if (ToBePromoted->getParent() != Parent) {7905LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("7906<< ToBePromoted->getParent()->getName()7907<< ") than the transition (" << Parent->getName()7908<< ").\n");7909return false;7910}79117912if (VPH.canCombine(ToBePromoted)) {7913LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'7914<< "will be combined with: " << *ToBePromoted << '\n');7915VPH.recordCombineInstruction(ToBePromoted);7916bool Changed = 
      NumStoreExtractExposed += Changed;
      return Changed;
    }

    LLVM_DEBUG(dbgs() << "Try promoting.\n");
    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
      return false;

    LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");

    VPH.enqueueForPromotion(ToBePromoted);
    Inst = ToBePromoted;
  }
  return false;
}

/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
///   (store (or (zext (bitcast F to i32) to i64),
///              (shl (zext I to i64), 32)), addr)  -->
///   (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting other merged stores can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8},  i32 store --> two i16 stores.
/// For pair of {i8, i8},   i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
///
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is SROA-transformed before being inlined into
/// hoo.
///   void goo(const std::pair<int, float> &);
///   hoo() {
///     ...
///     goo(std::make_pair(tmp, ftmp));
///     ...
///   }
///
/// Although we already have similar splitting in DAG Combine, we duplicate
/// it in CodeGenPrepare to catch the case in which the pattern spans
/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
/// during code expansion.
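///
/// For illustration only (assumed little-endian IR, names invented), the
/// first pattern above is rewritten roughly as:
///   %b  = bitcast float %f to i32
///   %lo = zext i32 %b to i64
///   %w  = zext i32 %i to i64
///   %hi = shl i64 %w, 32
///   %v  = or i64 %lo, %hi
///   store i64 %v, ptr %p
/// becomes
///   store i32 %b, ptr %p
///   %p.hi = getelementptr i32, ptr %p, i32 1
///   store i32 %i, ptr %p.hi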
static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
                                const TargetLowering &TLI) {
  // Handle simple but common cases only.
  Type *StoreType = SI.getValueOperand()->getType();

  // The code below assumes shifting a value by <number of bits>,
  // whereas scalable vectors would have to be shifted by
  // <2log(vscale) + number of bits> in order to store the
  // low/high parts. Bailing out for now.
  if (StoreType->isScalableTy())
    return false;

  if (!DL.typeSizeEqualsStoreSize(StoreType) ||
      DL.getTypeSizeInBits(StoreType) == 0)
    return false;

  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
  if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
    return false;

  // Don't split the store if it is volatile.
  if (SI.isVolatile())
    return false;

  // Match the following patterns:
  //   (store (or (zext LValue to i64),
  //              (shl (zext HValue to i64), 32)), HalfValBitSize)
  // or
  //   (store (or (shl (zext HValue to i64), 32)), HalfValBitSize)
  //              (zext LValue to i64),
  // Expect both operands of OR and the first operand of SHL to have only
  // one use.
  Value *LValue, *HValue;
  if (!match(SI.getValueOperand(),
             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
                                   m_SpecificInt(HalfValBitSize))))))
    return false;

  // Check that LValue and HValue are integers no wider than HalfValBitSize.
  if (!LValue->getType()->isIntegerTy() ||
      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
      !HValue->getType()->isIntegerTy() ||
      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
    return false;

  // If LValue/HValue is a bitcast instruction, use the EVT before the bitcast
  // as the input to the target query.
  auto *LBC = dyn_cast<BitCastInst>(LValue);
  auto *HBC = dyn_cast<BitCastInst>(HValue);
  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
                  : EVT::getEVT(LValue->getType());
  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
                   : EVT::getEVT(HValue->getType());
  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
    return false;

  // Start to split the store.
  IRBuilder<> Builder(SI.getContext());
  Builder.SetInsertPoint(&SI);

  // If LValue/HValue is a bitcast in another BB, create a new one in the
  // current BB so it may be merged with the split stores by the DAG combiner.
  if (LBC && LBC->getParent() != SI.getParent())
    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
  if (HBC && HBC->getParent() != SI.getParent())
    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());

  bool IsLE = SI.getDataLayout().isLittleEndian();
  auto CreateSplitStore = [&](Value *V, bool Upper) {
    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
    Value *Addr = SI.getPointerOperand();
    Align Alignment = SI.getAlign();
    const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
    if (IsOffsetStore) {
      Addr = Builder.CreateGEP(
          SplitStoreType, Addr,
          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));

      // When splitting the store in half, naturally one half will retain the
      // alignment of the original wider store, regardless of whether it was
      // over-aligned or not, while the other will require adjustment.
      Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
    }
    Builder.CreateAlignedStore(V, Addr, Alignment);
  };

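  // For illustration only (assumed values): when splitting an i64 store with
  // align 8 on a little-endian target, the low i32 half keeps the original
  // address and alignment, while the high half is stored at byte offset
  // HalfValBitSize / 8 = 4 with commonAlignment(8, 4), i.e. align 4.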
  CreateSplitStore(LValue, false);
  CreateSplitStore(HValue, true);

  // Delete the old store.
  SI.eraseFromParent();
  return true;
}

// Return true if the GEP has two operands, the first operand is of a
// sequential type, and the second operand is a constant.
static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
  gep_type_iterator I = gep_type_begin(*GEP);
  return GEP->getNumOperands() == 2 && I.isSequential() &&
         isa<ConstantInt>(GEP->getOperand(1));
}

// Try unmerging GEPs to reduce liveness interference (register pressure)
// across IndirectBr edges. Since IndirectBr edges tend to touch on many
// blocks, reducing liveness interference across those edges benefits global
// register allocation. Currently handles only certain cases.
//
// For example, unmerge %GEPI and %UGEPI as below.
//
// ---------- BEFORE ----------
// SrcBlock:
//   ...
//   %GEPIOp = ...
//   ...
//   %GEPI = gep %GEPIOp, Idx
//   ...
//   indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
//   (* %GEPI is alive on the indirectbr edges due to other uses ahead)
//   (* %GEPIOp is alive on the indirectbr edges only because it's used by
//      %UGEPI)
//
// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
// ...
//
// DstBi:
//   ...
//   %UGEPI = gep %GEPIOp, UIdx
//   ...
// ---------------------------
//
// ---------- AFTER ----------
// SrcBlock:
//   ... (same as above)
//   (* %GEPI is still alive on the indirectbr edges)
//   (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
//      unmerging)
// ...
//
// DstBi:
//   ...
//   %UGEPI = gep %GEPI, (UIdx-Idx)
//   ...
// ---------------------------
//
// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
// no longer alive on them.
//
// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting the
// merging of GEPs in the first place in InstCombiner::visitGetElementPtrInst(),
// so as not to disable further simplifications and optimizations as a result
// of GEP merging.
//
// Note that this unmerging may increase the length of the data-flow critical
// path (the path from %GEPIOp to %UGEPI would go through %GEPI), so it is a
// tradeoff between register pressure and the length of the data-flow critical
// path. Restricting this to the uncommon IndirectBr case minimizes the impact
// of a potentially longer critical path, if any, and the impact on compile
// time.
static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
                                             const TargetTransformInfo *TTI) {
  BasicBlock *SrcBlock = GEPI->getParent();
  // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
  // (non-IndirectBr) cases exit early here.
  if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
    return false;
  // Check that GEPI is a simple gep with a single constant index.
  if (!GEPSequentialConstIndexed(GEPI))
    return false;
  ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
  // Check that GEPI is a cheap one.
  if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
                         TargetTransformInfo::TCK_SizeAndLatency) >
      TargetTransformInfo::TCC_Basic)
    return false;
  Value *GEPIOp = GEPI->getOperand(0);
  // Check that GEPIOp is an instruction that's also defined in SrcBlock.
  if (!isa<Instruction>(GEPIOp))
    return false;
  auto *GEPIOpI = cast<Instruction>(GEPIOp);
  if (GEPIOpI->getParent() != SrcBlock)
    return false;
  // Check that GEPI is used outside the block, meaning it's alive on the
  // IndirectBr edge(s).
  if (llvm::none_of(GEPI->users(), [&](User *Usr) {
        if (auto *I = dyn_cast<Instruction>(Usr)) {
          if (I->getParent() != SrcBlock) {
            return true;
          }
        }
        return false;
      }))
    return false;
  // The second elements of the GEP chains to be unmerged.
  std::vector<GetElementPtrInst *> UGEPIs;
  // Check each user of GEPIOp to see whether unmerging would make GEPIOp not
  // alive on the IndirectBr edges.
  for (User *Usr : GEPIOp->users()) {
    if (Usr == GEPI)
      continue;
    // Check if Usr is an Instruction. If not, give up.
    if (!isa<Instruction>(Usr))
      return false;
    auto *UI = cast<Instruction>(Usr);
    // If Usr is in the same block as GEPIOp, that is fine; skip it.
    if (UI->getParent() == SrcBlock)
      continue;
    // Check if Usr is a GEP. If not, give up.
    if (!isa<GetElementPtrInst>(Usr))
      return false;
    auto *UGEPI = cast<GetElementPtrInst>(Usr);
    // Check if UGEPI is a simple gep with a single constant index and GEPIOp
    // is the pointer operand to it. If so, record it in the vector. If not,
    // give up.
    if (!GEPSequentialConstIndexed(UGEPI))
      return false;
    if (UGEPI->getOperand(0) != GEPIOp)
      return false;
    if (UGEPI->getSourceElementType() != GEPI->getSourceElementType())
      return false;
    if (GEPIIdx->getType() !=
        cast<ConstantInt>(UGEPI->getOperand(1))->getType())
      return false;
    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
    if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
                           TargetTransformInfo::TCK_SizeAndLatency) >
        TargetTransformInfo::TCC_Basic)
      return false;
    UGEPIs.push_back(UGEPI);
  }
  if (UGEPIs.size() == 0)
    return false;
  // Check the materializing cost of (UIdx-Idx).
  for (GetElementPtrInst *UGEPI : UGEPIs) {
    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
    APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
    InstructionCost ImmCost = TTI->getIntImmCost(
        NewIdx, GEPIIdx->getType(), TargetTransformInfo::TCK_SizeAndLatency);
    if (ImmCost > TargetTransformInfo::TCC_Basic)
      return false;
  }
  // Now unmerge between GEPI and UGEPIs.
  for (GetElementPtrInst *UGEPI : UGEPIs) {
    UGEPI->setOperand(0, GEPI);
    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
    Constant *NewUGEPIIdx = ConstantInt::get(
        GEPIIdx->getType(), UGEPIIdx->getValue() - GEPIIdx->getValue());
    UGEPI->setOperand(1, NewUGEPIIdx);
    // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
    // inbounds to avoid UB.
    if (!GEPI->isInBounds()) {
      UGEPI->setIsInBounds(false);
    }
  }
  // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
  // alive on IndirectBr edges).
  assert(llvm::none_of(GEPIOp->users(),
                       [&](User *Usr) {
                         return cast<Instruction>(Usr)->getParent() != SrcBlock;
                       }) &&
         "GEPIOp is used outside SrcBlock");
  return true;
}

static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
                           SmallSet<BasicBlock *, 32> &FreshBBs,
                           bool IsHugeFunc) {
  // Try and convert
  //   %c = icmp ult %x, 8
  //   br %c, bla, blb
  //   %tc = lshr %x, 3
  // to
  //   %tc = lshr %x, 3
  //   %c = icmp eq %tc, 0
  //   br %c, bla, blb
  // Creating the cmp to zero can be better for the backend, especially if the
  // lshr produces flags that can be used automatically.
  if (!TLI.preferZeroCompareBranch() || !Branch->isConditional())
    return false;

  ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition());
  if (!Cmp || !isa<ConstantInt>(Cmp->getOperand(1)) || !Cmp->hasOneUse())
    return false;

  Value *X = Cmp->getOperand(0);
  APInt CmpC = cast<ConstantInt>(Cmp->getOperand(1))->getValue();

  for (auto *U : X->users()) {
    Instruction *UI = dyn_cast<Instruction>(U);
    // A quick dominance check.
    if (!UI ||
        (UI->getParent() != Branch->getParent() &&
         UI->getParent() != Branch->getSuccessor(0) &&
         UI->getParent() != Branch->getSuccessor(1)) ||
        (UI->getParent() != Branch->getParent() &&
         !UI->getParent()->getSinglePredecessor()))
      continue;

    if (CmpC.isPowerOf2() && Cmp->getPredicate() == ICmpInst::ICMP_ULT &&
        match(UI, m_Shr(m_Specific(X), m_SpecificInt(CmpC.logBase2())))) {
      IRBuilder<> Builder(Branch);
      if (UI->getParent() != Branch->getParent())
        UI->moveBefore(Branch);
      UI->dropPoisonGeneratingFlags();
      Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI,
                                        ConstantInt::get(UI->getType(), 0));
      LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
      LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
      replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc);
      return true;
    }
    if (Cmp->isEquality() &&
        (match(UI, m_Add(m_Specific(X), m_SpecificInt(-CmpC))) ||
         match(UI, m_Sub(m_Specific(X), m_SpecificInt(CmpC))))) {
      IRBuilder<> Builder(Branch);
      if (UI->getParent() != Branch->getParent())
        UI->moveBefore(Branch);
      UI->dropPoisonGeneratingFlags();
      Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI,
                                        ConstantInt::get(UI->getType(), 0));
      LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
      LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
      replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc);
      return true;
    }
  }
  return false;
}

bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
  bool AnyChange = false;
  AnyChange = fixupDbgVariableRecordsOnInst(*I);

  // Bail out if we inserted the instruction to prevent optimizations from
  // stepping on each other's toes.
  if (InsertedInsts.count(I))
    return AnyChange;

  // TODO: Move into the switch on opcode below here.
  if (PHINode *P = dyn_cast<PHINode>(I)) {
    // It is possible for very late stage optimizations (such as SimplifyCFG)
    // to introduce PHI nodes too late to be cleaned up. If we detect such a
    // trivial PHI, go ahead and zap it here.
    if (Value *V = simplifyInstruction(P, {*DL, TLInfo})) {
      LargeOffsetGEPMap.erase(P);
      replaceAllUsesWith(P, V, FreshBBs, IsHugeFunc);
      P->eraseFromParent();
      ++NumPHIsElim;
      return true;
    }
    return AnyChange;
  }

  if (CastInst *CI = dyn_cast<CastInst>(I)) {
    // If the source of the cast is a constant, then this should have
    // already been constant folded. The only reason NOT to constant fold
    // it is if something (e.g. LSR) was careful to place the constant
    // evaluation in a block other than the one that uses it (e.g. to hoist
    // the address of globals out of a loop). If this is the case, we don't
    // want to forward-subst the cast.
    if (isa<Constant>(CI->getOperand(0)))
      return AnyChange;

    if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
      return true;

    if ((isa<UIToFPInst>(I) || isa<SIToFPInst>(I) || isa<FPToUIInst>(I) ||
         isa<TruncInst>(I)) &&
        TLI->optimizeExtendOrTruncateConversion(
            I, LI->getLoopFor(I->getParent()), *TTI))
      return true;

    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
      /// Sink a zext or sext into its user blocks if the target type doesn't
      /// fit in one register
      if (TLI->getTypeAction(CI->getContext(),
                             TLI->getValueType(*DL, CI->getType())) ==
          TargetLowering::TypeExpandInteger) {
        return SinkCast(CI);
      } else {
        if (TLI->optimizeExtendOrTruncateConversion(
                I, LI->getLoopFor(I->getParent()), *TTI))
          return true;

        bool MadeChange = optimizeExt(I);
        return MadeChange | optimizeExtUses(I);
      }
    }
    return AnyChange;
  }

  if (auto *Cmp = dyn_cast<CmpInst>(I))
    if (optimizeCmp(Cmp, ModifiedDT))
      return true;

  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
    LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    bool Modified = optimizeLoadExt(LI);
    unsigned AS = LI->getPointerAddressSpace();
    Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
    return Modified;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
    if (splitMergedValStore(*SI, *DL, *TLI))
      return true;
    SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    unsigned AS = SI->getPointerAddressSpace();
    return optimizeMemoryInst(I, SI->getOperand(1),
                              SI->getOperand(0)->getType(), AS);
  }

  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
    unsigned AS = RMW->getPointerAddressSpace();
    return optimizeMemoryInst(I, RMW->getPointerOperand(), RMW->getType(), AS);
  }

  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
    unsigned AS = CmpX->getPointerAddressSpace();
    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
                              CmpX->getCompareOperand()->getType(), AS);
  }

  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);

  if (BinOp && BinOp->getOpcode() == Instruction::And && EnableAndCmpSinking &&
      sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
    return true;

  // TODO: Move this into the switch on opcode - it handles shifts already.
  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                BinOp->getOpcode() == Instruction::LShr)) {
    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
    if (CI && TLI->hasExtractBitsInsn())
      if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
        return true;
  }

  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
    if (GEPI->hasAllZeroIndices()) {
      /// The GEP operand must be a pointer, so must its result -> BitCast
      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
                                        GEPI->getName(), GEPI->getIterator());
      NC->setDebugLoc(GEPI->getDebugLoc());
      replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc);
      RecursivelyDeleteTriviallyDeadInstructions(
          GEPI, TLInfo, nullptr,
          [&](Value *V) { removeAllAssertingVHReferences(V); });
      ++NumGEPsElim;
      optimizeInst(NC, ModifiedDT);
      return true;
    }
    if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
      return true;
    }
  }
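
  // For illustration only (assumed IR): the FreezeInst handling below turns
  //   %c = icmp eq i32 %x, 42
  //   %f = freeze i1 %c
  // into
  //   %fx = freeze i32 %x
  //   %c  = icmp eq i32 %fx, 42
  // so the backend sees a plain compare feeding the conditional branch.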
  if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
    // freeze(icmp a, const) -> icmp (freeze a), const
    // This helps generate efficient conditional jumps.
    Instruction *CmpI = nullptr;
    if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
      CmpI = II;
    else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
      CmpI = F->getFastMathFlags().none() ? F : nullptr;

    if (CmpI && CmpI->hasOneUse()) {
      auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
      bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
                    isa<ConstantPointerNull>(Op0);
      bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
                    isa<ConstantPointerNull>(Op1);
      if (Const0 || Const1) {
        if (!Const0 || !Const1) {
          auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI->getIterator());
          F->takeName(FI);
          CmpI->setOperand(Const0 ? 1 : 0, F);
        }
        replaceAllUsesWith(FI, CmpI, FreshBBs, IsHugeFunc);
        FI->eraseFromParent();
        return true;
      }
    }
    return AnyChange;
  }

  if (tryToSinkFreeOperands(I))
    return true;

  switch (I->getOpcode()) {
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return optimizeShiftInst(cast<BinaryOperator>(I));
  case Instruction::Call:
    return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
  case Instruction::Select:
    return optimizeSelectInst(cast<SelectInst>(I));
  case Instruction::ShuffleVector:
    return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
  case Instruction::Switch:
    return optimizeSwitchInst(cast<SwitchInst>(I));
  case Instruction::ExtractElement:
    return optimizeExtractElementInst(cast<ExtractElementInst>(I));
  case Instruction::Br:
    return optimizeBranch(cast<BranchInst>(I), *TLI, FreshBBs, IsHugeFunc);
  }

  return AnyChange;
}

/// Given an OR instruction, check to see if this is a bitreverse
/// idiom. If so, insert the new intrinsic and return true.
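/// For illustration only (assumed input): the fully expanded shift/and/or
/// sequence that reverses the bits of an i32 value %x is collapsed, at its
/// root 'or', into a single call such as
///   %rev = call i32 @llvm.bitreverse.i32(i32 %x)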
bool CodeGenPrepare::makeBitReverse(Instruction &I) {
  if (!I.getType()->isIntegerTy() ||
      !TLI->isOperationLegalOrCustom(ISD::BITREVERSE,
                                     TLI->getValueType(*DL, I.getType(), true)))
    return false;

  SmallVector<Instruction *, 4> Insts;
  if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
    return false;
  Instruction *LastInst = Insts.back();
  replaceAllUsesWith(&I, LastInst, FreshBBs, IsHugeFunc);
  RecursivelyDeleteTriviallyDeadInstructions(
      &I, TLInfo, nullptr,
      [&](Value *V) { removeAllAssertingVHReferences(V); });
  return true;
}

// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) {
  SunkAddrs.clear();
  bool MadeChange = false;

  do {
    CurInstIterator = BB.begin();
    ModifiedDT = ModifyDT::NotModifyDT;
    while (CurInstIterator != BB.end()) {
      MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
      if (ModifiedDT != ModifyDT::NotModifyDT) {
        // For huge functions we tend to quickly go through the inner
        // optimization opportunities in the BB, so we go back to the BB head
        // to re-optimize each instruction instead of going back to the
        // function head.
        if (IsHugeFunc) {
          DT.reset();
          getDT(*BB.getParent());
          break;
        } else {
          return true;
        }
      }
    }
  } while (ModifiedDT == ModifyDT::ModifyInstDT);

  bool MadeBitReverse = true;
  while (MadeBitReverse) {
    MadeBitReverse = false;
    for (auto &I : reverse(BB)) {
      if (makeBitReverse(I)) {
        MadeBitReverse = MadeChange = true;
        break;
      }
    }
  }
  MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);

  return MadeChange;
}

// Some CGP optimizations may move or alter what's computed in a block. Check
// whether a dbg.value intrinsic could be pointed at a more appropriate operand.
bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
  assert(isa<DbgValueInst>(I));
  DbgValueInst &DVI = *cast<DbgValueInst>(I);

  // Does this dbg.value refer to a sunk address calculation?
  bool AnyChange = false;
  SmallDenseSet<Value *> LocationOps(DVI.location_ops().begin(),
                                     DVI.location_ops().end());
  for (Value *Location : LocationOps) {
    WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
    Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
    if (SunkAddr) {
      // Point dbg.value at locally computed address, which should give the
      // best opportunity to be accurately lowered. This update may change the
      // type of pointer being referred to; however this makes no difference
      // to debugging information, and we can't generate bitcasts that may
      // affect codegen.
      DVI.replaceVariableLocationOp(Location, SunkAddr);
      AnyChange = true;
    }
  }
  return AnyChange;
}

bool CodeGenPrepare::fixupDbgVariableRecordsOnInst(Instruction &I) {
  bool AnyChange = false;
  for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
    AnyChange |= fixupDbgVariableRecord(DVR);
  return AnyChange;
}

// FIXME: should updating debug-info really cause the "changed" flag to fire,
// which can cause a function to be reprocessed?
bool CodeGenPrepare::fixupDbgVariableRecord(DbgVariableRecord &DVR) {
  if (DVR.Type != DbgVariableRecord::LocationType::Value &&
      DVR.Type != DbgVariableRecord::LocationType::Assign)
    return false;

  // Does this DbgVariableRecord refer to a sunk address calculation?
  bool AnyChange = false;
  SmallDenseSet<Value *> LocationOps(DVR.location_ops().begin(),
                                     DVR.location_ops().end());
  for (Value *Location : LocationOps) {
    WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
    Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
    if (SunkAddr) {
      // Point dbg.value at locally computed address, which should give the
      // best opportunity to be accurately lowered. This update may change the
      // type of pointer being referred to; however this makes no difference
      // to debugging information, and we can't generate bitcasts that may
      // affect codegen.
      DVR.replaceVariableLocationOp(Location, SunkAddr);
      AnyChange = true;
    }
  }
  return AnyChange;
}

static void DbgInserterHelper(DbgValueInst *DVI, Instruction *VI) {
  DVI->removeFromParent();
  if (isa<PHINode>(VI))
    DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
  else
    DVI->insertAfter(VI);
}

static void DbgInserterHelper(DbgVariableRecord *DVR, Instruction *VI) {
  DVR->removeFromParent();
  BasicBlock *VIBB = VI->getParent();
  if (isa<PHINode>(VI))
    VIBB->insertDbgRecordBefore(DVR, VIBB->getFirstInsertionPt());
  else
    VIBB->insertDbgRecordAfter(DVR, VI);
}

// An llvm.dbg.value may be using a value before its definition, due to
// optimizations in this pass and others. Scan for such dbg.values, and rescue
// them by moving the dbg.value to immediately after the value definition.
// FIXME: Ideally this should never be necessary, and this has the potential
// to re-order dbg.value intrinsics.
bool CodeGenPrepare::placeDbgValues(Function &F) {
  bool MadeChange = false;
  DominatorTree DT(F);

  auto DbgProcessor = [&](auto *DbgItem, Instruction *Position) {
    SmallVector<Instruction *, 4> VIs;
    for (Value *V : DbgItem->location_ops())
      if (Instruction *VI = dyn_cast_or_null<Instruction>(V))
        VIs.push_back(VI);

    // This item may depend on multiple instructions, complicating any
    // potential sink. This block takes the defensive approach, opting to
    // "undef" the item if it depends on more than one instruction and any of
    // them does not dominate it.
    for (Instruction *VI : VIs) {
      if (VI->isTerminator())
        continue;

      // If VI is a phi in a block with an EHPad terminator, we can't insert
      // after it.
      if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
        continue;

      // If the defining instruction dominates the dbg.value, we do not need
      // to move the dbg.value.
      if (DT.dominates(VI, Position))
        continue;

      // If we depend on multiple instructions and any of them doesn't
      // dominate this DVI, we probably can't salvage it: moving it to
      // after any of the instructions could cause us to lose the others.
      if (VIs.size() > 1) {
        LLVM_DEBUG(
            dbgs()
            << "Unable to find valid location for Debug Value, undefing:\n"
            << *DbgItem);
        DbgItem->setKillLocation();
        break;
      }

      LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
                        << *DbgItem << ' ' << *VI);
      DbgInserterHelper(DbgItem, VI);
      MadeChange = true;
      ++NumDbgValueMoved;
    }
  };

  for (BasicBlock &BB : F) {
    for (Instruction &Insn : llvm::make_early_inc_range(BB)) {
      // Process dbg.value intrinsics.
      DbgValueInst *DVI = dyn_cast<DbgValueInst>(&Insn);
      if (DVI) {
        DbgProcessor(DVI, DVI);
        continue;
      }

      // If this isn't a dbg.value, process any DbgVariableRecord records
      // attached to this instruction.
      for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
               filterDbgVars(Insn.getDbgRecordRange()))) {
        if (DVR.Type != DbgVariableRecord::LocationType::Value)
          continue;
        DbgProcessor(&DVR, &Insn);
      }
    }
  }

  return MadeChange;
}

// Group scattered pseudo probes in a block to favor SelectionDAG. Scattered
// probes can be chained dependencies of other regular DAG nodes and block DAG
// combine optimizations.
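// For illustration only (assumed IR, made-up probe operands): probes
// interleaved with regular instructions, e.g.
//   %a = add i32 %x, %y
//   call void @llvm.pseudoprobe(i64 1234, i64 1, i32 0, i64 -1)
//   %b = mul i32 %a, %z
//   call void @llvm.pseudoprobe(i64 1234, i64 2, i32 0, i64 -1)
// are regrouped so that the probes sit together before the first regular
// instruction of the block.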
bool CodeGenPrepare::placePseudoProbes(Function &F) {
  bool MadeChange = false;
  for (auto &Block : F) {
    // Move the remaining probes to the beginning of the block.
    auto FirstInst = Block.getFirstInsertionPt();
    while (FirstInst != Block.end() && FirstInst->isDebugOrPseudoInst())
      ++FirstInst;
    BasicBlock::iterator I(FirstInst);
    I++;
    while (I != Block.end()) {
      if (auto *II = dyn_cast<PseudoProbeInst>(I++)) {
        II->moveBefore(&*FirstInst);
        MadeChange = true;
      }
    }
  }
  return MadeChange;
}

/// Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
  uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
  NewTrue = NewTrue / Scale;
  NewFalse = NewFalse / Scale;
}

/// Some targets prefer to split a conditional branch like:
/// \code
///   %0 = icmp ne i32 %a, 0
///   %1 = icmp ne i32 %b, 0
///   %or.cond = or i1 %0, %1
///   br i1 %or.cond, label %TrueBB, label %FalseBB
/// \endcode
/// into multiple branch instructions like:
/// \code
///   bb1:
///     %0 = icmp ne i32 %a, 0
///     br i1 %0, label %TrueBB, label %bb2
///   bb2:
///     %1 = icmp ne i32 %b, 0
///     br i1 %1, label %TrueBB, label %FalseBB
/// \endcode
/// This usually allows instruction selection to do even further optimizations
/// and combine the compare with the branch instruction. Currently this is
/// applied for targets which have "cheap" jump instructions.
///
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
  if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
    return false;

  bool MadeChange = false;
  for (auto &BB : F) {
    // Does this BB end with the following?
    //   %cond1 = icmp|fcmp|binary instruction ...
    //   %cond2 = icmp|fcmp|binary instruction ...
    //   %cond.or = or|and i1 %cond1, %cond2
    //   br i1 %cond.or, label %dest1, label %dest2
    Instruction *LogicOp;
    BasicBlock *TBB, *FBB;
    if (!match(BB.getTerminator(),
               m_Br(m_OneUse(m_Instruction(LogicOp)), TBB, FBB)))
      continue;

    auto *Br1 = cast<BranchInst>(BB.getTerminator());
    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
      continue;

    // The merging of mostly empty BBs can cause a degenerate branch.
    if (TBB == FBB)
      continue;

    unsigned Opc;
    Value *Cond1, *Cond2;
    if (match(LogicOp,
              m_LogicalAnd(m_OneUse(m_Value(Cond1)), m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::And;
    else if (match(LogicOp, m_LogicalOr(m_OneUse(m_Value(Cond1)),
                                        m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::Or;
    else
      continue;

    auto IsGoodCond = [](Value *Cond) {
      return match(
          Cond,
          m_CombineOr(m_Cmp(), m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
                                           m_LogicalOr(m_Value(), m_Value()))));
    };
    if (!IsGoodCond(Cond1) || !IsGoodCond(Cond2))
      continue;

    LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());

    // Create a new BB.
    auto *TmpBB =
        BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
                           BB.getParent(), BB.getNextNode());
    if (IsHugeFunc)
      FreshBBs.insert(TmpBB);

    // Update the original basic block to use the first condition directly in
    // the branch instruction and remove the no-longer-needed and/or
    // instruction.
    Br1->setCondition(Cond1);
    LogicOp->eraseFromParent();

    // Depending on the condition we have to either replace the true or the
    // false successor of the original branch instruction.
    if (Opc == Instruction::And)
      Br1->setSuccessor(0, TmpBB);
    else
      Br1->setSuccessor(1, TmpBB);

    // Fill in the new basic block.
    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
    if (auto *I = dyn_cast<Instruction>(Cond2)) {
      I->removeFromParent();
      I->insertBefore(Br2);
    }

    // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one successor's PHI nodes, because the branch now comes from
    // the newly generated BB (TmpBB). In the other successor we need to add
    // one incoming edge to the PHI nodes, because both branch instructions now
    // target the same successor. Depending on the original branch condition
    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
    // we perform the correct update for the PHI nodes.
    // This doesn't change the successor order of the just-created branch
    // instruction (or any other instruction).
    if (Opc == Instruction::Or)
      std::swap(TBB, FBB);

    // Replace the old BB with the new BB.
    TBB->replacePhiUsesWith(&BB, TmpBB);

    // Add another incoming edge from the new BB.
    for (PHINode &PN : FBB->phis()) {
      auto *Val = PN.getIncomingValueForBlock(&BB);
      PN.addIncoming(Val, TmpBB);
    }

    // Update the branch weights (from SelectionDAGBuilder::
    // FindMergedConditions).
    if (Opc == Instruction::Or) {
      // Codegen X | Y as:
      // BB1:
      //   jmp_if_X TBB
      //   jmp TmpBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //

      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
      //     = TrueProb for original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This
      // choice assumes that
      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
      // Another choice is to assume TrueProb for BB1 equals TrueProb for
      // TmpBB, but the math is more complicated.
      uint64_t TrueWeight, FalseWeight;
      if (extractBranchWeights(*Br1, TrueWeight, FalseWeight)) {
        uint64_t NewTrueWeight = TrueWeight;
        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight, NewFalseWeight,
                                                  hasBranchWeightOrigin(*Br1)));

        NewTrueWeight = TrueWeight;
        NewFalseWeight = 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    } else {
      // Codegen X & Y as:
      // BB1:
      //   jmp_if_X TmpBB
      //   jmp FBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //
      // This requires creation of TmpBB after CurBB.

      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
      //     = FalseProb for original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This
      // choice assumes that
      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
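      // For illustration only (assumed numbers): with original weights
      // A = 3 (true) and B = 1 (false), BB1 gets {2*3+1, 1} = {7, 1} and
      // TmpBB gets {2*3, 1} = {6, 1}, before being scaled by scaleWeights.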
      uint64_t TrueWeight, FalseWeight;
      if (extractBranchWeights(*Br1, TrueWeight, FalseWeight)) {
        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
        uint64_t NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        NewTrueWeight = 2 * TrueWeight;
        NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    }

    ModifiedDT = ModifyDT::ModifyBBDT;
    MadeChange = true;

    LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
               TmpBB->dump());
  }
  return MadeChange;
}
