Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
35271 views
//===----------------- LoopRotationUtils.cpp -----------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file provides utilities to convert a loop into a loop with bottom test.9//10//===----------------------------------------------------------------------===//1112#include "llvm/Transforms/Utils/LoopRotationUtils.h"13#include "llvm/ADT/Statistic.h"14#include "llvm/Analysis/AssumptionCache.h"15#include "llvm/Analysis/CodeMetrics.h"16#include "llvm/Analysis/DomTreeUpdater.h"17#include "llvm/Analysis/InstructionSimplify.h"18#include "llvm/Analysis/LoopInfo.h"19#include "llvm/Analysis/MemorySSA.h"20#include "llvm/Analysis/MemorySSAUpdater.h"21#include "llvm/Analysis/ScalarEvolution.h"22#include "llvm/Analysis/ValueTracking.h"23#include "llvm/IR/CFG.h"24#include "llvm/IR/DebugInfo.h"25#include "llvm/IR/Dominators.h"26#include "llvm/IR/IntrinsicInst.h"27#include "llvm/IR/MDBuilder.h"28#include "llvm/IR/ProfDataUtils.h"29#include "llvm/Support/CommandLine.h"30#include "llvm/Support/Debug.h"31#include "llvm/Support/raw_ostream.h"32#include "llvm/Transforms/Utils/BasicBlockUtils.h"33#include "llvm/Transforms/Utils/Cloning.h"34#include "llvm/Transforms/Utils/Local.h"35#include "llvm/Transforms/Utils/SSAUpdater.h"36#include "llvm/Transforms/Utils/ValueMapper.h"37using namespace llvm;3839#define DEBUG_TYPE "loop-rotate"4041STATISTIC(NumNotRotatedDueToHeaderSize,42"Number of loops not rotated due to the header size");43STATISTIC(NumInstrsHoisted,44"Number of instructions hoisted into loop preheader");45STATISTIC(NumInstrsDuplicated,46"Number of instructions cloned into loop preheader");47STATISTIC(NumRotated, "Number of loops rotated");4849static cl::opt<bool>50MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,51cl::desc("Allow loop rotation multiple times in order to reach "52"a better latch exit"));5354// Probability that a rotated loop has zero trip count / is never entered.55static constexpr uint32_t ZeroTripCountWeights[] = {1, 127};5657namespace {58/// A simple loop rotation transformation.59class LoopRotate {60const unsigned MaxHeaderSize;61LoopInfo *LI;62const TargetTransformInfo *TTI;63AssumptionCache *AC;64DominatorTree *DT;65ScalarEvolution *SE;66MemorySSAUpdater *MSSAU;67const SimplifyQuery &SQ;68bool RotationOnly;69bool IsUtilMode;70bool PrepareForLTO;7172public:73LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,74const TargetTransformInfo *TTI, AssumptionCache *AC,75DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,76const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,77bool PrepareForLTO)78: MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),79MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),80IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}81bool processLoop(Loop *L);8283private:84bool rotateLoop(Loop *L, bool SimplifiedLatch);85bool simplifyLoopLatch(Loop *L);86};87} // end anonymous namespace8889/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not90/// previously exist in the map, and the value was inserted.91static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) {92bool Inserted = VM.insert({K, V}).second;93assert(Inserted);94(void)Inserted;95}96/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the97/// old header into the preheader. If there were uses of the values produced by98/// these instruction that were outside of the loop, we have to insert PHI nodes99/// to merge the two values. Do this now.100static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,101BasicBlock *OrigPreheader,102ValueToValueMapTy &ValueMap,103ScalarEvolution *SE,104SmallVectorImpl<PHINode*> *InsertedPHIs) {105// Remove PHI node entries that are no longer live.106BasicBlock::iterator I, E = OrigHeader->end();107for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)108PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));109110// Now fix up users of the instructions in OrigHeader, inserting PHI nodes111// as necessary.112SSAUpdater SSA(InsertedPHIs);113for (I = OrigHeader->begin(); I != E; ++I) {114Value *OrigHeaderVal = &*I;115116// If there are no uses of the value (e.g. because it returns void), there117// is nothing to rewrite.118if (OrigHeaderVal->use_empty())119continue;120121Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);122123// The value now exits in two versions: the initial value in the preheader124// and the loop "next" value in the original header.125SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());126// Force re-computation of OrigHeaderVal, as some users now need to use the127// new PHI node.128if (SE)129SE->forgetValue(OrigHeaderVal);130SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);131SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);132133// Visit each use of the OrigHeader instruction.134for (Use &U : llvm::make_early_inc_range(OrigHeaderVal->uses())) {135// SSAUpdater can't handle a non-PHI use in the same block as an136// earlier def. We can easily handle those cases manually.137Instruction *UserInst = cast<Instruction>(U.getUser());138if (!isa<PHINode>(UserInst)) {139BasicBlock *UserBB = UserInst->getParent();140141// The original users in the OrigHeader are already using the142// original definitions.143if (UserBB == OrigHeader)144continue;145146// Users in the OrigPreHeader need to use the value to which the147// original definitions are mapped.148if (UserBB == OrigPreheader) {149U = OrigPreHeaderVal;150continue;151}152}153154// Anything else can be handled by SSAUpdater.155SSA.RewriteUse(U);156}157158// Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug159// intrinsics.160SmallVector<DbgValueInst *, 1> DbgValues;161SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;162llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords);163for (auto &DbgValue : DbgValues) {164// The original users in the OrigHeader are already using the original165// definitions.166BasicBlock *UserBB = DbgValue->getParent();167if (UserBB == OrigHeader)168continue;169170// Users in the OrigPreHeader need to use the value to which the171// original definitions are mapped and anything else can be handled by172// the SSAUpdater. To avoid adding PHINodes, check if the value is173// available in UserBB, if not substitute undef.174Value *NewVal;175if (UserBB == OrigPreheader)176NewVal = OrigPreHeaderVal;177else if (SSA.HasValueForBlock(UserBB))178NewVal = SSA.GetValueInMiddleOfBlock(UserBB);179else180NewVal = UndefValue::get(OrigHeaderVal->getType());181DbgValue->replaceVariableLocationOp(OrigHeaderVal, NewVal);182}183184// RemoveDIs: duplicate implementation for non-instruction debug-info185// storage in DbgVariableRecords.186for (DbgVariableRecord *DVR : DbgVariableRecords) {187// The original users in the OrigHeader are already using the original188// definitions.189BasicBlock *UserBB = DVR->getMarker()->getParent();190if (UserBB == OrigHeader)191continue;192193// Users in the OrigPreHeader need to use the value to which the194// original definitions are mapped and anything else can be handled by195// the SSAUpdater. To avoid adding PHINodes, check if the value is196// available in UserBB, if not substitute undef.197Value *NewVal;198if (UserBB == OrigPreheader)199NewVal = OrigPreHeaderVal;200else if (SSA.HasValueForBlock(UserBB))201NewVal = SSA.GetValueInMiddleOfBlock(UserBB);202else203NewVal = UndefValue::get(OrigHeaderVal->getType());204DVR->replaceVariableLocationOp(OrigHeaderVal, NewVal);205}206}207}208209// Assuming both header and latch are exiting, look for a phi which is only210// used outside the loop (via a LCSSA phi) in the exit from the header.211// This means that rotating the loop can remove the phi.212static bool profitableToRotateLoopExitingLatch(Loop *L) {213BasicBlock *Header = L->getHeader();214BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());215assert(BI && BI->isConditional() && "need header with conditional exit");216BasicBlock *HeaderExit = BI->getSuccessor(0);217if (L->contains(HeaderExit))218HeaderExit = BI->getSuccessor(1);219220for (auto &Phi : Header->phis()) {221// Look for uses of this phi in the loop/via exits other than the header.222if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {223return cast<Instruction>(U)->getParent() != HeaderExit;224}))225continue;226return true;227}228return false;229}230231// Check that latch exit is deoptimizing (which means - very unlikely to happen)232// and there is another exit from the loop which is non-deoptimizing.233// If we rotate latch to that exit our loop has a better chance of being fully234// canonical.235//236// It can give false positives in some rare cases.237static bool canRotateDeoptimizingLatchExit(Loop *L) {238BasicBlock *Latch = L->getLoopLatch();239assert(Latch && "need latch");240BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());241// Need normal exiting latch.242if (!BI || !BI->isConditional())243return false;244245BasicBlock *Exit = BI->getSuccessor(1);246if (L->contains(Exit))247Exit = BI->getSuccessor(0);248249// Latch exit is non-deoptimizing, no need to rotate.250if (!Exit->getPostdominatingDeoptimizeCall())251return false;252253SmallVector<BasicBlock *, 4> Exits;254L->getUniqueExitBlocks(Exits);255if (!Exits.empty()) {256// There is at least one non-deoptimizing exit.257//258// Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,259// as it can conservatively return false for deoptimizing exits with260// complex enough control flow down to deoptimize call.261//262// That means here we can report success for a case where263// all exits are deoptimizing but one of them has complex enough264// control flow (e.g. with loops).265//266// That should be a very rare case and false positives for this function267// have compile-time effect only.268return any_of(Exits, [](const BasicBlock *BB) {269return !BB->getPostdominatingDeoptimizeCall();270});271}272return false;273}274275static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,276bool HasConditionalPreHeader,277bool SuccsSwapped) {278MDNode *WeightMD = getBranchWeightMDNode(PreHeaderBI);279if (WeightMD == nullptr)280return;281282// LoopBI should currently be a clone of PreHeaderBI with the same283// metadata. But we double check to make sure we don't have a degenerate case284// where instsimplify changed the instructions.285if (WeightMD != getBranchWeightMDNode(LoopBI))286return;287288SmallVector<uint32_t, 2> Weights;289extractFromBranchWeightMD32(WeightMD, Weights);290if (Weights.size() != 2)291return;292uint32_t OrigLoopExitWeight = Weights[0];293uint32_t OrigLoopBackedgeWeight = Weights[1];294295if (SuccsSwapped)296std::swap(OrigLoopExitWeight, OrigLoopBackedgeWeight);297298// Update branch weights. Consider the following edge-counts:299//300// | |-------- |301// V V | V302// Br i1 ... | Br i1 ...303// | | | | |304// x| y| | becomes: | y0| |-----305// V V | | V V |306// Exit Loop | | Loop |307// | | | Br i1 ... |308// ----- | | | |309// x0| x1| y1 | |310// V V ----311// Exit312//313// The following must hold:314// - x == x0 + x1 # counts to "exit" must stay the same.315// - y0 == x - x0 == x1 # how often loop was entered at all.316// - y1 == y - y0 # How often loop was repeated (after first iter.).317//318// We cannot generally deduce how often we had a zero-trip count loop so we319// have to make a guess for how to distribute x among the new x0 and x1.320321uint32_t ExitWeight0; // aka x0322uint32_t ExitWeight1; // aka x1323uint32_t EnterWeight; // aka y0324uint32_t LoopBackWeight; // aka y1325if (OrigLoopExitWeight > 0 && OrigLoopBackedgeWeight > 0) {326ExitWeight0 = 0;327if (HasConditionalPreHeader) {328// Here we cannot know how many 0-trip count loops we have, so we guess:329if (OrigLoopBackedgeWeight >= OrigLoopExitWeight) {330// If the loop count is bigger than the exit count then we set331// probabilities as if 0-trip count nearly never happens.332ExitWeight0 = ZeroTripCountWeights[0];333// Scale up counts if necessary so we can match `ZeroTripCountWeights`334// for the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1` ratio`) ratio.335while (OrigLoopExitWeight < ZeroTripCountWeights[1] + ExitWeight0) {336// ... but don't overflow.337uint32_t const HighBit = uint32_t{1} << (sizeof(uint32_t) * 8 - 1);338if ((OrigLoopBackedgeWeight & HighBit) != 0 ||339(OrigLoopExitWeight & HighBit) != 0)340break;341OrigLoopBackedgeWeight <<= 1;342OrigLoopExitWeight <<= 1;343}344} else {345// If there's a higher exit-count than backedge-count then we set346// probabilities as if there are only 0-trip and 1-trip cases.347ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;348}349} else {350// Theoretically, if the loop body must be executed at least once, the351// backedge count must be not less than exit count. However the branch352// weight collected by sampling-based PGO may be not very accurate due to353// sampling. Therefore this workaround is required here to avoid underflow354// of unsigned in following update of branch weight.355if (OrigLoopExitWeight > OrigLoopBackedgeWeight)356OrigLoopBackedgeWeight = OrigLoopExitWeight;357}358assert(OrigLoopExitWeight >= ExitWeight0 && "Bad branch weight");359ExitWeight1 = OrigLoopExitWeight - ExitWeight0;360EnterWeight = ExitWeight1;361assert(OrigLoopBackedgeWeight >= EnterWeight && "Bad branch weight");362LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight;363} else if (OrigLoopExitWeight == 0) {364if (OrigLoopBackedgeWeight == 0) {365// degenerate case... keep everything zero...366ExitWeight0 = 0;367ExitWeight1 = 0;368EnterWeight = 0;369LoopBackWeight = 0;370} else {371// Special case "LoopExitWeight == 0" weights which behaves like an372// endless where we don't want loop-enttry (y0) to be the same as373// loop-exit (x1).374ExitWeight0 = 0;375ExitWeight1 = 0;376EnterWeight = 1;377LoopBackWeight = OrigLoopBackedgeWeight;378}379} else {380// loop is never entered.381assert(OrigLoopBackedgeWeight == 0 && "remaining case is backedge zero");382ExitWeight0 = 1;383ExitWeight1 = 1;384EnterWeight = 0;385LoopBackWeight = 0;386}387388const uint32_t LoopBIWeights[] = {389SuccsSwapped ? LoopBackWeight : ExitWeight1,390SuccsSwapped ? ExitWeight1 : LoopBackWeight,391};392setBranchWeights(LoopBI, LoopBIWeights, /*IsExpected=*/false);393if (HasConditionalPreHeader) {394const uint32_t PreHeaderBIWeights[] = {395SuccsSwapped ? EnterWeight : ExitWeight0,396SuccsSwapped ? ExitWeight0 : EnterWeight,397};398setBranchWeights(PreHeaderBI, PreHeaderBIWeights, /*IsExpected=*/false);399}400}401402/// Rotate loop LP. Return true if the loop is rotated.403///404/// \param SimplifiedLatch is true if the latch was just folded into the final405/// loop exit. In this case we may want to rotate even though the new latch is406/// now an exiting branch. This rotation would have happened had the latch not407/// been simplified. However, if SimplifiedLatch is false, then we avoid408/// rotating loops in which the latch exits to avoid excessive or endless409/// rotation. LoopRotate should be repeatable and converge to a canonical410/// form. This property is satisfied because simplifying the loop latch can only411/// happen once across multiple invocations of the LoopRotate pass.412///413/// If -loop-rotate-multi is enabled we can do multiple rotations in one go414/// so to reach a suitable (non-deoptimizing) exit.415bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {416// If the loop has only one block then there is not much to rotate.417if (L->getBlocks().size() == 1)418return false;419420bool Rotated = false;421do {422BasicBlock *OrigHeader = L->getHeader();423BasicBlock *OrigLatch = L->getLoopLatch();424425BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());426if (!BI || BI->isUnconditional())427return Rotated;428429// If the loop header is not one of the loop exiting blocks then430// either this loop is already rotated or it is not431// suitable for loop rotation transformations.432if (!L->isLoopExiting(OrigHeader))433return Rotated;434435// If the loop latch already contains a branch that leaves the loop then the436// loop is already rotated.437if (!OrigLatch)438return Rotated;439440// Rotate if either the loop latch does *not* exit the loop, or if the loop441// latch was just simplified. Or if we think it will be profitable.442if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&443!profitableToRotateLoopExitingLatch(L) &&444!canRotateDeoptimizingLatchExit(L))445return Rotated;446447// Check size of original header and reject loop if it is very big or we can't448// duplicate blocks inside it.449{450SmallPtrSet<const Value *, 32> EphValues;451CodeMetrics::collectEphemeralValues(L, AC, EphValues);452453CodeMetrics Metrics;454Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);455if (Metrics.notDuplicatable) {456LLVM_DEBUG(457dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"458<< " instructions: ";459L->dump());460return Rotated;461}462if (Metrics.Convergence != ConvergenceKind::None) {463LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "464"instructions: ";465L->dump());466return Rotated;467}468if (!Metrics.NumInsts.isValid()) {469LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"470" with invalid cost: ";471L->dump());472return Rotated;473}474if (Metrics.NumInsts > MaxHeaderSize) {475LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "476<< Metrics.NumInsts477<< " instructions, which is more than the threshold ("478<< MaxHeaderSize << " instructions): ";479L->dump());480++NumNotRotatedDueToHeaderSize;481return Rotated;482}483484// When preparing for LTO, avoid rotating loops with calls that could be485// inlined during the LTO stage.486if (PrepareForLTO && Metrics.NumInlineCandidates > 0)487return Rotated;488}489490// Now, this loop is suitable for rotation.491BasicBlock *OrigPreheader = L->getLoopPreheader();492493// If the loop could not be converted to canonical form, it must have an494// indirectbr in it, just give up.495if (!OrigPreheader || !L->hasDedicatedExits())496return Rotated;497498// Anything ScalarEvolution may know about this loop or the PHI nodes499// in its header will soon be invalidated. We should also invalidate500// all outer loops because insertion and deletion of blocks that happens501// during the rotation may violate invariants related to backedge taken502// infos in them.503if (SE) {504SE->forgetTopmostLoop(L);505// We may hoist some instructions out of loop. In case if they were cached506// as "loop variant" or "loop computable", these caches must be dropped.507// We also may fold basic blocks, so cached block dispositions also need508// to be dropped.509SE->forgetBlockAndLoopDispositions();510}511512LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());513if (MSSAU && VerifyMemorySSA)514MSSAU->getMemorySSA()->verifyMemorySSA();515516// Find new Loop header. NewHeader is a Header's one and only successor517// that is inside loop. Header's other successor is outside the518// loop. Otherwise loop is not suitable for rotation.519BasicBlock *Exit = BI->getSuccessor(0);520BasicBlock *NewHeader = BI->getSuccessor(1);521bool BISuccsSwapped = L->contains(Exit);522if (BISuccsSwapped)523std::swap(Exit, NewHeader);524assert(NewHeader && "Unable to determine new loop header");525assert(L->contains(NewHeader) && !L->contains(Exit) &&526"Unable to determine loop header and exit blocks");527528// This code assumes that the new header has exactly one predecessor.529// Remove any single-entry PHI nodes in it.530assert(NewHeader->getSinglePredecessor() &&531"New header doesn't have one pred!");532FoldSingleEntryPHINodes(NewHeader);533534// Begin by walking OrigHeader and populating ValueMap with an entry for535// each Instruction.536BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();537ValueToValueMapTy ValueMap, ValueMapMSSA;538539// For PHI nodes, the value available in OldPreHeader is just the540// incoming value from OldPreHeader.541for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)542InsertNewValueIntoMap(ValueMap, PN,543PN->getIncomingValueForBlock(OrigPreheader));544545// For the rest of the instructions, either hoist to the OrigPreheader if546// possible or create a clone in the OldPreHeader if not.547Instruction *LoopEntryBranch = OrigPreheader->getTerminator();548549// Record all debug intrinsics preceding LoopEntryBranch to avoid550// duplication.551using DbgIntrinsicHash =552std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;553auto makeHash = [](auto *D) -> DbgIntrinsicHash {554auto VarLocOps = D->location_ops();555return {{hash_combine_range(VarLocOps.begin(), VarLocOps.end()),556D->getVariable()},557D->getExpression()};558};559560SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;561for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) {562if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {563DbgIntrinsics.insert(makeHash(DII));564// Until RemoveDIs supports dbg.declares in DbgVariableRecord format,565// we'll need to collect DbgVariableRecords attached to any other debug566// intrinsics.567for (const DbgVariableRecord &DVR :568filterDbgVars(DII->getDbgRecordRange()))569DbgIntrinsics.insert(makeHash(&DVR));570} else {571break;572}573}574575// Build DbgVariableRecord hashes for DbgVariableRecords attached to the576// terminator, which isn't considered in the loop above.577for (const DbgVariableRecord &DVR :578filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))579DbgIntrinsics.insert(makeHash(&DVR));580581// Remember the local noalias scope declarations in the header. After the582// rotation, they must be duplicated and the scope must be cloned. This583// avoids unwanted interaction across iterations.584SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions;585for (Instruction &I : *OrigHeader)586if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))587NoAliasDeclInstructions.push_back(Decl);588589Module *M = OrigHeader->getModule();590591// Track the next DbgRecord to clone. If we have a sequence where an592// instruction is hoisted instead of being cloned:593// DbgRecord blah594// %foo = add i32 0, 0595// DbgRecord xyzzy596// %bar = call i32 @foobar()597// where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once598// attached to %foo, then when %foo his hoisted it will "fall down" onto the599// function call:600// DbgRecord blah601// DbgRecord xyzzy602// %bar = call i32 @foobar()603// causing it to appear attached to the call too.604//605// To avoid this, cloneDebugInfoFrom takes an optional "start cloning from606// here" position to account for this behaviour. We point it at any607// DbgRecords on the next instruction, here labelled xyzzy, before we hoist608// %foo. Later, we only only clone DbgRecords from that position (xyzzy)609// onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as610// a range because it gives us a natural way of testing whether611// there were DbgRecords on the next instruction before we hoisted things).612iterator_range<DbgRecord::self_iterator> NextDbgInsts =613(I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();614615while (I != E) {616Instruction *Inst = &*I++;617618// If the instruction's operands are invariant and it doesn't read or write619// memory, then it is safe to hoist. Doing this doesn't change the order of620// execution in the preheader, but does prevent the instruction from621// executing in each iteration of the loop. This means it is safe to hoist622// something that might trap, but isn't safe to hoist something that reads623// memory (without proving that the loop doesn't write).624if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&625!Inst->mayWriteToMemory() && !Inst->isTerminator() &&626!isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst) &&627// It is not safe to hoist the value of these instructions in628// coroutines, as the addresses of otherwise eligible variables (e.g.629// thread-local variables and errno) may change if the coroutine is630// resumed in a different thread.Therefore, we disable this631// optimization for correctness. However, this may block other correct632// optimizations.633// FIXME: This should be reverted once we have a better model for634// memory access in coroutines.635!Inst->getFunction()->isPresplitCoroutine()) {636637if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&638!NextDbgInsts.empty()) {639auto DbgValueRange =640LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());641RemapDbgRecordRange(M, DbgValueRange, ValueMap,642RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);643// Erase anything we've seen before.644for (DbgVariableRecord &DVR :645make_early_inc_range(filterDbgVars(DbgValueRange)))646if (DbgIntrinsics.count(makeHash(&DVR)))647DVR.eraseFromParent();648}649650NextDbgInsts = I->getDbgRecordRange();651652Inst->moveBefore(LoopEntryBranch);653654++NumInstrsHoisted;655continue;656}657658// Otherwise, create a duplicate of the instruction.659Instruction *C = Inst->clone();660C->insertBefore(LoopEntryBranch);661662++NumInstrsDuplicated;663664if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&665!NextDbgInsts.empty()) {666auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());667RemapDbgRecordRange(M, Range, ValueMap,668RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);669NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();670// Erase anything we've seen before.671for (DbgVariableRecord &DVR :672make_early_inc_range(filterDbgVars(Range)))673if (DbgIntrinsics.count(makeHash(&DVR)))674DVR.eraseFromParent();675}676677// Eagerly remap the operands of the instruction.678RemapInstruction(C, ValueMap,679RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);680681// Avoid inserting the same intrinsic twice.682if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))683if (DbgIntrinsics.count(makeHash(DII))) {684C->eraseFromParent();685continue;686}687688// With the operands remapped, see if the instruction constant folds or is689// otherwise simplifyable. This commonly occurs because the entry from PHI690// nodes allows icmps and other instructions to fold.691Value *V = simplifyInstruction(C, SQ);692if (V && LI->replacementPreservesLCSSAForm(C, V)) {693// If so, then delete the temporary instruction and stick the folded value694// in the map.695InsertNewValueIntoMap(ValueMap, Inst, V);696if (!C->mayHaveSideEffects()) {697C->eraseFromParent();698C = nullptr;699}700} else {701InsertNewValueIntoMap(ValueMap, Inst, C);702}703if (C) {704// Otherwise, stick the new instruction into the new block!705C->setName(Inst->getName());706707if (auto *II = dyn_cast<AssumeInst>(C))708AC->registerAssumption(II);709// MemorySSA cares whether the cloned instruction was inserted or not, and710// not whether it can be remapped to a simplified value.711if (MSSAU)712InsertNewValueIntoMap(ValueMapMSSA, Inst, C);713}714}715716if (!NoAliasDeclInstructions.empty()) {717// There are noalias scope declarations:718// (general):719// Original: OrigPre { OrigHeader NewHeader ... Latch }720// after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }721//722// with D: llvm.experimental.noalias.scope.decl,723// U: !noalias or !alias.scope depending on D724// ... { D U1 U2 } can transform into:725// (0) : ... { D U1 U2 } // no relevant rotation for this part726// (1) : ... D' { U1 U2 D } // D is part of OrigHeader727// (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader728//729// We now want to transform:730// (1) -> : ... D' { D U1 U2 D'' }731// (2) -> : ... D' U1' { D U2 D'' U1'' }732// D: original llvm.experimental.noalias.scope.decl733// D', U1': duplicate with replaced scopes734// D'', U1'': different duplicate with replaced scopes735// This ensures a safe fallback to 'may_alias' introduced by the rotate,736// as U1'' and U1' scopes will not be compatible wrt to the local restrict737738// Clone the llvm.experimental.noalias.decl again for the NewHeader.739BasicBlock::iterator NewHeaderInsertionPoint =740NewHeader->getFirstNonPHIIt();741for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {742LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"743<< *NAD << "\n");744Instruction *NewNAD = NAD->clone();745NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint);746}747748// Scopes must now be duplicated, once for OrigHeader and once for749// OrigPreHeader'.750{751auto &Context = NewHeader->getContext();752753SmallVector<MDNode *, 8> NoAliasDeclScopes;754for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)755NoAliasDeclScopes.push_back(NAD->getScopeList());756757LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");758cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context,759"h.rot");760LLVM_DEBUG(OrigHeader->dump());761762// Keep the compile time impact low by only adapting the inserted block763// of instructions in the OrigPreHeader. This might result in slightly764// more aliasing between these instructions and those that were already765// present, but it will be much faster when the original PreHeader is766// large.767LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");768auto *FirstDecl =769cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]);770auto *LastInst = &OrigPreheader->back();771cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst,772Context, "pre.rot");773LLVM_DEBUG(OrigPreheader->dump());774775LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");776LLVM_DEBUG(NewHeader->dump());777}778}779780// Along with all the other instructions, we just cloned OrigHeader's781// terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's782// successors by duplicating their incoming values for OrigHeader.783for (BasicBlock *SuccBB : successors(OrigHeader))784for (BasicBlock::iterator BI = SuccBB->begin();785PHINode *PN = dyn_cast<PHINode>(BI); ++BI)786PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);787788// Now that OrigPreHeader has a clone of OrigHeader's terminator, remove789// OrigPreHeader's old terminator (the original branch into the loop), and790// remove the corresponding incoming values from the PHI nodes in OrigHeader.791LoopEntryBranch->eraseFromParent();792OrigPreheader->flushTerminatorDbgRecords();793794// Update MemorySSA before the rewrite call below changes the 1:1795// instruction:cloned_instruction_or_value mapping.796if (MSSAU) {797InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);798MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,799ValueMapMSSA);800}801802SmallVector<PHINode*, 2> InsertedPHIs;803// If there were any uses of instructions in the duplicated block outside the804// loop, update them, inserting PHI nodes as required805RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,806&InsertedPHIs);807808// Attach dbg.value intrinsics to the new phis if that phi uses a value that809// previously had debug metadata attached. This keeps the debug info810// up-to-date in the loop body.811if (!InsertedPHIs.empty())812insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);813814// NewHeader is now the header of the loop.815L->moveToHeader(NewHeader);816assert(L->getHeader() == NewHeader && "Latch block is our new header");817818// Inform DT about changes to the CFG.819if (DT) {820// The OrigPreheader branches to the NewHeader and Exit now. Then, inform821// the DT about the removed edge to the OrigHeader (that got removed).822SmallVector<DominatorTree::UpdateType, 3> Updates;823Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});824Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});825Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});826827if (MSSAU) {828MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);829if (VerifyMemorySSA)830MSSAU->getMemorySSA()->verifyMemorySSA();831} else {832DT->applyUpdates(Updates);833}834}835836// At this point, we've finished our major CFG changes. As part of cloning837// the loop into the preheader we've simplified instructions and the838// duplicated conditional branch may now be branching on a constant. If it is839// branching on a constant and if that constant means that we enter the loop,840// then we fold away the cond branch to an uncond branch. This simplifies the841// loop in cases important for nested loops, and it also means we don't have842// to split as many edges.843BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());844assert(PHBI->isConditional() && "Should be clone of BI condbr!");845const Value *Cond = PHBI->getCondition();846const bool HasConditionalPreHeader =847!isa<ConstantInt>(Cond) ||848PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader;849850updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped);851852if (HasConditionalPreHeader) {853// The conditional branch can't be folded, handle the general case.854// Split edges as necessary to preserve LoopSimplify form.855856// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and857// thus is not a preheader anymore.858// Split the edge to form a real preheader.859BasicBlock *NewPH = SplitCriticalEdge(860OrigPreheader, NewHeader,861CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());862NewPH->setName(NewHeader->getName() + ".lr.ph");863864// Preserve canonical loop form, which means that 'Exit' should have only865// one predecessor. Note that Exit could be an exit block for multiple866// nested loops, causing both of the edges to now be critical and need to867// be split.868SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));869bool SplitLatchEdge = false;870for (BasicBlock *ExitPred : ExitPreds) {871// We only need to split loop exit edges.872Loop *PredLoop = LI->getLoopFor(ExitPred);873if (!PredLoop || PredLoop->contains(Exit) ||874isa<IndirectBrInst>(ExitPred->getTerminator()))875continue;876SplitLatchEdge |= L->getLoopLatch() == ExitPred;877BasicBlock *ExitSplit = SplitCriticalEdge(878ExitPred, Exit,879CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());880ExitSplit->moveBefore(Exit);881}882assert(SplitLatchEdge &&883"Despite splitting all preds, failed to split latch exit?");884(void)SplitLatchEdge;885} else {886// We can fold the conditional branch in the preheader, this makes things887// simpler. The first step is to remove the extra edge to the Exit block.888Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);889BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());890NewBI->setDebugLoc(PHBI->getDebugLoc());891PHBI->eraseFromParent();892893// With our CFG finalized, update DomTree if it is available.894if (DT) DT->deleteEdge(OrigPreheader, Exit);895896// Update MSSA too, if available.897if (MSSAU)898MSSAU->removeEdge(OrigPreheader, Exit);899}900901assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");902assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");903904if (MSSAU && VerifyMemorySSA)905MSSAU->getMemorySSA()->verifyMemorySSA();906907// Now that the CFG and DomTree are in a consistent state again, try to merge908// the OrigHeader block into OrigLatch. This will succeed if they are909// connected by an unconditional branch. This is just a cleanup so the910// emitted code isn't too gross in this common case.911DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);912BasicBlock *PredBB = OrigHeader->getUniquePredecessor();913bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);914if (DidMerge)915RemoveRedundantDbgInstrs(PredBB);916917if (MSSAU && VerifyMemorySSA)918MSSAU->getMemorySSA()->verifyMemorySSA();919920LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());921922++NumRotated;923924Rotated = true;925SimplifiedLatch = false;926927// Check that new latch is a deoptimizing exit and then repeat rotation if possible.928// Deoptimizing latch exit is not a generally typical case, so we just loop over.929// TODO: if it becomes a performance bottleneck extend rotation algorithm930// to handle multiple rotations in one go.931} while (MultiRotate && canRotateDeoptimizingLatchExit(L));932933934return true;935}936937/// Determine whether the instructions in this range may be safely and cheaply938/// speculated. This is not an important enough situation to develop complex939/// heuristics. We handle a single arithmetic instruction along with any type940/// conversions.941static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,942BasicBlock::iterator End, Loop *L) {943bool seenIncrement = false;944bool MultiExitLoop = false;945946if (!L->getExitingBlock())947MultiExitLoop = true;948949for (BasicBlock::iterator I = Begin; I != End; ++I) {950951if (!isSafeToSpeculativelyExecute(&*I))952return false;953954if (isa<DbgInfoIntrinsic>(I))955continue;956957switch (I->getOpcode()) {958default:959return false;960case Instruction::GetElementPtr:961// GEPs are cheap if all indices are constant.962if (!cast<GEPOperator>(I)->hasAllConstantIndices())963return false;964// fall-thru to increment case965[[fallthrough]];966case Instruction::Add:967case Instruction::Sub:968case Instruction::And:969case Instruction::Or:970case Instruction::Xor:971case Instruction::Shl:972case Instruction::LShr:973case Instruction::AShr: {974Value *IVOpnd =975!isa<Constant>(I->getOperand(0))976? I->getOperand(0)977: !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;978if (!IVOpnd)979return false;980981// If increment operand is used outside of the loop, this speculation982// could cause extra live range interference.983if (MultiExitLoop) {984for (User *UseI : IVOpnd->users()) {985auto *UserInst = cast<Instruction>(UseI);986if (!L->contains(UserInst))987return false;988}989}990991if (seenIncrement)992return false;993seenIncrement = true;994break;995}996case Instruction::Trunc:997case Instruction::ZExt:998case Instruction::SExt:999// ignore type conversions1000break;1001}1002}1003return true;1004}10051006/// Fold the loop tail into the loop exit by speculating the loop tail1007/// instructions. Typically, this is a single post-increment. In the case of a1008/// simple 2-block loop, hoisting the increment can be much better than1009/// duplicating the entire loop header. In the case of loops with early exits,1010/// rotation will not work anyway, but simplifyLoopLatch will put the loop in1011/// canonical form so downstream passes can handle it.1012///1013/// I don't believe this invalidates SCEV.1014bool LoopRotate::simplifyLoopLatch(Loop *L) {1015BasicBlock *Latch = L->getLoopLatch();1016if (!Latch || Latch->hasAddressTaken())1017return false;10181019BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());1020if (!Jmp || !Jmp->isUnconditional())1021return false;10221023BasicBlock *LastExit = Latch->getSinglePredecessor();1024if (!LastExit || !L->isLoopExiting(LastExit))1025return false;10261027BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());1028if (!BI)1029return false;10301031if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))1032return false;10331034LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "1035<< LastExit->getName() << "\n");10361037DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);1038MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr,1039/*PredecessorWithTwoSuccessors=*/true);10401041if (SE) {1042// Merging blocks may remove blocks reference in the block disposition cache. Clear the cache.1043SE->forgetBlockAndLoopDispositions();1044}10451046if (MSSAU && VerifyMemorySSA)1047MSSAU->getMemorySSA()->verifyMemorySSA();10481049return true;1050}10511052/// Rotate \c L, and return true if any modification was made.1053bool LoopRotate::processLoop(Loop *L) {1054// Save the loop metadata.1055MDNode *LoopMD = L->getLoopID();10561057bool SimplifiedLatch = false;10581059// Simplify the loop latch before attempting to rotate the header1060// upward. Rotation may not be needed if the loop tail can be folded into the1061// loop exit.1062if (!RotationOnly)1063SimplifiedLatch = simplifyLoopLatch(L);10641065bool MadeChange = rotateLoop(L, SimplifiedLatch);1066assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&1067"Loop latch should be exiting after loop-rotate.");10681069// Restore the loop metadata.1070// NB! We presume LoopRotation DOESN'T ADD its own metadata.1071if ((MadeChange || SimplifiedLatch) && LoopMD)1072L->setLoopID(LoopMD);10731074return MadeChange || SimplifiedLatch;1075}107610771078/// The utility to convert a loop into a loop with bottom test.1079bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,1080AssumptionCache *AC, DominatorTree *DT,1081ScalarEvolution *SE, MemorySSAUpdater *MSSAU,1082const SimplifyQuery &SQ, bool RotationOnly = true,1083unsigned Threshold = unsigned(-1),1084bool IsUtilMode = true, bool PrepareForLTO) {1085LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,1086IsUtilMode, PrepareForLTO);1087return LR.processLoop(L);1088}108910901091