Path: blob/main/contrib/llvm-project/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp

//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loops should be simplified before this analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ScaledNumber.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <list>
#include <numeric>
#include <optional>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::bfi_detail;

#define DEBUG_TYPE "block-freq"

namespace llvm {
cl::opt<bool> CheckBFIUnknownBlockQueries(
    "check-bfi-unknown-block-queries",
    cl::init(false), cl::Hidden,
    cl::desc("Check if block frequency is queried for an unknown block "
             "for debugging missed BFI updates"));

cl::opt<bool> UseIterativeBFIInference(
    "use-iterative-bfi-inference", cl::Hidden,
    cl::desc("Apply an iterative post-processing to infer correct BFI counts"));

cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock(
    "iterative-bfi-max-iterations-per-block", cl::init(1000), cl::Hidden,
    cl::desc("Iterative inference: maximum number of update iterations "
             "per block"));

cl::opt<double> IterativeBFIPrecision(
    "iterative-bfi-precision", cl::init(1e-12), cl::Hidden,
    cl::desc("Iterative inference: delta convergence precision; smaller values "
             "typically lead to better results at the cost of worse runtime"));
} // namespace llvm

ScaledNumber<uint64_t> BlockMass::toScaled() const {
  if (isFull())
    return ScaledNumber<uint64_t>(1, 0);
  return ScaledNumber<uint64_t>(getMass() + 1, -64);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); }
#endif

static char getHexDigit(int N) {
  assert(N < 16);
  if (N < 10)
    return '0' + N;
  return 'a' + N - 10;
}

raw_ostream &BlockMass::print(raw_ostream &OS) const {
  for (int Digits = 0; Digits < 16; ++Digits)
    OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
  return OS;
}

namespace {

using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
using Distribution = BlockFrequencyInfoImplBase::Distribution;
using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList;
using Scaled64 = BlockFrequencyInfoImplBase::Scaled64;
using LoopData = BlockFrequencyInfoImplBase::LoopData;
using Weight = BlockFrequencyInfoImplBase::Weight;
using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData;

/// Dithering mass distributer.
///
/// This class splits up a single mass into portions by weight, dithering to
/// spread out error. No mass is lost. The dithering precision depends on the
/// precision of the product of \a BlockMass and \a BranchProbability.
///
/// The distribution algorithm follows.
///
///  1. Initialize by saving the sum of the weights in \a RemWeight and the
///     mass to distribute in \a RemMass.
///
///  2. For each portion:
///
///      1. Construct a branch probability, P, as the portion's weight divided
///         by the current value of \a RemWeight.
///      2. Calculate the portion's mass as \a RemMass times P.
///      3. Update \a RemWeight and \a RemMass at each portion by subtracting
///         the current portion's weight and mass.
struct DitheringDistributer {
  uint32_t RemWeight;
  BlockMass RemMass;

  DitheringDistributer(Distribution &Dist, const BlockMass &Mass);

  BlockMass takeMass(uint32_t Weight);
};

} // end anonymous namespace

DitheringDistributer::DitheringDistributer(Distribution &Dist,
                                           const BlockMass &Mass) {
  Dist.normalize();
  RemWeight = Dist.Total;
  RemMass = Mass;
}

BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
  assert(Weight && "invalid weight");
  assert(Weight <= RemWeight);
  BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);

  // Decrement totals (dither).
  RemWeight -= Weight;
  RemMass -= Mass;
  return Mass;
}
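
// Worked example of the dithering scheme above (hypothetical numbers, not
// part of the original source): distributing a full mass across weights
// {3, 1} proceeds as
//
//   portion 1: P = 3/4, mass = RemMass * 3/4   (RemWeight 4 -> 1)
//   portion 2: P = 1/1, mass = all of RemMass  (RemWeight 1 -> 0)
//
// Any rounding error from earlier portions is absorbed by the last one, so
// the portions always sum to the original mass.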

void Distribution::add(const BlockNode &Node, uint64_t Amount,
                       Weight::DistType Type) {
  assert(Amount && "invalid weight of 0");
  uint64_t NewTotal = Total + Amount;

  // Check for overflow. It should be impossible to overflow twice.
  bool IsOverflow = NewTotal < Total;
  assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
  DidOverflow |= IsOverflow;

  // Update the total.
  Total = NewTotal;

  // Save the weight.
  Weights.push_back(Weight(Type, Node, Amount));
}

static void combineWeight(Weight &W, const Weight &OtherW) {
  assert(OtherW.TargetNode.isValid());
  if (!W.Amount) {
    W = OtherW;
    return;
  }
  assert(W.Type == OtherW.Type);
  assert(W.TargetNode == OtherW.TargetNode);
  assert(OtherW.Amount && "Expected non-zero weight");
  if (W.Amount > W.Amount + OtherW.Amount)
    // Saturate on overflow.
    W.Amount = UINT64_MAX;
  else
    W.Amount += OtherW.Amount;
}

static void combineWeightsBySorting(WeightList &Weights) {
  // Sort so edges to the same node are adjacent.
  llvm::sort(Weights, [](const Weight &L, const Weight &R) {
    return L.TargetNode < R.TargetNode;
  });

  // Combine adjacent edges.
  WeightList::iterator O = Weights.begin();
  for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
       ++O, (I = L)) {
    *O = *I;

    // Find the adjacent weights to the same node.
    for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
      combineWeight(*O, *L);
  }

  // Erase extra entries.
  Weights.erase(O, Weights.end());
}

static void combineWeightsByHashing(WeightList &Weights) {
  // Collect weights into a DenseMap.
  using HashTable = DenseMap<BlockNode::IndexType, Weight>;

  HashTable Combined(NextPowerOf2(2 * Weights.size()));
  for (const Weight &W : Weights)
    combineWeight(Combined[W.TargetNode.Index], W);

  // Check whether anything changed.
  if (Weights.size() == Combined.size())
    return;

  // Fill in the new weights.
  Weights.clear();
  Weights.reserve(Combined.size());
  for (const auto &I : Combined)
    Weights.push_back(I.second);
}
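
// Worked example of weight combining (hypothetical weights, not from the
// original source): the parallel edges {A: 3, B: 2, A: 5} collapse to
// {A: 8, B: 2}; amounts to the same target saturate at UINT64_MAX instead of
// wrapping. Small lists are combined by sorting; larger ones fall back to the
// hashing path chosen below so the work stays roughly linear in the number of
// edges.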

static void combineWeights(WeightList &Weights) {
  // Use a hash table for many successors to keep this linear.
  if (Weights.size() > 128) {
    combineWeightsByHashing(Weights);
    return;
  }

  combineWeightsBySorting(Weights);
}

static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
  assert(Shift >= 0);
  assert(Shift < 64);
  if (!Shift)
    return N;
  return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
}

void Distribution::normalize() {
  // Early exit for termination nodes.
  if (Weights.empty())
    return;

  // Only bother if there are multiple successors.
  if (Weights.size() > 1)
    combineWeights(Weights);

  // Early exit when combined into a single successor.
  if (Weights.size() == 1) {
    Total = 1;
    Weights.front().Amount = 1;
    return;
  }

  // Determine how much to shift right so that the total fits into 32-bits.
  //
  // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1
  // for each weight can cause a 32-bit overflow.
  int Shift = 0;
  if (DidOverflow)
    Shift = 33;
  else if (Total > UINT32_MAX)
    Shift = 33 - llvm::countl_zero(Total);

  // Early exit if nothing needs to be scaled.
  if (!Shift) {
    // If we didn't overflow then combineWeights() shouldn't have changed the
    // sum of the weights, but let's double-check.
    assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0),
                                    [](uint64_t Sum, const Weight &W) {
                                      return Sum + W.Amount;
                                    }) &&
           "Expected total to be correct");
    return;
  }

  // Recompute the total through accumulation (rather than shifting it) so that
  // it's accurate after shifting and any changes combineWeights() made above.
  Total = 0;

  // Sum the weights to each node and shift right if necessary.
  for (Weight &W : Weights) {
    // Scale down below UINT32_MAX. Since Shift is larger than necessary, we
    // can round here without concern about overflow.
    assert(W.TargetNode.isValid());
    W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
    assert(W.Amount <= UINT32_MAX);

    // Update the total.
    Total += W.Amount;
  }
  assert(Total <= UINT32_MAX);
}

void BlockFrequencyInfoImplBase::clear() {
  // Swap with a default-constructed std::vector, since std::vector<>::clear()
  // does not actually clear heap storage.
  std::vector<FrequencyData>().swap(Freqs);
  IsIrrLoopHeader.clear();
  std::vector<WorkingData>().swap(Working);
  Loops.clear();
}
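
// The swap-to-clear idiom above is what actually releases the heap storage;
// a minimal sketch (illustrative only):
//
//   std::vector<int> V(1000);
//   V.clear();                    // size() == 0, capacity typically unchanged
//   std::vector<int>().swap(V);   // capacity released as well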

/// Clear all memory not needed downstream.
///
/// Releases all memory not used downstream. In particular, saves Freqs.
static void cleanup(BlockFrequencyInfoImplBase &BFI) {
  std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
  SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader));
  BFI.clear();
  BFI.Freqs = std::move(SavedFreqs);
  BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader);
}

bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
                                           const LoopData *OuterLoop,
                                           const BlockNode &Pred,
                                           const BlockNode &Succ,
                                           uint64_t Weight) {
  if (!Weight)
    Weight = 1;

  auto isLoopHeader = [&OuterLoop](const BlockNode &Node) {
    return OuterLoop && OuterLoop->isHeader(Node);
  };

  BlockNode Resolved = Working[Succ.Index].getResolvedNode();

#ifndef NDEBUG
  auto debugSuccessor = [&](const char *Type) {
    dbgs() << " =>"
           << " [" << Type << "] weight = " << Weight;
    if (!isLoopHeader(Resolved))
      dbgs() << ", succ = " << getBlockName(Succ);
    if (Resolved != Succ)
      dbgs() << ", resolved = " << getBlockName(Resolved);
    dbgs() << "\n";
  };
  (void)debugSuccessor;
#endif

  if (isLoopHeader(Resolved)) {
    LLVM_DEBUG(debugSuccessor("backedge"));
    Dist.addBackedge(Resolved, Weight);
    return true;
  }

  if (Working[Resolved.Index].getContainingLoop() != OuterLoop) {
    LLVM_DEBUG(debugSuccessor(" exit "));
    Dist.addExit(Resolved, Weight);
    return true;
  }

  if (Resolved < Pred) {
    if (!isLoopHeader(Pred)) {
      // If OuterLoop is an irreducible loop, we can't actually handle this.
      assert((!OuterLoop || !OuterLoop->isIrreducible()) &&
             "unhandled irreducible control flow");

      // Irreducible backedge. Abort.
      LLVM_DEBUG(debugSuccessor("abort!!!"));
      return false;
    }

    // If "Pred" is a loop header, then this isn't really a backedge; rather,
    // OuterLoop must be irreducible. These false backedges can come only from
    // secondary loop headers.
    assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) &&
           "unhandled irreducible control flow");
  }

  LLVM_DEBUG(debugSuccessor(" local "));
  Dist.addLocal(Resolved, Weight);
  return true;
}

bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
    const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) {
  // Copy the exit map into Dist.
  for (const auto &I : Loop.Exits)
    if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first,
                   I.second.getMass()))
      // Irreducible backedge.
      return false;

  return true;
}

/// Compute the loop scale for a loop.
void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
  // Compute loop scale.
  LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");

  // Infinite loops need special handling. If we give the back edge an infinite
  // mass, they may saturate all the other scales in the function down to 1,
  // making all the other region temperatures look exactly the same. Choose an
  // arbitrary scale to avoid these issues.
  //
  // FIXME: An alternate way would be to select a symbolic scale which is later
  // replaced to be the maximum of all computed scales plus 1. This would
  // appropriately describe the loop as having a large scale, without skewing
  // the final frequency computation.
  const Scaled64 InfiniteLoopScale(1, 12);

  // LoopScale == 1 / ExitMass
  // ExitMass == HeadMass - BackedgeMass
  BlockMass TotalBackedgeMass;
  for (auto &Mass : Loop.BackedgeMass)
    TotalBackedgeMass += Mass;
  BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass;

  // Block scale stores the inverse of the scale. If this is an infinite loop,
  // its exit mass will be zero. In this case, use an arbitrary scale for the
  // loop scale.
  Loop.Scale =
      ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse();

  LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " ("
                    << BlockMass::getFull() << " - " << TotalBackedgeMass
                    << ")\n"
                    << " - scale = " << Loop.Scale << "\n");
}
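
// Worked example (hypothetical masses, not from the original source): if the
// backedges of a loop carry 3/4 of the full head mass, then
//
//   ExitMass  = 1 - 3/4 = 1/4
//   LoopScale = 1 / ExitMass = 4
//
// i.e. the loop body is treated as running four times per entry into the loop.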

/// Package up a loop.
void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) {
  LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n");

  // Clear the subloop exits to prevent quadratic memory usage.
  for (const BlockNode &M : Loop.Nodes) {
    if (auto *Loop = Working[M.Index].getPackagedLoop())
      Loop->Exits.clear();
    LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
  }
  Loop.IsPackaged = true;
}

#ifndef NDEBUG
static void debugAssign(const BlockFrequencyInfoImplBase &BFI,
                        const DitheringDistributer &D, const BlockNode &T,
                        const BlockMass &M, const char *Desc) {
  dbgs() << " => assign " << M << " (" << D.RemMass << ")";
  if (Desc)
    dbgs() << " [" << Desc << "]";
  if (T.isValid())
    dbgs() << " to " << BFI.getBlockName(T);
  dbgs() << "\n";
}
#endif

void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
                                                LoopData *OuterLoop,
                                                Distribution &Dist) {
  BlockMass Mass = Working[Source.Index].getMass();
  LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n");

  // Distribute mass to successors as laid out in Dist.
  DitheringDistributer D(Dist, Mass);

  for (const Weight &W : Dist.Weights) {
    // Check for a local edge (non-backedge and non-exit).
    BlockMass Taken = D.takeMass(W.Amount);
    if (W.Type == Weight::Local) {
      Working[W.TargetNode.Index].getMass() += Taken;
      LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
      continue;
    }

    // Backedges and exits only make sense if we're processing a loop.
    assert(OuterLoop && "backedge or exit outside of loop");

    // Check for a backedge.
    if (W.Type == Weight::Backedge) {
      OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken;
      LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
      continue;
    }

    // This must be an exit.
    assert(W.Type == Weight::Exit);
    OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
    LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
  }
}

static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                     const Scaled64 &Min, const Scaled64 &Max) {
  // Scale the Factor to a size that creates integers. If possible, scale
  // integers so that Max == UINT64_MAX so that they can be best differentiated.
  // It is possible that the range between min and max cannot be accurately
  // represented in a 64-bit integer without either losing precision for small
  // values (so small unequal numbers all map to 1) or saturating big numbers
  // (so unequal big numbers may map to UINT64_MAX). We choose to lose
  // precision for small numbers.
  const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
  // Users often add up multiple BlockFrequency values or multiply them with
  // things like instruction costs. Leave some room to avoid saturating
  // operations reaching UINT64_MAX too early.
  const unsigned Slack = 10;
  Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max;

  // Translate the floats to integers.
  LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
                    << ", factor = " << ScalingFactor << "\n");
  (void)Min;
  for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
    Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
    BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
    LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
                      << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
                      << ", int = " << BFI.Freqs[Index].Integer << "\n");
  }
}
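
// Worked example (hypothetical values, not from the original source): with
// 64-bit digits and Slack = 10, ScalingFactor = 2^54 / Max, so the hottest
// block maps to roughly 2^54 and colder blocks scale down proportionally,
// leaving about 10 bits of headroom before sums or cost multiplications can
// saturate a uint64_t.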

/// Unwrap a loop package.
///
/// Visits all the members of a loop, adjusting their BlockData according to
/// the loop's pseudo-node.
static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
  LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
                    << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
                    << "\n");
  Loop.Scale *= Loop.Mass.toScaled();
  Loop.IsPackaged = false;
  LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n");

  // Propagate the head scale through the loop. Since members are visited in
  // RPO, the head scale will be updated by the loop scale first, and then the
  // final head scale will be used for updating the rest of the members.
  for (const BlockNode &N : Loop.Nodes) {
    const auto &Working = BFI.Working[N.Index];
    Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
                                       : BFI.Freqs[N.Index].Scaled;
    Scaled64 New = Loop.Scale * F;
    LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => "
                      << New << "\n");
    F = New;
  }
}

void BlockFrequencyInfoImplBase::unwrapLoops() {
  // Set initial frequencies from loop-local masses.
  for (size_t Index = 0; Index < Working.size(); ++Index)
    Freqs[Index].Scaled = Working[Index].Mass.toScaled();

  for (LoopData &Loop : Loops)
    unwrapLoop(*this, Loop);
}

void BlockFrequencyInfoImplBase::finalizeMetrics() {
  // Unwrap loop packages in reverse post-order, tracking min and max
  // frequencies.
  auto Min = Scaled64::getLargest();
  auto Max = Scaled64::getZero();
  for (size_t Index = 0; Index < Working.size(); ++Index) {
    // Update min/max scale.
    Min = std::min(Min, Freqs[Index].Scaled);
    Max = std::max(Max, Freqs[Index].Scaled);
  }

  // Convert to integers.
  convertFloatingToInteger(*this, Min, Max);

  // Clean up data structures.
  cleanup(*this);

  // Print out the final stats.
  LLVM_DEBUG(dump());
}

BlockFrequency
BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
  if (!Node.isValid()) {
#ifndef NDEBUG
    if (CheckBFIUnknownBlockQueries) {
      SmallString<256> Msg;
      raw_svector_ostream OS(Msg);
      OS << "*** Detected BFI query for unknown block " << getBlockName(Node);
      report_fatal_error(OS.str());
    }
#endif
    return BlockFrequency(0);
  }
  return BlockFrequency(Freqs[Node.Index].Integer);
}

std::optional<uint64_t>
BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F,
                                                 const BlockNode &Node,
                                                 bool AllowSynthetic) const {
  return getProfileCountFromFreq(F, getBlockFreq(Node), AllowSynthetic);
}

std::optional<uint64_t> BlockFrequencyInfoImplBase::getProfileCountFromFreq(
    const Function &F, BlockFrequency Freq, bool AllowSynthetic) const {
  auto EntryCount = F.getEntryCount(AllowSynthetic);
  if (!EntryCount)
    return std::nullopt;
  // Use 128 bit APInt to do the arithmetic to avoid overflow.
  APInt BlockCount(128, EntryCount->getCount());
  APInt BlockFreq(128, Freq.getFrequency());
  APInt EntryFreq(128, getEntryFreq().getFrequency());
  BlockCount *= BlockFreq;
  // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned
  // lshr by 1 gives EntryFreq/2.
  BlockCount = (BlockCount + EntryFreq.lshr(1)).udiv(EntryFreq);
  return BlockCount.getLimitedValue();
}
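
// Worked example (hypothetical counts, not from the original source): with an
// entry count of 1000, an entry frequency of 16, and a block frequency of 8,
// the block's count is (1000 * 8 + 16/2) / 16 = 500, i.e. the block runs
// about half as often as the entry block.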

bool
BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) {
  if (!Node.isValid())
    return false;
  return IsIrrLoopHeader.test(Node.Index);
}

Scaled64
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
  if (!Node.isValid())
    return Scaled64::getZero();
  return Freqs[Node.Index].Scaled;
}

void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node,
                                              BlockFrequency Freq) {
  assert(Node.isValid() && "Expected valid node");
  assert(Node.Index < Freqs.size() && "Expected legal index");
  Freqs[Node.Index].Integer = Freq.getFrequency();
}

std::string
BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
  return {};
}

std::string
BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const {
  return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*");
}

void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) {
  Start = OuterLoop.getHeader();
  Nodes.reserve(OuterLoop.Nodes.size());
  for (auto N : OuterLoop.Nodes)
    addNode(N);
  indexNodes();
}

void IrreducibleGraph::addNodesInFunction() {
  Start = 0;
  for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
    if (!BFI.Working[Index].isPackaged())
      addNode(Index);
  indexNodes();
}

void IrreducibleGraph::indexNodes() {
  for (auto &I : Nodes)
    Lookup[I.Node.Index] = &I;
}

void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ,
                               const BFIBase::LoopData *OuterLoop) {
  if (OuterLoop && OuterLoop->isHeader(Succ))
    return;
  auto L = Lookup.find(Succ.Index);
  if (L == Lookup.end())
    return;
  IrrNode &SuccIrr = *L->second;
  Irr.Edges.push_back(&SuccIrr);
  SuccIrr.Edges.push_front(&Irr);
  ++SuccIrr.NumIn;
}

namespace llvm {

template <> struct GraphTraits<IrreducibleGraph> {
  using GraphT = bfi_detail::IrreducibleGraph;
  using NodeRef = const GraphT::IrrNode *;
  using ChildIteratorType = GraphT::IrrNode::iterator;

  static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; }
  static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); }
  static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); }
};

} // end namespace llvm
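
// Illustrative example of irreducible control flow (not from the original
// source): a cycle that can be entered at more than one block, e.g.
//
//         Entry
//         /   \
//        A <-> B
//
// has no single dominating loop header, so both A and B are treated as
// headers of one irreducible "loop" by the code below.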

/// Find extra irreducible headers.
///
/// Find entry blocks and other blocks with backedges, which exist when \c G
/// contains irreducible sub-SCCs.
static void findIrreducibleHeaders(
    const BlockFrequencyInfoImplBase &BFI,
    const IrreducibleGraph &G,
    const std::vector<const IrreducibleGraph::IrrNode *> &SCC,
    LoopData::NodeList &Headers, LoopData::NodeList &Others) {
  // Map from nodes in the SCC to whether it's an entry block.
  SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC;

  // InSCC also acts as the set of nodes in the graph. Seed it.
  for (const auto *I : SCC)
    InSCC[I] = false;

  for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) {
    auto &Irr = *I->first;
    for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) {
      if (InSCC.count(P))
        continue;

      // This is an entry block.
      I->second = true;
      Headers.push_back(Irr.Node);
      LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node)
                        << "\n");
      break;
    }
  }
  assert(Headers.size() >= 2 &&
         "Expected irreducible CFG; -loop-info is likely invalid");
  if (Headers.size() == InSCC.size()) {
    // Every block is a header.
    llvm::sort(Headers);
    return;
  }

  // Look for extra headers from irreducible sub-SCCs.
  for (const auto &I : InSCC) {
    // Entry blocks are already headers.
    if (I.second)
      continue;

    auto &Irr = *I.first;
    for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) {
      // Skip forward edges.
      if (P->Node < Irr.Node)
        continue;

      // Skip predecessors from entry blocks. These can have inverted
      // ordering.
      if (InSCC.lookup(P))
        continue;

      // Store the extra header.
      Headers.push_back(Irr.Node);
      LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node)
                        << "\n");
      break;
    }
    if (Headers.back() == Irr.Node)
      // Added this as a header.
      continue;

    // This is not a header.
    Others.push_back(Irr.Node);
    LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n");
  }
  llvm::sort(Headers);
  llvm::sort(Others);
}

static void createIrreducibleLoop(
    BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G,
    LoopData *OuterLoop, std::list<LoopData>::iterator Insert,
    const std::vector<const IrreducibleGraph::IrrNode *> &SCC) {
  // Translate the SCC into RPO.
  LLVM_DEBUG(dbgs() << " - found-scc\n");

  LoopData::NodeList Headers;
  LoopData::NodeList Others;
  findIrreducibleHeaders(BFI, G, SCC, Headers, Others);

  auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(),
                                Headers.end(), Others.begin(), Others.end());

  // Update loop hierarchy.
  for (const auto &N : Loop->Nodes)
    if (BFI.Working[N.Index].isLoopHeader())
      BFI.Working[N.Index].Loop->Parent = &*Loop;
    else
      BFI.Working[N.Index].Loop = &*Loop;
}

iterator_range<std::list<LoopData>::iterator>
BlockFrequencyInfoImplBase::analyzeIrreducible(
    const IrreducibleGraph &G, LoopData *OuterLoop,
    std::list<LoopData>::iterator Insert) {
  assert((OuterLoop == nullptr) == (Insert == Loops.begin()));
  auto Prev = OuterLoop ? std::prev(Insert) : Loops.end();

  for (auto I = scc_begin(G); !I.isAtEnd(); ++I) {
    if (I->size() < 2)
      continue;

    // Translate the SCC into RPO.
    createIrreducibleLoop(*this, G, OuterLoop, Insert, *I);
  }

  if (OuterLoop)
    return make_range(std::next(Prev), Insert);
  return make_range(Loops.begin(), Insert);
}

void
BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) {
  OuterLoop.Exits.clear();
  for (auto &Mass : OuterLoop.BackedgeMass)
    Mass = BlockMass::getEmpty();
  auto O = OuterLoop.Nodes.begin() + 1;
  for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I)
    if (!Working[I->Index].isPackaged())
      *O++ = *I;
  OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end());
}
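
// Worked example for the header-mass adjustment below (hypothetical masses,
// not from the original source): an irreducible loop with two headers whose
// backedge masses are 0.75 and 0.25 of full gets the full loop mass split
// 3:1, so the headers start the next pass with masses 3/4 and 1/4 rather
// than one header carrying the full mass alone.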

void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
  assert(Loop.isIrreducible() && "this only makes sense on irreducible loops");

  // Since the loop has more than one header block, the mass flowing back into
  // each header will be different. Adjust the mass in each header loop to
  // reflect the masses flowing through back edges.
  //
  // To do this, we distribute the initial mass using the backedge masses
  // as weights for the distribution.
  BlockMass LoopMass = BlockMass::getFull();
  Distribution Dist;

  LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n");
  for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
    auto &HeaderNode = Loop.Nodes[H];
    auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)];
    LLVM_DEBUG(dbgs() << " - Add back edge mass for node "
                      << getBlockName(HeaderNode) << ": " << BackedgeMass
                      << "\n");
    if (BackedgeMass.getMass() > 0)
      Dist.addLocal(HeaderNode, BackedgeMass.getMass());
    else
      LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n");
  }

  DitheringDistributer D(Dist, LoopMass);

  LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass
                    << " to headers using above weights\n");
  for (const Weight &W : Dist.Weights) {
    BlockMass Taken = D.takeMass(W.Amount);
    assert(W.Type == Weight::Local && "all weights should be local");
    Working[W.TargetNode.Index].getMass() = Taken;
    LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
  }
}

void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) {
  BlockMass LoopMass = BlockMass::getFull();
  DitheringDistributer D(Dist, LoopMass);
  for (const Weight &W : Dist.Weights) {
    BlockMass Taken = D.takeMass(W.Amount);
    assert(W.Type == Weight::Local && "all weights should be local");
    Working[W.TargetNode.Index].getMass() = Taken;
    LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
  }
}