Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp
35271 views
//===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// The file implements "cache-aware" layout algorithms of basic blocks and9// functions in a binary.10//11// The algorithm tries to find a layout of nodes (basic blocks) of a given CFG12// optimizing jump locality and thus processor I-cache utilization. This is13// achieved via increasing the number of fall-through jumps and co-locating14// frequently executed nodes together. The name follows the underlying15// optimization problem, Extended-TSP, which is a generalization of classical16// (maximum) Traveling Salesmen Problem.17//18// The algorithm is a greedy heuristic that works with chains (ordered lists)19// of basic blocks. Initially all chains are isolated basic blocks. On every20// iteration, we pick a pair of chains whose merging yields the biggest increase21// in the ExtTSP score, which models how i-cache "friendly" a specific chain is.22// A pair of chains giving the maximum gain is merged into a new chain. The23// procedure stops when there is only one chain left, or when merging does not24// increase ExtTSP. In the latter case, the remaining chains are sorted by25// density in the decreasing order.26//27// An important aspect is the way two chains are merged. Unlike earlier28// algorithms (e.g., based on the approach of Pettis-Hansen), two29// chains, X and Y, are first split into three, X1, X2, and Y. Then we30// consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,31// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.32// This improves the quality of the final result (the search space is larger)33// while keeping the implementation sufficiently fast.34//35// Reference:36// * A. Newell and S. Pupyrev, Improved Basic Block Reordering,37// IEEE Transactions on Computers, 202038// https://arxiv.org/abs/1809.0467639//40//===----------------------------------------------------------------------===//4142#include "llvm/Transforms/Utils/CodeLayout.h"43#include "llvm/Support/CommandLine.h"44#include "llvm/Support/Debug.h"4546#include <cmath>47#include <set>4849using namespace llvm;50using namespace llvm::codelayout;5152#define DEBUG_TYPE "code-layout"5354namespace llvm {55cl::opt<bool> EnableExtTspBlockPlacement(56"enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),57cl::desc("Enable machine block placement based on the ext-tsp model, "58"optimizing I-cache utilization."));5960cl::opt<bool> ApplyExtTspWithoutProfile(61"ext-tsp-apply-without-profile",62cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),63cl::init(true), cl::Hidden);64} // namespace llvm6566// Algorithm-specific params for Ext-TSP. The values are tuned for the best67// performance of large-scale front-end bound binaries.68static cl::opt<double> ForwardWeightCond(69"ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),70cl::desc("The weight of conditional forward jumps for ExtTSP value"));7172static cl::opt<double> ForwardWeightUncond(73"ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),74cl::desc("The weight of unconditional forward jumps for ExtTSP value"));7576static cl::opt<double> BackwardWeightCond(77"ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),78cl::desc("The weight of conditional backward jumps for ExtTSP value"));7980static cl::opt<double> BackwardWeightUncond(81"ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),82cl::desc("The weight of unconditional backward jumps for ExtTSP value"));8384static cl::opt<double> FallthroughWeightCond(85"ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),86cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));8788static cl::opt<double> FallthroughWeightUncond(89"ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),90cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));9192static cl::opt<unsigned> ForwardDistance(93"ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),94cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));9596static cl::opt<unsigned> BackwardDistance(97"ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),98cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));99100// The maximum size of a chain created by the algorithm. The size is bounded101// so that the algorithm can efficiently process extremely large instances.102static cl::opt<unsigned>103MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),104cl::desc("The maximum size of a chain to create"));105106// The maximum size of a chain for splitting. Larger values of the threshold107// may yield better quality at the cost of worsen run-time.108static cl::opt<unsigned> ChainSplitThreshold(109"ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),110cl::desc("The maximum size of a chain to apply splitting"));111112// The maximum ratio between densities of two chains for merging.113static cl::opt<double> MaxMergeDensityRatio(114"ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),115cl::desc("The maximum ratio between densities of two chains for merging"));116117// Algorithm-specific options for CDSort.118static cl::opt<unsigned> CacheEntries("cdsort-cache-entries", cl::ReallyHidden,119cl::desc("The size of the cache"));120121static cl::opt<unsigned> CacheSize("cdsort-cache-size", cl::ReallyHidden,122cl::desc("The size of a line in the cache"));123124static cl::opt<unsigned>125CDMaxChainSize("cdsort-max-chain-size", cl::ReallyHidden,126cl::desc("The maximum size of a chain to create"));127128static cl::opt<double> DistancePower(129"cdsort-distance-power", cl::ReallyHidden,130cl::desc("The power exponent for the distance-based locality"));131132static cl::opt<double> FrequencyScale(133"cdsort-frequency-scale", cl::ReallyHidden,134cl::desc("The scale factor for the frequency-based locality"));135136namespace {137138// Epsilon for comparison of doubles.139constexpr double EPS = 1e-8;140141// Compute the Ext-TSP score for a given jump.142double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,143double Weight) {144if (JumpDist > JumpMaxDist)145return 0;146double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;147return Weight * Prob * Count;148}149150// Compute the Ext-TSP score for a jump between a given pair of blocks,151// using their sizes, (estimated) addresses and the jump execution count.152double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,153uint64_t Count, bool IsConditional) {154// Fallthrough155if (SrcAddr + SrcSize == DstAddr) {156return jumpExtTSPScore(0, 1, Count,157IsConditional ? FallthroughWeightCond158: FallthroughWeightUncond);159}160// Forward161if (SrcAddr + SrcSize < DstAddr) {162const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);163return jumpExtTSPScore(Dist, ForwardDistance, Count,164IsConditional ? ForwardWeightCond165: ForwardWeightUncond);166}167// Backward168const uint64_t Dist = SrcAddr + SrcSize - DstAddr;169return jumpExtTSPScore(Dist, BackwardDistance, Count,170IsConditional ? BackwardWeightCond171: BackwardWeightUncond);172}173174/// A type of merging two chains, X and Y. The former chain is split into175/// X1 and X2 and then concatenated with Y in the order specified by the type.176enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };177178/// The gain of merging two chains, that is, the Ext-TSP score of the merge179/// together with the corresponding merge 'type' and 'offset'.180struct MergeGainT {181explicit MergeGainT() = default;182explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)183: Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}184185double score() const { return Score; }186187size_t mergeOffset() const { return MergeOffset; }188189MergeTypeT mergeType() const { return MergeType; }190191void setMergeType(MergeTypeT Ty) { MergeType = Ty; }192193// Returns 'true' iff Other is preferred over this.194bool operator<(const MergeGainT &Other) const {195return (Other.Score > EPS && Other.Score > Score + EPS);196}197198// Update the current gain if Other is preferred over this.199void updateIfLessThan(const MergeGainT &Other) {200if (*this < Other)201*this = Other;202}203204private:205double Score{-1.0};206size_t MergeOffset{0};207MergeTypeT MergeType{MergeTypeT::X_Y};208};209210struct JumpT;211struct ChainT;212struct ChainEdge;213214/// A node in the graph, typically corresponding to a basic block in the CFG or215/// a function in the call graph.216struct NodeT {217NodeT(const NodeT &) = delete;218NodeT(NodeT &&) = default;219NodeT &operator=(const NodeT &) = delete;220NodeT &operator=(NodeT &&) = default;221222explicit NodeT(size_t Index, uint64_t Size, uint64_t Count)223: Index(Index), Size(Size), ExecutionCount(Count) {}224225bool isEntry() const { return Index == 0; }226227// Check if Other is a successor of the node.228bool isSuccessor(const NodeT *Other) const;229230// The total execution count of outgoing jumps.231uint64_t outCount() const;232233// The total execution count of incoming jumps.234uint64_t inCount() const;235236// The original index of the node in graph.237size_t Index{0};238// The index of the node in the current chain.239size_t CurIndex{0};240// The size of the node in the binary.241uint64_t Size{0};242// The execution count of the node in the profile data.243uint64_t ExecutionCount{0};244// The current chain of the node.245ChainT *CurChain{nullptr};246// The offset of the node in the current chain.247mutable uint64_t EstimatedAddr{0};248// Forced successor of the node in the graph.249NodeT *ForcedSucc{nullptr};250// Forced predecessor of the node in the graph.251NodeT *ForcedPred{nullptr};252// Outgoing jumps from the node.253std::vector<JumpT *> OutJumps;254// Incoming jumps to the node.255std::vector<JumpT *> InJumps;256};257258/// An arc in the graph, typically corresponding to a jump between two nodes.259struct JumpT {260JumpT(const JumpT &) = delete;261JumpT(JumpT &&) = default;262JumpT &operator=(const JumpT &) = delete;263JumpT &operator=(JumpT &&) = default;264265explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)266: Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}267268// Source node of the jump.269NodeT *Source;270// Target node of the jump.271NodeT *Target;272// Execution count of the arc in the profile data.273uint64_t ExecutionCount{0};274// Whether the jump corresponds to a conditional branch.275bool IsConditional{false};276// The offset of the jump from the source node.277uint64_t Offset{0};278};279280/// A chain (ordered sequence) of nodes in the graph.281struct ChainT {282ChainT(const ChainT &) = delete;283ChainT(ChainT &&) = default;284ChainT &operator=(const ChainT &) = delete;285ChainT &operator=(ChainT &&) = default;286287explicit ChainT(uint64_t Id, NodeT *Node)288: Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),289Nodes(1, Node) {}290291size_t numBlocks() const { return Nodes.size(); }292293double density() const { return ExecutionCount / Size; }294295bool isEntry() const { return Nodes[0]->Index == 0; }296297bool isCold() const {298for (NodeT *Node : Nodes) {299if (Node->ExecutionCount > 0)300return false;301}302return true;303}304305ChainEdge *getEdge(ChainT *Other) const {306for (const auto &[Chain, ChainEdge] : Edges) {307if (Chain == Other)308return ChainEdge;309}310return nullptr;311}312313void removeEdge(ChainT *Other) {314auto It = Edges.begin();315while (It != Edges.end()) {316if (It->first == Other) {317Edges.erase(It);318return;319}320It++;321}322}323324void addEdge(ChainT *Other, ChainEdge *Edge) {325Edges.push_back(std::make_pair(Other, Edge));326}327328void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) {329Nodes = std::move(MergedBlocks);330// Update the chain's data.331ExecutionCount += Other->ExecutionCount;332Size += Other->Size;333Id = Nodes[0]->Index;334// Update the node's data.335for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {336Nodes[Idx]->CurChain = this;337Nodes[Idx]->CurIndex = Idx;338}339}340341void mergeEdges(ChainT *Other);342343void clear() {344Nodes.clear();345Nodes.shrink_to_fit();346Edges.clear();347Edges.shrink_to_fit();348}349350// Unique chain identifier.351uint64_t Id;352// Cached ext-tsp score for the chain.353double Score{0};354// The total execution count of the chain. Since the execution count of355// a basic block is uint64_t, using doubles here to avoid overflow.356double ExecutionCount{0};357// The total size of the chain.358uint64_t Size{0};359// Nodes of the chain.360std::vector<NodeT *> Nodes;361// Adjacent chains and corresponding edges (lists of jumps).362std::vector<std::pair<ChainT *, ChainEdge *>> Edges;363};364365/// An edge in the graph representing jumps between two chains.366/// When nodes are merged into chains, the edges are combined too so that367/// there is always at most one edge between a pair of chains.368struct ChainEdge {369ChainEdge(const ChainEdge &) = delete;370ChainEdge(ChainEdge &&) = default;371ChainEdge &operator=(const ChainEdge &) = delete;372ChainEdge &operator=(ChainEdge &&) = delete;373374explicit ChainEdge(JumpT *Jump)375: SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),376Jumps(1, Jump) {}377378ChainT *srcChain() const { return SrcChain; }379380ChainT *dstChain() const { return DstChain; }381382bool isSelfEdge() const { return SrcChain == DstChain; }383384const std::vector<JumpT *> &jumps() const { return Jumps; }385386void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }387388void moveJumps(ChainEdge *Other) {389Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());390Other->Jumps.clear();391Other->Jumps.shrink_to_fit();392}393394void changeEndpoint(ChainT *From, ChainT *To) {395if (From == SrcChain)396SrcChain = To;397if (From == DstChain)398DstChain = To;399}400401bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {402return Src == SrcChain ? CacheValidForward : CacheValidBackward;403}404405MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {406return Src == SrcChain ? CachedGainForward : CachedGainBackward;407}408409void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {410if (Src == SrcChain) {411CachedGainForward = MergeGain;412CacheValidForward = true;413} else {414CachedGainBackward = MergeGain;415CacheValidBackward = true;416}417}418419void invalidateCache() {420CacheValidForward = false;421CacheValidBackward = false;422}423424void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }425426MergeGainT getMergeGain() const { return CachedGain; }427428double gain() const { return CachedGain.score(); }429430private:431// Source chain.432ChainT *SrcChain{nullptr};433// Destination chain.434ChainT *DstChain{nullptr};435// Original jumps in the binary with corresponding execution counts.436std::vector<JumpT *> Jumps;437// Cached gain value for merging the pair of chains.438MergeGainT CachedGain;439440// Cached gain values for merging the pair of chains. Since the gain of441// merging (Src, Dst) and (Dst, Src) might be different, we store both values442// here and a flag indicating which of the options results in a higher gain.443// Cached gain values.444MergeGainT CachedGainForward;445MergeGainT CachedGainBackward;446// Whether the cached value must be recomputed.447bool CacheValidForward{false};448bool CacheValidBackward{false};449};450451bool NodeT::isSuccessor(const NodeT *Other) const {452for (JumpT *Jump : OutJumps)453if (Jump->Target == Other)454return true;455return false;456}457458uint64_t NodeT::outCount() const {459uint64_t Count = 0;460for (JumpT *Jump : OutJumps)461Count += Jump->ExecutionCount;462return Count;463}464465uint64_t NodeT::inCount() const {466uint64_t Count = 0;467for (JumpT *Jump : InJumps)468Count += Jump->ExecutionCount;469return Count;470}471472void ChainT::mergeEdges(ChainT *Other) {473// Update edges adjacent to chain Other.474for (const auto &[DstChain, DstEdge] : Other->Edges) {475ChainT *TargetChain = DstChain == Other ? this : DstChain;476ChainEdge *CurEdge = getEdge(TargetChain);477if (CurEdge == nullptr) {478DstEdge->changeEndpoint(Other, this);479this->addEdge(TargetChain, DstEdge);480if (DstChain != this && DstChain != Other)481DstChain->addEdge(this, DstEdge);482} else {483CurEdge->moveJumps(DstEdge);484}485// Cleanup leftover edge.486if (DstChain != Other)487DstChain->removeEdge(Other);488}489}490491using NodeIter = std::vector<NodeT *>::const_iterator;492static std::vector<NodeT *> EmptyList;493494/// A wrapper around three concatenated vectors (chains) of nodes; it is used495/// to avoid extra instantiation of the vectors.496struct MergedNodesT {497MergedNodesT(NodeIter Begin1, NodeIter End1,498NodeIter Begin2 = EmptyList.begin(),499NodeIter End2 = EmptyList.end(),500NodeIter Begin3 = EmptyList.begin(),501NodeIter End3 = EmptyList.end())502: Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),503End3(End3) {}504505template <typename F> void forEach(const F &Func) const {506for (auto It = Begin1; It != End1; It++)507Func(*It);508for (auto It = Begin2; It != End2; It++)509Func(*It);510for (auto It = Begin3; It != End3; It++)511Func(*It);512}513514std::vector<NodeT *> getNodes() const {515std::vector<NodeT *> Result;516Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +517std::distance(Begin3, End3));518Result.insert(Result.end(), Begin1, End1);519Result.insert(Result.end(), Begin2, End2);520Result.insert(Result.end(), Begin3, End3);521return Result;522}523524const NodeT *getFirstNode() const { return *Begin1; }525526private:527NodeIter Begin1;528NodeIter End1;529NodeIter Begin2;530NodeIter End2;531NodeIter Begin3;532NodeIter End3;533};534535/// A wrapper around two concatenated vectors (chains) of jumps.536struct MergedJumpsT {537MergedJumpsT(const std::vector<JumpT *> *Jumps1,538const std::vector<JumpT *> *Jumps2 = nullptr) {539assert(!Jumps1->empty() && "cannot merge empty jump list");540JumpArray[0] = Jumps1;541JumpArray[1] = Jumps2;542}543544template <typename F> void forEach(const F &Func) const {545for (auto Jumps : JumpArray)546if (Jumps != nullptr)547for (JumpT *Jump : *Jumps)548Func(Jump);549}550551private:552std::array<const std::vector<JumpT *> *, 2> JumpArray{nullptr, nullptr};553};554555/// Merge two chains of nodes respecting a given 'type' and 'offset'.556///557/// If MergeType == 0, then the result is a concatenation of two chains.558/// Otherwise, the first chain is cut into two sub-chains at the offset,559/// and merged using all possible ways of concatenating three chains.560MergedNodesT mergeNodes(const std::vector<NodeT *> &X,561const std::vector<NodeT *> &Y, size_t MergeOffset,562MergeTypeT MergeType) {563// Split the first chain, X, into X1 and X2.564NodeIter BeginX1 = X.begin();565NodeIter EndX1 = X.begin() + MergeOffset;566NodeIter BeginX2 = X.begin() + MergeOffset;567NodeIter EndX2 = X.end();568NodeIter BeginY = Y.begin();569NodeIter EndY = Y.end();570571// Construct a new chain from the three existing ones.572switch (MergeType) {573case MergeTypeT::X_Y:574return MergedNodesT(BeginX1, EndX2, BeginY, EndY);575case MergeTypeT::Y_X:576return MergedNodesT(BeginY, EndY, BeginX1, EndX2);577case MergeTypeT::X1_Y_X2:578return MergedNodesT(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);579case MergeTypeT::Y_X2_X1:580return MergedNodesT(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);581case MergeTypeT::X2_X1_Y:582return MergedNodesT(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);583}584llvm_unreachable("unexpected chain merge type");585}586587/// The implementation of the ExtTSP algorithm.588class ExtTSPImpl {589public:590ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts,591ArrayRef<EdgeCount> EdgeCounts)592: NumNodes(NodeSizes.size()) {593initialize(NodeSizes, NodeCounts, EdgeCounts);594}595596/// Run the algorithm and return an optimized ordering of nodes.597std::vector<uint64_t> run() {598// Pass 1: Merge nodes with their mutually forced successors599mergeForcedPairs();600601// Pass 2: Merge pairs of chains while improving the ExtTSP objective602mergeChainPairs();603604// Pass 3: Merge cold nodes to reduce code size605mergeColdChains();606607// Collect nodes from all chains608return concatChains();609}610611private:612/// Initialize the algorithm's data structures.613void initialize(const ArrayRef<uint64_t> &NodeSizes,614const ArrayRef<uint64_t> &NodeCounts,615const ArrayRef<EdgeCount> &EdgeCounts) {616// Initialize nodes.617AllNodes.reserve(NumNodes);618for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {619uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);620uint64_t ExecutionCount = NodeCounts[Idx];621// The execution count of the entry node is set to at least one.622if (Idx == 0 && ExecutionCount == 0)623ExecutionCount = 1;624AllNodes.emplace_back(Idx, Size, ExecutionCount);625}626627// Initialize jumps between the nodes.628SuccNodes.resize(NumNodes);629PredNodes.resize(NumNodes);630std::vector<uint64_t> OutDegree(NumNodes, 0);631AllJumps.reserve(EdgeCounts.size());632for (auto Edge : EdgeCounts) {633++OutDegree[Edge.src];634// Ignore self-edges.635if (Edge.src == Edge.dst)636continue;637638SuccNodes[Edge.src].push_back(Edge.dst);639PredNodes[Edge.dst].push_back(Edge.src);640if (Edge.count > 0) {641NodeT &PredNode = AllNodes[Edge.src];642NodeT &SuccNode = AllNodes[Edge.dst];643AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count);644SuccNode.InJumps.push_back(&AllJumps.back());645PredNode.OutJumps.push_back(&AllJumps.back());646// Adjust execution counts.647PredNode.ExecutionCount = std::max(PredNode.ExecutionCount, Edge.count);648SuccNode.ExecutionCount = std::max(SuccNode.ExecutionCount, Edge.count);649}650}651for (JumpT &Jump : AllJumps) {652assert(OutDegree[Jump.Source->Index] > 0 &&653"incorrectly computed out-degree of the block");654Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;655}656657// Initialize chains.658AllChains.reserve(NumNodes);659HotChains.reserve(NumNodes);660for (NodeT &Node : AllNodes) {661// Create a chain.662AllChains.emplace_back(Node.Index, &Node);663Node.CurChain = &AllChains.back();664if (Node.ExecutionCount > 0)665HotChains.push_back(&AllChains.back());666}667668// Initialize chain edges.669AllEdges.reserve(AllJumps.size());670for (NodeT &PredNode : AllNodes) {671for (JumpT *Jump : PredNode.OutJumps) {672assert(Jump->ExecutionCount > 0 && "incorrectly initialized jump");673NodeT *SuccNode = Jump->Target;674ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);675// This edge is already present in the graph.676if (CurEdge != nullptr) {677assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);678CurEdge->appendJump(Jump);679continue;680}681// This is a new edge.682AllEdges.emplace_back(Jump);683PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());684SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());685}686}687}688689/// For a pair of nodes, A and B, node B is the forced successor of A,690/// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps691/// to B are from A. Such nodes should be adjacent in the optimal ordering;692/// the method finds and merges such pairs of nodes.693void mergeForcedPairs() {694// Find forced pairs of blocks.695for (NodeT &Node : AllNodes) {696if (SuccNodes[Node.Index].size() == 1 &&697PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&698SuccNodes[Node.Index][0] != 0) {699size_t SuccIndex = SuccNodes[Node.Index][0];700Node.ForcedSucc = &AllNodes[SuccIndex];701AllNodes[SuccIndex].ForcedPred = &Node;702}703}704705// There might be 'cycles' in the forced dependencies, since profile706// data isn't 100% accurate. Typically this is observed in loops, when the707// loop edges are the hottest successors for the basic blocks of the loop.708// Break the cycles by choosing the node with the smallest index as the709// head. This helps to keep the original order of the loops, which likely710// have already been rotated in the optimized manner.711for (NodeT &Node : AllNodes) {712if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)713continue;714715NodeT *SuccNode = Node.ForcedSucc;716while (SuccNode != nullptr && SuccNode != &Node) {717SuccNode = SuccNode->ForcedSucc;718}719if (SuccNode == nullptr)720continue;721// Break the cycle.722AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;723Node.ForcedPred = nullptr;724}725726// Merge nodes with their fallthrough successors.727for (NodeT &Node : AllNodes) {728if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {729const NodeT *CurBlock = &Node;730while (CurBlock->ForcedSucc != nullptr) {731const NodeT *NextBlock = CurBlock->ForcedSucc;732mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);733CurBlock = NextBlock;734}735}736}737}738739/// Merge pairs of chains while improving the ExtTSP objective.740void mergeChainPairs() {741/// Deterministically compare pairs of chains.742auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,743const ChainT *A2, const ChainT *B2) {744return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);745};746747while (HotChains.size() > 1) {748ChainT *BestChainPred = nullptr;749ChainT *BestChainSucc = nullptr;750MergeGainT BestGain;751// Iterate over all pairs of chains.752for (ChainT *ChainPred : HotChains) {753// Get candidates for merging with the current chain.754for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {755// Ignore loop edges.756if (Edge->isSelfEdge())757continue;758// Skip the merge if the combined chain violates the maximum specified759// size.760if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)761continue;762// Don't merge the chains if they have vastly different densities.763// Skip the merge if the ratio between the densities exceeds764// MaxMergeDensityRatio. Smaller values of the option result in fewer765// merges, and hence, more chains.766const double ChainPredDensity = ChainPred->density();767const double ChainSuccDensity = ChainSucc->density();768assert(ChainPredDensity > 0.0 && ChainSuccDensity > 0.0 &&769"incorrectly computed chain densities");770auto [MinDensity, MaxDensity] =771std::minmax(ChainPredDensity, ChainSuccDensity);772const double Ratio = MaxDensity / MinDensity;773if (Ratio > MaxMergeDensityRatio)774continue;775776// Compute the gain of merging the two chains.777MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);778if (CurGain.score() <= EPS)779continue;780781if (BestGain < CurGain ||782(std::abs(CurGain.score() - BestGain.score()) < EPS &&783compareChainPairs(ChainPred, ChainSucc, BestChainPred,784BestChainSucc))) {785BestGain = CurGain;786BestChainPred = ChainPred;787BestChainSucc = ChainSucc;788}789}790}791792// Stop merging when there is no improvement.793if (BestGain.score() <= EPS)794break;795796// Merge the best pair of chains.797mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),798BestGain.mergeType());799}800}801802/// Merge remaining nodes into chains w/o taking jump counts into803/// consideration. This allows to maintain the original node order in the804/// absence of profile data.805void mergeColdChains() {806for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {807// Iterating in reverse order to make sure original fallthrough jumps are808// merged first; this might be beneficial for code size.809size_t NumSuccs = SuccNodes[SrcBB].size();810for (size_t Idx = 0; Idx < NumSuccs; Idx++) {811size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];812ChainT *SrcChain = AllNodes[SrcBB].CurChain;813ChainT *DstChain = AllNodes[DstBB].CurChain;814if (SrcChain != DstChain && !DstChain->isEntry() &&815SrcChain->Nodes.back()->Index == SrcBB &&816DstChain->Nodes.front()->Index == DstBB &&817SrcChain->isCold() == DstChain->isCold()) {818mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);819}820}821}822}823824/// Compute the Ext-TSP score for a given node order and a list of jumps.825double extTSPScore(const MergedNodesT &Nodes,826const MergedJumpsT &Jumps) const {827uint64_t CurAddr = 0;828Nodes.forEach([&](const NodeT *Node) {829Node->EstimatedAddr = CurAddr;830CurAddr += Node->Size;831});832833double Score = 0;834Jumps.forEach([&](const JumpT *Jump) {835const NodeT *SrcBlock = Jump->Source;836const NodeT *DstBlock = Jump->Target;837Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,838DstBlock->EstimatedAddr, Jump->ExecutionCount,839Jump->IsConditional);840});841return Score;842}843844/// Compute the gain of merging two chains.845///846/// The function considers all possible ways of merging two chains and847/// computes the one having the largest increase in ExtTSP objective. The848/// result is a pair with the first element being the gain and the second849/// element being the corresponding merging type.850MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,851ChainEdge *Edge) const {852if (Edge->hasCachedMergeGain(ChainPred, ChainSucc))853return Edge->getCachedMergeGain(ChainPred, ChainSucc);854855assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");856// Precompute jumps between ChainPred and ChainSucc.857ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);858MergedJumpsT Jumps(&Edge->jumps(), EdgePP ? &EdgePP->jumps() : nullptr);859860// This object holds the best chosen gain of merging two chains.861MergeGainT Gain = MergeGainT();862863/// Given a merge offset and a list of merge types, try to merge two chains864/// and update Gain with a better alternative.865auto tryChainMerging = [&](size_t Offset,866const std::vector<MergeTypeT> &MergeTypes) {867// Skip merging corresponding to concatenation w/o splitting.868if (Offset == 0 || Offset == ChainPred->Nodes.size())869return;870// Skip merging if it breaks Forced successors.871NodeT *Node = ChainPred->Nodes[Offset - 1];872if (Node->ForcedSucc != nullptr)873return;874// Apply the merge, compute the corresponding gain, and update the best875// value, if the merge is beneficial.876for (const MergeTypeT &MergeType : MergeTypes) {877Gain.updateIfLessThan(878computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));879}880};881882// Try to concatenate two chains w/o splitting.883Gain.updateIfLessThan(884computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));885886// Attach (a part of) ChainPred before the first node of ChainSucc.887for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {888const NodeT *SrcBlock = Jump->Source;889if (SrcBlock->CurChain != ChainPred)890continue;891size_t Offset = SrcBlock->CurIndex + 1;892tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});893}894895// Attach (a part of) ChainPred after the last node of ChainSucc.896for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {897const NodeT *DstBlock = Jump->Target;898if (DstBlock->CurChain != ChainPred)899continue;900size_t Offset = DstBlock->CurIndex;901tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});902}903904// Try to break ChainPred in various ways and concatenate with ChainSucc.905if (ChainPred->Nodes.size() <= ChainSplitThreshold) {906for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {907// Do not split the chain along a fall-through jump. One of the two908// loops above may still "break" such a jump whenever it results in a909// new fall-through.910const NodeT *BB = ChainPred->Nodes[Offset - 1];911const NodeT *BB2 = ChainPred->Nodes[Offset];912if (BB->isSuccessor(BB2))913continue;914915// In practice, applying X2_Y_X1 merging almost never provides benefits;916// thus, we exclude it from consideration to reduce the search space.917tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,918MergeTypeT::X2_X1_Y});919}920}921922Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);923return Gain;924}925926/// Compute the score gain of merging two chains, respecting a given927/// merge 'type' and 'offset'.928///929/// The two chains are not modified in the method.930MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,931const MergedJumpsT &Jumps, size_t MergeOffset,932MergeTypeT MergeType) const {933MergedNodesT MergedNodes =934mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);935936// Do not allow a merge that does not preserve the original entry point.937if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&938!MergedNodes.getFirstNode()->isEntry())939return MergeGainT();940941// The gain for the new chain.942double NewScore = extTSPScore(MergedNodes, Jumps);943double CurScore = ChainPred->Score;944return MergeGainT(NewScore - CurScore, MergeOffset, MergeType);945}946947/// Merge chain From into chain Into, update the list of active chains,948/// adjacency information, and the corresponding cached values.949void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,950MergeTypeT MergeType) {951assert(Into != From && "a chain cannot be merged with itself");952953// Merge the nodes.954MergedNodesT MergedNodes =955mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);956Into->merge(From, MergedNodes.getNodes());957958// Merge the edges.959Into->mergeEdges(From);960From->clear();961962// Update cached ext-tsp score for the new chain.963ChainEdge *SelfEdge = Into->getEdge(Into);964if (SelfEdge != nullptr) {965MergedNodes = MergedNodesT(Into->Nodes.begin(), Into->Nodes.end());966MergedJumpsT MergedJumps(&SelfEdge->jumps());967Into->Score = extTSPScore(MergedNodes, MergedJumps);968}969970// Remove the chain from the list of active chains.971llvm::erase(HotChains, From);972973// Invalidate caches.974for (auto EdgeIt : Into->Edges)975EdgeIt.second->invalidateCache();976}977978/// Concatenate all chains into the final order.979std::vector<uint64_t> concatChains() {980// Collect non-empty chains.981std::vector<const ChainT *> SortedChains;982for (ChainT &Chain : AllChains) {983if (!Chain.Nodes.empty())984SortedChains.push_back(&Chain);985}986987// Sorting chains by density in the decreasing order.988std::sort(SortedChains.begin(), SortedChains.end(),989[&](const ChainT *L, const ChainT *R) {990// Place the entry point at the beginning of the order.991if (L->isEntry() != R->isEntry())992return L->isEntry();993994// Compare by density and break ties by chain identifiers.995return std::make_tuple(-L->density(), L->Id) <996std::make_tuple(-R->density(), R->Id);997});998999// Collect the nodes in the order specified by their chains.1000std::vector<uint64_t> Order;1001Order.reserve(NumNodes);1002for (const ChainT *Chain : SortedChains)1003for (NodeT *Node : Chain->Nodes)1004Order.push_back(Node->Index);1005return Order;1006}10071008private:1009/// The number of nodes in the graph.1010const size_t NumNodes;10111012/// Successors of each node.1013std::vector<std::vector<uint64_t>> SuccNodes;10141015/// Predecessors of each node.1016std::vector<std::vector<uint64_t>> PredNodes;10171018/// All nodes (basic blocks) in the graph.1019std::vector<NodeT> AllNodes;10201021/// All jumps between the nodes.1022std::vector<JumpT> AllJumps;10231024/// All chains of nodes.1025std::vector<ChainT> AllChains;10261027/// All edges between the chains.1028std::vector<ChainEdge> AllEdges;10291030/// Active chains. The vector gets updated at runtime when chains are merged.1031std::vector<ChainT *> HotChains;1032};10331034/// The implementation of the Cache-Directed Sort (CDSort) algorithm for1035/// ordering functions represented by a call graph.1036class CDSortImpl {1037public:1038CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,1039ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,1040ArrayRef<uint64_t> EdgeOffsets)1041: Config(Config), NumNodes(NodeSizes.size()) {1042initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);1043}10441045/// Run the algorithm and return an ordered set of function clusters.1046std::vector<uint64_t> run() {1047// Merge pairs of chains while improving the objective.1048mergeChainPairs();10491050// Collect nodes from all the chains.1051return concatChains();1052}10531054private:1055/// Initialize the algorithm's data structures.1056void initialize(const ArrayRef<uint64_t> &NodeSizes,1057const ArrayRef<uint64_t> &NodeCounts,1058const ArrayRef<EdgeCount> &EdgeCounts,1059const ArrayRef<uint64_t> &EdgeOffsets) {1060// Initialize nodes.1061AllNodes.reserve(NumNodes);1062for (uint64_t Node = 0; Node < NumNodes; Node++) {1063uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);1064uint64_t ExecutionCount = NodeCounts[Node];1065AllNodes.emplace_back(Node, Size, ExecutionCount);1066TotalSamples += ExecutionCount;1067if (ExecutionCount > 0)1068TotalSize += Size;1069}10701071// Initialize jumps between the nodes.1072SuccNodes.resize(NumNodes);1073PredNodes.resize(NumNodes);1074AllJumps.reserve(EdgeCounts.size());1075for (size_t I = 0; I < EdgeCounts.size(); I++) {1076auto [Pred, Succ, Count] = EdgeCounts[I];1077// Ignore recursive calls.1078if (Pred == Succ)1079continue;10801081SuccNodes[Pred].push_back(Succ);1082PredNodes[Succ].push_back(Pred);1083if (Count > 0) {1084NodeT &PredNode = AllNodes[Pred];1085NodeT &SuccNode = AllNodes[Succ];1086AllJumps.emplace_back(&PredNode, &SuccNode, Count);1087AllJumps.back().Offset = EdgeOffsets[I];1088SuccNode.InJumps.push_back(&AllJumps.back());1089PredNode.OutJumps.push_back(&AllJumps.back());1090// Adjust execution counts.1091PredNode.ExecutionCount = std::max(PredNode.ExecutionCount, Count);1092SuccNode.ExecutionCount = std::max(SuccNode.ExecutionCount, Count);1093}1094}10951096// Initialize chains.1097AllChains.reserve(NumNodes);1098for (NodeT &Node : AllNodes) {1099// Adjust execution counts.1100Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());1101Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());1102// Create chain.1103AllChains.emplace_back(Node.Index, &Node);1104Node.CurChain = &AllChains.back();1105}11061107// Initialize chain edges.1108AllEdges.reserve(AllJumps.size());1109for (NodeT &PredNode : AllNodes) {1110for (JumpT *Jump : PredNode.OutJumps) {1111NodeT *SuccNode = Jump->Target;1112ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);1113// This edge is already present in the graph.1114if (CurEdge != nullptr) {1115assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);1116CurEdge->appendJump(Jump);1117continue;1118}1119// This is a new edge.1120AllEdges.emplace_back(Jump);1121PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());1122SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());1123}1124}1125}11261127/// Merge pairs of chains while there is an improvement in the objective.1128void mergeChainPairs() {1129// Create a priority queue containing all edges ordered by the merge gain.1130auto GainComparator = [](ChainEdge *L, ChainEdge *R) {1131return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <1132std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);1133};1134std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);11351136// Insert the edges into the queue.1137[[maybe_unused]] size_t NumActiveChains = 0;1138for (NodeT &Node : AllNodes) {1139if (Node.ExecutionCount == 0)1140continue;1141++NumActiveChains;1142for (const auto &[_, Edge] : Node.CurChain->Edges) {1143// Ignore self-edges.1144if (Edge->isSelfEdge())1145continue;1146// Ignore already processed edges.1147if (Edge->gain() != -1.0)1148continue;11491150// Compute the gain of merging the two chains.1151MergeGainT Gain = getBestMergeGain(Edge);1152Edge->setMergeGain(Gain);11531154if (Edge->gain() > EPS)1155Queue.insert(Edge);1156}1157}11581159// Merge the chains while the gain of merging is positive.1160while (!Queue.empty()) {1161// Extract the best (top) edge for merging.1162ChainEdge *BestEdge = *Queue.begin();1163Queue.erase(Queue.begin());1164ChainT *BestSrcChain = BestEdge->srcChain();1165ChainT *BestDstChain = BestEdge->dstChain();11661167// Remove outdated edges from the queue.1168for (const auto &[_, ChainEdge] : BestSrcChain->Edges)1169Queue.erase(ChainEdge);1170for (const auto &[_, ChainEdge] : BestDstChain->Edges)1171Queue.erase(ChainEdge);11721173// Merge the best pair of chains.1174MergeGainT BestGain = BestEdge->getMergeGain();1175mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),1176BestGain.mergeType());1177--NumActiveChains;11781179// Insert newly created edges into the queue.1180for (const auto &[_, Edge] : BestSrcChain->Edges) {1181// Ignore loop edges.1182if (Edge->isSelfEdge())1183continue;1184if (Edge->srcChain()->numBlocks() + Edge->dstChain()->numBlocks() >1185Config.MaxChainSize)1186continue;11871188// Compute the gain of merging the two chains.1189MergeGainT Gain = getBestMergeGain(Edge);1190Edge->setMergeGain(Gain);11911192if (Edge->gain() > EPS)1193Queue.insert(Edge);1194}1195}11961197LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"1198<< " of chains from " << NumNodes << " to "1199<< NumActiveChains << "\n");1200}12011202/// Compute the gain of merging two chains.1203///1204/// The function considers all possible ways of merging two chains and1205/// computes the one having the largest increase in ExtTSP objective. The1206/// result is a pair with the first element being the gain and the second1207/// element being the corresponding merging type.1208MergeGainT getBestMergeGain(ChainEdge *Edge) const {1209assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");1210// Precompute jumps between ChainPred and ChainSucc.1211MergedJumpsT Jumps(&Edge->jumps());1212ChainT *SrcChain = Edge->srcChain();1213ChainT *DstChain = Edge->dstChain();12141215// This object holds the best currently chosen gain of merging two chains.1216MergeGainT Gain = MergeGainT();12171218/// Given a list of merge types, try to merge two chains and update Gain1219/// with a better alternative.1220auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {1221// Apply the merge, compute the corresponding gain, and update the best1222// value, if the merge is beneficial.1223for (const MergeTypeT &MergeType : MergeTypes) {1224MergeGainT NewGain =1225computeMergeGain(SrcChain, DstChain, Jumps, MergeType);12261227// When forward and backward gains are the same, prioritize merging that1228// preserves the original order of the functions in the binary.1229if (std::abs(Gain.score() - NewGain.score()) < EPS) {1230if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||1231(MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {1232Gain = NewGain;1233}1234} else if (NewGain.score() > Gain.score() + EPS) {1235Gain = NewGain;1236}1237}1238};12391240// Try to concatenate two chains w/o splitting.1241tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});12421243return Gain;1244}12451246/// Compute the score gain of merging two chains, respecting a given type.1247///1248/// The two chains are not modified in the method.1249MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,1250const MergedJumpsT &Jumps,1251MergeTypeT MergeType) const {1252// This doesn't depend on the ordering of the nodes1253double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);12541255// Merge offset is always 0, as the chains are not split.1256size_t MergeOffset = 0;1257auto MergedBlocks =1258mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);1259double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);12601261double GainScore = DistGain + Config.FrequencyScale * FreqGain;1262// Scale the result to increase the importance of merging short chains.1263if (GainScore >= 0.0)1264GainScore /= std::min(ChainPred->Size, ChainSucc->Size);12651266return MergeGainT(GainScore, MergeOffset, MergeType);1267}12681269/// Compute the change of the frequency locality after merging the chains.1270double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {1271auto missProbability = [&](double ChainDensity) {1272double PageSamples = ChainDensity * Config.CacheSize;1273if (PageSamples >= TotalSamples)1274return 0.0;1275double P = PageSamples / TotalSamples;1276return pow(1.0 - P, static_cast<double>(Config.CacheEntries));1277};12781279// Cache misses on the chains before merging.1280double CurScore =1281ChainPred->ExecutionCount * missProbability(ChainPred->density()) +1282ChainSucc->ExecutionCount * missProbability(ChainSucc->density());12831284// Cache misses on the merged chain1285double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;1286double MergedSize = ChainPred->Size + ChainSucc->Size;1287double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;1288double NewScore = MergedCounts * missProbability(MergedDensity);12891290return CurScore - NewScore;1291}12921293/// Compute the distance locality for a jump / call.1294double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {1295uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;1296double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);1297return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);1298}12991300/// Compute the change of the distance locality after merging the chains.1301double distBasedLocalityGain(const MergedNodesT &Nodes,1302const MergedJumpsT &Jumps) const {1303uint64_t CurAddr = 0;1304Nodes.forEach([&](const NodeT *Node) {1305Node->EstimatedAddr = CurAddr;1306CurAddr += Node->Size;1307});13081309double CurScore = 0;1310double NewScore = 0;1311Jumps.forEach([&](const JumpT *Jump) {1312uint64_t SrcAddr = Jump->Source->EstimatedAddr + Jump->Offset;1313uint64_t DstAddr = Jump->Target->EstimatedAddr;1314NewScore += distScore(SrcAddr, DstAddr, Jump->ExecutionCount);1315CurScore += distScore(0, TotalSize, Jump->ExecutionCount);1316});1317return NewScore - CurScore;1318}13191320/// Merge chain From into chain Into, update the list of active chains,1321/// adjacency information, and the corresponding cached values.1322void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,1323MergeTypeT MergeType) {1324assert(Into != From && "a chain cannot be merged with itself");13251326// Merge the nodes.1327MergedNodesT MergedNodes =1328mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);1329Into->merge(From, MergedNodes.getNodes());13301331// Merge the edges.1332Into->mergeEdges(From);1333From->clear();1334}13351336/// Concatenate all chains into the final order.1337std::vector<uint64_t> concatChains() {1338// Collect chains and calculate density stats for their sorting.1339std::vector<const ChainT *> SortedChains;1340DenseMap<const ChainT *, double> ChainDensity;1341for (ChainT &Chain : AllChains) {1342if (!Chain.Nodes.empty()) {1343SortedChains.push_back(&Chain);1344// Using doubles to avoid overflow of ExecutionCounts.1345double Size = 0;1346double ExecutionCount = 0;1347for (NodeT *Node : Chain.Nodes) {1348Size += static_cast<double>(Node->Size);1349ExecutionCount += static_cast<double>(Node->ExecutionCount);1350}1351assert(Size > 0 && "a chain of zero size");1352ChainDensity[&Chain] = ExecutionCount / Size;1353}1354}13551356// Sort chains by density in the decreasing order.1357std::sort(SortedChains.begin(), SortedChains.end(),1358[&](const ChainT *L, const ChainT *R) {1359const double DL = ChainDensity[L];1360const double DR = ChainDensity[R];1361// Compare by density and break ties by chain identifiers.1362return std::make_tuple(-DL, L->Id) <1363std::make_tuple(-DR, R->Id);1364});13651366// Collect the nodes in the order specified by their chains.1367std::vector<uint64_t> Order;1368Order.reserve(NumNodes);1369for (const ChainT *Chain : SortedChains)1370for (NodeT *Node : Chain->Nodes)1371Order.push_back(Node->Index);1372return Order;1373}13741375private:1376/// Config for the algorithm.1377const CDSortConfig Config;13781379/// The number of nodes in the graph.1380const size_t NumNodes;13811382/// Successors of each node.1383std::vector<std::vector<uint64_t>> SuccNodes;13841385/// Predecessors of each node.1386std::vector<std::vector<uint64_t>> PredNodes;13871388/// All nodes (functions) in the graph.1389std::vector<NodeT> AllNodes;13901391/// All jumps (function calls) between the nodes.1392std::vector<JumpT> AllJumps;13931394/// All chains of nodes.1395std::vector<ChainT> AllChains;13961397/// All edges between the chains.1398std::vector<ChainEdge> AllEdges;13991400/// The total number of samples in the graph.1401uint64_t TotalSamples{0};14021403/// The total size of the nodes in the graph.1404uint64_t TotalSize{0};1405};14061407} // end of anonymous namespace14081409std::vector<uint64_t>1410codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,1411ArrayRef<uint64_t> NodeCounts,1412ArrayRef<EdgeCount> EdgeCounts) {1413// Verify correctness of the input data.1414assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");1415assert(NodeSizes.size() > 2 && "Incorrect input");14161417// Apply the reordering algorithm.1418ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);1419std::vector<uint64_t> Result = Alg.run();14201421// Verify correctness of the output.1422assert(Result.front() == 0 && "Original entry point is not preserved");1423assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");1424return Result;1425}14261427double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,1428ArrayRef<uint64_t> NodeSizes,1429ArrayRef<uint64_t> NodeCounts,1430ArrayRef<EdgeCount> EdgeCounts) {1431// Estimate addresses of the blocks in memory.1432std::vector<uint64_t> Addr(NodeSizes.size(), 0);1433for (size_t Idx = 1; Idx < Order.size(); Idx++) {1434Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];1435}1436std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);1437for (auto Edge : EdgeCounts)1438++OutDegree[Edge.src];14391440// Increase the score for each jump.1441double Score = 0;1442for (auto Edge : EdgeCounts) {1443bool IsConditional = OutDegree[Edge.src] > 1;1444Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],1445Edge.count, IsConditional);1446}1447return Score;1448}14491450double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,1451ArrayRef<uint64_t> NodeCounts,1452ArrayRef<EdgeCount> EdgeCounts) {1453std::vector<uint64_t> Order(NodeSizes.size());1454for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {1455Order[Idx] = Idx;1456}1457return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);1458}14591460std::vector<uint64_t> codelayout::computeCacheDirectedLayout(1461const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,1462ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,1463ArrayRef<uint64_t> CallOffsets) {1464// Verify correctness of the input data.1465assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");14661467// Apply the reordering algorithm.1468CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);1469std::vector<uint64_t> Result = Alg.run();1470assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");1471return Result;1472}14731474std::vector<uint64_t> codelayout::computeCacheDirectedLayout(1475ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,1476ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {1477CDSortConfig Config;1478// Populate the config from the command-line options.1479if (CacheEntries.getNumOccurrences() > 0)1480Config.CacheEntries = CacheEntries;1481if (CacheSize.getNumOccurrences() > 0)1482Config.CacheSize = CacheSize;1483if (CDMaxChainSize.getNumOccurrences() > 0)1484Config.MaxChainSize = CDMaxChainSize;1485if (DistancePower.getNumOccurrences() > 0)1486Config.DistancePower = DistancePower;1487if (FrequencyScale.getNumOccurrences() > 0)1488Config.FrequencyScale = FrequencyScale;1489return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,1490CallOffsets);1491}149214931494