Path: blob/main/contrib/llvm-project/llvm/lib/ProfileData/MemProfRadixTree.cpp
213765 views
//===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7// This file contains logic that implements a space efficient radix tree8// encoding for callstacks used by MemProf.9//10//===----------------------------------------------------------------------===//1112#include "llvm/ProfileData/MemProfRadixTree.h"1314namespace llvm {15namespace memprof {16// Encode a call stack into RadixArray. Return the starting index within17// RadixArray. For each call stack we encode, we emit two or three components18// into RadixArray. If a given call stack doesn't have a common prefix relative19// to the previous one, we emit:20//21// - the frames in the given call stack in the root-to-leaf order22//23// - the length of the given call stack24//25// If a given call stack has a non-empty common prefix relative to the previous26// one, we emit:27//28// - the relative location of the common prefix, encoded as a negative number.29//30// - a portion of the given call stack that's beyond the common prefix31//32// - the length of the given call stack, including the length of the common33// prefix.34//35// The resulting RadixArray requires a somewhat unintuitive backward traversal36// to reconstruct a call stack -- read the call stack length and scan backward37// while collecting frames in the leaf to root order. build, the caller of this38// function, reverses RadixArray in place so that we can reconstruct a call39// stack as if we were deserializing an array in a typical way -- the call stack40// length followed by the frames in the leaf-to-root order except that we need41// to handle pointers to parents along the way.42//43// To quickly determine the location of the common prefix within RadixArray,44// Indexes caches the indexes of the previous call stack's frames within45// RadixArray.46template <typename FrameIdTy>47LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(48const llvm::SmallVector<FrameIdTy> *CallStack,49const llvm::SmallVector<FrameIdTy> *Prev,50const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {51// Compute the length of the common root prefix between Prev and CallStack.52uint32_t CommonLen = 0;53if (Prev) {54auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),55CallStack->rend());56CommonLen = std::distance(CallStack->rbegin(), Pos.second);57}5859// Drop the portion beyond CommonLen.60assert(CommonLen <= Indexes.size());61Indexes.resize(CommonLen);6263// Append a pointer to the parent.64if (CommonLen) {65uint32_t CurrentIndex = RadixArray.size();66uint32_t ParentIndex = Indexes.back();67// The offset to the parent must be negative because we are pointing to an68// element we've already added to RadixArray.69assert(ParentIndex < CurrentIndex);70RadixArray.push_back(ParentIndex - CurrentIndex);71}7273// Copy the part of the call stack beyond the common prefix to RadixArray.74assert(CommonLen <= CallStack->size());75for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {76// Remember the index of F in RadixArray.77Indexes.push_back(RadixArray.size());78RadixArray.push_back(79MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);80}81assert(CallStack->size() == Indexes.size());8283// End with the call stack length.84RadixArray.push_back(CallStack->size());8586// Return the index within RadixArray where we can start reconstructing a87// given call stack from.88return RadixArray.size() - 1;89}9091template <typename FrameIdTy>92void CallStackRadixTreeBuilder<FrameIdTy>::build(93llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>94&&MemProfCallStackData,95const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,96llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {97// Take the vector portion of MemProfCallStackData. The vector is exactly98// what we need to sort. Also, we no longer need its lookup capability.99llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();100101// Return early if we have no work to do.102if (CallStacks.empty()) {103RadixArray.clear();104CallStackPos.clear();105return;106}107108// Sorting the list of call stacks in the dictionary order is sufficient to109// maximize the length of the common prefix between two adjacent call stacks110// and thus minimize the length of RadixArray. However, we go one step111// further and try to reduce the number of times we follow pointers to parents112// during deserilization. Consider a poorly encoded radix tree:113//114// CallStackId 1: f1 -> f2 -> f3115// |116// CallStackId 2: +--- f4 -> f5117// |118// CallStackId 3: +--> f6119//120// Here, f2 and f4 appear once and twice, respectively, in the call stacks.121// Once we encode CallStackId 1 into RadixArray, every other call stack with122// common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3123// share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to124// parents twice.125//126// We try to alleviate the situation by sorting the list of call stacks by127// comparing the popularity of frames rather than the integer values of128// FrameIds. In the example above, f4 is more popular than f2, so we sort the129// call stacks and encode them as:130//131// CallStackId 2: f1 -- f4 -> f5132// | |133// CallStackId 3: | +--> f6134// |135// CallStackId 1: +--> f2 -> f3136//137// Notice that CallStackId 3 follows a pointer to a parent only once.138//139// All this is a quick-n-dirty trick to reduce the number of jumps. The140// proper way would be to compute the weight of each radix tree node -- how141// many call stacks use a given radix tree node, and encode a radix tree from142// the heaviest node first. We do not do so because that's a lot of work.143llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {144// Call stacks are stored from leaf to root. Perform comparisons from the145// root.146return std::lexicographical_compare(147L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),148[&](FrameIdTy F1, FrameIdTy F2) {149uint64_t H1 = FrameHistogram[F1].Count;150uint64_t H2 = FrameHistogram[F2].Count;151// Popular frames should come later because we encode call stacks from152// the last one in the list.153if (H1 != H2)154return H1 < H2;155// For sort stability.156return F1 < F2;157});158});159160// Reserve some reasonable amount of storage.161RadixArray.clear();162RadixArray.reserve(CallStacks.size() * 8);163164// Indexes will grow as long as the longest call stack.165Indexes.clear();166Indexes.reserve(512);167168// CallStackPos will grow to exactly CallStacks.size() entries.169CallStackPos.clear();170CallStackPos.reserve(CallStacks.size());171172// Compute the radix array. We encode one call stack at a time, computing the173// longest prefix that's shared with the previous call stack we encode. For174// each call stack we encode, we remember a mapping from CallStackId to its175// position within RadixArray.176//177// As an optimization, we encode from the last call stack in CallStacks to178// reduce the number of times we follow pointers to the parents. Consider the179// list of call stacks that has been sorted in the dictionary order:180//181// Call Stack 1: F1182// Call Stack 2: F1 -> F2183// Call Stack 3: F1 -> F2 -> F3184//185// If we traversed CallStacks in the forward order, we would end up with a186// radix tree like:187//188// Call Stack 1: F1189// |190// Call Stack 2: +---> F2191// |192// Call Stack 3: +---> F3193//194// Notice that each call stack jumps to the previous one. However, if we195// traverse CallStacks in the reverse order, then Call Stack 3 has the196// complete call stack encoded without any pointers. Call Stack 1 and 2 point197// to appropriate prefixes of Call Stack 3.198const llvm::SmallVector<FrameIdTy> *Prev = nullptr;199for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {200LinearCallStackId Pos =201encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);202CallStackPos.insert({CSId, Pos});203Prev = &CallStack;204}205206// "RadixArray.size() - 1" below is problematic if RadixArray is empty.207assert(!RadixArray.empty());208209// Reverse the radix array in place. We do so mostly for intuitive210// deserialization where we would read the length field and then the call211// stack frames proper just like any other array deserialization, except212// that we have occasional jumps to take advantage of prefixes.213for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)214std::swap(RadixArray[I], RadixArray[J]);215216// "Reverse" the indexes stored in CallStackPos.217for (auto &[K, V] : CallStackPos)218V = RadixArray.size() - 1 - V;219}220221// Explicitly instantiate class with the utilized FrameIdTy.222template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>;223template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>;224225template <typename FrameIdTy>226llvm::DenseMap<FrameIdTy, FrameStat>227computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>228&MemProfCallStackData) {229llvm::DenseMap<FrameIdTy, FrameStat> Histogram;230231for (const auto &KV : MemProfCallStackData) {232const auto &CS = KV.second;233for (unsigned I = 0, E = CS.size(); I != E; ++I) {234auto &S = Histogram[CS[I]];235++S.Count;236S.PositionSum += I;237}238}239return Histogram;240}241242// Explicitly instantiate function with the utilized FrameIdTy.243template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>244computeFrameHistogram<FrameId>(245llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>246&MemProfCallStackData);247template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>248computeFrameHistogram<LinearFrameId>(249llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>250&MemProfCallStackData);251} // namespace memprof252} // namespace llvm253254255