Path: blob/main/contrib/llvm-project/llvm/lib/ProfileData/MemProf.cpp
#include "llvm/ProfileData/MemProf.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/HashBuilder.h"

namespace llvm {
namespace memprof {
MemProfSchema getFullSchema() {
  MemProfSchema List;
#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
  return List;
}

MemProfSchema getHotColdSchema() {
  return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime,
          Meta::TotalLifetimeAccessDensity};
}

static size_t serializedSizeV0(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The number of frames to serialize.
  Size += sizeof(uint64_t);
  // The callstack frame ids.
  Size += sizeof(FrameId) * IAI.CallStack.size();
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV2(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The CallStackId
  Size += sizeof(CallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV3(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The linear call stack ID.
  Size += sizeof(LinearCallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
                                             IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static size_t serializedSizeV0(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version0);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  for (const auto &Frames : Record.CallSites) {
    // The number of frame ids to serialize.
    Result += sizeof(uint64_t);
    Result += Frames.size() * sizeof(FrameId);
  }
  return Result;
}

static size_t serializedSizeV2(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version2);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The CallStackId
  Result += Record.CallSiteIds.size() * sizeof(CallStackId);
  return Result;
}

static size_t serializedSizeV3(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version3);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The linear call stack ID.
  Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId);
  return Result;
}

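// Illustrative summary (not part of the upstream source) of the per-record
// sizes computed above, writing P for
// PortableMemInfoBlock::serializedSize(Schema):
//
//   V0/V1: 8 + sum over alloc sites of (8 + sizeof(FrameId) * NumFrames + P)
//            + 8 + sum over call sites of (8 + sizeof(FrameId) * NumFrames)
//   V2:    8 + NumAllocSites * (sizeof(CallStackId) + P)
//            + 8 + NumCallSiteIds * sizeof(CallStackId)
//   V3:    same as V2 with sizeof(LinearCallStackId) in place of
//          sizeof(CallStackId)
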
size_t
IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
                                     IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static void serializeV0(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<uint64_t>(N.CallStack.size());
    for (const FrameId &Id : N.CallStack)
      LE.write<FrameId>(Id);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSites.size());
  for (const auto &Frames : Record.CallSites) {
    LE.write<uint64_t>(Frames.size());
    for (const FrameId &Id : Frames)
      LE.write<FrameId>(Id);
  }
}

static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}

static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version0:
  case Version1:
    serializeV0(*this, Schema, OS);
    return;
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

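// Illustrative byte layout produced by the serializers above (little-endian
// throughout; this summary is not part of the upstream source):
//
//   V0/V1: u64 NumAllocSites
//          { u64 NumFrames, FrameId x NumFrames, MIB payload per Schema }
//            x NumAllocSites
//          u64 NumCallSites
//          { u64 NumFrames, FrameId x NumFrames } x NumCallSites
//
//   V2:    u64 NumAllocSites
//          { CallStackId, MIB payload per Schema } x NumAllocSites
//          u64 NumCallSiteIds
//          CallStackId x NumCallSiteIds
//
//   V3:    same as V2, except each id is a LinearCallStackId taken from
//          MemProfCallStackIndexes rather than a CallStackId.
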
static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    for (uint64_t J = 0; J < NumFrames; J++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Node.CallStack.push_back(Id);
    }
    Node.CSId = hashCallStack(Node.CallStack);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    llvm::SmallVector<FrameId> Frames;
    Frames.reserve(NumFrames);
    for (uint64_t K = 0; K < NumFrames; K++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Frames.push_back(Id);
    }
    Record.CallSites.push_back(Frames);
    Record.CallSiteIds.push_back(hashCallStack(Frames));
  }

  return Record;
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId. Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version0:
  case Version1:
    return deserializeV0(Schema, Ptr);
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}

MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

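// Illustrative use of toMemProfRecord above (a hedged sketch, not code from
// this file): a caller that has already materialized full call stacks, say in
// a hypothetical llvm::DenseMap<CallStackId, std::vector<Frame>> named
// CSIdToCallStack, could expand an IndexedMemProfRecord as
//
//   MemProfRecord Record = IndexedRecord.toMemProfRecord(
//       [&](CallStackId CSId) -> std::vector<Frame> {
//         return CSIdToCallStack.lookup(CSId);
//       });
//
// The actual readers supply their own callback; the lambda only illustrates
// the contract that each CallStackId maps to its frames.
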
GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.". Note
  // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not
  // drop those by default. This is by design to differentiate internal
  // linkage functions during matching. By dropping the other suffixes we can
  // then match functions in the profile use phase prior to their addition.
  // Note that this applies to both instrumented and sampled function names.
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function guid which we expect to be a uint64_t. At
  // this time, it is the lower 64 bits of the md5 of the canonical
  // function name.
  return Function::getGUID(CanonicalName);
}

Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}

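// Illustrative wire format consumed by readMemProfSchema above (not part of
// the upstream source): a u64 field count followed by that many u64 tags,
// each of which must be a valid Meta enumerator. If getHotColdSchema() were
// serialized, for example, the stream would carry the count 4 followed by the
// tags for AllocCount, TotalSize, TotalLifetime, and
// TotalLifetimeAccessDensity.
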
CallStackId hashCallStack(ArrayRef<FrameId> CS) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray. Return the starting index within
// RadixArray. For each call stack we encode, we emit two or three components
// into RadixArray. If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number.
//
// - a portion of the given call stack that's beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix.
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf to root order. build, the caller of this
// function, reverses RadixArray in place so that we can reconstruct a call
// stack as if we were deserializing an array in a typical way -- the call
// stack length followed by the frames in the leaf-to-root order except that we
// need to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
    const llvm::SmallVector<FrameId> *CallStack,
    const llvm::SmallVector<FrameId> *Prev,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

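// Worked example for encodeCallStack (illustrative only, not part of the
// upstream source). Take two call stacks stored leaf-to-root that share the
// root prefix R -> M, with frames already mapped through MemProfFrameIndexes:
//
//   CS_B = [L2, M, R]   encoded first  (Prev == nullptr)
//   CS_A = [L1, M, R]   encoded second (Prev == &CS_B)
//
// Encoding CS_B appends its frames in root-to-leaf order followed by the
// length:
//
//   RadixArray = [R, M, L2, 3]             returns 3, Indexes = [0, 1, 2]
//
// Encoding CS_A finds a common prefix of length 2, so it appends a negative
// jump to the shared parent at index 1, the one frame beyond the prefix, and
// the full length:
//
//   RadixArray = [R, M, L2, 3, -3, L1, 3]  returns 6, Indexes = [0, 1, 5]
//
// build() below reverses RadixArray in place and remaps the returned
// positions accordingly.
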
void CallStackRadixTreeBuilder::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData. The vector is exactly
  // what we need to sort. Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray. However, we go one step
  // further and try to reduce the number of times we follow pointers to
  // parents during deserialization. Consider a poorly encoded radix tree:
  //
  //   CallStackId 1:  f1 -> f2 -> f3
  //                    |
  //   CallStackId 2:   +--- f4 -> f5
  //                          |
  //   CallStackId 3:         +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds. In the example above, f4 is more popular than f2, so we sort
  // the call stacks and encode them as:
  //
  //   CallStackId 2:  f1 -- f4 -> f5
  //                    |     |
  //   CallStackId 3:   |     +--> f6
  //                    |
  //   CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps. The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node, and encode a radix tree
  // from the heaviest node first. We do not do so because that's a lot of
  // work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root. Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameId F1, FrameId F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks
          // from the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow as long as the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array. We encode one call stack at a time, computing
  // the longest prefix that's shared with the previous call stack we encode.
  // For each call stack we encode, we remember a mapping from CallStackId to
  // its position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents. Consider
  // the list of call stacks that has been sorted in the dictionary order:
  //
  //   Call Stack 1: F1
  //   Call Stack 2: F1 -> F2
  //   Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  //   Call Stack 1: F1
  //                  |
  //   Call Stack 2:  +---> F2
  //                         |
  //   Call Stack 3:         +---> F3
  //
  // Notice that each call stack jumps to the previous one. However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers. Call Stacks 1 and 2
  // point to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameId> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place. We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}

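// Continuing the illustrative example above (again, not part of the upstream
// source): build() reverses [R, M, L2, 3, -3, L1, 3] into
// [3, L1, -3, 3, L2, M, R] and remaps the recorded positions to
// Size - 1 - Pos, so CS_A now starts at index 0 and CS_B at index 3. A reader
// starting at index 0 sees the length 3, takes L1, and then follows the jump
// to pick up M and R from CS_B's encoding, recovering CS_A leaf to root.
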
llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameId, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}

void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
  for (const auto &AS : Record.AllocSites) {
    assert(AS.CSId == hashCallStack(AS.CallStack));
    (void)AS;
  }
}

void verifyFunctionProfileData(
    const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord>
        &FunctionProfileData) {
  for (const auto &[GUID, Record] : FunctionProfileData) {
    (void)GUID;
    verifyIndexedMemProfRecord(Record);
  }
}

} // namespace memprof
} // namespace llvm
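
// Illustrative driver (a hedged sketch, not part of this file) showing how a
// V3 writer might tie the pieces above together. The names Histogram, Builder,
// MemProfCallStackData, and MemProfFrameIndexes are placeholders; only
// computeFrameHistogram() and CallStackRadixTreeBuilder::build() are defined
// here:
//
//   llvm::DenseMap<FrameId, FrameStat> Histogram =
//       computeFrameHistogram(MemProfCallStackData);
//   CallStackRadixTreeBuilder Builder;
//   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
//                 Histogram);
//   // The resulting radix array is emitted to the profile, and the
//   // CallStackId -> LinearCallStackId positions feed serializeV3().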