Path: blob/main/contrib/llvm-project/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
35293 views
//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// The data structures defined in this file are based on the reference9// implementation which is available at10// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp11//12//===----------------------------------------------------------------------===//1314#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"15#include "llvm/DebugInfo/CodeView/RecordName.h"16#include "llvm/DebugInfo/CodeView/RecordSerialization.h"17#include "llvm/DebugInfo/CodeView/SymbolRecord.h"18#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"19#include "llvm/DebugInfo/MSF/MSFBuilder.h"20#include "llvm/DebugInfo/MSF/MSFCommon.h"21#include "llvm/DebugInfo/MSF/MappedBlockStream.h"22#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"23#include "llvm/DebugInfo/PDB/Native/Hash.h"24#include "llvm/DebugInfo/PDB/Native/RawTypes.h"25#include "llvm/Support/BinaryItemStream.h"26#include "llvm/Support/BinaryStreamWriter.h"27#include "llvm/Support/Parallel.h"28#include "llvm/Support/TimeProfiler.h"29#include "llvm/Support/xxhash.h"30#include <algorithm>31#include <vector>3233using namespace llvm;34using namespace llvm::msf;35using namespace llvm::pdb;36using namespace llvm::codeview;3738// Helper class for building the public and global PDB hash table buckets.39struct llvm::pdb::GSIHashStreamBuilder {40// Sum of the size of all public or global records.41uint32_t RecordByteSize = 0;4243std::vector<PSHashRecord> HashRecords;4445// The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The46// reference implementation builds a hash table with IPHR_HASH buckets in it.47// The last bucket is used to link together free hash table cells in a linked48// list, but it is always empty in the compressed, on-disk format. However,49// the bitmap must have a bit for it.50std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;5152std::vector<support::ulittle32_t> HashBuckets;5354uint32_t calculateSerializedLength() const;55Error commit(BinaryStreamWriter &Writer);5657void finalizePublicBuckets();58void finalizeGlobalBuckets(uint32_t RecordZeroOffset);5960// Assign public and global symbol records into hash table buckets.61// Modifies the list of records to store the bucket index, but does not62// change the order.63void finalizeBuckets(uint32_t RecordZeroOffset,64MutableArrayRef<BulkPublic> Globals);65};6667// DenseMapInfo implementation for deduplicating symbol records.68struct llvm::pdb::SymbolDenseMapInfo {69static inline CVSymbol getEmptyKey() {70static CVSymbol Empty;71return Empty;72}73static inline CVSymbol getTombstoneKey() {74static CVSymbol Tombstone(75DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());76return Tombstone;77}78static unsigned getHashValue(const CVSymbol &Val) {79return xxh3_64bits(Val.RecordData);80}81static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {82return LHS.RecordData == RHS.RecordData;83}84};8586namespace {87LLVM_PACKED_START88struct PublicSym32Layout {89RecordPrefix Prefix;90PublicSym32Header Pub;91// char Name[];92};93LLVM_PACKED_END94} // namespace9596// Calculate how much memory this public needs when serialized.97static uint32_t sizeOfPublic(const BulkPublic &Pub) {98uint32_t NameLen = Pub.NameLen;99NameLen = std::min(NameLen,100uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));101return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);102}103104static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {105// Assume the caller has allocated sizeOfPublic bytes.106uint32_t NameLen = std::min(107Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));108size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);109assert(Size == sizeOfPublic(Pub));110auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);111FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);112FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);113FixedMem->Pub.Flags = Pub.Flags;114FixedMem->Pub.Offset = Pub.Offset;115FixedMem->Pub.Segment = Pub.Segment;116char *NameMem = reinterpret_cast<char *>(FixedMem + 1);117memcpy(NameMem, Pub.Name, NameLen);118// Zero the null terminator and remaining bytes.119memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);120return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));121}122123uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {124uint32_t Size = sizeof(GSIHashHeader);125Size += HashRecords.size() * sizeof(PSHashRecord);126Size += HashBitmap.size() * sizeof(uint32_t);127Size += HashBuckets.size() * sizeof(uint32_t);128return Size;129}130131Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {132GSIHashHeader Header;133Header.VerSignature = GSIHashHeader::HdrSignature;134Header.VerHdr = GSIHashHeader::HdrVersion;135Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);136Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;137138if (auto EC = Writer.writeObject(Header))139return EC;140141if (auto EC = Writer.writeArray(ArrayRef(HashRecords)))142return EC;143if (auto EC = Writer.writeArray(ArrayRef(HashBitmap)))144return EC;145if (auto EC = Writer.writeArray(ArrayRef(HashBuckets)))146return EC;147return Error::success();148}149150static bool isAsciiString(StringRef S) {151return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });152}153154// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp155static int gsiRecordCmp(StringRef S1, StringRef S2) {156size_t LS = S1.size();157size_t RS = S2.size();158// Shorter strings always compare less than longer strings.159if (LS != RS)160return (LS > RS) - (LS < RS);161162// If either string contains non ascii characters, memcmp them.163if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))164return memcmp(S1.data(), S2.data(), LS);165166// Both strings are ascii, perform a case-insensitive comparison.167return S1.compare_insensitive(S2.data());168}169170void GSIStreamBuilder::finalizePublicBuckets() {171PSH->finalizeBuckets(0, Publics);172}173174void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {175// Build up a list of globals to be bucketed. Use the BulkPublic data176// structure for this purpose, even though these are global records, not177// public records. Most of the same fields are required:178// - Name179// - NameLen180// - SymOffset181// - BucketIdx182// The dead fields are Offset, Segment, and Flags.183std::vector<BulkPublic> Records;184Records.resize(Globals.size());185uint32_t SymOffset = RecordZeroOffset;186for (size_t I = 0, E = Globals.size(); I < E; ++I) {187StringRef Name = getSymbolName(Globals[I]);188Records[I].Name = Name.data();189Records[I].NameLen = Name.size();190Records[I].SymOffset = SymOffset;191SymOffset += Globals[I].length();192}193194GSH->finalizeBuckets(RecordZeroOffset, Records);195}196197void GSIHashStreamBuilder::finalizeBuckets(198uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {199// Hash every name in parallel.200parallelFor(0, Records.size(), [&](size_t I) {201Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);202});203204// Count up the size of each bucket. Then, use an exclusive prefix sum to205// calculate the bucket start offsets. This is C++17 std::exclusive_scan, but206// we can't use it yet.207uint32_t BucketStarts[IPHR_HASH] = {0};208for (const BulkPublic &P : Records)209++BucketStarts[P.BucketIdx];210uint32_t Sum = 0;211for (uint32_t &B : BucketStarts) {212uint32_t Size = B;213B = Sum;214Sum += Size;215}216217// Place globals into the hash table in bucket order. When placing a global,218// update the bucket start. Every hash table slot should be filled. Always use219// a refcount of one for now.220HashRecords.resize(Records.size());221uint32_t BucketCursors[IPHR_HASH];222memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));223for (int I = 0, E = Records.size(); I < E; ++I) {224uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;225HashRecords[HashIdx].Off = I;226HashRecords[HashIdx].CRef = 1;227}228229// Within the buckets, sort each bucket by memcmp of the symbol's name. It's230// important that we use the same sorting algorithm as is used by the231// reference implementation to ensure that the search for a record within a232// bucket can properly early-out when it detects the record won't be found.233// The algorithm used here corresponds to the function234// caseInsensitiveComparePchPchCchCch in the reference implementation.235parallelFor(0, IPHR_HASH, [&](size_t I) {236auto B = HashRecords.begin() + BucketStarts[I];237auto E = HashRecords.begin() + BucketCursors[I];238if (B == E)239return;240auto BucketCmp = [Records](const PSHashRecord &LHash,241const PSHashRecord &RHash) {242const BulkPublic &L = Records[uint32_t(LHash.Off)];243const BulkPublic &R = Records[uint32_t(RHash.Off)];244assert(L.BucketIdx == R.BucketIdx);245int Cmp = gsiRecordCmp(L.getName(), R.getName());246if (Cmp != 0)247return Cmp < 0;248// This comparison is necessary to make the sorting stable in the presence249// of two static globals with the same name. The easiest way to observe250// this is with S_LDATA32 records.251return L.SymOffset < R.SymOffset;252};253llvm::sort(B, E, BucketCmp);254255// After we are done sorting, replace the global indices with the stream256// offsets of each global. Add one when writing symbol offsets to disk.257// See GSI1::fixSymRecs.258for (PSHashRecord &HRec : make_range(B, E))259HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;260});261262// For each non-empty bucket, push the bucket start offset into HashBuckets263// and set a bit in the hash bitmap.264for (uint32_t I = 0; I < HashBitmap.size(); ++I) {265uint32_t Word = 0;266for (uint32_t J = 0; J < 32; ++J) {267// Skip empty buckets.268uint32_t BucketIdx = I * 32 + J;269if (BucketIdx >= IPHR_HASH ||270BucketStarts[BucketIdx] == BucketCursors[BucketIdx])271continue;272Word |= (1U << J);273274// Calculate what the offset of the first hash record in the chain would275// be if it were inflated to contain 32-bit pointers. On a 32-bit system,276// each record would be 12 bytes. See HROffsetCalc in gsi.h.277const int SizeOfHROffsetCalc = 12;278ulittle32_t ChainStartOff =279ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);280HashBuckets.push_back(ChainStartOff);281}282HashBitmap[I] = Word;283}284}285286GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)287: Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),288GSH(std::make_unique<GSIHashStreamBuilder>()) {}289290GSIStreamBuilder::~GSIStreamBuilder() = default;291292uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {293uint32_t Size = 0;294Size += sizeof(PublicsStreamHeader);295Size += PSH->calculateSerializedLength();296Size += Publics.size() * sizeof(uint32_t); // AddrMap297// FIXME: Add thunk map and section offsets for incremental linking.298299return Size;300}301302uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {303return GSH->calculateSerializedLength();304}305306Error GSIStreamBuilder::finalizeMsfLayout() {307// First we write public symbol records, then we write global symbol records.308finalizePublicBuckets();309finalizeGlobalBuckets(PSH->RecordByteSize);310311Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());312if (!Idx)313return Idx.takeError();314GlobalsStreamIndex = *Idx;315316Idx = Msf.addStream(calculatePublicsHashStreamSize());317if (!Idx)318return Idx.takeError();319PublicsStreamIndex = *Idx;320321uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;322323Idx = Msf.addStream(RecordBytes);324if (!Idx)325return Idx.takeError();326RecordStreamIndex = *Idx;327return Error::success();328}329330void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {331assert(Publics.empty() && PSH->RecordByteSize == 0 &&332"publics can only be added once");333Publics = std::move(PublicsIn);334335// Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.336parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {337return L.getName() < R.getName();338});339340// Assign offsets and calculate the length of the public symbol records.341uint32_t SymOffset = 0;342for (BulkPublic &Pub : Publics) {343Pub.SymOffset = SymOffset;344SymOffset += sizeOfPublic(Pub);345}346347// Remember the length of the public stream records.348PSH->RecordByteSize = SymOffset;349}350351void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {352serializeAndAddGlobal(Sym);353}354355void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {356serializeAndAddGlobal(Sym);357}358359void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {360serializeAndAddGlobal(Sym);361}362363template <typename T>364void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {365T Copy(Symbol);366addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),367CodeViewContainer::Pdb));368}369370void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {371// Ignore duplicate typedefs and constants.372if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {373auto Iter = GlobalsSeen.insert(Symbol);374if (!Iter.second)375return;376}377GSH->RecordByteSize += Symbol.length();378Globals.push_back(Symbol);379}380381// Serialize each public and write it.382static Error writePublics(BinaryStreamWriter &Writer,383ArrayRef<BulkPublic> Publics) {384std::vector<uint8_t> Storage;385for (const BulkPublic &Pub : Publics) {386Storage.resize(sizeOfPublic(Pub));387serializePublic(Storage.data(), Pub);388if (Error E = Writer.writeBytes(Storage))389return E;390}391return Error::success();392}393394static Error writeRecords(BinaryStreamWriter &Writer,395ArrayRef<CVSymbol> Records) {396BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);397ItemStream.setItems(Records);398BinaryStreamRef RecordsRef(ItemStream);399return Writer.writeStreamRef(RecordsRef);400}401402Error GSIStreamBuilder::commitSymbolRecordStream(403WritableBinaryStreamRef Stream) {404BinaryStreamWriter Writer(Stream);405406// Write public symbol records first, followed by global symbol records. This407// must match the order that we assume in finalizeMsfLayout when computing408// PSHZero and GSHZero.409if (auto EC = writePublics(Writer, Publics))410return EC;411if (auto EC = writeRecords(Writer, Globals))412return EC;413414return Error::success();415}416417static std::vector<support::ulittle32_t>418computeAddrMap(ArrayRef<BulkPublic> Publics) {419// Build a parallel vector of indices into the Publics vector, and sort it by420// address.421std::vector<ulittle32_t> PubAddrMap;422PubAddrMap.reserve(Publics.size());423for (int I = 0, E = Publics.size(); I < E; ++I)424PubAddrMap.push_back(ulittle32_t(I));425426auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {427const BulkPublic &L = Publics[LIdx];428const BulkPublic &R = Publics[RIdx];429if (L.Segment != R.Segment)430return L.Segment < R.Segment;431if (L.Offset != R.Offset)432return L.Offset < R.Offset;433// parallelSort is unstable, so we have to do name comparison to ensure434// that two names for the same location come out in a deterministic order.435return L.getName() < R.getName();436};437parallelSort(PubAddrMap, AddrCmp);438439// Rewrite the public symbol indices into symbol offsets.440for (ulittle32_t &Entry : PubAddrMap)441Entry = Publics[Entry].SymOffset;442return PubAddrMap;443}444445Error GSIStreamBuilder::commitPublicsHashStream(446WritableBinaryStreamRef Stream) {447BinaryStreamWriter Writer(Stream);448PublicsStreamHeader Header;449450// FIXME: Fill these in. They are for incremental linking.451Header.SymHash = PSH->calculateSerializedLength();452Header.AddrMap = Publics.size() * 4;453Header.NumThunks = 0;454Header.SizeOfThunk = 0;455Header.ISectThunkTable = 0;456memset(Header.Padding, 0, sizeof(Header.Padding));457Header.OffThunkTable = 0;458Header.NumSections = 0;459if (auto EC = Writer.writeObject(Header))460return EC;461462if (auto EC = PSH->commit(Writer))463return EC;464465std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);466assert(PubAddrMap.size() == Publics.size());467if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap)))468return EC;469470return Error::success();471}472473Error GSIStreamBuilder::commitGlobalsHashStream(474WritableBinaryStreamRef Stream) {475BinaryStreamWriter Writer(Stream);476return GSH->commit(Writer);477}478479Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,480WritableBinaryStreamRef Buffer) {481llvm::TimeTraceScope timeScope("Commit GSI stream");482auto GS = WritableMappedBlockStream::createIndexedStream(483Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());484auto PS = WritableMappedBlockStream::createIndexedStream(485Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());486auto PRS = WritableMappedBlockStream::createIndexedStream(487Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());488489if (auto EC = commitSymbolRecordStream(*PRS))490return EC;491if (auto EC = commitGlobalsHashStream(*GS))492return EC;493if (auto EC = commitPublicsHashStream(*PS))494return EC;495return Error::success();496}497498499