Path: blob/main/contrib/llvm-project/lld/MachO/ExportTrie.cpp
34878 views
//===- ExportTrie.cpp -----------------------------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This is a partial implementation of the Mach-O export trie format. It's9// essentially a symbol table encoded as a compressed prefix trie, meaning that10// the common prefixes of each symbol name are shared for a more compact11// representation. The prefixes are stored on the edges of the trie, and one12// edge can represent multiple characters. For example, given two exported13// symbols _bar and _baz, we will have a trie like this (terminal nodes are14// marked with an asterisk):15//16// +-+-+17// | | // root node18// +-+-+19// |20// | _ba21// |22// +-+-+23// | |24// +-+-+25// r / \ z26// / \27// +-+-+ +-+-+28// | * | | * |29// +-+-+ +-+-+30//31// More documentation of the format can be found in32// llvm/tools/obj2yaml/macho2yaml.cpp.33//34//===----------------------------------------------------------------------===//3536#include "ExportTrie.h"37#include "Symbols.h"3839#include "lld/Common/ErrorHandler.h"40#include "lld/Common/Memory.h"41#include "llvm/BinaryFormat/MachO.h"42#include "llvm/Support/LEB128.h"43#include <optional>4445using namespace llvm;46using namespace lld;47using namespace lld::macho;4849namespace {5051struct Edge {52Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}5354StringRef substring;55struct TrieNode *child;56};5758struct ExportInfo {59uint64_t address;60uint64_t ordinal = 0;61uint8_t flags = 0;62ExportInfo(const Symbol &sym, uint64_t imageBase)63: address(sym.getVA() - imageBase) {64using namespace llvm::MachO;65if (sym.isWeakDef())66flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;67if (sym.isTlv())68flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;69// TODO: Add proper support for stub-and-resolver flags.7071if (auto *defined = dyn_cast<Defined>(&sym)) {72if (defined->isAbsolute())73flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;74} else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) {75flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;76if (!dysym->isDynamicLookup())77ordinal = dysym->getFile()->ordinal;78}79}80};8182} // namespace8384struct macho::TrieNode {85std::vector<Edge> edges;86std::optional<ExportInfo> info;87// Estimated offset from the start of the serialized trie to the current node.88// This will converge to the true offset when updateOffset() is run to a89// fixpoint.90size_t offset = 0;9192uint32_t getTerminalSize() const;93// Returns whether the new estimated offset differs from the old one.94bool updateOffset(size_t &nextOffset);95void writeTo(uint8_t *buf) const;96};9798// For regular symbols, the node layout (excluding the children) is99//100// uleb128 terminalSize;101// uleb128 flags;102// uleb128 address;103//104// For re-exported symbols, the layout is105//106// uleb128 terminalSize;107// uleb128 flags;108// uleb128 ordinal;109// char[] originalName;110//111// If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias112// _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo113// re-exports _bar directly (i.e. not via an alias), then originalName will be114// the empty string.115//116// TODO: Support aliased re-exports. (Since we don't yet support these,117// originalName will always be the empty string.)118//119// For stub-and-resolver nodes, the layout is120//121// uleb128 terminalSize;122// uleb128 flags;123// uleb128 stubAddress;124// uleb128 resolverAddress;125//126// TODO: Support stub-and-resolver nodes.127uint32_t TrieNode::getTerminalSize() const {128uint32_t size = getULEB128Size(info->flags);129if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)130size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator131else132size += getULEB128Size(info->address);133return size;134}135136bool TrieNode::updateOffset(size_t &nextOffset) {137// Size of the whole node (including the terminalSize and the outgoing edges.)138// In contrast, terminalSize only records the size of the other data in the139// node.140size_t nodeSize;141if (info) {142uint32_t terminalSize = getTerminalSize();143// Overall node size so far is the uleb128 size of the length of the symbol144// info + the symbol info itself.145nodeSize = terminalSize + getULEB128Size(terminalSize);146} else {147nodeSize = 1; // Size of terminalSize (which has a value of 0)148}149// Compute size of all child edges.150++nodeSize; // Byte for number of children.151for (const Edge &edge : edges) {152nodeSize += edge.substring.size() + 1 // String length.153+ getULEB128Size(edge.child->offset); // Offset len.154}155// On input, 'nextOffset' is the new preferred location for this node.156bool result = (offset != nextOffset);157// Store new location in node object for use by parents.158offset = nextOffset;159nextOffset += nodeSize;160return result;161}162163void TrieNode::writeTo(uint8_t *buf) const {164buf += offset;165if (info) {166uint32_t terminalSize = getTerminalSize();167buf += encodeULEB128(terminalSize, buf);168buf += encodeULEB128(info->flags, buf);169if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {170buf += encodeULEB128(info->ordinal, buf);171*buf++ = 0; // empty originalName string172} else {173buf += encodeULEB128(info->address, buf);174}175} else {176// TrieNode with no Symbol info.177*buf++ = 0; // terminalSize178}179// Add number of children. TODO: Handle case where we have more than 256.180assert(edges.size() < 256);181*buf++ = edges.size();182// Append each child edge substring and node offset.183for (const Edge &edge : edges) {184memcpy(buf, edge.substring.data(), edge.substring.size());185buf += edge.substring.size();186*buf++ = '\0';187buf += encodeULEB128(edge.child->offset, buf);188}189}190191TrieBuilder::~TrieBuilder() {192for (TrieNode *node : nodes)193delete node;194}195196TrieNode *TrieBuilder::makeNode() {197auto *node = new TrieNode();198nodes.emplace_back(node);199return node;200}201202static int charAt(const Symbol *sym, size_t pos) {203StringRef str = sym->getName();204if (pos >= str.size())205return -1;206return str[pos];207}208209// Build the trie by performing a three-way radix quicksort: We start by sorting210// the strings by their first characters, then sort the strings with the same211// first characters by their second characters, and so on recursively. Each212// time the prefixes diverge, we add a node to the trie.213//214// node: The most recently created node along this path in the trie (i.e.215// the furthest from the root.)216// lastPos: The prefix length of the most recently created node, i.e. the number217// of characters along its path from the root.218// pos: The string index we are currently sorting on. Note that each symbol219// S contained in vec has the same prefix S[0...pos).220void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,221TrieNode *node, size_t lastPos, size_t pos) {222tailcall:223if (vec.empty())224return;225226// Partition items so that items in [0, i) are less than the pivot,227// [i, j) are the same as the pivot, and [j, vec.size()) are greater than228// the pivot.229const Symbol *pivotSymbol = vec[vec.size() / 2];230int pivot = charAt(pivotSymbol, pos);231size_t i = 0;232size_t j = vec.size();233for (size_t k = 0; k < j;) {234int c = charAt(vec[k], pos);235if (c < pivot)236std::swap(vec[i++], vec[k++]);237else if (c > pivot)238std::swap(vec[--j], vec[k]);239else240k++;241}242243bool isTerminal = pivot == -1;244bool prefixesDiverge = i != 0 || j != vec.size();245if (lastPos != pos && (isTerminal || prefixesDiverge)) {246TrieNode *newNode = makeNode();247node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),248newNode);249node = newNode;250lastPos = pos;251}252253sortAndBuild(vec.slice(0, i), node, lastPos, pos);254sortAndBuild(vec.slice(j), node, lastPos, pos);255256if (isTerminal) {257assert(j - i == 1); // no duplicate symbols258node->info = ExportInfo(*pivotSymbol, imageBase);259} else {260// This is the tail-call-optimized version of the following:261// sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);262vec = vec.slice(i, j - i);263++pos;264goto tailcall;265}266}267268size_t TrieBuilder::build() {269if (exported.empty())270return 0;271272TrieNode *root = makeNode();273sortAndBuild(exported, root, 0, 0);274275// Assign each node in the vector an offset in the trie stream, iterating276// until all uleb128 sizes have stabilized.277size_t offset;278bool more;279do {280offset = 0;281more = false;282for (TrieNode *node : nodes)283more |= node->updateOffset(offset);284} while (more);285286return offset;287}288289void TrieBuilder::writeTo(uint8_t *buf) const {290for (TrieNode *node : nodes)291node->writeTo(buf);292}293294namespace {295296// Parse a serialized trie and invoke a callback for each entry.297class TrieParser {298public:299TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)300: start(buf), end(start + size), callback(callback) {}301302void parse(const uint8_t *buf, const Twine &cumulativeString);303304void parse() { parse(start, ""); }305306const uint8_t *start;307const uint8_t *end;308const TrieEntryCallback &callback;309};310311} // namespace312313void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {314if (buf >= end)315fatal("Node offset points outside export section");316317unsigned ulebSize;318uint64_t terminalSize = decodeULEB128(buf, &ulebSize);319buf += ulebSize;320uint64_t flags = 0;321size_t offset;322if (terminalSize != 0) {323flags = decodeULEB128(buf, &ulebSize);324callback(cumulativeString, flags);325}326buf += terminalSize;327uint8_t numEdges = *buf++;328for (uint8_t i = 0; i < numEdges; ++i) {329const char *cbuf = reinterpret_cast<const char *>(buf);330StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));331buf += substring.size() + 1;332offset = decodeULEB128(buf, &ulebSize);333buf += ulebSize;334parse(start + offset, cumulativeString + substring);335}336}337338void macho::parseTrie(const uint8_t *buf, size_t size,339const TrieEntryCallback &callback) {340if (size == 0)341return;342343TrieParser(buf, size, callback).parse();344}345346347