Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/lld/MachO/ExportTrie.cpp
34878 views
1
//===- ExportTrie.cpp -----------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This is a partial implementation of the Mach-O export trie format. It's
10
// essentially a symbol table encoded as a compressed prefix trie, meaning that
11
// the common prefixes of each symbol name are shared for a more compact
12
// representation. The prefixes are stored on the edges of the trie, and one
13
// edge can represent multiple characters. For example, given two exported
14
// symbols _bar and _baz, we will have a trie like this (terminal nodes are
15
// marked with an asterisk):
16
//
17
// +-+-+
18
// | | // root node
19
// +-+-+
20
// |
21
// | _ba
22
// |
23
// +-+-+
24
// | |
25
// +-+-+
26
// r / \ z
27
// / \
28
// +-+-+ +-+-+
29
// | * | | * |
30
// +-+-+ +-+-+
31
//
32
// More documentation of the format can be found in
33
// llvm/tools/obj2yaml/macho2yaml.cpp.
34
//
35
//===----------------------------------------------------------------------===//
36
37
#include "ExportTrie.h"
38
#include "Symbols.h"
39
40
#include "lld/Common/ErrorHandler.h"
41
#include "lld/Common/Memory.h"
42
#include "llvm/BinaryFormat/MachO.h"
43
#include "llvm/Support/LEB128.h"
44
#include <optional>
45
46
using namespace llvm;
47
using namespace lld;
48
using namespace lld::macho;
49
50
namespace {
51
52
struct Edge {
53
Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
54
55
StringRef substring;
56
struct TrieNode *child;
57
};
58
59
struct ExportInfo {
60
uint64_t address;
61
uint64_t ordinal = 0;
62
uint8_t flags = 0;
63
ExportInfo(const Symbol &sym, uint64_t imageBase)
64
: address(sym.getVA() - imageBase) {
65
using namespace llvm::MachO;
66
if (sym.isWeakDef())
67
flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68
if (sym.isTlv())
69
flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
70
// TODO: Add proper support for stub-and-resolver flags.
71
72
if (auto *defined = dyn_cast<Defined>(&sym)) {
73
if (defined->isAbsolute())
74
flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
75
} else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) {
76
flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;
77
if (!dysym->isDynamicLookup())
78
ordinal = dysym->getFile()->ordinal;
79
}
80
}
81
};
82
83
} // namespace
84
85
struct macho::TrieNode {
86
std::vector<Edge> edges;
87
std::optional<ExportInfo> info;
88
// Estimated offset from the start of the serialized trie to the current node.
89
// This will converge to the true offset when updateOffset() is run to a
90
// fixpoint.
91
size_t offset = 0;
92
93
uint32_t getTerminalSize() const;
94
// Returns whether the new estimated offset differs from the old one.
95
bool updateOffset(size_t &nextOffset);
96
void writeTo(uint8_t *buf) const;
97
};
98
99
// For regular symbols, the node layout (excluding the children) is
100
//
101
// uleb128 terminalSize;
102
// uleb128 flags;
103
// uleb128 address;
104
//
105
// For re-exported symbols, the layout is
106
//
107
// uleb128 terminalSize;
108
// uleb128 flags;
109
// uleb128 ordinal;
110
// char[] originalName;
111
//
112
// If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
113
// _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
114
// re-exports _bar directly (i.e. not via an alias), then originalName will be
115
// the empty string.
116
//
117
// TODO: Support aliased re-exports. (Since we don't yet support these,
118
// originalName will always be the empty string.)
119
//
120
// For stub-and-resolver nodes, the layout is
121
//
122
// uleb128 terminalSize;
123
// uleb128 flags;
124
// uleb128 stubAddress;
125
// uleb128 resolverAddress;
126
//
127
// TODO: Support stub-and-resolver nodes.
128
uint32_t TrieNode::getTerminalSize() const {
129
uint32_t size = getULEB128Size(info->flags);
130
if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)
131
size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator
132
else
133
size += getULEB128Size(info->address);
134
return size;
135
}
136
137
bool TrieNode::updateOffset(size_t &nextOffset) {
138
// Size of the whole node (including the terminalSize and the outgoing edges.)
139
// In contrast, terminalSize only records the size of the other data in the
140
// node.
141
size_t nodeSize;
142
if (info) {
143
uint32_t terminalSize = getTerminalSize();
144
// Overall node size so far is the uleb128 size of the length of the symbol
145
// info + the symbol info itself.
146
nodeSize = terminalSize + getULEB128Size(terminalSize);
147
} else {
148
nodeSize = 1; // Size of terminalSize (which has a value of 0)
149
}
150
// Compute size of all child edges.
151
++nodeSize; // Byte for number of children.
152
for (const Edge &edge : edges) {
153
nodeSize += edge.substring.size() + 1 // String length.
154
+ getULEB128Size(edge.child->offset); // Offset len.
155
}
156
// On input, 'nextOffset' is the new preferred location for this node.
157
bool result = (offset != nextOffset);
158
// Store new location in node object for use by parents.
159
offset = nextOffset;
160
nextOffset += nodeSize;
161
return result;
162
}
163
164
void TrieNode::writeTo(uint8_t *buf) const {
165
buf += offset;
166
if (info) {
167
uint32_t terminalSize = getTerminalSize();
168
buf += encodeULEB128(terminalSize, buf);
169
buf += encodeULEB128(info->flags, buf);
170
if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
171
buf += encodeULEB128(info->ordinal, buf);
172
*buf++ = 0; // empty originalName string
173
} else {
174
buf += encodeULEB128(info->address, buf);
175
}
176
} else {
177
// TrieNode with no Symbol info.
178
*buf++ = 0; // terminalSize
179
}
180
// Add number of children. TODO: Handle case where we have more than 256.
181
assert(edges.size() < 256);
182
*buf++ = edges.size();
183
// Append each child edge substring and node offset.
184
for (const Edge &edge : edges) {
185
memcpy(buf, edge.substring.data(), edge.substring.size());
186
buf += edge.substring.size();
187
*buf++ = '\0';
188
buf += encodeULEB128(edge.child->offset, buf);
189
}
190
}
191
192
TrieBuilder::~TrieBuilder() {
193
for (TrieNode *node : nodes)
194
delete node;
195
}
196
197
TrieNode *TrieBuilder::makeNode() {
198
auto *node = new TrieNode();
199
nodes.emplace_back(node);
200
return node;
201
}
202
203
static int charAt(const Symbol *sym, size_t pos) {
204
StringRef str = sym->getName();
205
if (pos >= str.size())
206
return -1;
207
return str[pos];
208
}
209
210
// Build the trie by performing a three-way radix quicksort: We start by sorting
211
// the strings by their first characters, then sort the strings with the same
212
// first characters by their second characters, and so on recursively. Each
213
// time the prefixes diverge, we add a node to the trie.
214
//
215
// node: The most recently created node along this path in the trie (i.e.
216
// the furthest from the root.)
217
// lastPos: The prefix length of the most recently created node, i.e. the number
218
// of characters along its path from the root.
219
// pos: The string index we are currently sorting on. Note that each symbol
220
// S contained in vec has the same prefix S[0...pos).
221
void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
222
TrieNode *node, size_t lastPos, size_t pos) {
223
tailcall:
224
if (vec.empty())
225
return;
226
227
// Partition items so that items in [0, i) are less than the pivot,
228
// [i, j) are the same as the pivot, and [j, vec.size()) are greater than
229
// the pivot.
230
const Symbol *pivotSymbol = vec[vec.size() / 2];
231
int pivot = charAt(pivotSymbol, pos);
232
size_t i = 0;
233
size_t j = vec.size();
234
for (size_t k = 0; k < j;) {
235
int c = charAt(vec[k], pos);
236
if (c < pivot)
237
std::swap(vec[i++], vec[k++]);
238
else if (c > pivot)
239
std::swap(vec[--j], vec[k]);
240
else
241
k++;
242
}
243
244
bool isTerminal = pivot == -1;
245
bool prefixesDiverge = i != 0 || j != vec.size();
246
if (lastPos != pos && (isTerminal || prefixesDiverge)) {
247
TrieNode *newNode = makeNode();
248
node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
249
newNode);
250
node = newNode;
251
lastPos = pos;
252
}
253
254
sortAndBuild(vec.slice(0, i), node, lastPos, pos);
255
sortAndBuild(vec.slice(j), node, lastPos, pos);
256
257
if (isTerminal) {
258
assert(j - i == 1); // no duplicate symbols
259
node->info = ExportInfo(*pivotSymbol, imageBase);
260
} else {
261
// This is the tail-call-optimized version of the following:
262
// sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
263
vec = vec.slice(i, j - i);
264
++pos;
265
goto tailcall;
266
}
267
}
268
269
size_t TrieBuilder::build() {
270
if (exported.empty())
271
return 0;
272
273
TrieNode *root = makeNode();
274
sortAndBuild(exported, root, 0, 0);
275
276
// Assign each node in the vector an offset in the trie stream, iterating
277
// until all uleb128 sizes have stabilized.
278
size_t offset;
279
bool more;
280
do {
281
offset = 0;
282
more = false;
283
for (TrieNode *node : nodes)
284
more |= node->updateOffset(offset);
285
} while (more);
286
287
return offset;
288
}
289
290
void TrieBuilder::writeTo(uint8_t *buf) const {
291
for (TrieNode *node : nodes)
292
node->writeTo(buf);
293
}
294
295
namespace {
296
297
// Parse a serialized trie and invoke a callback for each entry.
298
class TrieParser {
299
public:
300
TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
301
: start(buf), end(start + size), callback(callback) {}
302
303
void parse(const uint8_t *buf, const Twine &cumulativeString);
304
305
void parse() { parse(start, ""); }
306
307
const uint8_t *start;
308
const uint8_t *end;
309
const TrieEntryCallback &callback;
310
};
311
312
} // namespace
313
314
void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
315
if (buf >= end)
316
fatal("Node offset points outside export section");
317
318
unsigned ulebSize;
319
uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
320
buf += ulebSize;
321
uint64_t flags = 0;
322
size_t offset;
323
if (terminalSize != 0) {
324
flags = decodeULEB128(buf, &ulebSize);
325
callback(cumulativeString, flags);
326
}
327
buf += terminalSize;
328
uint8_t numEdges = *buf++;
329
for (uint8_t i = 0; i < numEdges; ++i) {
330
const char *cbuf = reinterpret_cast<const char *>(buf);
331
StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
332
buf += substring.size() + 1;
333
offset = decodeULEB128(buf, &ulebSize);
334
buf += ulebSize;
335
parse(start + offset, cumulativeString + substring);
336
}
337
}
338
339
void macho::parseTrie(const uint8_t *buf, size_t size,
340
const TrieEntryCallback &callback) {
341
if (size == 0)
342
return;
343
344
TrieParser(buf, size, callback).parse();
345
}
346
347