Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
emscripten-core
GitHub Repository: emscripten-core/emscripten
Path: blob/main/tools/extract_metadata.py
6173 views
1
# Copyright 2022 The Emscripten Authors. All rights reserved.
2
# Emscripten is available under two separate licenses, the MIT license and the
3
# University of Illinois/NCSA Open Source License. Both these licenses can be
4
# found in the LICENSE file.
5
6
import logging
7
from dataclasses import dataclass
8
9
from . import webassembly
10
from .settings import settings
11
from .utils import exit_with_error
12
from .webassembly import AtomicOpCode, MemoryOpCode, OpCode
13
14
logger = logging.getLogger('extract_metadata')
15
16
17
def skip_function_header(module):
    """Advance `module` past the local declarations that open a function body.

    Reads the number of local declarations, then one (count, type) pair per
    declaration.  All values read are discarded.
    """
    for _ in range(module.read_uleb()):
        module.read_uleb()  # how many locals this declaration covers
        module.read_type()  # the type of those locals
23
24
25
def is_orig_main_wrapper(module, function):
    """Return True if the body of `function` contains only wrapper-style opcodes.

    Every instruction must be a `call` to a function taking no parameters, a
    `local.get`/`local.set`, a `return` or an `end`.  Any other instruction,
    or a byte that is not a valid `OpCode`, makes this return False.
    """
    # The results are unused here.
    # NOTE(review): presumably these calls warm caches before we seek into
    # the function body - confirm against the webassembly module.
    module.get_types()
    module.get_function_types()

    module.seek(function.offset)
    skip_function_header(module)
    body_end = function.offset + function.size

    while module.tell() != body_end:
        raw = module.read_byte()
        try:
            opcode = OpCode(raw)
        except ValueError:
            return False

        if opcode == OpCode.CALL:
            callee_type = module.get_function_type(module.read_uleb())
            if len(callee_type.params) > 0:
                return False
        elif opcode in (OpCode.LOCAL_GET, OpCode.LOCAL_SET):
            module.read_uleb()  # local index
        elif opcode not in (OpCode.END, OpCode.RETURN):
            # Any other opcodes and we assume this is not a simple wrapper
            return False

    # The last instruction read must be the function's closing `end`.
    assert opcode == OpCode.END
    return True
53
54
55
def get_const_expr_value(expr):
    """Evaluate a constant expression made of one instruction followed by `end`.

    Each entry of `expr` is an `(opcode, immediates)` pair.  An `i32.const` or
    `i64.const` yields its single immediate and a `global.get` yields 0.  Any
    other opcode is reported through `exit_with_error`.
    """
    assert len(expr) == 2
    first, last = expr
    assert last[0] == OpCode.END

    opcode, immediates = first
    if opcode in (OpCode.I32_CONST, OpCode.I64_CONST):
        assert len(immediates) == 1
        return immediates[0]
    if opcode == OpCode.GLOBAL_GET:
        # NOTE(review): presumably the global's value is not known here, so
        # 0 is used as a placeholder - confirm.
        return 0
    exit_with_error('unexpected opcode in const expr: %s', opcode)
67
68
69
def get_global_value(globl):
    """Return the value of the constant expression in `globl.init`."""
    init_expr = globl.init
    return get_const_expr_value(init_expr)
71
72
73
def parse_function_for_memory_inits(module, func_index, offset_map):
74
"""Very limited function parser that uses `memory.init` instructions
75
to derive segment offset.
76
77
When segments are passive they don't have an offset but (at least with
78
llvm-generated code) are loaded during the start function
79
(`__wasm_init_memory`) using `memory.init` instructions.
80
81
Here we parse the `__wasm_init_memory` function and make many assumptions
82
about its layout. For example, we assume the first argument to `memory.init`
83
is either an `i32.const` or the result of an `i32.add`.
84
"""
85
segments = module.get_segments()
86
func = module.get_function(func_index)
87
module.seek(func.offset)
88
skip_function_header(module)
89
end = func.offset + func.size
90
const_values = []
91
call_targets = []
92
while module.tell() != end:
93
opcode = OpCode(module.read_byte())
94
match opcode:
95
case OpCode.END | OpCode.NOP | OpCode.DROP | OpCode.I32_ADD | OpCode.I64_ADD:
96
pass
97
case OpCode.BLOCK:
98
module.read_type()
99
case OpCode.I32_CONST | OpCode.I64_CONST:
100
const_values.append(module.read_sleb())
101
case OpCode.GLOBAL_SET | OpCode.BR | OpCode.GLOBAL_GET | OpCode.LOCAL_SET | OpCode.LOCAL_GET | OpCode.LOCAL_TEE:
102
module.read_uleb()
103
case OpCode.CALL:
104
call_targets.append(module.read_uleb())
105
case OpCode.MEMORY_PREFIX:
106
opcode = MemoryOpCode(module.read_byte())
107
match opcode:
108
case MemoryOpCode.MEMORY_INIT:
109
segment_idx = module.read_uleb()
110
segment = segments[segment_idx]
111
offset = to_unsigned(const_values[-3])
112
offset_map[segment] = offset
113
memory = module.read_uleb()
114
assert memory == 0
115
case MemoryOpCode.MEMORY_FILL:
116
memory = module.read_uleb() # noqa
117
assert memory == 0
118
case MemoryOpCode.MEMORY_DROP:
119
segment = module.read_uleb() # noqa
120
case _:
121
assert False, "unknown: %s" % opcode
122
case OpCode.ATOMIC_PREFIX:
123
opcode = AtomicOpCode(module.read_byte())
124
if opcode in (AtomicOpCode.ATOMIC_I32_RMW_CMPXCHG, AtomicOpCode.ATOMIC_I32_STORE,
125
AtomicOpCode.ATOMIC_NOTIFY, AtomicOpCode.ATOMIC_WAIT32,
126
AtomicOpCode.ATOMIC_WAIT64):
127
module.read_uleb()
128
module.read_uleb()
129
else:
130
assert False, "unknown: %s" % opcode
131
case OpCode.BR_TABLE:
132
count = module.read_uleb()
133
for _ in range(count):
134
depth = module.read_uleb() # noqa
135
default = module.read_uleb() # noqa
136
case _:
137
assert False, "unknown: %s" % opcode
138
139
# Recursion is safe here because the layout of the wasm-ld-generated
140
# start function has a specific structure and has at most on level
141
# of call stack depth.
142
for t in call_targets:
143
parse_function_for_memory_inits(module, t, offset_map)
144
145
146
@webassembly.memoize
def get_passive_segment_offsets(module):
    """Return a segment -> offset map derived from the module's start function.

    The start function is parsed for `memory.init` instructions.  The module
    is asserted to have a start function.
    """
    start = module.get_start()
    assert start is not None
    offsets = {}
    parse_function_for_memory_inits(module, start, offsets)
    return offsets
153
154
155
def to_unsigned(val):
    """Map a negative `val` onto its unsigned 32-bit representation.

    Non-negative values are returned unchanged.
    """
    return val & 0xFFFFFFFF if val < 0 else val
160
161
162
def find_segment_with_address(module, address):
    """Find the data segment that contains `address`.

    Segments that have an `init` expression are checked first, using the value
    of that expression as the segment's base.  If nothing matches and the
    module has segments without `init`, the offsets recovered by
    `get_passive_segment_offsets` are checked next.

    Returns:
      A `(segment, offset_within_segment)` tuple.

    Raises:
      AssertionError: if no segment covers `address`.
    """
    segments = module.get_segments()

    for seg in segments:
        if not seg.init:
            continue
        base = to_unsigned(get_const_expr_value(seg.init))
        if base is None:
            continue
        if base <= address < base + seg.size:
            return (seg, address - base)

    has_passive = any(not s.init for s in segments)
    if has_passive:
        for seg, base in get_passive_segment_offsets(module).items():
            if base <= address < base + seg.size:
                return (seg, address - base)

    raise AssertionError('unable to find segment for address: %s' % address)
181
182
183
def data_to_string(data):
    """Decode UTF-8 bytes and collapse each double backslash into a single one."""
    # At least one test (test/core/test_utf8.c) writes a double backslash in
    # the C++ source to stand for a single backslash.  Historically these
    # strings were written and read back via JSON, where a single slash is an
    # escape character.  The escaping is technically no longer needed, but the
    # double slashes are still stripped here to maintain compatibility.
    return data.decode('utf8').replace('\\\\', '\\')
194
195
196
def get_section_strings(module, export_map, section_name):
    """Collect the NUL-terminated strings stored in a named section.

    The section bounds come from the exported `__start_<section_name>` and
    `__stop_<section_name>` globals.

    Args:
      module: the module to read from.
      export_map: dict mapping export name to export.
      section_name: name of the section, without the `__start_`/`__stop_` prefix.

    Returns:
      A dict mapping the address of each string to its decoded text.  Empty
      if either bounding symbol is not exported.
    """
    start_name = f'__start_{section_name}'
    stop_name = f'__stop_{section_name}'
    if start_name not in export_map or stop_name not in export_map:
        logger.debug(f'no start/stop symbols found for section: {section_name}')
        return {}

    start_global = module.get_global(export_map[start_name].index)
    stop_global = module.get_global(export_map[stop_name].index)
    start_addr = to_unsigned(get_global_value(start_global))
    end_addr = to_unsigned(get_global_value(stop_global))

    seg = find_segment_with_address(module, start_addr)
    if not seg:
        exit_with_error(f'unable to find segment starting at __start_{section_name}: {start_addr}')
    seg, seg_offset = seg

    asm_strings = {}
    data = module.read_at(seg.offset, seg.size)
    # Address corresponding to index 0 of `data`.
    seg_base = start_addr - seg_offset
    str_start = seg_offset
    section_end = seg_offset + (end_addr - start_addr)
    while str_start < section_end:
        str_end = data.find(b'\0', str_start)
        if str_end == -1:
            # No terminator left in the segment.  Without this check the -1
            # would drop the final byte and reset `str_start` to 0, looping
            # forever.  Treat the rest of the segment as the last string.
            if str_start < len(data):
                asm_strings[seg_base + str_start] = data_to_string(data[str_start:])
            break
        asm_strings[seg_base + str_start] = data_to_string(data[str_start:str_end])
        str_start = str_end + 1
    return asm_strings
225
226
227
def get_main_reads_params(module, export_map):
    """Decide whether `main` should be assumed to read its parameters.

    Always True when `settings.STANDALONE_WASM` is set.  False when neither
    `main` nor `__main_argc_argv` is exported as a function, or when the
    exported function is a simple wrapper.  True otherwise.
    """
    if settings.STANDALONE_WASM:
        return True

    main = export_map.get('main') or export_map.get('__main_argc_argv')
    if not main or main.kind != webassembly.ExternType.FUNC:
        return False

    # If main is a simple wrapper function then we know that __original_main
    # doesn't read arguments.  In every other case assume params are read.
    main_func = module.get_function(main.index)
    return not is_orig_main_wrapper(module, main_func)
243
244
245
def get_function_exports(module):
    """Return a dict mapping each exported function's name to its function type."""
    return {
        export.name: module.get_function_type(export.index)
        for export in module.get_exports()
        if export.kind == webassembly.ExternType.FUNC
    }
251
252
253
def get_other_exports(module):
    """List every non-function export as an `(export, global_or_None)` pair.

    For global exports the second element is the global looked up by the
    export's index; for all other kinds it is None.
    """
    others = []
    for export in module.get_exports():
        if export.kind == webassembly.ExternType.FUNC:
            continue
        if export.kind == webassembly.ExternType.GLOBAL:
            globl = module.get_global(export.index)
        else:
            globl = None
        others.append((export, globl))
    return others
261
262
263
def read_module_imports(module, metadata):
    """Populate the import-related fields of `metadata` from `module`.

    Reads `metadata.em_js_funcs` and replaces `metadata.imports`,
    `metadata.invoke_funcs` and `metadata.em_js_func_types`:

    - imported functions whose name starts with `invoke_` go to `invoke_funcs`;
    - every other imported function, global or tag goes to `imports`;
    - imported functions named in `em_js_funcs` also get their type recorded
      in `em_js_func_types`.
    """
    em_js_funcs = metadata.em_js_funcs
    types = module.get_types()

    imports = []
    invoke_funcs = []
    em_js_func_types = {}
    metadata.imports = imports
    metadata.invoke_funcs = invoke_funcs
    metadata.em_js_func_types = em_js_func_types

    for imp in module.get_imports():
        if imp.kind == webassembly.ExternType.FUNC:
            if imp.field.startswith('invoke_'):
                invoke_funcs.append(imp.field)
                continue
            if imp.field in em_js_funcs:
                em_js_func_types[imp.field] = types[imp.type]
            imports.append(imp.field)
        elif imp.kind in (webassembly.ExternType.GLOBAL, webassembly.ExternType.TAG):
            imports.append(imp.field)
281
282
283
def update_metadata(filename, metadata):
    """Refresh the export and import fields of `metadata` from the file `filename`.

    Updates `function_exports`, `other_exports` and `all_exports`, then
    re-reads the module's imports into `metadata`.
    """
    with webassembly.Module(filename) as wasm:
        metadata.function_exports = get_function_exports(wasm)
        metadata.other_exports = get_other_exports(wasm)
        exported_names = [export.name.removeprefix('__em_js__') for export in wasm.get_exports()]
        metadata.all_exports = exported_names
        read_module_imports(wasm, metadata)
289
290
291
def get_string_at(module, address):
    """Return the NUL-terminated string stored at `address`.

    The segment containing `address` is located and read, and the bytes from
    `address` up to the next NUL byte are decoded with `data_to_string`.  If
    the segment has no NUL after `address`, the string runs to the end of the
    segment.

    Raises:
      AssertionError: if no segment covers `address` (raised by
        `find_segment_with_address`).
    """
    seg, offset = find_segment_with_address(module, address)
    data = module.read_at(seg.offset, seg.size)
    str_end = data.find(b'\0', offset)
    if str_end == -1:
        # `find` returns -1 when there is no terminator; using that as a
        # slice bound would silently drop the last byte of the string.
        str_end = len(data)
    return data_to_string(data[offset:str_end])
296
297
298
@dataclass(init=False)
class Metadata:
    # Container for the information extracted from a wasm file.  No __init__
    # is generated (`init=False`); the fields are assigned one by one after
    # construction by `extract_metadata` and `read_module_imports`.

    # Names of imported globals and tags, plus imported functions whose name
    # does not start with `invoke_`.
    imports: list[str]
    # NOTE(review): never assigned anywhere in this file - confirm whether
    # this field is still needed.
    export: list[str]
    # Address -> string, read from the `em_asm` section.
    em_asm_consts: dict[int, str]
    # Non-empty strings read from the `em_lib_deps` section.
    js_deps: list[str]
    # Name (with the `__em_js__` prefix removed) -> string read at the address
    # held by the exported `__em_js__<name>` global.
    em_js_funcs: dict[str, str]
    # Type of each imported function whose name appears in `em_js_funcs`.
    em_js_func_types: dict[str, webassembly.FuncType]
    # `--enable-*` flags, one per target feature marked as used.
    features: list[str]
    # Names of imported functions starting with `invoke_`.
    invoke_funcs: list[str]
    # Result of `get_main_reads_params`.
    main_reads_params: bool
    # Exported function name -> function type.
    function_exports: dict[str, webassembly.FuncType]
    # NOTE(review): `get_other_exports` produces (export, global-or-None)
    # tuples, which does not match this annotation - confirm.
    other_exports: list[webassembly.Export]
    # Names of all exports, with any `__em_js__` prefix removed.
    all_exports: list[str]
312
313
314
def extract_metadata(filename):
    """Read the wasm file `filename` and return a populated `Metadata` object."""
    em_js_funcs = {}

    with webassembly.Module(filename) as module:
        exports = module.get_exports()
        export_map = {export.name: export for export in exports}

        # Each exported `__em_js__<name>` global holds the address of the
        # string recorded for `<name>`.
        for export in exports:
            if export.kind != webassembly.ExternType.GLOBAL:
                continue
            if not export.name.startswith('__em_js__'):
                continue
            name = export.name.removeprefix('__em_js__')
            globl = module.get_global(export.index)
            string_address = to_unsigned(get_global_value(globl))
            em_js_funcs[name] = get_string_at(module, string_address)

        # Turn the used target features into `--enable-*` flags, renaming the
        # ones whose flag name differs from the feature name.
        renames = (
            ('--enable-atomics', '--enable-threads'),
            ('--enable-simd128', '--enable-simd'),
            ('--enable-nontrapping-fptoint', '--enable-nontrapping-float-to-int'),
        )
        features = []
        for feature, used in module.get_target_features().items():
            if used != webassembly.TargetFeaturePrefix.USED:
                continue
            flag = f'--enable-{feature}'
            for old, new in renames:
                flag = flag.replace(old, new)
            features.append(flag)

        metadata = Metadata()
        metadata.function_exports = get_function_exports(module)
        metadata.other_exports = get_other_exports(module)
        metadata.all_exports = [export.name.removeprefix('__em_js__') for export in exports]
        metadata.em_asm_consts = get_section_strings(module, export_map, 'em_asm')
        lib_deps = get_section_strings(module, export_map, 'em_lib_deps')
        metadata.js_deps = [dep for dep in lib_deps.values() if dep]
        metadata.em_js_funcs = em_js_funcs
        metadata.features = features
        # If main does not read its parameters, it will just be a stub that
        # calls __original_main (which has no parameters).
        metadata.main_reads_params = get_main_reads_params(module, export_map)

        read_module_imports(module, metadata)

        return metadata
350
351