Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
emscripten-core
GitHub Repository: emscripten-core/emscripten
Path: blob/main/tools/extract_metadata.py
4128 views
1
# Copyright 2022 The Emscripten Authors. All rights reserved.
2
# Emscripten is available under two separate licenses, the MIT license and the
3
# University of Illinois/NCSA Open Source License. Both these licenses can be
4
# found in the LICENSE file.
5
6
import logging
7
from typing import List, Dict
8
9
from . import webassembly, utils
10
from .webassembly import OpCode, AtomicOpCode, MemoryOpCode
11
from .shared import exit_with_error
12
from .settings import settings
13
14
15
# Module-level logger; diagnostics from this file are emitted under the
# 'extract_metadata' logger name.
logger = logging.getLogger('extract_metadata')
16
17
18
def skip_function_header(module):
    """Advance `module` past the local declarations of a function body.

    Reads the number of local declaration entries, then consumes one
    (count, type) pair per entry.  Nothing is returned; only the reader
    position of `module` changes.
    """
    for _ in range(module.read_uleb()):
        module.read_uleb()  # how many locals share this declaration
        module.read_type()  # the type of those locals
24
25
26
def is_orig_main_wrapper(module, function):
    """Report whether `function` is just a thin forwarding wrapper.

    The body is scanned opcode by opcode.  It qualifies as a wrapper only if
    every instruction is one of: a `call` to a function taking no parameters,
    `local.get` / `local.set`, `return`, or `end`.  Any unrecognised byte or
    any other opcode makes the function return False immediately.

    Args:
      module: wasm module reader; its read position is moved.
      function: object exposing `offset` and `size` of the function body.

    Returns:
      True if the body matches the wrapper pattern, False otherwise.
    """
    # NOTE(review): the results of these two calls are discarded; presumably
    # they populate caches up front so that get_function_type() below does not
    # disturb the read position mid-scan -- confirm against the Module class.
    module.get_types()
    module.get_function_types()

    module.seek(function.offset)
    skip_function_header(module)
    body_end = function.offset + function.size

    local_access = (OpCode.LOCAL_GET, OpCode.LOCAL_SET)
    terminators = (OpCode.END, OpCode.RETURN)

    while module.tell() != body_end:
        raw = module.read_byte()
        try:
            op = OpCode(raw)
        except ValueError:
            # Not an opcode we know about, so certainly not a simple wrapper.
            return False

        if op == OpCode.CALL:
            target_type = module.get_function_type(module.read_uleb())
            if len(target_type.params) > 0:
                return False
        elif op in local_access:
            module.read_uleb()  # local index
        elif op not in terminators:
            # Any other opcodes and we assume this is not a simple wrapper
            return False

    # The last instruction decoded must be the function's closing `end`.
    assert op == OpCode.END
    return True
53
54
55
def get_const_expr_value(expr):
    """Evaluate a two-instruction constant expression.

    `expr` must be a sequence of exactly two `(opcode, immediates)` entries,
    the second of which is `end`.

    Returns:
      The single immediate of an `i32.const` / `i64.const`, or 0 when the
      expression is a `global.get`.

    Any other leading opcode is reported through exit_with_error().
    """
    assert len(expr) == 2
    head, tail = expr
    assert tail[0] == OpCode.END
    opcode, immediates = head

    if opcode in (OpCode.GLOBAL_GET,):
        # NOTE(review): the referenced global is not resolved here; 0 is
        # returned as a stand-in -- confirm callers are fine with that.
        return 0

    if opcode in (OpCode.I32_CONST, OpCode.I64_CONST):
        assert len(immediates) == 1
        return immediates[0]

    exit_with_error('unexpected opcode in const expr: ' + str(opcode))
66
67
68
def get_global_value(globl):
    """Return the constant value of a global's initializer expression."""
    init_expr = globl.init
    return get_const_expr_value(init_expr)
70
71
72
def parse_function_for_memory_inits(module, func_index, offset_map):
    """Record passive-segment load addresses found in a start function.

    Passive segments carry no offset of their own; instead (at least with
    llvm-generated code) the start function (`__wasm_init_memory`) places
    them using `memory.init` instructions.

    This is a deliberately minimal decoder for that function and it makes
    many assumptions about its layout.  For example, the first argument to
    `memory.init` is assumed to be either an `i32.const` or the result of an
    `i32.add`.

    Args:
      module: wasm module reader; its read position is moved.
      func_index: index of the function to scan.
      offset_map: dict updated in place, mapping segment -> unsigned offset.

    Any opcode outside the small supported set trips an assertion.
    """
    # Opcodes grouped by the shape of the immediates that follow them.
    no_immediates = (OpCode.END, OpCode.NOP, OpCode.DROP, OpCode.I32_ADD, OpCode.I64_ADD)
    block_like = (OpCode.BLOCK,)
    const_ops = (OpCode.I32_CONST, OpCode.I64_CONST)
    single_index = (OpCode.GLOBAL_SET, OpCode.BR, OpCode.GLOBAL_GET,
                    OpCode.LOCAL_SET, OpCode.LOCAL_GET, OpCode.LOCAL_TEE)
    atomic_two_ulebs = (AtomicOpCode.ATOMIC_I32_RMW_CMPXCHG, AtomicOpCode.ATOMIC_I32_STORE,
                        AtomicOpCode.ATOMIC_NOTIFY, AtomicOpCode.ATOMIC_WAIT32,
                        AtomicOpCode.ATOMIC_WAIT64)

    segments = module.get_segments()
    func = module.get_function(func_index)
    module.seek(func.offset)
    skip_function_header(module)
    body_end = func.offset + func.size

    seen_consts = []
    callees = []

    while module.tell() != body_end:
        opcode = OpCode(module.read_byte())

        if opcode in no_immediates:
            continue

        if opcode in block_like:
            module.read_type()
        elif opcode in const_ops:
            seen_consts.append(module.read_sleb())
        elif opcode in single_index:
            module.read_uleb()
        elif opcode == OpCode.CALL:
            callees.append(module.read_uleb())
        elif opcode == OpCode.MEMORY_PREFIX:
            opcode = MemoryOpCode(module.read_byte())
            if opcode == MemoryOpCode.MEMORY_INIT:
                segment = segments[module.read_uleb()]
                # The third-most-recent constant is taken as the destination
                # address (presumably followed by the source offset and the
                # size operands of memory.init).
                offset_map[segment] = to_unsigned(seen_consts[-3])
                memory = module.read_uleb()
                assert memory == 0
            elif opcode == MemoryOpCode.MEMORY_FILL:
                memory = module.read_uleb()
                assert memory == 0
            elif opcode == MemoryOpCode.MEMORY_DROP:
                module.read_uleb()  # segment index
            else:
                assert False, "unknown: %s" % opcode
        elif opcode == OpCode.ATOMIC_PREFIX:
            opcode = AtomicOpCode(module.read_byte())
            if opcode in atomic_two_ulebs:
                # Two uleb immediates follow each of these atomic ops.
                module.read_uleb()
                module.read_uleb()
            else:
                assert False, "unknown: %s" % opcode
        elif opcode == OpCode.BR_TABLE:
            for _ in range(module.read_uleb()):
                module.read_uleb()  # branch depth
            module.read_uleb()  # default target
        else:
            assert False, "unknown: %s" % opcode

    # Recursion is safe here because the layout of the wasm-ld-generated
    # start function has a specific structure and has at most one level
    # of call stack depth.
    for callee in callees:
        parse_function_for_memory_inits(module, callee, offset_map)
141
142
143
@webassembly.memoize
def get_passive_segment_offsets(module):
    """Build the segment -> offset map for the module's passive segments.

    The module's start function is scanned for `memory.init` instructions;
    the module must therefore have a start function.
    """
    start = module.get_start()
    assert start is not None
    offsets = {}
    parse_function_for_memory_inits(module, start, offsets)
    return offsets
150
151
152
def to_unsigned(val):
    """Reinterpret a negative value as an unsigned 32-bit integer.

    Negative inputs are masked to their low 32 bits; non-negative inputs are
    returned unchanged (whatever their magnitude).
    """
    return val & 0xFFFFFFFF if val < 0 else val
157
158
159
def find_segment_with_address(module, address):
    """Locate the data segment that contains `address`.

    Active segments (those with an init expression) are checked first, using
    the constant value of that expression as their offset.  If none of them
    covers `address` and the module has passive segments, the offsets derived
    from the start function are consulted instead.

    Args:
      module: wasm module reader.
      address: unsigned memory address to look up.

    Returns:
      A `(segment, offset_within_segment)` tuple.

    Raises:
      AssertionError: if no segment covers `address`.
    """
    segments = module.get_segments()

    for seg in segments:
        if not seg.init:
            continue
        value = get_const_expr_value(seg.init)
        # Check for a missing value *before* converting it: to_unsigned()
        # compares its argument against 0, so handing it None would raise
        # TypeError and a None test placed after the call could never run.
        if value is None:
            continue
        offset = to_unsigned(value)
        if offset <= address < offset + seg.size:
            return (seg, address - offset)

    if any(not seg.init for seg in segments):
        offset_map = get_passive_segment_offsets(module)
        for seg, offset in offset_map.items():
            if offset <= address < offset + seg.size:
                return (seg, address - offset)

    raise AssertionError('unable to find segment for address: %s' % address)
178
179
180
def data_to_string(data):
    """Decode UTF-8 bytes and collapse each double backslash into one.

    We have at least one test (test/core/test_utf8.c) that uses a double
    backslash in the C++ source code, in order to represent a single
    backslash.  This is because these strings historically were written and
    read back via JSON, where a single slash is interpreted as an escape
    char.  Technically this escaping is no longer needed and could be
    removed, but in order to maintain compatibility the double slashes are
    stripped out here.
    """
    return data.decode('utf8').replace('\\\\', '\\')
191
192
193
def get_section_strings(module, export_map, section_name):
    """Collect the NUL-separated strings stored in a named section.

    The section is delimited by the exported globals `__start_<name>` and
    `__stop_<name>`.  If either export is missing an empty dict is returned.

    Args:
      module: wasm module reader.
      export_map: dict mapping export name -> export entry.
      section_name: name used to build the start/stop symbol names.

    Returns:
      Dict mapping the memory address of each string to its decoded text.
    """
    start_name = f'__start_{section_name}'
    stop_name = f'__stop_{section_name}'
    if start_name not in export_map or stop_name not in export_map:
        logger.debug(f'no start/stop symbols found for section: {section_name}')
        return {}

    start = export_map[start_name]
    end = export_map[stop_name]
    start_global = module.get_global(start.index)
    end_global = module.get_global(end.index)
    start_addr = to_unsigned(get_global_value(start_global))
    end_addr = to_unsigned(get_global_value(end_global))

    seg = find_segment_with_address(module, start_addr)
    if not seg:
        exit_with_error(f'unable to find segment starting at __start_{section_name}: {start_addr}')
    seg, seg_offset = seg

    data = module.read_at(seg.offset, seg.size)
    size = end_addr - start_addr
    # Never scan past the bytes actually read: if the section claims to extend
    # beyond the segment data there is nothing left to decode, and searching
    # there could otherwise keep the loop from terminating.
    end = min(seg_offset + size, len(data))

    asm_strings = {}
    str_start = seg_offset
    while str_start < end:
        str_end = data.find(b'\0', str_start)
        if str_end == -1:
            # bytes.find() returns -1 when no terminator exists.  Used as-is
            # it would chop the last byte off the slice and reset str_start
            # to 0, restarting the scan.  Treat the string as running to the
            # end of the segment data instead.
            str_end = len(data)
        asm_strings[start_addr - seg_offset + str_start] = data_to_string(data[str_start:str_end])
        str_start = str_end + 1
    return asm_strings
222
223
224
def get_main_reads_params(module, export_map):
    """Decide whether the program's main function reads its parameters.

    Returns True unconditionally in STANDALONE_WASM mode.  Otherwise returns
    False when there is no exported function named `main` or
    `__main_argc_argv`, or when that function is a simple wrapper; in every
    other case parameters are assumed to be read.
    """
    if settings.STANDALONE_WASM:
        return True

    main = export_map.get('main') or export_map.get('__main_argc_argv')
    has_main_func = bool(main) and main.kind == webassembly.ExternType.FUNC
    if not has_main_func:
        return False

    # If main is a simple wrapper function then we know that __original_main
    # doesn't read arguments.  Otherwise, by default, assume params are read.
    return not is_orig_main_wrapper(module, module.get_function(main.index))
240
241
242
def get_global_exports(module, exports):
    """Map each exported global's name to its initial value as a string.

    Only entries of `exports` whose kind is GLOBAL are included; the value is
    the stringified result of evaluating the global's init expression.
    """
    return {
        entry.name: str(get_global_value(module.get_global(entry.index)))
        for entry in exports
        if entry.kind == webassembly.ExternType.GLOBAL
    }
249
250
251
def get_function_exports(module):
    """Map the name of every exported function to its function type."""
    return {
        export.name: module.get_function_type(export.index)
        for export in module.get_exports()
        if export.kind == webassembly.ExternType.FUNC
    }
257
258
259
def get_tag_exports(module):
    """List the names of all exports whose kind is TAG, in export order."""
    return [
        export.name
        for export in module.get_exports()
        if export.kind == webassembly.ExternType.TAG
    ]
265
266
267
def read_module_imports(module, metadata):
    """Populate the import-related fields of `metadata` from `module`.

    Resets and fills `metadata.imports`, `metadata.invoke_funcs` and
    `metadata.em_js_func_types`.  `metadata.em_js_funcs` must already be set,
    since it is used to recognise EM_JS function imports.

    Function imports whose field name starts with `invoke_` go to
    `invoke_funcs`; every other function import, plus global and tag
    imports, is recorded in `imports`.
    """
    known_em_js = metadata.em_js_funcs
    types = module.get_types()

    metadata.imports = import_names = []
    metadata.invoke_funcs = invokes = []
    metadata.em_js_func_types = em_js_types = {}

    non_func_kinds = (webassembly.ExternType.GLOBAL, webassembly.ExternType.TAG)

    for entry in module.get_imports():
        field = entry.field
        if entry.kind == webassembly.ExternType.FUNC:
            if field.startswith('invoke_'):
                invokes.append(field)
                continue
            if field in known_em_js:
                # Remember the signature of each imported EM_JS function.
                em_js_types[field] = types[entry.type]
            import_names.append(field)
        elif entry.kind in non_func_kinds:
            import_names.append(field)
285
286
287
def update_metadata(filename, metadata):
    """Refresh the export/import fields of `metadata` from the wasm file.

    Re-reads `filename` and overwrites `function_exports`, `tag_exports`,
    `all_exports` and the import-related fields; other fields of `metadata`
    are left as they are.
    """
    with webassembly.Module(filename) as wasm:
        metadata.function_exports = get_function_exports(wasm)
        metadata.tag_exports = get_tag_exports(wasm)

        # Export names are stored with any '__em_js__' prefix stripped.
        export_names = []
        for export in wasm.get_exports():
            export_names.append(utils.removeprefix(export.name, '__em_js__'))
        metadata.all_exports = export_names

        read_module_imports(wasm, metadata)
293
294
295
def get_string_at(module, address):
    """Read the NUL-terminated string stored at `address` in module data.

    Args:
      module: wasm module reader.
      address: unsigned memory address at which the string starts.

    Returns:
      The decoded string, without its terminator.

    Raises:
      AssertionError: if no data segment covers `address`.
    """
    seg, offset = find_segment_with_address(module, address)
    data = module.read_at(seg.offset, seg.size)
    str_end = data.find(b'\0', offset)
    if str_end == -1:
        # bytes.find() returns -1 when there is no terminator; slicing with
        # that value would silently drop the final byte.  Treat the string
        # as extending to the end of the segment data instead.
        str_end = len(data)
    return data_to_string(data[offset:str_end])
300
301
302
class Metadata:
    """Plain attribute container for information extracted from a wasm module.

    The annotations below only declare the expected fields; no values are
    set at construction time.  They are assigned by extract_metadata(),
    update_metadata() and read_module_imports().
    """

    # Names of imported functions (other than invoke_*), globals and tags.
    imports: List[str]
    # NOTE(review): never assigned in the code visible in this file --
    # possibly a leftover declaration; confirm before relying on it.
    export: List[str]
    # Address -> string, read from the 'em_asm' section.
    em_asm_consts: Dict[int, str]
    # Non-empty strings read from the 'em_lib_deps' section.
    js_deps: List[str]
    # EM_JS name (without the '__em_js__' prefix) -> string read from memory.
    em_js_funcs: Dict[str, str]
    # EM_JS import name -> function type.
    em_js_func_types: Dict[str, webassembly.FuncType]
    # '--enable-<feature>' flags for the target features marked as used.
    features: List[str]
    # Names of imported functions starting with 'invoke_'.
    invoke_funcs: List[str]
    # Whether main is assumed to read its parameters.
    main_reads_params: bool
    # Exported global name -> stringified initial value.
    global_exports: Dict[str, str]
    # Exported function name -> function type.
    function_exports: Dict[str, webassembly.FuncType]
    # Names of exported tags.
    tag_exports: List[str]
    # All export names, with any '__em_js__' prefix removed.
    all_exports: List[str]

    def __init__(self):
        """Create an empty container; fields are filled in by the callers."""
319
320
321
def extract_metadata(filename):
    """Open the wasm file `filename` and gather its metadata.

    Returns:
      A populated Metadata instance describing the module's exports,
      imports, EM_ASM / EM_JS strings, JS library dependencies, enabled
      target features and whether main reads its parameters.
    """
    # Feature flags whose command line spelling differs from the name used
    # in the target features section.
    flag_renames = (
        ('--enable-atomics', '--enable-threads'),
        ('--enable-simd128', '--enable-simd'),
        ('--enable-nontrapping-fptoint', '--enable-nontrapping-float-to-int'),
    )

    with webassembly.Module(filename) as module:
        exports = module.get_exports()
        export_map = {export.name: export for export in exports}

        # Each EM_JS function is exported as a global named '__em_js__<name>'
        # whose value is the address of a string in the module's data.
        em_js_funcs = {}
        for export in exports:
            if export.kind != webassembly.ExternType.GLOBAL:
                continue
            if not export.name.startswith('__em_js__'):
                continue
            name = utils.removeprefix(export.name, '__em_js__')
            string_address = to_unsigned(get_global_value(module.get_global(export.index)))
            em_js_funcs[name] = get_string_at(module, string_address)

        used = webassembly.TargetFeaturePrefix.USED
        features = [
            f'--enable-{feature}'
            for feature, prefix in module.get_target_features().items()
            if prefix == used
        ]
        for old, new in flag_renames:
            features = [flag.replace(old, new) for flag in features]

        metadata = Metadata()
        metadata.function_exports = get_function_exports(module)
        metadata.tag_exports = get_tag_exports(module)
        metadata.all_exports = [utils.removeprefix(export.name, '__em_js__') for export in exports]
        metadata.em_asm_consts = get_section_strings(module, export_map, 'em_asm')
        metadata.js_deps = [dep for dep in get_section_strings(module, export_map, 'em_lib_deps').values() if dep]
        metadata.em_js_funcs = em_js_funcs
        metadata.features = features
        # If main does not read its parameters, it will just be a stub that
        # calls __original_main (which has no parameters).
        metadata.main_reads_params = get_main_reads_params(module, export_map)
        metadata.global_exports = get_global_exports(module, exports)

        read_module_imports(module, metadata)

        return metadata
358
359