Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
emscripten-core
GitHub Repository: emscripten-core/emscripten
Path: blob/main/tools/empath-split.py
6175 views
1
#!/usr/bin/env python3
2
# Copyright 2025 The Emscripten Authors. All rights reserved.
3
# Emscripten is available under two separate licenses, the MIT license and the
4
# University of Illinois/NCSA Open Source License. Both these licenses can be
5
# found in the LICENSE file.
6
7
"""
8
Wrapper for 'wasm-split --multi-split' functionality. This script generates a
9
.manifest file based on the list of user source paths, using source map
10
information.
11
12
This assumes the name section exists in the input wasm file, and also assumes
13
the sourceMappingURL section exists in the input or a source map file is
14
separately supplied with --sourcemap. If we have two files a.c and b.c, to
15
generate a source map and the name section, if you compile and link within a
16
single command, you can do something like
17
$ emcc -g2 -gsource-map a.c b.c -o result.js
18
If you want to compile and link in separate commands, you can do
19
$ emcc -gsource-map a.c -o a.o
20
$ emcc -gsource-map b.c -o b.o
21
$ emcc -g2 -gsource-map a.o b.o -o result.js
22
See https://emscripten.org/docs/porting/Debugging.html for more details.
23
24
This takes a wasm file and a paths file as inputs. The paths file defines how
25
to split modules. The format is similar to the manifest file for wasm-split, but
26
with paths instead of function names. A module is defined by a name on a line,
27
followed by paths on subsequent lines. Modules are separated by empty lines.
28
Module names must be written with a trailing colon (:).
29
For example:
30
module1:
31
path/to/a
32
path/to/b
33
34
module2:
35
path/to/c
36
37
This will create two modules, 'module1' and 'module2'. 'module1' will contain
38
functions from source files under path/to/a and path/to/b. 'module2' will
39
contain functions from source files under path/to/c.
40
41
If a specified path contains another specified path, functions contained in the
42
inner path will be split as the inner path's module, and the rest of the
43
functions will be split as the outer path's module. Functions that do not belong
44
to any of the specified paths will remain in the primary module.
45
46
The paths in the paths file can be either absolute or relative, but they should
47
match those of 'sources' field in the source map file. Sometimes a source map's
48
'sources' field contains paths relative to a build directory, so source files
49
may be recorded as '../src/subdir/test.c', for example. In this case, if you
50
want to split the directory src/subdir, you should list it as ../src/subdir. You
51
can manually open the source map file and check 'sources' field, but we also
52
have an option to help with that. You can run
53
$ empath-split --print-sources test.wasm
54
or
55
$ empath-split --print-sources --sourcemap test.wasm.map
56
to print the list of sources in 'sources' field in the source map. Note that
57
emscripten's libraries' source files have /emsdk/emscripten prefix, which is a
58
fake deterministic prefix to produce reproducible builds across platforms.
59
"""
60
61
import argparse
62
import json
63
import os
64
import sys
65
import tempfile
66
from pathlib import PurePath
67
68
__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
69
__rootdir__ = os.path.dirname(__scriptdir__)
70
sys.path.insert(0, __rootdir__)
71
72
from tools import building, diagnostics, emsymbolizer, utils, webassembly
73
from tools.utils import exit_with_error
74
75
76
def _has_forwarded_option(forwarded_args, *names):
    # True if any of `names` occurs in the forwarded arguments, either as a
    # separate token ('--opt value') or fused with its value ('--opt=value').
    return any(arg == name or arg.startswith(name + '=')
               for arg in forwarded_args
               for name in names)


def parse_args():
    """Parse the command line of this script.

    Arguments that are not recognized here are collected separately so they can
    be forwarded verbatim to wasm-split.

    Returns:
        A tuple ``(args, forwarded_args)``: the argparse namespace with this
        script's own options, and the list of unrecognized arguments.

    Calls ``parser.error`` (which exits) when --manifest is forwarded, when
    --print-sources is given without a wasm file or --sourcemap, when a
    required positional argument is missing, or when no -o/--output is
    forwarded in splitting mode.
    """
    parser = argparse.ArgumentParser(
        description='Split a wasm file based on user paths',
        epilog="""
This is a wrapper for 'wasm-split --multi-split' functionality, so you should
add wasm-split's command line options as well. You should or may want to add
wasm-split options like -o (--output), --out-prefix, -g, and feature
enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
should NOT add --manifest, because this will be generated from this script.
""")
    parser.add_argument('wasm', nargs='?', help='Path to the input wasm file')
    parser.add_argument('paths_file', nargs='?', help='Path to the input file containing paths')
    parser.add_argument('-s', '--sourcemap', help='Force source map file')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print verbose info for debugging this script')
    parser.add_argument('--wasm-split', help='Path to wasm-split executable')
    parser.add_argument('--preserve-manifest', action='store_true',
                        help='Preserve generated manifest file. This sets --verbose too.')
    parser.add_argument('--print-sources', action='store_true',
                        help='Print the list of sources in the source map to help figure out splitting boundaries. Does NOT perform the splitting.')

    args, forwarded_args = parser.parse_known_args()
    if args.preserve_manifest:
        args.verbose = True
    if not args.wasm_split:
        args.wasm_split = utils.find_exe(building.get_binaryen_bin(), 'wasm-split')

    # Binaryen tools also accept the fused '--opt=value' spelling, so both
    # forms have to be recognized here.
    if _has_forwarded_option(forwarded_args, '--manifest'):
        parser.error('manifest file will be generated by this script and should not be given')

    if args.print_sources:
        # Printing sources needs only a source map, which can come either from
        # the wasm file or from --sourcemap; no paths file or output is needed.
        if not args.wasm and not args.sourcemap:
            parser.error('--print-sources requires either wasm or --sourcemap')
        return args, forwarded_args

    # Both positionals are declared optional (nargs='?') to support
    # --print-sources, so their presence is enforced manually for splitting.
    if not args.wasm and not args.paths_file:
        parser.error("the following arguments are required: wasm, paths_file")
    if not args.paths_file:
        parser.error("the following arguments are required: paths_file")
    if not _has_forwarded_option(forwarded_args, '-o', '--output'):
        parser.error('-o (--output) is required')
    return args, forwarded_args
118
119
120
def check_errors(args):
    """Validate the inputs before doing any real work.

    Checks that the wasm file and paths file (when given) exist, that the wasm
    file has a name section and, unless --sourcemap was given, a
    sourceMappingURL section, that the source map and the wasm-split
    executable exist, and that the source map is JSON containing the
    'version', 'sources' and 'mappings' fields.

    Args:
        args: The argparse namespace returned by parse_args().

    Every failure is reported through exit_with_error().
    """
    if args.wasm and not os.path.isfile(args.wasm):
        exit_with_error(f"'{args.wasm}' was not found or not a file")
    if args.paths_file and not os.path.isfile(args.paths_file):
        exit_with_error(f"'{args.paths_file}' was not found or not a file")

    # An explicitly given source map wins; otherwise it is taken from the wasm
    # file's sourceMappingURL section below.
    sourcemap = args.sourcemap

    if args.wasm:
        with webassembly.Module(args.wasm) as module:
            if not args.sourcemap:
                if not emsymbolizer.get_sourceMappingURL_section(module):
                    exit_with_error('sourceMappingURL section does not exist')
                sourcemap = module.get_sourceMappingURL()
            if not module.has_name_section():
                exit_with_error('Name section does not exist')

    if not os.path.isfile(sourcemap):
        exit_with_error(f"'{sourcemap}' was not found or not a file")
    if not os.path.isfile(args.wasm_split):
        exit_with_error(f"'{args.wasm_split}' was not found or not a file")

    # Check source map validity. Just perform simple checks to make sure
    # mandatory fields exist.
    try:
        with open(sourcemap) as f:
            source_map_data = json.load(f)
    except json.JSONDecodeError:
        # Report the file that was actually read; args.sourcemap is None when
        # the path was taken from the wasm file.
        exit_with_error(f'Invalid JSON format in file {sourcemap}')
    for field in ['version', 'sources', 'mappings']:
        if field not in source_map_data:
            exit_with_error(f"Field '{field}' is missing in the source map")
153
154
155
def get_sourceMappingURL(wasm, arg_sourcemap):
    """Return the source map to use.

    An explicitly supplied source map (arg_sourcemap) takes precedence;
    otherwise the value is read from the given wasm file.
    """
    if not arg_sourcemap:
        with webassembly.Module(wasm) as wasm_module:
            return wasm_module.get_sourceMappingURL()
    return arg_sourcemap
160
161
162
def print_sources(sourcemap):
    """Print every entry of the source map's 'sources' field, one per line."""
    with open(sourcemap) as map_file:
        map_data = json.load(map_file)
    source_list = map_data.get('sources')
    assert isinstance(source_list, list)
    for entry in source_list:
        print(entry)
168
169
170
def get_path_to_functions_map(wasm, sourcemap, paths):
    """Build a {path: list of function names} map for the given paths.

    Each function of the wasm file is attributed to one source file using the
    source map, and each source file is attributed to the innermost path in
    `paths` that equals it or contains it.

    Args:
        wasm: Path to the input wasm file.
        sourcemap: Path to the source map file.
        paths: Iterable of normalized user paths.

    Returns:
        A dict with one entry per path (possibly an empty list). A function is
        listed under at most one path; functions not under any path are
        omitted.
    """
    def is_synthesized_func(func):
        # TODO There can be more
        synthesized_names = [
            'main',
            '__wasm_call_ctors',
            '__clang_call_terminate',
        ]
        synthesized_prefixes = [
            'legalstub$',
            'legalfunc$',
            '__cxx_global_',
            '_GLOBAL__',
            'virtual thunk to ',
        ]
        if func in synthesized_names:
            return True
        return func.startswith(tuple(synthesized_prefixes))

    # Compute {func_name: src file} map, and invert it to get
    # {src file: list of functions} map, and construct {path: list of
    # functions} map from it
    func_to_src = {}
    with webassembly.Module(wasm) as module:
        funcs = module.get_functions()
        func_names = module.get_function_names()
        assert len(funcs) == len(func_names)

        sm = emsymbolizer.WasmSourceMap()
        sm.parse(sourcemap)

        for func_name, func in zip(func_names, funcs, strict=True):
            # From the last address, decrement the address by 1 until we find
            # location info with source file information. The reason we do
            # this is to reduce the probability of picking an address where
            # another function is inlined into, picking the inlined function's
            # source.
            # We start from the end because it is simpler; it is harder to
            # compute the first instruction's address, because there is a gap
            # for local types between function offset and the first
            # instruction.
            #
            # Reset for every function so that a scan that never runs cannot
            # reuse the location found for the previous function.
            loc = None
            addr = func.offset + func.size - 1
            while addr > func.offset:
                loc = sm.lookup(addr, func.offset)
                # This means there are no source map mappings for the entire
                # function (because we give func.offset as a lower bound).
                # Exit the loop.
                if not loc:
                    break
                # Exit the loop only if a location info with source file
                # information is found. If not, continue the search.
                if loc.source:
                    break
                addr -= 1

            if loc and loc.source:
                func_to_src[func_name] = utils.normalize_path(loc.source)
            elif not is_synthesized_func(func_name):
                diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")

    src_to_funcs = {}
    for func_name, src in func_to_src.items():
        src_to_funcs.setdefault(src, []).append(func_name)

    # Visit paths in the reverse sorting order, so that we can process inner
    # paths first.
    # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
    # functions contained in /a/b/c to it first and assign the remaining
    # functions to /a/b.
    visited_funcs = set()
    path_to_funcs = {}
    for path in sorted(paths, reverse=True):
        ppath = PurePath(path)
        path_to_funcs[path] = []
        for src, src_funcs in src_to_funcs.items():
            psrc = PurePath(src)
            if ppath == psrc or ppath in psrc.parents:
                for func_name in src_funcs:
                    if func_name not in visited_funcs:
                        visited_funcs.add(func_name)
                        path_to_funcs[path].append(func_name)
    return path_to_funcs
253
254
255
# Canonicalize a user-supplied path:
# 1. Strip surrounding whitespace
# 2. Normalize separators
# 3. Make /a/b/c and /a/b/c/ equivalent
def normalize_path(path):
    trimmed = path.strip()
    normalized = utils.normalize_path(trimmed)
    return normalized.rstrip(os.sep)
260
261
262
def parse_paths_file(paths_file_content):
    """Parse the contents of a paths file into a {module: paths} map.

    A module definition is a line holding the module name followed by a colon,
    followed by one path per line. Blank lines separate module definitions.

    Args:
        paths_file_content: The whole paths file as a single string.

    Returns:
        A dict mapping each module name to the list of its normalized paths.

    Reports an error through exit_with_error() when a module name does not end
    with a colon or is empty, when a path is listed more than once, or when no
    module is defined at all. Warns about modules without paths.
    """
    module_to_paths = {}
    path_to_module = {}
    cur_module = None
    cur_paths = []

    for line in paths_file_content.splitlines():
        line = line.strip()
        if not line:
            # A blank line terminates the module being defined, if any.
            if cur_module:
                if not cur_paths:
                    diagnostics.warn(f"Module '{cur_module}' has no paths specified.")
                module_to_paths[cur_module] = cur_paths
                cur_module = None
                cur_paths = []
            continue

        if not cur_module:
            # The first non-blank line of a definition is the module name.
            if line[-1] != ':':
                exit_with_error(f'Module name should end with a colon: {line}')
            if len(line) == 1:
                exit_with_error('Module name is empty')
            cur_module = line[:-1]
        else:
            path = normalize_path(line)
            if path in path_to_module:
                exit_with_error(f"Path '{path}' cannot be assigned to module '{cur_module}'; it is already assigned to module '{path_to_module[path]}'")
            cur_paths.append(path)
            path_to_module[path] = cur_module

    # The file may end without a trailing blank line; record the last module.
    if cur_module:
        if not cur_paths:
            diagnostics.warn(f"Module '{cur_module}' has no paths specified.")
        module_to_paths[cur_module] = cur_paths

    if not module_to_paths:
        exit_with_error('The paths file is empty or invalid.')

    return module_to_paths
301
302
303
def main():
    """Entry point: generate a wasm-split manifest from paths and run wasm-split.

    Parses and validates the command line, then either prints the source map's
    sources (--print-sources) or:
      1. parses the paths file into modules,
      2. maps every path to the functions defined under it,
      3. writes a temporary .manifest file listing each module's functions,
      4. runs 'wasm-split --multi-split' with that manifest plus the forwarded
         arguments.
    The manifest is deleted afterwards unless --preserve-manifest was given.
    """
    args, forwarded_args = parse_args()
    check_errors(args)

    sourcemap = get_sourceMappingURL(args.wasm, args.sourcemap)
    if args.print_sources:
        print_sources(sourcemap)
        return

    content = utils.read_file(args.paths_file)
    module_to_paths = parse_paths_file(content)

    # Compute {path: list of functions} map
    all_paths = [path for paths in module_to_paths.values() for path in paths]
    path_to_funcs = get_path_to_functions_map(args.wasm, sourcemap, all_paths)

    # Write .manifest file. delete=False because wasm-split has to open the
    # file by name after it has been closed here.
    f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False)
    manifest = f.name
    try:
        # The 'with' guarantees the handle is closed even when writing fails,
        # so the removal in 'finally' never acts on a still-open file.
        with f:
            for i, (module, paths) in enumerate(module_to_paths.items()):
                if i != 0:  # Unless we are the first entry add a newline separator
                    f.write('\n')
                funcs = []
                for path in paths:
                    if not path_to_funcs[path]:
                        diagnostics.warn(f'{path} does not match any functions')
                    funcs.extend(path_to_funcs[path])
                if not funcs:
                    diagnostics.warn(f"Module '{module}' does not match any functions")

                if args.verbose:
                    print(f'{module}: {len(funcs)} functions')
                    for path in paths:
                        print(f'  {path}: {len(path_to_funcs[path])} functions')
                        for func in path_to_funcs[path]:
                            print('    ' + func)
                    print()

                f.write(f'{module}:\n')
                for func in funcs:
                    f.write(func + '\n')

        cmd = [args.wasm_split, '--multi-split', args.wasm, '--manifest', manifest]
        if args.verbose:
            # This option is used both in this script and wasm-split
            cmd.append('-v')
        cmd += forwarded_args
        if args.verbose:
            print('\n' + ' '.join(cmd))
        utils.run_process(cmd)
    finally:
        if not args.preserve_manifest:
            os.remove(manifest)
361
362
363
if __name__ == '__main__':
    # main() returns None, so sys.exit() reports status 0 when it completes
    # normally.
    sys.exit(main())
365
366