"""Utility tools that extracts DWARF information encoded in a wasm output
produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
it can collect original sources, change files prefixes, and strip debug
sections from a wasm file.
"""
import argparse
import json
import logging
import os
import re
import sys
from math import floor, log
__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
__rootdir__ = os.path.dirname(__scriptdir__)
sys.path.insert(0, __rootdir__)
from tools import shared, utils
from tools.system_libs import DETERMINISTIC_PREFIX
LLVM_CXXFILT = shared.llvm_tool_path('llvm-cxxfilt')
EMSCRIPTEN_PREFIX = utils.normalize_path(utils.path_from_root())
logger = logging.getLogger('wasm-sourcemap')
generate_scopes = False
def parse_args(args):
parser = argparse.ArgumentParser(prog='wasm-sourcemap.py', description=__doc__)
parser.add_argument('wasm', help='wasm file')
parser.add_argument('-o', '--output', help='output source map')
  parser.add_argument('-p', '--prefix', nargs='*', help='replace source filename prefixes in the emitted source map (use PREFIX=REPLACEMENT, or just PREFIX to strip it)', default=[])
parser.add_argument('-s', '--sources', action='store_true', help='read and embed source files from file system into source map')
  parser.add_argument('-l', '--load-prefix', nargs='*', help='replace source filename prefixes when reading sources from the file system (see also --sources)', default=[])
parser.add_argument('-w', nargs='?', help='set output wasm file')
  parser.add_argument('-x', '--strip', action='store_true', help='remove debug and linking sections')
  parser.add_argument('-u', '--source-map-url', nargs='?', help='specify the sourceMappingURL section content')
parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable")
parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS)
  parser.add_argument('--basepath', help='base path for source files; emitted source paths are made relative to it')
return parser.parse_args(args)
class Prefixes:
def __init__(self, args, base_path=None, preserve_deterministic_prefix=True):
prefixes = []
for p in args:
if '=' in p:
        prefix, replacement = p.split('=', 1)
prefixes.append({'prefix': utils.normalize_path(prefix), 'replacement': replacement})
else:
prefixes.append({'prefix': utils.normalize_path(p), 'replacement': ''})
self.base_path = utils.normalize_path(base_path) if base_path is not None else None
self.preserve_deterministic_prefix = preserve_deterministic_prefix
self.prefixes = prefixes
self.cache = {}
def resolve(self, name):
if name in self.cache:
return self.cache[name]
source = name
if not self.preserve_deterministic_prefix and name.startswith(DETERMINISTIC_PREFIX):
source = EMSCRIPTEN_PREFIX + name.removeprefix(DETERMINISTIC_PREFIX)
provided = False
for p in self.prefixes:
if source.startswith(p['prefix']):
source = p['replacement'] + source.removeprefix(p['prefix'])
provided = True
break
if not (source.startswith(DETERMINISTIC_PREFIX) or provided or self.base_path is None):
try:
source = os.path.relpath(source, self.base_path)
except ValueError:
source = os.path.abspath(source)
source = utils.normalize_path(source)
self.cache[name] = source
return source
class SourceMapPrefixes:
def __init__(self, sources, load, base_path):
self.sources = Prefixes(sources, base_path=base_path)
self.load = Prefixes(load, preserve_deterministic_prefix=False)
def encode_vlq(n):
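  # Source-map base64 VLQ: the value is shifted left one bit with the sign in
  # the low bit, then emitted 5 bits at a time (least-significant first) with
  # bit 6 used as a continuation flag. For example, encode_vlq(16) == 'gB' and
  # encode_vlq(-1) == 'D'.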
VLQ_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
x = (n << 1) if n >= 0 else ((-n << 1) + 1)
result = ""
while x > 31:
result = result + VLQ_CHARS[32 + (x & 31)]
x = x >> 5
return result + VLQ_CHARS[x]
def read_var_uint(wasm, pos):
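  # Decode an unsigned LEB128 (varuint) starting at `pos`; returns the value
  # and the offset of the first byte after it.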
n = 0
shift = 0
b = ord(wasm[pos:pos + 1])
pos = pos + 1
while b >= 128:
n = n | ((b - 128) << shift)
b = ord(wasm[pos:pos + 1])
pos = pos + 1
shift += 7
return n + (b << shift), pos
def strip_debug_sections(wasm):
logger.debug('Strip debug sections')
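  # Copy the 8-byte module header (4-byte magic + 4-byte version) verbatim,
  # then re-emit every section except debug- and linking-related custom sections.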
pos = 8
stripped = wasm[:pos]
while pos < len(wasm):
section_start = pos
section_id, pos_ = read_var_uint(wasm, pos)
section_size, section_body = read_var_uint(wasm, pos_)
pos = section_body + section_size
if section_id == 0:
name_len, name_pos = read_var_uint(wasm, section_body)
name_end = name_pos + name_len
      name = wasm[name_pos:name_end].decode('utf-8')
      if name in {'linking', 'sourceMappingURL'} or name.startswith(('reloc..debug_', '.debug_')):
        continue  # skip debug- and linking-related custom sections
stripped = stripped + wasm[section_start:pos]
return stripped
def encode_uint_var(n):
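  # Encode n as an unsigned LEB128 (varuint), e.g. encode_uint_var(300) == b'\xac\x02'.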
result = bytearray()
while n > 127:
result.append(128 | (n & 127))
n = n >> 7
result.append(n)
return bytes(result)
def append_source_mapping(wasm, url):
logger.debug('Append sourceMappingURL section')
section_name = "sourceMappingURL"
section_content = encode_uint_var(len(section_name)) + section_name.encode() + encode_uint_var(len(url)) + url.encode()
return wasm + encode_uint_var(0) + encode_uint_var(len(section_content)) + section_content
def get_code_section_offset(wasm):
logger.debug('Read sections index')
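  # The DWARF line-table addresses are offset by the start of the code section
  # (id 10) payload, so the source map refers to byte offsets in the wasm file.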
pos = 8
while pos < len(wasm):
section_id, pos_ = read_var_uint(wasm, pos)
section_size, pos = read_var_uint(wasm, pos_)
if section_id == 10:
return pos
pos = pos + section_size
def remove_dead_entries(entries):
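  # Heuristic: line entries belonging to functions removed by the linker end up
  # with addresses at or near zero; a live function body cannot start closer to
  # the code section start than its own LEB128 size field, so such blocks are dropped.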
block_start = 0
cur_entry = 0
while cur_entry < len(entries):
if not entries[cur_entry]['eos']:
cur_entry += 1
continue
fn_start = entries[block_start]['address']
fn_size_length = floor(log(entries[cur_entry]['address'] - fn_start + 1, 128)) + 1
min_live_offset = 1 + fn_size_length
if fn_start < min_live_offset:
del entries[block_start:cur_entry + 1]
cur_entry = block_start
continue
cur_entry += 1
block_start = cur_entry
def decode_octal_encoded_utf8(s):
  # llvm-dwarfdump prints non-ASCII bytes as octal escapes (e.g. "\303\251");
  # fold them back into bytes and decode the result as UTF-8.
  out = bytearray(len(s))
  i = 0
  o = 0
  final_length = len(s)
  in_escape = False
  while i < len(s):
    if not in_escape and s[i] == '\\' and (s[i + 1] == '2' or s[i + 1] == '3'):
      out[o] = int(s[i + 1:i + 4], 8)
      i += 4
      final_length -= 3
      in_escape = False
    else:
      out[o] = ord(s[i])
      in_escape = False if in_escape else (s[i] == '\\')
      i += 1
    o += 1
  return out[:final_length].decode('utf-8')
def extract_comp_dir_map(text):
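  # Map each compile unit's DW_AT_stmt_list offset to its DW_AT_comp_dir so that
  # relative include directories in the line table can be resolved per unit.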
compile_unit_pattern = re.compile(r"0x[0-9a-f]*: DW_TAG_compile_unit")
stmt_list_pattern = re.compile(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)")
comp_dir_pattern = re.compile(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)")
map_stmt_list_to_comp_dir = {}
iterator = compile_unit_pattern.finditer(text)
current_match = next(iterator, None)
while current_match:
next_match = next(iterator, None)
start = current_match.end()
end = next_match.start() if next_match else len(text)
stmt_list_match = stmt_list_pattern.search(text, start, end)
if stmt_list_match is not None:
stmt_list = stmt_list_match.group(1)
comp_dir_match = comp_dir_pattern.search(text, start, end)
comp_dir = decode_octal_encoded_utf8(comp_dir_match.group(1)) if comp_dir_match is not None else ''
map_stmt_list_to_comp_dir[stmt_list] = comp_dir
current_match = next_match
return map_stmt_list_to_comp_dir
def demangle_names(names):
mangled_names = sorted({n for n in names if n.startswith('_Z')})
if not mangled_names:
return {}
if not os.path.exists(LLVM_CXXFILT):
    logger.warning('llvm-cxxfilt not found at %s; skipping name demangling' % LLVM_CXXFILT)
return {}
input_str = '\n'.join(mangled_names)
proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True)
if proc.returncode != 0:
logger.warning('llvm-cxxfilt failed: %s' % proc.stderr)
return {}
demangled_list = proc.stdout.splitlines()
if len(demangled_list) != len(mangled_names):
logger.warning('llvm-cxxfilt output length mismatch')
return {}
return dict(zip(mangled_names, demangled_list, strict=True))
class FuncRange:
def __init__(self, name, low_pc, high_pc):
self.name = name
self.low_pc = low_pc
self.high_pc = high_pc
def extract_func_ranges(text):
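  # Collect DW_TAG_subprogram and DW_TAG_inlined_subroutine entries from the
  # dwarfdump output as named [low_pc, high_pc) ranges; these feed the optional
  # "names" field of the source map.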
next_tag_pattern = re.compile(r'\n0x[0-9a-f]+:')
func_pattern = re.compile(r'DW_TAG_(?:subprogram|inlined_subroutine)')
low_pc_pattern = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)')
high_pc_pattern = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)')
abstract_origin_pattern = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
linkage_name_pattern = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)')
name_pattern = re.compile(r'DW_AT_name\s+\("([^"]+)"\)')
specification_pattern = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
def get_name_from_tag(start, end):
m = linkage_name_pattern.search(text, start, end)
if m:
return m.group(1)
m = name_pattern.search(text, start, end)
if m:
return m.group(1)
m = specification_pattern.search(text, start, end)
if m:
return m.group(1)
return None
func_ranges = []
for match in func_pattern.finditer(text):
search_start = match.end()
m_next = next_tag_pattern.search(text, search_start)
search_end = m_next.start() if m_next else len(text)
name = None
low_pc = None
high_pc = None
m = low_pc_pattern.search(text, search_start, search_end)
if m:
low_pc = int(m.group(1), 16)
m = high_pc_pattern.search(text, search_start, search_end)
if m:
high_pc = int(m.group(1), 16)
if 'DW_TAG_subprogram' in match.group(0):
name = get_name_from_tag(search_start, search_end)
else:
m = abstract_origin_pattern.search(text, search_start, search_end)
if m:
name = m.group(1)
if name and low_pc is not None and high_pc is not None:
func_ranges.append(FuncRange(name, low_pc, high_pc))
all_names = [item.name for item in func_ranges]
demangled_map = demangle_names(all_names)
for func_range in func_ranges:
if func_range.name in demangled_map:
func_range.name = demangled_map[func_range.name]
func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc))
return func_ranges
def read_dwarf_info(wasm, options):
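  # Run llvm-dwarfdump (or read a pre-captured dump) and parse its debug_line
  # output into address/file/line/column entries, plus function ranges when
  # scope information is requested.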
if options.dwarfdump_output:
output = utils.read_file(options.dwarfdump_output)
elif options.dwarfdump:
logger.debug('Reading DWARF information from %s' % wasm)
if not os.path.exists(options.dwarfdump):
utils.exit_with_error('llvm-dwarfdump not found: ' + options.dwarfdump)
dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm]
if generate_scopes:
dwarfdump_cmd += ['-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram',
'-t', 'DW_TAG_inlined_subroutine']
else:
dwarfdump_cmd += ['--recurse-depth=0']
proc = shared.check_call(dwarfdump_cmd, stdout=shared.PIPE)
output = proc.stdout
else:
utils.exit_with_error('Please specify either --dwarfdump or --dwarfdump-output')
debug_line_pattern = re.compile(r"debug_line\[(0x[0-9a-f]*)\]")
include_dir_pattern = re.compile(r"include_directories\[\s*(\d+)\] = \"([^\"]*)")
file_pattern = re.compile(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)")
line_pattern = re.compile(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?")
entries = []
iterator = debug_line_pattern.finditer(output)
current_match = None
try:
current_match = next(iterator)
debug_info_end = current_match.start()
except StopIteration:
debug_info_end = len(output)
debug_info = output[:debug_info_end]
map_stmt_list_to_comp_dir = extract_comp_dir_map(debug_info)
while current_match:
next_match = next(iterator, None)
stmt_list = current_match.group(1)
start = current_match.end()
end = next_match.start() if next_match else len(output)
comp_dir = map_stmt_list_to_comp_dir.get(stmt_list, '')
include_directories = {'0': comp_dir}
for dir in include_dir_pattern.finditer(output, start, end):
include_directories[dir.group(1)] = os.path.join(comp_dir, decode_octal_encoded_utf8(dir.group(2)))
files = {}
for file in file_pattern.finditer(output, start, end):
dir = include_directories[file.group(3)]
file_path = os.path.join(dir, decode_octal_encoded_utf8(file.group(2)))
files[file.group(1)] = file_path
for line in line_pattern.finditer(output, start, end):
entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
if not entry['eos']:
entries.append(entry)
else:
entry['address'] -= 1
if entries[-1]['address'] == entry['address']:
entries[-1]['eos'] = True
else:
entries.append(entry)
current_match = next_match
remove_dead_entries(entries)
entries = sorted(entries, key=lambda entry: entry['address'])
if generate_scopes:
func_ranges = extract_func_ranges(debug_info)
else:
func_ranges = []
return entries, func_ranges
def build_sourcemap(entries, func_ranges, code_section_offset, options):
base_path = options.basepath
collect_sources = options.sources
prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path)
for func_range in func_ranges:
func_range.low_pc += code_section_offset
func_range.high_pc += code_section_offset
sources = []
sources_content = []
names = sorted({item.name for item in func_ranges})
name_to_id = {name: i for i, name in enumerate(names)}
mappings = []
sources_map = {}
last_address = 0
last_source_id = 0
last_line = 1
last_column = 1
last_func_id = 0
active_funcs = []
next_func_range_id = 0
def get_function_id(address):
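    # entries are visited in address order and func_ranges is sorted by low_pc,
    # so a single forward sweep maintains the currently-open ranges; the most
    # recently opened live range (the innermost inline frame) wins.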
nonlocal active_funcs
nonlocal next_func_range_id
while next_func_range_id < len(func_ranges) and func_ranges[next_func_range_id].low_pc <= address:
active_funcs.append((func_ranges[next_func_range_id].high_pc, next_func_range_id))
next_func_range_id += 1
active_funcs = [f for f in active_funcs if f[0] > address]
if active_funcs:
func_range_id = active_funcs[-1][1]
name = func_ranges[func_range_id].name
return name_to_id[name]
return None
for entry in entries:
line = entry['line']
column = entry['column']
if line == 0:
continue
if column == 0:
column = 1
address = entry['address'] + code_section_offset
file_name = utils.normalize_path(entry['file'])
source_name = prefixes.sources.resolve(file_name)
if source_name not in sources_map:
source_id = len(sources)
sources_map[source_name] = source_id
sources.append(source_name)
if collect_sources:
load_name = prefixes.load.resolve(file_name)
try:
with open(load_name) as infile:
source_content = infile.read()
sources_content.append(source_content)
except OSError:
          logger.warning('Failed to read source: %s' % load_name)
sources_content.append(None)
else:
source_id = sources_map[source_name]
func_id = get_function_id(address)
address_delta = address - last_address
source_id_delta = source_id - last_source_id
line_delta = line - last_line
column_delta = column - last_column
last_address = address
last_source_id = source_id
last_line = line
last_column = column
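    # A source-map v3 segment is [generated column, source index, source line,
    # source column] plus an optional name index, each base64-VLQ encoded as a
    # delta from the previous segment; here the generated column is the wasm byte offset.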
mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)
if func_id is not None:
func_id_delta = func_id - last_func_id
last_func_id = func_id
mapping += encode_vlq(func_id_delta)
mappings.append(mapping)
return {'version': 3,
'sources': sources,
'sourcesContent': sources_content,
'names': names,
'mappings': ','.join(mappings)}
def main(args):
options = parse_args(args)
wasm_input = options.wasm
with open(wasm_input, 'rb') as infile:
wasm = infile.read()
entries, func_ranges = read_dwarf_info(wasm_input, options)
code_section_offset = get_code_section_offset(wasm)
logger.debug('Saving to %s' % options.output)
  source_map = build_sourcemap(entries, func_ranges, code_section_offset, options)
  with open(options.output, 'w', encoding='utf-8') as outfile:
    json.dump(source_map, outfile, separators=(',', ':'), ensure_ascii=False)
if options.strip:
wasm = strip_debug_sections(wasm)
if options.source_map_url:
wasm = append_source_mapping(wasm, options.source_map_url)
if options.w:
logger.debug('Saving wasm to %s' % options.w)
with open(options.w, 'wb') as outfile:
outfile.write(wasm)
logger.debug('Done')
return 0
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG if os.environ.get('EMCC_DEBUG') else logging.INFO)
sys.exit(main(sys.argv[1:]))