Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/python-wasm
Path: blob/main/python/pylang/src/tokenizer.py
1396 views
1
from __python__ import hash_literals # type: ignore
2
3
# mypy
4
from __python__ import RegExp, String, undefined, parseFloat
5
from typing import Any, Callable, Optional, Union, List, Literal
6
7
from unicode_aliases import ALIAS_MAP # type: ignore
8
from utils import make_predicate, characters, charAt, startswith
9
from ast_types import AST_Token
10
from errors import EOFError, SyntaxError
11
from string_interpolation import interpolate # type: ignore
12
13
# Number-literal recognizers (JS RegExp objects; pylang dialect).
RE_HEX_NUMBER = RegExp(r"^0x[0-9a-f]+$", "i")
RE_OCT_NUMBER = RegExp(r"^0[0-7]+$")
RE_DEC_NUMBER = RegExp(r"^\d*\.?\d*(?:e[+-]?\d*(?:\d\.?|\.?\d)\d*)?$", "i")

# Characters that may begin (or extend) an operator token.
OPERATOR_CHARS = make_predicate(characters("+-*&%=<>!?|~^@"))

# Escape letter -> ASCII control code: \a \b \f \n \r \t \v
ASCII_CONTROL_CHARS = {
    'a': 7,
    'b': 8,
    'f': 12,
    'n': 10,
    'r': 13,
    't': 9,
    'v': 11
}
HEX_PAT = RegExp(r"[a-fA-F0-9]")
NAME_PAT = RegExp(r"[a-zA-Z ]")  # characters allowed inside \N{...} names

# Every multi/single-character operator the language knows, as a predicate.
OPERATORS = make_predicate([
    "in", "instanceof", "typeof", "new", "void", "del", "+", "-", "not", "~",
    "&", "|", "^^", "^", "**", "*", "//", "/", "%", ">>", "<<", ">>>", "<",
    ">", "<=", ">=", "==", "is", "!=", "=", "+=", "-=", "//=", "/=", "*=",
    "%=", ">>=", "<<=", ">>>=", "|=", "^=", "&=", "and", "or", "@", "->"
])

# Python-spelled operators that map directly onto a JavaScript equivalent.
OP_MAP = {
    'or': "||",
    'and': "&&",
    'not': "!",
    'del': "delete",
    'None': "null",
    'is': "===",
}

# Everything treated as whitespace, including the Unicode space characters.
WHITESPACE_CHARS = make_predicate(
    characters(
        " \u00a0\n\r\t\f\u000b\u200b\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u202f\u205f\u3000"
    ))

# Punctuation after which a regexp literal (rather than division) may follow.
PUNC_BEFORE_EXPRESSION = make_predicate(characters("[{(,.;:"))

# Single-character punctuation tokens.
PUNC_CHARS = make_predicate(characters("[]{}(),;:?"))

keywords = "as assert break class continue def del do elif else except finally for from global if import in is lambda new nonlocal pass raise return yield try while with or and not"

keywords_atom = "False None True"

# see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar
reserved_words = (
    "break case class catch const continue debugger default delete do else export extends"
    " finally for function if import in instanceof new return super switch this throw try typeof var void"
    " while with yield enum implements static private package let public protected interface await null true false"
)

keyword_before_expression = "return yield new del raise elif else if"

ALL_KEYWORDS = keywords + " " + keywords_atom

KEYWORDS = make_predicate(keywords)
RESERVED_WORDS = make_predicate(reserved_words)
KEYWORDS_BEFORE_EXPRESSION = make_predicate(keyword_before_expression)
KEYWORDS_ATOM = make_predicate(keywords_atom)
IDENTIFIER_PAT = RegExp(r"^[a-z_$][_a-z0-9$]*$", "i")
76
77
78
# https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
79
def is_string_modifier(val: str) -> bool:
    """True when every character of *val* is a valid string-prefix letter
    (r/u/f/v in either case).  The empty prefix is accepted."""
    allowed = 'vrufVRUF'
    for c in val:
        if c not in allowed:
            return False
    return True
84
85
86
def is_letter(code: int) -> bool:
    """True when *code* is a letter: ASCII a-z / A-Z, or (for code points
    >= 170) anything matched by the Unicode letter character class."""
    if code >= 97 and code <= 122:  # a-z
        return True
    if code >= 65 and code <= 90:   # A-Z
        return True
    return code >= 170 and UNICODE['letter'].test(chr(code))
89
90
91
def is_digit(code: int) -> bool:
    """True when *code* is an ASCII digit '0'-'9'."""
    return 48 <= code and code <= 57  # ord('0')..ord('9')
93
94
95
def is_dot(code: int) -> bool:
    """True when *code* is the character code of '.'."""
    return code == 46  # ord('.')
97
98
99
def is_alphanumeric_char(code: int) -> bool:
    """True when *code* is an ASCII digit or a letter (incl. Unicode)."""
    if is_digit(code):
        return True
    return is_letter(code)
101
102
103
def is_unicode_combining_mark(ch: str) -> bool:
    """True when *ch* is a combining mark (non-spacing or spacing)."""
    return (UNICODE['non_spacing_mark'].test(ch)
            or UNICODE['space_combining_mark'].test(ch))
106
107
108
def is_unicode_connector_punctuation(ch: str) -> bool:
    """True when *ch* is connector punctuation (e.g. the underscore)."""
    return UNICODE['connector_punctuation'].test(ch)
110
111
112
def is_identifier(name: str) -> bool:
    """True when *name* may be used as an identifier: it matches the
    identifier pattern and is not a keyword, atom, or JS reserved word."""
    if RESERVED_WORDS[name]:
        return False
    if KEYWORDS[name] or KEYWORDS_ATOM[name]:
        return False
    return IDENTIFIER_PAT.test(name)
115
116
117
def is_identifier_start(code: int) -> bool:
    """True when *code* may begin an identifier: '$' (36), '_' (95),
    or any letter."""
    if code is 36 or code is 95:
        return True
    return is_letter(code)
119
120
121
def is_identifier_char(ch: str) -> bool:
    """True when *ch* may appear inside an identifier (after the first
    character)."""
    code = ord(ch)
    if is_identifier_start(code) or is_digit(code):
        return True
    # ZWNJ (8204) and ZWJ (8205) are permitted, as in JS identifiers.
    if code is 8204 or code is 8205:
        return True
    return (is_unicode_combining_mark(ch)
            or is_unicode_connector_punctuation(ch))
126
127
128
def parse_js_number(num: str) -> Union[float, int]:
    """Parse a hex (0x...), legacy octal (0...), or decimal literal.

    Raises ValueError when *num* matches none of the recognized forms.
    """
    if RE_HEX_NUMBER.test(num):
        return int(num[2:], 16)
    if RE_OCT_NUMBER.test(num):
        return int(num[1:], 8)
    if RE_DEC_NUMBER.test(num):
        return float(num)
    raise ValueError("invalid number")
136
137
138
# regexps adapted from http://xregexp.com/plugins/#unicode
# Character-class regexps used to classify non-ASCII characters when
# deciding what may appear in an identifier.
UNICODE = {
    # Unicode letters (category L*).
    'letter':
    RegExp(
        "[\\u0041-\\u005A\\u0061-\\u007A\\u00AA\\u00B5\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02C1\\u02C6-\\u02D1\\u02E0-\\u02E4\\u02EC\\u02EE\\u0370-\\u0374\\u0376\\u0377\\u037A-\\u037D\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03F5\\u03F7-\\u0481\\u048A-\\u0523\\u0531-\\u0556\\u0559\\u0561-\\u0587\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0621-\\u064A\\u066E\\u066F\\u0671-\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE\\u06EF\\u06FA-\\u06FC\\u06FF\\u0710\\u0712-\\u072F\\u074D-\\u07A5\\u07B1\\u07CA-\\u07EA\\u07F4\\u07F5\\u07FA\\u0904-\\u0939\\u093D\\u0950\\u0958-\\u0961\\u0971\\u0972\\u097B-\\u097F\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BD\\u09CE\\u09DC\\u09DD\\u09DF-\\u09E1\\u09F0\\u09F1\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A59-\\u0A5C\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABD\\u0AD0\\u0AE0\\u0AE1\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3D\\u0B5C\\u0B5D\\u0B5F-\\u0B61\\u0B71\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BD0\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D\\u0C58\\u0C59\\u0C60\\u0C61\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBD\\u0CDE\\u0CE0\\u0CE1\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D28\\u0D2A-\\u0D39\\u0D3D\\u0D60\\u0D61\\u0D7A-\\u0D7F\\u0D85-\\u0D96\\u0D9A-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E30\\u0E32\\u0E33\\u0E40-\\u0E46\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB0\\u0EB2\\u0EB3\\u0EBD\\u0EC0-\\u0EC4\\u0EC6\\u0EDC\\u0EDD\\u0F00\\u0F40-\\u0F47\\u0F49-\\u0F6C\\u0F88-\\u0F8B\\u1000-\\u102A\\u103F\\u1050-\\u1055\\u105A-\\u105D\\u1061\\u1065\\u1066\\u106E-\\u1070\\u1075-\\u1081\\u108E\\u10A0-\\u10C5\\u10D0-\\u10FA\\u10FC\\u1100-\\u1159\\u115F-\\u11A2\\u11A8-\\u11F9\\u1200-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u1380-\\u138F\\u13A0-\\u13F4\\u1401-\\u166C\\u166F-\\u1676\\u1681-\\u169A\\u16A0-\\u16EA\\u1700-\\u170C\\u170E-\\u1711\\u1720-\\u1731\\u1740-\\u1751\\u1760-\\u176C\\u176E-\\u1770\\u1780-\\u17B3\\u17D7\\u17DC\\u1820-\\u1877\\u1880-\\u18A8\\u18AA\\u1900-\\u191C\\u1950-\\u196D\\u1970-\\u1974\\u1980-\\u19A9\\u19C1-\\u19C7\\u1A00-\\u1A16\\u1B05-\\u1B33\\u1B45-\\u1B4B\\u1B83-\\u1BA0\\u1BAE\\u1BAF\\u1C00-\\u1C23\\u1C4D-\\u1C4F\\u1C5A-\\u1C7D\\u1D00-\\u1DBF\\u1E00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2071\\u207F\\u2090-\\u2094\\u2102\\u2107\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2139\\u213C-\\u213F\\u2145-\\u2149\\u214E\\u2183\\u2184\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u2C60-\\u2C6F\\u2C71-\\u2C7D\\u2C80-\\u2CE4\\u2D00-\\u2D25\\u2D30-\\u2D65\\u2D6F\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u2E2F\\u3005\\u3006\\u3031-\\u3035\\u303B\\u303C\\u3041-\\u3096\\u309D-\\u309F\\u30A1-\\u30FA\\u30FC-\\u30FF\\u3105-\\u312D\\u3131-\\u318E\\u31A0-\\u31B7\\u31F0-\\u31FF\\u3400\\u4DB5\\u4E00\\u9FC3\\uA000-\\uA48C\\uA500-\\uA60C\\uA610-\\uA61F\\uA62A\\uA62B\\uA640-\\uA65F\\uA662-\\uA66E\\uA67F-\\uA697\\uA717-\\uA71F\\uA722-\\uA788\\uA78B\\uA78C\\uA7FB-\\uA801\\uA803-\\uA805\\uA807-\\uA80A\\uA80C-\\uA822\\uA840-\\uA873\\uA882-\\uA8B3\\uA90A-\\uA925\\uA930-\\uA946\\uAA00-\\uAA28\\uAA40-\\uAA42\\uAA44-\\uAA4B\\uAC00\\uD7A3\\uF900-\\uFA2D\\uFA30-\\uFA6A\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D\\uFB1F-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74\\uFE76-\\uFEFC\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFF66-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC]"
    ),
    # Non-spacing combining marks (category Mn).
    'non_spacing_mark':
    RegExp(
        "[\\u0300-\\u036F\\u0483-\\u0487\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u064B-\\u065E\\u0670\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07A6-\\u07B0\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0900-\\u0902\\u093C\\u0941-\\u0948\\u094D\\u0951-\\u0955\\u0962\\u0963\\u0981\\u09BC\\u09C1-\\u09C4\\u09CD\\u09E2\\u09E3\\u0A01\\u0A02\\u0A3C\\u0A41\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A51\\u0A70\\u0A71\\u0A75\\u0A81\\u0A82\\u0ABC\\u0AC1-\\u0AC5\\u0AC7\\u0AC8\\u0ACD\\u0AE2\\u0AE3\\u0B01\\u0B3C\\u0B3F\\u0B41-\\u0B44\\u0B4D\\u0B56\\u0B62\\u0B63\\u0B82\\u0BC0\\u0BCD\\u0C3E-\\u0C40\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C62\\u0C63\\u0CBC\\u0CBF\\u0CC6\\u0CCC\\u0CCD\\u0CE2\\u0CE3\\u0D41-\\u0D44\\u0D4D\\u0D62\\u0D63\\u0DCA\\u0DD2-\\u0DD4\\u0DD6\\u0E31\\u0E34-\\u0E3A\\u0E47-\\u0E4E\\u0EB1\\u0EB4-\\u0EB9\\u0EBB\\u0EBC\\u0EC8-\\u0ECD\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F71-\\u0F7E\\u0F80-\\u0F84\\u0F86\\u0F87\\u0F90-\\u0F97\\u0F99-\\u0FBC\\u0FC6\\u102D-\\u1030\\u1032-\\u1037\\u1039\\u103A\\u103D\\u103E\\u1058\\u1059\\u105E-\\u1060\\u1071-\\u1074\\u1082\\u1085\\u1086\\u108D\\u109D\\u135F\\u1712-\\u1714\\u1732-\\u1734\\u1752\\u1753\\u1772\\u1773\\u17B7-\\u17BD\\u17C6\\u17C9-\\u17D3\\u17DD\\u180B-\\u180D\\u18A9\\u1920-\\u1922\\u1927\\u1928\\u1932\\u1939-\\u193B\\u1A17\\u1A18\\u1A56\\u1A58-\\u1A5E\\u1A60\\u1A62\\u1A65-\\u1A6C\\u1A73-\\u1A7C\\u1A7F\\u1B00-\\u1B03\\u1B34\\u1B36-\\u1B3A\\u1B3C\\u1B42\\u1B6B-\\u1B73\\u1B80\\u1B81\\u1BA2-\\u1BA5\\u1BA8\\u1BA9\\u1C2C-\\u1C33\\u1C36\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8\\u1CED\\u1DC0-\\u1DE6\\u1DFD-\\u1DFF\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2CEF-\\u2CF1\\u2DE0-\\u2DFF\\u302A-\\u302F\\u3099\\u309A\\uA66F\\uA67C\\uA67D\\uA6F0\\uA6F1\\uA802\\uA806\\uA80B\\uA825\\uA826\\uA8C4\\uA8E0-\\uA8F1\\uA926-\\uA92D\\uA947-\\uA951\\uA980-\\uA982\\uA9B3\\uA9B6-\\uA9B9\\uA9BC\\uAA29-\\uAA2E\\uAA31\\uAA32\\uAA35\\uAA36\\uAA43\\uAA4C\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF\\uAAC1\\uABE5\\uABE8\\uABED\\uFB1E\\uFE00-\\uFE0F\\uFE20-\\uFE26]"
    ),
    # Spacing combining marks (category Mc).
    'space_combining_mark':
    RegExp(
        "[\\u0903\\u093E-\\u0940\\u0949-\\u094C\\u094E\\u0982\\u0983\\u09BE-\\u09C0\\u09C7\\u09C8\\u09CB\\u09CC\\u09D7\\u0A03\\u0A3E-\\u0A40\\u0A83\\u0ABE-\\u0AC0\\u0AC9\\u0ACB\\u0ACC\\u0B02\\u0B03\\u0B3E\\u0B40\\u0B47\\u0B48\\u0B4B\\u0B4C\\u0B57\\u0BBE\\u0BBF\\u0BC1\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCC\\u0BD7\\u0C01-\\u0C03\\u0C41-\\u0C44\\u0C82\\u0C83\\u0CBE\\u0CC0-\\u0CC4\\u0CC7\\u0CC8\\u0CCA\\u0CCB\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D3E-\\u0D40\\u0D46-\\u0D48\\u0D4A-\\u0D4C\\u0D57\\u0D82\\u0D83\\u0DCF-\\u0DD1\\u0DD8-\\u0DDF\\u0DF2\\u0DF3\\u0F3E\\u0F3F\\u0F7F\\u102B\\u102C\\u1031\\u1038\\u103B\\u103C\\u1056\\u1057\\u1062-\\u1064\\u1067-\\u106D\\u1083\\u1084\\u1087-\\u108C\\u108F\\u109A-\\u109C\\u17B6\\u17BE-\\u17C5\\u17C7\\u17C8\\u1923-\\u1926\\u1929-\\u192B\\u1930\\u1931\\u1933-\\u1938\\u19B0-\\u19C0\\u19C8\\u19C9\\u1A19-\\u1A1B\\u1A55\\u1A57\\u1A61\\u1A63\\u1A64\\u1A6D-\\u1A72\\u1B04\\u1B35\\u1B3B\\u1B3D-\\u1B41\\u1B43\\u1B44\\u1B82\\u1BA1\\u1BA6\\u1BA7\\u1BAA\\u1C24-\\u1C2B\\u1C34\\u1C35\\u1CE1\\u1CF2\\uA823\\uA824\\uA827\\uA880\\uA881\\uA8B4-\\uA8C3\\uA952\\uA953\\uA983\\uA9B4\\uA9B5\\uA9BA\\uA9BB\\uA9BD-\\uA9C0\\uAA2F\\uAA30\\uAA33\\uAA34\\uAA4D\\uAA7B\\uABE3\\uABE4\\uABE6\\uABE7\\uABE9\\uABEA\\uABEC]"
    ),
    # Connector punctuation (category Pc), e.g. the underscore.
    'connector_punctuation':
    RegExp(
        "[\\u005F\\u203F\\u2040\\u2054\\uFE33\\uFE34\\uFE4D-\\uFE4F\\uFF3F]")
}
156
157
158
def is_token(token, type, val) -> bool:
    """True when *token* has the given *type* and — unless *val* is
    None/undefined — also the given value."""
    if token.type is not type:
        return False
    return val is None or val is undefined or token.value is val
161
162
163
def tokenizer(raw_text: str, filename: str) -> Callable[[], Any]:
    """Return a next_token() closure that yields AST_Token objects for
    *raw_text*.

    The returned function carries a ``context`` attribute that the parser
    uses to save and restore the complete tokenizer state.

    Fixes relative to the previous revision:
    - with_eof_error: the inner wrapper was itself named ``eof_error``,
      shadowing the message parameter, so EOF parse errors reported the
      wrapper function instead of the message string.
    - token(): ``ret['comments_before.length']`` looked up a literal
      (always-undefined) key, so newline flags were never propagated from
      preceding comments.
    - read_line_comment/read_operator/read_num/context gained explicit
      defaults for parameters that were already being omitted by callers.
    """
    # Mutable tokenizer state, shared by all closures below.
    S = {
        'exponent':  # parse ^ as exponent and ^^ as xor
        False,
        # Normalize all line endings to "\n" and strip BOM characters.
        'text':
        raw_text.replace(RegExp(r"\r\n?|[\n\u2028\u2029]", "g"),
                         "\n").replace(RegExp(r"\uFEFF", "g"), ""),
        'filename':
        filename,
        'pos':  # current read position in text
        0,
        'tokpos':  # position where the current token started
        0,
        'line':  # current 1-based line number
        1,
        'tokline':
        0,
        'col':
        0,
        'tokcol':
        0,
        'newline_before':
        False,
        'regex_allowed':
        False,
        'comments_before':
        [],
        'whitespace_before':  # stack of leading whitespace, one per block
        [],
        'newblock':
        False,
        'endblock':
        False,
        'indentation_matters':  # JS array literal (stack of booleans)
        r'%js [ true ]',
        'cached_whitespace':
        "",
        'prev':
        undefined,
        'index_or_slice':  # JS array literal (stack of booleans)
        r'%js [ false ]',
        'expecting_object_literal_key':
        False,  # This is set by the parser when it is expecting an object literal key
    }

    def peek() -> str:
        # Current character without consuming it.
        return charAt(S['text'], S['pos'])

    def peekpeek():
        # One character of lookahead beyond peek().
        return charAt(S['text'], S['pos'] + 1)

    def prevChar():
        # Character immediately before the current token.
        return charAt(S['text'], S['tokpos'] - 1)

    def next(signal_eof=False, in_string=False):
        # Consume and return one character, maintaining line/col counters.
        ch = charAt(S['text'], S['pos'])
        S['pos'] += 1
        if signal_eof and not ch:
            raise EOFError

        if ch is "\n":
            S['newline_before'] = S['newline_before'] or not in_string
            S['line'] += 1
            S['col'] = 0
        else:
            S['col'] += 1
        return ch

    def find(what: str, signal_eof: bool = False) -> int:
        # Index of the next occurrence of *what* from the current position,
        # or -1 (or EOFError when signal_eof is set).
        pos = S['text'].indexOf(what, S['pos'])
        if signal_eof and pos is -1:
            raise EOFError
        return pos

    def start_token() -> None:
        # Record where the token about to be read begins.
        S['tokline'] = S['line']
        S['tokcol'] = S['col']
        S['tokpos'] = S['pos']

    def token(type: str,
              value: Any,
              is_comment: bool = False,
              keep_newline: bool = False) -> AST_Token:
        # Build an AST_Token for the text consumed since start_token(),
        # updating block/indentation bookkeeping as a side effect.
        if S['exponent'] and type == 'operator':
            if value == '^':
                value = '**'
            elif value == '^^':
                value = '^'
        S['regex_allowed'] = (
            type is "operator"
            or type is "keyword" and KEYWORDS_BEFORE_EXPRESSION[value]
            or type is "punc" and PUNC_BEFORE_EXPRESSION[value])

        # Fuse "is" followed by "not" into a single !== operator token.
        if type is "operator" and value is "is" and S['text'].substr(
                S['pos']).trimLeft().substr(0, 4).trimRight() is "not":
            next_token()
            value = "!=="

        if type is "operator" and OP_MAP[value]:
            value = OP_MAP[value]

        ret = {
            'type': type,
            'value': value,
            'line': S['tokline'],
            'col': S['tokcol'],
            'pos': S['tokpos'],
            'endpos': S['pos'],
            'nlb': S['newline_before'],
            'file': filename,
            'leading_whitespace': S['whitespace_before'][-1] or '',
        }
        if not is_comment:
            ret['comments_before'] = S['comments_before']
            S['comments_before'] = []
            # make note of any newlines in the comments that came before
            # (fixed: previously indexed the literal key
            # 'comments_before.length', which is always undefined).
            for i in range(ret['comments_before'].length):
                ret['nlb'] = ret['nlb'] or ret['comments_before'][i]['nlb']

        if not keep_newline:
            S['newline_before'] = False

        if type is "punc":
            # A trailing ":" that is not part of a slice/dict key opens a
            # new indentation block.
            if (value is ":" and not S['index_or_slice'][-1]
                    and not S['expecting_object_literal_key'] and
                    (not S['text'].substring(S['pos'] + 1, find("\n")).trim()
                     or not S['text'].substring(S['pos'] + 1,
                                                find("#")).trim())):
                S['newblock'] = True
                S['indentation_matters'].push(True)

            if value is "[":
                # "[" right after a name or ")"/"]" is indexing/slicing;
                # otherwise it starts a list literal.
                if S['prev'] and (S['prev'].type is "name" or
                                  (S['prev'].type is 'punc'
                                   and S['prev'].value in ')]')):
                    S['index_or_slice'].push(True)
                else:
                    S['index_or_slice'].push(False)
                S['indentation_matters'].push(False)
            elif value is "{" or value is "(":
                S['indentation_matters'].push(False)
            elif value is "]":
                S['index_or_slice'].pop()
                S['indentation_matters'].pop()
            elif value is "}" or value is ")":
                S['indentation_matters'].pop()
        S['prev'] = AST_Token(ret)
        return S['prev']

    # this will transform leading whitespace to block tokens unless
    # part of array/hash, and skip non-leading whitespace
    def parse_whitespace() -> Union[Literal[-1], Literal[1], Literal[0]]:
        leading_whitespace = ""
        whitespace_exists = False
        while WHITESPACE_CHARS[peek()]:
            whitespace_exists = True
            ch = next()
            if ch is "\n":
                leading_whitespace = ""
            else:
                leading_whitespace += ch
        if peek() is not "#":
            if not whitespace_exists:
                leading_whitespace = S['cached_whitespace']
            else:
                S['cached_whitespace'] = leading_whitespace
        if S['newline_before'] or S['endblock']:
            return test_indent_token(leading_whitespace)
        return 0

    def test_indent_token(
            leading_whitespace: str
    ) -> Union[Literal[-1], Literal[1], Literal[0]]:
        # Compare the new leading whitespace against the enclosing block's
        # and report indent (+1), dedent (-1) or no change (0).
        most_recent = S['whitespace_before'][-1] or ""
        S['endblock'] = False
        if S['indentation_matters'][
                -1] and leading_whitespace is not most_recent:
            if S['newblock'] and leading_whitespace and startswith(
                    leading_whitespace, most_recent):
                # positive indent, new block
                S['newblock'] = False
                S['whitespace_before'].push(leading_whitespace)
                return 1
            elif most_recent and startswith(most_recent, leading_whitespace):
                # negative indent, block is ending
                S['endblock'] = True
                S['whitespace_before'].pop()
                return -1
            else:
                # indent mismatch, inconsistent indentation
                parse_error("Inconsistent indentation")
        return 0

    def read_while(pred: Callable) -> str:
        # Accumulate characters for as long as pred(ch, index) holds.
        ret = ""
        i = 0
        ch = peek()
        while ch and pred(ch, i):
            i += 1
            ret += next()
            ch = peek()
        return ret

    def parse_error(err: str, is_eof: bool = False) -> SyntaxError:
        # Raise a SyntaxError pinned to the current token's position.
        raise SyntaxError(err, filename, S['tokline'], S['tokcol'],
                          S['tokpos'], is_eof)

    def read_num(prefix: Optional[str] = None) -> Optional[AST_Token]:
        # Read a numeric literal; *prefix* is "." when called from
        # handle_dot() (a leading-dot float like .5).
        has_e = False
        has_x = False
        has_dot = prefix is "."

        # Read a binary number
        if not prefix and peek() is '0' and charAt(S['text'],
                                                   S['pos'] + 1) is 'b':
            next(), next()

            def is01(ch):
                return ch is '0' or ch is '1'

            num = read_while(is01)
            try:
                valid = int(num, 2)  # type: Union[float, int]
            except:
                parse_error('Invalid syntax for a binary number')
            return token('num', valid)
        seen = []  # type: List[str]

        def is_num(ch, i):
            # Per-character validity check for decimal/hex/float literals;
            # tracks whether an exponent, hex prefix or dot has been seen.
            nonlocal has_dot, has_e, has_x
            seen.push(ch)
            if ch is 'x' or ch is 'X':
                if has_x or seen.length is not 2 or seen[0] is not '0':
                    return False
                has_x = True
                return True
            elif ch is 'e' or ch is 'E':
                if has_x:
                    return True
                if has_e or i == 0:
                    return False
                has_e = True
                return True
            elif ch is '-':
                if i is 0 and not prefix:
                    return True
                if has_e and seen[i - 1].toLowerCase() is 'e':
                    return True
                return False
            elif ch is '+':
                if has_e and seen[i - 1].toLowerCase() is 'e':
                    return True
                return False
            elif ch is '.':
                # If next ch after this is also a ., then its
                # something like [389..5077], so we stop
                if peekpeek() is '.':
                    return False
                if not has_dot and not has_x and not has_e:
                    has_dot = True
                    return True
                return False
            return is_alphanumeric_char(ch.charCodeAt(0))

        num = read_while(is_num)
        if prefix:
            num = prefix + num

        try:
            valid = parse_js_number(num)
        except:
            parse_error("SyntaxError: invalid syntax in numeric literal -- " +
                        num)
            return undefined
        return token("num", valid)

    # This returns str or int, since it could be a
    # hex number or a hex character code.
    def read_hex_digits(count: int) -> Union[str, int]:
        # Consume up to *count* hex digits; returns the numeric value, or
        # the partial string when there are too few digits or the code
        # point is out of Unicode range.
        ans = ''
        while count > 0:
            count -= 1
            if not HEX_PAT.test(peek()):
                return ans
            ans += next()
        nval = int(ans, 16)
        if nval > 0x10FFFF:
            return ans
        return nval

    def read_escape_sequence():
        # Interpret the character(s) following a backslash inside a
        # non-raw string literal.
        q = next(True)
        if q is '\n':
            return ''  # backslash-newline is a line continuation
        if q is '\\':
            return q
        if '"\''.indexOf(q) is not -1:
            return q
        if ASCII_CONTROL_CHARS[q]:
            return String.fromCharCode(ASCII_CONTROL_CHARS[q])
        if '0' <= q <= '7':
            # Up to three octal digits.
            octal = q
            if '0' <= peek() <= '7':
                octal += next()
            if '0' <= peek() <= '7':
                octal += next()
            code = parseInt(octal, 8)
            if isNaN(code):
                return '\\' + octal
            return String.fromCharCode(code)
        if q is 'x':
            code = read_hex_digits(2)
            if jstype(code) is 'number':
                return String.fromCharCode(code)
            return '\\x' + code
        if q is 'u':
            code = read_hex_digits(4)
            if jstype(code) is 'number':
                return String.fromCharCode(code)
            return '\\u' + code
        if q is 'U':
            code = read_hex_digits(8)
            if jstype(code) is 'number':
                if code <= 0xFFFF:
                    return String.fromCharCode(code)
                # Encode astral code points as a UTF-16 surrogate pair.
                code -= 0x10000
                return String.fromCharCode(0xD800 + (code >> 10),
                                           0xDC00 + (code & 0x3FF))
            return '\\U' + code
        if q is 'N' and peek() is '{':
            # \N{NAME}: resolve the character alias via ALIAS_MAP.
            next()

            def is_name_ch(ch):
                return NAME_PAT.test(ch)

            name = read_while(is_name_ch)
            if peek() is not '}':
                return '\\N{' + name
            next()
            key = (name or '').toLowerCase()
            if not name or not Object.prototype.hasOwnProperty.call(
                    ALIAS_MAP, key):
                return '\\N{' + name + '}'
            code = ALIAS_MAP[key]
            if code <= 0xFFFF:
                return String.fromCharCode(code)
            code -= 0x10000
            return String.fromCharCode(0xD800 + (code >> 10),
                                       0xDC00 + (code & 0x3FF))
        return '\\' + q

    def with_eof_error(eof_error, cont):
        # Wrap *cont* so that an EOFError raised while reading surfaces as
        # a parse error carrying the message *eof_error*.
        # (fixed: the wrapper used to be named `eof_error` itself, which
        # shadowed the message parameter and passed the function object to
        # parse_error instead of the message.)
        def wrapped():
            try:
                return cont.apply(None, arguments)
            except Exception as ex:
                if ex is EOFError:
                    parse_error(eof_error, True)
                else:
                    raise

        return wrapped

    def _read_string(is_raw_literal, is_js_literal):
        # Read a (possibly triple-quoted) string literal; the opening quote
        # is the next character in the input.
        quote = next()
        tok_type = 'js' if is_js_literal else 'string'
        ret = ""
        is_multiline = False
        if peek() is quote:
            # two quotes in a row
            next(True)
            if peek() is quote:
                # multiline string (3 quotes in a row)
                next(True)
                is_multiline = True
            else:
                return token(tok_type, '')

        while True:
            ch = next(True, True)
            if not ch:
                break
            if ch is "\n" and not is_multiline:
                parse_error("End of line while scanning string literal")

            if ch is "\\":
                ret += (
                    '\\' +
                    next(True)) if is_raw_literal else read_escape_sequence()
                continue

            if ch is quote:
                if not is_multiline:
                    break
                if peek() is quote:
                    next()
                    if peek() is quote:
                        next()
                        break
                    else:
                        ch += quote
            ret += ch
        # r'%js ...' raw literals are embedded javascript.
        if is_raw_literal and ret[:3] == '%js' and WHITESPACE_CHARS[ret[3]]:
            return token(
                'js', ret[4:].trim())  # trim since really javascript string.
        return token(tok_type, ret)

    read_string = with_eof_error("Unterminated string constant", _read_string)

    def handle_interpolated_string(string, start_tok):
        # Splice the expanded interpolation of *string* back into the input
        # at the current position, wrapped in parentheses, so the parser
        # sees it as an ordinary parenthesized expression.
        def raise_error(err):
            raise SyntaxError(err, filename, start_tok.line, start_tok.col,
                              start_tok.pos, False)

        S['text'] = S['text'][:S['pos']] + '(' + interpolate(
            string, raise_error) + ')' + S['text'][S['pos']:]
        return token('punc', next())

    def read_line_comment(shebang=False):
        # Consume to end of line; *shebang* marks a leading "#!" line.
        if not shebang:
            next()
        i = find("\n")

        if i is -1:
            ret = S['text'].substr(S['pos'])
            S['pos'] = S['text'].length
        else:
            ret = S['text'].substring(S['pos'], i)
            S['pos'] = i

        return token("shebang" if shebang else "comment1", ret, True)

    def read_name():
        # Read an identifier, honoring backslash-newline continuations.
        name = ch = ""
        while True:
            ch = peek()
            if ch is None:
                break
            if ch is "\\":
                if charAt(S['text'], S['pos'] + 1) is "\n":
                    S['pos'] += 2
                    continue
                break
            elif is_identifier_char(ch):
                name += next()
            else:
                break
        return name

    def do_read_regexp():
        # Read a /regexp/ literal.  "//" opens a verbose regexp (ended by
        # "///") in which whitespace is ignored and "#" starts a comment.
        prev_backslash = False
        regexp = ch = ''
        in_class = False
        verbose_regexp = False
        in_comment = False

        if peek() is '/':
            next(True)
            if peek() is '/':
                verbose_regexp = True
                next(True)
            else:  # empty regexp (//)
                mods = read_name()
                return token("regexp", RegExp(regexp, mods))
        while True:
            ch = next(True)
            if not ch:
                break
            if in_comment:
                if ch is '\n':
                    in_comment = False
                continue
            if prev_backslash:
                regexp += "\\" + ch
                prev_backslash = False
            elif ch is "[":
                in_class = True
                regexp += ch
            elif ch is "]" and in_class:
                in_class = False
                regexp += ch
            elif ch is "/" and not in_class:
                if verbose_regexp:
                    # a verbose regexp only ends at the third "/" in a row
                    if peek() is not '/':
                        regexp += '\\/'
                        continue
                    next(True)
                    if peek() is not '/':
                        regexp += '\\/\\/'
                        continue
                    next(True)
                break
            elif ch is "\\":
                prev_backslash = True
            elif verbose_regexp and not in_class and ' \n\r\t'.indexOf(
                    ch) is not -1:
                pass
            elif verbose_regexp and not in_class and ch is '#':
                in_comment = True
            else:
                regexp += ch

        mods = read_name()
        return token("regexp", RegExp(regexp, mods))

    read_regexp = with_eof_error("Unterminated regular expression",
                                 do_read_regexp)

    def read_operator(prefix=None):
        # Greedily extend an operator while the longer spelling is still a
        # valid operator (e.g. ">" -> ">>" -> ">>=").
        def grow(op):
            if not peek():
                return op

            bigger = op + peek()
            if OPERATORS[bigger]:
                next()
                return grow(bigger)
            else:
                return op

        op = grow(prefix or next())
        if op is '->':
            # pretend that this is an operator as the tokenizer only allows
            # one character punctuation.
            return token('punc', op)
        return token("operator", op)

    def handle_slash():
        # "/" starts a regexp literal or a division operator, depending on
        # what the previous token allows.
        next()
        return read_regexp("") if S['regex_allowed'] else read_operator("/")

    def handle_dot():
        # "." may start a number (.5), a two-dot range (..) or be the
        # attribute-access punctuation.
        next()
        c = peek().charCodeAt(0)

        if is_digit(c):
            return read_num(".")

        if is_dot(c):
            # ellipses: Two dots in a row, e.g., [a..b]
            next()
            return token("punc", "..")

        return token("punc", ".")

    def read_word():
        # Read an identifier and classify it: atom / name / word-operator /
        # keyword.  A word-operator preceded by "." is a plain keyword.
        word = read_name()
        return token("atom", word) if KEYWORDS_ATOM[word] else (
            token("name", word) if not KEYWORDS[word] else
            (token("operator", word) if OPERATORS[word]
             and prevChar() is not "." else token("keyword", word)))

    def next_token():
        # Produce the next token; this closure is what the parser calls.
        indent = parse_whitespace()
        # if indent is 1:
        #     return token("punc", "{")
        if indent is -1:
            return token("punc", "}", False, True)

        start_token()
        ch = peek()
        if not ch:
            return token("eof")

        code = ch.charCodeAt(0)
        tmp_ = code
        if tmp_ is 34 or tmp_ is 39:  # double-quote (") or single quote (')
            return read_string(False)
        elif tmp_ is 35:  # pound-sign (#)
            if S['pos'] is 0 and charAt(S['text'], 1) is '!':
                # shebang
                return read_line_comment(True)
            regex_allowed = S['regex_allowed']
            S['comments_before'].push(read_line_comment())
            S['regex_allowed'] = regex_allowed
            return next_token()
        elif tmp_ is 46:  # dot (.)
            return handle_dot()
        elif tmp_ is 47:  # slash (/)
            return handle_slash()

        if is_digit(code):
            return read_num()

        if PUNC_CHARS[ch]:
            return token("punc", next())

        if OPERATOR_CHARS[ch]:
            return read_operator()

        if code is 92 and charAt(S['text'], S['pos'] + 1) is "\n":
            # backslash(=92) will consume the newline character that follows
            next()
            # backslash
            next()
            # newline
            S['newline_before'] = False
            return next_token()

        if is_identifier_start(code):
            tok = read_word()
            # A string-prefix word (r/u/f/v) directly followed by a quote
            # turns the name token into a string (or js/interpolated) token.
            if '\'"'.includes(peek()) and is_string_modifier(tok.value):
                mods = tok.value.toLowerCase()
                start_pos_for_string = S['tokpos']
                stok = read_string(
                    mods.indexOf('r') is not -1,
                    mods.indexOf('v') is not -1)
                tok.endpos = stok.endpos
                if stok.type is not 'js' and mods.indexOf('f') is not -1:
                    tok.col += start_pos_for_string - tok.pos
                    return handle_interpolated_string(stok.value, tok)
                tok.value = stok.value
                tok.type = stok.type
            return tok

        parse_error("Unexpected character '" + ch + "'")

    def context(nc=None):
        # Parser hook: replace the tokenizer state when *nc* is given, and
        # return the (possibly new) current state.
        nonlocal S
        if nc:
            S = nc
        return S

    next_token.context = context  # type: ignore

    return next_token
788
789