Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/python-wasm
Path: blob/main/python/pylang/src/tokenizer.py
1396 views
1
from __python__ import hash_literals # type: ignore
2
3
# mypy
4
from __python__ import RegExp, String, undefined, parseFloat
5
from typing import Any, Callable, Optional, Union, List, Literal
6
7
from unicode_aliases import ALIAS_MAP # type: ignore
8
from utils import make_predicate, characters, charAt, startswith
9
from ast_types import AST_Token
10
from errors import EOFError, SyntaxError
11
from string_interpolation import interpolate # type: ignore
12
13
# Number-literal recognizers (JS RegExp objects; pylang dialect).
RE_HEX_NUMBER = RegExp(r"^0x[0-9a-f]+$", "i")
RE_OCT_NUMBER = RegExp(r"^0[0-7]+$")
RE_DEC_NUMBER = RegExp(r"^\d*\.?\d*(?:e[+-]?\d*(?:\d\.?|\.?\d)\d*)?$", "i")

# Characters that may begin (or extend) an operator token.
OPERATOR_CHARS = make_predicate(characters("+-*&%=<>!?|~^@"))

# Escape letter -> ASCII control code: \a \b \f \n \r \t \v
ASCII_CONTROL_CHARS = {
    'a': 7,
    'b': 8,
    'f': 12,
    'n': 10,
    'r': 13,
    't': 9,
    'v': 11
}
HEX_PAT = RegExp(r"[a-fA-F0-9]")
NAME_PAT = RegExp(r"[a-zA-Z ]")  # characters allowed inside \N{...} names

# Every multi/single-character operator the language knows, as a predicate.
OPERATORS = make_predicate([
    "in", "instanceof", "typeof", "new", "void", "del", "+", "-", "not", "~",
    "&", "|", "^^", "^", "**", "*", "//", "/", "%", ">>", "<<", ">>>", "<",
    ">", "<=", ">=", "==", "is", "!=", "=", "+=", "-=", "//=", "/=", "*=",
    "%=", ">>=", "<<=", ">>>=", "|=", "^=", "&=", "and", "or", "@", "->"
])

# Python-spelled operators that map directly onto a JavaScript equivalent.
OP_MAP = {
    'or': "||",
    'and': "&&",
    'not': "!",
    'del': "delete",
    'None': "null",
    'is': "===",
}

# Everything treated as whitespace, including the Unicode space characters.
WHITESPACE_CHARS = make_predicate(
    characters(
        " \u00a0\n\r\t\f\u000b\u200b\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u202f\u205f\u3000"
    ))

# Punctuation after which a regexp literal (rather than division) may follow.
PUNC_BEFORE_EXPRESSION = make_predicate(characters("[{(,.;:"))

# Single-character punctuation tokens.
PUNC_CHARS = make_predicate(characters("[]{}(),;:?"))

keywords = "as assert break class continue def del do elif else except finally for from global if import in is lambda new nonlocal pass raise return yield try while with or and not"

keywords_atom = "False None True"

# see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar
reserved_words = (
    "break case class catch const continue debugger default delete do else export extends"
    " finally for function if import in instanceof new return super switch this throw try typeof var void"
    " while with yield enum implements static private package let public protected interface await null true false"
)

keyword_before_expression = "return yield new del raise elif else if"

ALL_KEYWORDS = keywords + " " + keywords_atom

KEYWORDS = make_predicate(keywords)
RESERVED_WORDS = make_predicate(reserved_words)
KEYWORDS_BEFORE_EXPRESSION = make_predicate(keyword_before_expression)
KEYWORDS_ATOM = make_predicate(keywords_atom)
IDENTIFIER_PAT = RegExp(r"^[a-z_$][_a-z0-9$]*$", "i")
76
77
78
# https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
79
def is_string_modifier(val: str) -> bool:
    """True when every character of *val* is a valid string-prefix letter
    (r/u/f/v in either case).  The empty prefix is accepted."""
    allowed = 'vrufVRUF'
    for c in val:
        if c not in allowed:
            return False
    return True
84
85
86
def is_letter(code: int) -> bool:
    """True when *code* is a letter: ASCII a-z / A-Z, or (for code points
    >= 170) anything matched by the Unicode letter character class."""
    if code >= 97 and code <= 122:  # a-z
        return True
    if code >= 65 and code <= 90:   # A-Z
        return True
    return code >= 170 and UNICODE['letter'].test(chr(code))
89
90
91
def is_digit(code: int) -> bool:
    """True when *code* is an ASCII digit '0'-'9'."""
    return 48 <= code and code <= 57  # ord('0')..ord('9')
93
94
95
def is_dot(code: int) -> bool:
    """True when *code* is the character code of '.'."""
    return code == 46  # ord('.')
97
98
99
def is_alphanumeric_char(code: int) -> bool:
    """True when *code* is an ASCII digit or a letter (incl. Unicode)."""
    if is_digit(code):
        return True
    return is_letter(code)
101
102
103
def is_unicode_combining_mark(ch: str) -> bool:
    """True when *ch* is a combining mark (non-spacing or spacing)."""
    return (UNICODE['non_spacing_mark'].test(ch)
            or UNICODE['space_combining_mark'].test(ch))
106
107
108
def is_unicode_connector_punctuation(ch: str) -> bool:
    """True when *ch* is connector punctuation (e.g. the underscore)."""
    return UNICODE['connector_punctuation'].test(ch)
110
111
112
def is_identifier(name: str) -> bool:
    """True when *name* may be used as an identifier: it matches the
    identifier pattern and is not a keyword, atom, or JS reserved word."""
    if RESERVED_WORDS[name]:
        return False
    if KEYWORDS[name] or KEYWORDS_ATOM[name]:
        return False
    return IDENTIFIER_PAT.test(name)
115
116
117
def is_identifier_start(code: int) -> bool:
    """True when *code* may begin an identifier: '$' (36), '_' (95),
    or any letter."""
    if code is 36 or code is 95:
        return True
    return is_letter(code)
119
120
121
def is_identifier_char(ch: str) -> bool:
    """True when *ch* may appear inside an identifier (after the first
    character)."""
    code = ord(ch)
    if is_identifier_start(code) or is_digit(code):
        return True
    # ZWNJ (8204) and ZWJ (8205) are permitted, as in JS identifiers.
    if code is 8204 or code is 8205:
        return True
    return (is_unicode_combining_mark(ch)
            or is_unicode_connector_punctuation(ch))
126
127
128
def parse_js_number(num: str) -> Union[float, int]:
    """Parse a hex (0x...), legacy octal (0...), or decimal literal.

    Raises ValueError when *num* matches none of the recognized forms.
    """
    if RE_HEX_NUMBER.test(num):
        return int(num[2:], 16)
    if RE_OCT_NUMBER.test(num):
        return int(num[1:], 8)
    if RE_DEC_NUMBER.test(num):
        return float(num)
    raise ValueError("invalid number")
136
137
138
# regexps adapted from http://xregexp.com/plugins/#unicode
# Character-class regexps used to classify non-ASCII characters when
# deciding what may appear in an identifier.
UNICODE = {
    # Unicode letters (category L*).
    'letter':
    RegExp(
        "[\\u0041-\\u005A\\u0061-\\u007A\\u00AA\\u00B5\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02C1\\u02C6-\\u02D1\\u02E0-\\u02E4\\u02EC\\u02EE\\u0370-\\u0374\\u0376\\u0377\\u037A-\\u037D\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03F5\\u03F7-\\u0481\\u048A-\\u0523\\u0531-\\u0556\\u0559\\u0561-\\u0587\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0621-\\u064A\\u066E\\u066F\\u0671-\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE\\u06EF\\u06FA-\\u06FC\\u06FF\\u0710\\u0712-\\u072F\\u074D-\\u07A5\\u07B1\\u07CA-\\u07EA\\u07F4\\u07F5\\u07FA\\u0904-\\u0939\\u093D\\u0950\\u0958-\\u0961\\u0971\\u0972\\u097B-\\u097F\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BD\\u09CE\\u09DC\\u09DD\\u09DF-\\u09E1\\u09F0\\u09F1\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A59-\\u0A5C\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABD\\u0AD0\\u0AE0\\u0AE1\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3D\\u0B5C\\u0B5D\\u0B5F-\\u0B61\\u0B71\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BD0\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D\\u0C58\\u0C59\\u0C60\\u0C61\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBD\\u0CDE\\u0CE0\\u0CE1\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D28\\u0D2A-\\u0D39\\u0D3D\\u0D60\\u0D61\\u0D7A-\\u0D7F\\u0D85-\\u0D96\\u0D9A-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E30\\u0E32\\u0E33\\u0E40-\\u0E46\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB0\\u0EB2\\u0EB3\\u0EBD\\u0EC0-\\u0EC4\\u0EC6\\u0EDC\\u0EDD\\u0F00\\u0F40-\\u0F47\\u0F49-\\u0F6C\\u0F88-\\u0F8B\\u1000-\\u102A\\u103F\\u1050-\\u1055\\u105A-\\u105D\\u1061\\u1065\\u1066\\u106E-\\u1070\\u1075-\\u1081\\u108E\\u10A0-\\u10C5\\u10D0-\\u10FA\\u10FC\\u1100-\\u1159\\u115F-\\u11A2\\u11A8-\\u11F9\\u1200-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u1380-\\u138F\\u13A0-\\u13F4\\u1401-\\u166C\\u166F-\\u1676\\u1681-\\u169A\\u16A0-\\u16EA\\u1700-\\u170C\\u170E-\\u1711\\u1720-\\u1731\\u1740-\\u1751\\u1760-\\u176C\\u176E-\\u1770\\u1780-\\u17B3\\u17D7\\u17DC\\u1820-\\u1877\\u1880-\\u18A8\\u18AA\\u1900-\\u191C\\u1950-\\u196D\\u1970-\\u1974\\u1980-\\u19A9\\u19C1-\\u19C7\\u1A00-\\u1A16\\u1B05-\\u1B33\\u1B45-\\u1B4B\\u1B83-\\u1BA0\\u1BAE\\u1BAF\\u1C00-\\u1C23\\u1C4D-\\u1C4F\\u1C5A-\\u1C7D\\u1D00-\\u1DBF\\u1E00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2071\\u207F\\u2090-\\u2094\\u2102\\u2107\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2139\\u213C-\\u213F\\u2145-\\u2149\\u214E\\u2183\\u2184\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u2C60-\\u2C6F\\u2C71-\\u2C7D\\u2C80-\\u2CE4\\u2D00-\\u2D25\\u2D30-\\u2D65\\u2D6F\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u2E2F\\u3005\\u3006\\u3031-\\u3035\\u303B\\u303C\\u3041-\\u3096\\u309D-\\u309F\\u30A1-\\u30FA\\u30FC-\\u30FF\\u3105-\\u312D\\u3131-\\u318E\\u31A0-\\u31B7\\u31F0-\\u31FF\\u3400\\u4DB5\\u4E00\\u9FC3\\uA000-\\uA48C\\uA500-\\uA60C\\uA610-\\uA61F\\uA62A\\uA62B\\uA640-\\uA65F\\uA662-\\uA66E\\uA67F-\\uA697\\uA717-\\uA71F\\uA722-\\uA788\\uA78B\\uA78C\\uA7FB-\\uA801\\uA803-\\uA805\\uA807-\\uA80A\\uA80C-\\uA822\\uA840-\\uA873\\uA882-\\uA8B3\\uA90A-\\uA925\\uA930-\\uA946\\uAA00-\\uAA28\\uAA40-\\uAA42\\uAA44-\\uAA4B\\uAC00\\uD7A3\\uF900-\\uFA2D\\uFA30-\\uFA6A\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D\\uFB1F-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74\\uFE76-\\uFEFC\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFF66-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC]"
    ),
    # Non-spacing combining marks (category Mn).
    'non_spacing_mark':
    RegExp(
        "[\\u0300-\\u036F\\u0483-\\u0487\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u064B-\\u065E\\u0670\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07A6-\\u07B0\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0900-\\u0902\\u093C\\u0941-\\u0948\\u094D\\u0951-\\u0955\\u0962\\u0963\\u0981\\u09BC\\u09C1-\\u09C4\\u09CD\\u09E2\\u09E3\\u0A01\\u0A02\\u0A3C\\u0A41\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A51\\u0A70\\u0A71\\u0A75\\u0A81\\u0A82\\u0ABC\\u0AC1-\\u0AC5\\u0AC7\\u0AC8\\u0ACD\\u0AE2\\u0AE3\\u0B01\\u0B3C\\u0B3F\\u0B41-\\u0B44\\u0B4D\\u0B56\\u0B62\\u0B63\\u0B82\\u0BC0\\u0BCD\\u0C3E-\\u0C40\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C62\\u0C63\\u0CBC\\u0CBF\\u0CC6\\u0CCC\\u0CCD\\u0CE2\\u0CE3\\u0D41-\\u0D44\\u0D4D\\u0D62\\u0D63\\u0DCA\\u0DD2-\\u0DD4\\u0DD6\\u0E31\\u0E34-\\u0E3A\\u0E47-\\u0E4E\\u0EB1\\u0EB4-\\u0EB9\\u0EBB\\u0EBC\\u0EC8-\\u0ECD\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F71-\\u0F7E\\u0F80-\\u0F84\\u0F86\\u0F87\\u0F90-\\u0F97\\u0F99-\\u0FBC\\u0FC6\\u102D-\\u1030\\u1032-\\u1037\\u1039\\u103A\\u103D\\u103E\\u1058\\u1059\\u105E-\\u1060\\u1071-\\u1074\\u1082\\u1085\\u1086\\u108D\\u109D\\u135F\\u1712-\\u1714\\u1732-\\u1734\\u1752\\u1753\\u1772\\u1773\\u17B7-\\u17BD\\u17C6\\u17C9-\\u17D3\\u17DD\\u180B-\\u180D\\u18A9\\u1920-\\u1922\\u1927\\u1928\\u1932\\u1939-\\u193B\\u1A17\\u1A18\\u1A56\\u1A58-\\u1A5E\\u1A60\\u1A62\\u1A65-\\u1A6C\\u1A73-\\u1A7C\\u1A7F\\u1B00-\\u1B03\\u1B34\\u1B36-\\u1B3A\\u1B3C\\u1B42\\u1B6B-\\u1B73\\u1B80\\u1B81\\u1BA2-\\u1BA5\\u1BA8\\u1BA9\\u1C2C-\\u1C33\\u1C36\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8\\u1CED\\u1DC0-\\u1DE6\\u1DFD-\\u1DFF\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2CEF-\\u2CF1\\u2DE0-\\u2DFF\\u302A-\\u302F\\u3099\\u309A\\uA66F\\uA67C\\uA67D\\uA6F0\\uA6F1\\uA802\\uA806\\uA80B\\uA825\\uA826\\uA8C4\\uA8E0-\\uA8F1\\uA926-\\uA92D\\uA947-\\uA951\\uA980-\\uA982\\uA9B3\\uA9B6-\\uA9B9\\uA9BC\\uAA29-\\uAA2E\\uAA31\\uAA32\\uAA35\\uAA36\\uAA43\\uAA4C\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF\\uAAC1\\uABE5\\uABE8\\uABED\\uFB1E\\uFE00-\\uFE0F\\uFE20-\\uFE26]"
    ),
    # Spacing combining marks (category Mc).
    'space_combining_mark':
    RegExp(
        "[\\u0903\\u093E-\\u0940\\u0949-\\u094C\\u094E\\u0982\\u0983\\u09BE-\\u09C0\\u09C7\\u09C8\\u09CB\\u09CC\\u09D7\\u0A03\\u0A3E-\\u0A40\\u0A83\\u0ABE-\\u0AC0\\u0AC9\\u0ACB\\u0ACC\\u0B02\\u0B03\\u0B3E\\u0B40\\u0B47\\u0B48\\u0B4B\\u0B4C\\u0B57\\u0BBE\\u0BBF\\u0BC1\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCC\\u0BD7\\u0C01-\\u0C03\\u0C41-\\u0C44\\u0C82\\u0C83\\u0CBE\\u0CC0-\\u0CC4\\u0CC7\\u0CC8\\u0CCA\\u0CCB\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D3E-\\u0D40\\u0D46-\\u0D48\\u0D4A-\\u0D4C\\u0D57\\u0D82\\u0D83\\u0DCF-\\u0DD1\\u0DD8-\\u0DDF\\u0DF2\\u0DF3\\u0F3E\\u0F3F\\u0F7F\\u102B\\u102C\\u1031\\u1038\\u103B\\u103C\\u1056\\u1057\\u1062-\\u1064\\u1067-\\u106D\\u1083\\u1084\\u1087-\\u108C\\u108F\\u109A-\\u109C\\u17B6\\u17BE-\\u17C5\\u17C7\\u17C8\\u1923-\\u1926\\u1929-\\u192B\\u1930\\u1931\\u1933-\\u1938\\u19B0-\\u19C0\\u19C8\\u19C9\\u1A19-\\u1A1B\\u1A55\\u1A57\\u1A61\\u1A63\\u1A64\\u1A6D-\\u1A72\\u1B04\\u1B35\\u1B3B\\u1B3D-\\u1B41\\u1B43\\u1B44\\u1B82\\u1BA1\\u1BA6\\u1BA7\\u1BAA\\u1C24-\\u1C2B\\u1C34\\u1C35\\u1CE1\\u1CF2\\uA823\\uA824\\uA827\\uA880\\uA881\\uA8B4-\\uA8C3\\uA952\\uA953\\uA983\\uA9B4\\uA9B5\\uA9BA\\uA9BB\\uA9BD-\\uA9C0\\uAA2F\\uAA30\\uAA33\\uAA34\\uAA4D\\uAA7B\\uABE3\\uABE4\\uABE6\\uABE7\\uABE9\\uABEA\\uABEC]"
    ),
    # Connector punctuation (category Pc), e.g. the underscore.
    'connector_punctuation':
    RegExp(
        "[\\u005F\\u203F\\u2040\\u2054\\uFE33\\uFE34\\uFE4D-\\uFE4F\\uFF3F]")
}
156
157
158
def is_token(token, type, val) -> bool:
    """True when *token* has the given *type* and — unless *val* is
    None/undefined — also the given value."""
    if token.type is not type:
        return False
    return val is None or val is undefined or token.value is val
161
162
163
def tokenizer(raw_text: str, filename: str) -> Callable[[], Any]:
    """Return a next_token() closure that yields AST_Token objects for
    *raw_text*.

    The returned function carries a ``context`` attribute that the parser
    uses to save and restore the complete tokenizer state.

    Fixes relative to the previous revision:
    - with_eof_error: the inner wrapper was itself named ``eof_error``,
      shadowing the message parameter, so EOF parse errors reported the
      wrapper function instead of the message string.
    - token(): ``ret['comments_before.length']`` looked up a literal
      (always-undefined) key, so newline flags were never propagated from
      preceding comments.
    - read_line_comment/read_operator/read_num/context gained explicit
      defaults for parameters that were already being omitted by callers.
    """
    # Mutable tokenizer state, shared by all closures below.
    S = {
        'exponent':  # parse ^ as exponent and ^^ as xor
        False,
        # Normalize all line endings to "\n" and strip BOM characters.
        'text':
        raw_text.replace(RegExp(r"\r\n?|[\n\u2028\u2029]", "g"),
                         "\n").replace(RegExp(r"\uFEFF", "g"), ""),
        'filename':
        filename,
        'pos':  # current read position in text
        0,
        'tokpos':  # position where the current token started
        0,
        'line':  # current 1-based line number
        1,
        'tokline':
        0,
        'col':
        0,
        'tokcol':
        0,
        'newline_before':
        False,
        'regex_allowed':
        False,
        'comments_before':
        [],
        'whitespace_before':  # stack of leading whitespace, one per block
        [],
        'newblock':
        False,
        'endblock':
        False,
        'indentation_matters':  # JS array literal (stack of booleans)
        r'%js [ true ]',
        'cached_whitespace':
        "",
        'prev':
        undefined,
        'index_or_slice':  # JS array literal (stack of booleans)
        r'%js [ false ]',
        'expecting_object_literal_key':
        False,  # This is set by the parser when it is expecting an object literal key
    }

    def peek() -> str:
        # Current character without consuming it.
        return charAt(S['text'], S['pos'])

    def peekpeek():
        # One character of lookahead beyond peek().
        return charAt(S['text'], S['pos'] + 1)

    def prevChar():
        # Character immediately before the current token.
        return charAt(S['text'], S['tokpos'] - 1)

    def next(signal_eof=False, in_string=False):
        # Consume and return one character, maintaining line/col counters.
        ch = charAt(S['text'], S['pos'])
        S['pos'] += 1
        if signal_eof and not ch:
            raise EOFError

        if ch is "\n":
            S['newline_before'] = S['newline_before'] or not in_string
            S['line'] += 1
            S['col'] = 0
        else:
            S['col'] += 1
        return ch

    def find(what: str, signal_eof: bool = False) -> int:
        # Index of the next occurrence of *what* from the current position,
        # or -1 (or EOFError when signal_eof is set).
        pos = S['text'].indexOf(what, S['pos'])
        if signal_eof and pos is -1:
            raise EOFError
        return pos

    def start_token() -> None:
        # Record where the token about to be read begins.
        S['tokline'] = S['line']
        S['tokcol'] = S['col']
        S['tokpos'] = S['pos']

    def token(type: str,
              value: Any,
              is_comment: bool = False,
              keep_newline: bool = False) -> AST_Token:
        # Build an AST_Token for the text consumed since start_token(),
        # updating block/indentation bookkeeping as a side effect.
        if S['exponent'] and type == 'operator':
            if value == '^':
                value = '**'
            elif value == '^^':
                value = '^'
        S['regex_allowed'] = (
            type is "operator"
            or type is "keyword" and KEYWORDS_BEFORE_EXPRESSION[value]
            or type is "punc" and PUNC_BEFORE_EXPRESSION[value])

        # Fuse "is" followed by "not" into a single !== operator token.
        if type is "operator" and value is "is" and S['text'].substr(
                S['pos']).trimLeft().substr(0, 4).trimRight() is "not":
            next_token()
            value = "!=="

        if type is "operator" and OP_MAP[value]:
            value = OP_MAP[value]

        ret = {
            'type': type,
            'value': value,
            'line': S['tokline'],
            'col': S['tokcol'],
            'pos': S['tokpos'],
            'endpos': S['pos'],
            'nlb': S['newline_before'],
            'file': filename,
            'leading_whitespace': S['whitespace_before'][-1] or '',
        }
        if not is_comment:
            ret['comments_before'] = S['comments_before']
            S['comments_before'] = []
            # make note of any newlines in the comments that came before
            # (fixed: previously indexed the literal key
            # 'comments_before.length', which is always undefined).
            for i in range(ret['comments_before'].length):
                ret['nlb'] = ret['nlb'] or ret['comments_before'][i]['nlb']

        if not keep_newline:
            S['newline_before'] = False

        if type is "punc":
            # A trailing ":" that is not part of a slice/dict key opens a
            # new indentation block.
            if (value is ":" and not S['index_or_slice'][-1]
                    and not S['expecting_object_literal_key'] and
                    (not S['text'].substring(S['pos'] + 1, find("\n")).trim()
                     or not S['text'].substring(S['pos'] + 1,
                                                find("#")).trim())):
                S['newblock'] = True
                S['indentation_matters'].push(True)

            if value is "[":
                # "[" right after a name or ")"/"]" is indexing/slicing;
                # otherwise it starts a list literal.
                if S['prev'] and (S['prev'].type is "name" or
                                  (S['prev'].type is 'punc'
                                   and S['prev'].value in ')]')):
                    S['index_or_slice'].push(True)
                else:
                    S['index_or_slice'].push(False)
                S['indentation_matters'].push(False)
            elif value is "{" or value is "(":
                S['indentation_matters'].push(False)
            elif value is "]":
                S['index_or_slice'].pop()
                S['indentation_matters'].pop()
            elif value is "}" or value is ")":
                S['indentation_matters'].pop()
        S['prev'] = AST_Token(ret)
        return S['prev']

    # this will transform leading whitespace to block tokens unless
    # part of array/hash, and skip non-leading whitespace
    def parse_whitespace() -> Union[Literal[-1], Literal[1], Literal[0]]:
        leading_whitespace = ""
        whitespace_exists = False
        while WHITESPACE_CHARS[peek()]:
            whitespace_exists = True
            ch = next()
            if ch is "\n":
                leading_whitespace = ""
            else:
                leading_whitespace += ch
        if peek() is not "#":
            if not whitespace_exists:
                leading_whitespace = S['cached_whitespace']
            else:
                S['cached_whitespace'] = leading_whitespace
        if S['newline_before'] or S['endblock']:
            return test_indent_token(leading_whitespace)
        return 0

    def test_indent_token(
            leading_whitespace: str
    ) -> Union[Literal[-1], Literal[1], Literal[0]]:
        # Compare the new leading whitespace against the enclosing block's
        # and report indent (+1), dedent (-1) or no change (0).
        most_recent = S['whitespace_before'][-1] or ""
        S['endblock'] = False
        if S['indentation_matters'][
                -1] and leading_whitespace is not most_recent:
            if S['newblock'] and leading_whitespace and startswith(
                    leading_whitespace, most_recent):
                # positive indent, new block
                S['newblock'] = False
                S['whitespace_before'].push(leading_whitespace)
                return 1
            elif most_recent and startswith(most_recent, leading_whitespace):
                # negative indent, block is ending
                S['endblock'] = True
                S['whitespace_before'].pop()
                return -1
            else:
                # indent mismatch, inconsistent indentation
                parse_error("Inconsistent indentation")
        return 0

    def read_while(pred: Callable) -> str:
        # Accumulate characters for as long as pred(ch, index) holds.
        ret = ""
        i = 0
        ch = peek()
        while ch and pred(ch, i):
            i += 1
            ret += next()
            ch = peek()
        return ret

    def parse_error(err: str, is_eof: bool = False) -> SyntaxError:
        # Raise a SyntaxError pinned to the current token's position.
        raise SyntaxError(err, filename, S['tokline'], S['tokcol'],
                          S['tokpos'], is_eof)

    def read_num(prefix: Optional[str] = None) -> Optional[AST_Token]:
        # Read a numeric literal; *prefix* is "." when called from
        # handle_dot() (a leading-dot float like .5).
        has_e = False
        has_x = False
        has_dot = prefix is "."

        # Read a binary number
        if not prefix and peek() is '0' and charAt(S['text'],
                                                   S['pos'] + 1) is 'b':
            next(), next()

            def is01(ch):
                return ch is '0' or ch is '1'

            num = read_while(is01)
            try:
                valid = int(num, 2)  # type: Union[float, int]
            except:
                parse_error('Invalid syntax for a binary number')
            return token('num', valid)
        seen = []  # type: List[str]

        def is_num(ch, i):
            # Per-character validity check for decimal/hex/float literals;
            # tracks whether an exponent, hex prefix or dot has been seen.
            nonlocal has_dot, has_e, has_x
            seen.push(ch)
            if ch is 'x' or ch is 'X':
                if has_x or seen.length is not 2 or seen[0] is not '0':
                    return False
                has_x = True
                return True
            elif ch is 'e' or ch is 'E':
                if has_x:
                    return True
                if has_e or i == 0:
                    return False
                has_e = True
                return True
            elif ch is '-':
                if i is 0 and not prefix:
                    return True
                if has_e and seen[i - 1].toLowerCase() is 'e':
                    return True
                return False
            elif ch is '+':
                if has_e and seen[i - 1].toLowerCase() is 'e':
                    return True
                return False
            elif ch is '.':
                # If next ch after this is also a ., then its
                # something like [389..5077], so we stop
                if peekpeek() is '.':
                    return False
                if not has_dot and not has_x and not has_e:
                    has_dot = True
                    return True
                return False
            return is_alphanumeric_char(ch.charCodeAt(0))

        num = read_while(is_num)
        if prefix:
            num = prefix + num

        try:
            valid = parse_js_number(num)
        except:
            parse_error("SyntaxError: invalid syntax in numeric literal -- " +
                        num)
            return undefined
        return token("num", valid)

    # This returns str or int, since it could be a
    # hex number or a hex character code.
    def read_hex_digits(count: int) -> Union[str, int]:
        # Consume up to *count* hex digits; returns the numeric value, or
        # the partial string when there are too few digits or the code
        # point is out of Unicode range.
        ans = ''
        while count > 0:
            count -= 1
            if not HEX_PAT.test(peek()):
                return ans
            ans += next()
        nval = int(ans, 16)
        if nval > 0x10FFFF:
            return ans
        return nval

    def read_escape_sequence():
        # Interpret the character(s) following a backslash inside a
        # non-raw string literal.
        q = next(True)
        if q is '\n':
            return ''  # backslash-newline is a line continuation
        if q is '\\':
            return q
        if '"\''.indexOf(q) is not -1:
            return q
        if ASCII_CONTROL_CHARS[q]:
            return String.fromCharCode(ASCII_CONTROL_CHARS[q])
        if '0' <= q <= '7':
            # Up to three octal digits.
            octal = q
            if '0' <= peek() <= '7':
                octal += next()
            if '0' <= peek() <= '7':
                octal += next()
            code = parseInt(octal, 8)
            if isNaN(code):
                return '\\' + octal
            return String.fromCharCode(code)
        if q is 'x':
            code = read_hex_digits(2)
            if jstype(code) is 'number':
                return String.fromCharCode(code)
            return '\\x' + code
        if q is 'u':
            code = read_hex_digits(4)
            if jstype(code) is 'number':
                return String.fromCharCode(code)
            return '\\u' + code
        if q is 'U':
            code = read_hex_digits(8)
            if jstype(code) is 'number':
                if code <= 0xFFFF:
                    return String.fromCharCode(code)
                # Encode astral code points as a UTF-16 surrogate pair.
                code -= 0x10000
                return String.fromCharCode(0xD800 + (code >> 10),
                                           0xDC00 + (code & 0x3FF))
            return '\\U' + code
        if q is 'N' and peek() is '{':
            # \N{NAME}: resolve the character alias via ALIAS_MAP.
            next()

            def is_name_ch(ch):
                return NAME_PAT.test(ch)

            name = read_while(is_name_ch)
            if peek() is not '}':
                return '\\N{' + name
            next()
            key = (name or '').toLowerCase()
            if not name or not Object.prototype.hasOwnProperty.call(
                    ALIAS_MAP, key):
                return '\\N{' + name + '}'
            code = ALIAS_MAP[key]
            if code <= 0xFFFF:
                return String.fromCharCode(code)
            code -= 0x10000
            return String.fromCharCode(0xD800 + (code >> 10),
                                       0xDC00 + (code & 0x3FF))
        return '\\' + q

    def with_eof_error(eof_error, cont):
        # Wrap *cont* so that an EOFError raised while reading surfaces as
        # a parse error carrying the message *eof_error*.
        # (fixed: the wrapper used to be named `eof_error` itself, which
        # shadowed the message parameter and passed the function object to
        # parse_error instead of the message.)
        def wrapped():
            try:
                return cont.apply(None, arguments)
            except Exception as ex:
                if ex is EOFError:
                    parse_error(eof_error, True)
                else:
                    raise

        return wrapped

    def _read_string(is_raw_literal, is_js_literal):
        # Read a (possibly triple-quoted) string literal; the opening quote
        # is the next character in the input.
        quote = next()
        tok_type = 'js' if is_js_literal else 'string'
        ret = ""
        is_multiline = False
        if peek() is quote:
            # two quotes in a row
            next(True)
            if peek() is quote:
                # multiline string (3 quotes in a row)
                next(True)
                is_multiline = True
            else:
                return token(tok_type, '')

        while True:
            ch = next(True, True)
            if not ch:
                break
            if ch is "\n" and not is_multiline:
                parse_error("End of line while scanning string literal")

            if ch is "\\":
                ret += (
                    '\\' +
                    next(True)) if is_raw_literal else read_escape_sequence()
                continue

            if ch is quote:
                if not is_multiline:
                    break
                if peek() is quote:
                    next()
                    if peek() is quote:
                        next()
                        break
                    else:
                        ch += quote
            ret += ch
        # r'%js ...' raw literals are embedded javascript.
        if is_raw_literal and ret[:3] == '%js' and WHITESPACE_CHARS[ret[3]]:
            return token(
                'js', ret[4:].trim())  # trim since really javascript string.
        return token(tok_type, ret)

    read_string = with_eof_error("Unterminated string constant", _read_string)

    def handle_interpolated_string(string, start_tok):
        # Splice the expanded interpolation of *string* back into the input
        # at the current position, wrapped in parentheses, so the parser
        # sees it as an ordinary parenthesized expression.
        def raise_error(err):
            raise SyntaxError(err, filename, start_tok.line, start_tok.col,
                              start_tok.pos, False)

        S['text'] = S['text'][:S['pos']] + '(' + interpolate(
            string, raise_error) + ')' + S['text'][S['pos']:]
        return token('punc', next())

    def read_line_comment(shebang=False):
        # Consume to end of line; *shebang* marks a leading "#!" line.
        if not shebang:
            next()
        i = find("\n")

        if i is -1:
            ret = S['text'].substr(S['pos'])
            S['pos'] = S['text'].length
        else:
            ret = S['text'].substring(S['pos'], i)
            S['pos'] = i

        return token("shebang" if shebang else "comment1", ret, True)

    def read_name():
        # Read an identifier, honoring backslash-newline continuations.
        name = ch = ""
        while True:
            ch = peek()
            if ch is None:
                break
            if ch is "\\":
                if charAt(S['text'], S['pos'] + 1) is "\n":
                    S['pos'] += 2
                    continue
                break
            elif is_identifier_char(ch):
                name += next()
            else:
                break
        return name

    def do_read_regexp():
        # Read a /regexp/ literal.  "//" opens a verbose regexp (ended by
        # "///") in which whitespace is ignored and "#" starts a comment.
        prev_backslash = False
        regexp = ch = ''
        in_class = False
        verbose_regexp = False
        in_comment = False

        if peek() is '/':
            next(True)
            if peek() is '/':
                verbose_regexp = True
                next(True)
            else:  # empty regexp (//)
                mods = read_name()
                return token("regexp", RegExp(regexp, mods))
        while True:
            ch = next(True)
            if not ch:
                break
            if in_comment:
                if ch is '\n':
                    in_comment = False
                continue
            if prev_backslash:
                regexp += "\\" + ch
                prev_backslash = False
            elif ch is "[":
                in_class = True
                regexp += ch
            elif ch is "]" and in_class:
                in_class = False
                regexp += ch
            elif ch is "/" and not in_class:
                if verbose_regexp:
                    # a verbose regexp only ends at the third "/" in a row
                    if peek() is not '/':
                        regexp += '\\/'
                        continue
                    next(True)
                    if peek() is not '/':
                        regexp += '\\/\\/'
                        continue
                    next(True)
                break
            elif ch is "\\":
                prev_backslash = True
            elif verbose_regexp and not in_class and ' \n\r\t'.indexOf(
                    ch) is not -1:
                pass
            elif verbose_regexp and not in_class and ch is '#':
                in_comment = True
            else:
                regexp += ch

        mods = read_name()
        return token("regexp", RegExp(regexp, mods))

    read_regexp = with_eof_error("Unterminated regular expression",
                                 do_read_regexp)

    def read_operator(prefix=None):
        # Greedily extend an operator while the longer spelling is still a
        # valid operator (e.g. ">" -> ">>" -> ">>=").
        def grow(op):
            if not peek():
                return op

            bigger = op + peek()
            if OPERATORS[bigger]:
                next()
                return grow(bigger)
            else:
                return op

        op = grow(prefix or next())
        if op is '->':
            # pretend that this is an operator as the tokenizer only allows
            # one character punctuation.
            return token('punc', op)
        return token("operator", op)

    def handle_slash():
        # "/" starts a regexp literal or a division operator, depending on
        # what the previous token allows.
        next()
        return read_regexp("") if S['regex_allowed'] else read_operator("/")

    def handle_dot():
        # "." may start a number (.5), a two-dot range (..) or be the
        # attribute-access punctuation.
        next()
        c = peek().charCodeAt(0)

        if is_digit(c):
            return read_num(".")

        if is_dot(c):
            # ellipses: Two dots in a row, e.g., [a..b]
            next()
            return token("punc", "..")

        return token("punc", ".")

    def read_word():
        # Read an identifier and classify it: atom / name / word-operator /
        # keyword.  A word-operator preceded by "." is a plain keyword.
        word = read_name()
        return token("atom", word) if KEYWORDS_ATOM[word] else (
            token("name", word) if not KEYWORDS[word] else
            (token("operator", word) if OPERATORS[word]
             and prevChar() is not "." else token("keyword", word)))

    def next_token():
        # Produce the next token; this closure is what the parser calls.
        indent = parse_whitespace()
        # if indent is 1:
        #     return token("punc", "{")
        if indent is -1:
            return token("punc", "}", False, True)

        start_token()
        ch = peek()
        if not ch:
            return token("eof")

        code = ch.charCodeAt(0)
        tmp_ = code
        if tmp_ is 34 or tmp_ is 39:  # double-quote (") or single quote (')
            return read_string(False)
        elif tmp_ is 35:  # pound-sign (#)
            if S['pos'] is 0 and charAt(S['text'], 1) is '!':
                # shebang
                return read_line_comment(True)
            regex_allowed = S['regex_allowed']
            S['comments_before'].push(read_line_comment())
            S['regex_allowed'] = regex_allowed
            return next_token()
        elif tmp_ is 46:  # dot (.)
            return handle_dot()
        elif tmp_ is 47:  # slash (/)
            return handle_slash()

        if is_digit(code):
            return read_num()

        if PUNC_CHARS[ch]:
            return token("punc", next())

        if OPERATOR_CHARS[ch]:
            return read_operator()

        if code is 92 and charAt(S['text'], S['pos'] + 1) is "\n":
            # backslash(=92) will consume the newline character that follows
            next()
            # backslash
            next()
            # newline
            S['newline_before'] = False
            return next_token()

        if is_identifier_start(code):
            tok = read_word()
            # A string-prefix word (r/u/f/v) directly followed by a quote
            # turns the name token into a string (or js/interpolated) token.
            if '\'"'.includes(peek()) and is_string_modifier(tok.value):
                mods = tok.value.toLowerCase()
                start_pos_for_string = S['tokpos']
                stok = read_string(
                    mods.indexOf('r') is not -1,
                    mods.indexOf('v') is not -1)
                tok.endpos = stok.endpos
                if stok.type is not 'js' and mods.indexOf('f') is not -1:
                    tok.col += start_pos_for_string - tok.pos
                    return handle_interpolated_string(stok.value, tok)
                tok.value = stok.value
                tok.type = stok.type
            return tok

        parse_error("Unexpected character '" + ch + "'")

    def context(nc=None):
        # Parser hook: replace the tokenizer state when *nc* is given, and
        # return the (possibly new) current state.
        nonlocal S
        if nc:
            S = nc
        return S

    next_token.context = context  # type: ignore

    return next_token
788
789