Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/peg_generator/pegen/parser.py
12 views
1
import argparse
2
import sys
3
import time
4
import token
5
import tokenize
6
import traceback
7
from abc import abstractmethod
8
from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast
9
10
from pegen.tokenizer import Mark, Tokenizer, exact_token_types
11
12
T = TypeVar("T")
13
P = TypeVar("P", bound="Parser")
14
F = TypeVar("F", bound=Callable[..., Any])
15
16
17
def logger(method: F) -> F:
    """Trace calls to a non-memoized parsing method when verbose.

    (In practice this is only non-leader left-recursive functions.)
    """
    name = method.__name__

    def logger_wrapper(self: P, *args: object) -> T:
        # Skip all formatting work unless tracing was requested.
        if not self._verbose:
            return method(self, *args)
        rendered = ",".join(repr(arg) for arg in args)
        indent = " " * self._level
        print(f"{indent}{name}({rendered}) .... (looking at {self.showpeek()})")
        self._level += 1
        tree = method(self, *args)
        self._level -= 1
        print(f"{indent}... {name}({rendered}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, logger_wrapper)
38
39
40
def memoize(method: F) -> F:
    """Memoize a symbol method.

    Results are cached per (position, rule, args) key together with the
    token position reached, so a repeated attempt at the same spot can
    replay the outcome without re-parsing.
    """
    name = method.__name__

    def memoize_wrapper(self: P, *args: object) -> T:
        mark = self._mark()
        key = mark, name, args
        # Fast path: a cache hit while not tracing.
        if not self._verbose and key in self._cache:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: a miss, or verbose tracing was requested.
        verbose = self._verbose
        rendered = ",".join(repr(arg) for arg in args)
        indent = " " * self._level
        if key in self._cache:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{indent}{name}({rendered}) -> {tree!s:.200}")
        else:
            if verbose:
                print(f"{indent}{name}({rendered}) ... (looking at {self.showpeek()})")
            self._level += 1
            tree = method(self, *args)
            self._level -= 1
            if verbose:
                print(f"{indent}... {name}({rendered}) -> {tree!s:.200}")
            endmark = self._mark()
            self._cache[key] = tree, endmark
        # After a fresh parse the tokenizer already sits at endmark, so
        # this reset only moves the position on the cache-hit branch.
        self._reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, memoize_wrapper)
75
76
77
def memoize_left_rec(method: Callable[[P], Optional[T]]) -> Callable[[P], Optional[T]]:
    """Memoize a left-recursive symbol method.

    Unlike plain ``memoize``, this wrapper repeatedly re-runs the rule at
    the same position ("seed growing"): each iteration may consume more
    tokens by reusing the previous iteration's cached result for the
    recursive call, until the rule stops making progress.
    """
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: P) -> Optional[T]:
        mark = self._mark()
        # Left-recursive rules take no extra arguments, hence the empty tuple.
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = " " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result. For an explanation why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (But we use the memoization cache instead of a static
            # variable; perhaps this is similar to a paper by Warth et al.
            # (http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08).

            # Prime the cache with a failure.
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                # Every attempt starts from the same position; only the
                # cached seed (consulted by the recursive call) changes.
                self._reset(mark)
                self.in_recursive_rule += 1
                try:
                    result = method(self)
                finally:
                    self.in_recursive_rule -= 1
                endmark = self._mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    # The rule failed outright: keep the last good seed.
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    # No forward progress over the previous iteration:
                    # stop and keep the previous (longer) result.
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                # Progress was made: this result becomes the new seed.
                self._cache[key] = lastresult, lastmark = result, endmark

            self._reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            # On failure, record the original position so a later attempt
            # replays the failure without consuming tokens.
            if tree:
                endmark = self._mark()
            else:
                endmark = mark
            self._reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self._reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore
    return memoize_left_rec_wrapper
157
158
159
class Parser:
    """Base class for generated PEG parsers.

    Wraps a ``Tokenizer`` and provides the primitive recognizers
    (``name``, ``number``, ``expect``, lookaheads, ...) that generated
    rule methods build on.
    """

    # Hard keywords of the grammar; filled in by the generated subclass.
    KEYWORDS: ClassVar[Tuple[str, ...]]

    # Soft keywords (context-sensitive names); filled in by the subclass.
    SOFT_KEYWORDS: ClassVar[Tuple[str, ...]]

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        self._level = 0
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Integer tracking whether we are in a left recursive rule or not. Can be useful
        # for error reporting.
        self.in_recursive_rule = 0
        # Pass through common tokenizer methods.
        self._mark = self._tokenizer.mark
        self._reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        pass

    def showpeek(self) -> str:
        """Render the upcoming token for trace output."""
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        """Consume a NAME token that is not a hard keyword."""
        tok = self._tokenizer.peek()
        if tok.type != token.NAME or tok.string in self.KEYWORDS:
            return None
        return self._tokenizer.getnext()

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        """Consume a NUMBER token."""
        tok = self._tokenizer.peek()
        if tok.type != token.NUMBER:
            return None
        return self._tokenizer.getnext()

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        """Consume a STRING token."""
        tok = self._tokenizer.peek()
        if tok.type != token.STRING:
            return None
        return self._tokenizer.getnext()

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        """Consume an OP token."""
        tok = self._tokenizer.peek()
        if tok.type != token.OP:
            return None
        return self._tokenizer.getnext()

    @memoize
    def type_comment(self) -> Optional[tokenize.TokenInfo]:
        """Consume a TYPE_COMMENT token."""
        tok = self._tokenizer.peek()
        if tok.type != token.TYPE_COMMENT:
            return None
        return self._tokenizer.getnext()

    @memoize
    def soft_keyword(self) -> Optional[tokenize.TokenInfo]:
        """Consume a NAME token that is one of the soft keywords."""
        tok = self._tokenizer.peek()
        if tok.type != token.NAME or tok.string not in self.SOFT_KEYWORDS:
            return None
        return self._tokenizer.getnext()

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
        """Consume the next token if it matches *type*.

        *type* may be the token's literal text, the name of an exact
        token type, a token-type name from the ``token`` module, or
        operator text.
        """
        tok = self._tokenizer.peek()
        # Literal text match has the highest priority.
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types and tok.type == exact_token_types[type]:
            return self._tokenizer.getnext()
        if type in token.__dict__ and tok.type == token.__dict__[type]:
            return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]:
        """Return *res*, or raise a syntax error if it is None."""
        if res is None:
            raise self.make_syntax_error(f"expected {expectation}")
        return res

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        """Run *func* without consuming input; return its result."""
        saved = self._mark()
        ok = func(*args)
        self._reset(saved)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        """Run *func* without consuming input; succeed if it failed."""
        saved = self._mark()
        ok = func(*args)
        self._reset(saved)
        return not ok

    def make_syntax_error(self, message: str, filename: str = "<unknown>") -> SyntaxError:
        """Build a SyntaxError located at the tokenizer's error position."""
        tok = self._tokenizer.diagnose()
        location = (filename, tok.start[0], 1 + tok.start[1], tok.line)
        return SyntaxError(message, location)
263
264
265
def simple_parser_main(parser_class: Type[Parser]) -> None:
    """Command-line driver for a generated parser.

    Parses one input file (or stdin, with '-') using *parser_class*,
    prints the resulting tree, and with -v reports timing and cache
    statistics.  Exits with status 1 on a syntax error.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    # -vvv enables tokenizer tracing; -vv or -vvvv enables parser tracing.
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(args.filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        # Byte count for the lines/sec report; a tty or pipe has no
        # meaningful tell(), so fall back to 0.
        try:
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        # Fix: the filename was previously passed as the *message*
        # argument of make_syntax_error, so the reported error text was
        # the filename and the error's filename stayed "<unknown>".
        err = parser.make_syntax_error("invalid syntax", filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        # Don't count the synthetic ENDMARKER line.
        if diag.type == token.ENDMARKER:
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Cache sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        ## print_memstats()
335
336