Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/peg_generator/pegen/tokenizer.py
12 views
1
import token
2
import tokenize
3
from typing import Dict, Iterator, List
4
5
# Index into the Tokenizer's token buffer, as returned by Tokenizer.mark().
Mark = int  # NewType('Mark', int)

# Mapping of exact operator strings (e.g. "(") to their token types.
exact_token_types = token.EXACT_TOKEN_TYPES
def shorttok(tok: tokenize.TokenInfo) -> str:
    """Render *tok* as a fixed-width (exactly 25 chars) one-line summary."""
    row, col = tok.start
    summary = f"{row}.{col}: {token.tok_name[tok.type]}:{tok.string!r}"
    # Pad or truncate to a constant width so verbose traces line up.
    return "%-25.25s" % summary
class Tokenizer:
    """Caching wrapper for the tokenize module.

    This is pretty tied to Python's syntax.
    """

    # All tokens consumed so far, in order; peek()/getnext() pull more
    # from the underlying generator on demand.
    _tokens: List[tokenize.TokenInfo]

    def __init__(
        self, tokengen: Iterator[tokenize.TokenInfo], *, path: str = "", verbose: bool = False
    ):
        self._tokengen = tokengen
        self._tokens = []
        self._index = 0  # Position of the next token to hand out.
        self._verbose = verbose
        # Source text keyed by 1-based line number; only filled in when
        # no path is given (otherwise get_lines() re-reads the file).
        self._lines: Dict[int, str] = {}
        self._path = path
        if verbose:
            self.report(False, False)

    def getnext(self) -> tokenize.TokenInfo:
        """Return the next token and update the index."""
        # True when the token was already in the buffer (e.g. after reset()).
        cached = self._index != len(self._tokens)
        tok = self.peek()
        self._index += 1
        if self._verbose:
            self.report(cached, False)
        return tok

    def peek(self) -> tokenize.TokenInfo:
        """Return the next token *without* updating the index."""
        while self._index == len(self._tokens):
            tok = next(self._tokengen)
            # Skip tokens the grammar never needs to see.
            if tok.type in (tokenize.NL, tokenize.COMMENT):
                continue
            if tok.type == token.ERRORTOKEN and tok.string.isspace():
                continue
            # Collapse runs of consecutive NEWLINE tokens into one.
            if (
                tok.type == token.NEWLINE
                and self._tokens
                and self._tokens[-1].type == token.NEWLINE
            ):
                continue
            self._tokens.append(tok)
            if not self._path:
                self._lines[tok.start[0]] = tok.line
        return self._tokens[self._index]

    def diagnose(self) -> tokenize.TokenInfo:
        """Return the last buffered token, fetching one if none were read yet."""
        if not self._tokens:
            self.getnext()
        return self._tokens[-1]

    def get_last_non_whitespace_token(self) -> tokenize.TokenInfo:
        """Return the most recently consumed token that is not whitespace-like.

        NEWLINE, INDENT, DEDENT and ENDMARKER tokens are skipped; if every
        consumed token is whitespace-like, the earliest one is returned.
        """
        for tok in reversed(self._tokens[: self._index]):
            if tok.type != tokenize.ENDMARKER and (
                tok.type < tokenize.NEWLINE or tok.type > tokenize.DEDENT
            ):
                break
        return tok

    def get_lines(self, line_numbers: List[int]) -> List[str]:
        """Retrieve source lines corresponding to line numbers."""
        if self._lines:
            lines = self._lines
        else:
            # Tokens came from a file: pull just the requested lines back
            # out of it, stopping as soon as every one has been found.
            wanted = len(line_numbers)
            lines = {}
            seen = 0
            # NOTE(review): opens with the platform default encoding —
            # presumably fine for grammar files; confirm for sources that
            # carry an explicit coding declaration.
            with open(self._path) as f:
                for lineno, line in enumerate(f, start=1):
                    if lineno in line_numbers:
                        seen += 1
                        lines[lineno] = line
                        if seen == wanted:
                            break

        return [lines[n] for n in line_numbers]

    def mark(self) -> "Mark":
        """Return the current position so reset() can come back to it."""
        return self._index

    def reset(self, index: "Mark") -> None:
        """Rewind (or fast-forward) to a position previously returned by mark()."""
        if index == self._index:
            return
        assert 0 <= index <= len(self._tokens), (index, len(self._tokens))
        old_index = self._index
        self._index = index
        if self._verbose:
            self.report(True, index < old_index)

    def report(self, cached: bool, back: bool) -> None:
        """Print a one-line trace of the current position (verbose mode only)."""
        if back:
            fill = "-" * self._index + "-"
        elif cached:
            fill = "-" * self._index + ">"
        else:
            fill = "-" * self._index + "*"
        if self._index == 0:
            print(f"{fill} (Bof)")
        else:
            tok = self._tokens[self._index - 1]
            print(f"{fill} {shorttok(tok)}")