Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/peg_generator/pegen/tokenizer.py
12 views
1
import token
2
import tokenize
3
from typing import Dict, Iterator, List
4
5
# Index into the Tokenizer's token buffer, as returned by Tokenizer.mark().
Mark = int  # NewType('Mark', int)

# Mapping of exact operator strings (e.g. "(") to their token types.
exact_token_types = token.EXACT_TOKEN_TYPES
def shorttok(tok: tokenize.TokenInfo) -> str:
    """Render *tok* as a fixed-width (exactly 25 chars) one-line summary."""
    row, col = tok.start
    summary = f"{row}.{col}: {token.tok_name[tok.type]}:{tok.string!r}"
    # Pad or truncate to a constant width so verbose traces line up.
    return "%-25.25s" % summary
class Tokenizer:
    """Caching wrapper for the tokenize module.

    This is pretty tied to Python's syntax.
    """

    # All tokens consumed so far, in order; peek()/getnext() pull more
    # from the underlying generator on demand.
    _tokens: List[tokenize.TokenInfo]

    def __init__(
        self, tokengen: Iterator[tokenize.TokenInfo], *, path: str = "", verbose: bool = False
    ):
        self._tokengen = tokengen
        self._tokens = []
        self._index = 0  # Position of the next token to hand out.
        self._verbose = verbose
        # Source text keyed by 1-based line number; only filled in when
        # no path is given (otherwise get_lines() re-reads the file).
        self._lines: Dict[int, str] = {}
        self._path = path
        if verbose:
            self.report(False, False)

    def getnext(self) -> tokenize.TokenInfo:
        """Return the next token and update the index."""
        # True when the token was already in the buffer (e.g. after reset()).
        cached = self._index != len(self._tokens)
        tok = self.peek()
        self._index += 1
        if self._verbose:
            self.report(cached, False)
        return tok

    def peek(self) -> tokenize.TokenInfo:
        """Return the next token *without* updating the index."""
        while self._index == len(self._tokens):
            tok = next(self._tokengen)
            # Skip tokens the grammar never needs to see.
            if tok.type in (tokenize.NL, tokenize.COMMENT):
                continue
            if tok.type == token.ERRORTOKEN and tok.string.isspace():
                continue
            # Collapse runs of consecutive NEWLINE tokens into one.
            if (
                tok.type == token.NEWLINE
                and self._tokens
                and self._tokens[-1].type == token.NEWLINE
            ):
                continue
            self._tokens.append(tok)
            if not self._path:
                self._lines[tok.start[0]] = tok.line
        return self._tokens[self._index]

    def diagnose(self) -> tokenize.TokenInfo:
        """Return the last buffered token, fetching one if none were read yet."""
        if not self._tokens:
            self.getnext()
        return self._tokens[-1]

    def get_last_non_whitespace_token(self) -> tokenize.TokenInfo:
        """Return the most recently consumed token that is not whitespace-like.

        NEWLINE, INDENT, DEDENT and ENDMARKER tokens are skipped; if every
        consumed token is whitespace-like, the earliest one is returned.
        """
        for tok in reversed(self._tokens[: self._index]):
            if tok.type != tokenize.ENDMARKER and (
                tok.type < tokenize.NEWLINE or tok.type > tokenize.DEDENT
            ):
                break
        return tok

    def get_lines(self, line_numbers: List[int]) -> List[str]:
        """Retrieve source lines corresponding to line numbers."""
        if self._lines:
            lines = self._lines
        else:
            # Tokens came from a file: pull just the requested lines back
            # out of it, stopping as soon as every one has been found.
            wanted = len(line_numbers)
            lines = {}
            seen = 0
            # NOTE(review): opens with the platform default encoding —
            # presumably fine for grammar files; confirm for sources that
            # carry an explicit coding declaration.
            with open(self._path) as f:
                for lineno, line in enumerate(f, start=1):
                    if lineno in line_numbers:
                        seen += 1
                        lines[lineno] = line
                        if seen == wanted:
                            break

        return [lines[n] for n in line_numbers]

    def mark(self) -> "Mark":
        """Return the current position so reset() can come back to it."""
        return self._index

    def reset(self, index: "Mark") -> None:
        """Rewind (or fast-forward) to a position previously returned by mark()."""
        if index == self._index:
            return
        assert 0 <= index <= len(self._tokens), (index, len(self._tokens))
        old_index = self._index
        self._index = index
        if self._verbose:
            self.report(True, index < old_index)

    def report(self, cached: bool, back: bool) -> None:
        """Print a one-line trace of the current position (verbose mode only)."""
        if back:
            fill = "-" * self._index + "-"
        elif cached:
            fill = "-" * self._index + ">"
        else:
            fill = "-" * self._index + "*"
        if self._index == 0:
            print(f"{fill} (Bof)")
        else:
            tok = self._tokens[self._index - 1]
            print(f"{fill} {shorttok(tok)}")