# NOTE: web-viewer page header removed.  This file is
# Tools/cases_generator/lexer.py from CPython (mirror: allendowney/cpython).
1
# Parser for C code
2
# Originally by Mark Shannon ([email protected])
3
# https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34
4
5
import re
6
from dataclasses import dataclass
7
8
def choice(*opts):
    """Return a regex alternation matching any of *opts*, each parenthesized."""
    grouped = [f"({opt})" for opt in opts]
    return "|".join(grouped)
11
# Regexes

# Longer operators must go before shorter ones.

PLUSPLUS = r'\+\+'
MINUSMINUS = r'--'

# ->
ARROW = r'->'
ELLIPSIS = r'\.\.\.'

# Assignment operators
TIMESEQUAL = r'\*='
DIVEQUAL = r'/='
MODEQUAL = r'%='
PLUSEQUAL = r'\+='
MINUSEQUAL = r'-='
LSHIFTEQUAL = r'<<='
RSHIFTEQUAL = r'>>='
ANDEQUAL = r'&='
OREQUAL = r'\|='
XOREQUAL = r'\^='

# Operators
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
MOD = r'%'
NOT = r'~'
XOR = r'\^'
LOR = r'\|\|'
LAND = r'&&'
LSHIFT = r'<<'
RSHIFT = r'>>'
LE = r'<='
GE = r'>='
EQ = r'=='
NE = r'!='
LT = r'<'
GT = r'>'
LNOT = r'!'
OR = r'\|'
AND = r'&'
EQUALS = r'='

# ?
CONDOP = r'\?'

# Delimiters
LPAREN = r'\('
RPAREN = r'\)'
LBRACKET = r'\['
RBRACKET = r'\]'
LBRACE = r'\{'
RBRACE = r'\}'
COMMA = r','
PERIOD = r'\.'
SEMI = r';'
COLON = r':'
BACKSLASH = r'\\'

# Snapshot every ALL-CAPS name defined so far: token-kind name -> regex
# pattern.  Position matters: uppercase names defined later in the file
# (MACRO, IDENTIFIER, NUMBER, ...) are deliberately NOT picked up here.
operators = { op: pattern for op, pattern in globals().items() if op == op.upper() }
# Rebind each operator name to the name itself, so e.g. PLUS == 'PLUS' and
# the names can be used directly as token kinds by tokenize().
for op in operators:
    globals()[op] = op
# Map the literal operator text (pattern with backslashes stripped) to its
# kind.  BACKSLASH's pattern strips to the empty string, hence `or '\\'`.
opmap = { pattern.replace("\\", "") or '\\' : op for op, pattern in operators.items() }
77
78
# Macros
79
macro = r'# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)'
80
MACRO = 'MACRO'
81
82
id_re = r'[a-zA-Z_][0-9a-zA-Z_]*'
83
IDENTIFIER = 'IDENTIFIER'
84
85
suffix = r'([uU]?[lL]?[lL]?)'
86
octal = r'0[0-7]+' + suffix
87
hex = r'0[xX][0-9a-fA-F]+'
88
decimal_digits = r'(0|[1-9][0-9]*)'
89
decimal = decimal_digits + suffix
90
91
92
exponent = r"""([eE][-+]?[0-9]+)"""
93
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
94
float = '(((('+fraction+')'+exponent+'?)|([0-9]+'+exponent+'))[FfLl]?)'
95
96
number_re = choice(octal, hex, float, decimal)
97
NUMBER = 'NUMBER'
98
99
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
100
decimal_escape = r"""(\d+)"""
101
hex_escape = r"""(x[0-9a-fA-F]+)"""
102
escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
103
string_char = r"""([^"\\\n]|"""+escape_sequence+')'
104
str_re = '"'+string_char+'*"'
105
STRING = 'STRING'
106
char = r'\'.\'' # TODO: escape sequence
107
CHARACTER = 'CHARACTER'
108
109
comment_re = r'//.*|/\*([^*]|\*[^/])*\*/'
110
COMMENT = 'COMMENT'
111
112
newline = r"\n"
113
invalid = r"\S" # A single non-space character that's not caught by any of the other patterns
114
matcher = re.compile(choice(id_re, number_re, str_re, char, newline, macro, comment_re, *operators.values(), invalid))
115
letter = re.compile(r'[a-zA-Z_]')
116
117
# Keywords recognized by the tokenizer.  OVERRIDE and OFFSETOF are not
# standard C keywords — presumably project-specific; confirm against callers.
kwds = (
    'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
    'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
    'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'OVERRIDE',
    'REGISTER', 'OFFSETOF',
    'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
    'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
    'VOLATILE', 'WHILE'
)
# Bind each keyword kind to its own name (FOR == 'FOR'), mirroring how the
# operator kinds are bound via globals().
for name in kwds:
    globals()[name] = name
# Lowercase source text -> token kind; consulted first by tokenize().
keywords = { name.lower() : name for name in kwds }
129
130
131
def make_syntax_error(
    message: str, filename: str, line: int, column: int, line_text: str,
) -> SyntaxError:
    """Build (not raise) a SyntaxError with the standard 4-tuple location info."""
    location = (filename, line, column, line_text)
    return SyntaxError(message, location)
135
136
137
@dataclass(slots=True)
138
class Token:
139
kind: str
140
text: str
141
begin: tuple[int, int]
142
end: tuple[int, int]
143
144
@property
145
def line(self):
146
return self.begin[0]
147
148
@property
149
def column(self):
150
return self.begin[1]
151
152
@property
153
def end_line(self):
154
return self.end[0]
155
156
@property
157
def end_column(self):
158
return self.end[1]
159
160
@property
161
def width(self):
162
return self.end[1] - self.begin[1]
163
164
def replaceText(self, txt):
165
assert isinstance(txt, str)
166
return Token(self.kind, txt, self.begin, self.end)
167
168
def __repr__(self):
169
b0, b1 = self.begin
170
e0, e1 = self.end
171
if b0 == e0:
172
return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
173
else:
174
return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"
175
176
177
def tokenize(src, line=1, filename=None):
    """Yield Tokens for the C source string *src*.

    *line* is the line number to report for the first line of *src*;
    *filename* is used only for error reporting.  Newline matches update the
    line bookkeeping but are never yielded as tokens.

    Raises:
        SyntaxError: via make_syntax_error(), when the `invalid` pattern
        matches (a character no other pattern accepts).
    """
    # Index into src of the most recent newline; columns are start-linestart.
    linestart = -1
    for m in matcher.finditer(src):
        start, end = m.span()
        text = m.group(0)
        # Classify the match.  Order matters: keywords before identifiers,
        # and '...'/'.' before NUMBER (a float may start with '.').
        if text in keywords:
            kind = keywords[text]
        elif letter.match(text):
            kind = IDENTIFIER
        elif text == '...':
            kind = ELLIPSIS
        elif text == '.':
            kind = PERIOD
        elif text[0] in '0123456789.':
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == '\n':
            # Track line boundaries for position reporting.
            linestart = start
            line += 1
            kind = '\n'
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == '#':
            kind = MACRO
        elif text[0] == '/' and text[1] in '/*':
            kind = COMMENT
        else:
            # The `invalid` fallback matched: report the offending line.
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            raise make_syntax_error(f"Bad token: {text}",
                filename, line, start-linestart+1, src[linestart:lineend])
        if kind == COMMENT:
            # A block comment may span lines: record `begin` before advancing
            # line/linestart past any embedded newlines.
            begin = line, start-linestart
            newlines = text.count('\n')
            if newlines:
                linestart = start + text.rfind('\n')
                line += newlines
        else:
            begin = line, start-linestart
        if kind != "\n":
            yield Token(kind, text, begin, (line, start-linestart+len(text)))
222
223
224
__all__ = []
225
__all__.extend([kind for kind in globals() if kind.upper() == kind])
226
227
228
def to_text(tkns: list[Token], dedent: int = 0) -> str:
    """Reconstruct source text from *tkns*, preserving line/column layout.

    *dedent* shifts the baseline column: a negative value re-indents the
    continuation lines of multi-line comments by -dedent spaces; dedent > 0
    is not yet handled for comments (see TODO below).
    """
    res: list[str] = []
    line, col = -1, 1+dedent
    for tkn in tkns:
        if line == -1:
            # Start output at the first token's line.
            line, _ = tkn.begin
        l, c = tkn.begin
        #assert(l >= line), (line, txt, start, end)
        while l > line:
            # Emit newlines until we reach the token's starting line.
            line += 1
            res.append('\n')
            col = 1+dedent
        # Pad with spaces from the current column to the token's column.
        res.append(' '*(c-col))
        text = tkn.text
        if dedent != 0 and tkn.kind == 'COMMENT' and '\n' in text:
            if dedent < 0:
                text = text.replace('\n', '\n' + ' '*-dedent)
            # TODO: dedent > 0
        res.append(text)
        line, col = tkn.end
    return ''.join(res)
249
250
251
if __name__ == "__main__":
    # CLI driver: lex a file (or, with -c, a literal source string passed as
    # the next argument) and print each token on its own line.
    import sys
    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        # Use a context manager so the file handle is closed promptly; the
        # original bare open(filename).read() leaked the handle to the GC.
        with open(filename) as f:
            src = f.read()
    # print(to_text(tokenize(src)))
    for tkn in tokenize(src, filename=filename):
        print(tkn)
261
262