Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Python/Python-tokenize.c
12 views
1
#include "Python.h"
2
#include "errcode.h"
3
#include "../Parser/tokenizer.h"
4
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
5
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
6
7
// Forward declaration: needed so the state-lookup helpers below can locate
// this module's per-interpreter state via PyType_GetModuleByDef().
static struct PyModuleDef _tokenizemodule;
// Per-module (per-interpreter) state for the _tokenize extension module.
typedef struct {
    PyTypeObject *TokenizerIter;  // heap type created in tokenizemodule_exec()
} tokenize_state;
static tokenize_state *
14
get_tokenize_state(PyObject *module) {
15
return (tokenize_state *)PyModule_GetState(module);
16
}
17
18
// Resolve module state starting from a heap type: find the module that
// defined `type` (matched against _tokenizemodule) and return its state.
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
#include "pycore_runtime.h"
22
#include "clinic/Python-tokenize.c.h"
23
24
/*[clinic input]
25
module _tokenizer
26
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
27
[clinic start generated code]*/
28
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
29
30
// Instance layout for _tokenizer.tokenizeriter.
typedef struct
{
    PyObject_HEAD struct tok_state *tok;  // owned C tokenizer state, released in dealloc
    int done;  // set once ENDMARKER has been produced / iteration finished
} tokenizeriterobject;
/*[clinic input]
37
@classmethod
38
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
39
40
readline: object
41
/
42
*
43
extra_tokens: bool
44
encoding: str(c_default="NULL") = 'utf-8'
45
[clinic start generated code]*/
46
47
static PyObject *
48
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
49
int extra_tokens, const char *encoding)
50
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
51
{
52
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
53
if (self == NULL) {
54
return NULL;
55
}
56
PyObject *filename = PyUnicode_FromString("<string>");
57
if (filename == NULL) {
58
return NULL;
59
}
60
self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
61
if (self->tok == NULL) {
62
Py_DECREF(filename);
63
return NULL;
64
}
65
self->tok->filename = filename;
66
if (extra_tokens) {
67
self->tok->tok_extra_tokens = 1;
68
}
69
self->done = 0;
70
return (PyObject *)self;
71
}
72
73
// Translate the C tokenizer's error state (tok->done) into a pending Python
// exception.  Returns -1 when an exception was already pending or when
// constructing the exception payload itself fails; returns 0 after
// successfully raising the SyntaxError / IndentationError / TabError.
static int
_tokenizer_error(struct tok_state *tok)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF errors are raised immediately with location info; no
            // error-line tuple is built for them.
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }

    // Build a (msg, (filename, lineno, offset, text, None, None)) payload,
    // the shape SyntaxError's constructor accepts.
    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1; // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    // NOTE(review): the byte offset passed here still includes the stripped
    // newline, so it is one past error_line's length — presumably the helper
    // clamps; confirm against _PyPegen_byte_offset_to_character_offset.
    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }

    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }

    PyErr_SetObject(errtype, value);

exit:
    // All intermediates are owned references; release them on every path.
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}
static PyObject *
168
tokenizeriter_next(tokenizeriterobject *it)
169
{
170
PyObject* result = NULL;
171
struct token token;
172
_PyToken_Init(&token);
173
174
int type = _PyTokenizer_Get(it->tok, &token);
175
if (type == ERRORTOKEN) {
176
if(!PyErr_Occurred()) {
177
_tokenizer_error(it->tok);
178
assert(PyErr_Occurred());
179
}
180
goto exit;
181
}
182
if (it->done || type == ERRORTOKEN) {
183
PyErr_SetString(PyExc_StopIteration, "EOF");
184
it->done = 1;
185
goto exit;
186
}
187
PyObject *str = NULL;
188
if (token.start == NULL || token.end == NULL) {
189
str = PyUnicode_FromString("");
190
}
191
else {
192
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
193
}
194
if (str == NULL) {
195
goto exit;
196
}
197
198
int is_trailing_token = 0;
199
if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
200
is_trailing_token = 1;
201
}
202
203
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
204
PyObject* line = NULL;
205
if (it->tok->tok_extra_tokens && is_trailing_token) {
206
line = PyUnicode_FromString("");
207
} else {
208
Py_ssize_t size = it->tok->inp - line_start;
209
if (size >= 1 && it->tok->implicit_newline) {
210
size -= 1;
211
}
212
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
213
}
214
if (line == NULL) {
215
Py_DECREF(str);
216
goto exit;
217
}
218
219
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
220
Py_ssize_t end_lineno = it->tok->lineno;
221
Py_ssize_t col_offset = -1;
222
Py_ssize_t end_col_offset = -1;
223
if (token.start != NULL && token.start >= line_start) {
224
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
225
}
226
if (token.end != NULL && token.end >= it->tok->line_start) {
227
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
228
}
229
230
if (it->tok->tok_extra_tokens) {
231
if (is_trailing_token) {
232
lineno = end_lineno = lineno + 1;
233
col_offset = end_col_offset = 0;
234
}
235
// Necessary adjustments to match the original Python tokenize
236
// implementation
237
if (type > DEDENT && type < OP) {
238
type = OP;
239
}
240
else if (type == ASYNC || type == AWAIT) {
241
type = NAME;
242
}
243
else if (type == NEWLINE) {
244
Py_DECREF(str);
245
if (!it->tok->implicit_newline) {
246
if (it->tok->start[0] == '\r') {
247
str = PyUnicode_FromString("\r\n");
248
} else {
249
str = PyUnicode_FromString("\n");
250
}
251
}
252
end_col_offset++;
253
}
254
else if (type == NL) {
255
if (it->tok->implicit_newline) {
256
Py_DECREF(str);
257
str = PyUnicode_FromString("");
258
}
259
}
260
261
if (str == NULL) {
262
Py_DECREF(line);
263
goto exit;
264
}
265
}
266
267
result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
268
exit:
269
_PyToken_Free(&token);
270
if (type == ENDMARKER) {
271
it->done = 1;
272
}
273
return result;
274
}
275
276
static void
277
tokenizeriter_dealloc(tokenizeriterobject *it)
278
{
279
PyTypeObject *tp = Py_TYPE(it);
280
_PyTokenizer_Free(it->tok);
281
tp->tp_free(it);
282
Py_DECREF(tp);
283
}
284
285
// Type slots for _tokenize.TokenizerIter: construction, destruction, and
// the iterator protocol.
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},  // iter(it) returns it itself
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};
// Spec for the TokenizerIter heap type (immutable; default flags).
static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};
static int
302
tokenizemodule_exec(PyObject *m)
303
{
304
tokenize_state *state = get_tokenize_state(m);
305
if (state == NULL) {
306
return -1;
307
}
308
309
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
310
if (state->TokenizerIter == NULL) {
311
return -1;
312
}
313
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
314
return -1;
315
}
316
317
return 0;
318
}
319
320
// No module-level functions: the module only exposes the TokenizerIter type.
static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};
// Multi-phase init slots: run tokenizemodule_exec at module creation and
// declare support for per-interpreter GIL.
static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {0, NULL}
};
static int
331
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
332
{
333
tokenize_state *state = get_tokenize_state(m);
334
Py_VISIT(state->TokenizerIter);
335
return 0;
336
}
337
338
static int
339
tokenizemodule_clear(PyObject *m)
340
{
341
tokenize_state *state = get_tokenize_state(m);
342
Py_CLEAR(state->TokenizerIter);
343
return 0;
344
}
345
346
static void
347
tokenizemodule_free(void *m)
348
{
349
tokenizemodule_clear((PyObject *)m);
350
}
351
352
// Module definition for the private _tokenize extension module.
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),  // per-module state lives in tokenize_state
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};
// Module entry point: multi-phase initialization, so simply hand the module
// definition to the import machinery.
PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}