Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Python/Python-tokenize.c
12 views
1
#include "Python.h"
2
#include "errcode.h"
3
#include "../Parser/tokenizer.h"
4
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
5
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
6
7
// Forward declaration: needed so the state-lookup helpers below can locate
// this module's per-interpreter state via PyType_GetModuleByDef().
static struct PyModuleDef _tokenizemodule;
// Per-module (per-interpreter) state for the _tokenize extension module.
typedef struct {
    PyTypeObject *TokenizerIter;  // heap type created in tokenizemodule_exec()
} tokenize_state;
static tokenize_state *
14
get_tokenize_state(PyObject *module) {
15
return (tokenize_state *)PyModule_GetState(module);
16
}
17
18
// Resolve module state starting from a heap type: find the module that
// defined `type` (matched against _tokenizemodule) and return its state.
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
#include "pycore_runtime.h"
22
#include "clinic/Python-tokenize.c.h"
23
24
/*[clinic input]
25
module _tokenizer
26
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
27
[clinic start generated code]*/
28
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
29
30
// Instance layout for _tokenizer.tokenizeriter.
typedef struct
{
    PyObject_HEAD struct tok_state *tok;  // owned C tokenizer state, released in dealloc
    int done;  // set once ENDMARKER has been produced / iteration finished
} tokenizeriterobject;
/*[clinic input]
37
@classmethod
38
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
39
40
readline: object
41
/
42
*
43
extra_tokens: bool
44
encoding: str(c_default="NULL") = 'utf-8'
45
[clinic start generated code]*/
46
47
static PyObject *
48
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
49
int extra_tokens, const char *encoding)
50
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
51
{
52
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
53
if (self == NULL) {
54
return NULL;
55
}
56
PyObject *filename = PyUnicode_FromString("<string>");
57
if (filename == NULL) {
58
return NULL;
59
}
60
self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
61
if (self->tok == NULL) {
62
Py_DECREF(filename);
63
return NULL;
64
}
65
self->tok->filename = filename;
66
if (extra_tokens) {
67
self->tok->tok_extra_tokens = 1;
68
}
69
self->done = 0;
70
return (PyObject *)self;
71
}
72
73
// Translate the C tokenizer's error state (tok->done) into a pending Python
// exception.  Returns -1 when an exception was already pending or when
// constructing the exception payload itself fails; returns 0 after
// successfully raising the SyntaxError / IndentationError / TabError.
static int
_tokenizer_error(struct tok_state *tok)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF errors are raised immediately with location info; no
            // error-line tuple is built for them.
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }

    // Build a (msg, (filename, lineno, offset, text, None, None)) payload,
    // the shape SyntaxError's constructor accepts.
    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1; // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    // NOTE(review): the byte offset passed here still includes the stripped
    // newline, so it is one past error_line's length — presumably the helper
    // clamps; confirm against _PyPegen_byte_offset_to_character_offset.
    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }

    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }

    PyErr_SetObject(errtype, value);

exit:
    // All intermediates are owned references; release them on every path.
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}
static PyObject *
168
tokenizeriter_next(tokenizeriterobject *it)
169
{
170
PyObject* result = NULL;
171
struct token token;
172
_PyToken_Init(&token);
173
174
int type = _PyTokenizer_Get(it->tok, &token);
175
if (type == ERRORTOKEN) {
176
if(!PyErr_Occurred()) {
177
_tokenizer_error(it->tok);
178
assert(PyErr_Occurred());
179
}
180
goto exit;
181
}
182
if (it->done || type == ERRORTOKEN) {
183
PyErr_SetString(PyExc_StopIteration, "EOF");
184
it->done = 1;
185
goto exit;
186
}
187
PyObject *str = NULL;
188
if (token.start == NULL || token.end == NULL) {
189
str = PyUnicode_FromString("");
190
}
191
else {
192
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
193
}
194
if (str == NULL) {
195
goto exit;
196
}
197
198
int is_trailing_token = 0;
199
if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
200
is_trailing_token = 1;
201
}
202
203
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
204
PyObject* line = NULL;
205
if (it->tok->tok_extra_tokens && is_trailing_token) {
206
line = PyUnicode_FromString("");
207
} else {
208
Py_ssize_t size = it->tok->inp - line_start;
209
if (size >= 1 && it->tok->implicit_newline) {
210
size -= 1;
211
}
212
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
213
}
214
if (line == NULL) {
215
Py_DECREF(str);
216
goto exit;
217
}
218
219
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
220
Py_ssize_t end_lineno = it->tok->lineno;
221
Py_ssize_t col_offset = -1;
222
Py_ssize_t end_col_offset = -1;
223
if (token.start != NULL && token.start >= line_start) {
224
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
225
}
226
if (token.end != NULL && token.end >= it->tok->line_start) {
227
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
228
}
229
230
if (it->tok->tok_extra_tokens) {
231
if (is_trailing_token) {
232
lineno = end_lineno = lineno + 1;
233
col_offset = end_col_offset = 0;
234
}
235
// Necessary adjustments to match the original Python tokenize
236
// implementation
237
if (type > DEDENT && type < OP) {
238
type = OP;
239
}
240
else if (type == ASYNC || type == AWAIT) {
241
type = NAME;
242
}
243
else if (type == NEWLINE) {
244
Py_DECREF(str);
245
if (!it->tok->implicit_newline) {
246
if (it->tok->start[0] == '\r') {
247
str = PyUnicode_FromString("\r\n");
248
} else {
249
str = PyUnicode_FromString("\n");
250
}
251
}
252
end_col_offset++;
253
}
254
else if (type == NL) {
255
if (it->tok->implicit_newline) {
256
Py_DECREF(str);
257
str = PyUnicode_FromString("");
258
}
259
}
260
261
if (str == NULL) {
262
Py_DECREF(line);
263
goto exit;
264
}
265
}
266
267
result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
268
exit:
269
_PyToken_Free(&token);
270
if (type == ENDMARKER) {
271
it->done = 1;
272
}
273
return result;
274
}
275
276
static void
277
tokenizeriter_dealloc(tokenizeriterobject *it)
278
{
279
PyTypeObject *tp = Py_TYPE(it);
280
_PyTokenizer_Free(it->tok);
281
tp->tp_free(it);
282
Py_DECREF(tp);
283
}
284
285
// Type slots for _tokenize.TokenizerIter: construction, destruction, and
// the iterator protocol.
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},  // iter(it) returns it itself
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};
// Spec for the TokenizerIter heap type (immutable; default flags).
static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};
static int
302
tokenizemodule_exec(PyObject *m)
303
{
304
tokenize_state *state = get_tokenize_state(m);
305
if (state == NULL) {
306
return -1;
307
}
308
309
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
310
if (state->TokenizerIter == NULL) {
311
return -1;
312
}
313
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
314
return -1;
315
}
316
317
return 0;
318
}
319
320
// No module-level functions: the module only exposes the TokenizerIter type.
static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};
// Multi-phase init slots: run tokenizemodule_exec at module creation and
// declare support for per-interpreter GIL.
static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {0, NULL}
};
static int
331
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
332
{
333
tokenize_state *state = get_tokenize_state(m);
334
Py_VISIT(state->TokenizerIter);
335
return 0;
336
}
337
338
static int
339
tokenizemodule_clear(PyObject *m)
340
{
341
tokenize_state *state = get_tokenize_state(m);
342
Py_CLEAR(state->TokenizerIter);
343
return 0;
344
}
345
346
static void
347
tokenizemodule_free(void *m)
348
{
349
tokenizemodule_clear((PyObject *)m);
350
}
351
352
// Module definition for the private _tokenize extension module.
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),  // per-module state lives in tokenize_state
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};
// Module entry point: multi-phase initialization, so simply hand the module
// definition to the import machinery.
PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}