/* Source: CPython, Parser/pegen_errors.c
 * (viewed via GitHub repository allendowney/cpython, blob/main/Parser/pegen_errors.c)
 */
1
#include <Python.h>
2
#include <errcode.h>
3
4
#include "tokenizer.h"
5
#include "pegen.h"
6
7
// TOKENIZER ERRORS
8
9
void
10
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
11
{
12
if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13
|| PyErr_ExceptionMatches(PyExc_SyntaxError)
14
|| PyErr_ExceptionMatches(PyExc_ValueError)
15
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16
return;
17
}
18
PyObject *errstr = NULL;
19
PyObject *tuple = NULL;
20
PyObject *type;
21
PyObject *value;
22
PyObject *tback;
23
PyErr_Fetch(&type, &value, &tback);
24
errstr = PyObject_Str(value);
25
if (!errstr) {
26
goto error;
27
}
28
29
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30
if (!tmp) {
31
goto error;
32
}
33
34
tuple = PyTuple_Pack(2, errstr, tmp);
35
Py_DECREF(tmp);
36
if (!value) {
37
goto error;
38
}
39
PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41
error:
42
Py_XDECREF(type);
43
Py_XDECREF(value);
44
Py_XDECREF(tback);
45
Py_XDECREF(errstr);
46
Py_XDECREF(tuple);
47
}
48
49
static inline void
50
raise_unclosed_parentheses_error(Parser *p) {
51
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52
int error_col = p->tok->parencolstack[p->tok->level-1];
53
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54
error_lineno, error_col, error_lineno, -1,
55
"'%c' was never closed",
56
p->tok->parenstack[p->tok->level-1]);
57
}
58
59
// Translate the tokenizer's `done` status code into an appropriate Python
// exception (SyntaxError, IndentationError, TabError, KeyboardInterrupt,
// MemoryError).  Always returns -1 so callers can propagate the failure.
int
_Pypegen_tokenizer_error(Parser *p)
{
    // An exception is already set (e.g. by the tokenizer itself): keep it.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // Unexpected end of input: blame a still-open bracket when one
            // exists, otherwise report a generic EOF error.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the offending character right after the backslash.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown parsing error";
    }

    // Cases that fell through the switch set `msg` (and possibly `errtype`
    // and `col_offset`) and are reported at the tokenizer's current line.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
113
114
int
115
_Pypegen_raise_decode_error(Parser *p)
116
{
117
assert(PyErr_Occurred());
118
const char *errtype = NULL;
119
if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120
errtype = "unicode error";
121
}
122
else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123
errtype = "value error";
124
}
125
if (errtype) {
126
PyObject *type;
127
PyObject *value;
128
PyObject *tback;
129
PyObject *errstr;
130
PyErr_Fetch(&type, &value, &tback);
131
errstr = PyObject_Str(value);
132
if (errstr) {
133
RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134
Py_DECREF(errstr);
135
}
136
else {
137
PyErr_Clear();
138
RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139
}
140
Py_XDECREF(type);
141
Py_XDECREF(value);
142
Py_XDECREF(tback);
143
}
144
145
return -1;
146
}
147
148
// Tokenize the whole remaining input to see if there are any tokenization
// errors such as mismatching parentheses.  These get priority over generic
// syntax errors only if the line number of the error is before the one that
// we had for the generic error.  Returns 0 on success, -1 when a
// higher-priority error was raised.
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Save the already-set exception so it can be restored if this extra
    // pass finds nothing better to report.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line on which the first (parsing) pass located its error.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // A bracket is still open: report it, but only if it was
                // opened before the previously known error location.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                // Any ordinary token: keep scanning.
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // Keep the freshly raised error; discard the saved one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
209
210
// PARSER ERRORS
211
212
// Raise `errtype` with `errmsg` (a printf-style format) at the location of
// the relevant token: p->known_err_token if set, otherwise p->mark (when
// `use_mark`) or the last filled token.  Always returns NULL so callers can
// `return _PyPegen_raise_error(...)`.
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Nothing has been tokenized yet: report at position (0, 0).
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // p->mark may point one past the last fetched token; make sure the
    // token it designates actually exists before indexing it.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // The token carries no column: derive one from the tokenizer's
        // current buffer position.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` yet the arms differ in
            // line_start vs buf — presumably guarding against a NULL
            // line_start; confirm intent against upstream history.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // SyntaxError columns are 1-based; token offsets are 0-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
253
254
// Return line `lineno` of the current source as a new unicode object
// (decoded UTF-8, "replace" error handler), read straight from the
// tokenizer's buffers.  Returns NULL with an exception set on failure.
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // For interactive/compile()d sources the buffer may start at a line
    // other than 1, so translate lineno into an offset within the buffer.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    // Advance cur_line past relative_lineno - 1 newlines.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
292
293
// Raise `errtype` with the (msg, (filename, lineno, col, text, end_lineno,
// end_col)) payload SyntaxError expects, at an explicitly given location.
// `errmsg` is a printf-style format consumed with `va`.  CURRENT_POS end
// coordinates are resolved against the tokenizer's current position.
// Always returns NULL.
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    // Errors inside f-string expressions get an "f-string: " prefix; the
    // heap copy is freed on every exit path below.
    if (p->start_rule == Py_fstring_input) {
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Obtain the text of the offending source line for the traceback.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The error line is still in the tokenizer's read buffer.
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            // Fall back to an empty line rather than failing outright.
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    // f-string locations are relative to the enclosing string literal.
    if (p->start_rule == Py_fstring_input) {
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    // With a non-UTF8-trivial encoding, byte offsets must be converted to
    // character offsets before being shown to the user.
    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    // "N" steals the reference to error_line on success and on failure.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        // Free the prefixed message allocated above.
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
413
414
// Ensure a SyntaxError (or more specific subclass) is set after the parser
// failed, choosing the most precise error available: an already-raised
// error, an unclosed-bracket/EOF error, an indentation error, or a generic
// "invalid syntax" located at `last_token`.
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
456
457