/* Source: CPython, Parser/pegen_errors.c
 * (viewed via GitHub repository allendowney/cpython, blob/main/Parser/pegen_errors.c)
 */
1
#include <Python.h>
2
#include <errcode.h>
3
4
#include "tokenizer.h"
5
#include "pegen.h"
6
7
// TOKENIZER ERRORS
8
9
void
10
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
11
{
12
if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13
|| PyErr_ExceptionMatches(PyExc_SyntaxError)
14
|| PyErr_ExceptionMatches(PyExc_ValueError)
15
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16
return;
17
}
18
PyObject *errstr = NULL;
19
PyObject *tuple = NULL;
20
PyObject *type;
21
PyObject *value;
22
PyObject *tback;
23
PyErr_Fetch(&type, &value, &tback);
24
errstr = PyObject_Str(value);
25
if (!errstr) {
26
goto error;
27
}
28
29
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30
if (!tmp) {
31
goto error;
32
}
33
34
tuple = PyTuple_Pack(2, errstr, tmp);
35
Py_DECREF(tmp);
36
if (!value) {
37
goto error;
38
}
39
PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41
error:
42
Py_XDECREF(type);
43
Py_XDECREF(value);
44
Py_XDECREF(tback);
45
Py_XDECREF(errstr);
46
Py_XDECREF(tuple);
47
}
48
49
static inline void
50
raise_unclosed_parentheses_error(Parser *p) {
51
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52
int error_col = p->tok->parencolstack[p->tok->level-1];
53
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54
error_lineno, error_col, error_lineno, -1,
55
"'%c' was never closed",
56
p->tok->parenstack[p->tok->level-1]);
57
}
58
59
// Translate the tokenizer's `done` status code into an appropriate Python
// exception (SyntaxError, IndentationError, TabError, KeyboardInterrupt,
// MemoryError).  Always returns -1 so callers can propagate the failure.
int
_Pypegen_tokenizer_error(Parser *p)
{
    // An exception is already set (e.g. by the tokenizer itself): keep it.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // Unexpected end of input: blame a still-open bracket when one
            // exists, otherwise report a generic EOF error.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the offending character right after the backslash.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown parsing error";
    }

    // Cases that fell through the switch set `msg` (and possibly `errtype`
    // and `col_offset`) and are reported at the tokenizer's current line.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
113
114
int
115
_Pypegen_raise_decode_error(Parser *p)
116
{
117
assert(PyErr_Occurred());
118
const char *errtype = NULL;
119
if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120
errtype = "unicode error";
121
}
122
else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123
errtype = "value error";
124
}
125
if (errtype) {
126
PyObject *type;
127
PyObject *value;
128
PyObject *tback;
129
PyObject *errstr;
130
PyErr_Fetch(&type, &value, &tback);
131
errstr = PyObject_Str(value);
132
if (errstr) {
133
RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134
Py_DECREF(errstr);
135
}
136
else {
137
PyErr_Clear();
138
RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139
}
140
Py_XDECREF(type);
141
Py_XDECREF(value);
142
Py_XDECREF(tback);
143
}
144
145
return -1;
146
}
147
148
// Tokenize the whole remaining input to see if there are any tokenization
// errors such as mismatching parentheses.  These get priority over generic
// syntax errors only if the line number of the error is before the one that
// we had for the generic error.  Returns 0 on success, -1 when a
// higher-priority error was raised.
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Save the already-set exception so it can be restored if this extra
    // pass finds nothing better to report.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line on which the first (parsing) pass located its error.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // A bracket is still open: report it, but only if it was
                // opened before the previously known error location.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                // Any ordinary token: keep scanning.
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // Keep the freshly raised error; discard the saved one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
209
210
// PARSER ERRORS
211
212
// Raise `errtype` with `errmsg` (a printf-style format) at the location of
// the relevant token: p->known_err_token if set, otherwise p->mark (when
// `use_mark`) or the last filled token.  Always returns NULL so callers can
// `return _PyPegen_raise_error(...)`.
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Nothing has been tokenized yet: report at position (0, 0).
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // p->mark may point one past the last fetched token; make sure the
    // token it designates actually exists before indexing it.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // The token carries no column: derive one from the tokenizer's
        // current buffer position.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` yet the arms differ in
            // line_start vs buf — presumably guarding against a NULL
            // line_start; confirm intent against upstream history.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // SyntaxError columns are 1-based; token offsets are 0-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
253
254
// Return line `lineno` of the current source as a new unicode object
// (decoded UTF-8, "replace" error handler), read straight from the
// tokenizer's buffers.  Returns NULL with an exception set on failure.
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // For interactive/compile()d sources the buffer may start at a line
    // other than 1, so translate lineno into an offset within the buffer.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    // Advance cur_line past relative_lineno - 1 newlines.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
292
293
// Raise `errtype` with the (msg, (filename, lineno, col, text, end_lineno,
// end_col)) payload SyntaxError expects, at an explicitly given location.
// `errmsg` is a printf-style format consumed with `va`.  CURRENT_POS end
// coordinates are resolved against the tokenizer's current position.
// Always returns NULL.
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    // Errors inside f-string expressions get an "f-string: " prefix; the
    // heap copy is freed on every exit path below.
    if (p->start_rule == Py_fstring_input) {
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Obtain the text of the offending source line for the traceback.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The error line is still in the tokenizer's read buffer.
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            // Fall back to an empty line rather than failing outright.
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    // f-string locations are relative to the enclosing string literal.
    if (p->start_rule == Py_fstring_input) {
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    // With a non-UTF8-trivial encoding, byte offsets must be converted to
    // character offsets before being shown to the user.
    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    // "N" steals the reference to error_line on success and on failure.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        // Free the prefixed message allocated above.
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
413
414
// Ensure a SyntaxError (or more specific subclass) is set after the parser
// failed, choosing the most precise error available: an already-raised
// error, an unclosed-bracket/EOF error, an indentation error, or a generic
// "invalid syntax" located at `last_token`.
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
456
457