Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Parser/tokenizer.c
12 views
1
2
/* Tokenizer implementation */
3
4
#include "Python.h"
5
#include "pycore_call.h" // _PyObject_CallNoArgs()
6
7
#include <ctype.h>
8
#include <assert.h>
9
10
#include "tokenizer.h"
11
#include "errcode.h"
12
13
/* Alternate tab spacing */
14
#define ALTTABSIZE 1
15
16
#define is_potential_identifier_start(c) (\
17
(c >= 'a' && c <= 'z')\
18
|| (c >= 'A' && c <= 'Z')\
19
|| c == '_'\
20
|| (c >= 128))
21
22
#define is_potential_identifier_char(c) (\
23
(c >= 'a' && c <= 'z')\
24
|| (c >= 'A' && c <= 'Z')\
25
|| (c >= '0' && c <= '9')\
26
|| c == '_'\
27
|| (c >= 128))
28
29
30
/* Don't ever change this -- it would break the portability of Python code */
31
#define TABSIZE 8
32
33
#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
34
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
35
type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
36
#define ADVANCE_LINENO() \
37
tok->lineno++; \
38
tok->col_offset = 0;
39
40
#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0)
41
#define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0)
42
/* Accessors for the f-string tokenizer-mode stack.  In debug builds
   these are real functions so the bounds asserts fire; in release
   builds they are plain macros with no checking. */
#ifdef Py_DEBUG
/* Return the current (top) mode on the stack. */
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
}
/* Push: advance the stack index and return the new top entry. */
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
}
#else
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
#endif
57
58
/* Forward */
59
static struct tok_state *tok_new(void);
60
static int tok_nextc(struct tok_state *tok);
61
static void tok_backup(struct tok_state *tok, int c);
62
static int syntaxerror(struct tok_state *tok, const char *format, ...);
63
64
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
65
tokenizing. */
66
static const char* type_comment_prefix = "# type: ";
67
68
/* Create and initialize a new tok_state structure */
69
70
/* Allocate a tok_state on the heap and set every field to its neutral
   starting value.  Returns NULL on allocation failure (no Python
   exception is set here; callers check for NULL). */
static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    /* Buffer pointers: nothing read yet. */
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    /* Indentation tracking: one implicit level 0 on the stack. */
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->starting_col_offset = -1;
    tok->col_offset = -1;
    tok->level = 0;
    tok->altindstack[0] = 0;
    /* Encoding / decoding state. */
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->readline = NULL;
    tok->type_comments = 0;
    /* async-related tokenization state (see async_hacks). */
    tok->async_hacks = 0;
    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->str = NULL;
    tok->report_warnings = 1;
    tok->tok_extra_tokens = 0;
    tok->comment_newline = 0;
    tok->implicit_newline = 0;
    /* f-string mode stack starts with a single regular-mode entry. */
    tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
    tok->tok_mode_stack_index = 0;
    tok->tok_report_warnings = 1;
#ifdef Py_DEBUG
    tok->debug = _Py_GetConfig()->parser_debug;
#endif
    return tok;
}
125
126
static char *
127
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
128
{
129
char* result = (char *)PyMem_Malloc(len + 1);
130
if (!result) {
131
tok->done = E_NOMEM;
132
return NULL;
133
}
134
memcpy(result, s, len);
135
result[len] = '\0';
136
return result;
137
}
138
139
/* Record that decoding failed and reset the tokenizer's buffers.
   Frees tok->buf only when the tokenizer owns it (fp- or
   readline-based tokenizers; the condition mirrors _PyTokenizer_Free).
   Always returns NULL so callers can write `return error_ret(tok);`
   as if EOF had been reached. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
        PyMem_Free(tok->buf);
    }
    tok->buf = tok->cur = tok->inp = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_DECODE;
    return NULL; /* as if it were EOF */
}
152
153
154
/* Normalize an encoding name for the two encodings the tokenizer can
   handle natively: "utf_8"-style spellings map to "utf-8" and the
   latin-1 aliases map to "iso-8859-1"; any other name is returned
   unchanged.  Only the first 12 characters are examined, which is
   enough to distinguish the recognized prefixes.  Returns a static
   string or S itself; never allocates. */
static const char *
get_normal_name(const char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast to unsigned char: passing a negative value (a byte
               >= 0x80 where plain char is signed) to tolower() is
               undefined behavior per the C standard. */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
182
183
/* Return the coding spec in S, or NULL if none is found. */
184
185
/* Scan one source line S (SIZE bytes) for a "coding: xxx" declaration.
   On success return 1 and store the normalized encoding name (a
   new_string allocation owned by the caller) in *spec, or leave *spec
   NULL when the line carries no spec.  Return 0 only on memory
   failure (tok->done is set by new_string). */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        /* Any non-whitespace before the '#' means this is a code
           line, which cannot carry a coding spec. */
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip whitespace after the ':'/'='. */
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* Encoding names consist of alphanumerics plus '-', '_'
               and '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                q = get_normal_name(r);
                if (r != q) {
                    /* Replace the raw spelling with its canonical
                       name. */
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
233
234
/* Check whether the line contains a coding spec. If it does,
235
invoke the set_readline function for the new encoding.
236
This function receives the tok_state and the new encoding.
237
Return 1 on success, 0 on failure. */
238
239
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        /* No spec on this line; decide whether to keep looking on the
           next one. */
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* utf-8 needs no recoding; any other encoding must install a
           decoding readline via set_readline. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;  /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declared spec must
           agree with it. */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
288
289
/* See whether the file starts with a BOM. If it does,
290
invoke the set_readline function with the new encoding.
291
Return 1 on success, 0 on failure. */
292
293
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM (EF BB BF); push the bytes back if it
           turns out not to be one. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        /* Not a BOM start byte: put it back untouched. */
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record "utf-8" as the encoding,
       replacing any previously recorded name. */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
330
331
/* Append LINE to the accumulated interactive source buffer
   (tok->interactive_src_start .. tok->interactive_src_end), faking a
   trailing '\n' when LINE does not end with one (tok->implicit_newline
   records whether that happened).  Returns 0 on success, -1 on memory
   failure (tok->done = E_NOMEM). */
static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    /* For an empty line this reads line[0], i.e. the terminating NUL. */
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;  /* reserve room for the newline we will fake */
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        /* Realloc failed: the original block is still live, so free it
           through the saved pointer before clearing the fields. */
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);
    tok->implicit_newline = 0;
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
        tok->implicit_newline = 1;
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}
369
370
/* Traverse and remember all f-string buffers, in order to be able to restore
371
them after reallocating tok->buf */
372
static void
373
remember_fstring_buffers(struct tok_state *tok)
374
{
375
int index;
376
tokenizer_mode *mode;
377
378
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
379
mode = &(tok->tok_mode_stack[index]);
380
mode->f_string_start_offset = mode->f_string_start - tok->buf;
381
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
382
}
383
}
384
385
/* Traverse and restore all f-string buffers after reallocating tok->buf */
386
static void
387
restore_fstring_buffers(struct tok_state *tok)
388
{
389
int index;
390
tokenizer_mode *mode;
391
392
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
393
mode = &(tok->tok_mode_stack[index]);
394
mode->f_string_start = tok->buf + mode->f_string_start_offset;
395
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
396
}
397
}
398
399
/* Attach the saved source text of the last f-string expression to
   TOKEN->metadata.  C is the character that terminated the expression.
   Returns 0 on success or when there is nothing to record, -1 on
   decode failure (exception set). */
static int
set_fstring_expr(struct tok_state* tok, struct token *token, char c) {
    assert(token != NULL);
    assert(c == '}' || c == ':' || c == '!');
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    /* Only f-string "debug" expressions carry metadata, and each token
       gets it at most once. */
    if (!tok_mode->f_string_debug || token->metadata) {
        return 0;
    }

    /* last_expr_buffer is not NUL-terminated; its length is
       last_expr_size, and last_expr_end marks the trailing portion to
       exclude (see update_fstring_expr). */
    PyObject *res = PyUnicode_DecodeUTF8(
        tok_mode->last_expr_buffer,
        tok_mode->last_expr_size - tok_mode->last_expr_end,
        NULL
    );
    if (!res) {
        return -1;
    }
    token->metadata = res;
    return 0;
}
420
421
/* Keep the current mode's record of the f-string expression source in
   sync while tokenizing.  CUR is the triggering character:
     0           - tok->buf was refilled; append tok->cur's text
     '{'         - a new expression starts; snapshot tok->cur
     '}' '!' ':' - the expression ended; record its end offset
   Returns 1 on success, 0 on memory failure (tok->done = E_NOMEM). */
static int
update_fstring_expr(struct tok_state *tok, char cur)
{
    assert(tok->cur != NULL);

    Py_ssize_t size = strlen(tok->cur);
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    switch (cur) {
        case 0:
            /* Nothing to do when no expression is being collected or
               the current one has already been terminated. */
            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
                return 1;
            }
            char *new_buffer = PyMem_Realloc(
                tok_mode->last_expr_buffer,
                tok_mode->last_expr_size + size
            );
            if (new_buffer == NULL) {
                /* Realloc failure leaves the old block live: free it. */
                PyMem_Free(tok_mode->last_expr_buffer);
                goto error;
            }
            tok_mode->last_expr_buffer = new_buffer;
            strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
            tok_mode->last_expr_size += size;
            break;
        case '{':
            if (tok_mode->last_expr_buffer != NULL) {
                PyMem_Free(tok_mode->last_expr_buffer);
            }
            /* NOTE: the buffer is deliberately not NUL-terminated; its
               length is tracked in last_expr_size. */
            tok_mode->last_expr_buffer = PyMem_Malloc(size);
            if (tok_mode->last_expr_buffer == NULL) {
                goto error;
            }
            tok_mode->last_expr_size = size;
            tok_mode->last_expr_end = -1;
            strncpy(tok_mode->last_expr_buffer, tok->cur, size);
            break;
        case '}':
        case '!':
        case ':':
            /* Record the end offset only for the first terminator. */
            if (tok_mode->last_expr_end == -1) {
                tok_mode->last_expr_end = strlen(tok->start);
            }
            break;
        default:
            Py_UNREACHABLE();
    }
    return 1;
error:
    tok->done = E_NOMEM;
    return 0;
}
473
474
static void
475
free_fstring_expressions(struct tok_state *tok)
476
{
477
int index;
478
tokenizer_mode *mode;
479
480
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
481
mode = &(tok->tok_mode_stack[index]);
482
if (mode->last_expr_buffer != NULL) {
483
PyMem_Free(mode->last_expr_buffer);
484
mode->last_expr_buffer = NULL;
485
mode->last_expr_size = 0;
486
mode->last_expr_end = -1;
487
}
488
}
489
}
490
491
/* Read a line of text from TOK into S, using the stream in TOK.
492
Return NULL on failure, else S.
493
494
On entry, tok->decoding_buffer will be one of:
495
1) NULL: need to call tok->decoding_readline to get a new line
496
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
497
stored the result in tok->decoding_buffer
498
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
499
(in the s buffer) to copy entire contents of the line read
500
by tok->decoding_readline. tok->decoding_buffer has the overflow.
501
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
502
until the buffer ends with a '\n' (or until the end of the file is
503
reached): see tok_nextc and its calls to tok_reserve_buf.
504
*/
505
506
/* Ensure tok->buf has at least SIZE free bytes after tok->inp, growing
   it (by at least 50%) with PyMem_Realloc when needed.  All pointers
   into the buffer (cur/inp/start/line_start/multi_line_start and the
   f-string mode pointers) are rebased onto the new block.  Returns 1
   on success, 0 on memory failure (tok->done = E_NOMEM). */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Save positions as offsets before realloc can move the block;
           -1 marks a NULL pointer. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        restore_fstring_buffers(tok);
    }
    return 1;
}
534
535
/* Report whether any of the first SIZE bytes of STR is a NUL byte. */
static inline int
contains_null_bytes(const char* str, size_t size) {
    for (size_t i = 0; i < size; i++) {
        if (str[i] == '\0') {
            return 1;
        }
    }
    return 0;
}
539
540
/* Read one line via tok->decoding_readline and append its UTF-8 bytes
   at tok->inp, consuming tok->decoding_buffer first when a previous
   call left overflow there.  Returns 1 on success, 0 on failure
   (error state set via error_ret / tok_reserve_buf). */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        /* Take ownership of the buffered line. */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    /* Interactive sessions also accumulate the raw source text. */
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
581
582
/* Set the readline function for TOK to a StreamReader's
583
readline function. The StreamReader is named ENC.
584
585
This function is called from check_bom and check_coding_spec.
586
587
ENC is usually identical to the future value of tok->encoding,
588
except for the (currently unsupported) case of UTF-16.
589
590
Return 1 on success, 0 on failure. */
591
592
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *open, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    /* Build an io.open() text stream over fd using ENC, and keep only
       its readline method. */
    open = _PyImport_GetModuleAttrString("io", "open");
    if (open == NULL) {
        return 0;
    }
    stream = PyObject_CallFunction(open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(open);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    Py_XSETREF(tok->decoding_readline, readline);

    /* Consume the partial line we stepped back into (see comment
       above) so subsequent reads start at the right place. */
    if (pos > 0) {
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
640
641
/* Fetch the next byte from TOK. */
642
643
static int fp_getc(struct tok_state *tok) {
644
return getc(tok->fp);
645
}
646
647
/* Unfetch the last byte back into TOK. */
648
649
static void fp_ungetc(int c, struct tok_state *tok) {
    /* Push C back onto the stream so the next fp_getc returns it. */
    ungetc(c, tok->fp);
}
652
653
/* Check whether the characters at s start a valid
654
UTF-8 sequence. Return the number of characters forming
655
the sequence if yes, 0 if not. The special cases match
656
those in stringlib/codecs.h:utf8_decode.
657
*/
658
static int
valid_utf8(const unsigned char* s)
{
    int expected = 0;   /* number of continuation bytes expected */
    int length;
    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    else if (*s < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        if (*s < 0xC2) {
            /* invalid sequence
               \x80-\xBF -- continuation byte
               \xC0-\xC1 -- fake 0000-007F */
            return 0;
        }
        expected = 1;
    }
    else if (*s < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (*s == 0xE0 && *(s + 1) < 0xA0) {
            /* invalid sequence
               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
            return 0;
        }
        else if (*s == 0xED && *(s + 1) >= 0xA0) {
            /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
               will result in surrogates in range D800-DFFF. Surrogates are
               not valid UTF-8 so they are rejected.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        expected = 2;
    }
    else if (*s < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
            /* invalid sequence -- one of:
               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
               \xF4\x90\x80\x80- -- 110000- overflow */
            return 0;
        }
        expected = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }
    length = expected + 1;
    /* Each continuation byte must be in \x80-\xBF. */
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
            return 0;
    return length;
}
714
715
/* Verify that the NUL-terminated LINE is entirely valid UTF-8.
   On failure, set a SyntaxError naming the first bad byte and return
   0; return 1 when the whole line is valid. */
static int
ensure_utf8(char *line, struct tok_state *tok)
{
    int badchar = 0;
    unsigned char *c;
    int length;
    /* Advance one UTF-8 sequence at a time; valid_utf8 returns the
       sequence length, or 0 at the first invalid byte. */
    for (c = (unsigned char *)line; *c; c += length) {
        if (!(length = valid_utf8(c))) {
            badchar = *c;
            break;
        }
    }
    if (badchar) {
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://peps.python.org/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno);
        return 0;
    }
    return 1;
}
738
739
/* Fetch a byte from TOK, using the string buffer. */
740
741
static int
742
buf_getc(struct tok_state *tok) {
743
return Py_CHARMASK(*tok->str++);
744
}
745
746
/* Unfetch a byte from TOK, using the string buffer. */
747
748
static void
buf_ungetc(int c, struct tok_state *tok) {
    /* Step the string cursor back; C must be the byte that was just
       read (checked, not rewritten, because the buffer may live in a
       read-only segment). */
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
753
754
/* Set the readline function for TOK to ENC. For the string-based
755
tokenizer, this means to just record the encoding. */
756
757
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    /* Just record ENC; the actual recoding happens later (see
       decode_str / translate_into_utf8).  Always succeeds. */
    tok->enc = enc;
    return 1;
}
762
763
/* Return a UTF-8 encoding Python string object from the
764
C byte string STR, which is encoded with ENC. */
765
766
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    /* Decode STR with ENC, then re-encode the result as UTF-8 bytes.
       Returns a new reference, or NULL with an exception set. */
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL) {
        return NULL;
    }
    PyObject *utf8 = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return utf8;
}
776
777
778
static char *
779
translate_newlines(const char *s, int exec_input, int preserve_crlf,
780
struct tok_state *tok) {
781
int skip_next_lf = 0;
782
size_t needed_length = strlen(s) + 2, final_length;
783
char *buf, *current;
784
char c = '\0';
785
buf = PyMem_Malloc(needed_length);
786
if (buf == NULL) {
787
tok->done = E_NOMEM;
788
return NULL;
789
}
790
for (current = buf; *s; s++, current++) {
791
c = *s;
792
if (skip_next_lf) {
793
skip_next_lf = 0;
794
if (c == '\n') {
795
c = *++s;
796
if (!c)
797
break;
798
}
799
}
800
if (!preserve_crlf && c == '\r') {
801
skip_next_lf = 1;
802
c = '\n';
803
}
804
*current = c;
805
}
806
/* If this is exec input, add a newline to the end of the string if
807
there isn't one already. */
808
if (exec_input && c != '\n' && c != '\0') {
809
*current = '\n';
810
current++;
811
}
812
*current = '\0';
813
final_length = current - buf + 1;
814
if (final_length < needed_length && final_length) {
815
/* should never fail */
816
char* result = PyMem_Realloc(buf, final_length);
817
if (result == NULL) {
818
PyMem_Free(buf);
819
}
820
buf = result;
821
}
822
return buf;
823
}
824
825
/* Decode a byte string STR for use as the buffer of TOK.
826
Look for encoding declarations inside STR, and record them
827
inside TOK. */
828
829
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* A BOM selected an encoding: recode the input to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines: only those may carry a
       coding declaration. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        /* A coding spec selected an encoding: recode to UTF-8 now. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    /* The returned pointer aliases utf8's internal buffer when a
       recode happened; keep utf8 alive on the tok_state. */
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
885
886
/* Set up tokenizer for string */
887
888
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* Point all buffer cursors at the decoded input. */
    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    return tok;
}
906
907
/* Set up a tokenizer whose input comes from calling READLINE; ENC,
   when non-NULL, names the encoding of the bytes READLINE returns.
   (exec_input and preserve_crlf are accepted but unused here.) */
struct tok_state *
_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
                          int exec_input, int preserve_crlf)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = NULL;
    if (enc != NULL) {
        /* Own copy: tok->encoding is freed in _PyTokenizer_Free. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
        }
    }
    tok->decoding_state = STATE_NORMAL;
    Py_INCREF(readline);
    tok->readline = readline;
    return tok;
}
933
934
/* Set up tokenizer for UTF-8 string */
935
936
struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = tok_new();
    char *translated;
    if (tok == NULL)
        return NULL;
    /* No BOM/coding-spec scan needed: the input is known UTF-8; only
       newline translation applies. */
    tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
    if (translated == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = STATE_NORMAL;
    tok->enc = NULL;
    tok->str = translated;
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* Point all buffer cursors at the translated input. */
    tok->buf = tok->cur = tok->inp = translated;
    tok->end = translated;
    return tok;
}
961
962
/* Set up tokenizer for file */
963
964
/* Set up a tokenizer reading from FP.  PS1/PS2 are the primary and
   continuation prompts for interactive use (may be NULL).  ENC, when
   non-NULL, fixes the encoding up front. */
struct tok_state *
_PyTokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
        }
        /* Encoding already known: skip BOM/coding-spec detection. */
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}
992
993
/* Free a tok_state structure */
994
995
/* Free a tok_state and everything it owns: encoding string, decoding
   objects, the read buffer (only for fp/readline-based tokenizers;
   the condition mirrors error_ret), the translated input copy, the
   accumulated interactive source, and f-string expression buffers. */
void
_PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL) {
        PyMem_Free(tok->encoding);
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);
    /* Py_XDECREF does not NULL the field, so tok->readline still
       tests non-NULL here for readline-based tokenizers. */
    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
        PyMem_Free(tok->input);
    }
    if (tok->interactive_src_start != NULL) {
        PyMem_Free(tok->interactive_src_start);
    }
    free_fstring_expressions(tok);
    PyMem_Free(tok);
}
1017
1018
/* Release a token's owned resources (currently only the metadata
   object attached by set_fstring_expr, if any). */
void
_PyToken_Free(struct token *token) {
    Py_XDECREF(token->metadata);
}
1022
1023
/* Initialize a freshly declared token so _PyToken_Free is always safe
   to call on it. */
void
_PyToken_Init(struct token *token) {
    token->metadata = NULL;
}
1027
1028
/* Read from tok->fp into the buffer until a complete line (ending in
   '\n') or EOF has been accumulated, growing the buffer as needed.
   Returns 1 on success or EOF, 0 on error. */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            /* EOF: nothing more to read, but not an error. */
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');  /* loop until the line is complete */
    return 1;
}
1052
1053
/* Read one line from the user-supplied tok->readline callable and
   append its UTF-8 bytes at tok->inp.  bytes results are decoded with
   tok->encoding when one is set; otherwise a str result is required.
   StopIteration from the callable means EOF.  Returns 1 on
   success/EOF, 0 on error. */
static int
tok_readline_string(struct tok_state* tok) {
    PyObject* line = NULL;
    PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
    if (raw_line == NULL) {
        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
            /* EOF, not an error. */
            PyErr_Clear();
            return 1;
        }
        error_ret(tok);
        goto error;
    }
    if(tok->encoding != NULL) {
        /* With an explicit encoding, the callable must yield bytes. */
        if (!PyBytes_Check(raw_line)) {
            PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
            error_ret(tok);
            goto error;
        }
        line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
                                tok->encoding, "replace");
        Py_CLEAR(raw_line);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    } else {
        /* No encoding: the callable must yield str directly. */
        if(!PyUnicode_Check(raw_line)) {
            PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
            error_ret(tok);
            goto error;
        }
        line = raw_line;
        raw_line = NULL;
    }
    Py_ssize_t buflen;
    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }

    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';

    tok->line_start = tok->cur;
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(raw_line);
    Py_XDECREF(line);
    return 0;
}
1113
1114
/* Advance a string-based tokenizer to its next line: move tok->inp to
   just past the next '\n' (or to the end of input).  Returns 0 at end
   of input (tok->done = E_EOF), 1 otherwise. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;  /* include the newline in the line */
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    /* With no token in progress, earlier lines can be dropped by
       moving the logical buffer start forward. */
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = end;
    return 1;
}
1135
1136
/* Refill the buffer of an interactive (prompt-driven) tokenizer by
   reading one more line with PyOS_Readline, recoding it to UTF-8 when
   an encoding is set.  Returns 1 on success, 0 on error/EOF/interrupt
   (tok->done distinguishes the cases). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        /* Caller asked to stop at the first underflow instead of
           prompting for more input. */
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        char *translated = translate_newlines(newtok, 0, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        /* Readline returned NULL: treat as an interrupt. */
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        /* An empty string means end of input. */
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the existing
           buffer, rebasing the saved pointers around the realloc. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        remember_fstring_buffers(tok);
        size_t size = strlen(newtok);
        ADVANCE_LINENO();
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
        restore_fstring_buffers(tok);
    }
    else {
        /* No token in progress: the new line simply replaces the
           buffer. */
        remember_fstring_buffers(tok);
        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
        restore_fstring_buffers(tok);
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }

    /* When inside an f-string expression, keep its saved source text
       in sync with the newly read line. */
    if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
        return 0;
    }
    return 1;
}
1227
1228
static int
1229
tok_underflow_file(struct tok_state *tok) {
1230
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
1231
tok->cur = tok->inp = tok->buf;
1232
}
1233
if (tok->decoding_state == STATE_INIT) {
1234
/* We have not yet determined the encoding.
1235
If an encoding is found, use the file-pointer
1236
reader functions from now on. */
1237
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
1238
error_ret(tok);
1239
return 0;
1240
}
1241
assert(tok->decoding_state != STATE_INIT);
1242
}
1243
/* Read until '\n' or EOF */
1244
if (tok->decoding_readline != NULL) {
1245
/* We already have a codec associated with this input. */
1246
if (!tok_readline_recode(tok)) {
1247
return 0;
1248
}
1249
}
1250
else {
1251
/* We want a 'raw' read. */
1252
if (!tok_readline_raw(tok)) {
1253
return 0;
1254
}
1255
}
1256
if (tok->inp == tok->cur) {
1257
tok->done = E_EOF;
1258
return 0;
1259
}
1260
tok->implicit_newline = 0;
1261
if (tok->inp[-1] != '\n') {
1262
assert(tok->inp + 1 < tok->end);
1263
/* Last line does not end in \n, fake one */
1264
*tok->inp++ = '\n';
1265
*tok->inp = '\0';
1266
tok->implicit_newline = 1;
1267
}
1268
1269
if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
1270
return 0;
1271
}
1272
1273
ADVANCE_LINENO();
1274
if (tok->decoding_state != STATE_NORMAL) {
1275
if (tok->lineno > 2) {
1276
tok->decoding_state = STATE_NORMAL;
1277
}
1278
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
1279
tok, fp_setreadl))
1280
{
1281
return 0;
1282
}
1283
}
1284
/* The default encoding is UTF-8, so make sure we don't have any
1285
non-UTF-8 sequences in it. */
1286
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
1287
error_ret(tok);
1288
return 0;
1289
}
1290
assert(tok->done == E_OK);
1291
return tok->done == E_OK;
1292
}
1293
1294
static int
1295
tok_underflow_readline(struct tok_state* tok) {
1296
assert(tok->decoding_state == STATE_NORMAL);
1297
assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
1298
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
1299
tok->cur = tok->inp = tok->buf;
1300
}
1301
if (!tok_readline_string(tok)) {
1302
return 0;
1303
}
1304
if (tok->inp == tok->cur) {
1305
tok->done = E_EOF;
1306
return 0;
1307
}
1308
tok->implicit_newline = 0;
1309
if (tok->inp[-1] != '\n') {
1310
assert(tok->inp + 1 < tok->end);
1311
/* Last line does not end in \n, fake one */
1312
*tok->inp++ = '\n';
1313
*tok->inp = '\0';
1314
tok->implicit_newline = 1;
1315
}
1316
1317
if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
1318
return 0;
1319
}
1320
1321
ADVANCE_LINENO();
1322
/* The default encoding is UTF-8, so make sure we don't have any
1323
non-UTF-8 sequences in it. */
1324
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
1325
error_ret(tok);
1326
return 0;
1327
}
1328
assert(tok->done == E_OK);
1329
return tok->done == E_OK;
1330
}
1331
1332
#if defined(Py_DEBUG)
/* Debug helper: write `size` bytes of `s` to `f` as a double-quoted,
   C-escaped string ("NULL" if s is NULL).  Non-printable bytes are
   emitted as \xNN. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            default:
                if (0x20 <= c && c <= 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
1360
1361
/* Get next char, updating state; error code goes into tok->done */
1362
1363
static int
1364
tok_nextc(struct tok_state *tok)
1365
{
1366
int rc;
1367
for (;;) {
1368
if (tok->cur != tok->inp) {
1369
tok->col_offset++;
1370
return Py_CHARMASK(*tok->cur++); /* Fast path */
1371
}
1372
if (tok->done != E_OK) {
1373
return EOF;
1374
}
1375
if (tok->readline) {
1376
rc = tok_underflow_readline(tok);
1377
}
1378
else if (tok->fp == NULL) {
1379
rc = tok_underflow_string(tok);
1380
}
1381
else if (tok->prompt != NULL) {
1382
rc = tok_underflow_interactive(tok);
1383
}
1384
else {
1385
rc = tok_underflow_file(tok);
1386
}
1387
#if defined(Py_DEBUG)
1388
if (tok->debug) {
1389
fprintf(stderr, "line[%d] = ", tok->lineno);
1390
print_escape(stderr, tok->cur, tok->inp - tok->cur);
1391
fprintf(stderr, " tok->done = %d\n", tok->done);
1392
}
1393
#endif
1394
if (!rc) {
1395
tok->cur = tok->inp;
1396
return EOF;
1397
}
1398
tok->line_start = tok->cur;
1399
1400
if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
1401
syntaxerror(tok, "source code cannot contain null bytes");
1402
tok->cur = tok->inp;
1403
return EOF;
1404
}
1405
}
1406
Py_UNREACHABLE();
1407
}
1408
1409
/* Back-up one character */
1410
1411
static void
1412
tok_backup(struct tok_state *tok, int c)
1413
{
1414
if (c != EOF) {
1415
if (--tok->cur < tok->buf) {
1416
Py_FatalError("tokenizer beginning of buffer");
1417
}
1418
if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
1419
Py_FatalError("tok_backup: wrong character");
1420
}
1421
tok->col_offset--;
1422
}
1423
}
1424
1425
static int
1426
_syntaxerror_range(struct tok_state *tok, const char *format,
1427
int col_offset, int end_col_offset,
1428
va_list vargs)
1429
{
1430
// In release builds, we don't want to overwrite a previous error, but in debug builds we
1431
// want to fail if we are not doing it so we can fix it.
1432
assert(tok->done != E_ERROR);
1433
if (tok->done == E_ERROR) {
1434
return ERRORTOKEN;
1435
}
1436
PyObject *errmsg, *errtext, *args;
1437
errmsg = PyUnicode_FromFormatV(format, vargs);
1438
if (!errmsg) {
1439
goto error;
1440
}
1441
1442
errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1443
"replace");
1444
if (!errtext) {
1445
goto error;
1446
}
1447
1448
if (col_offset == -1) {
1449
col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1450
}
1451
if (end_col_offset == -1) {
1452
end_col_offset = col_offset;
1453
}
1454
1455
Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1456
if (line_len != tok->cur - tok->line_start) {
1457
Py_DECREF(errtext);
1458
errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1459
"replace");
1460
}
1461
if (!errtext) {
1462
goto error;
1463
}
1464
1465
args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1466
col_offset, errtext, tok->lineno, end_col_offset);
1467
if (args) {
1468
PyErr_SetObject(PyExc_SyntaxError, args);
1469
Py_DECREF(args);
1470
}
1471
1472
error:
1473
Py_XDECREF(errmsg);
1474
tok->done = E_ERROR;
1475
return ERRORTOKEN;
1476
}
1477
1478
/* Report a SyntaxError at the current position (no explicit column
   range).  Returns ERRORTOKEN. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    // This errors are cleaned on startup. Todo: Fix it.
    va_list vargs;
    va_start(vargs, format);
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
1488
1489
/* Report a SyntaxError with an explicit [col_offset, end_col_offset)
   range on the current line.  Returns ERRORTOKEN. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
    va_start(vargs, format);
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}
1500
1501
static int
1502
indenterror(struct tok_state *tok)
1503
{
1504
tok->done = E_TABSPACE;
1505
tok->cur = tok->inp;
1506
return ERRORTOKEN;
1507
}
1508
1509
static int
1510
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
1511
{
1512
if (!tok->report_warnings) {
1513
return 0;
1514
}
1515
1516
PyObject *errmsg;
1517
va_list vargs;
1518
va_start(vargs, format);
1519
errmsg = PyUnicode_FromFormatV(format, vargs);
1520
va_end(vargs);
1521
if (!errmsg) {
1522
goto error;
1523
}
1524
1525
if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
1526
tok->lineno, NULL, NULL) < 0) {
1527
if (PyErr_ExceptionMatches(category)) {
1528
/* Replace the DeprecationWarning exception with a SyntaxError
1529
to get a more accurate error report */
1530
PyErr_Clear();
1531
syntaxerror(tok, "%U", errmsg);
1532
}
1533
goto error;
1534
}
1535
Py_DECREF(errmsg);
1536
return 0;
1537
1538
error:
1539
Py_XDECREF(errmsg);
1540
tok->done = E_ERROR;
1541
return -1;
1542
}
1543
1544
static int
1545
warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
1546
{
1547
1548
if (!tok->tok_report_warnings) {
1549
return 0;
1550
}
1551
1552
PyObject *msg = PyUnicode_FromFormat(
1553
"invalid escape sequence '\\%c'",
1554
(char) first_invalid_escape_char
1555
);
1556
1557
if (msg == NULL) {
1558
return -1;
1559
}
1560
1561
if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename,
1562
tok->lineno, NULL, NULL) < 0) {
1563
Py_DECREF(msg);
1564
1565
if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) {
1566
/* Replace the SyntaxWarning exception with a SyntaxError
1567
to get a more accurate error report */
1568
PyErr_Clear();
1569
return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char);
1570
}
1571
1572
return -1;
1573
}
1574
1575
Py_DECREF(msg);
1576
return 0;
1577
}
1578
1579
/* Check whether the upcoming characters spell out `test` followed by a
   non-identifier character.  All consumed characters are pushed back
   before returning, so the tokenizer position is unchanged.
   Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            /* Whole word matched: it counts only if the next char
               cannot extend an identifier. */
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        /* Undo everything we consumed, in reverse order. */
        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}
1601
1602
static int
1603
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
1604
if (tok->tok_extra_tokens) {
1605
// When we are parsing extra tokens, we don't want to emit warnings
1606
// about invalid literals, because we want to be a bit more liberal.
1607
return 1;
1608
}
1609
/* Emit a deprecation warning only if the numeric literal is immediately
1610
* followed by one of keywords which can occur after a numeric literal
1611
* in valid code: "and", "else", "for", "if", "in", "is" and "or".
1612
* It allows to gradually deprecate existing valid code without adding
1613
* warning before error in most cases of invalid numeric literal (which
1614
* would be confusing and break existing tests).
1615
* Raise a syntax error with slightly better message than plain
1616
* "invalid syntax" if the numeric literal is immediately followed by
1617
* other keyword or identifier.
1618
*/
1619
int r = 0;
1620
if (c == 'a') {
1621
r = lookahead(tok, "nd");
1622
}
1623
else if (c == 'e') {
1624
r = lookahead(tok, "lse");
1625
}
1626
else if (c == 'f') {
1627
r = lookahead(tok, "or");
1628
}
1629
else if (c == 'i') {
1630
int c2 = tok_nextc(tok);
1631
if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1632
r = 1;
1633
}
1634
tok_backup(tok, c2);
1635
}
1636
else if (c == 'o') {
1637
r = lookahead(tok, "r");
1638
}
1639
else if (c == 'n') {
1640
r = lookahead(tok, "ot");
1641
}
1642
if (r) {
1643
tok_backup(tok, c);
1644
if (parser_warn(tok, PyExc_SyntaxWarning,
1645
"invalid %s literal", kind))
1646
{
1647
return 0;
1648
}
1649
tok_nextc(tok);
1650
}
1651
else /* In future releases, only error will remain. */
1652
if (is_potential_identifier_char(c)) {
1653
tok_backup(tok, c);
1654
syntaxerror(tok, "invalid %s literal", kind);
1655
return 0;
1656
}
1657
return 1;
1658
}
1659
1660
/* Verify that the identifier follows PEP 3131.
1661
All identifier strings are guaranteed to be "ready" unicode objects.
1662
*/
1663
static int
1664
verify_identifier(struct tok_state *tok)
1665
{
1666
if (tok->tok_extra_tokens) {
1667
return 1;
1668
}
1669
PyObject *s;
1670
if (tok->decoding_erred)
1671
return 0;
1672
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1673
if (s == NULL) {
1674
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1675
tok->done = E_DECODE;
1676
}
1677
else {
1678
tok->done = E_ERROR;
1679
}
1680
return 0;
1681
}
1682
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1683
if (invalid < 0) {
1684
Py_DECREF(s);
1685
tok->done = E_ERROR;
1686
return 0;
1687
}
1688
assert(PyUnicode_GET_LENGTH(s) > 0);
1689
if (invalid < PyUnicode_GET_LENGTH(s)) {
1690
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1691
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1692
/* Determine the offset in UTF-8 encoded input */
1693
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1694
if (s != NULL) {
1695
Py_SETREF(s, PyUnicode_AsUTF8String(s));
1696
}
1697
if (s == NULL) {
1698
tok->done = E_ERROR;
1699
return 0;
1700
}
1701
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1702
}
1703
Py_DECREF(s);
1704
if (Py_UNICODE_ISPRINTABLE(ch)) {
1705
syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
1706
}
1707
else {
1708
syntaxerror(tok, "invalid non-printable character U+%04X", ch);
1709
}
1710
return 0;
1711
}
1712
Py_DECREF(s);
1713
return 1;
1714
}
1715
1716
/* Consume the remaining digits of a decimal literal, allowing single
   underscores between digit groups.  Returns the first non-digit
   character after the literal, or 0 on a malformed literal (a
   SyntaxError has then been set). */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (isdigit(c));
        if (c != '_') {
            break;
        }
        /* An underscore must be followed by at least one digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}
1737
1738
1739
static inline int
1740
tok_continuation_line(struct tok_state *tok) {
1741
int c = tok_nextc(tok);
1742
if (c == '\r') {
1743
c = tok_nextc(tok);
1744
}
1745
if (c != '\n') {
1746
tok->done = E_LINECONT;
1747
return -1;
1748
}
1749
c = tok_nextc(tok);
1750
if (c == EOF) {
1751
tok->done = E_EOF;
1752
tok->cur = tok->inp;
1753
return -1;
1754
} else {
1755
tok_backup(tok, c);
1756
}
1757
return c;
1758
}
1759
1760
static int
1761
type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
1762
int end_col_offset, const char *start, const char *end)
1763
{
1764
token->level = tok->level;
1765
token->lineno = token->end_lineno = tok->lineno;
1766
token->col_offset = col_offset;
1767
token->end_col_offset = end_col_offset;
1768
token->start = start;
1769
token->end = end;
1770
return type;
1771
}
1772
1773
static int
1774
token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
1775
{
1776
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
1777
token->level = tok->level;
1778
if (ISSTRINGLIT(type)) {
1779
token->lineno = tok->first_lineno;
1780
}
1781
else {
1782
token->lineno = tok->lineno;
1783
}
1784
token->end_lineno = tok->lineno;
1785
token->col_offset = token->end_col_offset = -1;
1786
token->start = start;
1787
token->end = end;
1788
1789
if (start != NULL && end != NULL) {
1790
token->col_offset = tok->starting_col_offset;
1791
token->end_col_offset = tok->col_offset;
1792
}
1793
return type;
1794
}
1795
1796
1797
static int
1798
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
1799
{
1800
int c;
1801
int blankline, nonascii;
1802
1803
const char *p_start = NULL;
1804
const char *p_end = NULL;
1805
nextline:
1806
tok->start = NULL;
1807
tok->starting_col_offset = -1;
1808
blankline = 0;
1809
1810
1811
/* Get indentation level */
1812
if (tok->atbol) {
1813
int col = 0;
1814
int altcol = 0;
1815
tok->atbol = 0;
1816
int cont_line_col = 0;
1817
for (;;) {
1818
c = tok_nextc(tok);
1819
if (c == ' ') {
1820
col++, altcol++;
1821
}
1822
else if (c == '\t') {
1823
col = (col / tok->tabsize + 1) * tok->tabsize;
1824
altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1825
}
1826
else if (c == '\014') {/* Control-L (formfeed) */
1827
col = altcol = 0; /* For Emacs users */
1828
}
1829
else if (c == '\\') {
1830
// Indentation cannot be split over multiple physical lines
1831
// using backslashes. This means that if we found a backslash
1832
// preceded by whitespace, **the first one we find** determines
1833
// the level of indentation of whatever comes next.
1834
cont_line_col = cont_line_col ? cont_line_col : col;
1835
if ((c = tok_continuation_line(tok)) == -1) {
1836
return MAKE_TOKEN(ERRORTOKEN);
1837
}
1838
}
1839
else {
1840
break;
1841
}
1842
}
1843
tok_backup(tok, c);
1844
if (c == '#' || c == '\n' || c == '\r') {
1845
/* Lines with only whitespace and/or comments
1846
shouldn't affect the indentation and are
1847
not passed to the parser as NEWLINE tokens,
1848
except *totally* empty lines in interactive
1849
mode, which signal the end of a command group. */
1850
if (col == 0 && c == '\n' && tok->prompt != NULL) {
1851
blankline = 0; /* Let it through */
1852
}
1853
else if (tok->prompt != NULL && tok->lineno == 1) {
1854
/* In interactive mode, if the first line contains
1855
only spaces and/or a comment, let it through. */
1856
blankline = 0;
1857
col = altcol = 0;
1858
}
1859
else {
1860
blankline = 1; /* Ignore completely */
1861
}
1862
/* We can't jump back right here since we still
1863
may need to skip to the end of a comment */
1864
}
1865
if (!blankline && tok->level == 0) {
1866
col = cont_line_col ? cont_line_col : col;
1867
altcol = cont_line_col ? cont_line_col : altcol;
1868
if (col == tok->indstack[tok->indent]) {
1869
/* No change */
1870
if (altcol != tok->altindstack[tok->indent]) {
1871
return MAKE_TOKEN(indenterror(tok));
1872
}
1873
}
1874
else if (col > tok->indstack[tok->indent]) {
1875
/* Indent -- always one */
1876
if (tok->indent+1 >= MAXINDENT) {
1877
tok->done = E_TOODEEP;
1878
tok->cur = tok->inp;
1879
return MAKE_TOKEN(ERRORTOKEN);
1880
}
1881
if (altcol <= tok->altindstack[tok->indent]) {
1882
return MAKE_TOKEN(indenterror(tok));
1883
}
1884
tok->pendin++;
1885
tok->indstack[++tok->indent] = col;
1886
tok->altindstack[tok->indent] = altcol;
1887
}
1888
else /* col < tok->indstack[tok->indent] */ {
1889
/* Dedent -- any number, must be consistent */
1890
while (tok->indent > 0 &&
1891
col < tok->indstack[tok->indent]) {
1892
tok->pendin--;
1893
tok->indent--;
1894
}
1895
if (col != tok->indstack[tok->indent]) {
1896
tok->done = E_DEDENT;
1897
tok->cur = tok->inp;
1898
return MAKE_TOKEN(ERRORTOKEN);
1899
}
1900
if (altcol != tok->altindstack[tok->indent]) {
1901
return MAKE_TOKEN(indenterror(tok));
1902
}
1903
}
1904
}
1905
}
1906
1907
tok->start = tok->cur;
1908
tok->starting_col_offset = tok->col_offset;
1909
1910
/* Return pending indents/dedents */
1911
if (tok->pendin != 0) {
1912
if (tok->pendin < 0) {
1913
if (tok->tok_extra_tokens) {
1914
p_start = tok->cur;
1915
p_end = tok->cur;
1916
}
1917
tok->pendin++;
1918
return MAKE_TOKEN(DEDENT);
1919
}
1920
else {
1921
if (tok->tok_extra_tokens) {
1922
p_start = tok->buf;
1923
p_end = tok->cur;
1924
}
1925
tok->pendin--;
1926
return MAKE_TOKEN(INDENT);
1927
}
1928
}
1929
1930
/* Peek ahead at the next character */
1931
c = tok_nextc(tok);
1932
tok_backup(tok, c);
1933
/* Check if we are closing an async function */
1934
if (tok->async_def
1935
&& !blankline
1936
/* Due to some implementation artifacts of type comments,
1937
* a TYPE_COMMENT at the start of a function won't set an
1938
* indentation level and it will produce a NEWLINE after it.
1939
* To avoid spuriously ending an async function due to this,
1940
* wait until we have some non-newline char in front of us. */
1941
&& c != '\n'
1942
&& tok->level == 0
1943
/* There was a NEWLINE after ASYNC DEF,
1944
so we're past the signature. */
1945
&& tok->async_def_nl
1946
/* Current indentation level is less than where
1947
the async function was defined */
1948
&& tok->async_def_indent >= tok->indent)
1949
{
1950
tok->async_def = 0;
1951
tok->async_def_indent = 0;
1952
tok->async_def_nl = 0;
1953
}
1954
1955
again:
1956
tok->start = NULL;
1957
/* Skip spaces */
1958
do {
1959
c = tok_nextc(tok);
1960
} while (c == ' ' || c == '\t' || c == '\014');
1961
1962
/* Set start of current token */
1963
tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
1964
tok->starting_col_offset = tok->col_offset - 1;
1965
1966
/* Skip comment, unless it's a type comment */
1967
if (c == '#') {
1968
1969
const char* p = NULL;
1970
const char *prefix, *type_start;
1971
int current_starting_col_offset;
1972
1973
while (c != EOF && c != '\n' && c != '\r') {
1974
c = tok_nextc(tok);
1975
}
1976
1977
if (tok->tok_extra_tokens) {
1978
p = tok->start;
1979
}
1980
1981
if (tok->type_comments) {
1982
p = tok->start;
1983
current_starting_col_offset = tok->starting_col_offset;
1984
prefix = type_comment_prefix;
1985
while (*prefix && p < tok->cur) {
1986
if (*prefix == ' ') {
1987
while (*p == ' ' || *p == '\t') {
1988
p++;
1989
current_starting_col_offset++;
1990
}
1991
} else if (*prefix == *p) {
1992
p++;
1993
current_starting_col_offset++;
1994
} else {
1995
break;
1996
}
1997
1998
prefix++;
1999
}
2000
2001
/* This is a type comment if we matched all of type_comment_prefix. */
2002
if (!*prefix) {
2003
int is_type_ignore = 1;
2004
// +6 in order to skip the word 'ignore'
2005
const char *ignore_end = p + 6;
2006
const int ignore_end_col_offset = current_starting_col_offset + 6;
2007
tok_backup(tok, c); /* don't eat the newline or EOF */
2008
2009
type_start = p;
2010
2011
/* A TYPE_IGNORE is "type: ignore" followed by the end of the token
2012
* or anything ASCII and non-alphanumeric. */
2013
is_type_ignore = (
2014
tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
2015
&& !(tok->cur > ignore_end
2016
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
2017
2018
if (is_type_ignore) {
2019
p_start = ignore_end;
2020
p_end = tok->cur;
2021
2022
/* If this type ignore is the only thing on the line, consume the newline also. */
2023
if (blankline) {
2024
tok_nextc(tok);
2025
tok->atbol = 1;
2026
}
2027
return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
2028
} else {
2029
p_start = type_start;
2030
p_end = tok->cur;
2031
return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
2032
}
2033
}
2034
}
2035
if (tok->tok_extra_tokens) {
2036
tok_backup(tok, c); /* don't eat the newline or EOF */
2037
p_start = p;
2038
p_end = tok->cur;
2039
tok->comment_newline = blankline;
2040
return MAKE_TOKEN(COMMENT);
2041
}
2042
}
2043
2044
if (tok->done == E_INTERACT_STOP) {
2045
return MAKE_TOKEN(ENDMARKER);
2046
}
2047
2048
/* Check for EOF and errors now */
2049
if (c == EOF) {
2050
if (tok->level) {
2051
return MAKE_TOKEN(ERRORTOKEN);
2052
}
2053
return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
2054
}
2055
2056
/* Identifier (most frequent token!) */
2057
nonascii = 0;
2058
if (is_potential_identifier_start(c)) {
2059
/* Process the various legal combinations of b"", r"", u"", and f"". */
2060
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
2061
while (1) {
2062
if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
2063
saw_b = 1;
2064
/* Since this is a backwards compatibility support literal we don't
2065
want to support it in arbitrary order like byte literals. */
2066
else if (!(saw_b || saw_u || saw_r || saw_f)
2067
&& (c == 'u'|| c == 'U')) {
2068
saw_u = 1;
2069
}
2070
/* ur"" and ru"" are not supported */
2071
else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
2072
saw_r = 1;
2073
}
2074
else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
2075
saw_f = 1;
2076
}
2077
else {
2078
break;
2079
}
2080
c = tok_nextc(tok);
2081
if (c == '"' || c == '\'') {
2082
if (saw_f) {
2083
goto f_string_quote;
2084
}
2085
goto letter_quote;
2086
}
2087
}
2088
while (is_potential_identifier_char(c)) {
2089
if (c >= 128) {
2090
nonascii = 1;
2091
}
2092
c = tok_nextc(tok);
2093
}
2094
tok_backup(tok, c);
2095
if (nonascii && !verify_identifier(tok)) {
2096
return MAKE_TOKEN(ERRORTOKEN);
2097
}
2098
2099
p_start = tok->start;
2100
p_end = tok->cur;
2101
2102
/* async/await parsing block. */
2103
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
2104
/* May be an 'async' or 'await' token. For Python 3.7 or
2105
later we recognize them unconditionally. For Python
2106
3.5 or 3.6 we recognize 'async' in front of 'def', and
2107
either one inside of 'async def'. (Technically we
2108
shouldn't recognize these at all for 3.4 or earlier,
2109
but there's no *valid* Python 3.4 code that would be
2110
rejected, and async functions will be rejected in a
2111
later phase.) */
2112
if (!tok->async_hacks || tok->async_def) {
2113
/* Always recognize the keywords. */
2114
if (memcmp(tok->start, "async", 5) == 0) {
2115
return MAKE_TOKEN(ASYNC);
2116
}
2117
if (memcmp(tok->start, "await", 5) == 0) {
2118
return MAKE_TOKEN(AWAIT);
2119
}
2120
}
2121
else if (memcmp(tok->start, "async", 5) == 0) {
2122
/* The current token is 'async'.
2123
Look ahead one token to see if that is 'def'. */
2124
2125
struct tok_state ahead_tok;
2126
struct token ahead_token;
2127
_PyToken_Init(&ahead_token);
2128
int ahead_tok_kind;
2129
2130
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
2131
ahead_tok_kind = tok_get_normal_mode(&ahead_tok,
2132
current_tok,
2133
&ahead_token);
2134
2135
if (ahead_tok_kind == NAME
2136
&& ahead_tok.cur - ahead_tok.start == 3
2137
&& memcmp(ahead_tok.start, "def", 3) == 0)
2138
{
2139
/* The next token is going to be 'def', so instead of
2140
returning a plain NAME token, return ASYNC. */
2141
tok->async_def_indent = tok->indent;
2142
tok->async_def = 1;
2143
_PyToken_Free(&ahead_token);
2144
return MAKE_TOKEN(ASYNC);
2145
}
2146
_PyToken_Free(&ahead_token);
2147
}
2148
}
2149
2150
return MAKE_TOKEN(NAME);
2151
}
2152
2153
if (c == '\r') {
2154
c = tok_nextc(tok);
2155
}
2156
2157
/* Newline */
2158
if (c == '\n') {
2159
tok->atbol = 1;
2160
if (blankline || tok->level > 0) {
2161
if (tok->tok_extra_tokens) {
2162
if (tok->comment_newline) {
2163
tok->comment_newline = 0;
2164
}
2165
p_start = tok->start;
2166
p_end = tok->cur;
2167
return MAKE_TOKEN(NL);
2168
}
2169
goto nextline;
2170
}
2171
if (tok->comment_newline && tok->tok_extra_tokens) {
2172
tok->comment_newline = 0;
2173
p_start = tok->start;
2174
p_end = tok->cur;
2175
return MAKE_TOKEN(NL);
2176
}
2177
p_start = tok->start;
2178
p_end = tok->cur - 1; /* Leave '\n' out of the string */
2179
tok->cont_line = 0;
2180
if (tok->async_def) {
2181
/* We're somewhere inside an 'async def' function, and
2182
we've encountered a NEWLINE after its signature. */
2183
tok->async_def_nl = 1;
2184
}
2185
return MAKE_TOKEN(NEWLINE);
2186
}
2187
2188
/* Period or number starting with period? */
2189
if (c == '.') {
2190
c = tok_nextc(tok);
2191
if (isdigit(c)) {
2192
goto fraction;
2193
} else if (c == '.') {
2194
c = tok_nextc(tok);
2195
if (c == '.') {
2196
p_start = tok->start;
2197
p_end = tok->cur;
2198
return MAKE_TOKEN(ELLIPSIS);
2199
}
2200
else {
2201
tok_backup(tok, c);
2202
}
2203
tok_backup(tok, '.');
2204
}
2205
else {
2206
tok_backup(tok, c);
2207
}
2208
p_start = tok->start;
2209
p_end = tok->cur;
2210
return MAKE_TOKEN(DOT);
2211
}
2212
2213
/* Number */
2214
if (isdigit(c)) {
2215
if (c == '0') {
2216
/* Hex, octal or binary -- maybe. */
2217
c = tok_nextc(tok);
2218
if (c == 'x' || c == 'X') {
2219
/* Hex */
2220
c = tok_nextc(tok);
2221
do {
2222
if (c == '_') {
2223
c = tok_nextc(tok);
2224
}
2225
if (!isxdigit(c)) {
2226
tok_backup(tok, c);
2227
return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
2228
}
2229
do {
2230
c = tok_nextc(tok);
2231
} while (isxdigit(c));
2232
} while (c == '_');
2233
if (!verify_end_of_number(tok, c, "hexadecimal")) {
2234
return MAKE_TOKEN(ERRORTOKEN);
2235
}
2236
}
2237
else if (c == 'o' || c == 'O') {
2238
/* Octal */
2239
c = tok_nextc(tok);
2240
do {
2241
if (c == '_') {
2242
c = tok_nextc(tok);
2243
}
2244
if (c < '0' || c >= '8') {
2245
if (isdigit(c)) {
2246
return MAKE_TOKEN(syntaxerror(tok,
2247
"invalid digit '%c' in octal literal", c));
2248
}
2249
else {
2250
tok_backup(tok, c);
2251
return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
2252
}
2253
}
2254
do {
2255
c = tok_nextc(tok);
2256
} while ('0' <= c && c < '8');
2257
} while (c == '_');
2258
if (isdigit(c)) {
2259
return MAKE_TOKEN(syntaxerror(tok,
2260
"invalid digit '%c' in octal literal", c));
2261
}
2262
if (!verify_end_of_number(tok, c, "octal")) {
2263
return MAKE_TOKEN(ERRORTOKEN);
2264
}
2265
}
2266
else if (c == 'b' || c == 'B') {
2267
/* Binary */
2268
c = tok_nextc(tok);
2269
do {
2270
if (c == '_') {
2271
c = tok_nextc(tok);
2272
}
2273
if (c != '0' && c != '1') {
2274
if (isdigit(c)) {
2275
return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
2276
}
2277
else {
2278
tok_backup(tok, c);
2279
return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
2280
}
2281
}
2282
do {
2283
c = tok_nextc(tok);
2284
} while (c == '0' || c == '1');
2285
} while (c == '_');
2286
if (isdigit(c)) {
2287
return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
2288
}
2289
if (!verify_end_of_number(tok, c, "binary")) {
2290
return MAKE_TOKEN(ERRORTOKEN);
2291
}
2292
}
2293
else {
2294
int nonzero = 0;
2295
/* maybe old-style octal; c is first char of it */
2296
/* in any case, allow '0' as a literal */
2297
while (1) {
2298
if (c == '_') {
2299
c = tok_nextc(tok);
2300
if (!isdigit(c)) {
2301
tok_backup(tok, c);
2302
return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
2303
}
2304
}
2305
if (c != '0') {
2306
break;
2307
}
2308
c = tok_nextc(tok);
2309
}
2310
char* zeros_end = tok->cur;
2311
if (isdigit(c)) {
2312
nonzero = 1;
2313
c = tok_decimal_tail(tok);
2314
if (c == 0) {
2315
return MAKE_TOKEN(ERRORTOKEN);
2316
}
2317
}
2318
if (c == '.') {
2319
c = tok_nextc(tok);
2320
goto fraction;
2321
}
2322
else if (c == 'e' || c == 'E') {
2323
goto exponent;
2324
}
2325
else if (c == 'j' || c == 'J') {
2326
goto imaginary;
2327
}
2328
else if (nonzero && !tok->tok_extra_tokens) {
2329
/* Old-style octal: now disallowed. */
2330
tok_backup(tok, c);
2331
return MAKE_TOKEN(syntaxerror_known_range(
2332
tok, (int)(tok->start + 1 - tok->line_start),
2333
(int)(zeros_end - tok->line_start),
2334
"leading zeros in decimal integer "
2335
"literals are not permitted; "
2336
"use an 0o prefix for octal integers"));
2337
}
2338
if (!verify_end_of_number(tok, c, "decimal")) {
2339
return MAKE_TOKEN(ERRORTOKEN);
2340
}
2341
}
2342
}
2343
else {
2344
/* Decimal */
2345
c = tok_decimal_tail(tok);
2346
if (c == 0) {
2347
return MAKE_TOKEN(ERRORTOKEN);
2348
}
2349
{
2350
/* Accept floating point numbers. */
2351
if (c == '.') {
2352
c = tok_nextc(tok);
2353
fraction:
2354
/* Fraction */
2355
if (isdigit(c)) {
2356
c = tok_decimal_tail(tok);
2357
if (c == 0) {
2358
return MAKE_TOKEN(ERRORTOKEN);
2359
}
2360
}
2361
}
2362
if (c == 'e' || c == 'E') {
2363
int e;
2364
exponent:
2365
e = c;
2366
/* Exponent part */
2367
c = tok_nextc(tok);
2368
if (c == '+' || c == '-') {
2369
c = tok_nextc(tok);
2370
if (!isdigit(c)) {
2371
tok_backup(tok, c);
2372
return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
2373
}
2374
} else if (!isdigit(c)) {
2375
tok_backup(tok, c);
2376
if (!verify_end_of_number(tok, e, "decimal")) {
2377
return MAKE_TOKEN(ERRORTOKEN);
2378
}
2379
tok_backup(tok, e);
2380
p_start = tok->start;
2381
p_end = tok->cur;
2382
return MAKE_TOKEN(NUMBER);
2383
}
2384
c = tok_decimal_tail(tok);
2385
if (c == 0) {
2386
return MAKE_TOKEN(ERRORTOKEN);
2387
}
2388
}
2389
if (c == 'j' || c == 'J') {
2390
/* Imaginary part */
2391
imaginary:
2392
c = tok_nextc(tok);
2393
if (!verify_end_of_number(tok, c, "imaginary")) {
2394
return MAKE_TOKEN(ERRORTOKEN);
2395
}
2396
}
2397
else if (!verify_end_of_number(tok, c, "decimal")) {
2398
return MAKE_TOKEN(ERRORTOKEN);
2399
}
2400
}
2401
}
2402
tok_backup(tok, c);
2403
p_start = tok->start;
2404
p_end = tok->cur;
2405
return MAKE_TOKEN(NUMBER);
2406
}
2407
2408
f_string_quote:
2409
if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
2410
int quote = c;
2411
int quote_size = 1; /* 1 or 3 */
2412
2413
/* Nodes of type STRING, especially multi line strings
2414
must be handled differently in order to get both
2415
the starting line number and the column offset right.
2416
(cf. issue 16806) */
2417
tok->first_lineno = tok->lineno;
2418
tok->multi_line_start = tok->line_start;
2419
2420
/* Find the quote size and start of string */
2421
int after_quote = tok_nextc(tok);
2422
if (after_quote == quote) {
2423
int after_after_quote = tok_nextc(tok);
2424
if (after_after_quote == quote) {
2425
quote_size = 3;
2426
}
2427
else {
2428
// TODO: Check this
2429
tok_backup(tok, after_after_quote);
2430
tok_backup(tok, after_quote);
2431
}
2432
}
2433
if (after_quote != quote) {
2434
tok_backup(tok, after_quote);
2435
}
2436
2437
2438
p_start = tok->start;
2439
p_end = tok->cur;
2440
if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
2441
return MAKE_TOKEN(syntaxerror(tok, "too many nested f-strings"));
2442
}
2443
tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
2444
the_current_tok->kind = TOK_FSTRING_MODE;
2445
the_current_tok->f_string_quote = quote;
2446
the_current_tok->f_string_quote_size = quote_size;
2447
the_current_tok->f_string_start = tok->start;
2448
the_current_tok->f_string_multi_line_start = tok->line_start;
2449
the_current_tok->f_string_line_start = tok->lineno;
2450
the_current_tok->f_string_start_offset = -1;
2451
the_current_tok->f_string_multi_line_start_offset = -1;
2452
the_current_tok->last_expr_buffer = NULL;
2453
the_current_tok->last_expr_size = 0;
2454
the_current_tok->last_expr_end = -1;
2455
the_current_tok->f_string_debug = 0;
2456
2457
switch (*tok->start) {
2458
case 'F':
2459
case 'f':
2460
the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
2461
break;
2462
case 'R':
2463
case 'r':
2464
the_current_tok->f_string_raw = 1;
2465
break;
2466
default:
2467
Py_UNREACHABLE();
2468
}
2469
2470
the_current_tok->curly_bracket_depth = 0;
2471
the_current_tok->curly_bracket_expr_start_depth = -1;
2472
return MAKE_TOKEN(FSTRING_START);
2473
}
2474
2475
letter_quote:
2476
/* String */
2477
if (c == '\'' || c == '"') {
2478
int quote = c;
2479
int quote_size = 1; /* 1 or 3 */
2480
int end_quote_size = 0;
2481
2482
/* Nodes of type STRING, especially multi line strings
2483
must be handled differently in order to get both
2484
the starting line number and the column offset right.
2485
(cf. issue 16806) */
2486
tok->first_lineno = tok->lineno;
2487
tok->multi_line_start = tok->line_start;
2488
2489
/* Find the quote size and start of string */
2490
c = tok_nextc(tok);
2491
if (c == quote) {
2492
c = tok_nextc(tok);
2493
if (c == quote) {
2494
quote_size = 3;
2495
}
2496
else {
2497
end_quote_size = 1; /* empty string found */
2498
}
2499
}
2500
if (c != quote) {
2501
tok_backup(tok, c);
2502
}
2503
2504
/* Get rest of string */
2505
while (end_quote_size != quote_size) {
2506
c = tok_nextc(tok);
2507
if (tok->done == E_ERROR) {
2508
return MAKE_TOKEN(ERRORTOKEN);
2509
}
2510
if (tok->done == E_DECODE) {
2511
break;
2512
}
2513
if (c == EOF || (quote_size == 1 && c == '\n')) {
2514
assert(tok->multi_line_start != NULL);
2515
// shift the tok_state's location into
2516
// the start of string, and report the error
2517
// from the initial quote character
2518
tok->cur = (char *)tok->start;
2519
tok->cur++;
2520
tok->line_start = tok->multi_line_start;
2521
int start = tok->lineno;
2522
tok->lineno = tok->first_lineno;
2523
2524
if (INSIDE_FSTRING(tok)) {
2525
/* When we are in an f-string, before raising the
2526
* unterminated string literal error, check whether
2527
* does the initial quote matches with f-strings quotes
2528
* and if it is, then this must be a missing '}' token
2529
* so raise the proper error */
2530
tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
2531
if (the_current_tok->f_string_quote == quote &&
2532
the_current_tok->f_string_quote_size == quote_size) {
2533
return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start));
2534
}
2535
}
2536
2537
if (quote_size == 3) {
2538
syntaxerror(tok, "unterminated triple-quoted string literal"
2539
" (detected at line %d)", start);
2540
if (c != '\n') {
2541
tok->done = E_EOFS;
2542
}
2543
return MAKE_TOKEN(ERRORTOKEN);
2544
}
2545
else {
2546
syntaxerror(tok, "unterminated string literal (detected at"
2547
" line %d)", start);
2548
if (c != '\n') {
2549
tok->done = E_EOLS;
2550
}
2551
return MAKE_TOKEN(ERRORTOKEN);
2552
}
2553
}
2554
if (c == quote) {
2555
end_quote_size += 1;
2556
}
2557
else {
2558
end_quote_size = 0;
2559
if (c == '\\') {
2560
c = tok_nextc(tok); /* skip escaped char */
2561
if (c == '\r') {
2562
c = tok_nextc(tok);
2563
}
2564
}
2565
}
2566
}
2567
2568
p_start = tok->start;
2569
p_end = tok->cur;
2570
return MAKE_TOKEN(STRING);
2571
}
2572
2573
/* Line continuation */
2574
if (c == '\\') {
2575
if ((c = tok_continuation_line(tok)) == -1) {
2576
return MAKE_TOKEN(ERRORTOKEN);
2577
}
2578
tok->cont_line = 1;
2579
goto again; /* Read next line */
2580
}
2581
2582
/* Punctuation character */
2583
int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
2584
if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
2585
/* This code block gets executed before the curly_bracket_depth is incremented
2586
* by the `{` case, so for ensuring that we are on the 0th level, we need
2587
* to adjust it manually */
2588
int cursor = current_tok->curly_bracket_depth - (c != '{');
2589
if (cursor == 0 && !update_fstring_expr(tok, c)) {
2590
return MAKE_TOKEN(ENDMARKER);
2591
}
2592
if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
2593
return MAKE_TOKEN(ERRORTOKEN);
2594
}
2595
2596
if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
2597
current_tok->kind = TOK_FSTRING_MODE;
2598
p_start = tok->start;
2599
p_end = tok->cur;
2600
return MAKE_TOKEN(_PyToken_OneChar(c));
2601
}
2602
}
2603
2604
/* Check for two-character token */
2605
{
2606
int c2 = tok_nextc(tok);
2607
int current_token = _PyToken_TwoChars(c, c2);
2608
if (current_token != OP) {
2609
int c3 = tok_nextc(tok);
2610
int current_token3 = _PyToken_ThreeChars(c, c2, c3);
2611
if (current_token3 != OP) {
2612
current_token = current_token3;
2613
}
2614
else {
2615
tok_backup(tok, c3);
2616
}
2617
p_start = tok->start;
2618
p_end = tok->cur;
2619
return MAKE_TOKEN(current_token);
2620
}
2621
tok_backup(tok, c2);
2622
}
2623
2624
/* Keep track of parentheses nesting level */
2625
switch (c) {
2626
case '(':
2627
case '[':
2628
case '{':
2629
if (tok->level >= MAXLEVEL) {
2630
return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
2631
}
2632
tok->parenstack[tok->level] = c;
2633
tok->parenlinenostack[tok->level] = tok->lineno;
2634
tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2635
tok->level++;
2636
if (INSIDE_FSTRING(tok)) {
2637
current_tok->curly_bracket_depth++;
2638
}
2639
break;
2640
case ')':
2641
case ']':
2642
case '}':
2643
if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
2644
return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
2645
}
2646
if (!tok->tok_extra_tokens && !tok->level) {
2647
return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
2648
}
2649
if (tok->level > 0) {
2650
tok->level--;
2651
int opening = tok->parenstack[tok->level];
2652
if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
2653
(opening == '[' && c == ']') ||
2654
(opening == '{' && c == '}'))) {
2655
/* If the opening bracket belongs to an f-string's expression
2656
part (e.g. f"{)}") and the closing bracket is an arbitrary
2657
nested expression, then instead of matching a different
2658
syntactical construct with it; we'll throw an unmatched
2659
parentheses error. */
2660
if (INSIDE_FSTRING(tok) && opening == '{') {
2661
assert(current_tok->curly_bracket_depth >= 0);
2662
int previous_bracket = current_tok->curly_bracket_depth - 1;
2663
if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
2664
return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
2665
}
2666
}
2667
if (tok->parenlinenostack[tok->level] != tok->lineno) {
2668
return MAKE_TOKEN(syntaxerror(tok,
2669
"closing parenthesis '%c' does not match "
2670
"opening parenthesis '%c' on line %d",
2671
c, opening, tok->parenlinenostack[tok->level]));
2672
}
2673
else {
2674
return MAKE_TOKEN(syntaxerror(tok,
2675
"closing parenthesis '%c' does not match "
2676
"opening parenthesis '%c'",
2677
c, opening));
2678
}
2679
}
2680
}
2681
2682
if (INSIDE_FSTRING(tok)) {
2683
current_tok->curly_bracket_depth--;
2684
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
2685
current_tok->curly_bracket_expr_start_depth--;
2686
current_tok->kind = TOK_FSTRING_MODE;
2687
current_tok->f_string_debug = 0;
2688
}
2689
}
2690
break;
2691
default:
2692
break;
2693
}
2694
2695
if (!Py_UNICODE_ISPRINTABLE(c)) {
2696
return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c));
2697
}
2698
2699
if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
2700
current_tok->f_string_debug = 1;
2701
}
2702
2703
/* Punctuation character */
2704
p_start = tok->start;
2705
p_end = tok->cur;
2706
return MAKE_TOKEN(_PyToken_OneChar(c));
2707
}
2708
2709
/* Tokenize the next token while the tokenizer is in f-string mode
   (TOK_FSTRING_MODE), i.e. between the opening quote(s) of an f-string and
   its closing quote(s), outside of a replacement expression.

   Emits:
     - FSTRING_MIDDLE for a run of literal text (and for text up to a '{'
       or after a '}'),
     - FSTRING_END when the closing quote(s) are consumed (also pops the
       f-string mode off the mode stack),
     - or defers to tok_get_normal_mode() after switching the current mode
       back to TOK_REGULAR_MODE when a '{' opens a replacement expression.

   Errors are reported via syntaxerror()/ERRORTOKEN. */
static int
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    const char *p_start = NULL;
    const char *p_end = NULL;
    int end_quote_size = 0;
    /* Set while scanning a "\N{...}" named-escape so that its '}' is kept
       as literal text instead of being treated as expression punctuation. */
    int unicode_escape = 0;

    tok->start = tok->cur;
    tok->first_lineno = tok->lineno;
    tok->starting_col_offset = tok->col_offset;

    // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
    // before it.
    int start_char = tok_nextc(tok);
    if (start_char == '{') {
        int peek1 = tok_nextc(tok);
        /* Both characters are pushed back: the normal-mode tokenizer (or the
           literal-text loop below) re-reads them from the buffer. */
        tok_backup(tok, peek1);
        tok_backup(tok, start_char);
        if (peek1 != '{') {
            /* Single '{' starts a replacement expression. */
            current_tok->curly_bracket_expr_start_depth++;
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
            }
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
            return tok_get_normal_mode(tok, current_tok, token);
        }
    }
    else {
        tok_backup(tok, start_char);
    }

    // Check if we are at the end of the string
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        int quote = tok_nextc(tok);
        if (quote != current_tok->f_string_quote) {
            /* Not the terminator; put the char back and scan literal text. */
            tok_backup(tok, quote);
            goto f_string_middle;
        }
    }

    /* Closing quote(s) consumed: release the cached expression buffer used
       for '=' debug specifiers before popping this f-string mode. */
    if (current_tok->last_expr_buffer != NULL) {
        PyMem_Free(current_tok->last_expr_buffer);
        current_tok->last_expr_buffer = NULL;
        current_tok->last_expr_size = 0;
        current_tok->last_expr_end = -1;
    }

    p_start = tok->start;
    p_end = tok->cur;
    tok->tok_mode_stack_index--;
    return MAKE_TOKEN(FSTRING_END);

f_string_middle:

    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
    // this.
    tok->multi_line_start = tok->line_start;
    /* Accumulate literal text until we have seen quote_size consecutive
       closing quotes, hit a brace, or run out of input. */
    while (end_quote_size != current_tok->f_string_quote_size) {
        int c = tok_nextc(tok);
        if (tok->done == E_ERROR) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        /* EOF, or a bare newline inside a single-quoted f-string,
           means the literal is unterminated. */
        if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) {
            if (tok->decoding_erred) {
                return MAKE_TOKEN(ERRORTOKEN);
            }

            assert(tok->multi_line_start != NULL);
            // shift the tok_state's location into
            // the start of string, and report the error
            // from the initial quote character
            tok->cur = (char *)current_tok->f_string_start;
            tok->cur++;
            tok->line_start = current_tok->f_string_multi_line_start;
            int start = tok->lineno;

            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
            tok->lineno = the_current_tok->f_string_line_start;

            if (current_tok->f_string_quote_size == 3) {
                return MAKE_TOKEN(syntaxerror(tok,
                                    "unterminated triple-quoted f-string literal"
                                    " (detected at line %d)", start));
            }
            else {
                return MAKE_TOKEN(syntaxerror(tok,
                                    "unterminated f-string literal (detected at"
                                    " line %d)", start));
            }
        }

        if (c == current_tok->f_string_quote) {
            end_quote_size += 1;
            continue;
        } else {
            end_quote_size = 0;
        }

        /* We are in the format-spec part (after ':') if an expression has
           already been scanned and a replacement field is still open. */
        int in_format_spec = (
                current_tok->last_expr_end != -1
                &&
                INSIDE_FSTRING_EXPR(current_tok)
        );
        if (c == '{') {
            int peek = tok_nextc(tok);
            if (peek != '{' || in_format_spec) {
                /* Single '{' (or any '{' inside a format spec): end the
                   literal run here and switch to expression scanning. */
                tok_backup(tok, peek);
                tok_backup(tok, c);
                current_tok->curly_bracket_expr_start_depth++;
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                    return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
                }
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                p_start = tok->start;
                p_end = tok->cur;
            } else {
                /* "{{" escape: include only one '{' in the token text. */
                p_start = tok->start;
                p_end = tok->cur - 1;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '}') {
            if (unicode_escape) {
                /* This '}' closes a "\N{...}" escape, not an expression. */
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FSTRING_MIDDLE);
            }
            int peek = tok_nextc(tok);

            // The tokenizer can only be in the format spec if we have already completed the expression
            // scanning (indicated by the end of the expression being set) and we are not at the top level
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
            // brackets, we can bypass it here.
            if (peek == '}' && !in_format_spec) {
                /* "}}" escape: include only one '}' in the token text. */
                p_start = tok->start;
                p_end = tok->cur - 1;
            } else {
                /* Closing '}' of a replacement field: hand it back to the
                   normal-mode tokenizer. */
                tok_backup(tok, peek);
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                p_start = tok->start;
                p_end = tok->cur;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '\\') {
            int peek = tok_nextc(tok);
            if (peek == '\r') {
                peek = tok_nextc(tok);
            }
            // Special case when the backslash is right before a curly
            // brace. We have to restore and return the control back
            // to the loop for the next iteration.
            if (peek == '{' || peek == '}') {
                if (!current_tok->f_string_raw) {
                    if (warn_invalid_escape_sequence(tok, peek)) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                tok_backup(tok, peek);
                continue;
            }

            if (!current_tok->f_string_raw) {
                if (peek == 'N') {
                    /* Handle named unicode escapes (\N{BULLET}) */
                    peek = tok_nextc(tok);
                    if (peek == '{') {
                        unicode_escape = 1;
                    } else {
                        tok_backup(tok, peek);
                    }
                }
            } /* else {
                skip the escaped character
            }*/
        }
    }

    // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        tok_backup(tok, current_tok->f_string_quote);
    }
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(FSTRING_MIDDLE);
}
2896
2897
2898
static int
2899
tok_get(struct tok_state *tok, struct token *token)
2900
{
2901
tokenizer_mode *current_tok = TOK_GET_MODE(tok);
2902
if (current_tok->kind == TOK_REGULAR_MODE) {
2903
return tok_get_normal_mode(tok, current_tok, token);
2904
} else {
2905
return tok_get_fstring_mode(tok, current_tok, token);
2906
}
2907
}
2908
2909
int
2910
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
2911
{
2912
int result = tok_get(tok, token);
2913
if (tok->decoding_erred) {
2914
result = ERRORTOKEN;
2915
tok->done = E_DECODE;
2916
}
2917
return result;
2918
}
2919
2920
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
// The union smuggles the int fd through fopencookie()'s void* cookie slot
// without an integer-to-pointer cast.
typedef union {
    void *cookie;
    int fd;
} borrowed;

// read(2) shim used as the fopencookie() read callback: recovers the fd
// from the cookie and forwards the read.
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

// Returns a FILE* that reads from fd without taking ownership of it;
// closing the stream leaves the caller's fd open.
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
// Portable variant: duplicate the fd so that fclose() on the returned
// stream closes only the duplicate, leaving the caller's fd open.
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
2952
2953
/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.

   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
   encoding in the first or second line of the file (in which case the encoding
   should be assumed to be UTF-8).

   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
   by the caller. */

char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
    struct tok_state *tok;
    FILE *fp;
    char *encoding = NULL;

    /* Borrowed stream: the caller's fd stays open after fclose() below. */
    fp = fdopen_borrow(fd);
    if (fp == NULL) {
        return NULL;
    }
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return NULL;
    }
    if (filename != NULL) {
        tok->filename = Py_NewRef(filename);
    }
    else {
        tok->filename = PyUnicode_FromString("<string>");
        if (tok->filename == NULL) {
            fclose(fp);
            _PyTokenizer_Free(tok);
            return encoding;  /* still NULL here */
        }
    }
    struct token token;
    // We don't want to report warnings here because it could cause infinite recursion
    // if fetching the encoding shows a warning.
    tok->report_warnings = 0;
    /* Tokenize at most the first two lines: a coding cookie can only appear
       on line 1 or 2 (see the function comment above). The side effect we
       want is tok->encoding being filled in. */
    while (tok->lineno < 2 && tok->done == E_OK) {
        _PyToken_Init(&token);
        _PyTokenizer_Get(tok, &token);
        _PyToken_Free(&token);
    }
    fclose(fp);
    if (tok->encoding) {
        /* Copy out before freeing the tokenizer, which owns tok->encoding. */
        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
        if (encoding) {
            strcpy(encoding, tok->encoding);
        }
    }
    _PyTokenizer_Free(tok);
    return encoding;
}
3009
3010
#ifdef Py_DEBUG
/* Debug helper: print a token's name to stderr, followed by its source
   text in parentheses for the token kinds that carry meaningful text. */
void
tok_dump(int type, char *start, char *end)
{
    fputs(_PyParser_TokenNames[type], stderr);
    int has_text = (type == NAME || type == NUMBER
                    || type == STRING || type == OP);
    if (has_text) {
        fprintf(stderr, "(%.*s)", (int)(end - start), start);
    }
}
#endif // Py_DEBUG
3019
3020