Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Parser/tokenizer.c
12 views
1
2
/* Tokenizer implementation */
3
4
#include "Python.h"
5
#include "pycore_call.h" // _PyObject_CallNoArgs()
6
7
#include <ctype.h>
8
#include <assert.h>
9
10
#include "tokenizer.h"
11
#include "errcode.h"
12
13
/* Alternate tab spacing */
14
#define ALTTABSIZE 1
15
16
#define is_potential_identifier_start(c) (\
17
(c >= 'a' && c <= 'z')\
18
|| (c >= 'A' && c <= 'Z')\
19
|| c == '_'\
20
|| (c >= 128))
21
22
#define is_potential_identifier_char(c) (\
23
(c >= 'a' && c <= 'z')\
24
|| (c >= 'A' && c <= 'Z')\
25
|| (c >= '0' && c <= '9')\
26
|| c == '_'\
27
|| (c >= 128))
28
29
30
/* Don't ever change this -- it would break the portability of Python code */
31
#define TABSIZE 8
32
33
#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
34
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
35
type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
36
#define ADVANCE_LINENO() \
37
tok->lineno++; \
38
tok->col_offset = 0;
39
40
#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0)
41
#define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0)
42
/* Accessors for the f-string tokenizer-mode stack.  In debug builds
   these are real functions so the bounds asserts fire; in release
   builds they are plain macros with no checking. */
#ifdef Py_DEBUG
/* Return the current (top) mode on the stack. */
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
}
/* Push: advance the stack index and return the new top entry. */
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
}
#else
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
#endif
57
58
/* Forward */
59
static struct tok_state *tok_new(void);
60
static int tok_nextc(struct tok_state *tok);
61
static void tok_backup(struct tok_state *tok, int c);
62
static int syntaxerror(struct tok_state *tok, const char *format, ...);
63
64
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
65
tokenizing. */
66
static const char* type_comment_prefix = "# type: ";
67
68
/* Create and initialize a new tok_state structure */
69
70
/* Allocate a tok_state on the heap and set every field to its neutral
   starting value.  Returns NULL on allocation failure (no Python
   exception is set here; callers check for NULL). */
static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    /* Buffer pointers: nothing read yet. */
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    /* Indentation tracking: one implicit level 0 on the stack. */
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->starting_col_offset = -1;
    tok->col_offset = -1;
    tok->level = 0;
    tok->altindstack[0] = 0;
    /* Encoding / decoding state. */
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->readline = NULL;
    tok->type_comments = 0;
    /* async-related tokenization state (see async_hacks). */
    tok->async_hacks = 0;
    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->str = NULL;
    tok->report_warnings = 1;
    tok->tok_extra_tokens = 0;
    tok->comment_newline = 0;
    tok->implicit_newline = 0;
    /* f-string mode stack starts with a single regular-mode entry. */
    tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
    tok->tok_mode_stack_index = 0;
    tok->tok_report_warnings = 1;
#ifdef Py_DEBUG
    tok->debug = _Py_GetConfig()->parser_debug;
#endif
    return tok;
}
125
126
static char *
127
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
128
{
129
char* result = (char *)PyMem_Malloc(len + 1);
130
if (!result) {
131
tok->done = E_NOMEM;
132
return NULL;
133
}
134
memcpy(result, s, len);
135
result[len] = '\0';
136
return result;
137
}
138
139
/* Record that decoding failed and reset the tokenizer's buffers.
   Frees tok->buf only when the tokenizer owns it (fp- or
   readline-based tokenizers; the condition mirrors _PyTokenizer_Free).
   Always returns NULL so callers can write `return error_ret(tok);`
   as if EOF had been reached. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
        PyMem_Free(tok->buf);
    }
    tok->buf = tok->cur = tok->inp = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_DECODE;
    return NULL; /* as if it were EOF */
}
152
153
154
/* Normalize an encoding name for the two encodings the tokenizer can
   handle natively: "utf_8"-style spellings map to "utf-8" and the
   latin-1 aliases map to "iso-8859-1"; any other name is returned
   unchanged.  Only the first 12 characters are examined, which is
   enough to distinguish the recognized prefixes.  Returns a static
   string or S itself; never allocates. */
static const char *
get_normal_name(const char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast to unsigned char: passing a negative value (a byte
               >= 0x80 where plain char is signed) to tolower() is
               undefined behavior per the C standard. */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
182
183
/* Return the coding spec in S, or NULL if none is found. */
184
185
/* Scan one source line S (SIZE bytes) for a "coding: xxx" declaration.
   On success return 1 and store the normalized encoding name (a
   new_string allocation owned by the caller) in *spec, or leave *spec
   NULL when the line carries no spec.  Return 0 only on memory
   failure (tok->done is set by new_string). */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        /* Any non-whitespace before the '#' means this is a code
           line, which cannot carry a coding spec. */
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip whitespace after the ':'/'='. */
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* Encoding names consist of alphanumerics plus '-', '_'
               and '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                q = get_normal_name(r);
                if (r != q) {
                    /* Replace the raw spelling with its canonical
                       name. */
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
233
234
/* Check whether the line contains a coding spec. If it does,
235
invoke the set_readline function for the new encoding.
236
This function receives the tok_state and the new encoding.
237
Return 1 on success, 0 on failure. */
238
239
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        /* No spec on this line; decide whether to keep looking on the
           next one. */
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* utf-8 needs no recoding; any other encoding must install a
           decoding readline via set_readline. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;  /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declared spec must
           agree with it. */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
288
289
/* See whether the file starts with a BOM. If it does,
290
invoke the set_readline function with the new encoding.
291
Return 1 on success, 0 on failure. */
292
293
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM (EF BB BF); push the bytes back if it
           turns out not to be one. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        /* Not a BOM start byte: put it back untouched. */
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record "utf-8" as the encoding,
       replacing any previously recorded name. */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
330
331
/* Append LINE to the accumulated interactive source buffer
   (tok->interactive_src_start .. tok->interactive_src_end), faking a
   trailing '\n' when LINE does not end with one (tok->implicit_newline
   records whether that happened).  Returns 0 on success, -1 on memory
   failure (tok->done = E_NOMEM). */
static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    /* For an empty line this reads line[0], i.e. the terminating NUL. */
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;  /* reserve room for the newline we will fake */
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        /* Realloc failed: the original block is still live, so free it
           through the saved pointer before clearing the fields. */
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);
    tok->implicit_newline = 0;
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
        tok->implicit_newline = 1;
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}
369
370
/* Traverse and remember all f-string buffers, in order to be able to restore
371
them after reallocating tok->buf */
372
static void
373
remember_fstring_buffers(struct tok_state *tok)
374
{
375
int index;
376
tokenizer_mode *mode;
377
378
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
379
mode = &(tok->tok_mode_stack[index]);
380
mode->f_string_start_offset = mode->f_string_start - tok->buf;
381
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
382
}
383
}
384
385
/* Traverse and restore all f-string buffers after reallocating tok->buf */
386
static void
387
restore_fstring_buffers(struct tok_state *tok)
388
{
389
int index;
390
tokenizer_mode *mode;
391
392
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
393
mode = &(tok->tok_mode_stack[index]);
394
mode->f_string_start = tok->buf + mode->f_string_start_offset;
395
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
396
}
397
}
398
399
/* Attach the saved source text of the last f-string expression to
   TOKEN->metadata.  C is the character that terminated the expression.
   Returns 0 on success or when there is nothing to record, -1 on
   decode failure (exception set). */
static int
set_fstring_expr(struct tok_state* tok, struct token *token, char c) {
    assert(token != NULL);
    assert(c == '}' || c == ':' || c == '!');
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    /* Only f-string "debug" expressions carry metadata, and each token
       gets it at most once. */
    if (!tok_mode->f_string_debug || token->metadata) {
        return 0;
    }

    /* last_expr_buffer is not NUL-terminated; its length is
       last_expr_size, and last_expr_end marks the trailing portion to
       exclude (see update_fstring_expr). */
    PyObject *res = PyUnicode_DecodeUTF8(
        tok_mode->last_expr_buffer,
        tok_mode->last_expr_size - tok_mode->last_expr_end,
        NULL
    );
    if (!res) {
        return -1;
    }
    token->metadata = res;
    return 0;
}
420
421
/* Keep the current mode's record of the f-string expression source in
   sync while tokenizing.  CUR is the triggering character:
     0           - tok->buf was refilled; append tok->cur's text
     '{'         - a new expression starts; snapshot tok->cur
     '}' '!' ':' - the expression ended; record its end offset
   Returns 1 on success, 0 on memory failure (tok->done = E_NOMEM). */
static int
update_fstring_expr(struct tok_state *tok, char cur)
{
    assert(tok->cur != NULL);

    Py_ssize_t size = strlen(tok->cur);
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    switch (cur) {
        case 0:
            /* Nothing to do when no expression is being collected or
               the current one has already been terminated. */
            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
                return 1;
            }
            char *new_buffer = PyMem_Realloc(
                tok_mode->last_expr_buffer,
                tok_mode->last_expr_size + size
            );
            if (new_buffer == NULL) {
                /* Realloc failure leaves the old block live: free it. */
                PyMem_Free(tok_mode->last_expr_buffer);
                goto error;
            }
            tok_mode->last_expr_buffer = new_buffer;
            strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
            tok_mode->last_expr_size += size;
            break;
        case '{':
            if (tok_mode->last_expr_buffer != NULL) {
                PyMem_Free(tok_mode->last_expr_buffer);
            }
            /* NOTE: the buffer is deliberately not NUL-terminated; its
               length is tracked in last_expr_size. */
            tok_mode->last_expr_buffer = PyMem_Malloc(size);
            if (tok_mode->last_expr_buffer == NULL) {
                goto error;
            }
            tok_mode->last_expr_size = size;
            tok_mode->last_expr_end = -1;
            strncpy(tok_mode->last_expr_buffer, tok->cur, size);
            break;
        case '}':
        case '!':
        case ':':
            /* Record the end offset only for the first terminator. */
            if (tok_mode->last_expr_end == -1) {
                tok_mode->last_expr_end = strlen(tok->start);
            }
            break;
        default:
            Py_UNREACHABLE();
    }
    return 1;
error:
    tok->done = E_NOMEM;
    return 0;
}
473
474
static void
475
free_fstring_expressions(struct tok_state *tok)
476
{
477
int index;
478
tokenizer_mode *mode;
479
480
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
481
mode = &(tok->tok_mode_stack[index]);
482
if (mode->last_expr_buffer != NULL) {
483
PyMem_Free(mode->last_expr_buffer);
484
mode->last_expr_buffer = NULL;
485
mode->last_expr_size = 0;
486
mode->last_expr_end = -1;
487
}
488
}
489
}
490
491
/* Read a line of text from TOK into S, using the stream in TOK.
492
Return NULL on failure, else S.
493
494
On entry, tok->decoding_buffer will be one of:
495
1) NULL: need to call tok->decoding_readline to get a new line
496
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
497
stored the result in tok->decoding_buffer
498
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
499
(in the s buffer) to copy entire contents of the line read
500
by tok->decoding_readline. tok->decoding_buffer has the overflow.
501
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
502
until the buffer ends with a '\n' (or until the end of the file is
503
reached): see tok_nextc and its calls to tok_reserve_buf.
504
*/
505
506
/* Ensure tok->buf has at least SIZE free bytes after tok->inp, growing
   it (by at least 50%) with PyMem_Realloc when needed.  All pointers
   into the buffer (cur/inp/start/line_start/multi_line_start and the
   f-string mode pointers) are rebased onto the new block.  Returns 1
   on success, 0 on memory failure (tok->done = E_NOMEM). */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Save positions as offsets before realloc can move the block;
           -1 marks a NULL pointer. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        restore_fstring_buffers(tok);
    }
    return 1;
}
534
535
/* Report whether any of the first SIZE bytes of STR is a NUL byte. */
static inline int
contains_null_bytes(const char* str, size_t size) {
    for (size_t i = 0; i < size; i++) {
        if (str[i] == '\0') {
            return 1;
        }
    }
    return 0;
}
539
540
/* Read one line via tok->decoding_readline and append its UTF-8 bytes
   at tok->inp, consuming tok->decoding_buffer first when a previous
   call left overflow there.  Returns 1 on success, 0 on failure
   (error state set via error_ret / tok_reserve_buf). */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        /* Take ownership of the buffered line. */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    /* Interactive sessions also accumulate the raw source text. */
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
581
582
/* Set the readline function for TOK to a StreamReader's
583
readline function. The StreamReader is named ENC.
584
585
This function is called from check_bom and check_coding_spec.
586
587
ENC is usually identical to the future value of tok->encoding,
588
except for the (currently unsupported) case of UTF-16.
589
590
Return 1 on success, 0 on failure. */
591
592
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *open, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    /* Build an io.open() text stream over fd using ENC, and keep only
       its readline method. */
    open = _PyImport_GetModuleAttrString("io", "open");
    if (open == NULL) {
        return 0;
    }
    stream = PyObject_CallFunction(open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(open);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    Py_XSETREF(tok->decoding_readline, readline);

    /* Consume the partial line we stepped back into (see comment
       above) so subsequent reads start at the right place. */
    if (pos > 0) {
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
640
641
/* Fetch the next byte from TOK. */
642
643
static int fp_getc(struct tok_state *tok) {
644
return getc(tok->fp);
645
}
646
647
/* Unfetch the last byte back into TOK. */
648
649
static void fp_ungetc(int c, struct tok_state *tok) {
    /* Push C back onto the stream so the next fp_getc returns it. */
    ungetc(c, tok->fp);
}
652
653
/* Check whether the characters at s start a valid
654
UTF-8 sequence. Return the number of characters forming
655
the sequence if yes, 0 if not. The special cases match
656
those in stringlib/codecs.h:utf8_decode.
657
*/
658
static int
valid_utf8(const unsigned char* s)
{
    int expected = 0;   /* number of continuation bytes expected */
    int length;
    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    else if (*s < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        if (*s < 0xC2) {
            /* invalid sequence
               \x80-\xBF -- continuation byte
               \xC0-\xC1 -- fake 0000-007F */
            return 0;
        }
        expected = 1;
    }
    else if (*s < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (*s == 0xE0 && *(s + 1) < 0xA0) {
            /* invalid sequence
               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
            return 0;
        }
        else if (*s == 0xED && *(s + 1) >= 0xA0) {
            /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
               will result in surrogates in range D800-DFFF. Surrogates are
               not valid UTF-8 so they are rejected.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        expected = 2;
    }
    else if (*s < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
            /* invalid sequence -- one of:
               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
               \xF4\x90\x80\x80- -- 110000- overflow */
            return 0;
        }
        expected = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }
    length = expected + 1;
    /* Each continuation byte must be in \x80-\xBF. */
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
            return 0;
    return length;
}
714
715
/* Verify that the NUL-terminated LINE is entirely valid UTF-8.
   On failure, set a SyntaxError naming the first bad byte and return
   0; return 1 when the whole line is valid. */
static int
ensure_utf8(char *line, struct tok_state *tok)
{
    int badchar = 0;
    unsigned char *c;
    int length;
    /* Advance one UTF-8 sequence at a time; valid_utf8 returns the
       sequence length, or 0 at the first invalid byte. */
    for (c = (unsigned char *)line; *c; c += length) {
        if (!(length = valid_utf8(c))) {
            badchar = *c;
            break;
        }
    }
    if (badchar) {
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://peps.python.org/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno);
        return 0;
    }
    return 1;
}
738
739
/* Fetch a byte from TOK, using the string buffer. */
740
741
static int
742
buf_getc(struct tok_state *tok) {
743
return Py_CHARMASK(*tok->str++);
744
}
745
746
/* Unfetch a byte from TOK, using the string buffer. */
747
748
static void
buf_ungetc(int c, struct tok_state *tok) {
    /* Step the string cursor back; C must be the byte that was just
       read (checked, not rewritten, because the buffer may live in a
       read-only segment). */
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
753
754
/* Set the readline function for TOK to ENC. For the string-based
755
tokenizer, this means to just record the encoding. */
756
757
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    /* Just record ENC; the actual recoding happens later (see
       decode_str / translate_into_utf8).  Always succeeds. */
    tok->enc = enc;
    return 1;
}
762
763
/* Return a UTF-8 encoding Python string object from the
764
C byte string STR, which is encoded with ENC. */
765
766
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    /* Decode STR with ENC, then re-encode the result as UTF-8 bytes.
       Returns a new reference, or NULL with an exception set. */
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL) {
        return NULL;
    }
    PyObject *utf8 = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return utf8;
}
776
777
778
static char *
779
translate_newlines(const char *s, int exec_input, int preserve_crlf,
780
struct tok_state *tok) {
781
int skip_next_lf = 0;
782
size_t needed_length = strlen(s) + 2, final_length;
783
char *buf, *current;
784
char c = '\0';
785
buf = PyMem_Malloc(needed_length);
786
if (buf == NULL) {
787
tok->done = E_NOMEM;
788
return NULL;
789
}
790
for (current = buf; *s; s++, current++) {
791
c = *s;
792
if (skip_next_lf) {
793
skip_next_lf = 0;
794
if (c == '\n') {
795
c = *++s;
796
if (!c)
797
break;
798
}
799
}
800
if (!preserve_crlf && c == '\r') {
801
skip_next_lf = 1;
802
c = '\n';
803
}
804
*current = c;
805
}
806
/* If this is exec input, add a newline to the end of the string if
807
there isn't one already. */
808
if (exec_input && c != '\n' && c != '\0') {
809
*current = '\n';
810
current++;
811
}
812
*current = '\0';
813
final_length = current - buf + 1;
814
if (final_length < needed_length && final_length) {
815
/* should never fail */
816
char* result = PyMem_Realloc(buf, final_length);
817
if (result == NULL) {
818
PyMem_Free(buf);
819
}
820
buf = result;
821
}
822
return buf;
823
}
824
825
/* Decode a byte string STR for use as the buffer of TOK.
826
Look for encoding declarations inside STR, and record them
827
inside TOK. */
828
829
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* A BOM selected an encoding: recode the input to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines: only those may carry a
       coding declaration. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        /* A coding spec selected an encoding: recode to UTF-8 now. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    /* The returned pointer aliases utf8's internal buffer when a
       recode happened; keep utf8 alive on the tok_state. */
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
885
886
/* Set up tokenizer for string */
887
888
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* Point all buffer cursors at the decoded input. */
    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    return tok;
}
906
907
/* Set up a tokenizer whose input comes from calling READLINE; ENC,
   when non-NULL, names the encoding of the bytes READLINE returns.
   (exec_input and preserve_crlf are accepted but unused here.) */
struct tok_state *
_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
                          int exec_input, int preserve_crlf)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = NULL;
    if (enc != NULL) {
        /* Own copy: tok->encoding is freed in _PyTokenizer_Free. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
        }
    }
    tok->decoding_state = STATE_NORMAL;
    Py_INCREF(readline);
    tok->readline = readline;
    return tok;
}
933
934
/* Set up tokenizer for UTF-8 string */
935
936
struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = tok_new();
    char *translated;
    if (tok == NULL)
        return NULL;
    /* No BOM/coding-spec scan needed: the input is known UTF-8; only
       newline translation applies. */
    tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
    if (translated == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = STATE_NORMAL;
    tok->enc = NULL;
    tok->str = translated;
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* Point all buffer cursors at the translated input. */
    tok->buf = tok->cur = tok->inp = translated;
    tok->end = translated;
    return tok;
}
961
962
/* Set up tokenizer for file */
963
964
/* Set up a tokenizer reading from FP.  PS1/PS2 are the primary and
   continuation prompts for interactive use (may be NULL).  ENC, when
   non-NULL, fixes the encoding up front. */
struct tok_state *
_PyTokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
        }
        /* Encoding already known: skip BOM/coding-spec detection. */
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}
992
993
/* Free a tok_state structure */
994
995
/* Free a tok_state and everything it owns: encoding string, decoding
   objects, the read buffer (only for fp/readline-based tokenizers;
   the condition mirrors error_ret), the translated input copy, the
   accumulated interactive source, and f-string expression buffers. */
void
_PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL) {
        PyMem_Free(tok->encoding);
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);
    /* Py_XDECREF does not NULL the field, so tok->readline still
       tests non-NULL here for readline-based tokenizers. */
    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
        PyMem_Free(tok->input);
    }
    if (tok->interactive_src_start != NULL) {
        PyMem_Free(tok->interactive_src_start);
    }
    free_fstring_expressions(tok);
    PyMem_Free(tok);
}
1017
1018
/* Release a token's owned resources (currently only the metadata
   object attached by set_fstring_expr, if any). */
void
_PyToken_Free(struct token *token) {
    Py_XDECREF(token->metadata);
}
1022
1023
/* Initialize a freshly declared token so _PyToken_Free is always safe
   to call on it. */
void
_PyToken_Init(struct token *token) {
    token->metadata = NULL;
}
1027
1028
/* Read from tok->fp into the buffer until a complete line (ending in
   '\n') or EOF has been accumulated, growing the buffer as needed.
   Returns 1 on success or EOF, 0 on error. */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            /* EOF: nothing more to read, but not an error. */
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');  /* loop until the line is complete */
    return 1;
}
1052
1053
/* Read one line from the user-supplied tok->readline callable and
   append its UTF-8 bytes at tok->inp.  bytes results are decoded with
   tok->encoding when one is set; otherwise a str result is required.
   StopIteration from the callable means EOF.  Returns 1 on
   success/EOF, 0 on error. */
static int
tok_readline_string(struct tok_state* tok) {
    PyObject* line = NULL;
    PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
    if (raw_line == NULL) {
        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
            /* EOF, not an error. */
            PyErr_Clear();
            return 1;
        }
        error_ret(tok);
        goto error;
    }
    if(tok->encoding != NULL) {
        /* With an explicit encoding, the callable must yield bytes. */
        if (!PyBytes_Check(raw_line)) {
            PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
            error_ret(tok);
            goto error;
        }
        line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
                                tok->encoding, "replace");
        Py_CLEAR(raw_line);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    } else {
        /* No encoding: the callable must yield str directly. */
        if(!PyUnicode_Check(raw_line)) {
            PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
            error_ret(tok);
            goto error;
        }
        line = raw_line;
        raw_line = NULL;
    }
    Py_ssize_t buflen;
    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }

    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';

    tok->line_start = tok->cur;
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(raw_line);
    Py_XDECREF(line);
    return 0;
}
1113
1114
/* Advance a string-based tokenizer to its next line: move tok->inp to
   just past the next '\n' (or to the end of input).  Returns 0 at end
   of input (tok->done = E_EOF), 1 otherwise. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;  /* include the newline in the line */
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    /* With no token in progress, earlier lines can be dropped by
       moving the logical buffer start forward. */
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = end;
    return 1;
}
1135
1136
/* Refill the buffer of an interactive (prompt-driven) tokenizer by
   reading one more line with PyOS_Readline, recoding it to UTF-8 when
   an encoding is set.  Returns 1 on success, 0 on error/EOF/interrupt
   (tok->done distinguishes the cases). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        /* Caller asked to stop at the first underflow instead of
           prompting for more input. */
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        char *translated = translate_newlines(newtok, 0, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        /* Readline returned NULL: treat as an interrupt. */
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        /* An empty string means end of input. */
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the existing
           buffer, rebasing the saved pointers around the realloc. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        remember_fstring_buffers(tok);
        size_t size = strlen(newtok);
        ADVANCE_LINENO();
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
        restore_fstring_buffers(tok);
    }
    else {
        /* No token in progress: the new line simply replaces the
           buffer. */
        remember_fstring_buffers(tok);
        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
        restore_fstring_buffers(tok);
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }

    /* When inside an f-string expression, keep its saved source text
       in sync with the newly read line. */
    if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
        return 0;
    }
    return 1;
}
1227
1228
static int
1229
tok_underflow_file(struct tok_state *tok) {
1230
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
1231
tok->cur = tok->inp = tok->buf;
1232
}
1233
if (tok->decoding_state == STATE_INIT) {
1234
/* We have not yet determined the encoding.
1235
If an encoding is found, use the file-pointer
1236
reader functions from now on. */
1237
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
1238
error_ret(tok);
1239
return 0;
1240
}
1241
assert(tok->decoding_state != STATE_INIT);
1242
}
1243
/* Read until '\n' or EOF */
1244
if (tok->decoding_readline != NULL) {
1245
/* We already have a codec associated with this input. */
1246
if (!tok_readline_recode(tok)) {
1247
return 0;
1248
}
1249
}
1250
else {
1251
/* We want a 'raw' read. */
1252
if (!tok_readline_raw(tok)) {
1253
return 0;
1254
}
1255
}
1256
if (tok->inp == tok->cur) {
1257
tok->done = E_EOF;
1258
return 0;
1259
}
1260
tok->implicit_newline = 0;
1261
if (tok->inp[-1] != '\n') {
1262
assert(tok->inp + 1 < tok->end);
1263
/* Last line does not end in \n, fake one */
1264
*tok->inp++ = '\n';
1265
*tok->inp = '\0';
1266
tok->implicit_newline = 1;
1267
}
1268
1269
if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
1270
return 0;
1271
}
1272
1273
ADVANCE_LINENO();
1274
if (tok->decoding_state != STATE_NORMAL) {
1275
if (tok->lineno > 2) {
1276
tok->decoding_state = STATE_NORMAL;
1277
}
1278
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
1279
tok, fp_setreadl))
1280
{
1281
return 0;
1282
}
1283
}
1284
/* The default encoding is UTF-8, so make sure we don't have any
1285
non-UTF-8 sequences in it. */
1286
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
1287
error_ret(tok);
1288
return 0;
1289
}
1290
assert(tok->done == E_OK);
1291
return tok->done == E_OK;
1292
}
1293
1294
static int
1295
tok_underflow_readline(struct tok_state* tok) {
1296
assert(tok->decoding_state == STATE_NORMAL);
1297
assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
1298
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
1299
tok->cur = tok->inp = tok->buf;
1300
}
1301
if (!tok_readline_string(tok)) {
1302
return 0;
1303
}
1304
if (tok->inp == tok->cur) {
1305
tok->done = E_EOF;
1306
return 0;
1307
}
1308
tok->implicit_newline = 0;
1309
if (tok->inp[-1] != '\n') {
1310
assert(tok->inp + 1 < tok->end);
1311
/* Last line does not end in \n, fake one */
1312
*tok->inp++ = '\n';
1313
*tok->inp = '\0';
1314
tok->implicit_newline = 1;
1315
}
1316
1317
if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
1318
return 0;
1319
}
1320
1321
ADVANCE_LINENO();
1322
/* The default encoding is UTF-8, so make sure we don't have any
1323
non-UTF-8 sequences in it. */
1324
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
1325
error_ret(tok);
1326
return 0;
1327
}
1328
assert(tok->done == E_OK);
1329
return tok->done == E_OK;
1330
}
1331
1332
#if defined(Py_DEBUG)
/* Debug helper: write `size` bytes of `s` to `f` as a double-quoted,
   C-escaped string ("NULL" if s is NULL).  Non-printable bytes are
   emitted as \xNN. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            default:
                if (0x20 <= c && c <= 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
1360
1361
/* Get next char, updating state; error code goes into tok->done */
1362
1363
static int
1364
tok_nextc(struct tok_state *tok)
1365
{
1366
int rc;
1367
for (;;) {
1368
if (tok->cur != tok->inp) {
1369
tok->col_offset++;
1370
return Py_CHARMASK(*tok->cur++); /* Fast path */
1371
}
1372
if (tok->done != E_OK) {
1373
return EOF;
1374
}
1375
if (tok->readline) {
1376
rc = tok_underflow_readline(tok);
1377
}
1378
else if (tok->fp == NULL) {
1379
rc = tok_underflow_string(tok);
1380
}
1381
else if (tok->prompt != NULL) {
1382
rc = tok_underflow_interactive(tok);
1383
}
1384
else {
1385
rc = tok_underflow_file(tok);
1386
}
1387
#if defined(Py_DEBUG)
1388
if (tok->debug) {
1389
fprintf(stderr, "line[%d] = ", tok->lineno);
1390
print_escape(stderr, tok->cur, tok->inp - tok->cur);
1391
fprintf(stderr, " tok->done = %d\n", tok->done);
1392
}
1393
#endif
1394
if (!rc) {
1395
tok->cur = tok->inp;
1396
return EOF;
1397
}
1398
tok->line_start = tok->cur;
1399
1400
if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
1401
syntaxerror(tok, "source code cannot contain null bytes");
1402
tok->cur = tok->inp;
1403
return EOF;
1404
}
1405
}
1406
Py_UNREACHABLE();
1407
}
1408
1409
/* Back-up one character */
1410
1411
static void
1412
tok_backup(struct tok_state *tok, int c)
1413
{
1414
if (c != EOF) {
1415
if (--tok->cur < tok->buf) {
1416
Py_FatalError("tokenizer beginning of buffer");
1417
}
1418
if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
1419
Py_FatalError("tok_backup: wrong character");
1420
}
1421
tok->col_offset--;
1422
}
1423
}
1424
1425
static int
1426
_syntaxerror_range(struct tok_state *tok, const char *format,
1427
int col_offset, int end_col_offset,
1428
va_list vargs)
1429
{
1430
// In release builds, we don't want to overwrite a previous error, but in debug builds we
1431
// want to fail if we are not doing it so we can fix it.
1432
assert(tok->done != E_ERROR);
1433
if (tok->done == E_ERROR) {
1434
return ERRORTOKEN;
1435
}
1436
PyObject *errmsg, *errtext, *args;
1437
errmsg = PyUnicode_FromFormatV(format, vargs);
1438
if (!errmsg) {
1439
goto error;
1440
}
1441
1442
errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1443
"replace");
1444
if (!errtext) {
1445
goto error;
1446
}
1447
1448
if (col_offset == -1) {
1449
col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1450
}
1451
if (end_col_offset == -1) {
1452
end_col_offset = col_offset;
1453
}
1454
1455
Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1456
if (line_len != tok->cur - tok->line_start) {
1457
Py_DECREF(errtext);
1458
errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1459
"replace");
1460
}
1461
if (!errtext) {
1462
goto error;
1463
}
1464
1465
args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1466
col_offset, errtext, tok->lineno, end_col_offset);
1467
if (args) {
1468
PyErr_SetObject(PyExc_SyntaxError, args);
1469
Py_DECREF(args);
1470
}
1471
1472
error:
1473
Py_XDECREF(errmsg);
1474
tok->done = E_ERROR;
1475
return ERRORTOKEN;
1476
}
1477
1478
/* Report a SyntaxError at the current position (no explicit column
   range).  Returns ERRORTOKEN. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    // This errors are cleaned on startup. Todo: Fix it.
    va_list vargs;
    va_start(vargs, format);
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
1488
1489
/* Report a SyntaxError with an explicit [col_offset, end_col_offset)
   range on the current line.  Returns ERRORTOKEN. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
    va_start(vargs, format);
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}
1500
1501
static int
1502
indenterror(struct tok_state *tok)
1503
{
1504
tok->done = E_TABSPACE;
1505
tok->cur = tok->inp;
1506
return ERRORTOKEN;
1507
}
1508
1509
static int
1510
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
1511
{
1512
if (!tok->report_warnings) {
1513
return 0;
1514
}
1515
1516
PyObject *errmsg;
1517
va_list vargs;
1518
va_start(vargs, format);
1519
errmsg = PyUnicode_FromFormatV(format, vargs);
1520
va_end(vargs);
1521
if (!errmsg) {
1522
goto error;
1523
}
1524
1525
if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
1526
tok->lineno, NULL, NULL) < 0) {
1527
if (PyErr_ExceptionMatches(category)) {
1528
/* Replace the DeprecationWarning exception with a SyntaxError
1529
to get a more accurate error report */
1530
PyErr_Clear();
1531
syntaxerror(tok, "%U", errmsg);
1532
}
1533
goto error;
1534
}
1535
Py_DECREF(errmsg);
1536
return 0;
1537
1538
error:
1539
Py_XDECREF(errmsg);
1540
tok->done = E_ERROR;
1541
return -1;
1542
}
1543
1544
static int
1545
warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
1546
{
1547
1548
if (!tok->tok_report_warnings) {
1549
return 0;
1550
}
1551
1552
PyObject *msg = PyUnicode_FromFormat(
1553
"invalid escape sequence '\\%c'",
1554
(char) first_invalid_escape_char
1555
);
1556
1557
if (msg == NULL) {
1558
return -1;
1559
}
1560
1561
if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename,
1562
tok->lineno, NULL, NULL) < 0) {
1563
Py_DECREF(msg);
1564
1565
if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) {
1566
/* Replace the SyntaxWarning exception with a SyntaxError
1567
to get a more accurate error report */
1568
PyErr_Clear();
1569
return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char);
1570
}
1571
1572
return -1;
1573
}
1574
1575
Py_DECREF(msg);
1576
return 0;
1577
}
1578
1579
/* Check whether the upcoming characters spell out `test` followed by a
   non-identifier character.  All consumed characters are pushed back
   before returning, so the tokenizer position is unchanged.
   Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            /* Whole word matched: it counts only if the next char
               cannot extend an identifier. */
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        /* Undo everything we consumed, in reverse order. */
        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}
1601
1602
static int
1603
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
1604
if (tok->tok_extra_tokens) {
1605
// When we are parsing extra tokens, we don't want to emit warnings
1606
// about invalid literals, because we want to be a bit more liberal.
1607
return 1;
1608
}
1609
/* Emit a deprecation warning only if the numeric literal is immediately
1610
* followed by one of keywords which can occur after a numeric literal
1611
* in valid code: "and", "else", "for", "if", "in", "is" and "or".
1612
* It allows to gradually deprecate existing valid code without adding
1613
* warning before error in most cases of invalid numeric literal (which
1614
* would be confusing and break existing tests).
1615
* Raise a syntax error with slightly better message than plain
1616
* "invalid syntax" if the numeric literal is immediately followed by
1617
* other keyword or identifier.
1618
*/
1619
int r = 0;
1620
if (c == 'a') {
1621
r = lookahead(tok, "nd");
1622
}
1623
else if (c == 'e') {
1624
r = lookahead(tok, "lse");
1625
}
1626
else if (c == 'f') {
1627
r = lookahead(tok, "or");
1628
}
1629
else if (c == 'i') {
1630
int c2 = tok_nextc(tok);
1631
if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1632
r = 1;
1633
}
1634
tok_backup(tok, c2);
1635
}
1636
else if (c == 'o') {
1637
r = lookahead(tok, "r");
1638
}
1639
else if (c == 'n') {
1640
r = lookahead(tok, "ot");
1641
}
1642
if (r) {
1643
tok_backup(tok, c);
1644
if (parser_warn(tok, PyExc_SyntaxWarning,
1645
"invalid %s literal", kind))
1646
{
1647
return 0;
1648
}
1649
tok_nextc(tok);
1650
}
1651
else /* In future releases, only error will remain. */
1652
if (is_potential_identifier_char(c)) {
1653
tok_backup(tok, c);
1654
syntaxerror(tok, "invalid %s literal", kind);
1655
return 0;
1656
}
1657
return 1;
1658
}
1659
1660
/* Verify that the identifier follows PEP 3131.
1661
All identifier strings are guaranteed to be "ready" unicode objects.
1662
*/
1663
static int
1664
verify_identifier(struct tok_state *tok)
1665
{
1666
if (tok->tok_extra_tokens) {
1667
return 1;
1668
}
1669
PyObject *s;
1670
if (tok->decoding_erred)
1671
return 0;
1672
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1673
if (s == NULL) {
1674
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1675
tok->done = E_DECODE;
1676
}
1677
else {
1678
tok->done = E_ERROR;
1679
}
1680
return 0;
1681
}
1682
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1683
if (invalid < 0) {
1684
Py_DECREF(s);
1685
tok->done = E_ERROR;
1686
return 0;
1687
}
1688
assert(PyUnicode_GET_LENGTH(s) > 0);
1689
if (invalid < PyUnicode_GET_LENGTH(s)) {
1690
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1691
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1692
/* Determine the offset in UTF-8 encoded input */
1693
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1694
if (s != NULL) {
1695
Py_SETREF(s, PyUnicode_AsUTF8String(s));
1696
}
1697
if (s == NULL) {
1698
tok->done = E_ERROR;
1699
return 0;
1700
}
1701
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1702
}
1703
Py_DECREF(s);
1704
if (Py_UNICODE_ISPRINTABLE(ch)) {
1705
syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
1706
}
1707
else {
1708
syntaxerror(tok, "invalid non-printable character U+%04X", ch);
1709
}
1710
return 0;
1711
}
1712
Py_DECREF(s);
1713
return 1;
1714
}
1715
1716
/* Consume the remaining digits of a decimal literal, allowing single
   underscores between digit groups.  Returns the first non-digit
   character after the literal, or 0 on a malformed literal (a
   SyntaxError has then been set). */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (isdigit(c));
        if (c != '_') {
            break;
        }
        /* An underscore must be followed by at least one digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}
1737
1738
1739
static inline int
1740
tok_continuation_line(struct tok_state *tok) {
1741
int c = tok_nextc(tok);
1742
if (c == '\r') {
1743
c = tok_nextc(tok);
1744
}
1745
if (c != '\n') {
1746
tok->done = E_LINECONT;
1747
return -1;
1748
}
1749
c = tok_nextc(tok);
1750
if (c == EOF) {
1751
tok->done = E_EOF;
1752
tok->cur = tok->inp;
1753
return -1;
1754
} else {
1755
tok_backup(tok, c);
1756
}
1757
return c;
1758
}
1759
1760
static int
1761
type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
1762
int end_col_offset, const char *start, const char *end)
1763
{
1764
token->level = tok->level;
1765
token->lineno = token->end_lineno = tok->lineno;
1766
token->col_offset = col_offset;
1767
token->end_col_offset = end_col_offset;
1768
token->start = start;
1769
token->end = end;
1770
return type;
1771
}
1772
1773
static int
1774
token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
1775
{
1776
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
1777
token->level = tok->level;
1778
if (ISSTRINGLIT(type)) {
1779
token->lineno = tok->first_lineno;
1780
}
1781
else {
1782
token->lineno = tok->lineno;
1783
}
1784
token->end_lineno = tok->lineno;
1785
token->col_offset = token->end_col_offset = -1;
1786
token->start = start;
1787
token->end = end;
1788
1789
if (start != NULL && end != NULL) {
1790
token->col_offset = tok->starting_col_offset;
1791
token->end_col_offset = tok->col_offset;
1792
}
1793
return type;
1794
}
1795
1796
1797
static int
1798
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
1799
{
1800
int c;
1801
int blankline, nonascii;
1802
1803
const char *p_start = NULL;
1804
const char *p_end = NULL;
1805
nextline:
1806
tok->start = NULL;
1807
tok->starting_col_offset = -1;
1808
blankline = 0;
1809
1810
1811
/* Get indentation level */
1812
if (tok->atbol) {
1813
int col = 0;
1814
int altcol = 0;
1815
tok->atbol = 0;
1816
int cont_line_col = 0;
1817
for (;;) {
1818
c = tok_nextc(tok);
1819
if (c == ' ') {
1820
col++, altcol++;
1821
}
1822
else if (c == '\t') {
1823
col = (col / tok->tabsize + 1) * tok->tabsize;
1824
altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1825
}
1826
else if (c == '\014') {/* Control-L (formfeed) */
1827
col = altcol = 0; /* For Emacs users */
1828
}
1829
else if (c == '\\') {
1830
// Indentation cannot be split over multiple physical lines
1831
// using backslashes. This means that if we found a backslash
1832
// preceded by whitespace, **the first one we find** determines
1833
// the level of indentation of whatever comes next.
1834
cont_line_col = cont_line_col ? cont_line_col : col;
1835
if ((c = tok_continuation_line(tok)) == -1) {
1836
return MAKE_TOKEN(ERRORTOKEN);
1837
}
1838
}
1839
else {
1840
break;
1841
}
1842
}
1843
tok_backup(tok, c);
1844
if (c == '#' || c == '\n' || c == '\r') {
1845
/* Lines with only whitespace and/or comments
1846
shouldn't affect the indentation and are
1847
not passed to the parser as NEWLINE tokens,
1848
except *totally* empty lines in interactive
1849
mode, which signal the end of a command group. */
1850
if (col == 0 && c == '\n' && tok->prompt != NULL) {
1851
blankline = 0; /* Let it through */
1852
}
1853
else if (tok->prompt != NULL && tok->lineno == 1) {
1854
/* In interactive mode, if the first line contains
1855
only spaces and/or a comment, let it through. */
1856
blankline = 0;
1857
col = altcol = 0;
1858
}
1859
else {
1860
blankline = 1; /* Ignore completely */
1861
}
1862
/* We can't jump back right here since we still
1863
may need to skip to the end of a comment */
1864
}
1865
if (!blankline && tok->level == 0) {
1866
col = cont_line_col ? cont_line_col : col;
1867
altcol = cont_line_col ? cont_line_col : altcol;
1868
if (col == tok->indstack[tok->indent]) {
1869
/* No change */
1870
if (altcol != tok->altindstack[tok->indent]) {
1871
return MAKE_TOKEN(indenterror(tok));
1872
}
1873
}
1874
else if (col > tok->indstack[tok->indent]) {
1875
/* Indent -- always one */
1876
if (tok->indent+1 >= MAXINDENT) {
1877
tok->done = E_TOODEEP;
1878
tok->cur = tok->inp;
1879
return MAKE_TOKEN(ERRORTOKEN);
1880
}
1881
if (altcol <= tok->altindstack[tok->indent]) {
1882
return MAKE_TOKEN(indenterror(tok));
1883
}
1884
tok->pendin++;
1885
tok->indstack[++tok->indent] = col;
1886
tok->altindstack[tok->indent] = altcol;
1887
}
1888
else /* col < tok->indstack[tok->indent] */ {
1889
/* Dedent -- any number, must be consistent */
1890
while (tok->indent > 0 &&
1891
col < tok->indstack[tok->indent]) {
1892
tok->pendin--;
1893
tok->indent--;
1894
}
1895
if (col != tok->indstack[tok->indent]) {
1896
tok->done = E_DEDENT;
1897
tok->cur = tok->inp;
1898
return MAKE_TOKEN(ERRORTOKEN);
1899
}
1900
if (altcol != tok->altindstack[tok->indent]) {
1901
return MAKE_TOKEN(indenterror(tok));
1902
}
1903
}
1904
}
1905
}
1906
1907
tok->start = tok->cur;
1908
tok->starting_col_offset = tok->col_offset;
1909
1910
/* Return pending indents/dedents */
1911
if (tok->pendin != 0) {
1912
if (tok->pendin < 0) {
1913
if (tok->tok_extra_tokens) {
1914
p_start = tok->cur;
1915
p_end = tok->cur;
1916
}
1917
tok->pendin++;
1918
return MAKE_TOKEN(DEDENT);
1919
}
1920
else {
1921
if (tok->tok_extra_tokens) {
1922
p_start = tok->buf;
1923
p_end = tok->cur;
1924
}
1925
tok->pendin--;
1926
return MAKE_TOKEN(INDENT);
1927
}
1928
}
1929
1930
/* Peek ahead at the next character */
1931
c = tok_nextc(tok);
1932
tok_backup(tok, c);
1933
/* Check if we are closing an async function */
1934
if (tok->async_def
1935
&& !blankline
1936
/* Due to some implementation artifacts of type comments,
1937
* a TYPE_COMMENT at the start of a function won't set an
1938
* indentation level and it will produce a NEWLINE after it.
1939
* To avoid spuriously ending an async function due to this,
1940
* wait until we have some non-newline char in front of us. */
1941
&& c != '\n'
1942
&& tok->level == 0
1943
/* There was a NEWLINE after ASYNC DEF,
1944
so we're past the signature. */
1945
&& tok->async_def_nl
1946
/* Current indentation level is less than where
1947
the async function was defined */
1948
&& tok->async_def_indent >= tok->indent)
1949
{
1950
tok->async_def = 0;
1951
tok->async_def_indent = 0;
1952
tok->async_def_nl = 0;
1953
}
1954
1955
again:
1956
tok->start = NULL;
1957
/* Skip spaces */
1958
do {
1959
c = tok_nextc(tok);
1960
} while (c == ' ' || c == '\t' || c == '\014');
1961
1962
/* Set start of current token */
1963
tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
1964
tok->starting_col_offset = tok->col_offset - 1;
1965
1966
/* Skip comment, unless it's a type comment */
1967
if (c == '#') {
1968
1969
const char* p = NULL;
1970
const char *prefix, *type_start;
1971
int current_starting_col_offset;
1972
1973
while (c != EOF && c != '\n' && c != '\r') {
1974
c = tok_nextc(tok);
1975
}
1976
1977
if (tok->tok_extra_tokens) {
1978
p = tok->start;
1979
}
1980
1981
if (tok->type_comments) {
1982
p = tok->start;
1983
current_starting_col_offset = tok->starting_col_offset;
1984
prefix = type_comment_prefix;
1985
while (*prefix && p < tok->cur) {
1986
if (*prefix == ' ') {
1987
while (*p == ' ' || *p == '\t') {
1988
p++;
1989
current_starting_col_offset++;
1990
}
1991
} else if (*prefix == *p) {
1992
p++;
1993
current_starting_col_offset++;
1994
} else {
1995
break;
1996
}
1997
1998
prefix++;
1999
}
2000
2001
/* This is a type comment if we matched all of type_comment_prefix. */
2002
if (!*prefix) {
2003
int is_type_ignore = 1;
2004
// +6 in order to skip the word 'ignore'
2005
const char *ignore_end = p + 6;
2006
const int ignore_end_col_offset = current_starting_col_offset + 6;
2007
tok_backup(tok, c); /* don't eat the newline or EOF */
2008
2009
type_start = p;
2010
2011
/* A TYPE_IGNORE is "type: ignore" followed by the end of the token
2012
* or anything ASCII and non-alphanumeric. */
2013
is_type_ignore = (
2014
tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
2015
&& !(tok->cur > ignore_end
2016
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
2017
2018
if (is_type_ignore) {
2019
p_start = ignore_end;
2020
p_end = tok->cur;
2021
2022
/* If this type ignore is the only thing on the line, consume the newline also. */
2023
if (blankline) {
2024
tok_nextc(tok);
2025
tok->atbol = 1;
2026
}
2027
return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
2028
} else {
2029
p_start = type_start;
2030
p_end = tok->cur;
2031
return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
2032
}
2033
}
2034
}
2035
if (tok->tok_extra_tokens) {
2036
tok_backup(tok, c); /* don't eat the newline or EOF */
2037
p_start = p;
2038
p_end = tok->cur;
2039
tok->comment_newline = blankline;
2040
return MAKE_TOKEN(COMMENT);
2041
}
2042
}
2043
2044
if (tok->done == E_INTERACT_STOP) {
2045
return MAKE_TOKEN(ENDMARKER);
2046
}
2047
2048
/* Check for EOF and errors now */
2049
if (c == EOF) {
2050
if (tok->level) {
2051
return MAKE_TOKEN(ERRORTOKEN);
2052
}
2053
return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
2054
}
2055
2056
/* Identifier (most frequent token!) */
2057
nonascii = 0;
2058
if (is_potential_identifier_start(c)) {
2059
/* Process the various legal combinations of b"", r"", u"", and f"". */
2060
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
2061
while (1) {
2062
if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
2063
saw_b = 1;
2064
/* Since this is a backwards compatibility support literal we don't
2065
want to support it in arbitrary order like byte literals. */
2066
else if (!(saw_b || saw_u || saw_r || saw_f)
2067
&& (c == 'u'|| c == 'U')) {
2068
saw_u = 1;
2069
}
2070
/* ur"" and ru"" are not supported */
2071
else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
2072
saw_r = 1;
2073
}
2074
else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
2075
saw_f = 1;
2076
}
2077
else {
2078
break;
2079
}
2080
c = tok_nextc(tok);
2081
if (c == '"' || c == '\'') {
2082
if (saw_f) {
2083
goto f_string_quote;
2084
}
2085
goto letter_quote;
2086
}
2087
}
2088
while (is_potential_identifier_char(c)) {
2089
if (c >= 128) {
2090
nonascii = 1;
2091
}
2092
c = tok_nextc(tok);
2093
}
2094
tok_backup(tok, c);
2095
if (nonascii && !verify_identifier(tok)) {
2096
return MAKE_TOKEN(ERRORTOKEN);
2097
}
2098
2099
p_start = tok->start;
2100
p_end = tok->cur;
2101
2102
/* async/await parsing block. */
2103
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
2104
/* May be an 'async' or 'await' token. For Python 3.7 or
2105
later we recognize them unconditionally. For Python
2106
3.5 or 3.6 we recognize 'async' in front of 'def', and
2107
either one inside of 'async def'. (Technically we
2108
shouldn't recognize these at all for 3.4 or earlier,
2109
but there's no *valid* Python 3.4 code that would be
2110
rejected, and async functions will be rejected in a
2111
later phase.) */
2112
if (!tok->async_hacks || tok->async_def) {
2113
/* Always recognize the keywords. */
2114
if (memcmp(tok->start, "async", 5) == 0) {
2115
return MAKE_TOKEN(ASYNC);
2116
}
2117
if (memcmp(tok->start, "await", 5) == 0) {
2118
return MAKE_TOKEN(AWAIT);
2119
}
2120
}
2121
else if (memcmp(tok->start, "async", 5) == 0) {
2122
/* The current token is 'async'.
2123
Look ahead one token to see if that is 'def'. */
2124
2125
struct tok_state ahead_tok;
2126
struct token ahead_token;
2127
_PyToken_Init(&ahead_token);
2128
int ahead_tok_kind;
2129
2130
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
2131
ahead_tok_kind = tok_get_normal_mode(&ahead_tok,
2132
current_tok,
2133
&ahead_token);
2134
2135
if (ahead_tok_kind == NAME
2136
&& ahead_tok.cur - ahead_tok.start == 3
2137
&& memcmp(ahead_tok.start, "def", 3) == 0)
2138
{
2139
/* The next token is going to be 'def', so instead of
2140
returning a plain NAME token, return ASYNC. */
2141
tok->async_def_indent = tok->indent;
2142
tok->async_def = 1;
2143
_PyToken_Free(&ahead_token);
2144
return MAKE_TOKEN(ASYNC);
2145
}
2146
_PyToken_Free(&ahead_token);
2147
}
2148
}
2149
2150
return MAKE_TOKEN(NAME);
2151
}
2152
2153
if (c == '\r') {
2154
c = tok_nextc(tok);
2155
}
2156
2157
/* Newline */
2158
if (c == '\n') {
2159
tok->atbol = 1;
2160
if (blankline || tok->level > 0) {
2161
if (tok->tok_extra_tokens) {
2162
if (tok->comment_newline) {
2163
tok->comment_newline = 0;
2164
}
2165
p_start = tok->start;
2166
p_end = tok->cur;
2167
return MAKE_TOKEN(NL);
2168
}
2169
goto nextline;
2170
}
2171
if (tok->comment_newline && tok->tok_extra_tokens) {
2172
tok->comment_newline = 0;
2173
p_start = tok->start;
2174
p_end = tok->cur;
2175
return MAKE_TOKEN(NL);
2176
}
2177
p_start = tok->start;
2178
p_end = tok->cur - 1; /* Leave '\n' out of the string */
2179
tok->cont_line = 0;
2180
if (tok->async_def) {
2181
/* We're somewhere inside an 'async def' function, and
2182
we've encountered a NEWLINE after its signature. */
2183
tok->async_def_nl = 1;
2184
}
2185
return MAKE_TOKEN(NEWLINE);
2186
}
2187
2188
/* Period or number starting with period? */
2189
if (c == '.') {
2190
c = tok_nextc(tok);
2191
if (isdigit(c)) {
2192
goto fraction;
2193
} else if (c == '.') {
2194
c = tok_nextc(tok);
2195
if (c == '.') {
2196
p_start = tok->start;
2197
p_end = tok->cur;
2198
return MAKE_TOKEN(ELLIPSIS);
2199
}
2200
else {
2201
tok_backup(tok, c);
2202
}
2203
tok_backup(tok, '.');
2204
}
2205
else {
2206
tok_backup(tok, c);
2207
}
2208
p_start = tok->start;
2209
p_end = tok->cur;
2210
return MAKE_TOKEN(DOT);
2211
}
2212
2213
/* Number */
2214
if (isdigit(c)) {
2215
if (c == '0') {
2216
/* Hex, octal or binary -- maybe. */
2217
c = tok_nextc(tok);
2218
if (c == 'x' || c == 'X') {
2219
/* Hex */
2220
c = tok_nextc(tok);
2221
do {
2222
if (c == '_') {
2223
c = tok_nextc(tok);
2224
}
2225
if (!isxdigit(c)) {
2226
tok_backup(tok, c);
2227
return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
2228
}
2229
do {
2230
c = tok_nextc(tok);
2231
} while (isxdigit(c));
2232
} while (c == '_');
2233
if (!verify_end_of_number(tok, c, "hexadecimal")) {
2234
return MAKE_TOKEN(ERRORTOKEN);
2235
}
2236
}
2237
else if (c == 'o' || c == 'O') {
2238
/* Octal */
2239
c = tok_nextc(tok);
2240
do {
2241
if (c == '_') {
2242
c = tok_nextc(tok);
2243
}
2244
if (c < '0' || c >= '8') {
2245
if (isdigit(c)) {
2246
return MAKE_TOKEN(syntaxerror(tok,
2247
"invalid digit '%c' in octal literal", c));
2248
}
2249
else {
2250
tok_backup(tok, c);
2251
return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
2252
}
2253
}
2254
do {
2255
c = tok_nextc(tok);
2256
} while ('0' <= c && c < '8');
2257
} while (c == '_');
2258
if (isdigit(c)) {
2259
return MAKE_TOKEN(syntaxerror(tok,
2260
"invalid digit '%c' in octal literal", c));
2261
}
2262
if (!verify_end_of_number(tok, c, "octal")) {
2263
return MAKE_TOKEN(ERRORTOKEN);
2264
}
2265
}
2266
else if (c == 'b' || c == 'B') {
2267
/* Binary */
2268
c = tok_nextc(tok);
2269
do {
2270
if (c == '_') {
2271
c = tok_nextc(tok);
2272
}
2273
if (c != '0' && c != '1') {
2274
if (isdigit(c)) {
2275
return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
2276
}
2277
else {
2278
tok_backup(tok, c);
2279
return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
2280
}
2281
}
2282
do {
2283
c = tok_nextc(tok);
2284
} while (c == '0' || c == '1');
2285
} while (c == '_');
2286
if (isdigit(c)) {
2287
return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
2288
}
2289
if (!verify_end_of_number(tok, c, "binary")) {
2290
return MAKE_TOKEN(ERRORTOKEN);
2291
}
2292
}
2293
else {
2294
int nonzero = 0;
2295
/* maybe old-style octal; c is first char of it */
2296
/* in any case, allow '0' as a literal */
2297
while (1) {
2298
if (c == '_') {
2299
c = tok_nextc(tok);
2300
if (!isdigit(c)) {
2301
tok_backup(tok, c);
2302
return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
2303
}
2304
}
2305
if (c != '0') {
2306
break;
2307
}
2308
c = tok_nextc(tok);
2309
}
2310
char* zeros_end = tok->cur;
2311
if (isdigit(c)) {
2312
nonzero = 1;
2313
c = tok_decimal_tail(tok);
2314
if (c == 0) {
2315
return MAKE_TOKEN(ERRORTOKEN);
2316
}
2317
}
2318
if (c == '.') {
2319
c = tok_nextc(tok);
2320
goto fraction;
2321
}
2322
else if (c == 'e' || c == 'E') {
2323
goto exponent;
2324
}
2325
else if (c == 'j' || c == 'J') {
2326
goto imaginary;
2327
}
2328
else if (nonzero && !tok->tok_extra_tokens) {
2329
/* Old-style octal: now disallowed. */
2330
tok_backup(tok, c);
2331
return MAKE_TOKEN(syntaxerror_known_range(
2332
tok, (int)(tok->start + 1 - tok->line_start),
2333
(int)(zeros_end - tok->line_start),
2334
"leading zeros in decimal integer "
2335
"literals are not permitted; "
2336
"use an 0o prefix for octal integers"));
2337
}
2338
if (!verify_end_of_number(tok, c, "decimal")) {
2339
return MAKE_TOKEN(ERRORTOKEN);
2340
}
2341
}
2342
}
2343
else {
2344
/* Decimal */
2345
c = tok_decimal_tail(tok);
2346
if (c == 0) {
2347
return MAKE_TOKEN(ERRORTOKEN);
2348
}
2349
{
2350
/* Accept floating point numbers. */
2351
if (c == '.') {
2352
c = tok_nextc(tok);
2353
fraction:
2354
/* Fraction */
2355
if (isdigit(c)) {
2356
c = tok_decimal_tail(tok);
2357
if (c == 0) {
2358
return MAKE_TOKEN(ERRORTOKEN);
2359
}
2360
}
2361
}
2362
if (c == 'e' || c == 'E') {
2363
int e;
2364
exponent:
2365
e = c;
2366
/* Exponent part */
2367
c = tok_nextc(tok);
2368
if (c == '+' || c == '-') {
2369
c = tok_nextc(tok);
2370
if (!isdigit(c)) {
2371
tok_backup(tok, c);
2372
return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
2373
}
2374
} else if (!isdigit(c)) {
2375
tok_backup(tok, c);
2376
if (!verify_end_of_number(tok, e, "decimal")) {
2377
return MAKE_TOKEN(ERRORTOKEN);
2378
}
2379
tok_backup(tok, e);
2380
p_start = tok->start;
2381
p_end = tok->cur;
2382
return MAKE_TOKEN(NUMBER);
2383
}
2384
c = tok_decimal_tail(tok);
2385
if (c == 0) {
2386
return MAKE_TOKEN(ERRORTOKEN);
2387
}
2388
}
2389
if (c == 'j' || c == 'J') {
2390
/* Imaginary part */
2391
imaginary:
2392
c = tok_nextc(tok);
2393
if (!verify_end_of_number(tok, c, "imaginary")) {
2394
return MAKE_TOKEN(ERRORTOKEN);
2395
}
2396
}
2397
else if (!verify_end_of_number(tok, c, "decimal")) {
2398
return MAKE_TOKEN(ERRORTOKEN);
2399
}
2400
}
2401
}
2402
tok_backup(tok, c);
2403
p_start = tok->start;
2404
p_end = tok->cur;
2405
return MAKE_TOKEN(NUMBER);
2406
}
2407
2408
f_string_quote:
2409
if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
2410
int quote = c;
2411
int quote_size = 1; /* 1 or 3 */
2412
2413
/* Nodes of type STRING, especially multi line strings
2414
must be handled differently in order to get both
2415
the starting line number and the column offset right.
2416
(cf. issue 16806) */
2417
tok->first_lineno = tok->lineno;
2418
tok->multi_line_start = tok->line_start;
2419
2420
/* Find the quote size and start of string */
2421
int after_quote = tok_nextc(tok);
2422
if (after_quote == quote) {
2423
int after_after_quote = tok_nextc(tok);
2424
if (after_after_quote == quote) {
2425
quote_size = 3;
2426
}
2427
else {
2428
// TODO: Check this
2429
tok_backup(tok, after_after_quote);
2430
tok_backup(tok, after_quote);
2431
}
2432
}
2433
if (after_quote != quote) {
2434
tok_backup(tok, after_quote);
2435
}
2436
2437
2438
p_start = tok->start;
2439
p_end = tok->cur;
2440
if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
2441
return MAKE_TOKEN(syntaxerror(tok, "too many nested f-strings"));
2442
}
2443
tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
2444
the_current_tok->kind = TOK_FSTRING_MODE;
2445
the_current_tok->f_string_quote = quote;
2446
the_current_tok->f_string_quote_size = quote_size;
2447
the_current_tok->f_string_start = tok->start;
2448
the_current_tok->f_string_multi_line_start = tok->line_start;
2449
the_current_tok->f_string_line_start = tok->lineno;
2450
the_current_tok->f_string_start_offset = -1;
2451
the_current_tok->f_string_multi_line_start_offset = -1;
2452
the_current_tok->last_expr_buffer = NULL;
2453
the_current_tok->last_expr_size = 0;
2454
the_current_tok->last_expr_end = -1;
2455
the_current_tok->f_string_debug = 0;
2456
2457
switch (*tok->start) {
2458
case 'F':
2459
case 'f':
2460
the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
2461
break;
2462
case 'R':
2463
case 'r':
2464
the_current_tok->f_string_raw = 1;
2465
break;
2466
default:
2467
Py_UNREACHABLE();
2468
}
2469
2470
the_current_tok->curly_bracket_depth = 0;
2471
the_current_tok->curly_bracket_expr_start_depth = -1;
2472
return MAKE_TOKEN(FSTRING_START);
2473
}
2474
2475
letter_quote:
2476
/* String */
2477
if (c == '\'' || c == '"') {
2478
int quote = c;
2479
int quote_size = 1; /* 1 or 3 */
2480
int end_quote_size = 0;
2481
2482
/* Nodes of type STRING, especially multi line strings
2483
must be handled differently in order to get both
2484
the starting line number and the column offset right.
2485
(cf. issue 16806) */
2486
tok->first_lineno = tok->lineno;
2487
tok->multi_line_start = tok->line_start;
2488
2489
/* Find the quote size and start of string */
2490
c = tok_nextc(tok);
2491
if (c == quote) {
2492
c = tok_nextc(tok);
2493
if (c == quote) {
2494
quote_size = 3;
2495
}
2496
else {
2497
end_quote_size = 1; /* empty string found */
2498
}
2499
}
2500
if (c != quote) {
2501
tok_backup(tok, c);
2502
}
2503
2504
/* Get rest of string */
2505
while (end_quote_size != quote_size) {
2506
c = tok_nextc(tok);
2507
if (tok->done == E_ERROR) {
2508
return MAKE_TOKEN(ERRORTOKEN);
2509
}
2510
if (tok->done == E_DECODE) {
2511
break;
2512
}
2513
if (c == EOF || (quote_size == 1 && c == '\n')) {
2514
assert(tok->multi_line_start != NULL);
2515
// shift the tok_state's location into
2516
// the start of string, and report the error
2517
// from the initial quote character
2518
tok->cur = (char *)tok->start;
2519
tok->cur++;
2520
tok->line_start = tok->multi_line_start;
2521
int start = tok->lineno;
2522
tok->lineno = tok->first_lineno;
2523
2524
if (INSIDE_FSTRING(tok)) {
2525
/* When we are in an f-string, before raising the
2526
* unterminated string literal error, check whether
2527
* does the initial quote matches with f-strings quotes
2528
* and if it is, then this must be a missing '}' token
2529
* so raise the proper error */
2530
tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
2531
if (the_current_tok->f_string_quote == quote &&
2532
the_current_tok->f_string_quote_size == quote_size) {
2533
return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start));
2534
}
2535
}
2536
2537
if (quote_size == 3) {
2538
syntaxerror(tok, "unterminated triple-quoted string literal"
2539
" (detected at line %d)", start);
2540
if (c != '\n') {
2541
tok->done = E_EOFS;
2542
}
2543
return MAKE_TOKEN(ERRORTOKEN);
2544
}
2545
else {
2546
syntaxerror(tok, "unterminated string literal (detected at"
2547
" line %d)", start);
2548
if (c != '\n') {
2549
tok->done = E_EOLS;
2550
}
2551
return MAKE_TOKEN(ERRORTOKEN);
2552
}
2553
}
2554
if (c == quote) {
2555
end_quote_size += 1;
2556
}
2557
else {
2558
end_quote_size = 0;
2559
if (c == '\\') {
2560
c = tok_nextc(tok); /* skip escaped char */
2561
if (c == '\r') {
2562
c = tok_nextc(tok);
2563
}
2564
}
2565
}
2566
}
2567
2568
p_start = tok->start;
2569
p_end = tok->cur;
2570
return MAKE_TOKEN(STRING);
2571
}
2572
2573
/* Line continuation */
2574
if (c == '\\') {
2575
if ((c = tok_continuation_line(tok)) == -1) {
2576
return MAKE_TOKEN(ERRORTOKEN);
2577
}
2578
tok->cont_line = 1;
2579
goto again; /* Read next line */
2580
}
2581
2582
/* Punctuation character */
2583
int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
2584
if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
2585
/* This code block gets executed before the curly_bracket_depth is incremented
2586
* by the `{` case, so for ensuring that we are on the 0th level, we need
2587
* to adjust it manually */
2588
int cursor = current_tok->curly_bracket_depth - (c != '{');
2589
if (cursor == 0 && !update_fstring_expr(tok, c)) {
2590
return MAKE_TOKEN(ENDMARKER);
2591
}
2592
if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
2593
return MAKE_TOKEN(ERRORTOKEN);
2594
}
2595
2596
if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
2597
current_tok->kind = TOK_FSTRING_MODE;
2598
p_start = tok->start;
2599
p_end = tok->cur;
2600
return MAKE_TOKEN(_PyToken_OneChar(c));
2601
}
2602
}
2603
2604
/* Check for two-character token */
2605
{
2606
int c2 = tok_nextc(tok);
2607
int current_token = _PyToken_TwoChars(c, c2);
2608
if (current_token != OP) {
2609
int c3 = tok_nextc(tok);
2610
int current_token3 = _PyToken_ThreeChars(c, c2, c3);
2611
if (current_token3 != OP) {
2612
current_token = current_token3;
2613
}
2614
else {
2615
tok_backup(tok, c3);
2616
}
2617
p_start = tok->start;
2618
p_end = tok->cur;
2619
return MAKE_TOKEN(current_token);
2620
}
2621
tok_backup(tok, c2);
2622
}
2623
2624
/* Keep track of parentheses nesting level */
2625
switch (c) {
2626
case '(':
2627
case '[':
2628
case '{':
2629
if (tok->level >= MAXLEVEL) {
2630
return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
2631
}
2632
tok->parenstack[tok->level] = c;
2633
tok->parenlinenostack[tok->level] = tok->lineno;
2634
tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2635
tok->level++;
2636
if (INSIDE_FSTRING(tok)) {
2637
current_tok->curly_bracket_depth++;
2638
}
2639
break;
2640
case ')':
2641
case ']':
2642
case '}':
2643
if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
2644
return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
2645
}
2646
if (!tok->tok_extra_tokens && !tok->level) {
2647
return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
2648
}
2649
if (tok->level > 0) {
2650
tok->level--;
2651
int opening = tok->parenstack[tok->level];
2652
if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
2653
(opening == '[' && c == ']') ||
2654
(opening == '{' && c == '}'))) {
2655
/* If the opening bracket belongs to an f-string's expression
2656
part (e.g. f"{)}") and the closing bracket is an arbitrary
2657
nested expression, then instead of matching a different
2658
syntactical construct with it; we'll throw an unmatched
2659
parentheses error. */
2660
if (INSIDE_FSTRING(tok) && opening == '{') {
2661
assert(current_tok->curly_bracket_depth >= 0);
2662
int previous_bracket = current_tok->curly_bracket_depth - 1;
2663
if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
2664
return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
2665
}
2666
}
2667
if (tok->parenlinenostack[tok->level] != tok->lineno) {
2668
return MAKE_TOKEN(syntaxerror(tok,
2669
"closing parenthesis '%c' does not match "
2670
"opening parenthesis '%c' on line %d",
2671
c, opening, tok->parenlinenostack[tok->level]));
2672
}
2673
else {
2674
return MAKE_TOKEN(syntaxerror(tok,
2675
"closing parenthesis '%c' does not match "
2676
"opening parenthesis '%c'",
2677
c, opening));
2678
}
2679
}
2680
}
2681
2682
if (INSIDE_FSTRING(tok)) {
2683
current_tok->curly_bracket_depth--;
2684
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
2685
current_tok->curly_bracket_expr_start_depth--;
2686
current_tok->kind = TOK_FSTRING_MODE;
2687
current_tok->f_string_debug = 0;
2688
}
2689
}
2690
break;
2691
default:
2692
break;
2693
}
2694
2695
if (!Py_UNICODE_ISPRINTABLE(c)) {
2696
return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c));
2697
}
2698
2699
if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
2700
current_tok->f_string_debug = 1;
2701
}
2702
2703
/* Punctuation character */
2704
p_start = tok->start;
2705
p_end = tok->cur;
2706
return MAKE_TOKEN(_PyToken_OneChar(c));
2707
}
2708
2709
/* Tokenize the next token while the tokenizer is in f-string mode
   (TOK_FSTRING_MODE), i.e. between the opening quote(s) of an f-string and
   its closing quote(s), outside of a replacement expression.

   Emits:
     - FSTRING_MIDDLE for a run of literal text (and for text up to a '{'
       or after a '}'),
     - FSTRING_END when the closing quote(s) are consumed (also pops the
       f-string mode off the mode stack),
     - or defers to tok_get_normal_mode() after switching the current mode
       back to TOK_REGULAR_MODE when a '{' opens a replacement expression.

   Errors are reported via syntaxerror()/ERRORTOKEN. */
static int
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    const char *p_start = NULL;
    const char *p_end = NULL;
    int end_quote_size = 0;
    /* Set while scanning a "\N{...}" named-escape so that its '}' is kept
       as literal text instead of being treated as expression punctuation. */
    int unicode_escape = 0;

    tok->start = tok->cur;
    tok->first_lineno = tok->lineno;
    tok->starting_col_offset = tok->col_offset;

    // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
    // before it.
    int start_char = tok_nextc(tok);
    if (start_char == '{') {
        int peek1 = tok_nextc(tok);
        /* Both characters are pushed back: the normal-mode tokenizer (or the
           literal-text loop below) re-reads them from the buffer. */
        tok_backup(tok, peek1);
        tok_backup(tok, start_char);
        if (peek1 != '{') {
            /* Single '{' starts a replacement expression. */
            current_tok->curly_bracket_expr_start_depth++;
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
            }
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
            return tok_get_normal_mode(tok, current_tok, token);
        }
    }
    else {
        tok_backup(tok, start_char);
    }

    // Check if we are at the end of the string
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        int quote = tok_nextc(tok);
        if (quote != current_tok->f_string_quote) {
            /* Not the terminator; put the char back and scan literal text. */
            tok_backup(tok, quote);
            goto f_string_middle;
        }
    }

    /* Closing quote(s) consumed: release the cached expression buffer used
       for '=' debug specifiers before popping this f-string mode. */
    if (current_tok->last_expr_buffer != NULL) {
        PyMem_Free(current_tok->last_expr_buffer);
        current_tok->last_expr_buffer = NULL;
        current_tok->last_expr_size = 0;
        current_tok->last_expr_end = -1;
    }

    p_start = tok->start;
    p_end = tok->cur;
    tok->tok_mode_stack_index--;
    return MAKE_TOKEN(FSTRING_END);

f_string_middle:

    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
    // this.
    tok->multi_line_start = tok->line_start;
    /* Accumulate literal text until we have seen quote_size consecutive
       closing quotes, hit a brace, or run out of input. */
    while (end_quote_size != current_tok->f_string_quote_size) {
        int c = tok_nextc(tok);
        if (tok->done == E_ERROR) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        /* EOF, or a bare newline inside a single-quoted f-string,
           means the literal is unterminated. */
        if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) {
            if (tok->decoding_erred) {
                return MAKE_TOKEN(ERRORTOKEN);
            }

            assert(tok->multi_line_start != NULL);
            // shift the tok_state's location into
            // the start of string, and report the error
            // from the initial quote character
            tok->cur = (char *)current_tok->f_string_start;
            tok->cur++;
            tok->line_start = current_tok->f_string_multi_line_start;
            int start = tok->lineno;

            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
            tok->lineno = the_current_tok->f_string_line_start;

            if (current_tok->f_string_quote_size == 3) {
                return MAKE_TOKEN(syntaxerror(tok,
                                    "unterminated triple-quoted f-string literal"
                                    " (detected at line %d)", start));
            }
            else {
                return MAKE_TOKEN(syntaxerror(tok,
                                    "unterminated f-string literal (detected at"
                                    " line %d)", start));
            }
        }

        if (c == current_tok->f_string_quote) {
            end_quote_size += 1;
            continue;
        } else {
            end_quote_size = 0;
        }

        /* We are in the format-spec part (after ':') if an expression has
           already been scanned and a replacement field is still open. */
        int in_format_spec = (
                current_tok->last_expr_end != -1
                &&
                INSIDE_FSTRING_EXPR(current_tok)
        );
        if (c == '{') {
            int peek = tok_nextc(tok);
            if (peek != '{' || in_format_spec) {
                /* Single '{' (or any '{' inside a format spec): end the
                   literal run here and switch to expression scanning. */
                tok_backup(tok, peek);
                tok_backup(tok, c);
                current_tok->curly_bracket_expr_start_depth++;
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                    return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
                }
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                p_start = tok->start;
                p_end = tok->cur;
            } else {
                /* "{{" escape: include only one '{' in the token text. */
                p_start = tok->start;
                p_end = tok->cur - 1;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '}') {
            if (unicode_escape) {
                /* This '}' closes a "\N{...}" escape, not an expression. */
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FSTRING_MIDDLE);
            }
            int peek = tok_nextc(tok);

            // The tokenizer can only be in the format spec if we have already completed the expression
            // scanning (indicated by the end of the expression being set) and we are not at the top level
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
            // brackets, we can bypass it here.
            if (peek == '}' && !in_format_spec) {
                /* "}}" escape: include only one '}' in the token text. */
                p_start = tok->start;
                p_end = tok->cur - 1;
            } else {
                /* Closing '}' of a replacement field: hand it back to the
                   normal-mode tokenizer. */
                tok_backup(tok, peek);
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                p_start = tok->start;
                p_end = tok->cur;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '\\') {
            int peek = tok_nextc(tok);
            if (peek == '\r') {
                peek = tok_nextc(tok);
            }
            // Special case when the backslash is right before a curly
            // brace. We have to restore and return the control back
            // to the loop for the next iteration.
            if (peek == '{' || peek == '}') {
                if (!current_tok->f_string_raw) {
                    if (warn_invalid_escape_sequence(tok, peek)) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                tok_backup(tok, peek);
                continue;
            }

            if (!current_tok->f_string_raw) {
                if (peek == 'N') {
                    /* Handle named unicode escapes (\N{BULLET}) */
                    peek = tok_nextc(tok);
                    if (peek == '{') {
                        unicode_escape = 1;
                    } else {
                        tok_backup(tok, peek);
                    }
                }
            } /* else {
                skip the escaped character
            }*/
        }
    }

    // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        tok_backup(tok, current_tok->f_string_quote);
    }
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(FSTRING_MIDDLE);
}
2896
2897
2898
static int
2899
tok_get(struct tok_state *tok, struct token *token)
2900
{
2901
tokenizer_mode *current_tok = TOK_GET_MODE(tok);
2902
if (current_tok->kind == TOK_REGULAR_MODE) {
2903
return tok_get_normal_mode(tok, current_tok, token);
2904
} else {
2905
return tok_get_fstring_mode(tok, current_tok, token);
2906
}
2907
}
2908
2909
int
2910
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
2911
{
2912
int result = tok_get(tok, token);
2913
if (tok->decoding_erred) {
2914
result = ERRORTOKEN;
2915
tok->done = E_DECODE;
2916
}
2917
return result;
2918
}
2919
2920
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
// The union smuggles the int fd through fopencookie()'s void* cookie slot
// without an integer-to-pointer cast.
typedef union {
    void *cookie;
    int fd;
} borrowed;

// read(2) shim used as the fopencookie() read callback: recovers the fd
// from the cookie and forwards the read.
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

// Returns a FILE* that reads from fd without taking ownership of it;
// closing the stream leaves the caller's fd open.
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
// Portable variant: duplicate the fd so that fclose() on the returned
// stream closes only the duplicate, leaving the caller's fd open.
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
2952
2953
/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.

   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
   encoding in the first or second line of the file (in which case the encoding
   should be assumed to be UTF-8).

   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
   by the caller. */

char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
    struct tok_state *tok;
    FILE *fp;
    char *encoding = NULL;

    /* Borrowed stream: the caller's fd stays open after fclose() below. */
    fp = fdopen_borrow(fd);
    if (fp == NULL) {
        return NULL;
    }
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return NULL;
    }
    if (filename != NULL) {
        tok->filename = Py_NewRef(filename);
    }
    else {
        tok->filename = PyUnicode_FromString("<string>");
        if (tok->filename == NULL) {
            fclose(fp);
            _PyTokenizer_Free(tok);
            return encoding;  /* still NULL here */
        }
    }
    struct token token;
    // We don't want to report warnings here because it could cause infinite recursion
    // if fetching the encoding shows a warning.
    tok->report_warnings = 0;
    /* Tokenize at most the first two lines: a coding cookie can only appear
       on line 1 or 2 (see the function comment above). The side effect we
       want is tok->encoding being filled in. */
    while (tok->lineno < 2 && tok->done == E_OK) {
        _PyToken_Init(&token);
        _PyTokenizer_Get(tok, &token);
        _PyToken_Free(&token);
    }
    fclose(fp);
    if (tok->encoding) {
        /* Copy out before freeing the tokenizer, which owns tok->encoding. */
        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
        if (encoding) {
            strcpy(encoding, tok->encoding);
        }
    }
    _PyTokenizer_Free(tok);
    return encoding;
}
3009
3010
#ifdef Py_DEBUG
/* Debug helper: print a token's name to stderr, followed by its source
   text in parentheses for the token kinds that carry meaningful text. */
void
tok_dump(int type, char *start, char *end)
{
    fputs(_PyParser_TokenNames[type], stderr);
    int has_text = (type == NAME || type == NUMBER
                    || type == STRING || type == OP);
    if (has_text) {
        fprintf(stderr, "(%.*s)", (int)(end - start), start);
    }
}
#endif // Py_DEBUG
3019
3020