CoCalc -- string

GitHub Repository: allendowney/cpython
Path: blob/main/Parser/string_parser.c
¹² views
1
#include <stdbool.h>
2

3
#include <Python.h>
4

5
#include "tokenizer.h"
6
#include "pegen.h"
7
#include "string_parser.h"
8

9
//// STRING HANDLING FUNCTIONS ////
10

11
static int
12
warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13
{
14
    unsigned char c = *first_invalid_escape;
15
    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {  // in this case the tokenizer has already emitted a warning,
16
                                                                                            // see tokenizer.c:warn_invalid_escape_sequence
17
        return 0;
18
    }
19

20
    int octal = ('4' <= c && c <= '7');
21
    PyObject *msg =
22
        octal
23
        ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
24
                               first_invalid_escape)
25
        : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
26
    if (msg == NULL) {
27
        return -1;
28
    }
29
    PyObject *category;
30
    if (p->feature_version >= 12) {
31
        category = PyExc_SyntaxWarning;
32
    }
33
    else {
34
        category = PyExc_DeprecationWarning;
35
    }
36
    if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
37
                                 t->lineno, NULL, NULL) < 0) {
38
        if (PyErr_ExceptionMatches(category)) {
39
            /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
40
               to get a more accurate error report */
41
            PyErr_Clear();
42

43
            /* This is needed, in order for the SyntaxError to point to the token t,
44
               since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
45
               error location, if p->known_err_token is not set. */
46
            p->known_err_token = t;
47
            if (octal) {
48
                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
49
                                   first_invalid_escape);
50
            }
51
            else {
52
                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
53
            }
54
        }
55
        Py_DECREF(msg);
56
        return -1;
57
    }
58
    Py_DECREF(msg);
59
    return 0;
60
}
61

62
/* Decode the leading run of bytes at *sPtr that have the high bit (0x80)
   set, stopping at `end` or at the first byte without that bit.

   On return *sPtr points just past the consumed run.  The run is decoded
   with PyUnicode_DecodeUTF8 and that call's result is returned unchanged
   (so NULL signals a decoding failure). */
static PyObject *
decode_utf8(const char **sPtr, const char *end)
{
    const char *start = *sPtr;
    const char *cursor = start;

    for (; cursor < end && (*cursor & 0x80); cursor++) {
        /* advance over every byte with the high bit set */
    }

    *sPtr = cursor;
    return PyUnicode_DecodeUTF8(start, cursor - start, NULL);
}
74

75
static PyObject *
76
decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
77
{
78
    PyObject *v;
79
    PyObject *u;
80
    char *buf;
81
    char *p;
82
    const char *end;
83

84
    /* check for integer overflow */
85
    if (len > SIZE_MAX / 6) {
86
        return NULL;
87
    }
88
    /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
89
       "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
90
    u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
91
    if (u == NULL) {
92
        return NULL;
93
    }
94
    p = buf = PyBytes_AsString(u);
95
    if (p == NULL) {
96
        return NULL;
97
    }
98
    end = s + len;
99
    while (s < end) {
100
        if (*s == '\\') {
101
            *p++ = *s++;
102
            if (s >= end || *s & 0x80) {
103
                strcpy(p, "u005c");
104
                p += 5;
105
                if (s >= end) {
106
                    break;
107
                }
108
            }
109
        }
110
        if (*s & 0x80) {
111
            PyObject *w;
112
            int kind;
113
            const void *data;
114
            Py_ssize_t w_len;
115
            Py_ssize_t i;
116
            w = decode_utf8(&s, end);
117
            if (w == NULL) {
118
                Py_DECREF(u);
119
                return NULL;
120
            }
121
            kind = PyUnicode_KIND(w);
122
            data = PyUnicode_DATA(w);
123
            w_len = PyUnicode_GET_LENGTH(w);
124
            for (i = 0; i < w_len; i++) {
125
                Py_UCS4 chr = PyUnicode_READ(kind, data, i);
126
                sprintf(p, "\\U%08x", chr);
127
                p += 10;
128
            }
129
            /* Should be impossible to overflow */
130
            assert(p - buf <= PyBytes_GET_SIZE(u));
131
            Py_DECREF(w);
132
        }
133
        else {
134
            *p++ = *s++;
135
        }
136
    }
137
    len = p - buf;
138
    s = buf;
139

140
    const char *first_invalid_escape;
141
    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
142

143
    // HACK: later we can simply pass the line no, since we don't preserve the tokens
144
    // when we are decoding the string but we preserve the line numbers.
145
    if (v != NULL && first_invalid_escape != NULL && t != NULL) {
146
        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
147
            /* We have not decref u before because first_invalid_escape points
148
               inside u. */
149
            Py_XDECREF(u);
150
            Py_DECREF(v);
151
            return NULL;
152
        }
153
    }
154
    Py_XDECREF(u);
155
    return v;
156
}
157

158
static PyObject *
159
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
160
{
161
    const char *first_invalid_escape;
162
    PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
163
    if (result == NULL) {
164
        return NULL;
165
    }
166

167
    if (first_invalid_escape != NULL) {
168
        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
169
            Py_DECREF(result);
170
            return NULL;
171
        }
172
    }
173
    return result;
174
}
175

176
PyObject *
177
_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
178
{
179
    if (raw) {
180
        return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
181
    }
182
    return decode_unicode_with_escapes(p, s, len, t);
183
}
184

185
/* s must include the bracketing quote characters, and r, b &/or f prefixes
186
    (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
187
   _PyPegen_parse_string parses it, and returns the decoded Python string object. */
188
PyObject *
189
_PyPegen_parse_string(Parser *p, Token *t)
190
{
191
    const char *s = PyBytes_AsString(t->bytes);
192
    if (s == NULL) {
193
        return NULL;
194
    }
195

196
    size_t len;
197
    int quote = Py_CHARMASK(*s);
198
    int bytesmode = 0;
199
    int rawmode = 0;
200

201
    if (Py_ISALPHA(quote)) {
202
        while (!bytesmode || !rawmode) {
203
            if (quote == 'b' || quote == 'B') {
204
                quote =(unsigned char)*++s;
205
                bytesmode = 1;
206
            }
207
            else if (quote == 'u' || quote == 'U') {
208
                quote = (unsigned char)*++s;
209
            }
210
            else if (quote == 'r' || quote == 'R') {
211
                quote = (unsigned char)*++s;
212
                rawmode = 1;
213
            }
214
            else {
215
                break;
216
            }
217
        }
218
    }
219

220
    if (quote != '\'' && quote != '\"') {
221
        PyErr_BadInternalCall();
222
        return NULL;
223
    }
224
    /* Skip the leading quote char. */
225
    s++;
226
    len = strlen(s);
227
    if (len > INT_MAX) {
228
        PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
229
        return NULL;
230
    }
231
    if (s[--len] != quote) {
232
        /* Last quote char must match the first. */
233
        PyErr_BadInternalCall();
234
        return NULL;
235
    }
236
    if (len >= 4 && s[0] == quote && s[1] == quote) {
237
        /* A triple quoted string. We've already skipped one quote at
238
           the start and one at the end of the string. Now skip the
239
           two at the start. */
240
        s += 2;
241
        len -= 2;
242
        /* And check that the last two match. */
243
        if (s[--len] != quote || s[--len] != quote) {
244
            PyErr_BadInternalCall();
245
            return NULL;
246
        }
247
    }
248

249
    /* Avoid invoking escape decoding routines if possible. */
250
    rawmode = rawmode || strchr(s, '\\') == NULL;
251
    if (bytesmode) {
252
        /* Disallow non-ASCII characters. */
253
        const char *ch;
254
        for (ch = s; *ch; ch++) {
255
            if (Py_CHARMASK(*ch) >= 0x80) {
256
                RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
257
                                   t,
258
                                   "bytes can only contain ASCII "
259
                                   "literal characters");
260
                return NULL;
261
            }
262
        }
263
        if (rawmode) {
264
            return PyBytes_FromStringAndSize(s, len);
265
        }
266
        return decode_bytes_with_escapes(p, s, len, t);
267
    }
268
    return _PyPegen_decode_string(p, rawmode, s, len, t);
269
}
270

271
Product

Resources

Company