/* GitHub Repository: allendowney/cpython
   Path: blob/main/Parser/string_parser.c */
1
#include <stdbool.h>
2
3
#include <Python.h>
4
5
#include "tokenizer.h"
6
#include "pegen.h"
7
#include "string_parser.h"
8
9
//// STRING HANDLING FUNCTIONS ////
10
11
static int
12
warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13
{
14
unsigned char c = *first_invalid_escape;
15
if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning,
16
// see tokenizer.c:warn_invalid_escape_sequence
17
return 0;
18
}
19
20
int octal = ('4' <= c && c <= '7');
21
PyObject *msg =
22
octal
23
? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
24
first_invalid_escape)
25
: PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
26
if (msg == NULL) {
27
return -1;
28
}
29
PyObject *category;
30
if (p->feature_version >= 12) {
31
category = PyExc_SyntaxWarning;
32
}
33
else {
34
category = PyExc_DeprecationWarning;
35
}
36
if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
37
t->lineno, NULL, NULL) < 0) {
38
if (PyErr_ExceptionMatches(category)) {
39
/* Replace the Syntax/DeprecationWarning exception with a SyntaxError
40
to get a more accurate error report */
41
PyErr_Clear();
42
43
/* This is needed, in order for the SyntaxError to point to the token t,
44
since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
45
error location, if p->known_err_token is not set. */
46
p->known_err_token = t;
47
if (octal) {
48
RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
49
first_invalid_escape);
50
}
51
else {
52
RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
53
}
54
}
55
Py_DECREF(msg);
56
return -1;
57
}
58
Py_DECREF(msg);
59
return 0;
60
}
61
62
static PyObject *
63
decode_utf8(const char **sPtr, const char *end)
64
{
65
const char *s;
66
const char *t;
67
t = s = *sPtr;
68
while (s < end && (*s & 0x80)) {
69
s++;
70
}
71
*sPtr = s;
72
return PyUnicode_DecodeUTF8(t, s - t, NULL);
73
}
74
75
static PyObject *
76
decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
77
{
78
PyObject *v;
79
PyObject *u;
80
char *buf;
81
char *p;
82
const char *end;
83
84
/* check for integer overflow */
85
if (len > SIZE_MAX / 6) {
86
return NULL;
87
}
88
/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
89
"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
90
u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
91
if (u == NULL) {
92
return NULL;
93
}
94
p = buf = PyBytes_AsString(u);
95
if (p == NULL) {
96
return NULL;
97
}
98
end = s + len;
99
while (s < end) {
100
if (*s == '\\') {
101
*p++ = *s++;
102
if (s >= end || *s & 0x80) {
103
strcpy(p, "u005c");
104
p += 5;
105
if (s >= end) {
106
break;
107
}
108
}
109
}
110
if (*s & 0x80) {
111
PyObject *w;
112
int kind;
113
const void *data;
114
Py_ssize_t w_len;
115
Py_ssize_t i;
116
w = decode_utf8(&s, end);
117
if (w == NULL) {
118
Py_DECREF(u);
119
return NULL;
120
}
121
kind = PyUnicode_KIND(w);
122
data = PyUnicode_DATA(w);
123
w_len = PyUnicode_GET_LENGTH(w);
124
for (i = 0; i < w_len; i++) {
125
Py_UCS4 chr = PyUnicode_READ(kind, data, i);
126
sprintf(p, "\\U%08x", chr);
127
p += 10;
128
}
129
/* Should be impossible to overflow */
130
assert(p - buf <= PyBytes_GET_SIZE(u));
131
Py_DECREF(w);
132
}
133
else {
134
*p++ = *s++;
135
}
136
}
137
len = p - buf;
138
s = buf;
139
140
const char *first_invalid_escape;
141
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
142
143
// HACK: later we can simply pass the line no, since we don't preserve the tokens
144
// when we are decoding the string but we preserve the line numbers.
145
if (v != NULL && first_invalid_escape != NULL && t != NULL) {
146
if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
147
/* We have not decref u before because first_invalid_escape points
148
inside u. */
149
Py_XDECREF(u);
150
Py_DECREF(v);
151
return NULL;
152
}
153
}
154
Py_XDECREF(u);
155
return v;
156
}
157
158
static PyObject *
159
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
160
{
161
const char *first_invalid_escape;
162
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
163
if (result == NULL) {
164
return NULL;
165
}
166
167
if (first_invalid_escape != NULL) {
168
if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
169
Py_DECREF(result);
170
return NULL;
171
}
172
}
173
return result;
174
}
175
176
PyObject *
177
_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
178
{
179
if (raw) {
180
return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
181
}
182
return decode_unicode_with_escapes(p, s, len, t);
183
}
184
185
/* s must include the bracketing quote characters, and r, b &/or f prefixes
186
(if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
187
_PyPegen_parse_string parses it, and returns the decoded Python string object. */
188
PyObject *
189
_PyPegen_parse_string(Parser *p, Token *t)
190
{
191
const char *s = PyBytes_AsString(t->bytes);
192
if (s == NULL) {
193
return NULL;
194
}
195
196
size_t len;
197
int quote = Py_CHARMASK(*s);
198
int bytesmode = 0;
199
int rawmode = 0;
200
201
if (Py_ISALPHA(quote)) {
202
while (!bytesmode || !rawmode) {
203
if (quote == 'b' || quote == 'B') {
204
quote =(unsigned char)*++s;
205
bytesmode = 1;
206
}
207
else if (quote == 'u' || quote == 'U') {
208
quote = (unsigned char)*++s;
209
}
210
else if (quote == 'r' || quote == 'R') {
211
quote = (unsigned char)*++s;
212
rawmode = 1;
213
}
214
else {
215
break;
216
}
217
}
218
}
219
220
if (quote != '\'' && quote != '\"') {
221
PyErr_BadInternalCall();
222
return NULL;
223
}
224
/* Skip the leading quote char. */
225
s++;
226
len = strlen(s);
227
if (len > INT_MAX) {
228
PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
229
return NULL;
230
}
231
if (s[--len] != quote) {
232
/* Last quote char must match the first. */
233
PyErr_BadInternalCall();
234
return NULL;
235
}
236
if (len >= 4 && s[0] == quote && s[1] == quote) {
237
/* A triple quoted string. We've already skipped one quote at
238
the start and one at the end of the string. Now skip the
239
two at the start. */
240
s += 2;
241
len -= 2;
242
/* And check that the last two match. */
243
if (s[--len] != quote || s[--len] != quote) {
244
PyErr_BadInternalCall();
245
return NULL;
246
}
247
}
248
249
/* Avoid invoking escape decoding routines if possible. */
250
rawmode = rawmode || strchr(s, '\\') == NULL;
251
if (bytesmode) {
252
/* Disallow non-ASCII characters. */
253
const char *ch;
254
for (ch = s; *ch; ch++) {
255
if (Py_CHARMASK(*ch) >= 0x80) {
256
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
257
t,
258
"bytes can only contain ASCII "
259
"literal characters");
260
return NULL;
261
}
262
}
263
if (rawmode) {
264
return PyBytes_FromStringAndSize(s, len);
265
}
266
return decode_bytes_with_escapes(p, s, len, t);
267
}
268
return _PyPegen_decode_string(p, rawmode, s, len, t);
269
}
270
271