Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Objects/stringlib/split.h
12 views
1
/* stringlib: split implementation */
2
3
#ifndef STRINGLIB_FASTSEARCH_H
4
#error must include "stringlib/fastsearch.h" before including this module
5
#endif
6
7
/* Overallocate the initial list to reduce the number of reallocs for small
8
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
9
resizes, to sizes 4, 8, then 16. Most observed string splits are for human
10
text (roughly 11 words per line) and field delimited data (usually 1-10
11
fields). For large strings the split algorithms are bandwidth limited
12
so increasing the preallocation likely will not improve things.*/
13
14
#define MAX_PREALLOC 12
15
16
/* 5 splits gives 6 elements */
17
#define PREALLOC_SIZE(maxsplit) \
18
(maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
19
20
#define SPLIT_APPEND(data, left, right) \
21
sub = STRINGLIB_NEW((data) + (left), \
22
(right) - (left)); \
23
if (sub == NULL) \
24
goto onError; \
25
if (PyList_Append(list, sub)) { \
26
Py_DECREF(sub); \
27
goto onError; \
28
} \
29
else \
30
Py_DECREF(sub);
31
32
#define SPLIT_ADD(data, left, right) { \
33
sub = STRINGLIB_NEW((data) + (left), \
34
(right) - (left)); \
35
if (sub == NULL) \
36
goto onError; \
37
if (count < MAX_PREALLOC) { \
38
PyList_SET_ITEM(list, count, sub); \
39
} else { \
40
if (PyList_Append(list, sub)) { \
41
Py_DECREF(sub); \
42
goto onError; \
43
} \
44
else \
45
Py_DECREF(sub); \
46
} \
47
count++; }
48
49
50
/* Always force the list to the expected size. */
51
#define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
52
53
Py_LOCAL_INLINE(PyObject *)
54
STRINGLIB(split_whitespace)(PyObject* str_obj,
55
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
56
Py_ssize_t maxcount)
57
{
58
Py_ssize_t i, j, count=0;
59
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
60
PyObject *sub;
61
62
if (list == NULL)
63
return NULL;
64
65
i = j = 0;
66
while (maxcount-- > 0) {
67
while (i < str_len && STRINGLIB_ISSPACE(str[i]))
68
i++;
69
if (i == str_len) break;
70
j = i; i++;
71
while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
72
i++;
73
#if !STRINGLIB_MUTABLE
74
if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
75
/* No whitespace in str_obj, so just use it as list[0] */
76
Py_INCREF(str_obj);
77
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
78
count++;
79
break;
80
}
81
#endif
82
SPLIT_ADD(str, j, i);
83
}
84
85
if (i < str_len) {
86
/* Only occurs when maxcount was reached */
87
/* Skip any remaining whitespace and copy to end of string */
88
while (i < str_len && STRINGLIB_ISSPACE(str[i]))
89
i++;
90
if (i != str_len)
91
SPLIT_ADD(str, i, str_len);
92
}
93
FIX_PREALLOC_SIZE(list);
94
return list;
95
96
onError:
97
Py_DECREF(list);
98
return NULL;
99
}
100
101
Py_LOCAL_INLINE(PyObject *)
102
STRINGLIB(split_char)(PyObject* str_obj,
103
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
104
const STRINGLIB_CHAR ch,
105
Py_ssize_t maxcount)
106
{
107
Py_ssize_t i, j, count=0;
108
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
109
PyObject *sub;
110
111
if (list == NULL)
112
return NULL;
113
114
i = j = 0;
115
while ((j < str_len) && (maxcount-- > 0)) {
116
for(; j < str_len; j++) {
117
/* I found that using memchr makes no difference */
118
if (str[j] == ch) {
119
SPLIT_ADD(str, i, j);
120
i = j = j + 1;
121
break;
122
}
123
}
124
}
125
#if !STRINGLIB_MUTABLE
126
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
127
/* ch not in str_obj, so just use str_obj as list[0] */
128
Py_INCREF(str_obj);
129
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
130
count++;
131
} else
132
#endif
133
if (i <= str_len) {
134
SPLIT_ADD(str, i, str_len);
135
}
136
FIX_PREALLOC_SIZE(list);
137
return list;
138
139
onError:
140
Py_DECREF(list);
141
return NULL;
142
}
143
144
Py_LOCAL_INLINE(PyObject *)
145
STRINGLIB(split)(PyObject* str_obj,
146
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
147
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
148
Py_ssize_t maxcount)
149
{
150
Py_ssize_t i, j, pos, count=0;
151
PyObject *list, *sub;
152
153
if (sep_len == 0) {
154
PyErr_SetString(PyExc_ValueError, "empty separator");
155
return NULL;
156
}
157
else if (sep_len == 1)
158
return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
159
160
list = PyList_New(PREALLOC_SIZE(maxcount));
161
if (list == NULL)
162
return NULL;
163
164
i = j = 0;
165
while (maxcount-- > 0) {
166
pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
167
if (pos < 0)
168
break;
169
j = i + pos;
170
SPLIT_ADD(str, i, j);
171
i = j + sep_len;
172
}
173
#if !STRINGLIB_MUTABLE
174
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
175
/* No match in str_obj, so just use it as list[0] */
176
Py_INCREF(str_obj);
177
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
178
count++;
179
} else
180
#endif
181
{
182
SPLIT_ADD(str, i, str_len);
183
}
184
FIX_PREALLOC_SIZE(list);
185
return list;
186
187
onError:
188
Py_DECREF(list);
189
return NULL;
190
}
191
192
Py_LOCAL_INLINE(PyObject *)
193
STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
194
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
195
Py_ssize_t maxcount)
196
{
197
Py_ssize_t i, j, count=0;
198
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
199
PyObject *sub;
200
201
if (list == NULL)
202
return NULL;
203
204
i = j = str_len - 1;
205
while (maxcount-- > 0) {
206
while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
207
i--;
208
if (i < 0) break;
209
j = i; i--;
210
while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
211
i--;
212
#if !STRINGLIB_MUTABLE
213
if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
214
/* No whitespace in str_obj, so just use it as list[0] */
215
Py_INCREF(str_obj);
216
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
217
count++;
218
break;
219
}
220
#endif
221
SPLIT_ADD(str, i + 1, j + 1);
222
}
223
224
if (i >= 0) {
225
/* Only occurs when maxcount was reached */
226
/* Skip any remaining whitespace and copy to beginning of string */
227
while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
228
i--;
229
if (i >= 0)
230
SPLIT_ADD(str, 0, i + 1);
231
}
232
FIX_PREALLOC_SIZE(list);
233
if (PyList_Reverse(list) < 0)
234
goto onError;
235
return list;
236
237
onError:
238
Py_DECREF(list);
239
return NULL;
240
}
241
242
Py_LOCAL_INLINE(PyObject *)
243
STRINGLIB(rsplit_char)(PyObject* str_obj,
244
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
245
const STRINGLIB_CHAR ch,
246
Py_ssize_t maxcount)
247
{
248
Py_ssize_t i, j, count=0;
249
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
250
PyObject *sub;
251
252
if (list == NULL)
253
return NULL;
254
255
i = j = str_len - 1;
256
while ((i >= 0) && (maxcount-- > 0)) {
257
for(; i >= 0; i--) {
258
if (str[i] == ch) {
259
SPLIT_ADD(str, i + 1, j + 1);
260
j = i = i - 1;
261
break;
262
}
263
}
264
}
265
#if !STRINGLIB_MUTABLE
266
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
267
/* ch not in str_obj, so just use str_obj as list[0] */
268
Py_INCREF(str_obj);
269
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
270
count++;
271
} else
272
#endif
273
if (j >= -1) {
274
SPLIT_ADD(str, 0, j + 1);
275
}
276
FIX_PREALLOC_SIZE(list);
277
if (PyList_Reverse(list) < 0)
278
goto onError;
279
return list;
280
281
onError:
282
Py_DECREF(list);
283
return NULL;
284
}
285
286
Py_LOCAL_INLINE(PyObject *)
287
STRINGLIB(rsplit)(PyObject* str_obj,
288
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
289
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
290
Py_ssize_t maxcount)
291
{
292
Py_ssize_t j, pos, count=0;
293
PyObject *list, *sub;
294
295
if (sep_len == 0) {
296
PyErr_SetString(PyExc_ValueError, "empty separator");
297
return NULL;
298
}
299
else if (sep_len == 1)
300
return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
301
302
list = PyList_New(PREALLOC_SIZE(maxcount));
303
if (list == NULL)
304
return NULL;
305
306
j = str_len;
307
while (maxcount-- > 0) {
308
pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
309
if (pos < 0)
310
break;
311
SPLIT_ADD(str, pos + sep_len, j);
312
j = pos;
313
}
314
#if !STRINGLIB_MUTABLE
315
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
316
/* No match in str_obj, so just use it as list[0] */
317
Py_INCREF(str_obj);
318
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
319
count++;
320
} else
321
#endif
322
{
323
SPLIT_ADD(str, 0, j);
324
}
325
FIX_PREALLOC_SIZE(list);
326
if (PyList_Reverse(list) < 0)
327
goto onError;
328
return list;
329
330
onError:
331
Py_DECREF(list);
332
return NULL;
333
}
334
335
Py_LOCAL_INLINE(PyObject *)
336
STRINGLIB(splitlines)(PyObject* str_obj,
337
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
338
int keepends)
339
{
340
/* This does not use the preallocated list because splitlines is
341
usually run with hundreds of newlines. The overhead of
342
switching between PyList_SET_ITEM and append causes about a
343
2-3% slowdown for that common case. A smarter implementation
344
could move the if check out, so the SET_ITEMs are done first
345
and the appends only done when the prealloc buffer is full.
346
That's too much work for little gain.*/
347
348
Py_ssize_t i;
349
Py_ssize_t j;
350
PyObject *list = PyList_New(0);
351
PyObject *sub;
352
353
if (list == NULL)
354
return NULL;
355
356
for (i = j = 0; i < str_len; ) {
357
Py_ssize_t eol;
358
359
/* Find a line and append it */
360
while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
361
i++;
362
363
/* Skip the line break reading CRLF as one line break */
364
eol = i;
365
if (i < str_len) {
366
if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
367
i += 2;
368
else
369
i++;
370
if (keepends)
371
eol = i;
372
}
373
#if !STRINGLIB_MUTABLE
374
if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
375
/* No linebreak in str_obj, so just use it as list[0] */
376
if (PyList_Append(list, str_obj))
377
goto onError;
378
break;
379
}
380
#endif
381
SPLIT_APPEND(str, j, eol);
382
j = i;
383
}
384
return list;
385
386
onError:
387
Py_DECREF(list);
388
return NULL;
389
}
390
391
392