/* Source: GitHub repository allendowney/cpython,
   path: blob/main/Objects/stringlib/codecs.h
   (website navigation text from the web viewer removed). */
/* stringlib: codec implementations */

#if !STRINGLIB_IS_UNICODE
# error "codecs.h is specific to Unicode"
#endif

#include "pycore_bitutils.h"      // _Py_bswap32()

/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char: one 0x80 bit per byte of the word. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif

/* True for a UTF-8 continuation byte, i.e. one of the form 10xxxxxx.
   Note: the argument is evaluated twice. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)

Py_LOCAL_INLINE(Py_UCS4)
23
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
24
STRINGLIB_CHAR *dest,
25
Py_ssize_t *outpos)
26
{
27
Py_UCS4 ch;
28
const char *s = *inptr;
29
STRINGLIB_CHAR *p = dest + *outpos;
30
31
while (s < end) {
32
ch = (unsigned char)*s;
33
34
if (ch < 0x80) {
35
/* Fast path for runs of ASCII characters. Given that common UTF-8
36
input will consist of an overwhelming majority of ASCII
37
characters, we try to optimize for this case by checking
38
as many characters as a C 'size_t' can contain.
39
First, check if we can do an aligned read, as most CPUs have
40
a penalty for unaligned reads.
41
*/
42
if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
43
/* Help register allocation */
44
const char *_s = s;
45
STRINGLIB_CHAR *_p = p;
46
while (_s + SIZEOF_SIZE_T <= end) {
47
/* Read a whole size_t at a time (either 4 or 8 bytes),
48
and do a fast unrolled copy if it only contains ASCII
49
characters. */
50
size_t value = *(const size_t *) _s;
51
if (value & ASCII_CHAR_MASK)
52
break;
53
#if PY_LITTLE_ENDIAN
54
_p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
55
_p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
56
_p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
57
_p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
58
# if SIZEOF_SIZE_T == 8
59
_p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
60
_p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
61
_p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
62
_p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
63
# endif
64
#else
65
# if SIZEOF_SIZE_T == 8
66
_p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
67
_p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
68
_p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
69
_p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
70
_p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
71
_p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
72
_p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
73
_p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
74
# else
75
_p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
76
_p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
77
_p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
78
_p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
79
# endif
80
#endif
81
_s += SIZEOF_SIZE_T;
82
_p += SIZEOF_SIZE_T;
83
}
84
s = _s;
85
p = _p;
86
if (s == end)
87
break;
88
ch = (unsigned char)*s;
89
}
90
if (ch < 0x80) {
91
s++;
92
*p++ = ch;
93
continue;
94
}
95
}
96
97
if (ch < 0xE0) {
98
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
99
Py_UCS4 ch2;
100
if (ch < 0xC2) {
101
/* invalid sequence
102
\x80-\xBF -- continuation byte
103
\xC0-\xC1 -- fake 0000-007F */
104
goto InvalidStart;
105
}
106
if (end - s < 2) {
107
/* unexpected end of data: the caller will decide whether
108
it's an error or not */
109
break;
110
}
111
ch2 = (unsigned char)s[1];
112
if (!IS_CONTINUATION_BYTE(ch2))
113
/* invalid continuation byte */
114
goto InvalidContinuation1;
115
ch = (ch << 6) + ch2 -
116
((0xC0 << 6) + 0x80);
117
assert ((ch > 0x007F) && (ch <= 0x07FF));
118
s += 2;
119
if (STRINGLIB_MAX_CHAR <= 0x007F ||
120
(STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
121
/* Out-of-range */
122
goto Return;
123
*p++ = ch;
124
continue;
125
}
126
127
if (ch < 0xF0) {
128
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
129
Py_UCS4 ch2, ch3;
130
if (end - s < 3) {
131
/* unexpected end of data: the caller will decide whether
132
it's an error or not */
133
if (end - s < 2)
134
break;
135
ch2 = (unsigned char)s[1];
136
if (!IS_CONTINUATION_BYTE(ch2) ||
137
(ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
138
/* for clarification see comments below */
139
goto InvalidContinuation1;
140
break;
141
}
142
ch2 = (unsigned char)s[1];
143
ch3 = (unsigned char)s[2];
144
if (!IS_CONTINUATION_BYTE(ch2)) {
145
/* invalid continuation byte */
146
goto InvalidContinuation1;
147
}
148
if (ch == 0xE0) {
149
if (ch2 < 0xA0)
150
/* invalid sequence
151
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
152
goto InvalidContinuation1;
153
} else if (ch == 0xED && ch2 >= 0xA0) {
154
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
155
will result in surrogates in range D800-DFFF. Surrogates are
156
not valid UTF-8 so they are rejected.
157
See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
158
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
159
goto InvalidContinuation1;
160
}
161
if (!IS_CONTINUATION_BYTE(ch3)) {
162
/* invalid continuation byte */
163
goto InvalidContinuation2;
164
}
165
ch = (ch << 12) + (ch2 << 6) + ch3 -
166
((0xE0 << 12) + (0x80 << 6) + 0x80);
167
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
168
s += 3;
169
if (STRINGLIB_MAX_CHAR <= 0x07FF ||
170
(STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
171
/* Out-of-range */
172
goto Return;
173
*p++ = ch;
174
continue;
175
}
176
177
if (ch < 0xF5) {
178
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
179
Py_UCS4 ch2, ch3, ch4;
180
if (end - s < 4) {
181
/* unexpected end of data: the caller will decide whether
182
it's an error or not */
183
if (end - s < 2)
184
break;
185
ch2 = (unsigned char)s[1];
186
if (!IS_CONTINUATION_BYTE(ch2) ||
187
(ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
188
/* for clarification see comments below */
189
goto InvalidContinuation1;
190
if (end - s < 3)
191
break;
192
ch3 = (unsigned char)s[2];
193
if (!IS_CONTINUATION_BYTE(ch3))
194
goto InvalidContinuation2;
195
break;
196
}
197
ch2 = (unsigned char)s[1];
198
ch3 = (unsigned char)s[2];
199
ch4 = (unsigned char)s[3];
200
if (!IS_CONTINUATION_BYTE(ch2)) {
201
/* invalid continuation byte */
202
goto InvalidContinuation1;
203
}
204
if (ch == 0xF0) {
205
if (ch2 < 0x90)
206
/* invalid sequence
207
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
208
goto InvalidContinuation1;
209
} else if (ch == 0xF4 && ch2 >= 0x90) {
210
/* invalid sequence
211
\xF4\x90\x80\x80- -- 110000- overflow */
212
goto InvalidContinuation1;
213
}
214
if (!IS_CONTINUATION_BYTE(ch3)) {
215
/* invalid continuation byte */
216
goto InvalidContinuation2;
217
}
218
if (!IS_CONTINUATION_BYTE(ch4)) {
219
/* invalid continuation byte */
220
goto InvalidContinuation3;
221
}
222
ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
223
((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
224
assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
225
s += 4;
226
if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
227
(STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
228
/* Out-of-range */
229
goto Return;
230
*p++ = ch;
231
continue;
232
}
233
goto InvalidStart;
234
}
235
ch = 0;
236
Return:
237
*inptr = s;
238
*outpos = p - dest;
239
return ch;
240
InvalidStart:
241
ch = 1;
242
goto Return;
243
InvalidContinuation1:
244
ch = 2;
245
goto Return;
246
InvalidContinuation2:
247
ch = 3;
248
goto Return;
249
InvalidContinuation3:
250
ch = 4;
251
goto Return;
252
}
253
254
/* ASCII_CHAR_MASK is only used by the UTF-8 decoder's word-at-a-time
   fast path; drop it so it does not leak out of this header. */
#undef ASCII_CHAR_MASK


/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
258
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
259
UCS-1 strings don't need to handle surrogates for example. */
260
Py_LOCAL_INLINE(char *)
261
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
262
PyObject *unicode,
263
const STRINGLIB_CHAR *data,
264
Py_ssize_t size,
265
_Py_error_handler error_handler,
266
const char *errors)
267
{
268
Py_ssize_t i; /* index into data of next input character */
269
char *p; /* next free byte in output buffer */
270
#if STRINGLIB_SIZEOF_CHAR > 1
271
PyObject *error_handler_obj = NULL;
272
PyObject *exc = NULL;
273
PyObject *rep = NULL;
274
#endif
275
#if STRINGLIB_SIZEOF_CHAR == 1
276
const Py_ssize_t max_char_size = 2;
277
#elif STRINGLIB_SIZEOF_CHAR == 2
278
const Py_ssize_t max_char_size = 3;
279
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
280
const Py_ssize_t max_char_size = 4;
281
#endif
282
283
assert(size >= 0);
284
if (size > PY_SSIZE_T_MAX / max_char_size) {
285
/* integer overflow */
286
PyErr_NoMemory();
287
return NULL;
288
}
289
290
_PyBytesWriter_Init(writer);
291
p = _PyBytesWriter_Alloc(writer, size * max_char_size);
292
if (p == NULL)
293
return NULL;
294
295
for (i = 0; i < size;) {
296
Py_UCS4 ch = data[i++];
297
298
if (ch < 0x80) {
299
/* Encode ASCII */
300
*p++ = (char) ch;
301
302
}
303
else
304
#if STRINGLIB_SIZEOF_CHAR > 1
305
if (ch < 0x0800)
306
#endif
307
{
308
/* Encode Latin-1 */
309
*p++ = (char)(0xc0 | (ch >> 6));
310
*p++ = (char)(0x80 | (ch & 0x3f));
311
}
312
#if STRINGLIB_SIZEOF_CHAR > 1
313
else if (Py_UNICODE_IS_SURROGATE(ch)) {
314
Py_ssize_t startpos, endpos, newpos;
315
Py_ssize_t k;
316
if (error_handler == _Py_ERROR_UNKNOWN) {
317
error_handler = _Py_GetErrorHandler(errors);
318
}
319
320
startpos = i-1;
321
endpos = startpos+1;
322
323
while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
324
endpos++;
325
326
/* Only overallocate the buffer if it's not the last write */
327
writer->overallocate = (endpos < size);
328
329
switch (error_handler)
330
{
331
case _Py_ERROR_REPLACE:
332
memset(p, '?', endpos - startpos);
333
p += (endpos - startpos);
334
/* fall through */
335
case _Py_ERROR_IGNORE:
336
i += (endpos - startpos - 1);
337
break;
338
339
case _Py_ERROR_SURROGATEPASS:
340
for (k=startpos; k<endpos; k++) {
341
ch = data[k];
342
*p++ = (char)(0xe0 | (ch >> 12));
343
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
344
*p++ = (char)(0x80 | (ch & 0x3f));
345
}
346
i += (endpos - startpos - 1);
347
break;
348
349
case _Py_ERROR_BACKSLASHREPLACE:
350
/* subtract preallocated bytes */
351
writer->min_size -= max_char_size * (endpos - startpos);
352
p = backslashreplace(writer, p,
353
unicode, startpos, endpos);
354
if (p == NULL)
355
goto error;
356
i += (endpos - startpos - 1);
357
break;
358
359
case _Py_ERROR_XMLCHARREFREPLACE:
360
/* subtract preallocated bytes */
361
writer->min_size -= max_char_size * (endpos - startpos);
362
p = xmlcharrefreplace(writer, p,
363
unicode, startpos, endpos);
364
if (p == NULL)
365
goto error;
366
i += (endpos - startpos - 1);
367
break;
368
369
case _Py_ERROR_SURROGATEESCAPE:
370
for (k=startpos; k<endpos; k++) {
371
ch = data[k];
372
if (!(0xDC80 <= ch && ch <= 0xDCFF))
373
break;
374
*p++ = (char)(ch & 0xff);
375
}
376
if (k >= endpos) {
377
i += (endpos - startpos - 1);
378
break;
379
}
380
startpos = k;
381
assert(startpos < endpos);
382
/* fall through */
383
default:
384
rep = unicode_encode_call_errorhandler(
385
errors, &error_handler_obj, "utf-8", "surrogates not allowed",
386
unicode, &exc, startpos, endpos, &newpos);
387
if (!rep)
388
goto error;
389
390
if (newpos < startpos) {
391
writer->overallocate = 1;
392
p = _PyBytesWriter_Prepare(writer, p,
393
max_char_size * (startpos - newpos));
394
if (p == NULL)
395
goto error;
396
}
397
else {
398
/* subtract preallocated bytes */
399
writer->min_size -= max_char_size * (newpos - startpos);
400
/* Only overallocate the buffer if it's not the last write */
401
writer->overallocate = (newpos < size);
402
}
403
404
if (PyBytes_Check(rep)) {
405
p = _PyBytesWriter_WriteBytes(writer, p,
406
PyBytes_AS_STRING(rep),
407
PyBytes_GET_SIZE(rep));
408
}
409
else {
410
/* rep is unicode */
411
if (!PyUnicode_IS_ASCII(rep)) {
412
raise_encode_exception(&exc, "utf-8", unicode,
413
startpos, endpos,
414
"surrogates not allowed");
415
goto error;
416
}
417
418
p = _PyBytesWriter_WriteBytes(writer, p,
419
PyUnicode_DATA(rep),
420
PyUnicode_GET_LENGTH(rep));
421
}
422
423
if (p == NULL)
424
goto error;
425
Py_CLEAR(rep);
426
427
i = newpos;
428
}
429
430
/* If overallocation was disabled, ensure that it was the last
431
write. Otherwise, we missed an optimization */
432
assert(writer->overallocate || i == size);
433
}
434
else
435
#if STRINGLIB_SIZEOF_CHAR > 2
436
if (ch < 0x10000)
437
#endif
438
{
439
*p++ = (char)(0xe0 | (ch >> 12));
440
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
441
*p++ = (char)(0x80 | (ch & 0x3f));
442
}
443
#if STRINGLIB_SIZEOF_CHAR > 2
444
else /* ch >= 0x10000 */
445
{
446
assert(ch <= MAX_UNICODE);
447
/* Encode UCS4 Unicode ordinals */
448
*p++ = (char)(0xf0 | (ch >> 18));
449
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
450
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
451
*p++ = (char)(0x80 | (ch & 0x3f));
452
}
453
#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
454
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
455
}
456
457
#if STRINGLIB_SIZEOF_CHAR > 1
458
Py_XDECREF(error_handler_obj);
459
Py_XDECREF(exc);
460
#endif
461
return p;
462
463
#if STRINGLIB_SIZEOF_CHAR > 1
464
error:
465
Py_XDECREF(rep);
466
Py_XDECREF(error_handler_obj);
467
Py_XDECREF(exc);
468
return NULL;
469
#endif
470
}
471
472
/* The pattern for constructing UCS2-repeated masks: multiplying it by a
   16-bit value replicates that value into every 16-bit lane of a C 'long'. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif

/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains any
   non-ASCII or non-Latin1 UTF16-encoded characters. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping. */
#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap the two bytes of every 16-bit lane of `value`. */
#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))

Py_LOCAL_INLINE(Py_UCS4)
501
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
502
STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
503
int native_ordering)
504
{
505
Py_UCS4 ch;
506
const unsigned char *q = *inptr;
507
STRINGLIB_CHAR *p = dest + *outpos;
508
/* Offsets from q for retrieving byte pairs in the right order. */
509
#if PY_LITTLE_ENDIAN
510
int ihi = !!native_ordering, ilo = !native_ordering;
511
#else
512
int ihi = !native_ordering, ilo = !!native_ordering;
513
#endif
514
--e;
515
516
while (q < e) {
517
Py_UCS4 ch2;
518
/* First check for possible aligned read of a C 'long'. Unaligned
519
reads are more expensive, better to defer to another iteration. */
520
if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
521
/* Fast path for runs of in-range non-surrogate chars. */
522
const unsigned char *_q = q;
523
while (_q + SIZEOF_LONG <= e) {
524
unsigned long block = * (const unsigned long *) _q;
525
if (native_ordering) {
526
/* Can use buffer directly */
527
if (block & FAST_CHAR_MASK)
528
break;
529
}
530
else {
531
/* Need to byte-swap */
532
if (block & SWAB(FAST_CHAR_MASK))
533
break;
534
#if STRINGLIB_SIZEOF_CHAR == 1
535
block >>= 8;
536
#else
537
block = SWAB(block);
538
#endif
539
}
540
#if PY_LITTLE_ENDIAN
541
# if SIZEOF_LONG == 4
542
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
543
p[1] = (STRINGLIB_CHAR)(block >> 16);
544
# elif SIZEOF_LONG == 8
545
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
546
p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
547
p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
548
p[3] = (STRINGLIB_CHAR)(block >> 48);
549
# endif
550
#else
551
# if SIZEOF_LONG == 4
552
p[0] = (STRINGLIB_CHAR)(block >> 16);
553
p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
554
# elif SIZEOF_LONG == 8
555
p[0] = (STRINGLIB_CHAR)(block >> 48);
556
p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
557
p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
558
p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
559
# endif
560
#endif
561
_q += SIZEOF_LONG;
562
p += SIZEOF_LONG / 2;
563
}
564
q = _q;
565
if (q >= e)
566
break;
567
}
568
569
ch = (q[ihi] << 8) | q[ilo];
570
q += 2;
571
if (!Py_UNICODE_IS_SURROGATE(ch)) {
572
#if STRINGLIB_SIZEOF_CHAR < 2
573
if (ch > STRINGLIB_MAX_CHAR)
574
/* Out-of-range */
575
goto Return;
576
#endif
577
*p++ = (STRINGLIB_CHAR)ch;
578
continue;
579
}
580
581
/* UTF-16 code pair: */
582
if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
583
goto IllegalEncoding;
584
if (q >= e)
585
goto UnexpectedEnd;
586
ch2 = (q[ihi] << 8) | q[ilo];
587
q += 2;
588
if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
589
goto IllegalSurrogate;
590
ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
591
#if STRINGLIB_SIZEOF_CHAR < 4
592
/* Out-of-range */
593
goto Return;
594
#else
595
*p++ = (STRINGLIB_CHAR)ch;
596
#endif
597
}
598
ch = 0;
599
Return:
600
*inptr = q;
601
*outpos = p - dest;
602
return ch;
603
UnexpectedEnd:
604
ch = 1;
605
goto Return;
606
IllegalEncoding:
607
ch = 2;
608
goto Return;
609
IllegalSurrogate:
610
ch = 3;
611
goto Return;
612
}
613
/* The helper macros of the UTF-16 decoder are not needed past this point. */
#undef UCS2_REPEAT_MASK
#undef FAST_CHAR_MASK
#undef STRIPPED_MASK
#undef SWAB


/* The definitions that follow are only compiled when STRINGLIB_MAX_CHAR
   is at least 0x80. */
#if STRINGLIB_MAX_CHAR >= 0x80
Py_LOCAL_INLINE(Py_ssize_t)
621
STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
622
Py_ssize_t len,
623
unsigned short **outptr,
624
int native_ordering)
625
{
626
unsigned short *out = *outptr;
627
const STRINGLIB_CHAR *end = in + len;
628
#if STRINGLIB_SIZEOF_CHAR == 1
629
if (native_ordering) {
630
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
631
while (in < unrolled_end) {
632
out[0] = in[0];
633
out[1] = in[1];
634
out[2] = in[2];
635
out[3] = in[3];
636
in += 4; out += 4;
637
}
638
while (in < end) {
639
*out++ = *in++;
640
}
641
} else {
642
# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
643
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
644
while (in < unrolled_end) {
645
out[0] = SWAB2(in[0]);
646
out[1] = SWAB2(in[1]);
647
out[2] = SWAB2(in[2]);
648
out[3] = SWAB2(in[3]);
649
in += 4; out += 4;
650
}
651
while (in < end) {
652
Py_UCS4 ch = *in++;
653
*out++ = SWAB2((Py_UCS2)ch);
654
}
655
#undef SWAB2
656
}
657
*outptr = out;
658
return len;
659
#else
660
if (native_ordering) {
661
#if STRINGLIB_MAX_CHAR < 0x10000
662
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
663
while (in < unrolled_end) {
664
/* check if any character is a surrogate character */
665
if (((in[0] ^ 0xd800) &
666
(in[1] ^ 0xd800) &
667
(in[2] ^ 0xd800) &
668
(in[3] ^ 0xd800) & 0xf800) == 0)
669
break;
670
out[0] = in[0];
671
out[1] = in[1];
672
out[2] = in[2];
673
out[3] = in[3];
674
in += 4; out += 4;
675
}
676
#endif
677
while (in < end) {
678
Py_UCS4 ch;
679
ch = *in++;
680
if (ch < 0xd800)
681
*out++ = ch;
682
else if (ch < 0xe000)
683
/* reject surrogate characters (U+D800-U+DFFF) */
684
goto fail;
685
#if STRINGLIB_MAX_CHAR >= 0x10000
686
else if (ch >= 0x10000) {
687
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
688
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
689
out += 2;
690
}
691
#endif
692
else
693
*out++ = ch;
694
}
695
} else {
696
#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
697
#if STRINGLIB_MAX_CHAR < 0x10000
698
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
699
while (in < unrolled_end) {
700
/* check if any character is a surrogate character */
701
if (((in[0] ^ 0xd800) &
702
(in[1] ^ 0xd800) &
703
(in[2] ^ 0xd800) &
704
(in[3] ^ 0xd800) & 0xf800) == 0)
705
break;
706
out[0] = SWAB2(in[0]);
707
out[1] = SWAB2(in[1]);
708
out[2] = SWAB2(in[2]);
709
out[3] = SWAB2(in[3]);
710
in += 4; out += 4;
711
}
712
#endif
713
while (in < end) {
714
Py_UCS4 ch = *in++;
715
if (ch < 0xd800)
716
*out++ = SWAB2((Py_UCS2)ch);
717
else if (ch < 0xe000)
718
/* reject surrogate characters (U+D800-U+DFFF) */
719
goto fail;
720
#if STRINGLIB_MAX_CHAR >= 0x10000
721
else if (ch >= 0x10000) {
722
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
723
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
724
out[0] = SWAB2(ch1);
725
out[1] = SWAB2(ch2);
726
out += 2;
727
}
728
#endif
729
else
730
*out++ = SWAB2((Py_UCS2)ch);
731
}
732
#undef SWAB2
733
}
734
*outptr = out;
735
return len;
736
fail:
737
*outptr = out;
738
return len - (end - in + 1);
739
#endif
740
}
741
742
static inline uint32_t
743
STRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
744
{
745
uint32_t word = ch;
746
#if STRINGLIB_SIZEOF_CHAR == 1
747
/* high bytes are zero */
748
return (word << 24);
749
#elif STRINGLIB_SIZEOF_CHAR == 2
750
/* high bytes are zero */
751
return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
752
#else
753
return _Py_bswap32(word);
754
#endif
755
}
756
757
Py_LOCAL_INLINE(Py_ssize_t)
758
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
759
Py_ssize_t len,
760
uint32_t **outptr,
761
int native_ordering)
762
{
763
uint32_t *out = *outptr;
764
const STRINGLIB_CHAR *end = in + len;
765
if (native_ordering) {
766
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
767
while (in < unrolled_end) {
768
#if STRINGLIB_SIZEOF_CHAR > 1
769
/* check if any character is a surrogate character */
770
if (((in[0] ^ 0xd800) &
771
(in[1] ^ 0xd800) &
772
(in[2] ^ 0xd800) &
773
(in[3] ^ 0xd800) & 0xf800) == 0)
774
break;
775
#endif
776
out[0] = in[0];
777
out[1] = in[1];
778
out[2] = in[2];
779
out[3] = in[3];
780
in += 4; out += 4;
781
}
782
while (in < end) {
783
Py_UCS4 ch;
784
ch = *in++;
785
#if STRINGLIB_SIZEOF_CHAR > 1
786
if (Py_UNICODE_IS_SURROGATE(ch)) {
787
/* reject surrogate characters (U+D800-U+DFFF) */
788
goto fail;
789
}
790
#endif
791
*out++ = ch;
792
}
793
} else {
794
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
795
while (in < unrolled_end) {
796
#if STRINGLIB_SIZEOF_CHAR > 1
797
/* check if any character is a surrogate character */
798
if (((in[0] ^ 0xd800) &
799
(in[1] ^ 0xd800) &
800
(in[2] ^ 0xd800) &
801
(in[3] ^ 0xd800) & 0xf800) == 0)
802
break;
803
#endif
804
out[0] = STRINGLIB(SWAB4)(in[0]);
805
out[1] = STRINGLIB(SWAB4)(in[1]);
806
out[2] = STRINGLIB(SWAB4)(in[2]);
807
out[3] = STRINGLIB(SWAB4)(in[3]);
808
in += 4; out += 4;
809
}
810
while (in < end) {
811
Py_UCS4 ch = *in++;
812
#if STRINGLIB_SIZEOF_CHAR > 1
813
if (Py_UNICODE_IS_SURROGATE(ch)) {
814
/* reject surrogate characters (U+D800-U+DFFF) */
815
goto fail;
816
}
817
#endif
818
*out++ = STRINGLIB(SWAB4)(ch);
819
}
820
}
821
*outptr = out;
822
return len;
823
#if STRINGLIB_SIZEOF_CHAR > 1
824
fail:
825
*outptr = out;
826
return len - (end - in + 1);
827
#endif
828
}
829
830
#endif /* STRINGLIB_MAX_CHAR >= 0x80 */