Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Python/codecs.c
12 views
1
/* ------------------------------------------------------------------------
2
3
Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg ([email protected]).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h" // _PyObject_CallNoArgs()
13
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
15
#include "pycore_pystate.h" // _PyInterpreterState_GET()
16
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
17
#include <ctype.h>
18
19
/* Lowercase hexadecimal digits; the error handlers in this file index it
   with a 4-bit value, e.g. Py_hexdigits[(c >> 4) & 0xf]. */
const char *Py_hexdigits = "0123456789abcdef";
20
21
/* --- Codec Registry ----------------------------------------------------- */
22
23
/* Import the standard encodings package which will register the first
24
codec search function.
25
26
This is done in a lazy way so that the Unicode implementation does
27
not downgrade startup time of scripts not needing it.
28
29
ImportErrors are silently ignored by this function. Only one try is
30
made.
31
32
*/
33
34
static int _PyCodecRegistry_Init(void); /* Forward */
35
36
/* Append search_function to the current interpreter's codec search path.

   The registry is created on demand if the search path does not exist yet.
   Returns -1 if registry initialisation fails, if search_function is NULL
   or if it is not callable; otherwise returns the result of PyList_Append(). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    /* Lazily set up the registry; a non-zero result means it failed. */
    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init()) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(interp->codec_search_path, search_function);
}
54
55
int
56
PyCodec_Unregister(PyObject *search_function)
57
{
58
PyInterpreterState *interp = PyInterpreterState_Get();
59
PyObject *codec_search_path = interp->codec_search_path;
60
/* Do nothing if codec_search_path is not created yet or was cleared. */
61
if (codec_search_path == NULL) {
62
return 0;
63
}
64
65
assert(PyList_CheckExact(codec_search_path));
66
Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
67
for (Py_ssize_t i = 0; i < n; i++) {
68
PyObject *item = PyList_GET_ITEM(codec_search_path, i);
69
if (item == search_function) {
70
if (interp->codec_search_cache != NULL) {
71
assert(PyDict_CheckExact(interp->codec_search_cache));
72
PyDict_Clear(interp->codec_search_cache);
73
}
74
return PyList_SetSlice(codec_search_path, i, i+1, NULL);
75
}
76
}
77
return 0;
78
}
79
80
extern int _Py_normalize_encoding(const char *, char *, size_t);
81
82
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
83
converted to lower case, spaces and hyphens are replaced with underscores. */
84
85
static
86
PyObject *normalizestring(const char *string)
87
{
88
size_t len = strlen(string);
89
char *encoding;
90
PyObject *v;
91
92
if (len > PY_SSIZE_T_MAX) {
93
PyErr_SetString(PyExc_OverflowError, "string is too large");
94
return NULL;
95
}
96
97
encoding = PyMem_Malloc(len + 1);
98
if (encoding == NULL)
99
return PyErr_NoMemory();
100
101
if (!_Py_normalize_encoding(string, encoding, len + 1))
102
{
103
PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
104
PyMem_Free(encoding);
105
return NULL;
106
}
107
108
v = PyUnicode_FromString(encoding);
109
PyMem_Free(encoding);
110
return v;
111
}
112
113
/* Lookup the given encoding and return a tuple providing the codec
114
facilities.
115
116
The encoding string is looked up converted to all lower-case
117
characters. This makes encodings looked up through this mechanism
118
effectively case-insensitive.
119
120
If no codec is found, a LookupError is set and NULL returned.
121
122
As side effect, this tries to load the encodings package, if not
123
yet done. This is part of the lazy load strategy for the encodings
124
package.
125
126
*/
127
128
PyObject *_PyCodec_Lookup(const char *encoding)
129
{
130
if (encoding == NULL) {
131
PyErr_BadArgument();
132
return NULL;
133
}
134
135
PyInterpreterState *interp = _PyInterpreterState_GET();
136
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
137
return NULL;
138
}
139
140
/* Convert the encoding to a normalized Python string: all
141
characters are converted to lower case, spaces and hyphens are
142
replaced with underscores. */
143
PyObject *v = normalizestring(encoding);
144
if (v == NULL) {
145
return NULL;
146
}
147
PyUnicode_InternInPlace(&v);
148
149
/* First, try to lookup the name in the registry dictionary */
150
PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
151
if (result != NULL) {
152
Py_INCREF(result);
153
Py_DECREF(v);
154
return result;
155
}
156
else if (PyErr_Occurred()) {
157
goto onError;
158
}
159
160
/* Next, scan the search functions in order of registration */
161
const Py_ssize_t len = PyList_Size(interp->codec_search_path);
162
if (len < 0)
163
goto onError;
164
if (len == 0) {
165
PyErr_SetString(PyExc_LookupError,
166
"no codec search functions registered: "
167
"can't find encoding");
168
goto onError;
169
}
170
171
Py_ssize_t i;
172
for (i = 0; i < len; i++) {
173
PyObject *func;
174
175
func = PyList_GetItem(interp->codec_search_path, i);
176
if (func == NULL)
177
goto onError;
178
result = PyObject_CallOneArg(func, v);
179
if (result == NULL)
180
goto onError;
181
if (result == Py_None) {
182
Py_DECREF(result);
183
continue;
184
}
185
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
186
PyErr_SetString(PyExc_TypeError,
187
"codec search functions must return 4-tuples");
188
Py_DECREF(result);
189
goto onError;
190
}
191
break;
192
}
193
if (i == len) {
194
/* XXX Perhaps we should cache misses too ? */
195
PyErr_Format(PyExc_LookupError,
196
"unknown encoding: %s", encoding);
197
goto onError;
198
}
199
200
/* Cache and return the result */
201
if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
202
Py_DECREF(result);
203
goto onError;
204
}
205
Py_DECREF(v);
206
return result;
207
208
onError:
209
Py_DECREF(v);
210
return NULL;
211
}
212
213
/* Codec registry encoding check API. */
214
215
int PyCodec_KnownEncoding(const char *encoding)
216
{
217
PyObject *codecs;
218
219
codecs = _PyCodec_Lookup(encoding);
220
if (!codecs) {
221
PyErr_Clear();
222
return 0;
223
}
224
else {
225
Py_DECREF(codecs);
226
return 1;
227
}
228
}
229
230
static
231
PyObject *args_tuple(PyObject *object,
232
const char *errors)
233
{
234
PyObject *args;
235
236
args = PyTuple_New(1 + (errors != NULL));
237
if (args == NULL)
238
return NULL;
239
PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
240
if (errors) {
241
PyObject *v;
242
243
v = PyUnicode_FromString(errors);
244
if (v == NULL) {
245
Py_DECREF(args);
246
return NULL;
247
}
248
PyTuple_SET_ITEM(args, 1, v);
249
}
250
return args;
251
}
252
253
/* Helper function to get a codec item */
254
255
static
256
PyObject *codec_getitem(const char *encoding, int index)
257
{
258
PyObject *codecs;
259
PyObject *v;
260
261
codecs = _PyCodec_Lookup(encoding);
262
if (codecs == NULL)
263
return NULL;
264
v = PyTuple_GET_ITEM(codecs, index);
265
Py_DECREF(codecs);
266
return Py_NewRef(v);
267
}
268
269
/* Helper functions to create an incremental codec. */
270
static
271
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
272
const char *errors,
273
const char *attrname)
274
{
275
PyObject *ret, *inccodec;
276
277
inccodec = PyObject_GetAttrString(codec_info, attrname);
278
if (inccodec == NULL)
279
return NULL;
280
if (errors)
281
ret = PyObject_CallFunction(inccodec, "s", errors);
282
else
283
ret = _PyObject_CallNoArgs(inccodec);
284
Py_DECREF(inccodec);
285
return ret;
286
}
287
288
static
289
PyObject *codec_getincrementalcodec(const char *encoding,
290
const char *errors,
291
const char *attrname)
292
{
293
PyObject *codec_info, *ret;
294
295
codec_info = _PyCodec_Lookup(encoding);
296
if (codec_info == NULL)
297
return NULL;
298
ret = codec_makeincrementalcodec(codec_info, errors, attrname);
299
Py_DECREF(codec_info);
300
return ret;
301
}
302
303
/* Helper function to create a stream codec. */
304
305
static
306
PyObject *codec_getstreamcodec(const char *encoding,
307
PyObject *stream,
308
const char *errors,
309
const int index)
310
{
311
PyObject *codecs, *streamcodec, *codeccls;
312
313
codecs = _PyCodec_Lookup(encoding);
314
if (codecs == NULL)
315
return NULL;
316
317
codeccls = PyTuple_GET_ITEM(codecs, index);
318
if (errors != NULL)
319
streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
320
else
321
streamcodec = PyObject_CallOneArg(codeccls, stream);
322
Py_DECREF(codecs);
323
return streamcodec;
324
}
325
326
/* Helpers to work with the result of _PyCodec_Lookup
327
328
*/
329
/* Create an incremental decoder from an already looked-up codec_info by
   calling its "incrementaldecoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *const factory_name = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_name);
}
335
336
/* Create an incremental encoder from an already looked-up codec_info by
   calling its "incrementalencoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *const factory_name = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_name);
}
342
343
344
/* Convenience APIs to query the Codec registry.
345
346
All APIs return a codec object with incremented refcount.
347
348
*/
349
350
/* Return a new reference to entry 0 of the codec tuple registered for
   encoding, or NULL if the lookup fails. */
PyObject *PyCodec_Encoder(const char *encoding)
{
    const int encoder_index = 0;

    return codec_getitem(encoding, encoder_index);
}
354
355
/* Return a new reference to entry 1 of the codec tuple registered for
   encoding, or NULL if the lookup fails. */
PyObject *PyCodec_Decoder(const char *encoding)
{
    const int decoder_index = 1;

    return codec_getitem(encoding, decoder_index);
}
359
360
/* Look up encoding and create an incremental encoder through the codec's
   "incrementalencoder" attribute; errors is forwarded when non-NULL. */
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
                                     const char *errors)
{
    const char *const factory_name = "incrementalencoder";

    return codec_getincrementalcodec(encoding, errors, factory_name);
}
365
366
/* Look up encoding and create an incremental decoder through the codec's
   "incrementaldecoder" attribute; errors is forwarded when non-NULL. */
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
                                     const char *errors)
{
    const char *const factory_name = "incrementaldecoder";

    return codec_getincrementalcodec(encoding, errors, factory_name);
}
371
372
/* Create a stream reader for stream by calling entry 2 of the codec tuple
   registered for encoding. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_index = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
378
379
/* Create a stream writer for stream by calling entry 3 of the codec tuple
   registered for encoding. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_index = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
385
386
/* Encode an object (e.g. a Unicode object) using the given encoding
387
and return the resulting encoded object (usually a Python string).
388
389
errors is passed to the encoder factory as argument if non-NULL. */
390
391
static PyObject *
392
_PyCodec_EncodeInternal(PyObject *object,
393
PyObject *encoder,
394
const char *encoding,
395
const char *errors)
396
{
397
PyObject *args = NULL, *result = NULL;
398
PyObject *v = NULL;
399
400
args = args_tuple(object, errors);
401
if (args == NULL)
402
goto onError;
403
404
result = PyObject_Call(encoder, args, NULL);
405
if (result == NULL) {
406
_PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
407
goto onError;
408
}
409
410
if (!PyTuple_Check(result) ||
411
PyTuple_GET_SIZE(result) != 2) {
412
PyErr_SetString(PyExc_TypeError,
413
"encoder must return a tuple (object, integer)");
414
goto onError;
415
}
416
v = Py_NewRef(PyTuple_GET_ITEM(result,0));
417
/* We don't check or use the second (integer) entry. */
418
419
Py_DECREF(args);
420
Py_DECREF(encoder);
421
Py_DECREF(result);
422
return v;
423
424
onError:
425
Py_XDECREF(result);
426
Py_XDECREF(args);
427
Py_XDECREF(encoder);
428
return NULL;
429
}
430
431
/* Decode an object (usually a Python string) using the given encoding
432
and return an equivalent object (e.g. a Unicode object).
433
434
errors is passed to the decoder factory as argument if non-NULL. */
435
436
static PyObject *
437
_PyCodec_DecodeInternal(PyObject *object,
438
PyObject *decoder,
439
const char *encoding,
440
const char *errors)
441
{
442
PyObject *args = NULL, *result = NULL;
443
PyObject *v;
444
445
args = args_tuple(object, errors);
446
if (args == NULL)
447
goto onError;
448
449
result = PyObject_Call(decoder, args, NULL);
450
if (result == NULL) {
451
_PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
452
goto onError;
453
}
454
if (!PyTuple_Check(result) ||
455
PyTuple_GET_SIZE(result) != 2) {
456
PyErr_SetString(PyExc_TypeError,
457
"decoder must return a tuple (object,integer)");
458
goto onError;
459
}
460
v = Py_NewRef(PyTuple_GET_ITEM(result,0));
461
/* We don't check or use the second (integer) entry. */
462
463
Py_DECREF(args);
464
Py_DECREF(decoder);
465
Py_DECREF(result);
466
return v;
467
468
onError:
469
Py_XDECREF(args);
470
Py_XDECREF(decoder);
471
Py_XDECREF(result);
472
return NULL;
473
}
474
475
/* Generic encoding/decoding API */
476
/* Encode object with the encoder registered for encoding.  Returns NULL
   if the encoder cannot be obtained or the encoding call fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
488
489
/* Decode object with the decoder registered for encoding.  Returns NULL
   if the decoder cannot be obtained or the decoding call fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
501
502
/* Text encoding/decoding API */
503
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
504
const char *alternate_command)
505
{
506
PyObject *codec;
507
PyObject *attr;
508
int is_text_codec;
509
510
codec = _PyCodec_Lookup(encoding);
511
if (codec == NULL)
512
return NULL;
513
514
/* Backwards compatibility: assume any raw tuple describes a text
515
* encoding, and the same for anything lacking the private
516
* attribute.
517
*/
518
if (!PyTuple_CheckExact(codec)) {
519
if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
520
Py_DECREF(codec);
521
return NULL;
522
}
523
if (attr != NULL) {
524
is_text_codec = PyObject_IsTrue(attr);
525
Py_DECREF(attr);
526
if (is_text_codec <= 0) {
527
Py_DECREF(codec);
528
if (!is_text_codec)
529
PyErr_Format(PyExc_LookupError,
530
"'%.400s' is not a text encoding; "
531
"use %s to handle arbitrary codecs",
532
encoding, alternate_command);
533
return NULL;
534
}
535
}
536
}
537
538
/* This appears to be a valid text encoding */
539
return codec;
540
}
541
542
543
static
544
PyObject *codec_getitem_checked(const char *encoding,
545
const char *alternate_command,
546
int index)
547
{
548
PyObject *codec;
549
PyObject *v;
550
551
codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
552
if (codec == NULL)
553
return NULL;
554
555
v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
556
Py_DECREF(codec);
557
return v;
558
}
559
560
/* Return entry 0 of the text codec registered for encoding; the error
   message for non-text codecs points the user at codecs.encode(). */
static PyObject * _PyCodec_TextEncoder(const char *encoding)
{
    const char *const alternate_command = "codecs.encode()";
    const int encoder_index = 0;

    return codec_getitem_checked(encoding, alternate_command, encoder_index);
}
564
565
/* Return entry 1 of the text codec registered for encoding; the error
   message for non-text codecs points the user at codecs.decode(). */
static PyObject * _PyCodec_TextDecoder(const char *encoding)
{
    const char *const alternate_command = "codecs.decode()";
    const int decoder_index = 1;

    return codec_getitem_checked(encoding, alternate_command, decoder_index);
}
569
570
/* Encode object with the text encoder registered for encoding.  Returns
   NULL if the encoder cannot be obtained (including when the codec is not
   a text encoding) or if the encoding call fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
582
583
/* Decode object with the text decoder registered for encoding.  Returns
   NULL if the decoder cannot be obtained (including when the codec is not
   a text encoding) or if the decoding call fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
595
596
/* Register the error handling callback function error under the name
597
name. This function will be called by the codec when it encounters
598
an unencodable characters/undecodable bytes and doesn't know the
599
callback name, when name is specified as the error parameter
600
in the call to the encode/decode function.
601
Return 0 on success, -1 on error */
602
int PyCodec_RegisterError(const char *name, PyObject *error)
603
{
604
PyInterpreterState *interp = _PyInterpreterState_GET();
605
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
606
return -1;
607
if (!PyCallable_Check(error)) {
608
PyErr_SetString(PyExc_TypeError, "handler must be callable");
609
return -1;
610
}
611
return PyDict_SetItemString(interp->codec_error_registry,
612
name, error);
613
}
614
615
/* Lookup the error handling callback function registered under the
616
name error. As a special case NULL can be passed, in which case
617
the error handling callback for strict encoding will be returned. */
618
PyObject *PyCodec_LookupError(const char *name)
619
{
620
PyObject *handler = NULL;
621
622
PyInterpreterState *interp = _PyInterpreterState_GET();
623
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
624
return NULL;
625
626
if (name==NULL)
627
name = "strict";
628
handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
629
if (handler) {
630
Py_INCREF(handler);
631
}
632
else if (!PyErr_Occurred()) {
633
PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
634
}
635
return handler;
636
}
637
638
/* Raise TypeError reporting that an error callback received an exception
   object of a type it cannot handle. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
644
645
/* "strict" handler: re-raise exc if it is an exception instance, or raise
   TypeError otherwise.  Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }

    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
653
654
655
/* "ignore" handler: drop the offending range by returning an empty string
   together with the end position of the error.

   Accepts UnicodeEncodeError, UnicodeDecodeError and UnicodeTranslateError
   instances; any other object raises TypeError. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
677
678
679
/* "replace" handler.

   - UnicodeEncodeError: returns one '?' per position in [start, end).
   - UnicodeDecodeError: returns a single Py_UNICODE_REPLACEMENT_CHARACTER.
   - UnicodeTranslateError: returns one Py_UNICODE_REPLACEMENT_CHARACTER
     per position in [start, end).

   The result is a (replacement, end) tuple.  Any other object raises
   TypeError. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }

        const Py_ssize_t count = end - start;
        PyObject *repl = PyUnicode_New(count, '?');
        if (repl == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(repl) == PyUnicode_1BYTE_KIND);

        Py_UCS1 *data = PyUnicode_1BYTE_DATA(repl);
        for (Py_ssize_t k = 0; k < count; k++) {
            data[k] = '?';
        }
        assert(_PyUnicode_CheckConsistency(repl, 1));
        return Py_BuildValue("(Nn)", repl, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }

        const Py_ssize_t count = end - start;
        PyObject *repl = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (repl == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(repl) == PyUnicode_2BYTE_KIND);

        Py_UCS2 *data = PyUnicode_2BYTE_DATA(repl);
        for (Py_ssize_t k = 0; k < count; k++) {
            data[k] = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(repl, 1));
        return Py_BuildValue("(Nn)", repl, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
731
732
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
733
{
734
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
735
PyObject *restuple;
736
PyObject *object;
737
Py_ssize_t i;
738
Py_ssize_t start;
739
Py_ssize_t end;
740
PyObject *res;
741
Py_UCS1 *outp;
742
Py_ssize_t ressize;
743
Py_UCS4 ch;
744
if (PyUnicodeEncodeError_GetStart(exc, &start))
745
return NULL;
746
if (PyUnicodeEncodeError_GetEnd(exc, &end))
747
return NULL;
748
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
749
return NULL;
750
if (end - start > PY_SSIZE_T_MAX / (2+7+1))
751
end = start + PY_SSIZE_T_MAX / (2+7+1);
752
for (i = start, ressize = 0; i < end; ++i) {
753
/* object is guaranteed to be "ready" */
754
ch = PyUnicode_READ_CHAR(object, i);
755
if (ch<10)
756
ressize += 2+1+1;
757
else if (ch<100)
758
ressize += 2+2+1;
759
else if (ch<1000)
760
ressize += 2+3+1;
761
else if (ch<10000)
762
ressize += 2+4+1;
763
else if (ch<100000)
764
ressize += 2+5+1;
765
else if (ch<1000000)
766
ressize += 2+6+1;
767
else
768
ressize += 2+7+1;
769
}
770
/* allocate replacement */
771
res = PyUnicode_New(ressize, 127);
772
if (res == NULL) {
773
Py_DECREF(object);
774
return NULL;
775
}
776
outp = PyUnicode_1BYTE_DATA(res);
777
/* generate replacement */
778
for (i = start; i < end; ++i) {
779
int digits;
780
int base;
781
ch = PyUnicode_READ_CHAR(object, i);
782
*outp++ = '&';
783
*outp++ = '#';
784
if (ch<10) {
785
digits = 1;
786
base = 1;
787
}
788
else if (ch<100) {
789
digits = 2;
790
base = 10;
791
}
792
else if (ch<1000) {
793
digits = 3;
794
base = 100;
795
}
796
else if (ch<10000) {
797
digits = 4;
798
base = 1000;
799
}
800
else if (ch<100000) {
801
digits = 5;
802
base = 10000;
803
}
804
else if (ch<1000000) {
805
digits = 6;
806
base = 100000;
807
}
808
else {
809
digits = 7;
810
base = 1000000;
811
}
812
while (digits-->0) {
813
*outp++ = '0' + ch/base;
814
ch %= base;
815
base /= 10;
816
}
817
*outp++ = ';';
818
}
819
assert(_PyUnicode_CheckConsistency(res, 1));
820
restuple = Py_BuildValue("(Nn)", res, end);
821
Py_DECREF(object);
822
return restuple;
823
}
824
else {
825
wrong_exception_type(exc);
826
return NULL;
827
}
828
}
829
830
/* "backslashreplace" handler: replace the offending range with backslash
   escape sequences.

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every code point in
     [start, end) becomes "\xNN", "\uNNNN" or "\UNNNNNNNN", depending on
     its magnitude.

   Returns a (replacement, end) tuple, or NULL with an exception set.  Any
   other exception type raises TypeError.  For the encode/translate case,
   very long ranges are truncated so that the size computation cannot
   overflow; the returned end reflects the truncated range. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t rather than int: the range is only capped at
       PY_SSIZE_T_MAX / 10 code points below, so the accumulated size can
       exceed INT_MAX, and an overflowed value would undersize the buffer
       that the second loop writes into. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Every byte expands to exactly four characters: \xNN */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            const unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Each code point expands to at most '\\' + 'U' + 8 hex digits. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: write the escapes.  Wider forms emit their leading hex
       digits first; the two lowest digits are shared by all three forms. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
936
937
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
938
939
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
940
{
941
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
942
PyObject *restuple;
943
PyObject *object;
944
Py_ssize_t i;
945
Py_ssize_t start;
946
Py_ssize_t end;
947
PyObject *res;
948
Py_UCS1 *outp;
949
Py_ssize_t ressize;
950
int replsize;
951
Py_UCS4 c;
952
char buffer[256]; /* NAME_MAXLEN */
953
if (PyUnicodeEncodeError_GetStart(exc, &start))
954
return NULL;
955
if (PyUnicodeEncodeError_GetEnd(exc, &end))
956
return NULL;
957
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
958
return NULL;
959
if (!ucnhash_capi) {
960
/* load the unicode data module */
961
ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
962
PyUnicodeData_CAPSULE_NAME, 1);
963
if (!ucnhash_capi) {
964
return NULL;
965
}
966
}
967
for (i = start, ressize = 0; i < end; ++i) {
968
/* object is guaranteed to be "ready" */
969
c = PyUnicode_READ_CHAR(object, i);
970
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
971
replsize = 1+1+1+(int)strlen(buffer)+1;
972
}
973
else if (c >= 0x10000) {
974
replsize = 1+1+8;
975
}
976
else if (c >= 0x100) {
977
replsize = 1+1+4;
978
}
979
else
980
replsize = 1+1+2;
981
if (ressize > PY_SSIZE_T_MAX - replsize)
982
break;
983
ressize += replsize;
984
}
985
end = i;
986
res = PyUnicode_New(ressize, 127);
987
if (res==NULL)
988
return NULL;
989
for (i = start, outp = PyUnicode_1BYTE_DATA(res);
990
i < end; ++i) {
991
c = PyUnicode_READ_CHAR(object, i);
992
*outp++ = '\\';
993
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
994
*outp++ = 'N';
995
*outp++ = '{';
996
strcpy((char *)outp, buffer);
997
outp += strlen(buffer);
998
*outp++ = '}';
999
continue;
1000
}
1001
if (c >= 0x00010000) {
1002
*outp++ = 'U';
1003
*outp++ = Py_hexdigits[(c>>28)&0xf];
1004
*outp++ = Py_hexdigits[(c>>24)&0xf];
1005
*outp++ = Py_hexdigits[(c>>20)&0xf];
1006
*outp++ = Py_hexdigits[(c>>16)&0xf];
1007
*outp++ = Py_hexdigits[(c>>12)&0xf];
1008
*outp++ = Py_hexdigits[(c>>8)&0xf];
1009
}
1010
else if (c >= 0x100) {
1011
*outp++ = 'u';
1012
*outp++ = Py_hexdigits[(c>>12)&0xf];
1013
*outp++ = Py_hexdigits[(c>>8)&0xf];
1014
}
1015
else
1016
*outp++ = 'x';
1017
*outp++ = Py_hexdigits[(c>>4)&0xf];
1018
*outp++ = Py_hexdigits[c&0xf];
1019
}
1020
1021
assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1022
assert(_PyUnicode_CheckConsistency(res, 1));
1023
restuple = Py_BuildValue("(Nn)", res, end);
1024
Py_DECREF(object);
1025
return restuple;
1026
}
1027
else {
1028
wrong_exception_type(exc);
1029
return NULL;
1030
}
1031
}
1032
1033
/* Result codes of get_standard_encoding(). */
#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
#define ENC_UTF16LE 2
#define ENC_UTF32BE 3
#define ENC_UTF32LE 4

/* Classify the text that follows "16" or "32" in a UTF encoding name.

   Returns 0 for an empty suffix (native byte order), 'b' or 'l' for an
   optional '-' or '_' followed by "be" or "le" (case-insensitive), and -1
   for anything else. */
static int
utf_byteorder_suffix(const char *suffix)
{
    if (*suffix == '\0') {
        return 0;
    }
    if (*suffix == '-' || *suffix == '_') {
        suffix++;
    }
    /* Test suffix[0] first: for a name that ends right after the
       separator (e.g. "utf-16-"), suffix[0] is the terminating NUL and
       suffix[1] would lie past the end of the string. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        const int order = Py_TOLOWER(suffix[0]);
        if (order == 'b' || order == 'l') {
            return order;
        }
    }
    return -1;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognised are "utf" (case-insensitive), an optional '-' or '_', then
   "8", "16" or "32", the latter two optionally followed by a byte-order
   suffix; and the exact name "CP_UTF8".  Without a byte-order suffix, the
   16 and 32 bit variants resolve according to WORDS_BIGENDIAN.

   On a match, *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for
   UTF-32.  It may also be written when ENC_UNKNOWN is returned, so
   callers must not rely on it in that case. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            switch (utf_byteorder_suffix(encoding + 2)) {
            case 0:
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            case 'b':
                return ENC_UTF16BE;
            case 'l':
                return ENC_UTF16LE;
            default:
                break;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            switch (utf_byteorder_suffix(encoding + 2)) {
            case 0:
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            case 'b':
                return ENC_UTF32BE;
            case 'l':
                return ENC_UTF32LE;
            default:
                break;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1098
1099
/* This handler is declared static until someone demonstrates
1100
a need to call it directly. */
1101
static PyObject *
1102
PyCodec_SurrogatePassErrors(PyObject *exc)
1103
{
1104
PyObject *restuple;
1105
PyObject *object;
1106
PyObject *encode;
1107
const char *encoding;
1108
int code;
1109
int bytelength;
1110
Py_ssize_t i;
1111
Py_ssize_t start;
1112
Py_ssize_t end;
1113
PyObject *res;
1114
1115
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1116
unsigned char *outp;
1117
if (PyUnicodeEncodeError_GetStart(exc, &start))
1118
return NULL;
1119
if (PyUnicodeEncodeError_GetEnd(exc, &end))
1120
return NULL;
1121
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1122
return NULL;
1123
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1124
Py_DECREF(object);
1125
return NULL;
1126
}
1127
if (!(encoding = PyUnicode_AsUTF8(encode))) {
1128
Py_DECREF(object);
1129
Py_DECREF(encode);
1130
return NULL;
1131
}
1132
code = get_standard_encoding(encoding, &bytelength);
1133
Py_DECREF(encode);
1134
if (code == ENC_UNKNOWN) {
1135
/* Not supported, fail with original exception */
1136
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1137
Py_DECREF(object);
1138
return NULL;
1139
}
1140
1141
if (end - start > PY_SSIZE_T_MAX / bytelength)
1142
end = start + PY_SSIZE_T_MAX / bytelength;
1143
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1144
if (!res) {
1145
Py_DECREF(object);
1146
return NULL;
1147
}
1148
outp = (unsigned char*)PyBytes_AsString(res);
1149
for (i = start; i < end; i++) {
1150
/* object is guaranteed to be "ready" */
1151
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1152
if (!Py_UNICODE_IS_SURROGATE(ch)) {
1153
/* Not a surrogate, fail with original exception */
1154
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1155
Py_DECREF(res);
1156
Py_DECREF(object);
1157
return NULL;
1158
}
1159
switch (code) {
1160
case ENC_UTF8:
1161
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
1162
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1163
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1164
break;
1165
case ENC_UTF16LE:
1166
*outp++ = (unsigned char) ch;
1167
*outp++ = (unsigned char)(ch >> 8);
1168
break;
1169
case ENC_UTF16BE:
1170
*outp++ = (unsigned char)(ch >> 8);
1171
*outp++ = (unsigned char) ch;
1172
break;
1173
case ENC_UTF32LE:
1174
*outp++ = (unsigned char) ch;
1175
*outp++ = (unsigned char)(ch >> 8);
1176
*outp++ = (unsigned char)(ch >> 16);
1177
*outp++ = (unsigned char)(ch >> 24);
1178
break;
1179
case ENC_UTF32BE:
1180
*outp++ = (unsigned char)(ch >> 24);
1181
*outp++ = (unsigned char)(ch >> 16);
1182
*outp++ = (unsigned char)(ch >> 8);
1183
*outp++ = (unsigned char) ch;
1184
break;
1185
}
1186
}
1187
restuple = Py_BuildValue("(On)", res, end);
1188
Py_DECREF(res);
1189
Py_DECREF(object);
1190
return restuple;
1191
}
1192
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1193
const unsigned char *p;
1194
Py_UCS4 ch = 0;
1195
if (PyUnicodeDecodeError_GetStart(exc, &start))
1196
return NULL;
1197
if (PyUnicodeDecodeError_GetEnd(exc, &end))
1198
return NULL;
1199
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1200
return NULL;
1201
p = (const unsigned char*)PyBytes_AS_STRING(object);
1202
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1203
Py_DECREF(object);
1204
return NULL;
1205
}
1206
if (!(encoding = PyUnicode_AsUTF8(encode))) {
1207
Py_DECREF(object);
1208
Py_DECREF(encode);
1209
return NULL;
1210
}
1211
code = get_standard_encoding(encoding, &bytelength);
1212
Py_DECREF(encode);
1213
if (code == ENC_UNKNOWN) {
1214
/* Not supported, fail with original exception */
1215
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1216
Py_DECREF(object);
1217
return NULL;
1218
}
1219
1220
/* Try decoding a single surrogate character. If
1221
there are more, let the codec call us again. */
1222
p += start;
1223
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1224
switch (code) {
1225
case ENC_UTF8:
1226
if ((p[0] & 0xf0) == 0xe0 &&
1227
(p[1] & 0xc0) == 0x80 &&
1228
(p[2] & 0xc0) == 0x80) {
1229
/* it's a three-byte code */
1230
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1231
}
1232
break;
1233
case ENC_UTF16LE:
1234
ch = p[1] << 8 | p[0];
1235
break;
1236
case ENC_UTF16BE:
1237
ch = p[0] << 8 | p[1];
1238
break;
1239
case ENC_UTF32LE:
1240
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1241
break;
1242
case ENC_UTF32BE:
1243
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1244
break;
1245
}
1246
}
1247
1248
Py_DECREF(object);
1249
if (!Py_UNICODE_IS_SURROGATE(ch)) {
1250
/* it's not a surrogate - fail */
1251
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1252
return NULL;
1253
}
1254
res = PyUnicode_FromOrdinal(ch);
1255
if (res == NULL)
1256
return NULL;
1257
return Py_BuildValue("(Nn)", res, start + bytelength);
1258
}
1259
else {
1260
wrong_exception_type(exc);
1261
return NULL;
1262
}
1263
}
1264
1265
/* Error handler callback implementing "surrogateescape".

   Encoding (exc is a UnicodeEncodeError): every character in the range
   [start, end) of the offending string must lie in U+DC80..U+DCFF; each
   one is turned back into the single byte (ch - 0xdc00).  The result is
   the tuple (bytes, end).  If any character is outside that range the
   original exception is re-raised.

   Decoding (exc is a UnicodeDecodeError): up to four bytes starting at
   `start` (and not going past `end`) are mapped to the code points
   0xdc00 + byte.  Scanning stops at the first byte below 128; if that
   happens on the very first byte the original exception is re-raised.
   The result is the tuple (str, start + number_of_bytes_consumed).

   Any other exception type is reported through wrong_exception_type().

   Returns a new reference, or NULL with an exception set. */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *result = NULL;
        PyObject *bytes;
        char *buf;
        Py_ssize_t pos;

        if (PyUnicodeEncodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL) {
            return NULL;
        }
        /* One output byte per escaped character. */
        bytes = PyBytes_FromStringAndSize(NULL, end - start);
        if (bytes == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        buf = PyBytes_AsString(bytes);
        for (pos = start; pos < end; pos++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 cp = PyUnicode_READ_CHAR(object, pos);
            if (cp < 0xdc80 || cp > 0xdcff) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                goto encode_done;
            }
            buf[pos - start] = cp - 0xdc00;
        }
        result = Py_BuildValue("(On)", bytes, end);

    encode_done:
        /* "O" took its own reference, so ours is dropped on every path. */
        Py_DECREF(bytes);
        Py_DECREF(object);
        return result;
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        PyObject *text;
        const unsigned char *data;
        Py_UCS2 escaped[4];  /* decode up to 4 bad bytes. */
        int count;

        if (PyUnicodeDecodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL) {
            return NULL;
        }
        data = (const unsigned char*)PyBytes_AS_STRING(object);
        for (count = 0; count < 4 && count < end - start; count++) {
            unsigned char byte = data[start + count];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128) {
                break;
            }
            escaped[count] = 0xdc00 + byte;
        }
        Py_DECREF(object);
        if (count == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        text = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, escaped, count);
        if (text == NULL) {
            return NULL;
        }
        /* "N" hands our reference to `text` over to the tuple. */
        return Py_BuildValue("(Nn)", text, start + count);
    }

    wrong_exception_type(exc);
    return NULL;
}
1341
1342
1343
/* METH_O entry point registered under the name "strict": hands the
   exception object straight to PyCodec_StrictErrors().  `self` is unused. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_StrictErrors(exc);
}
1347
1348
1349
/* METH_O entry point registered under the name "ignore": hands the
   exception object straight to PyCodec_IgnoreErrors().  `self` is unused. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_IgnoreErrors(exc);
}
1353
1354
1355
/* METH_O entry point registered under the name "replace": hands the
   exception object straight to PyCodec_ReplaceErrors().  `self` is unused. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_ReplaceErrors(exc);
}
1359
1360
1361
/* METH_O entry point registered under the name "xmlcharrefreplace": hands
   the exception object straight to PyCodec_XMLCharRefReplaceErrors().
   `self` is unused. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1365
1366
1367
/* METH_O entry point registered under the name "backslashreplace": hands
   the exception object straight to PyCodec_BackslashReplaceErrors().
   `self` is unused. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_BackslashReplaceErrors(exc);
}
1371
1372
/* METH_O entry point registered under the name "namereplace": hands the
   exception object straight to PyCodec_NameReplaceErrors().  `self` is
   unused. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_NameReplaceErrors(exc);
}
1376
1377
/* METH_O entry point registered under the name "surrogatepass": hands the
   exception object straight to PyCodec_SurrogatePassErrors().  `self` is
   unused. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogatePassErrors(exc);
}
1381
1382
/* METH_O entry point registered under the name "surrogateescape": hands
   the exception object straight to PyCodec_SurrogateEscapeErrors().
   `self` is unused. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogateEscapeErrors(exc);
}
1386
1387
static int _PyCodecRegistry_Init(void)
1388
{
1389
static struct {
1390
const char *name;
1391
PyMethodDef def;
1392
} methods[] =
1393
{
1394
{
1395
"strict",
1396
{
1397
"strict_errors",
1398
strict_errors,
1399
METH_O,
1400
PyDoc_STR("Implements the 'strict' error handling, which "
1401
"raises a UnicodeError on coding errors.")
1402
}
1403
},
1404
{
1405
"ignore",
1406
{
1407
"ignore_errors",
1408
ignore_errors,
1409
METH_O,
1410
PyDoc_STR("Implements the 'ignore' error handling, which "
1411
"ignores malformed data and continues.")
1412
}
1413
},
1414
{
1415
"replace",
1416
{
1417
"replace_errors",
1418
replace_errors,
1419
METH_O,
1420
PyDoc_STR("Implements the 'replace' error handling, which "
1421
"replaces malformed data with a replacement marker.")
1422
}
1423
},
1424
{
1425
"xmlcharrefreplace",
1426
{
1427
"xmlcharrefreplace_errors",
1428
xmlcharrefreplace_errors,
1429
METH_O,
1430
PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1431
"which replaces an unencodable character with the "
1432
"appropriate XML character reference.")
1433
}
1434
},
1435
{
1436
"backslashreplace",
1437
{
1438
"backslashreplace_errors",
1439
backslashreplace_errors,
1440
METH_O,
1441
PyDoc_STR("Implements the 'backslashreplace' error handling, "
1442
"which replaces malformed data with a backslashed "
1443
"escape sequence.")
1444
}
1445
},
1446
{
1447
"namereplace",
1448
{
1449
"namereplace_errors",
1450
namereplace_errors,
1451
METH_O,
1452
PyDoc_STR("Implements the 'namereplace' error handling, "
1453
"which replaces an unencodable character with a "
1454
"\\N{...} escape sequence.")
1455
}
1456
},
1457
{
1458
"surrogatepass",
1459
{
1460
"surrogatepass",
1461
surrogatepass_errors,
1462
METH_O
1463
}
1464
},
1465
{
1466
"surrogateescape",
1467
{
1468
"surrogateescape",
1469
surrogateescape_errors,
1470
METH_O
1471
}
1472
}
1473
};
1474
1475
PyInterpreterState *interp = _PyInterpreterState_GET();
1476
PyObject *mod;
1477
1478
if (interp->codec_search_path != NULL)
1479
return 0;
1480
1481
interp->codec_search_path = PyList_New(0);
1482
if (interp->codec_search_path == NULL) {
1483
return -1;
1484
}
1485
1486
interp->codec_search_cache = PyDict_New();
1487
if (interp->codec_search_cache == NULL) {
1488
return -1;
1489
}
1490
1491
interp->codec_error_registry = PyDict_New();
1492
if (interp->codec_error_registry == NULL) {
1493
return -1;
1494
}
1495
1496
for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1497
PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1498
if (!func) {
1499
return -1;
1500
}
1501
1502
int res = PyCodec_RegisterError(methods[i].name, func);
1503
Py_DECREF(func);
1504
if (res) {
1505
return -1;
1506
}
1507
}
1508
1509
mod = PyImport_ImportModule("encodings");
1510
if (mod == NULL) {
1511
return -1;
1512
}
1513
Py_DECREF(mod);
1514
interp->codecs_initialized = 1;
1515
return 0;
1516
}
1517
1518