CoCalc -- codecs.c

GitHub Repository: allendowney/cpython
Path: blob/main/Python/codecs.c
¹² views
1
/* ------------------------------------------------------------------------
2

3
   Python Codec Registry and support functions
4

5
Written by Marc-Andre Lemburg ([email protected]).
6

7
Copyright (c) Corporation for National Research Initiatives.
8

9
   ------------------------------------------------------------------------ */
10

11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"       // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
17
#include <ctype.h>
18

19
/* Lowercase hexadecimal digits, indexed by nibble value (0..15).  Used by
   the backslashreplace error handler in this file to format escapes. */
const char *Py_hexdigits = "0123456789abcdef";
20

21
/* --- Codec Registry ----------------------------------------------------- */
22

23
/* Import the standard encodings package which will register the first
24
   codec search function.
25

26
   This is done in a lazy way so that the Unicode implementation does
27
   not downgrade startup time of scripts not needing it.
28

29
   ImportErrors are silently ignored by this function. Only one try is
30
   made.
31

32
*/
33

34
/* Forward declaration of the registry initializer.  Callers in this file
   invoke it when interp->codec_search_path is still NULL and treat a
   non-zero return value as failure. */
static int _PyCodecRegistry_Init(void); /* Forward */
35

36
/* Append search_function to the current interpreter's codec search path.
   The registry is initialized first if it does not exist yet.
   Returns 0 on success and -1 on failure. */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    /* Make sure the search path list exists before validating the argument. */
    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init()) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(interp->codec_search_path, search_function);
}
54

55
int
56
PyCodec_Unregister(PyObject *search_function)
57
{
58
    PyInterpreterState *interp = PyInterpreterState_Get();
59
    PyObject *codec_search_path = interp->codec_search_path;
60
    /* Do nothing if codec_search_path is not created yet or was cleared. */
61
    if (codec_search_path == NULL) {
62
        return 0;
63
    }
64

65
    assert(PyList_CheckExact(codec_search_path));
66
    Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
67
    for (Py_ssize_t i = 0; i < n; i++) {
68
        PyObject *item = PyList_GET_ITEM(codec_search_path, i);
69
        if (item == search_function) {
70
            if (interp->codec_search_cache != NULL) {
71
                assert(PyDict_CheckExact(interp->codec_search_cache));
72
                PyDict_Clear(interp->codec_search_cache);
73
            }
74
            return PyList_SetSlice(codec_search_path, i, i+1, NULL);
75
        }
76
    }
77
    return 0;
78
}
79

80
/* Declared here, defined outside this file.  normalizestring() passes the
   source string, an output buffer and the buffer size, and treats a zero
   return value as failure. */
extern int _Py_normalize_encoding(const char *, char *, size_t);
81

82
/* Convert a string to a normalized Python string (decoded from UTF-8):
   all characters are converted to lower case, spaces and hyphens are
   replaced with underscores. */
84

85
static
86
PyObject *normalizestring(const char *string)
87
{
88
    size_t len = strlen(string);
89
    char *encoding;
90
    PyObject *v;
91

92
    if (len > PY_SSIZE_T_MAX) {
93
        PyErr_SetString(PyExc_OverflowError, "string is too large");
94
        return NULL;
95
    }
96

97
    encoding = PyMem_Malloc(len + 1);
98
    if (encoding == NULL)
99
        return PyErr_NoMemory();
100

101
    if (!_Py_normalize_encoding(string, encoding, len + 1))
102
    {
103
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
104
        PyMem_Free(encoding);
105
        return NULL;
106
    }
107

108
    v = PyUnicode_FromString(encoding);
109
    PyMem_Free(encoding);
110
    return v;
111
}
112

113
/* Lookup the given encoding and return a tuple providing the codec
114
   facilities.
115

116
   The encoding string is looked up converted to all lower-case
117
   characters. This makes encodings looked up through this mechanism
118
   effectively case-insensitive.
119

120
   If no codec is found, a LookupError is set and NULL returned.
121

122
   As side effect, this tries to load the encodings package, if not
123
   yet done. This is part of the lazy load strategy for the encodings
124
   package.
125

126
*/
127

128
PyObject *_PyCodec_Lookup(const char *encoding)
129
{
130
    if (encoding == NULL) {
131
        PyErr_BadArgument();
132
        return NULL;
133
    }
134

135
    PyInterpreterState *interp = _PyInterpreterState_GET();
136
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
137
        return NULL;
138
    }
139

140
    /* Convert the encoding to a normalized Python string: all
141
       characters are converted to lower case, spaces and hyphens are
142
       replaced with underscores. */
143
    PyObject *v = normalizestring(encoding);
144
    if (v == NULL) {
145
        return NULL;
146
    }
147
    PyUnicode_InternInPlace(&v);
148

149
    /* First, try to lookup the name in the registry dictionary */
150
    PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
151
    if (result != NULL) {
152
        Py_INCREF(result);
153
        Py_DECREF(v);
154
        return result;
155
    }
156
    else if (PyErr_Occurred()) {
157
        goto onError;
158
    }
159

160
    /* Next, scan the search functions in order of registration */
161
    const Py_ssize_t len = PyList_Size(interp->codec_search_path);
162
    if (len < 0)
163
        goto onError;
164
    if (len == 0) {
165
        PyErr_SetString(PyExc_LookupError,
166
                        "no codec search functions registered: "
167
                        "can't find encoding");
168
        goto onError;
169
    }
170

171
    Py_ssize_t i;
172
    for (i = 0; i < len; i++) {
173
        PyObject *func;
174

175
        func = PyList_GetItem(interp->codec_search_path, i);
176
        if (func == NULL)
177
            goto onError;
178
        result = PyObject_CallOneArg(func, v);
179
        if (result == NULL)
180
            goto onError;
181
        if (result == Py_None) {
182
            Py_DECREF(result);
183
            continue;
184
        }
185
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
186
            PyErr_SetString(PyExc_TypeError,
187
                            "codec search functions must return 4-tuples");
188
            Py_DECREF(result);
189
            goto onError;
190
        }
191
        break;
192
    }
193
    if (i == len) {
194
        /* XXX Perhaps we should cache misses too ? */
195
        PyErr_Format(PyExc_LookupError,
196
                     "unknown encoding: %s", encoding);
197
        goto onError;
198
    }
199

200
    /* Cache and return the result */
201
    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
202
        Py_DECREF(result);
203
        goto onError;
204
    }
205
    Py_DECREF(v);
206
    return result;
207

208
 onError:
209
    Py_DECREF(v);
210
    return NULL;
211
}
212

213
/* Codec registry encoding check API. */
214

215
int PyCodec_KnownEncoding(const char *encoding)
216
{
217
    PyObject *codecs;
218

219
    codecs = _PyCodec_Lookup(encoding);
220
    if (!codecs) {
221
        PyErr_Clear();
222
        return 0;
223
    }
224
    else {
225
        Py_DECREF(codecs);
226
        return 1;
227
    }
228
}
229

230
static
231
PyObject *args_tuple(PyObject *object,
232
                     const char *errors)
233
{
234
    PyObject *args;
235

236
    args = PyTuple_New(1 + (errors != NULL));
237
    if (args == NULL)
238
        return NULL;
239
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
240
    if (errors) {
241
        PyObject *v;
242

243
        v = PyUnicode_FromString(errors);
244
        if (v == NULL) {
245
            Py_DECREF(args);
246
            return NULL;
247
        }
248
        PyTuple_SET_ITEM(args, 1, v);
249
    }
250
    return args;
251
}
252

253
/* Helper function to get a codec item */
254

255
static
256
PyObject *codec_getitem(const char *encoding, int index)
257
{
258
    PyObject *codecs;
259
    PyObject *v;
260

261
    codecs = _PyCodec_Lookup(encoding);
262
    if (codecs == NULL)
263
        return NULL;
264
    v = PyTuple_GET_ITEM(codecs, index);
265
    Py_DECREF(codecs);
266
    return Py_NewRef(v);
267
}
268

269
/* Helper functions to create an incremental codec. */
270
static
271
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
272
                                     const char *errors,
273
                                     const char *attrname)
274
{
275
    PyObject *ret, *inccodec;
276

277
    inccodec = PyObject_GetAttrString(codec_info, attrname);
278
    if (inccodec == NULL)
279
        return NULL;
280
    if (errors)
281
        ret = PyObject_CallFunction(inccodec, "s", errors);
282
    else
283
        ret = _PyObject_CallNoArgs(inccodec);
284
    Py_DECREF(inccodec);
285
    return ret;
286
}
287

288
static
289
PyObject *codec_getincrementalcodec(const char *encoding,
290
                                    const char *errors,
291
                                    const char *attrname)
292
{
293
    PyObject *codec_info, *ret;
294

295
    codec_info = _PyCodec_Lookup(encoding);
296
    if (codec_info == NULL)
297
        return NULL;
298
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
299
    Py_DECREF(codec_info);
300
    return ret;
301
}
302

303
/* Helper function to create a stream codec. */
304

305
static
306
PyObject *codec_getstreamcodec(const char *encoding,
307
                               PyObject *stream,
308
                               const char *errors,
309
                               const int index)
310
{
311
    PyObject *codecs, *streamcodec, *codeccls;
312

313
    codecs = _PyCodec_Lookup(encoding);
314
    if (codecs == NULL)
315
        return NULL;
316

317
    codeccls = PyTuple_GET_ITEM(codecs, index);
318
    if (errors != NULL)
319
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
320
    else
321
        streamcodec = PyObject_CallOneArg(codeccls, stream);
322
    Py_DECREF(codecs);
323
    return streamcodec;
324
}
325

326
/* Helpers to work with the result of _PyCodec_Lookup
327

328
 */
329
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, const char *errors)
{
    /* Attribute on codec_info that holds the decoder factory. */
    static const char attr[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, attr);
}
335

336
/* Instantiate the incremental encoder described by an already looked-up
   codec info object. */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, const char *errors)
{
    /* Attribute on codec_info that holds the encoder factory. */
    static const char attr[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, attr);
}
342

343

344
/* Convenience APIs to query the Codec registry.
345

346
   All APIs return a codec object with incremented refcount.
347

348
 */
349

350
/* Return item 0 of the codec tuple registered for encoding. */
PyObject *
PyCodec_Encoder(const char *encoding)
{
    enum { ENCODER_SLOT = 0 };

    return codec_getitem(encoding, ENCODER_SLOT);
}
354

355
/* Return item 1 of the codec tuple registered for encoding. */
PyObject *
PyCodec_Decoder(const char *encoding)
{
    enum { DECODER_SLOT = 1 };

    return codec_getitem(encoding, DECODER_SLOT);
}
359

360
/* Look up encoding and instantiate its "incrementalencoder" factory,
   passing errors when it is non-NULL. */
PyObject *
PyCodec_IncrementalEncoder(const char *encoding, const char *errors)
{
    static const char attr[] = "incrementalencoder";

    return codec_getincrementalcodec(encoding, errors, attr);
}
365

366
/* Look up encoding and instantiate its "incrementaldecoder" factory,
   passing errors when it is non-NULL. */
PyObject *
PyCodec_IncrementalDecoder(const char *encoding, const char *errors)
{
    static const char attr[] = "incrementaldecoder";

    return codec_getincrementalcodec(encoding, errors, attr);
}
371

372
/* Call item 2 of the codec tuple for encoding with stream (and errors,
   when non-NULL) and return the result. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    enum { STREAMREADER_SLOT = 2 };

    return codec_getstreamcodec(encoding, stream, errors, STREAMREADER_SLOT);
}
378

379
/* Call item 3 of the codec tuple for encoding with stream (and errors,
   when non-NULL) and return the result. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    enum { STREAMWRITER_SLOT = 3 };

    return codec_getstreamcodec(encoding, stream, errors, STREAMWRITER_SLOT);
}
385

386
/* Encode an object (e.g. a Unicode object) using the given encoding
387
   and return the resulting encoded object (usually a Python string).
388

389
   errors is passed to the encoder factory as argument if non-NULL. */
390

391
static PyObject *
_PyCodec_EncodeInternal(PyObject *object,
                        PyObject *encoder,
                        const char *encoding,
                        const char *errors)
{
    /* Note: the reference to `encoder` is released on every path. */
    PyObject *encoded = NULL;
    PyObject *outcome = NULL;

    PyObject *call_args = args_tuple(object, errors);
    if (call_args == NULL) {
        goto done;
    }

    outcome = PyObject_Call(encoder, call_args, NULL);
    if (outcome == NULL) {
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
        goto done;
    }

    if (PyTuple_Check(outcome) && PyTuple_GET_SIZE(outcome) == 2) {
        /* Only the first item is used; the second (integer) entry is
           neither checked nor returned. */
        encoded = Py_NewRef(PyTuple_GET_ITEM(outcome, 0));
    }
    else {
        PyErr_SetString(PyExc_TypeError,
                        "encoder must return a tuple (object, integer)");
    }

 done:
    Py_XDECREF(call_args);
    Py_XDECREF(encoder);
    Py_XDECREF(outcome);
    return encoded;
}
430

431
/* Decode an object (usually a Python string) using the given encoding
432
   and return an equivalent object (e.g. a Unicode object).
433

434
   errors is passed to the decoder factory as argument if non-NULL. */
435

436
static PyObject *
_PyCodec_DecodeInternal(PyObject *object,
                        PyObject *decoder,
                        const char *encoding,
                        const char *errors)
{
    /* Note: the reference to `decoder` is released on every path. */
    PyObject *decoded = NULL;
    PyObject *outcome = NULL;

    PyObject *call_args = args_tuple(object, errors);
    if (call_args == NULL) {
        goto done;
    }

    outcome = PyObject_Call(decoder, call_args, NULL);
    if (outcome == NULL) {
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
        goto done;
    }

    if (PyTuple_Check(outcome) && PyTuple_GET_SIZE(outcome) == 2) {
        /* Only the first item is used; the second (integer) entry is
           neither checked nor returned. */
        decoded = Py_NewRef(PyTuple_GET_ITEM(outcome, 0));
    }
    else {
        PyErr_SetString(PyExc_TypeError,
                        "decoder must return a tuple (object,integer)");
    }

 done:
    Py_XDECREF(call_args);
    Py_XDECREF(decoder);
    Py_XDECREF(outcome);
    return decoded;
}
474

475
/* Generic encoding/decoding API */
476
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (encoder == NULL) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() releases the encoder reference for us. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
488

489
/* Decode object with the decoder registered for encoding; errors is
   forwarded to the decoder call when non-NULL. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (decoder == NULL) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() releases the decoder reference for us. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
501

502
/* Text encoding/decoding API */
503
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
504
                                       const char *alternate_command)
505
{
506
    PyObject *codec;
507
    PyObject *attr;
508
    int is_text_codec;
509

510
    codec = _PyCodec_Lookup(encoding);
511
    if (codec == NULL)
512
        return NULL;
513

514
    /* Backwards compatibility: assume any raw tuple describes a text
515
     * encoding, and the same for anything lacking the private
516
     * attribute.
517
     */
518
    if (!PyTuple_CheckExact(codec)) {
519
        if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
520
            Py_DECREF(codec);
521
            return NULL;
522
        }
523
        if (attr != NULL) {
524
            is_text_codec = PyObject_IsTrue(attr);
525
            Py_DECREF(attr);
526
            if (is_text_codec <= 0) {
527
                Py_DECREF(codec);
528
                if (!is_text_codec)
529
                    PyErr_Format(PyExc_LookupError,
530
                                 "'%.400s' is not a text encoding; "
531
                                 "use %s to handle arbitrary codecs",
532
                                 encoding, alternate_command);
533
                return NULL;
534
            }
535
        }
536
    }
537

538
    /* This appears to be a valid text encoding */
539
    return codec;
540
}
541

542

543
static
544
PyObject *codec_getitem_checked(const char *encoding,
545
                                const char *alternate_command,
546
                                int index)
547
{
548
    PyObject *codec;
549
    PyObject *v;
550

551
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
552
    if (codec == NULL)
553
        return NULL;
554

555
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
556
    Py_DECREF(codec);
557
    return v;
558
}
559

560
/* Return item 0 of the codec tuple for a text encoding; the error message
   for non-text codecs points the user at codecs.encode(). */
static PyObject *
_PyCodec_TextEncoder(const char *encoding)
{
    static const char alternative[] = "codecs.encode()";

    return codec_getitem_checked(encoding, alternative, 0);
}
564

565
/* Return item 1 of the codec tuple for a text encoding; the error message
   for non-text codecs points the user at codecs.decode(). */
static PyObject *
_PyCodec_TextDecoder(const char *encoding)
{
    static const char alternative[] = "codecs.decode()";

    return codec_getitem_checked(encoding, alternative, 1);
}
569

570
/* Encode object with encoding, which must not be flagged as a non-text
   codec; errors is forwarded to the encoder call when non-NULL. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (encoder == NULL) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() releases the encoder reference for us. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
582

583
/* Decode object with encoding, which must not be flagged as a non-text
   codec; errors is forwarded to the decoder call when non-NULL. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (decoder == NULL) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() releases the decoder reference for us. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
595

596
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and name is specified as the
   error parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error */
602
int PyCodec_RegisterError(const char *name, PyObject *error)
603
{
604
    PyInterpreterState *interp = _PyInterpreterState_GET();
605
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
606
        return -1;
607
    if (!PyCallable_Check(error)) {
608
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
609
        return -1;
610
    }
611
    return PyDict_SetItemString(interp->codec_error_registry,
612
                                name, error);
613
}
614

615
/* Lookup the error handling callback function registered under the
616
   name error. As a special case NULL can be passed, in which case
617
   the error handling callback for strict encoding will be returned. */
618
PyObject *PyCodec_LookupError(const char *name)
619
{
620
    PyObject *handler = NULL;
621

622
    PyInterpreterState *interp = _PyInterpreterState_GET();
623
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
624
        return NULL;
625

626
    if (name==NULL)
627
        name = "strict";
628
    handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
629
    if (handler) {
630
        Py_INCREF(handler);
631
    }
632
    else if (!PyErr_Occurred()) {
633
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
634
    }
635
    return handler;
636
}
637

638
/* Set a TypeError naming the type of exc; used by the error handlers in
   this file when they are handed an object they do not support. */
static void
wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
644

645
/* "strict" handler: set exc itself as the current error when it is an
   exception instance, otherwise set a TypeError.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
653

654

655
/* "ignore" handler: return (empty string, end) for the three supported
   Unicode error types; any other object yields a TypeError. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    /* Pick the end-position accessor that matches the exception type. */
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
677

678

679
/* "replace" handler.  Encode errors are replaced by one '?' per offending
   character, decode errors by a single U+FFFD, and translate errors by one
   U+FFFD per offending character.  Returns (replacement, end). */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)
            || PyUnicodeEncodeError_GetEnd(exc, &end))
        {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, '?');
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_1BYTE_KIND);
        /* One byte per character, so a plain byte fill is sufficient. */
        memset(PyUnicode_1BYTE_DATA(filler), '?', (size_t)count);
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start)
            || PyUnicodeTranslateError_GetEnd(exc, &end))
        {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_2BYTE_KIND);
        Py_UCS2 *cursor = PyUnicode_2BYTE_DATA(filler);
        for (Py_UCS2 *stop = cursor + count; cursor < stop; ++cursor) {
            *cursor = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
731

732
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
733
{
734
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
735
        PyObject *restuple;
736
        PyObject *object;
737
        Py_ssize_t i;
738
        Py_ssize_t start;
739
        Py_ssize_t end;
740
        PyObject *res;
741
        Py_UCS1 *outp;
742
        Py_ssize_t ressize;
743
        Py_UCS4 ch;
744
        if (PyUnicodeEncodeError_GetStart(exc, &start))
745
            return NULL;
746
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
747
            return NULL;
748
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
749
            return NULL;
750
        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
751
            end = start + PY_SSIZE_T_MAX / (2+7+1);
752
        for (i = start, ressize = 0; i < end; ++i) {
753
            /* object is guaranteed to be "ready" */
754
            ch = PyUnicode_READ_CHAR(object, i);
755
            if (ch<10)
756
                ressize += 2+1+1;
757
            else if (ch<100)
758
                ressize += 2+2+1;
759
            else if (ch<1000)
760
                ressize += 2+3+1;
761
            else if (ch<10000)
762
                ressize += 2+4+1;
763
            else if (ch<100000)
764
                ressize += 2+5+1;
765
            else if (ch<1000000)
766
                ressize += 2+6+1;
767
            else
768
                ressize += 2+7+1;
769
        }
770
        /* allocate replacement */
771
        res = PyUnicode_New(ressize, 127);
772
        if (res == NULL) {
773
            Py_DECREF(object);
774
            return NULL;
775
        }
776
        outp = PyUnicode_1BYTE_DATA(res);
777
        /* generate replacement */
778
        for (i = start; i < end; ++i) {
779
            int digits;
780
            int base;
781
            ch = PyUnicode_READ_CHAR(object, i);
782
            *outp++ = '&';
783
            *outp++ = '#';
784
            if (ch<10) {
785
                digits = 1;
786
                base = 1;
787
            }
788
            else if (ch<100) {
789
                digits = 2;
790
                base = 10;
791
            }
792
            else if (ch<1000) {
793
                digits = 3;
794
                base = 100;
795
            }
796
            else if (ch<10000) {
797
                digits = 4;
798
                base = 1000;
799
            }
800
            else if (ch<100000) {
801
                digits = 5;
802
                base = 10000;
803
            }
804
            else if (ch<1000000) {
805
                digits = 6;
806
                base = 100000;
807
            }
808
            else {
809
                digits = 7;
810
                base = 1000000;
811
            }
812
            while (digits-->0) {
813
                *outp++ = '0' + ch/base;
814
                ch %= base;
815
                base /= 10;
816
            }
817
            *outp++ = ';';
818
        }
819
        assert(_PyUnicode_CheckConsistency(res, 1));
820
        restuple = Py_BuildValue("(Nn)", res, end);
821
        Py_DECREF(object);
822
        return restuple;
823
    }
824
    else {
825
        wrong_exception_type(exc);
826
        return NULL;
827
    }
828
}
829

830
/* "backslashreplace" handler.

   For a UnicodeDecodeError every offending byte becomes "\xNN".  For a
   UnicodeEncodeError or UnicodeTranslateError every offending character
   becomes "\xNN", "\uNNNN" or "\UNNNNNNNN" depending on its code point.
   Returns (replacement, end); other exception types yield a TypeError.

   If the offending range is so large that the replacement could not be
   sized in a Py_ssize_t, the range is truncated and the returned end
   position reflects the truncation. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Py_ssize_t (not int): the total can be up to 10 output characters
       per input character, which overflows an int for large ranges. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        /* Each byte expands to 4 characters; clamp the range so that
           4 * (end - start) cannot overflow (mirrors the clamp used for
           the encode/translate case below). */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Worst case is "\U" + 8 hex digits per character; clamp the range so
       the accumulated size below stays within Py_ssize_t. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: write the escapes.  The two lowest hex digits are
       emitted by the shared tail after the if/else chain. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
936

937
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
938

939
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
940
{
941
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
942
        PyObject *restuple;
943
        PyObject *object;
944
        Py_ssize_t i;
945
        Py_ssize_t start;
946
        Py_ssize_t end;
947
        PyObject *res;
948
        Py_UCS1 *outp;
949
        Py_ssize_t ressize;
950
        int replsize;
951
        Py_UCS4 c;
952
        char buffer[256]; /* NAME_MAXLEN */
953
        if (PyUnicodeEncodeError_GetStart(exc, &start))
954
            return NULL;
955
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
956
            return NULL;
957
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
958
            return NULL;
959
        if (!ucnhash_capi) {
960
            /* load the unicode data module */
961
            ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
962
                                            PyUnicodeData_CAPSULE_NAME, 1);
963
            if (!ucnhash_capi) {
964
                return NULL;
965
            }
966
        }
967
        for (i = start, ressize = 0; i < end; ++i) {
968
            /* object is guaranteed to be "ready" */
969
            c = PyUnicode_READ_CHAR(object, i);
970
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
971
                replsize = 1+1+1+(int)strlen(buffer)+1;
972
            }
973
            else if (c >= 0x10000) {
974
                replsize = 1+1+8;
975
            }
976
            else if (c >= 0x100) {
977
                replsize = 1+1+4;
978
            }
979
            else
980
                replsize = 1+1+2;
981
            if (ressize > PY_SSIZE_T_MAX - replsize)
982
                break;
983
            ressize += replsize;
984
        }
985
        end = i;
986
        res = PyUnicode_New(ressize, 127);
987
        if (res==NULL)
988
            return NULL;
989
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
990
            i < end; ++i) {
991
            c = PyUnicode_READ_CHAR(object, i);
992
            *outp++ = '\\';
993
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
994
                *outp++ = 'N';
995
                *outp++ = '{';
996
                strcpy((char *)outp, buffer);
997
                outp += strlen(buffer);
998
                *outp++ = '}';
999
                continue;
1000
            }
1001
            if (c >= 0x00010000) {
1002
                *outp++ = 'U';
1003
                *outp++ = Py_hexdigits[(c>>28)&0xf];
1004
                *outp++ = Py_hexdigits[(c>>24)&0xf];
1005
                *outp++ = Py_hexdigits[(c>>20)&0xf];
1006
                *outp++ = Py_hexdigits[(c>>16)&0xf];
1007
                *outp++ = Py_hexdigits[(c>>12)&0xf];
1008
                *outp++ = Py_hexdigits[(c>>8)&0xf];
1009
            }
1010
            else if (c >= 0x100) {
1011
                *outp++ = 'u';
1012
                *outp++ = Py_hexdigits[(c>>12)&0xf];
1013
                *outp++ = Py_hexdigits[(c>>8)&0xf];
1014
            }
1015
            else
1016
                *outp++ = 'x';
1017
            *outp++ = Py_hexdigits[(c>>4)&0xf];
1018
            *outp++ = Py_hexdigits[c&0xf];
1019
        }
1020

1021
        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1022
        assert(_PyUnicode_CheckConsistency(res, 1));
1023
        restuple = Py_BuildValue("(Nn)", res, end);
1024
        Py_DECREF(object);
1025
        return restuple;
1026
    }
1027
    else {
1028
        wrong_exception_type(exc);
1029
        return NULL;
1030
    }
1031
}
1032

1033
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name onto one of the ENC_* identifiers.

   Recognised spellings are "utf" (any letter case), an optional '-' or
   '_', then "8", "16" or "32"; the 16/32 forms may be followed by an
   optional '-' or '_' and "be"/"le" (any letter case).  Without a byte
   order suffix the 16/32 forms resolve according to WORDS_BIGENDIAN.
   The exact string "CP_UTF8" is accepted as UTF-8 as well.

   On a match, *bytelength receives 3 for UTF-8, 2 for UTF-16 and 4 for
   UTF-32.  Note that *bytelength may also have been written when
   ENC_UNKNOWN is returned (a "16"/"32" prefix with an unknown suffix).

   `encoding` must be a NUL-terminated string. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: after skipping a separator it may be
               the terminating NUL (e.g. "utf-16-"), in which case looking
               at encoding[1] would read past the end of the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: never index beyond the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1098

1099
/* This handler is declared static until someone demonstrates
1100
   a need to call it directly. */
1101
static PyObject *
1102
PyCodec_SurrogatePassErrors(PyObject *exc)
1103
{
1104
    PyObject *restuple;
1105
    PyObject *object;
1106
    PyObject *encode;
1107
    const char *encoding;
1108
    int code;
1109
    int bytelength;
1110
    Py_ssize_t i;
1111
    Py_ssize_t start;
1112
    Py_ssize_t end;
1113
    PyObject *res;
1114

1115
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1116
        unsigned char *outp;
1117
        if (PyUnicodeEncodeError_GetStart(exc, &start))
1118
            return NULL;
1119
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1120
            return NULL;
1121
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1122
            return NULL;
1123
        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1124
            Py_DECREF(object);
1125
            return NULL;
1126
        }
1127
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1128
            Py_DECREF(object);
1129
            Py_DECREF(encode);
1130
            return NULL;
1131
        }
1132
        code = get_standard_encoding(encoding, &bytelength);
1133
        Py_DECREF(encode);
1134
        if (code == ENC_UNKNOWN) {
1135
            /* Not supported, fail with original exception */
1136
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1137
            Py_DECREF(object);
1138
            return NULL;
1139
        }
1140

1141
        if (end - start > PY_SSIZE_T_MAX / bytelength)
1142
            end = start + PY_SSIZE_T_MAX / bytelength;
1143
        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1144
        if (!res) {
1145
            Py_DECREF(object);
1146
            return NULL;
1147
        }
1148
        outp = (unsigned char*)PyBytes_AsString(res);
1149
        for (i = start; i < end; i++) {
1150
            /* object is guaranteed to be "ready" */
1151
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1152
            if (!Py_UNICODE_IS_SURROGATE(ch)) {
1153
                /* Not a surrogate, fail with original exception */
1154
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1155
                Py_DECREF(res);
1156
                Py_DECREF(object);
1157
                return NULL;
1158
            }
1159
            switch (code) {
1160
            case ENC_UTF8:
1161
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1162
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1163
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1164
                break;
1165
            case ENC_UTF16LE:
1166
                *outp++ = (unsigned char) ch;
1167
                *outp++ = (unsigned char)(ch >> 8);
1168
                break;
1169
            case ENC_UTF16BE:
1170
                *outp++ = (unsigned char)(ch >> 8);
1171
                *outp++ = (unsigned char) ch;
1172
                break;
1173
            case ENC_UTF32LE:
1174
                *outp++ = (unsigned char) ch;
1175
                *outp++ = (unsigned char)(ch >> 8);
1176
                *outp++ = (unsigned char)(ch >> 16);
1177
                *outp++ = (unsigned char)(ch >> 24);
1178
                break;
1179
            case ENC_UTF32BE:
1180
                *outp++ = (unsigned char)(ch >> 24);
1181
                *outp++ = (unsigned char)(ch >> 16);
1182
                *outp++ = (unsigned char)(ch >> 8);
1183
                *outp++ = (unsigned char) ch;
1184
                break;
1185
            }
1186
        }
1187
        restuple = Py_BuildValue("(On)", res, end);
1188
        Py_DECREF(res);
1189
        Py_DECREF(object);
1190
        return restuple;
1191
    }
1192
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1193
        const unsigned char *p;
1194
        Py_UCS4 ch = 0;
1195
        if (PyUnicodeDecodeError_GetStart(exc, &start))
1196
            return NULL;
1197
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1198
            return NULL;
1199
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1200
            return NULL;
1201
        p = (const unsigned char*)PyBytes_AS_STRING(object);
1202
        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1203
            Py_DECREF(object);
1204
            return NULL;
1205
        }
1206
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1207
            Py_DECREF(object);
1208
            Py_DECREF(encode);
1209
            return NULL;
1210
        }
1211
        code = get_standard_encoding(encoding, &bytelength);
1212
        Py_DECREF(encode);
1213
        if (code == ENC_UNKNOWN) {
1214
            /* Not supported, fail with original exception */
1215
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1216
            Py_DECREF(object);
1217
            return NULL;
1218
        }
1219

1220
        /* Try decoding a single surrogate character. If
1221
           there are more, let the codec call us again. */
1222
        p += start;
1223
        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1224
            switch (code) {
1225
            case ENC_UTF8:
1226
                if ((p[0] & 0xf0) == 0xe0 &&
1227
                    (p[1] & 0xc0) == 0x80 &&
1228
                    (p[2] & 0xc0) == 0x80) {
1229
                    /* it's a three-byte code */
1230
                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1231
                }
1232
                break;
1233
            case ENC_UTF16LE:
1234
                ch = p[1] << 8 | p[0];
1235
                break;
1236
            case ENC_UTF16BE:
1237
                ch = p[0] << 8 | p[1];
1238
                break;
1239
            case ENC_UTF32LE:
1240
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1241
                break;
1242
            case ENC_UTF32BE:
1243
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1244
                break;
1245
            }
1246
        }
1247

1248
        Py_DECREF(object);
1249
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1250
            /* it's not a surrogate - fail */
1251
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1252
            return NULL;
1253
        }
1254
        res = PyUnicode_FromOrdinal(ch);
1255
        if (res == NULL)
1256
            return NULL;
1257
        return Py_BuildValue("(Nn)", res, start + bytelength);
1258
    }
1259
    else {
1260
        wrong_exception_type(exc);
1261
        return NULL;
1262
    }
1263
}
1264

1265
/* Implements the 'surrogateescape' error handler.

   Encoding (UnicodeEncodeError): every character in object[start:end]
   must lie in U+DC80..U+DCFF; each is turned back into the single byte
   (character - 0xDC00) and a (bytes, end) tuple is returned.

   Decoding (UnicodeDecodeError): up to four consecutive bytes >= 128 in
   object[start:end] are mapped to 0xDC00 + byte and returned as a
   (str, start + number of bytes consumed) tuple.

   When the data does not fit (a character outside the range above, or an
   ASCII byte at `start`), the original exception `exc` is re-raised and
   NULL is returned.  For any other exception type
   wrong_exception_type() is called and NULL is returned. */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *object;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL) {
            return NULL;
        }

        /* One output byte per escaped character. */
        PyObject *raw = PyBytes_FromStringAndSize(NULL, end-start);
        if (raw == NULL) {
            Py_DECREF(object);
            return NULL;
        }

        char *dst = PyBytes_AsString(raw);
        for (Py_ssize_t pos = start; pos < end; pos++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, pos);
            if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(raw);
                Py_DECREF(object);
                return NULL;
            }
            *dst++ = ch - 0xdc00;
        }

        /* "O" takes its own reference, so ours is dropped afterwards. */
        PyObject *result = Py_BuildValue("(On)", raw, end);
        Py_DECREF(raw);
        Py_DECREF(object);
        return result;
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL) {
            return NULL;
        }

        const unsigned char *data =
            (const unsigned char*)PyBytes_AS_STRING(object);
        Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
        int count = 0;
        while (count < 4 && count < end-start) {
            unsigned char byte = data[start+count];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128) {
                break;
            }
            escaped[count] = 0xdc00 + byte;
            count++;
        }
        Py_DECREF(object);

        if (count == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }

        PyObject *text = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                                   escaped, count);
        if (text == NULL) {
            return NULL;
        }
        /* "N" hands our reference to `text` over to the tuple. */
        return Py_BuildValue("(Nn)", text, start+count);
    }

    wrong_exception_type(exc);
    return NULL;
}
1341

1342

1343
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_StrictErrors() and returns its result.  `self` is unused. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1347

1348

1349
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_IgnoreErrors() and returns its result.  `self` is unused. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1353

1354

1355
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_ReplaceErrors() and returns its result.  `self` is unused. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1359

1360

1361
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_XMLCharRefReplaceErrors() and returns its result.  `self` is
   unused. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1365

1366

1367
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_BackslashReplaceErrors() and returns its result.  `self` is
   unused. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1371

1372
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_NameReplaceErrors() and returns its result.  `self` is unused. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1376

1377
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_SurrogatePassErrors() and returns its result.  `self` is
   unused. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1381

1382
/* Trampoline with the (self, arg) calling shape: hands `exc` to
   PyCodec_SurrogateEscapeErrors() and returns its result.  `self` is
   unused. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1386

1387
static int _PyCodecRegistry_Init(void)
1388
{
1389
    static struct {
1390
        const char *name;
1391
        PyMethodDef def;
1392
    } methods[] =
1393
    {
1394
        {
1395
            "strict",
1396
            {
1397
                "strict_errors",
1398
                strict_errors,
1399
                METH_O,
1400
                PyDoc_STR("Implements the 'strict' error handling, which "
1401
                          "raises a UnicodeError on coding errors.")
1402
            }
1403
        },
1404
        {
1405
            "ignore",
1406
            {
1407
                "ignore_errors",
1408
                ignore_errors,
1409
                METH_O,
1410
                PyDoc_STR("Implements the 'ignore' error handling, which "
1411
                          "ignores malformed data and continues.")
1412
            }
1413
        },
1414
        {
1415
            "replace",
1416
            {
1417
                "replace_errors",
1418
                replace_errors,
1419
                METH_O,
1420
                PyDoc_STR("Implements the 'replace' error handling, which "
1421
                          "replaces malformed data with a replacement marker.")
1422
            }
1423
        },
1424
        {
1425
            "xmlcharrefreplace",
1426
            {
1427
                "xmlcharrefreplace_errors",
1428
                xmlcharrefreplace_errors,
1429
                METH_O,
1430
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1431
                          "which replaces an unencodable character with the "
1432
                          "appropriate XML character reference.")
1433
            }
1434
        },
1435
        {
1436
            "backslashreplace",
1437
            {
1438
                "backslashreplace_errors",
1439
                backslashreplace_errors,
1440
                METH_O,
1441
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1442
                          "which replaces malformed data with a backslashed "
1443
                          "escape sequence.")
1444
            }
1445
        },
1446
        {
1447
            "namereplace",
1448
            {
1449
                "namereplace_errors",
1450
                namereplace_errors,
1451
                METH_O,
1452
                PyDoc_STR("Implements the 'namereplace' error handling, "
1453
                          "which replaces an unencodable character with a "
1454
                          "\\N{...} escape sequence.")
1455
            }
1456
        },
1457
        {
1458
            "surrogatepass",
1459
            {
1460
                "surrogatepass",
1461
                surrogatepass_errors,
1462
                METH_O
1463
            }
1464
        },
1465
        {
1466
            "surrogateescape",
1467
            {
1468
                "surrogateescape",
1469
                surrogateescape_errors,
1470
                METH_O
1471
            }
1472
        }
1473
    };
1474

1475
    PyInterpreterState *interp = _PyInterpreterState_GET();
1476
    PyObject *mod;
1477

1478
    if (interp->codec_search_path != NULL)
1479
        return 0;
1480

1481
    interp->codec_search_path = PyList_New(0);
1482
    if (interp->codec_search_path == NULL) {
1483
        return -1;
1484
    }
1485

1486
    interp->codec_search_cache = PyDict_New();
1487
    if (interp->codec_search_cache == NULL) {
1488
        return -1;
1489
    }
1490

1491
    interp->codec_error_registry = PyDict_New();
1492
    if (interp->codec_error_registry == NULL) {
1493
        return -1;
1494
    }
1495

1496
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1497
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1498
        if (!func) {
1499
            return -1;
1500
        }
1501

1502
        int res = PyCodec_RegisterError(methods[i].name, func);
1503
        Py_DECREF(func);
1504
        if (res) {
1505
            return -1;
1506
        }
1507
    }
1508

1509
    mod = PyImport_ImportModule("encodings");
1510
    if (mod == NULL) {
1511
        return -1;
1512
    }
1513
    Py_DECREF(mod);
1514
    interp->codecs_initialized = 1;
1515
    return 0;
1516
}
1517

1518
Product

Resources

Company