Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Lib/codecs.py
12 views
1
""" codecs -- Python Codec Registry, API and helpers.
2
3
4
Written by Marc-Andre Lemburg ([email protected]).
5
6
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8
"""
9
10
import builtins
11
import sys
12
13
### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    # The C accelerator module provides the registry itself (register,
    # lookup, encode, decode, ...); without it this module is unusable,
    # so fail hard with a SystemError rather than limping along.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
20
# Names re-exported by "from codecs import *"; everything else in the
# module is considered implementation detail.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
35
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
# (Note: the BOM32_* aliases actually point at the UTF-16 marks and the
# BOM64_* aliases at the UTF-32 marks — kept only for compatibility.)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API)
82
83
class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # The tuple payload keeps backward compatibility with the old
        # 4-tuple returned by codecs.lookup().
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        cls = self.__class__
        return (f"<{cls.__module__}.{cls.__qualname__} "
                f"object for encoding {self.name} at {id(self):#x}>")
114
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences.
         'namereplace' - Replace with \\N{...} escape sequences
                         (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        # Concrete codec implementations must override this method.
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        # Concrete codec implementations must override this method.
        raise NotImplementedError
180
class IncrementalEncoder(object):
    """
    Base class for encoders that work in multiple steps.  Input may be
    fed piece by piece to encode(); the encoder remembers whatever
    state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
220
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that must retain some of the
    unencoded input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Unencoded input carried over from the previous encode() call.
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses implement the actual encoding here and return an
        # (output object, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        output, consumed = self._buffer_encode(pending, self.errors, final)
        # Anything not consumed is kept for the next call.
        self.buffer = pending[consumed:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        self.buffer = state if state else ""
254
class IncrementalDecoder(object):
    """
    Base class for decoders that work in multiple steps.  Input may be
    fed piece by piece to decode(); the decoder remembers whatever
    state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder as a
        (buffered_input, additional_state_info) tuple.

        buffered_input must be a bytes object containing bytes that
        were passed to decode() but have not yet been converted;
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input.  In
        the initial state and after reset(), the result is (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(); setstate((b"", 0))
        must behave exactly like reset().
        """
303
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that must be able to handle
    incomplete byte sequences by buffering them between decode() calls.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Undecoded bytes carried over from the previous decode() call.
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Subclasses implement the actual decoding here and return an
        # (output object, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        output, consumed = self._buffer_decode(pending, self.errors, final)
        # Anything not consumed is kept for the next call.
        self.buffer = pending[consumed:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The additional state info is always 0 for buffered decoders.
        return (self.buffer, 0)

    def setstate(self, state):
        # The additional state info is ignored; only the buffered
        # bytes are restored.
        self.buffer = state[0]
339
#
340
# The StreamWriter and StreamReader class provide generic working
341
# interfaces which can be used to implement new encoding submodules
342
# very easily. See encodings/utf_8.py for an example on how this is
343
# done.
344
#
345
346
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The errors keyword argument selects the error handling
            scheme; the predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences.
             'namereplace' - Replace with \\N{...} escape sequences.

            Additional values may be registered via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Writes the object's contents encoded to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):
        """ Writes the concatenated iterable of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Resets the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Rewinding to the very start also discards codec state.
        if offset == 0 and whence == 0:
            self.reset()

    def __getattr__(self, name, getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
417
###
418
419
class StreamReader(Codec):

    # Type used for the decoded character buffer; subclasses may
    # override it (it must support +, join and slicing like str).
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Raw bytes read from the stream but not yet decoded.
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # Decoded characters not yet handed to the caller.
        self.charbuffer = self._empty_charbuffer
        # When readline() splits ahead, the surplus lines are cached
        # here; charbuffer is None while linebuffer is active.
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be overridden by the codec: decode the bytes in input
        # and return an (output object, length consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Decode only up to the error; if that still yields at
                    # least one full line, return it and defer the error.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # Grow the read size so long lines don't take O(n^2) reads.
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
666
###
667
668
class StreamReaderWriter:
    """ Wraps a stream so that it can be used in both read and write
        modes.

        The Reader and Writer factory functions returned by
        codecs.lookup() supply the decoding and encoding halves.
    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # The reader must always discard its buffers after a seek; the
        # writer only when rewinding to the very beginning.
        self.reader.reset()
        if offset == 0 and whence == 0:
            self.writer.reset()

    def __getattr__(self, name, getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
753
###
754
755
class StreamRecoder:
    """ Translates data from one encoding to another as it passes
        through.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the
        provided Writer class.

        In the other direction, data is read from the underlying stream
        using a Reader instance and then encoded and returned to the
        caller.
    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and
            Writer work on the backend (the data in stream).

            You can use these objects to do transparent transcodings
            from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object; encode and decode must
            adhere to the Codec interface; Reader and Writer must be
            factory functions or classes providing the StreamReader and
            StreamWriter interfaces respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        raw = self.reader.read(size)
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded

    def readline(self, size=None):
        if size is None:
            raw = self.reader.readline()
        else:
            raw = self.reader.readline(size)
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded

    def readlines(self, sizehint=None):
        raw = self.reader.read()
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded.splitlines(keepends=True)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        raw = next(self.reader)
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        joined = b''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name, getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
869
### Shortcuts
870
871
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):
    """ Open an encoded file using the given mode and return a wrapped
        version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, the underlying encoded file is always
        opened in binary mode.  The default file mode is 'r' (read).

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API;
        -1 selects the default buffer size.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.
    """
    if encoding is not None and 'b' not in mode:
        # Encoded files must always be accessed in binary mode.
        mode = mode + 'b'
    stream = builtins.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Don't leak the file handle if codec lookup/wrapping fails;
        # re-raise whatever went wrong (including BaseExceptions).
        stream.close()
        raise
920
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according to the
        given data_encoding and then encoded to the underlying file
        using file_encoding. The intermediate data type will usually be
        Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        then passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    recoder = StreamRecoder(file, data_info.encode, data_info.decode,
                            file_info.streamreader, file_info.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
956
### Helpers for codec lookup
957
958
def getencoder(encoding):
    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.encode
968
def getdecoder(encoding):
    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.decode
978
def getincrementalencoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.
    """
    factory = lookup(encoding).incrementalencoder
    if factory is None:
        # The codec exists but offers no incremental encoder.
        raise LookupError(encoding)
    return factory
992
def getincrementaldecoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.
    """
    factory = lookup(encoding).incrementaldecoder
    if factory is None:
        # The codec exists but offers no incremental decoder.
        raise LookupError(encoding)
    return factory
1006
def getreader(encoding):
    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.streamreader
1016
def getwriter(encoding):
    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.streamwriter
1026
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Yields the incrementally encoded form of each string produced by
    the iterator, finishing with whatever the encoder flushes at the
    end.  errors and kwargs are passed through to the
    IncrementalEncoder constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        piece = encoder.encode(chunk)
        if piece:
            yield piece
    # Flush any state held back by the encoder.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1044
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Yields the incrementally decoded form of each bytes chunk produced
    by the iterator, finishing with whatever the decoder flushes at the
    end.  errors and kwargs are passed through to the
    IncrementalDecoder constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        piece = decoder.decode(chunk)
        if piece:
            yield piece
    # Flush any partial sequence held back by the decoder.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
1062
### Helpers for charmap-based codecs
1063
1064
def make_identity_dict(rng):
    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.
    """
    mapping = {}
    for element in rng:
        mapping[element] = element
    return mapping
1074
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        # A target seen more than once is ambiguous: map it to None so
        # the charmap codec raises instead of picking one arbitrarily.
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
1095
### error handlers
1096
1097
try:
    # Cache the standard error handlers as module-level callables so
    # they can be passed around directly.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None
1113
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed at runtime; the import only makes the dependency
    # visible to static analysis tools such as modulefinder/freeze.
    import encodings