Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Lib/codecs.py
12 views
1
""" codecs -- Python Codec Registry, API and helpers.
2
3
4
Written by Marc-Andre Lemburg ([email protected]).
5
6
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8
"""
9
10
import builtins
11
import sys
12
13
### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    # The C accelerator module provides the registry itself (register,
    # lookup, encode, decode, ...); without it this module is unusable,
    # so fail hard with a SystemError rather than limping along.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
20
# Names re-exported by "from codecs import *"; everything else in the
# module is considered implementation detail.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
35
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
# (Note: the BOM32_* aliases actually point at the UTF-16 marks and the
# BOM64_* aliases at the UTF-32 marks — kept only for compatibility.)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API)
82
83
class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # The tuple payload keeps backward compatibility with the old
        # 4-tuple returned by codecs.lookup().
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        cls = self.__class__
        return (f"<{cls.__module__}.{cls.__qualname__} "
                f"object for encoding {self.name} at {id(self):#x}>")
114
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences.
         'namereplace' - Replace with \\N{...} escape sequences
                         (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        # Concrete codec implementations must override this method.
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        # Concrete codec implementations must override this method.
        raise NotImplementedError
180
class IncrementalEncoder(object):
    """
    Base class for encoders that work in multiple steps.  Input may be
    fed piece by piece to encode(); the encoder remembers whatever
    state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
220
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that must retain some of the
    unencoded input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Unencoded input carried over from the previous encode() call.
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses implement the actual encoding here and return an
        # (output object, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        output, consumed = self._buffer_encode(pending, self.errors, final)
        # Anything not consumed is kept for the next call.
        self.buffer = pending[consumed:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        self.buffer = state if state else ""
254
class IncrementalDecoder(object):
    """
    Base class for decoders that work in multiple steps.  Input may be
    fed piece by piece to decode(); the decoder remembers whatever
    state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder as a
        (buffered_input, additional_state_info) tuple.

        buffered_input must be a bytes object containing bytes that
        were passed to decode() but have not yet been converted;
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input.  In
        the initial state and after reset(), the result is (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(); setstate((b"", 0))
        must behave exactly like reset().
        """
303
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that must be able to handle
    incomplete byte sequences by buffering them between decode() calls.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Undecoded bytes carried over from the previous decode() call.
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Subclasses implement the actual decoding here and return an
        # (output object, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        output, consumed = self._buffer_decode(pending, self.errors, final)
        # Anything not consumed is kept for the next call.
        self.buffer = pending[consumed:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The additional state info is always 0 for buffered decoders.
        return (self.buffer, 0)

    def setstate(self, state):
        # The additional state info is ignored; only the buffered
        # bytes are restored.
        self.buffer = state[0]
339
#
340
# The StreamWriter and StreamReader class provide generic working
341
# interfaces which can be used to implement new encoding submodules
342
# very easily. See encodings/utf_8.py for an example on how this is
343
# done.
344
#
345
346
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The errors keyword argument selects the error handling
            scheme; the predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences.
             'namereplace' - Replace with \\N{...} escape sequences.

            Additional values may be registered via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Writes the object's contents encoded to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):
        """ Writes the concatenated iterable of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Resets the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Rewinding to the very start also discards codec state.
        if offset == 0 and whence == 0:
            self.reset()

    def __getattr__(self, name, getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
417
###
418
419
class StreamReader(Codec):

    # Type used for the decoded character buffer; subclasses may
    # override it (it must support +, join and slicing like str).
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Raw bytes read from the stream but not yet decoded.
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # Decoded characters not yet handed to the caller.
        self.charbuffer = self._empty_charbuffer
        # When readline() splits ahead, the surplus lines are cached
        # here; charbuffer is None while linebuffer is active.
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be overridden by the codec: decode the bytes in input
        # and return an (output object, length consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Decode only up to the error; if that still yields at
                    # least one full line, return it and defer the error.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # Grow the read size so long lines don't take O(n^2) reads.
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
666
###
667
668
class StreamReaderWriter:
    """ Wraps a stream so that it can be used in both read and write
        modes.

        The Reader and Writer factory functions returned by
        codecs.lookup() supply the decoding and encoding halves.
    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # The reader must always discard its buffers after a seek; the
        # writer only when rewinding to the very beginning.
        self.reader.reset()
        if offset == 0 and whence == 0:
            self.writer.reset()

    def __getattr__(self, name, getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
753
###
754
755
class StreamRecoder:
    """ Translates data from one encoding to another as it passes
        through.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the
        provided Writer class.

        In the other direction, data is read from the underlying stream
        using a Reader instance and then encoded and returned to the
        caller.
    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and
            Writer work on the backend (the data in stream).

            You can use these objects to do transparent transcodings
            from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object; encode and decode must
            adhere to the Codec interface; Reader and Writer must be
            factory functions or classes providing the StreamReader and
            StreamWriter interfaces respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        raw = self.reader.read(size)
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded

    def readline(self, size=None):
        if size is None:
            raw = self.reader.readline()
        else:
            raw = self.reader.readline(size)
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded

    def readlines(self, sizehint=None):
        raw = self.reader.read()
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded.splitlines(keepends=True)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        raw = next(self.reader)
        recoded, _consumed = self.encode(raw, self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        joined = b''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name, getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
869
### Shortcuts
870
871
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):
    """ Open an encoded file using the given mode and return a wrapped
        version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, the underlying encoded file is always
        opened in binary mode.  The default file mode is 'r' (read).

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API;
        -1 selects the default buffer size.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.
    """
    if encoding is not None and 'b' not in mode:
        # Encoded files must always be accessed in binary mode.
        mode = mode + 'b'
    stream = builtins.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Don't leak the file handle if codec lookup/wrapping fails;
        # re-raise whatever went wrong (including BaseExceptions).
        stream.close()
        raise
920
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according to the
        given data_encoding and then encoded to the underlying file
        using file_encoding. The intermediate data type will usually be
        Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        then passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    recoder = StreamRecoder(file, data_info.encode, data_info.decode,
                            file_info.streamreader, file_info.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
956
### Helpers for codec lookup
957
958
def getencoder(encoding):
    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.encode
968
def getdecoder(encoding):
    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.decode
978
def getincrementalencoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.
    """
    factory = lookup(encoding).incrementalencoder
    if factory is None:
        # The codec exists but offers no incremental encoder.
        raise LookupError(encoding)
    return factory
992
def getincrementaldecoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.
    """
    factory = lookup(encoding).incrementaldecoder
    if factory is None:
        # The codec exists but offers no incremental decoder.
        raise LookupError(encoding)
    return factory
1006
def getreader(encoding):
    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.streamreader
1016
def getwriter(encoding):
    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.streamwriter
1026
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Yields the incrementally encoded form of each string produced by
    the iterator, finishing with whatever the encoder flushes at the
    end.  errors and kwargs are passed through to the
    IncrementalEncoder constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        piece = encoder.encode(chunk)
        if piece:
            yield piece
    # Flush any state held back by the encoder.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1044
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Yields the incrementally decoded form of each bytes chunk produced
    by the iterator, finishing with whatever the decoder flushes at the
    end.  errors and kwargs are passed through to the
    IncrementalDecoder constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        piece = decoder.decode(chunk)
        if piece:
            yield piece
    # Flush any partial sequence held back by the decoder.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
1062
### Helpers for charmap-based codecs
1063
1064
def make_identity_dict(rng):
    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.
    """
    mapping = {}
    for element in rng:
        mapping[element] = element
    return mapping
1074
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        # A target seen more than once is ambiguous: map it to None so
        # the charmap codec raises instead of picking one arbitrarily.
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
1095
### error handlers
1096
1097
try:
    # Cache the standard error handlers as module-level callables so
    # they can be passed around directly.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None
1113
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed at runtime; the import only makes the dependency
    # visible to static analysis tools such as modulefinder/freeze.
    import encodings