# Extracted from the GitHub repository allendowney/cpython,
# path: blob/main/Lib/bz2.py
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

# Public API of the module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

# NOTE(review): the address inside the angle brackets looks like a
# placeholder left by the tool this text was extracted with; restore the
# real value from the upstream file if it is needed.
__author__ = "Nadeem Vawda <[email protected]>"
11
12
from builtins import open as _builtin_open
13
import io
14
import os
15
import _compression
16
17
from _bz2 import BZ2Compressor, BZ2Decompressor
18
19
20
# State codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3
24
25
26
class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read or write attribute.
        """
        # Start in the "closed" state so that close() is a safe no-op if
        # any of the validation below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "x", "xb", "a", "ab"):
            # Normalise to the binary spelling: "wb", "xb" or "ab".
            mode = mode[0] + "b"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it stays open after close().
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Reset the state even if flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED
                self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek()
        # always returns at least one byte (except at EOF), independent
        # of the value of n
        return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        self._check_can_read()
        return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not isinstance(size, int):
            # Accept any integer-like object that implements __index__.
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not isinstance(size, int):
            # Accept any integer-like object that implements __index__.
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.

        The return value is whatever BaseStream.writelines() returns;
        the io.IOBase implementation it inherits returns None, not a
        byte count.
        """
        return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        return self._pos
269
270
271
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel=compresslevel).
    In this case, the encoding, errors and newline arguments must not
    be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if
    encoding, errors or newline is given in binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes; the "t" is handled here.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        encoding = io.text_encoding(encoding)
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file
311
312
313
def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    Returns the compressor's output for data followed by its flushed
    remainder, joined into a single bytes object.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    # flush() emits whatever the compressor buffered and ends the stream.
    return comp.compress(data) + comp.flush()
322
323
324
def decompress(data):
    """Decompress a block of data.

    data may hold several bzip2 streams back to back; their
    decompressed contents are concatenated. Bytes following the last
    complete stream that do not form a valid stream are ignored.

    Raises OSError if the very first stream is invalid, and ValueError
    if a stream ends before its end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        # Each stream needs a fresh decompressor.
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Continue with whatever followed the stream just decoded.
        data = decomp.unused_data
    return b"".join(results)
345
346