|
25 | 25 | import struct
|
26 | 26 | import sys
|
27 | 27 | import time
|
28 |
| -import _compression # noqa: I201 # Not third-party |
29 | 28 |
|
30 | 29 | from . import zlib_ng
|
| 30 | +from .zlib_ng import _GzipReader |
31 | 31 |
|
32 | 32 | __all__ = ["GzipFile", "open", "compress", "decompress", "BadGzipFile",
|
33 | 33 | "READ_BUFFER_SIZE"]
|
|
36 | 36 | _COMPRESS_LEVEL_TRADEOFF = zlib_ng.Z_DEFAULT_COMPRESSION
|
37 | 37 | _COMPRESS_LEVEL_BEST = zlib_ng.Z_BEST_COMPRESSION
|
38 | 38 |
|
39 |
| -#: The amount of data that is read in at once when decompressing a file. |
40 |
| -#: Increasing this value may increase performance. |
41 |
| -#: 128K is also the size used by pigz and cat to read files from the |
42 |
| -# filesystem. |
43 |
| -READ_BUFFER_SIZE = 128 * 1024 |
| 39 | +# The amount of data that is read in at once when decompressing a file. |
| 40 | +# Increasing this value may increase performance. |
| 41 | +READ_BUFFER_SIZE = 512 * 1024 |
44 | 42 |
|
45 | 43 | FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
|
46 | 44 | READ, WRITE = 1, 2
|
47 | 45 |
|
48 |
| -try: |
49 |
| - BadGzipFile = gzip.BadGzipFile # type: ignore |
50 |
| -except AttributeError: # Versions lower than 3.8 do not have BadGzipFile |
51 |
| - BadGzipFile = OSError # type: ignore |
| 46 | +BadGzipFile = gzip.BadGzipFile # type: ignore |
52 | 47 |
|
53 | 48 |
|
54 | 49 | # The open method was copied from the CPython source with minor adjustments.
|
@@ -149,7 +144,7 @@ def __init__(self, filename=None, mode=None,
|
149 | 144 | zlib_ng.DEF_MEM_LEVEL,
|
150 | 145 | 0)
|
151 | 146 | if self.mode == READ:
|
152 |
| - raw = _GzipNGReader(self.fileobj) |
| 147 | + raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE) |
153 | 148 | self._buffer = io.BufferedReader(raw)
|
154 | 149 |
|
155 | 150 | def __repr__(self):
|
@@ -180,124 +175,9 @@ def write(self, data):
|
180 | 175 | return length
|
181 | 176 |
|
182 | 177 |
|
183 |
| -class _GzipNGReader(gzip._GzipReader): |
184 |
| - def __init__(self, fp): |
185 |
| - # Call the init method of gzip._GzipReader's parent here. |
186 |
| - # It is not very invasive and allows us to override _PaddedFile |
187 |
| - _compression.DecompressReader.__init__( |
188 |
| - self, gzip._PaddedFile(fp), zlib_ng._ZlibDecompressor, |
189 |
| - wbits=-zlib_ng.MAX_WBITS) |
190 |
| - # Set flag indicating start of a new member |
191 |
| - self._new_member = True |
192 |
| - self._last_mtime = None |
193 |
| - |
194 |
| - def read(self, size=-1): |
195 |
| - if size < 0: |
196 |
| - return self.readall() |
197 |
| - # size=0 is special because decompress(max_length=0) is not supported |
198 |
| - if not size: |
199 |
| - return b"" |
200 |
| - |
201 |
| - # For certain input data, a single |
202 |
| - # call to decompress() may not return |
203 |
| - # any data. In this case, retry until we get some data or reach EOF. |
204 |
| - while True: |
205 |
| - if self._decompressor.eof: |
206 |
| - # Ending case: we've come to the end of a member in the file, |
207 |
| - # so finish up this member, and read a new gzip header. |
208 |
| - # Check the CRC and file size, and set the flag so we read |
209 |
| - # a new member |
210 |
| - self._read_eof() |
211 |
| - self._new_member = True |
212 |
| - self._decompressor = self._decomp_factory( |
213 |
| - **self._decomp_args) |
214 |
| - |
215 |
| - if self._new_member: |
216 |
| - # If the _new_member flag is set, we have to |
217 |
| - # jump to the next member, if there is one. |
218 |
| - self._init_read() |
219 |
| - if not self._read_gzip_header(): |
220 |
| - self._size = self._pos |
221 |
| - return b"" |
222 |
| - self._new_member = False |
223 |
| - |
224 |
| - # Read a chunk of data from the file |
225 |
| - if self._decompressor.needs_input: |
226 |
| - buf = self._fp.read(READ_BUFFER_SIZE) |
227 |
| - uncompress = self._decompressor.decompress(buf, size) |
228 |
| - else: |
229 |
| - uncompress = self._decompressor.decompress(b"", size) |
230 |
| - if self._decompressor.unused_data != b"": |
231 |
| - # Prepend the already read bytes to the fileobj so they can |
232 |
| - # be seen by _read_eof() and _read_gzip_header() |
233 |
| - self._fp.prepend(self._decompressor.unused_data) |
234 |
| - |
235 |
| - if uncompress != b"": |
236 |
| - break |
237 |
| - if buf == b"": |
238 |
| - raise EOFError("Compressed file ended before the " |
239 |
| - "end-of-stream marker was reached") |
240 |
| - |
241 |
| - self._crc = zlib_ng.crc32(uncompress, self._crc) |
242 |
| - self._stream_size += len(uncompress) |
243 |
| - self._pos += len(uncompress) |
244 |
| - return uncompress |
245 |
| - |
246 |
| - |
247 | 178 | # Aliases for improved compatibility with CPython gzip module.
|
248 | 179 | GzipFile = GzipNGFile
|
249 |
| -_GzipReader = _GzipNGReader |
250 |
| - |
251 |
| - |
252 |
| -def _read_exact(fp, n): |
253 |
| - '''Read exactly *n* bytes from `fp` |
254 |
| - This method is required because fp may be unbuffered, |
255 |
| - i.e. return short reads. |
256 |
| - ''' |
257 |
| - data = fp.read(n) |
258 |
| - while len(data) < n: |
259 |
| - b = fp.read(n - len(data)) |
260 |
| - if not b: |
261 |
| - raise EOFError("Compressed file ended before the " |
262 |
| - "end-of-stream marker was reached") |
263 |
| - data += b |
264 |
| - return data |
265 |
| - |
266 |
| - |
267 |
| -def _read_gzip_header(fp): |
268 |
| - '''Read a gzip header from `fp` and progress to the end of the header. |
269 |
| - Returns last mtime if header was present or None otherwise. |
270 |
| - ''' |
271 |
| - magic = fp.read(2) |
272 |
| - if magic == b'': |
273 |
| - return None |
274 |
| - |
275 |
| - if magic != b'\037\213': |
276 |
| - raise BadGzipFile('Not a gzipped file (%r)' % magic) |
277 |
| - |
278 |
| - (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8)) |
279 |
| - if method != 8: |
280 |
| - raise BadGzipFile('Unknown compression method') |
281 |
| - |
282 |
| - if flag & FEXTRA: |
283 |
| - # Read & discard the extra field, if present |
284 |
| - extra_len, = struct.unpack("<H", _read_exact(fp, 2)) |
285 |
| - _read_exact(fp, extra_len) |
286 |
| - if flag & FNAME: |
287 |
| - # Read and discard a null-terminated string containing the filename |
288 |
| - while True: |
289 |
| - s = fp.read(1) |
290 |
| - if not s or s == b'\000': |
291 |
| - break |
292 |
| - if flag & FCOMMENT: |
293 |
| - # Read and discard a null-terminated string containing a comment |
294 |
| - while True: |
295 |
| - s = fp.read(1) |
296 |
| - if not s or s == b'\000': |
297 |
| - break |
298 |
| - if flag & FHCRC: |
299 |
| - _read_exact(fp, 2) # Read & discard the 16-bit header CRC |
300 |
| - return last_mtime |
| 180 | +_GzipNGReader = _GzipReader |
301 | 181 |
|
302 | 182 |
|
303 | 183 | def _create_simple_gzip_header(compresslevel: int,
|
@@ -342,25 +222,9 @@ def decompress(data):
|
342 | 222 | """Decompress a gzip compressed string in one shot.
|
343 | 223 | Return the decompressed string.
|
344 | 224 | """
|
345 |
| - decompressed_members = [] |
346 |
| - while True: |
347 |
| - fp = io.BytesIO(data) |
348 |
| - if _read_gzip_header(fp) is None: |
349 |
| - return b"".join(decompressed_members) |
350 |
| - # Use a zlib raw deflate compressor |
351 |
| - do = zlib_ng.decompressobj(wbits=-zlib_ng.MAX_WBITS) |
352 |
| - # Read all the data except the header |
353 |
| - decompressed = do.decompress(data[fp.tell():]) |
354 |
| - if not do.eof or len(do.unused_data) < 8: |
355 |
| - raise EOFError("Compressed file ended before the end-of-stream " |
356 |
| - "marker was reached") |
357 |
| - crc, length = struct.unpack("<II", do.unused_data[:8]) |
358 |
| - if crc != zlib_ng.crc32(decompressed): |
359 |
| - raise BadGzipFile("CRC check failed") |
360 |
| - if length != (len(decompressed) & 0xffffffff): |
361 |
| - raise BadGzipFile("Incorrect length of data produced") |
362 |
| - decompressed_members.append(decompressed) |
363 |
| - data = do.unused_data[8:].lstrip(b"\x00") |
| 225 | + fp = io.BytesIO(data) |
| 226 | + reader = _GzipReader(fp, max(len(data), 16)) |
| 227 | + return reader.readall() |
364 | 228 |
|
365 | 229 |
|
366 | 230 | def _argument_parser():
|
@@ -431,6 +295,7 @@ def main():
|
431 | 295 | if yes_or_no not in {"y", "Y", "yes"}:
|
432 | 296 | sys.exit("not overwritten")
|
433 | 297 |
|
| 298 | + out_buffer = None |
434 | 299 | if args.compress:
|
435 | 300 | if args.file is None:
|
436 | 301 | in_file = sys.stdin.buffer
|
@@ -470,6 +335,8 @@ def main():
|
470 | 335 | in_file.close()
|
471 | 336 | if out_file is not sys.stdout.buffer:
|
472 | 337 | out_file.close()
|
| 338 | + if out_buffer is not None and out_buffer is not sys.stdout.buffer: |
| 339 | + out_buffer.close() |
473 | 340 |
|
474 | 341 |
|
475 | 342 | if __name__ == "__main__": # pragma: no cover
|
|
0 commit comments