Skip to content

Commit 7b5b779

Browse files
authored
Merge pull request #29 from pycompression/release_0.4.0
Release 0.4.0
2 parents 3df49da + d581195 commit 7b5b779

15 files changed

+1806
-208
lines changed

CHANGELOG.rst

+12
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,18 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 0.4.0
11+
-----------------
12+
+ Add a ``gzip_ng_threaded`` module that contains the ``gzip_ng_threaded.open``
13+
function. This allows using multithreaded compression as well as escaping the
14+
GIL.
15+
+ The internal ``gzip_ng._GzipReader`` has been rewritten in C. As a result the
16+
overhead of decompressing files has been significantly reduced.
17+
+ The ``gzip_ng._GzipReader`` in C is now used in ``gzip_ng.decompress``. The
18+
``_GzipReader`` can also read from objects that support the buffer protocol.
19+
This has reduced overhead significantly.
20+
+ Fix some unclosed buffer errors in the gzip_ng CLI.
21+
1022
version 0.3.0
1123
-----------------
1224
+ Source distributions on Linux now default to building with configure and

README.rst

+7-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ by providing Python bindings for the zlib-ng library.
4242
This package provides Python bindings for the `zlib-ng
4343
<https://github.com/zlib-ng/zlib-ng>`_ library.
4444

45-
``python-zlib-ng`` provides the bindings by offering two modules:
45+
``python-zlib-ng`` provides the bindings by offering three modules:
4646

4747
+ ``zlib_ng``: A drop-in replacement for the zlib module that uses zlib-ng to
4848
accelerate its performance.
@@ -51,6 +51,11 @@ This package provides Python bindings for the `zlib-ng
5151
instead of ``zlib`` to perform its compression and checksum tasks, which
5252
improves performance.
5353

54+
+ ``gzip_ng_threaded`` offers an ``open`` function which returns buffered read
55+
or write streams that can be used to read and write large files while
56+
escaping the GIL using one or multiple threads. This functionality only
57+
works for streaming; seeking is not supported.
58+
5459
``zlib_ng`` and ``gzip_ng`` are almost fully compatible with ``zlib`` and
5560
``gzip`` from the Python standard library. There are some minor differences;
5661
see: differences-with-zlib-and-gzip-modules_.
@@ -68,6 +73,7 @@ The python-zlib-ng modules can be imported as follows
6873
6974
from zlib_ng import zlib_ng
7075
from zlib_ng import gzip_ng
76+
from zlib_ng import gzip_ng_threaded
7177
7278
``zlib_ng`` and ``gzip_ng`` are meant to be used as drop-in replacements, so
7379
their API and functions are the same as the stdlib's modules.

docs/index.rst

+7
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,13 @@ API-documentation: zlib_ng.gzip_ng
113113
:members:
114114
:special-members: __init__
115115

116+
===========================================
117+
API-documentation: zlib_ng.gzip_ng_threaded
118+
===========================================
119+
120+
.. automodule:: zlib_ng.gzip_ng_threaded
121+
:members: open
122+
116123
===============================
117124
python -m zlib_ng.gzip_ng usage
118125
===============================

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def build_zlib_ng():
123123

124124
setup(
125125
name="zlib-ng",
126-
version="0.3.0",
126+
version="0.4.0",
127127
description="Drop-in replacement for zlib and gzip modules using zlib-ng",
128128
author="Leiden University Medical Center",
129129
author_email="[email protected]", # A placeholder for now

src/zlib_ng/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55
# This file is part of python-zlib-ng which is distributed under the
66
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.
77

8-
__version__ = "0.3.0"
8+
__version__ = "0.4.0"

src/zlib_ng/gzip_ng.py

+13-146
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@
2525
import struct
2626
import sys
2727
import time
28-
import _compression # noqa: I201 # Not third-party
2928

3029
from . import zlib_ng
30+
from .zlib_ng import _GzipReader
3131

3232
__all__ = ["GzipFile", "open", "compress", "decompress", "BadGzipFile",
3333
"READ_BUFFER_SIZE"]
@@ -36,19 +36,14 @@
3636
_COMPRESS_LEVEL_TRADEOFF = zlib_ng.Z_DEFAULT_COMPRESSION
3737
_COMPRESS_LEVEL_BEST = zlib_ng.Z_BEST_COMPRESSION
3838

39-
#: The amount of data that is read in at once when decompressing a file.
40-
#: Increasing this value may increase performance.
41-
#: 128K is also the size used by pigz and cat to read files from the
42-
# filesystem.
43-
READ_BUFFER_SIZE = 128 * 1024
39+
# The amount of data that is read in at once when decompressing a file.
40+
# Increasing this value may increase performance.
41+
READ_BUFFER_SIZE = 512 * 1024
4442

4543
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
4644
READ, WRITE = 1, 2
4745

48-
try:
49-
BadGzipFile = gzip.BadGzipFile # type: ignore
50-
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
51-
BadGzipFile = OSError # type: ignore
46+
BadGzipFile = gzip.BadGzipFile # type: ignore
5247

5348

5449
# The open method was copied from the CPython source with minor adjustments.
@@ -149,7 +144,7 @@ def __init__(self, filename=None, mode=None,
149144
zlib_ng.DEF_MEM_LEVEL,
150145
0)
151146
if self.mode == READ:
152-
raw = _GzipNGReader(self.fileobj)
147+
raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE)
153148
self._buffer = io.BufferedReader(raw)
154149

155150
def __repr__(self):
@@ -180,124 +175,9 @@ def write(self, data):
180175
return length
181176

182177

183-
class _GzipNGReader(gzip._GzipReader):
184-
def __init__(self, fp):
185-
# Call the init method of gzip._GzipReader's parent here.
186-
# It is not very invasive and allows us to override _PaddedFile
187-
_compression.DecompressReader.__init__(
188-
self, gzip._PaddedFile(fp), zlib_ng._ZlibDecompressor,
189-
wbits=-zlib_ng.MAX_WBITS)
190-
# Set flag indicating start of a new member
191-
self._new_member = True
192-
self._last_mtime = None
193-
194-
def read(self, size=-1):
195-
if size < 0:
196-
return self.readall()
197-
# size=0 is special because decompress(max_length=0) is not supported
198-
if not size:
199-
return b""
200-
201-
# For certain input data, a single
202-
# call to decompress() may not return
203-
# any data. In this case, retry until we get some data or reach EOF.
204-
while True:
205-
if self._decompressor.eof:
206-
# Ending case: we've come to the end of a member in the file,
207-
# so finish up this member, and read a new gzip header.
208-
# Check the CRC and file size, and set the flag so we read
209-
# a new member
210-
self._read_eof()
211-
self._new_member = True
212-
self._decompressor = self._decomp_factory(
213-
**self._decomp_args)
214-
215-
if self._new_member:
216-
# If the _new_member flag is set, we have to
217-
# jump to the next member, if there is one.
218-
self._init_read()
219-
if not self._read_gzip_header():
220-
self._size = self._pos
221-
return b""
222-
self._new_member = False
223-
224-
# Read a chunk of data from the file
225-
if self._decompressor.needs_input:
226-
buf = self._fp.read(READ_BUFFER_SIZE)
227-
uncompress = self._decompressor.decompress(buf, size)
228-
else:
229-
uncompress = self._decompressor.decompress(b"", size)
230-
if self._decompressor.unused_data != b"":
231-
# Prepend the already read bytes to the fileobj so they can
232-
# be seen by _read_eof() and _read_gzip_header()
233-
self._fp.prepend(self._decompressor.unused_data)
234-
235-
if uncompress != b"":
236-
break
237-
if buf == b"":
238-
raise EOFError("Compressed file ended before the "
239-
"end-of-stream marker was reached")
240-
241-
self._crc = zlib_ng.crc32(uncompress, self._crc)
242-
self._stream_size += len(uncompress)
243-
self._pos += len(uncompress)
244-
return uncompress
245-
246-
247178
# Aliases for improved compatibility with CPython gzip module.
248179
GzipFile = GzipNGFile
249-
_GzipReader = _GzipNGReader
250-
251-
252-
def _read_exact(fp, n):
253-
'''Read exactly *n* bytes from `fp`
254-
This method is required because fp may be unbuffered,
255-
i.e. return short reads.
256-
'''
257-
data = fp.read(n)
258-
while len(data) < n:
259-
b = fp.read(n - len(data))
260-
if not b:
261-
raise EOFError("Compressed file ended before the "
262-
"end-of-stream marker was reached")
263-
data += b
264-
return data
265-
266-
267-
def _read_gzip_header(fp):
268-
'''Read a gzip header from `fp` and progress to the end of the header.
269-
Returns last mtime if header was present or None otherwise.
270-
'''
271-
magic = fp.read(2)
272-
if magic == b'':
273-
return None
274-
275-
if magic != b'\037\213':
276-
raise BadGzipFile('Not a gzipped file (%r)' % magic)
277-
278-
(method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
279-
if method != 8:
280-
raise BadGzipFile('Unknown compression method')
281-
282-
if flag & FEXTRA:
283-
# Read & discard the extra field, if present
284-
extra_len, = struct.unpack("<H", _read_exact(fp, 2))
285-
_read_exact(fp, extra_len)
286-
if flag & FNAME:
287-
# Read and discard a null-terminated string containing the filename
288-
while True:
289-
s = fp.read(1)
290-
if not s or s == b'\000':
291-
break
292-
if flag & FCOMMENT:
293-
# Read and discard a null-terminated string containing a comment
294-
while True:
295-
s = fp.read(1)
296-
if not s or s == b'\000':
297-
break
298-
if flag & FHCRC:
299-
_read_exact(fp, 2) # Read & discard the 16-bit header CRC
300-
return last_mtime
180+
_GzipNGReader = _GzipReader
301181

302182

303183
def _create_simple_gzip_header(compresslevel: int,
@@ -342,25 +222,9 @@ def decompress(data):
342222
"""Decompress a gzip compressed string in one shot.
343223
Return the decompressed string.
344224
"""
345-
decompressed_members = []
346-
while True:
347-
fp = io.BytesIO(data)
348-
if _read_gzip_header(fp) is None:
349-
return b"".join(decompressed_members)
350-
# Use a zlib raw deflate compressor
351-
do = zlib_ng.decompressobj(wbits=-zlib_ng.MAX_WBITS)
352-
# Read all the data except the header
353-
decompressed = do.decompress(data[fp.tell():])
354-
if not do.eof or len(do.unused_data) < 8:
355-
raise EOFError("Compressed file ended before the end-of-stream "
356-
"marker was reached")
357-
crc, length = struct.unpack("<II", do.unused_data[:8])
358-
if crc != zlib_ng.crc32(decompressed):
359-
raise BadGzipFile("CRC check failed")
360-
if length != (len(decompressed) & 0xffffffff):
361-
raise BadGzipFile("Incorrect length of data produced")
362-
decompressed_members.append(decompressed)
363-
data = do.unused_data[8:].lstrip(b"\x00")
225+
fp = io.BytesIO(data)
226+
reader = _GzipReader(fp, max(len(data), 16))
227+
return reader.readall()
364228

365229

366230
def _argument_parser():
@@ -431,6 +295,7 @@ def main():
431295
if yes_or_no not in {"y", "Y", "yes"}:
432296
sys.exit("not overwritten")
433297

298+
out_buffer = None
434299
if args.compress:
435300
if args.file is None:
436301
in_file = sys.stdin.buffer
@@ -470,6 +335,8 @@ def main():
470335
in_file.close()
471336
if out_file is not sys.stdout.buffer:
472337
out_file.close()
338+
if out_buffer is not None and out_buffer is not sys.stdout.buffer:
339+
out_buffer.close()
473340

474341

475342
if __name__ == "__main__": # pragma: no cover

0 commit comments

Comments
 (0)