Skip to content

Commit

Permalink
BUG: Add RunLengthDecode filter (#2012)
Browse files Browse the repository at this point in the history
Closes #2011
  • Loading branch information
pubpub-zz authored Jul 29, 2023
1 parent 6b70364 commit 3d7206c
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 1 deletion.
63 changes: 62 additions & 1 deletion pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,65 @@ def decode(
return retval


class RunLengthDecode:
    """
    The RunLengthDecode filter decodes data that has been encoded in a
    simple byte-oriented format based on run length.

    The encoded data is a sequence of runs, where each run consists of
    a length byte followed by 1 to 128 bytes of data. If the length byte is
    in the range 0 to 127,
    the following length + 1 (1 to 128) bytes are copied literally during
    decompression.
    If length is in the range 129 to 255, the following single byte is to be
    copied 257 − length (2 to 128) times during decompression. A length value
    of 128 denotes EOD.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode a run length encoded data stream.

        Args:
          data: a bytes sequence of length/data
          decode_parms: ignored.

        Returns:
          A bytes decompressed sequence.

        Raises:
          PdfStreamError: if the data ends before the EOD marker (128) is
            read, if a repeat run has no byte left to repeat, or if bytes
            remain after the EOD marker.
        """
        if "decodeParms" in kwargs:  # deprecated
            deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
            decode_parms = kwargs["decodeParms"]  # noqa: F841
        chunks = []
        index = 0
        data_length = len(data)
        while True:
            if index >= data_length:
                # Ran off the end without ever seeing the EOD marker.
                raise PdfStreamError("Unexpected EOD in RunLengthDecode")
            length = data[index]
            index += 1
            if length == 128:
                # EOD marker: it must be the very last byte of the stream.
                if index < data_length:
                    raise PdfStreamError("early EOD in RunLengthDecode")
                break
            if length < 128:
                # Literal run: copy the next length + 1 bytes verbatim.
                # A truncated run yields a short slice here; the missing
                # EOD marker is then reported on the next iteration.
                length += 1
                chunks.append(data[index : index + length])
                index += length
            else:  # > 128
                # Repeat run: the next single byte is copied 257 - length
                # times. Without this check a truncated stream would
                # surface as a bare IndexError instead of PdfStreamError.
                if index >= data_length:
                    raise PdfStreamError("Unexpected EOD in RunLengthDecode")
                chunks.append(data[index : index + 1] * (257 - length))
                index += 1
        return b"".join(chunks)


class LZWDecode:
"""
Taken from:
Expand Down Expand Up @@ -582,7 +641,7 @@ def decode_stream_data(stream: Any) -> Union[str, bytes]: # utils.StreamObject
This function decodes the stream data using the filters provided in the
stream. It supports various filter types, including FlateDecode,
ASCIIHexDecode, LZWDecode, ASCII85Decode, DCTDecode, JPXDecode, and
ASCIIHexDecode, RunLengthDecode, LZWDecode, ASCII85Decode, DCTDecode, JPXDecode, and
CCITTFaxDecode.
Args:
Expand Down Expand Up @@ -613,6 +672,8 @@ def decode_stream_data(stream: Any) -> Union[str, bytes]: # utils.StreamObject
data = FlateDecode.decode(data, params)
elif filter_type in (FT.ASCII_HEX_DECODE, FTA.AHx):
data = ASCIIHexDecode.decode(data) # type: ignore
elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL):
data = RunLengthDecode.decode(data)
elif filter_type in (FT.LZW_DECODE, FTA.LZW):
data = LZWDecode.decode(data, params) # type: ignore
elif filter_type in (FT.ASCII_85_DECODE, FTA.A85):
Expand Down
28 changes: 28 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,34 @@ def test_gray_devicen_cmyk():
assert d < 0.001


@pytest.mark.enable_socket()
def test_runlengthdecode():
"""
Check RunLengthDecode image extraction and its error reporting.

From #1954, test with 2bits image. TODO: 4bits also
"""
# Fetch the sample PDF and the reference PNG it is compared against.
url = "https://github.com/py-pdf/pypdf/files/12159941/out.pdf"
name = "RunLengthDecode.pdf"
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
url_png = "https://user-images.githubusercontent.com/4083478/255940800-6d63972e-a3d6-4cf9-aa6f-0793af24cded.png"
name_png = "RunLengthDecode.png"
refimg = Image.open(
BytesIO(get_pdf_from_url(url_png, name=name_png))
) # not a pdf but it works
# Compare the first image of the first page with the reference image.
data = reader.pages[0].images[0]
diff = ImageChops.difference(data.image, refimg)
# Square root of the summed squared differences, divided by the pixel count.
d = sqrt(sum([(a * a) for a in diff.getdata()])) / (diff.size[0] * diff.size[1])
assert d < 0.001
# Extracting the image from this file is expected to raise PdfStreamError.
url = "https://github.com/py-pdf/pypdf/files/12162905/out.pdf"
name = "FailedRLE1.pdf"
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
with pytest.raises(PdfStreamError) as exc:
reader.pages[0].images[0]
assert exc.value.args[0] == "Unexpected EOD in RunLengthDecode"
# NOTE(review): url/name are reassigned below but, as shown here, no new
# PdfReader is built from them, so the following lines appear to exercise the
# FailedRLE1 reader a second time - confirm whether a
# `reader = PdfReader(...)` line is missing and which message FailedRLE2
# should actually produce.
url = "https://github.com/py-pdf/pypdf/files/12162926/out.pdf"
name = "FailedRLE2.pdf"
with pytest.raises(PdfStreamError) as exc:
reader.pages[0].images[0]
assert exc.value.args[0] == "Unexpected EOD in RunLengthDecode"


@pytest.mark.enable_socket()
def test_gray_separation_cmyk():
"""
Expand Down

0 comments on commit 3d7206c

Please sign in to comment.