From 3b890621267fb45d771c869ff56d151bb7a40cd5 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 26 Sep 2024 21:08:59 +0200 Subject: [PATCH] BUG: Cope with encoding with too many differences (#2873) Closes #2836. --- pypdf/_cmap.py | 3 ++- tests/test_cmap.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index dcf3678bd..4cc112552 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -207,7 +207,8 @@ def parse_encoding( x = o else: # isinstance(o,str): try: - encoding[x] = adobe_glyphs[o] # type: ignore + if x < len(encoding): + encoding[x] = adobe_glyphs[o] # type: ignore except Exception: encoding[x] = o # type: ignore if o == " ": diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 8042d306e..f0432469d 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -248,3 +248,14 @@ def test_unigb_utf16(): name = "iss2812.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert "《中国能源展望 2060(2024 年版)》编写委员会" in reader.pages[1].extract_text() + + +@pytest.mark.enable_socket() +def test_too_many_differences(): + """Cf #2836""" + url = ( + "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf" + ) + name = "iss2836.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert reader.pages[0].extract_text() == ""