From 3b890621267fb45d771c869ff56d151bb7a40cd5 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 26 Sep 2024 21:08:59 +0200
Subject: [PATCH] BUG: Cope with encoding with too many differences (#2873)

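Skip glyph assignments whose character code lies beyond the end of the
encoding table instead of letting the out-of-range index raise.

Closes #2836.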
---
 pypdf/_cmap.py     |  3 ++-
 tests/test_cmap.py | 11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index dcf3678bd..4cc112552 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -207,7 +207,8 @@ def parse_encoding(
                 x = o
             else:  # isinstance(o,str):
                 try:
-                    encoding[x] = adobe_glyphs[o]  # type: ignore
+                    if x < len(encoding):
+                        encoding[x] = adobe_glyphs[o]  # type: ignore
                 except Exception:
                     encoding[x] = o  # type: ignore
                     if o == " ":
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
index 8042d306e..f0432469d 100644
--- a/tests/test_cmap.py
+++ b/tests/test_cmap.py
@@ -248,3 +248,14 @@ def test_unigb_utf16():
     name = "iss2812.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert "《中国能源展望 2060（2024 年版）》编写委员会" in reader.pages[1].extract_text()
+
+
+@pytest.mark.enable_socket()
+def test_too_many_differences():
+    """Regression test for #2836: too many encoding differences must not crash extraction."""
+    url = (
+        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf"
+    )
+    name = "iss2836.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].extract_text() == ""