From 96b46add0d61940f099f40a9676bb8fff300eaa6 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com>
Date: Sun, 6 Oct 2024 18:46:58 +0900
Subject: [PATCH] MAINT: Generalize the method of obtaining space_code (#2891)

---
 pypdf/_cmap.py | 121 ++++++++++++++++++++++---------------------------
 pypdf/_page.py |  11 +----
 2 files changed, 57 insertions(+), 75 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 20e8cdc42..52a7b47b8 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -51,36 +51,11 @@ def build_char_map_from_dict(
         The font-dictionary itself is suitable for the curious.
     """
     font_type = cast(str, ft["/Subtype"].get_object())
+    encoding, map_dict = get_encoding(ft)
 
-    space_code = 32
-    encoding, space_code = parse_encoding(ft, space_code)
-    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)
-
-    # encoding can be either a string for decode
-    # (on 1,2 or a variable number of bytes) of a char table (for 1 byte only for me)
-    # if empty string, it means it is than encoding field is not present and
-    # we have to select the good encoding from cmap input data
-    if encoding == "":
-        if -1 not in map_dict or map_dict[-1] == 1:
-            # I have not been able to find any rule for no /Encoding nor /ToUnicode
-            # One example shows /Symbol,bold I consider 8 bits encoding default
-            encoding = "charmap"
-        else:
-            encoding = "utf-16-be"
-    # apply rule from PDF ref 1.7 §5.9.1, 1st bullet :
-    #   if cmap not empty encoding should be discarded
-    #   (here transformed into identity for those characters)
-    # if encoding is an str it is expected to be a identity translation
-    elif isinstance(encoding, dict):
-        for x in int_entry:
-            if x <= 255:
-                encoding[x] = chr(x)
-    if isinstance(space_code, str):
-        sp = space_code
-    else:
-        sp = chr(space_code)
+    space_key_char = get_actual_str_key(" ", encoding, map_dict)
     font_width_map = build_font_width_map(ft, space_width * 2.0)
-    half_space_width = compute_space_width(font_width_map, sp) / 2.0
+    half_space_width = compute_space_width(font_width_map, space_key_char) / 2.0
 
     return (
         font_type,
@@ -145,24 +120,36 @@ def build_char_map_from_dict(
 }
 
 
-def parse_encoding(
-    ft: DictionaryObject, space_code: int
-) -> Tuple[Union[str, Dict[int, str]], int]:
+def get_encoding(
+    ft: DictionaryObject
+) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
+    encoding = _parse_encoding(ft)  # a str, or a dict table of int code -> str
+    map_dict, int_entry = _parse_to_unicode(ft)  # translation map + list of int codes
+
+    # Apply the rule from PDF ref 1.7 §5.9.1, 1st bullet:
+    #   if the cmap is not empty, the encoding should be discarded
+    #   (here: replaced by an identity mapping for the codes listed in int_entry).
+    # If encoding is a string, it is expected to already be an identity translation.
+    if isinstance(encoding, dict):
+        for x in int_entry:
+            if x <= 255:  # codes above 255 are left out of the table
+                encoding[x] = chr(x)
+
+    return encoding, map_dict
+
+
+def _parse_encoding(
+    ft: DictionaryObject
+) -> Union[str, Dict[int, str]]:
     encoding: Union[str, List[str], Dict[int, str]] = []
     if "/Encoding" not in ft:
-        try:
-            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
-                encoding = dict(
-                    zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
-                )
-            else:
-                encoding = "charmap"
-            return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
-        except Exception:
-            if cast(str, ft["/Subtype"]) == "/Type1":
-                return "charmap", space_code
-            else:
-                return "", space_code
+        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
+            encoding = dict(
+                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
+            )
+        else:
+            encoding = "charmap"
+        return encoding
     enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object()  # type: ignore
     if isinstance(enc, str):
         try:
@@ -202,17 +189,15 @@ def parse_encoding(
                         encoding[x] = adobe_glyphs[o]  # type: ignore
                 except Exception:
                     encoding[x] = o  # type: ignore
-                    if o == " ":
-                        space_code = x
                 x += 1
     if isinstance(encoding, list):
         encoding = dict(zip(range(256), encoding))
-    return encoding, space_code
+    return encoding
 
 
-def parse_to_unicode(
-    ft: DictionaryObject, space_code: int
-) -> Tuple[Dict[Any, Any], int, List[int]]:
+def _parse_to_unicode(
+    ft: DictionaryObject
+) -> Tuple[Dict[Any, Any], List[int]]:
     # will store all translation code
     # and map_dict[-1] we will have the number of bytes to convert
     map_dict: Dict[Any, Any] = {}
@@ -222,9 +207,9 @@ def parse_to_unicode(
 
     if "/ToUnicode" not in ft:
         if ft.get("/Subtype", "") == "/Type1":
-            return type1_alternative(ft, map_dict, space_code, int_entry)
+            return _type1_alternative(ft, map_dict, int_entry)
         else:
-            return {}, space_code, []
+            return {}, []
     process_rg: bool = False
     process_char: bool = False
     multiline_rg: Union[
@@ -241,10 +226,19 @@ def parse_to_unicode(
             int_entry,
         )
 
-    for a, value in map_dict.items():
-        if value == " ":
-            space_code = a
-    return map_dict, space_code, int_entry
+    return map_dict, int_entry
+
+
+def get_actual_str_key(
+    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
+) -> str:
+    key_dict = {}  # will map value_char to a key that yields it, if one exists
+    if isinstance(encoding, dict):  # dict table: int keys, converted with chr(key)
+        key_dict = {value: chr(key) for key, value in encoding.items() if value == value_char}
+    else:  # encoding is a str: search map_dict instead; its keys are used unchanged
+        key_dict = {value: key for key, value in map_dict.items() if value == value_char}
+    key_char = key_dict.get(value_char, value_char)  # no match: value_char itself
+    return key_char
 
 
 def prepare_cm(ft: DictionaryObject) -> bytes:
@@ -499,17 +493,16 @@ def compute_font_width(
     return char_width
 
 
-def type1_alternative(
+def _type1_alternative(
     ft: DictionaryObject,
     map_dict: Dict[Any, Any],
-    space_code: int,
     int_entry: List[int],
-) -> Tuple[Dict[Any, Any], int, List[int]]:
+) -> Tuple[Dict[Any, Any], List[int]]:
     if "/FontDescriptor" not in ft:
-        return map_dict, space_code, int_entry
+        return map_dict, int_entry
     ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
     if is_null_or_none(ft_desc):
-        return map_dict, space_code, int_entry
+        return map_dict, int_entry
     assert ft_desc is not None, "mypy"
     txt = ft_desc.get_object().get_data()
     txt = txt.split(b"eexec\n")[0]  # only clear part
@@ -532,10 +525,6 @@ def type1_alternative(
                         v = chr(int(words[2][4:], 16))
                     except ValueError:  # pragma: no cover
                         continue
-                else:
-                    continue
-            if words[2].decode() == b" ":
-                space_code = i
             map_dict[chr(i)] = v
             int_entry.append(i)
-    return map_dict, space_code, int_entry
+    return map_dict, int_entry
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 1de3061c8..d57a10042 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -53,8 +53,7 @@
     build_char_map,
     build_font_width_map,
     compute_font_width,
-    parse_encoding,
-    parse_to_unicode,
+    get_actual_str_key,
     unknown_char_map,
 )
 from ._protocols import PdfCommonDocProtocol
@@ -1744,13 +1743,7 @@ def _get_acutual_font_widths(
                 actual_space_width: float = space_width
                 font_width_map["default"] = actual_space_width * 2
             else:
-                space_code = 32
-                _, space_code = parse_encoding(cmap[3], space_code)
-                _, space_code, _ = parse_to_unicode(cmap[3], space_code)
-                if isinstance(space_code, str):
-                    space_char = space_code
-                else:
-                    space_char = chr(space_code)
+                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
                 font_width_map = build_font_width_map(cmap[3], space_width * 2)
                 actual_space_width = compute_font_width(font_width_map, space_char)
             if actual_space_width == 0: