STY: Refactor _xobj_to_image (#2863)

* STY: Refactor _xobj_to_image
* Create _apply_decode and move it to _xobj_image_helpers
* Create _get_mode_and_invert_color and move it to _xobj_image_helpers
* Create _apply_alpha as an inner function as it's calling _xobj_to_image

This reduced the Cyclomatic Complexity from 44 to 19

* Comment formatting
* Fix
py-pdf · Sep 22, 2024 · 6cfa0c4 · 6cfa0c4
1 parent 966e015
commit 6cfa0c4
Show file tree

Hide file tree

Showing 3 changed files with 137 additions and 87 deletions.
diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
@@ -2,10 +2,12 @@
 
 import sys
 from io import BytesIO
-from typing import Any, List, Literal, Tuple, Union, cast
+from typing import Any, Dict, List, Literal, Tuple, Union, cast
 
 from ._utils import check_if_whitespace_only, logger_warning
 from .constants import ColorSpaces
+from .constants import FilterTypes as FT
+from .constants import ImageAttributes as IA
 from .errors import EmptyImageDataError, PdfReadError
 from .generic import (
     ArrayObject,
@@ -303,3 +305,73 @@ def _handle_jpx(
         img = img.convert("RGB")
     image_format = "JPEG2000"
     return img, image_format, extension, invert_color
+
+
+def _apply_decode(
+    img: Image.Image,
+    x_object_obj: Dict[str, Any],
+    lfilters: FT,
+    color_space: Union[str, List[Any], Any],
+    invert_color: bool,
+) -> Image.Image:
+    """
+    Remap the sample values of ``img`` according to the image's decode array.
+
+    The decode array is read from ``x_object_obj[IA.DECODE]``. When that key
+    is absent, an inverting array (``[1.0, 0.0]`` per band) is used for CMYK
+    images whose filter is DCT or JPX and for mode "L" images with
+    ``invert_color`` set; otherwise no decode array is assumed.
+    The color space then overrides this choice: "/Indexed" disables decoding,
+    "/Separation" always forces the inverting array.
+
+    Args:
+        img: The image to remap.
+        x_object_obj: The image XObject dictionary.
+        lfilters: The filter compared against ``FT.DCT_DECODE`` and
+            ``FT.JPX_DECODE``.
+        color_space: The color space of the image; only inspected when it
+            is an ``ArrayObject``.
+        invert_color: Whether a mode "L" image without an explicit decode
+            array has to be inverted.
+
+    Returns:
+        The result of ``img.point(lut)`` when a non-identity decode array
+        applies, otherwise ``img`` unchanged.
+    """
+    # CMYK image and other colorspaces without decode
+    # requires reverting scale (cf p243,2§ last sentence)
+    decode = x_object_obj.get(
+        IA.DECODE,
+        ([1.0, 0.0] * len(img.getbands()))
+        if (
+            (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
+            or (invert_color and img.mode == "L")
+        )
+        else None,
+    )
+    if (
+        isinstance(color_space, ArrayObject)
+        and color_space[0].get_object() == "/Indexed"
+    ):
+        decode = None  # decode is meaningless for Indexed
+    if (
+        isinstance(color_space, ArrayObject)
+        and color_space[0].get_object() == "/Separation"
+    ):
+        # Separation always uses the inverting array, replacing any explicit one
+        decode = [1.0, 0.0] * len(img.getbands())
+    # Skip the remapping when decode is the identity pattern [0, 1, 0, 1, ...]
+    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
+        lut: List[int] = []
+        # One (dmin, dmax) pair per band; each pair contributes 256 entries
+        # that interpolate linearly from dmin to dmax, scaled to 0..255
+        for i in range(0, len(decode), 2):
+            dmin = decode[i]
+            dmax = decode[i + 1]
+            lut.extend(
+                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
+            )
+        img = img.point(lut)
+    return img
+
+
+def _get_mode_and_invert_color(
+    x_object_obj: Dict[str, Any], colors: int, color_space: Union[str, List[Any], Any]
+) -> Tuple[mode_str_type, bool]:
+    """
+    Determine the image mode and the invert-color flag of an image XObject.
+
+    Both values come from ``_get_imagemode``. If "/BitsPerComponent"
+    (default 8) is below 8, it is called with the string ``"<n>bit"``.
+    Otherwise it is called with ``color_space`` and a color count, which is
+    2 instead of ``colors`` when ``colors`` is 1 and ``color_space`` is
+    neither a ``NullObject`` nor contains "Gray".
+
+    Args:
+        x_object_obj: The image XObject dictionary.
+        colors: The number of colors of the image.
+        color_space: The color space of the image.
+
+    Returns:
+        The tuple ``(mode, invert_color)`` returned by ``_get_imagemode``.
+    """
+    if (
+        IA.COLOR_SPACE in x_object_obj
+        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
+    ):
+        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
+        # NOTE(review): this assignment has no effect on the result, as both
+        # branches of the if/else below reassign ``mode``.
+        mode: mode_str_type = "RGB"
+    if x_object_obj.get("/BitsPerComponent", 8) < 8:
+        mode, invert_color = _get_imagemode(
+            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
+        )
+    else:
+        mode, invert_color = _get_imagemode(
+            color_space,
+            2
+            if (
+                colors == 1
+                and (
+                    not isinstance(color_space, NullObject)
+                    and "Gray" not in color_space
+                )
+            )
+            else colors,
+            "",
+        )
+    return mode, invert_color
diff --git a/pypdf/constants.py b/pypdf/constants.py
@@ -11,10 +11,15 @@
 ISO 32000-2:2020 (PDF 2.0)
 """
 
-from enum import IntFlag, auto
+from enum import Enum, IntFlag, auto, unique
 from typing import Dict, Tuple
 
 
+class StrEnum(str, Enum):  # Once we are on Python 3.11+: enum.StrEnum
+    """Enum whose members are ``str`` instances; ``str(member)`` is its value."""
+
+    def __str__(self) -> str:
+        # Return the plain value instead of the default Enum string form
+        return str(self.value)
+
+
 class Core:
     """Keywords that don't quite belong anywhere else."""
 
@@ -167,8 +172,10 @@ class PagesAttributes:
     TYPE = "/Type"  # name, required; must be /Pages
     PARENT = "/Parent"  # dictionary, required; indirect reference to pages object
     KIDS = "/Kids"  # array, required; List of indirect references
-    COUNT = "/Count"  # integer, required; the number of leaf nodes (page objects)
-                      # that are descendants of this node within the page tree
+
+    COUNT = "/Count"
+    # integer, required; the number of leaf nodes (page objects)
+    # that are descendants of this node within the page tree
 
 
 class PageAttributes:
@@ -240,7 +247,8 @@ class StreamAttributes:
     DECODE_PARMS = "/DecodeParms"  # variable, optional -- 'decodeParams is wrong
 
 
-class FilterTypes:
+@unique
+class FilterTypes(StrEnum):
     """§7.4 of the 1.7 and 2.0 references."""
 
     ASCII_HEX_DECODE = "/ASCIIHexDecode"  # abbreviation: AHx

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -49,7 +49,6 @@
     ord_,
 )
 from .constants import CcittFaxDecodeParameters as CCITT
-from .constants import ColorSpaces
 from .constants import FilterTypeAbbreviations as FTA
 from .constants import FilterTypes as FT
 from .constants import ImageAttributes as IA
@@ -735,57 +734,72 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     from ._xobj_image_helpers import (
         Image,
         UnidentifiedImageError,
+        _apply_decode,
         _extended_image_frombytes,
-        _get_imagemode,
+        _get_mode_and_invert_color,
         _handle_flate,
         _handle_jpx,
-        mode_str_type,
     )
 
+    def _apply_alpha(
+        img: Image.Image,
+        x_object_obj: Dict[str, Any],
+        obj_as_text: str,
+        image_format: str,
+        extension: str,
+    ) -> Tuple[Image.Image, str, str]:
+        """
+        Add the soft mask of the image XObject as alpha channel, if present.
+
+        Without an ``IA.S_MASK`` entry, all inputs are returned unchanged.
+        With one, the mask is decoded through a recursive ``_xobj_to_image``
+        call and attached to ``img`` if both have the same size; a size
+        mismatch only logs a warning. In both cases the output format is
+        switched to JPEG2000 (".jp2") if ``image_format`` contains "JPEG",
+        otherwise to PNG (".png").
+
+        Args:
+            img: The decoded image.
+            x_object_obj: The image XObject dictionary.
+            obj_as_text: Text representation of the object, used in the
+                warning message.
+            image_format: The current image format name.
+            extension: The current file extension.
+
+        Returns:
+            The tuple ``(img, extension, image_format)``. Note that this
+            order differs from the parameter order.
+        """
+        alpha = None
+        if IA.S_MASK in x_object_obj:  # add alpha channel
+            # Third element of the result; presumably the decoded mask
+            # image, as it is used as such below
+            alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
+            if img.size != alpha.size:
+                logger_warning(
+                    f"image and mask size not matching: {obj_as_text}", __name__
+                )
+            else:
+                # TODO : implement mask
+                if alpha.mode != "L":
+                    alpha = alpha.convert("L")
+                # Convert mode "P" to "RGB" and mode "1" to "L" before
+                # the alpha channel is attached
+                if img.mode == "P":
+                    img = img.convert("RGB")
+                elif img.mode == "1":
+                    img = img.convert("L")
+                img.putalpha(alpha)
+            # The format is switched even if the mask has not been applied
+            if "JPEG" in image_format:
+                extension = ".jp2"
+                image_format = "JPEG2000"
+            else:
+                extension = ".png"
+                image_format = "PNG"
+        return img, extension, image_format
+
     # for error reporting
-    if x_object_obj is None:  # pragma: no cover
-        obj_as_text = x_object_obj.indirect_reference.__repr__()
-    else:
-        obj_as_text = x_object_obj.__repr__()
+    obj_as_text = (
+        x_object_obj.indirect_reference.__repr__()  # type: ignore
+        if x_object_obj is None  # pragma: no cover
+        else x_object_obj.__repr__()
+    )
 
+    # Get size and data
     size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
     data = x_object_obj.get_data()  # type: ignore
     if isinstance(data, str):  # pragma: no cover
         data = data.encode()
     if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
         data = data[:-1]
+
+    # Get color properties
     colors = x_object_obj.get("/Colors", 1)
     color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
     if isinstance(color_space, list) and len(color_space) == 1:
         color_space = color_space[0].get_object()
-    if (
-        IA.COLOR_SPACE in x_object_obj
-        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
-    ):
-        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
-        mode: mode_str_type = "RGB"
-    if x_object_obj.get("/BitsPerComponent", 8) < 8:
-        mode, invert_color = _get_imagemode(
-            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
-        )
-    else:
-        mode, invert_color = _get_imagemode(
-            color_space,
-            2
-            if (
-                colors == 1
-                and (
-                    not isinstance(color_space, NullObject)
-                    and "Gray" not in color_space
-                )
-            )
-            else colors,
-            "",
-        )
-    extension = None
-    alpha = None
+
+    mode, invert_color = _get_mode_and_invert_color(x_object_obj, colors, color_space)
+
+    # Get filters
     filters = x_object_obj.get(SA.FILTER, NullObject()).get_object()
     lfilters = filters[-1] if isinstance(filters, list) else filters
+
+    extension = None
     if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
         img, image_format, extension, _ = _handle_flate(
             size,
@@ -839,57 +853,13 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
             ".png",
             False,
         )
-    # CMYK image and other colorspaces without decode
-    # requires reverting scale (cf p243,2§ last sentence)
-    decode = x_object_obj.get(
-        IA.DECODE,
-        ([1.0, 0.0] * len(img.getbands()))
-        if (
-            (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
-            or (invert_color and img.mode == "L")
-        )
-        else None,
-    )
-    if (
-        isinstance(color_space, ArrayObject)
-        and color_space[0].get_object() == "/Indexed"
-    ):
-        decode = None  # decode is meanless of Indexed
-    if (
-        isinstance(color_space, ArrayObject)
-        and color_space[0].get_object() == "/Separation"
-    ):
-        decode = [1.0, 0.0] * len(img.getbands())
-    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
-        lut: List[int] = []
-        for i in range(0, len(decode), 2):
-            dmin = decode[i]
-            dmax = decode[i + 1]
-            lut.extend(
-                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
-            )
-        img = img.point(lut)
 
-    if IA.S_MASK in x_object_obj:  # add alpha channel
-        alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
-        if img.size != alpha.size:
-            logger_warning(f"image and mask size not matching: {obj_as_text}", __name__)
-        else:
-            # TODO : implement mask
-            if alpha.mode != "L":
-                alpha = alpha.convert("L")
-            if img.mode == "P":
-                img = img.convert("RGB")
-            elif img.mode == "1":
-                img = img.convert("L")
-            img.putalpha(alpha)
-        if "JPEG" in image_format:
-            extension = ".jp2"
-            image_format = "JPEG2000"
-        else:
-            extension = ".png"
-            image_format = "PNG"
+    img = _apply_decode(img, x_object_obj, lfilters, color_space, invert_color)
+    img, extension, image_format = _apply_alpha(
+        img, x_object_obj, obj_as_text, image_format, extension
+    )
 
+    # Save image to bytes
     img_byte_arr = BytesIO()
     try:
         img.save(img_byte_arr, format=image_format)