Catch exceptions from pdfminer and malformed PDFs

... thanks to OSS-Fuzz and @ennamarie19. Cf. google/oss-fuzz#12949
jsvine · Feb 9, 2025 · 43ccc5b · 43ccc5b
1 parent a77808a
commit 43ccc5b
Show file tree

Hide file tree

Showing 22 changed files with 85 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [@wodny](https://github.com/wodny)
 - [Michal Stolarczyk](https://github.com/stolarczyk)
 - [Brandon Roberts](https://github.com/brandonrobertz)
+- [@ennamarie19](https://github.com/ennamarie19)
 
 ## Contributing
 

diff --git a/pdfplumber/display.py b/pdfplumber/display.py
@@ -9,6 +9,7 @@
 from . import utils
 from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
 from .table import T_table_settings, Table, TableFinder, TableSettings
+from .utils.exceptions import MalformedPDFException
 
 if TYPE_CHECKING:  # pragma: nocover
     import pandas as pd
@@ -52,7 +53,11 @@ def get_page_image(
         stream.seek(0)
         src = stream
 
-    pdfium_doc = pypdfium2.PdfDocument(src, password=password)
+    try:
+        pdfium_doc = pypdfium2.PdfDocument(src, password=password)
+    except pypdfium2._helpers.misc.PdfiumError as e:
+        raise MalformedPDFException(e)
+
     pdfium_page = pdfium_doc.get_page(page_ix)
 
     img: PIL.Image.Image = pdfium_page.render(

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -1,3 +1,4 @@
+import numbers
 import re
 from functools import lru_cache
 from typing import (
@@ -35,6 +36,7 @@
 from .structure import PDFStructTree, StructTreeMissing
 from .table import T_table_settings, Table, TableFinder, TableSettings
 from .utils import decode_text, resolve_all, resolve_and_decode
+from .utils.exceptions import MalformedPDFException, PdfminerException
 from .utils.text import TextMap
 
 lt_pat = re.compile(r"^LT")
@@ -184,6 +186,10 @@ def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
     # conventionally specified by their lower-left and upperright
     # corners, it is acceptable to specify any two diagonally opposite
     # corners."
+    if not all(isinstance(x, numbers.Number) for x in box_raw):
+        raise MalformedPDFException(
+            f"Bounding box contains non-number coordinate(s): {box_raw}"
+        )
     x0, x1 = sorted((box_raw[0], box_raw[2]))
     y0, y1 = sorted((box_raw[1], box_raw[3]))
     if rotation in [90, 270]:
@@ -276,7 +282,10 @@ def layout(self) -> LTPage:
             laparams=self.pdf.laparams,
         )
         interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
-        interpreter.process_page(self.page_obj)
+        try:
+            interpreter.process_page(self.page_obj)
+        except Exception as e:
+            raise PdfminerException(e)
         self._layout: LTPage = device.get_result()
         return self._layout
 
@@ -339,7 +348,10 @@ def parse(annot: T_obj) -> T_obj:
             parsed["data"] = annot
             return parsed
 
-        raw = resolve_all(self.page_obj.annots) or []
+        try:
+            raw = resolve_all(self.page_obj.annots) or []
+        except RecursionError:
+            raise MalformedPDFException("Annotations are infinitely recursive.")
         parsed = list(map(parse, raw))
         if isinstance(self, CroppedPage):
             return self._crop_fn(parsed)

diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
@@ -3,7 +3,7 @@
 import pathlib
 from io import BufferedReader, BytesIO
 from types import TracebackType
-from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
+from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Type, Union
 
 from pdfminer.layout import LAParams
 from pdfminer.pdfdocument import PDFDocument
@@ -18,6 +18,7 @@
 from .repair import T_repair_setting, _repair
 from .structure import PDFStructTree, StructTreeMissing
 from .utils import resolve_and_decode
+from .utils.exceptions import PdfminerException
 
 logger = logging.getLogger(__name__)
 
@@ -46,7 +47,10 @@ def __init__(
         self.unicode_norm = unicode_norm
         self.raise_unicode_errors = raise_unicode_errors
 
-        self.doc = PDFDocument(PDFParser(stream), password=password or "")
+        try:
+            self.doc = PDFDocument(PDFParser(stream), password=password or "")
+        except Exception as e:
+            raise PdfminerException(e)
         self.rsrcmgr = PDFResourceManager()
         self.metadata = {}
 
@@ -146,7 +150,18 @@ def pages(self) -> List[Page]:
         doctop: T_num = 0
         pp = self.pages_to_parse
         self._pages: List[Page] = []
-        for i, page in enumerate(PDFPage.create_pages(self.doc)):
+
+        def iter_pages() -> Generator[PDFPage, None, None]:
+            gen = PDFPage.create_pages(self.doc)
+            while True:
+                try:
+                    yield next(gen)
+                except StopIteration:
+                    break
+                except Exception as e:
+                    raise PdfminerException(e)
+
+        for i, page in enumerate(iter_pages()):
             page_number = i + 1
             if pp is not None and page_number not in pp:
                 continue

diff --git a/pdfplumber/utils/exceptions.py b/pdfplumber/utils/exceptions.py
@@ -0,0 +1,6 @@
+class MalformedPDFException(Exception):
+    """Signals that the PDF being read is structurally invalid or corrupt."""
+
+
+class PdfminerException(Exception):
+    """Wraps an exception raised by pdfminer while parsing a document or page."""
diff --git a/tests/pdfs/from-oss-fuzz/load/4591020179783680.pdf b/tests/pdfs/from-oss-fuzz/load/4591020179783680.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/4652594248613888.pdf b/tests/pdfs/from-oss-fuzz/load/4652594248613888.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/4691742750474240.pdf b/tests/pdfs/from-oss-fuzz/load/4691742750474240.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/4715311080734720.pdf b/tests/pdfs/from-oss-fuzz/load/4715311080734720.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/4736668896133120.pdf b/tests/pdfs/from-oss-fuzz/load/4736668896133120.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/4833695495684096.pdf b/tests/pdfs/from-oss-fuzz/load/4833695495684096.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5177159198507008.pdf b/tests/pdfs/from-oss-fuzz/load/5177159198507008.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5317294594523136.pdf b/tests/pdfs/from-oss-fuzz/load/5317294594523136.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5452007745323008.pdf b/tests/pdfs/from-oss-fuzz/load/5452007745323008.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5592736912179200.pdf b/tests/pdfs/from-oss-fuzz/load/5592736912179200.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5809779695484928.pdf b/tests/pdfs/from-oss-fuzz/load/5809779695484928.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5903429863538688.pdf b/tests/pdfs/from-oss-fuzz/load/5903429863538688.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/5914823472250880.pdf b/tests/pdfs/from-oss-fuzz/load/5914823472250880.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/6013812888633344.pdf b/tests/pdfs/from-oss-fuzz/load/6013812888633344.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/6085913544818688.pdf b/tests/pdfs/from-oss-fuzz/load/6085913544818688.pdf
diff --git a/tests/pdfs/from-oss-fuzz/load/6515565732102144.pdf b/tests/pdfs/from-oss-fuzz/load/6515565732102144.pdf
diff --git a/tests/test_oss_fuzz.py b/tests/test_oss_fuzz.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+import logging
+import os
+import unittest
+from pathlib import Path
+
+import pdfplumber
+from pdfplumber.utils.exceptions import MalformedPDFException, PdfminerException
+
+logging.disable(logging.ERROR)
+
+HERE = Path(os.path.abspath(os.path.dirname(__file__)))
+
+ACCEPTABLE_EXCEPTIONS = (MalformedPDFException, PdfminerException)
+
+
+class Test(unittest.TestCase):
+    def test_load(self):
+        """Every OSS-Fuzz PDF must load or fail with an acceptable exception."""
+
+        def test_conversions(pdf):
+            # Unexpected errors propagate to the caller, which names the file.
+            methods = [pdf.to_dict, pdf.to_json, pdf.to_csv, pdf.pages[0].to_image]
+            for method in methods:
+                try:
+                    method()
+                except ACCEPTABLE_EXCEPTIONS:
+                    continue
+
+        paths = sorted((HERE / "pdfs/from-oss-fuzz/load/").glob("*.pdf"))
+        for path in paths:
+            try:
+                with pdfplumber.open(path) as pdf:
+                    assert pdf.pages
+                    test_conversions(pdf)
+            except ACCEPTABLE_EXCEPTIONS:
+                continue
+            except Exception:
+                print(f"Failed on: {path.name}")
+                raise