Skip to content

Commit

Permalink
Catch exceptions from pdfminer and malformed PDFs
Browse files Browse the repository at this point in the history
... thanks to OSS-Fuzz and @ennamarie19

Cf.: google/oss-fuzz#12949
  • Loading branch information
jsvine committed Feb 9, 2025
1 parent a77808a commit 43ccc5b
Show file tree
Hide file tree
Showing 22 changed files with 85 additions and 6 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
- [@wodny](https://github.com/wodny)
- [Michal Stolarczyk](https://github.com/stolarczyk)
- [Brandon Roberts](https://github.com/brandonrobertz)
- [@ennamarie19](https://github.com/ennamarie19)

## Contributing

Expand Down
7 changes: 6 additions & 1 deletion pdfplumber/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils.exceptions import MalformedPDFException

if TYPE_CHECKING: # pragma: nocover
import pandas as pd
Expand Down Expand Up @@ -52,7 +53,11 @@ def get_page_image(
stream.seek(0)
src = stream

pdfium_doc = pypdfium2.PdfDocument(src, password=password)
try:
pdfium_doc = pypdfium2.PdfDocument(src, password=password)
except pypdfium2._helpers.misc.PdfiumError as e:
raise MalformedPDFException(e)

pdfium_page = pdfium_doc.get_page(page_ix)

img: PIL.Image.Image = pdfium_page.render(
Expand Down
16 changes: 14 additions & 2 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numbers
import re
from functools import lru_cache
from typing import (
Expand Down Expand Up @@ -35,6 +36,7 @@
from .structure import PDFStructTree, StructTreeMissing
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils import decode_text, resolve_all, resolve_and_decode
from .utils.exceptions import MalformedPDFException, PdfminerException
from .utils.text import TextMap

lt_pat = re.compile(r"^LT")
Expand Down Expand Up @@ -184,6 +186,10 @@ def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
# conventionally specified by their lower-left and upperright
# corners, it is acceptable to specify any two diagonally opposite
# corners."
if not all(isinstance(x, numbers.Number) for x in box_raw):
raise MalformedPDFException(
f"Bounding box contains non-number coordinate(s): {box_raw}"
)
x0, x1 = sorted((box_raw[0], box_raw[2]))
y0, y1 = sorted((box_raw[1], box_raw[3]))
if rotation in [90, 270]:
Expand Down Expand Up @@ -276,7 +282,10 @@ def layout(self) -> LTPage:
laparams=self.pdf.laparams,
)
interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
interpreter.process_page(self.page_obj)
try:
interpreter.process_page(self.page_obj)
except Exception as e:
raise PdfminerException(e)
self._layout: LTPage = device.get_result()
return self._layout

Expand Down Expand Up @@ -339,7 +348,10 @@ def parse(annot: T_obj) -> T_obj:
parsed["data"] = annot
return parsed

raw = resolve_all(self.page_obj.annots) or []
try:
raw = resolve_all(self.page_obj.annots) or []
except RecursionError:
raise MalformedPDFException("Annotations are infinitely recursive.")
parsed = list(map(parse, raw))
if isinstance(self, CroppedPage):
return self._crop_fn(parsed)
Expand Down
21 changes: 18 additions & 3 deletions pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
from io import BufferedReader, BytesIO
from types import TracebackType
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Type, Union

from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
Expand All @@ -18,6 +18,7 @@
from .repair import T_repair_setting, _repair
from .structure import PDFStructTree, StructTreeMissing
from .utils import resolve_and_decode
from .utils.exceptions import PdfminerException

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -46,7 +47,10 @@ def __init__(
self.unicode_norm = unicode_norm
self.raise_unicode_errors = raise_unicode_errors

self.doc = PDFDocument(PDFParser(stream), password=password or "")
try:
self.doc = PDFDocument(PDFParser(stream), password=password or "")
except Exception as e:
raise PdfminerException(e)
self.rsrcmgr = PDFResourceManager()
self.metadata = {}

Expand Down Expand Up @@ -146,7 +150,18 @@ def pages(self) -> List[Page]:
doctop: T_num = 0
pp = self.pages_to_parse
self._pages: List[Page] = []
for i, page in enumerate(PDFPage.create_pages(self.doc)):

def iter_pages() -> Generator[PDFPage, None, None]:
    """Yield the document's pages, wrapping any failure from pdfminer.

    Pages are produced lazily, so an exception can surface on any
    iteration step, not just the first. Whatever is raised while
    advancing the underlying generator is re-raised as
    ``PdfminerException``, explicitly chained to the original error so
    the root cause stays visible in the traceback.

    Raises:
        PdfminerException: if producing the next page fails.
    """
    try:
        # `yield from` delegates iteration and ends cleanly on exhaustion,
        # replacing the manual next()/StopIteration loop.
        yield from PDFPage.create_pages(self.doc)
    except Exception as e:
        raise PdfminerException(e) from e

for i, page in enumerate(iter_pages()):
page_number = i + 1
if pp is not None and page_number not in pp:
continue
Expand Down
6 changes: 6 additions & 0 deletions pdfplumber/utils/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class MalformedPDFException(Exception):
    """Raised when a PDF is structurally invalid or cannot be interpreted.

    Used, for example, when a bounding box contains non-numeric
    coordinates, when a page's annotations are infinitely recursive, or
    when the rendering backend cannot open the document.
    """


class PdfminerException(Exception):
    """Wraps an exception raised while pdfminer parses or processes a PDF.

    The original exception is passed as the sole argument, so it is
    available as ``args[0]`` and its message is what ``str()`` returns.
    """
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/pdfs/from-oss-fuzz/load/6085913544818688.pdf
Binary file not shown.
Binary file not shown.
40 changes: 40 additions & 0 deletions tests/test_oss_fuzz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python
import logging
import os
import unittest
from pathlib import Path

import pdfplumber
from pdfplumber.utils.exceptions import MalformedPDFException, PdfminerException

# Suppress every log record of severity ERROR and below, process-wide.
# NOTE(review): presumably this keeps parser noise from the malformed
# fixture PDFs out of the test output — confirm.
logging.disable(logging.ERROR)

# Absolute path of the directory that contains this test module.
HERE = Path(os.path.abspath(os.path.dirname(__file__)))

# Exception types that count as a graceful failure: the test catches these
# and moves on, while any other exception fails the test.
ACCEPTABLE_EXCEPTIONS = (MalformedPDFException, PdfminerException)


class Test(unittest.TestCase):
    """Regression tests built from the PDFs under ``pdfs/from-oss-fuzz``."""

    def test_load(self):
        """Open and convert every fixture PDF in ``pdfs/from-oss-fuzz/load``.

        A file passes if each operation either succeeds or raises one of
        ``ACCEPTABLE_EXCEPTIONS``. Any other exception fails the test; the
        offending file name is printed once before the original exception
        is re-raised.
        """

        def test_conversions(pdf):
            # Acceptable exceptions are swallowed per method so that one
            # failing conversion does not stop the others from being tried.
            # Unexpected exceptions propagate to the caller's handler, which
            # reports the file name (reporting here too would print twice).
            methods = [pdf.to_dict, pdf.to_json, pdf.to_csv, pdf.pages[0].to_image]
            for method in methods:
                try:
                    method()
                except ACCEPTABLE_EXCEPTIONS:
                    continue

        paths = sorted((HERE / "pdfs/from-oss-fuzz/load/").glob("*.pdf"))
        for path in paths:
            try:
                with pdfplumber.open(path) as pdf:
                    # assertTrue still runs under `python -O`, unlike a
                    # bare assert statement.
                    self.assertTrue(pdf.pages)
                    test_conversions(pdf)
            except ACCEPTABLE_EXCEPTIONS:
                continue
            except Exception:
                print(f"Failed on: {path.name}")
                # Bare raise re-raises the active exception unchanged.
                raise

0 comments on commit 43ccc5b

Please sign in to comment.