diff --git a/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py b/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py index b46f318e7..f4b524fd0 100644 --- a/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py +++ b/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py @@ -197,6 +197,8 @@ def get_boxes_and_text(self, image: Image.Image) -> list[dict[str, Any]]: image.save(bytearray, format="BMP") result = self.reader.ocr(bytearray.getvalue(), rec=True, det=True, cls=False) out: list[dict[str, Any]] = [] + if not result or not result[0]: + return out for res in result[0]: raw_bbox = res[0] text = res[1][0] diff --git a/lib/sycamore/sycamore/transforms/text_extraction/pdf_miner.py b/lib/sycamore/sycamore/transforms/text_extraction/pdf_miner.py index 9811093f8..24666156c 100644 --- a/lib/sycamore/sycamore/transforms/text_extraction/pdf_miner.py +++ b/lib/sycamore/sycamore/transforms/text_extraction/pdf_miner.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from PIL.Image import Image - from pdfminer.layout import LTPage + from pdfminer.pdfpage import PDFPage logger = logging.getLogger(__name__) @@ -24,31 +24,21 @@ def __init__(self): from pdfminer.layout import LAParams from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager - rm = PDFResourceManager() - param = LAParams() - self.device = PDFPageAggregator(rm, laparams=param) - self.interpreter = PDFPageInterpreter(rm, self.device) + self.rm = PDFResourceManager() + self.param = LAParams() + self.device = PDFPageAggregator(self.rm, laparams=self.param) + self.interpreter = PDFPageInterpreter(self.rm, self.device) @staticmethod @requires_modules(["pdfminer", "pdfminer.utils"], extra="local-inference") - def pdf_to_pages(file_name: str) -> Generator["LTPage", None, None]: - from pdfminer.converter import PDFPageAggregator - from pdfminer.layout import LAParams - from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager - from pdfminer.pdfpage import PDFPage + def pdf_to_pages(file_name: str) -> Generator["PDFPage", None, None]: from pdfminer.utils import open_filename - rm = PDFResourceManager() - param = LAParams() - device = PDFPageAggregator(rm, laparams=param) - interpreter = PDFPageInterpreter(rm, device) with open_filename(file_name, "rb") as fp: fp = cast(BinaryIO, fp) pages = PDFPage.get_pages(fp) for page in pages: - interpreter.process_page(page) - page_layout = device.get_result() - yield page_layout + yield page @staticmethod def _convert_bbox_coordinates( @@ -74,20 +64,8 @@ def extract_document(self, filename: str, hash_key: str, use_cache=False, **kwar return cached_result else: pages = [] - for page_layout in PdfMinerExtractor.pdf_to_pages(filename): - width = page_layout.width - height = page_layout.height - texts: list[Element] = [] - for obj in page_layout: - x1, y1, x2, y2 = self._convert_bbox_coordinates(obj.bbox, height) - - if hasattr(obj, "get_text"): - text = Element() - text.type = "text" - text.bbox = BoundingBox(x1 / width, y1 / height, x2 / width, y2 / height) - text.text_representation = obj.get_text() - if text.text_representation: - texts.append(text) + for page in PdfMinerExtractor.pdf_to_pages(filename): + texts = self.extract_page(page) pages.append(texts) if use_cache: logger.info("Cache Miss for PDFMiner. Storing the result to the cache.") @@ -95,17 +73,18 @@ def extract_document(self, filename: str, hash_key: str, use_cache=False, **kwar return pages @timetrace("PdfMinerPageEx") - def extract_page(self, page: Union["LTPage", "Image"]) -> list[Element]: - from pdfminer.layout import LTPage + def extract_page(self, page: Union["PDFPage", "Image"]) -> list[Element]: + from pdfminer.pdfpage import PDFPage - assert isinstance(page, LTPage) - width = page.width - height = page.height + assert isinstance(page, PDFPage) + self.interpreter.process_page(page) + page_layout = self.device.get_result() + width = page_layout.width + height = page_layout.height texts: list[Element] = [] - for obj in page: - x1, y1, x2, y2 = self._convert_bbox_coordinates(obj.bbox, height) - + for obj in page_layout: if hasattr(obj, "get_text"): + x1, y1, x2, y2 = self._convert_bbox_coordinates(obj.bbox, height) text = Element() text.type = "text" text.bbox = BoundingBox(x1 / width, y1 / height, x2 / width, y2 / height) diff --git a/lib/sycamore/sycamore/transforms/text_extraction/text_extractor.py b/lib/sycamore/sycamore/transforms/text_extraction/text_extractor.py index 292e1f949..8f4f90ddb 100644 --- a/lib/sycamore/sycamore/transforms/text_extraction/text_extractor.py +++ b/lib/sycamore/sycamore/transforms/text_extraction/text_extractor.py @@ -5,12 +5,12 @@ if TYPE_CHECKING: from PIL.Image import Image - from pdfminer.layout import LTPage + from pdfminer.pdfpage import PDFPage class TextExtractor: @abstractmethod - def extract_page(self, filename: Union["Image", "LTPage"]) -> list[Element]: + def extract_page(self, filename: Union["Image", "PDFPage"]) -> list[Element]: pass @abstractmethod