Skip to content

Commit

Permalink
first tranche of fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
karanataryn committed Oct 4, 2024
1 parent 8e69f98 commit 6f2b1a2
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 41 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -197,6 +197,8 @@ def get_boxes_and_text(self, image: Image.Image) -> list[dict[str, Any]]:
image.save(bytearray, format="BMP")
result = self.reader.ocr(bytearray.getvalue(), rec=True, det=True, cls=False)
out: list[dict[str, Any]] = []
if not result or not result[0]:
return out
for res in result[0]:
raw_bbox = res[0]
text = res[1][0]
Expand Down
57 changes: 18 additions & 39 deletions lib/sycamore/sycamore/transforms/text_extraction/pdf_miner.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,7 +9,7 @@

if TYPE_CHECKING:
from PIL.Image import Image
from pdfminer.layout import LTPage
from pdfminer.pdfpage import PDFPage

logger = logging.getLogger(__name__)

Expand All @@ -24,31 +24,21 @@ def __init__(self):
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager

rm = PDFResourceManager()
param = LAParams()
self.device = PDFPageAggregator(rm, laparams=param)
self.interpreter = PDFPageInterpreter(rm, self.device)
self.rm = PDFResourceManager()
self.param = LAParams()
self.device = PDFPageAggregator(self.rm, laparams=self.param)
self.interpreter = PDFPageInterpreter(self.rm, self.device)

@staticmethod
@requires_modules(["pdfminer", "pdfminer.utils"], extra="local-inference")
def pdf_to_pages(file_name: str) -> Generator["LTPage", None, None]:
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
def pdf_to_pages(file_name: str) -> Generator["PDFPage", None, None]:
from pdfminer.utils import open_filename

rm = PDFResourceManager()
param = LAParams()
device = PDFPageAggregator(rm, laparams=param)
interpreter = PDFPageInterpreter(rm, device)
with open_filename(file_name, "rb") as fp:
fp = cast(BinaryIO, fp)
pages = PDFPage.get_pages(fp)
for page in pages:
interpreter.process_page(page)
page_layout = device.get_result()
yield page_layout
yield page

@staticmethod
def _convert_bbox_coordinates(
Expand All @@ -74,38 +64,27 @@ def extract_document(self, filename: str, hash_key: str, use_cache=False, **kwar
return cached_result
else:
pages = []
for page_layout in PdfMinerExtractor.pdf_to_pages(filename):
width = page_layout.width
height = page_layout.height
texts: list[Element] = []
for obj in page_layout:
x1, y1, x2, y2 = self._convert_bbox_coordinates(obj.bbox, height)

if hasattr(obj, "get_text"):
text = Element()
text.type = "text"
text.bbox = BoundingBox(x1 / width, y1 / height, x2 / width, y2 / height)
text.text_representation = obj.get_text()
if text.text_representation:
texts.append(text)
for page in PdfMinerExtractor.pdf_to_pages(filename):
texts = self.extract_page(page)
pages.append(texts)
if use_cache:
logger.info("Cache Miss for PDFMiner. Storing the result to the cache.")
pdf_miner_cache.set(hash_key, pages)
return pages

@timetrace("PdfMinerPageEx")
def extract_page(self, page: Union["LTPage", "Image"]) -> list[Element]:
from pdfminer.layout import LTPage
def extract_page(self, page: Union["PDFPage", "Image"]) -> list[Element]:
from pdfminer.pdfpage import PDFPage

assert isinstance(page, LTPage)
width = page.width
height = page.height
assert isinstance(page, PDFPage)
self.interpreter.process_page(page)
page_layout = self.device.get_result()
width = page_layout.width
height = page_layout.height
texts: list[Element] = []
for obj in page:
x1, y1, x2, y2 = self._convert_bbox_coordinates(obj.bbox, height)

for obj in page_layout:
if hasattr(obj, "get_text"):
x1, y1, x2, y2 = self._convert_bbox_coordinates(obj.bbox, height)
text = Element()
text.type = "text"
text.bbox = BoundingBox(x1 / width, y1 / height, x2 / width, y2 / height)
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,12 +5,12 @@

if TYPE_CHECKING:
from PIL.Image import Image
from pdfminer.layout import LTPage
from pdfminer.pdfpage import PDFPage


class TextExtractor:
@abstractmethod
def extract_page(self, filename: Union["Image", "LTPage"]) -> list[Element]:
def extract_page(self, filename: Union["Image", "PDFPage"]) -> list[Element]:
pass

@abstractmethod
Expand Down

0 comments on commit 6f2b1a2

Please sign in to comment.