Commit
Filimoa committed Mar 27, 2024
1 parent 8a511ab commit 4dcd8e7
Showing 25 changed files with 371 additions and 965 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -26,3 +26,7 @@ archive.zip
.DS_Store

.venv/

# misc
trash.py
test-output.pdf
682 changes: 21 additions & 661 deletions LICENSE

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions README.md
@@ -43,12 +43,15 @@ Try the sample notebook <a href="https://github.com/pymupdf/PyMuPDF" class="external-link" target="_blank">

Python 3.8+

OpenParse stands on the shoulders of giants:
**Dealing with PDFs:**

- <a href="https://github.com/pymupdf/PyMuPDF" class="external-link" target="_blank">PyMuPDF</a> for handling pdf files
- <a href="https://huggingface.co/microsoft/table-transformer-detection" class="external-link" target="_blank">Table Transformer</a> for parsing tables
- <a href="https://github.com/pdfminer/pdfminer.six" class="external-link" target="_blank">pdfminer.six</a> Fully open source.

**Extracting Tables:**

- <a href="https://github.com/pymupdf/PyMuPDF" class="external-link" target="_blank">PyMuPDF</a> has some table detection functionality. Please see their <a href="https://mupdf.com/licensing/index.html#commercial" class="external-link" target="_blank">license</a>.
- <a href="https://huggingface.co/microsoft/table-transformer-detection" class="external-link" target="_blank">Table Transformer</a> is a deep learning approach.
- <a href="https://github.com/poloclub/unitable" class="external-link" target="_blank">unitable</a> is a more recent deep learning approach that seems promising *(coming soon)*

## Installation

@@ -61,7 +64,7 @@ pip install open-parse

**Enabling OCR Support**:

PyMuPDF already contains all the logic needed to support OCR functions. However, it still needs Tesseract's language support data, so Tesseract-OCR must be installed.

The location of the language support folder must be communicated either by storing it in the environment variable "TESSDATA_PREFIX" or by passing it as a parameter to the applicable functions.
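
For example, the variable can be set from Python before any OCR call is made. This is a minimal sketch; the tessdata path below is an assumption and must be adjusted to your Tesseract installation:

```python
import os

# Hypothetical tessdata location; point this at your own Tesseract language data.
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/4.00/tessdata"
```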

@@ -100,3 +103,5 @@ pip install "open-parse[tables]"





File renamed without changes.
Binary file added evals/data/tables/meta-2022-10k-page-53.pdf
Binary file not shown.
Binary file added evals/data/tables/meta-2022-10k-page-69.pdf
Binary file not shown.
2 changes: 2 additions & 0 deletions notebooks/.gitignore
@@ -1 +1,3 @@
config.py
*.ipynb

2 changes: 2 additions & 0 deletions src/consts.py
@@ -1 +1,3 @@
MAX_EMBEDDING_TOKENS = 8000
TOKENIZATION_LOWER_LIMIT = 256
TOKENIZATION_UPPER_LIMIT = 1024
25 changes: 3 additions & 22 deletions src/main.py
@@ -1,12 +1,10 @@
from typing import List, Union, TypedDict
from pathlib import Path

import fitz

from src import text, tables
from src.processing import run_pipeline, ProcessingArgs
from src.utils import load_doc
from src.schemas import Node
from src.pdf import Pdf


class TableArgs(TypedDict, total=False):
@@ -28,17 +26,14 @@ def __init__(

def parse(
self,
file: str | Path | fitz.Document,
file: str | Path,
) -> List[Node]:
doc = load_doc(file)
doc = Pdf(file)

text_elems = text.ingest(doc)
text_nodes = [
Node(
elements=[e],
tokenization_upper_limit=self.processing_args[
"tokenization_upper_limit"
],
)
for e in text_elems
]
@@ -50,17 +45,3 @@ def parse(
all_elems = text_elems + table_elems
processed_elems = run_pipeline(all_elems, self.processing_args)
return processed_elems


parser = DocumentParser(
table_args={
"parse": True,
"args": {
"min_table_confidence": 0.75,
"min_cell_confidence": 0.95,
"table_output_format": "markdown",
},
},
)

parsed = parser.parse("path/to/sample.pdf")
171 changes: 171 additions & 0 deletions src/pdf.py
@@ -0,0 +1,171 @@
from pathlib import Path
from typing import Union, List, Optional, Iterator, Literal
import tempfile
import random

from pypdf import PdfReader, PdfWriter
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage

from src.schemas import Bbox


class Pdf:
"""
Simple utility class for working with PDF files. This class wraps the PdfReader and PdfWriter classes from pypdf.
"""

def __init__(self, file: Union[str, Path, PdfReader]):
self.file_path = str(file) if isinstance(file, (str, Path)) else None
self.reader = PdfReader(file) if isinstance(file, (str, Path)) else file
self.writer = PdfWriter()
for page in self.reader.pages:
self.writer.add_page(page)

def extract_layout_pages(self) -> Iterator[LTPage]:
"""
Yields layout objects for each page in the PDF using pdfminer.six.
"""
assert (
self.file_path is not None
), "PDF file path is required for this method for now."

for page_layout in extract_pages(self.file_path):
yield page_layout

def save(self, output_pdf: Union[str, Path]) -> None:
"""
Saves the content from the PdfWriter to a new PDF file.
"""
with open(str(output_pdf), "wb") as out_pdf:
self.writer.write(out_pdf)

def extract_pages(self, start_page: int, end_page: int) -> None:
"""
Extracts a range of pages from the PDF and adds them to the PdfWriter.
"""
for page_num in range(start_page - 1, end_page):
self.writer.add_page(self.reader.pages[page_num])

def to_pymupdf_doc(self):
"""
Transforms the PDF into a PyMuPDF (fitz) document.
If modifications have been made using PdfWriter, it saves to a temporary file first.
This function dynamically imports PyMuPDF (fitz), requiring it only if this method is called.
"""
try:
import fitz
except ImportError:
raise ImportError(
"PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
)

if not self.writer.pages:
return fitz.open(self.file_path)

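# delete=False keeps the temporary file on disk so fitz can reopen it
# by name; cleanup of the file is left to the operating system or caller.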
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
self.writer.write(tmpfile.name)
return fitz.open(tmpfile.name)

def _draw_bboxes(
self,
bboxes: List[Bbox],
coordinates: Literal[
"pymupdf",
"pdfminer",
],
):
try:
import fitz
except ImportError:
raise ImportError(
"PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
)

pdf = self.to_pymupdf_doc()

for page in pdf:
page.wrap_contents()

for bbox in bboxes:
if bbox.page != page.number:
continue
if coordinates == "pdfminer":
bbox = self._flip_coordinates(bbox)
# fitz.Rect takes positional coordinates (x0, y0, x1, y1).
r = fitz.Rect(bbox.x0, bbox.y0, bbox.x1, bbox.y1)
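# Random stroke color; PyMuPDF expects RGB components in [0, 1].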
color = (
random.randint(0, 255) / 256,
random.randint(0, 255) / 256,
random.randint(0, 255) / 256,
)
page.draw_rect(r, color)
return pdf

def display_with_bboxes(
self,
bboxes: Union[List[Bbox], List[List[Bbox]]],
page_nums: Optional[List[int]] = None,
coordinates: Literal[
"pymupdf",
"pdfminer",
] = "pdfminer",
):
"""
Display a single page of a PDF file using IPython.
"""
try:
from IPython.display import Image, display # type: ignore
except ImportError:
raise ImportError(
"IPython is required to display PDFs. Please install it with `pip install ipython`."
)
flattened_bboxes = self._flatten_bboxes(bboxes)
marked_up_doc = self._draw_bboxes(flattened_bboxes, coordinates)
if not page_nums:
page_nums = list(range(marked_up_doc.page_count))
for page_num in page_nums:
page = marked_up_doc[page_num]
img_data = page.get_pixmap().tobytes("png")
display(Image(data=img_data))

def export_with_bboxes(
self,
bboxes: Union[List[Bbox], List[List[Bbox]]],
output_pdf: Union[str, Path],
coordinates: Literal[
"pymupdf",
"pdfminer",
] = "pdfminer",
) -> None:
flattened_bboxes = self._flatten_bboxes(bboxes)
marked_up_doc = self._draw_bboxes(flattened_bboxes, coordinates)
marked_up_doc.save(str(output_pdf))

def _flip_coordinates(self, bbox: Bbox) -> Bbox:
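"""
Mirrors the y-axis to convert a bbox between pdfminer coordinates
(origin at the bottom-left) and PyMuPDF coordinates (origin at the top-left).
"""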
fy0 = bbox.page_height - bbox.y1
fy1 = bbox.page_height - bbox.y0
return Bbox(
page=bbox.page,
page_height=bbox.page_height,
page_width=bbox.page_width,
x0=bbox.x0,
y0=fy0,
x1=bbox.x1,
y1=fy1,
)

def _flatten_bboxes(
self, bboxes: Union[List[Bbox], List[List[Bbox]]]
) -> List[Bbox]:
res = []
for element in bboxes:
if isinstance(element, Bbox):
res.append(element)
elif isinstance(element, list):
res.extend(bbox for bbox in element if isinstance(bbox, Bbox))
return res
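
For readers skimming the diff, here is a minimal sketch of how the new `Pdf` wrapper might be used. The file names are hypothetical, and the `Bbox` constructor is assumed to take the same fields that `_flip_coordinates` reads:

```python
from src.pdf import Pdf
from src.schemas import Bbox

pdf = Pdf("sample.pdf")  # hypothetical input file

# Iterate pdfminer layout objects page by page.
for page_layout in pdf.extract_layout_pages():
    print(page_layout.pageid)

# Draw one box (pdfminer coordinates, assumed field names) and export a marked-up copy.
box = Bbox(page=0, page_height=792, page_width=612, x0=72, y0=72, x1=300, y1=144)
pdf.export_with_bboxes([box], "annotated.pdf")
```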
2 changes: 1 addition & 1 deletion src/processing/__init__.py
@@ -1 +1 @@
from .ingest import ProcessingArgs, run_pipeline
from .ingest import run_pipeline
29 changes: 2 additions & 27 deletions src/processing/ingest.py
@@ -5,32 +5,7 @@
from src.processing.steps import ProcessingStep, default_pipeline


class ProcessingArgs(TypedDict, total=False):
min_tokens: int
max_tokens: int
processing_pipeline: List[ProcessingStep]


@dataclass
class ParsedProcessingArgs:
min_tokens: float = 128
max_tokens: float = 1024
processing_pipeline: List[ProcessingStep] = default_pipeline


def merge_with_defaults(user_args: Union[ProcessingArgs, None]) -> ParsedProcessingArgs:
args = ParsedProcessingArgs()

if user_args:
for key, value in user_args.items():
if hasattr(args, key):
setattr(args, key, value)

return args


def run_pipeline(nodes: List[Node], args: Union[ProcessingArgs, None]) -> List[Node]:
parsed_args = merge_with_defaults(args)
for transform in parsed_args.processing_pipeline:
def run_pipeline(nodes: List[Node], args: Union[dict, None]) -> List[Node]:
for transform in default_pipeline:
nodes = transform.process(nodes)
return nodes
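
With this simplification, `run_pipeline` ignores its `args` parameter and simply folds `default_pipeline` over the nodes. A minimal sketch of calling it, assuming `nodes` is a list of `Node` objects produced by ingestion:

```python
from src.processing import run_pipeline

# nodes: hypothetical List[Node] from text/table ingestion
processed = run_pipeline(nodes, args=None)  # args is currently unused
```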
28 changes: 22 additions & 6 deletions src/processing/steps.py
@@ -38,7 +38,7 @@ def process(self, nodes: List[Node]) -> List[Node]:


class RemoveMetadataElements(ProcessingStep):
def __init__(self, min_y0_pct: float = 0.12, max_y0_pct: float = 0.88):
def __init__(self, min_y0_pct: float = 0.10, max_y0_pct: float = 0.90):
self.min_y0_pct = min_y0_pct
self.max_y0_pct = max_y0_pct

@@ -140,13 +140,29 @@ def process(self, nodes: List[Node]) -> List[Node]:
raise NotImplementedError("Not yet implemented.")


# default_pipeline = [
# RemoveFullPageStubs(max_area_pct=0.5), # Adjust max_area_pct as needed
# CombineNodesSpatially(x_error_margin=4, y_error_margin=4, criteria="both_small"),
# CombineNodesSpatially(), # Default margins and criteria
# # CombineBullets(),
# RemoveMetadataElements(),
# CombineNodesSpatially(x_error_margin=4, y_error_margin=12, criteria="either_stub"),
# CombineNodesSpatially(criteria="either_stub"),
# # SplitLargeElements(), # Implement
# RemoveRepeatedElements(threshold=2),
# # CombineHeadingsWithClosestText(), # Implement
# ]

# optimized for pdfminer
default_pipeline = [
RemoveFullPageStubs(max_area_pct=0.5),
CombineNodesSpatially(x_error_margin=4, y_error_margin=4, criteria="both_small"),
CombineNodesSpatially(x_error_margin=0, y_error_margin=0, criteria="both_small"),
RemoveFullPageStubs(max_area_pct=0.5), # Adjust max_area_pct as needed
CombineNodesSpatially(x_error_margin=10, y_error_margin=4, criteria="both_small"),
CombineNodesSpatially(x_error_margin=0, y_error_margin=10, criteria="both_small"),
# CombineBullets(),
RemoveMetadataElements(),
# CombineHeadingsWithClosestText(),
RemoveStubs(),
CombineNodesSpatially(criteria="either_stub"),
# SplitLargeElements(), # Implement
RemoveRepeatedElements(threshold=2),
RemoveStubs(),
# CombineHeadingsWithClosestText(), # Implement
]