Commit
Filimoa committed Mar 27, 2024
1 parent 8a511ab commit 4dcd8e7
Showing 25 changed files with 371 additions and 965 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -26,3 +26,7 @@ archive.zip
.DS_Store

.venv/

# misc
trash.py
test-output.pdf
682 changes: 21 additions & 661 deletions LICENSE

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions README.md
@@ -43,12 +43,15 @@ Try the sample notebook <a href="https://github.com/pymupdf/PyMuPDF" class="external-link" target="_blank">

Python 3.8+

OpenParse stands on the shoulders of giants:
**Dealing with PDFs:**

- <a href="https://github.com/pymupdf/PyMuPDF" class="external-link" target="_blank">PyMuPDF</a> for handling pdf files
- <a href="https://huggingface.co/microsoft/table-transformer-detection" class="external-link" target="_blank">Table Transformer</a> for parsing tables
- <a href="https://github.com/pdfminer/pdfminer.six" class="external-link" target="_blank">pdfminer.six</a> Fully open source.

**Extracting Tables:**

- <a href="https://github.com/pymupdf/PyMuPDF" class="external-link" target="_blank">PyMuPDF</a> has some table detection functionality. Please see their <a href="https://mupdf.com/licensing/index.html#commercial" class="external-link" target="_blank">license</a>.
- <a href="https://huggingface.co/microsoft/table-transformer-detection" class="external-link" target="_blank">Table Transformer</a> is a deep learning approach.
- <a href="https://github.com/poloclub/unitable" class="external-link" target="_blank">unitable</a> is a more recent deep learning approach that seems promising *(coming soon)*

## Installation

@@ -61,7 +64,7 @@ pip install open-parse

**Enabling OCR Support**:

PyMuPDF already contains all the logic needed to support OCR functions. However, it still needs Tesseract's language support data, so Tesseract-OCR must be installed.

The location of the language support folder must be communicated either by storing it in the environment variable "TESSDATA_PREFIX" or by passing it as a parameter to the applicable functions.
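
For example, the variable can be set from Python before any OCR call is made. This is a minimal sketch; the tessdata path below is an assumption and must be adjusted to your Tesseract installation:

```python
import os

# Hypothetical tessdata location; point this at your own Tesseract language data.
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/4.00/tessdata"
```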

@@ -100,3 +103,5 @@ pip install "open-parse[tables]"





File renamed without changes.
Binary file added evals/data/tables/meta-2022-10k-page-53.pdf
Binary file not shown.
Binary file added evals/data/tables/meta-2022-10k-page-69.pdf
Binary file not shown.
2 changes: 2 additions & 0 deletions notebooks/.gitignore
@@ -1 +1,3 @@
config.py
*.ipynb

2 changes: 2 additions & 0 deletions src/consts.py
@@ -1 +1,3 @@
MAX_EMBEDDING_TOKENS = 8000
TOKENIZATION_LOWER_LIMIT = 256
TOKENIZATION_UPPER_LIMIT = 1024
25 changes: 3 additions & 22 deletions src/main.py
@@ -1,12 +1,10 @@
from typing import List, Union, TypedDict
from pathlib import Path

import fitz

from src import text, tables
from src.processing import run_pipeline, ProcessingArgs
from src.utils import load_doc
from src.schemas import Node
from src.pdf import Pdf


class TableArgs(TypedDict, total=False):
@@ -28,17 +26,14 @@ def __init__(

def parse(
self,
file: str | Path | fitz.Document,
file: str | Path,
) -> List[Node]:
doc = load_doc(file)
doc = Pdf(file)

text_elems = text.ingest(doc)
text_nodes = [
Node(
elements=[e],
tokenization_upper_limit=self.processing_args[
"tokenization_upper_limit"
],
)
for e in text_elems
]
@@ -50,17 +45,3 @@ def parse(
all_elems = text_elems + table_elems
processed_elems = run_pipeline(all_elems, self.processing_args)
return processed_elems


parser = DocumentParser(
table_args={
"parse": True,
"args": {
"min_table_confidence": 0.75,
"min_cell_confidence": 0.95,
"table_output_format": "markdown",
},
},
)

parsed = parser.parse("path/to/sample.pdf")
171 changes: 171 additions & 0 deletions src/pdf.py
@@ -0,0 +1,171 @@
from pathlib import Path
from typing import Union, List, Optional, Iterator, Literal
import tempfile
import random

from pypdf import PdfReader, PdfWriter
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage

from src.schemas import Bbox


class Pdf:
"""
Simple utility class for working with PDF files. This class wraps the PdfReader and PdfWriter classes from pypdf.
"""

def __init__(self, file: Union[str, Path, PdfReader]):
self.file_path = str(file) if isinstance(file, (str, Path)) else None
self.reader = PdfReader(file) if isinstance(file, (str, Path)) else file
self.writer = PdfWriter()
for page in self.reader.pages:
self.writer.add_page(page)

def extract_layout_pages(self) -> Iterator[LTPage]:
"""
Yields layout objects for each page in the PDF using pdfminer.six.
"""
assert (
self.file_path is not None
), "PDF file path is required for this method for now."

for page_layout in extract_pages(self.file_path):
yield page_layout

def save(self, output_pdf: Union[str, Path]) -> None:
"""
Saves the content from the PdfWriter to a new PDF file.
"""
with open(str(output_pdf), "wb") as out_pdf:
self.writer.write(out_pdf)

def extract_pages(self, start_page: int, end_page: int) -> None:
"""
Extracts a range of pages from the PDF and adds them to the PdfWriter.
"""
for page_num in range(start_page - 1, end_page):
self.writer.add_page(self.reader.pages[page_num])

def to_pymupdf_doc(self):
"""
Transforms the PDF into a PyMuPDF (fitz) document.
If modifications have been made using PdfWriter, it saves to a temporary file first.
This function dynamically imports PyMuPDF (fitz), requiring it only if this method is called.
"""
try:
import fitz
except ImportError:
raise ImportError(
"PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
)

if not self.writer.pages:
return fitz.open(self.file_path)

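# delete=False keeps the temporary file on disk so fitz can reopen it
# by name; cleanup of the file is left to the operating system or caller.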
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
self.writer.write(tmpfile.name)
return fitz.open(tmpfile.name)

def _draw_bboxes(
self,
bboxes: List[Bbox],
coordinates: Literal[
"pymupdf",
"pdfminer",
],
):
try:
import fitz
except ImportError:
raise ImportError(
"PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
)

pdf = self.to_pymupdf_doc()

for page in pdf:
page.wrap_contents()

for bbox in bboxes:
if bbox.page != page.number:
continue
if coordinates == "pdfminer":
bbox = self._flip_coordinates(bbox)
# fitz.Rect takes positional coordinates (x0, y0, x1, y1).
r = fitz.Rect(bbox.x0, bbox.y0, bbox.x1, bbox.y1)
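# Random stroke color; PyMuPDF expects RGB components in [0, 1].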
color = (
random.randint(0, 255) / 256,
random.randint(0, 255) / 256,
random.randint(0, 255) / 256,
)
page.draw_rect(r, color)
return pdf

def display_with_bboxes(
self,
bboxes: Union[List[Bbox], List[List[Bbox]]],
page_nums: Optional[List[int]] = None,
coordinates: Literal[
"pymupdf",
"pdfminer",
] = "pdfminer",
):
"""
Display a single page of a PDF file using IPython.
"""
try:
from IPython.display import Image, display # type: ignore
except ImportError:
raise ImportError(
"IPython is required to display PDFs. Please install it with `pip install ipython`."
)
flattened_bboxes = self._flatten_bboxes(bboxes)
marked_up_doc = self._draw_bboxes(flattened_bboxes, coordinates)
if not page_nums:
page_nums = list(range(marked_up_doc.page_count))
for page_num in page_nums:
page = marked_up_doc[page_num]
img_data = page.get_pixmap().tobytes("png")
display(Image(data=img_data))

def export_with_bboxes(
self,
bboxes: Union[List[Bbox], List[List[Bbox]]],
output_pdf: Union[str, Path],
coordinates: Literal[
"pymupdf",
"pdfminer",
] = "pdfminer",
) -> None:
flattened_bboxes = self._flatten_bboxes(bboxes)
marked_up_doc = self._draw_bboxes(flattened_bboxes, coordinates)
marked_up_doc.save(str(output_pdf))

def _flip_coordinates(self, bbox: Bbox) -> Bbox:
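"""
Mirrors the y-axis to convert a bbox between pdfminer coordinates
(origin at the bottom-left) and PyMuPDF coordinates (origin at the top-left).
"""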
fy0 = bbox.page_height - bbox.y1
fy1 = bbox.page_height - bbox.y0
return Bbox(
page=bbox.page,
page_height=bbox.page_height,
page_width=bbox.page_width,
x0=bbox.x0,
y0=fy0,
x1=bbox.x1,
y1=fy1,
)

def _flatten_bboxes(
self, bboxes: Union[List[Bbox], List[List[Bbox]]]
) -> List[Bbox]:
res = []
for element in bboxes:
if isinstance(element, Bbox):
res.append(element)
elif isinstance(element, list):
res.extend(bbox for bbox in element if isinstance(bbox, Bbox))
return res
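
For readers skimming the diff, here is a minimal sketch of how the new `Pdf` wrapper might be used. The file names are hypothetical, and the `Bbox` constructor is assumed to take the same fields that `_flip_coordinates` reads:

```python
from src.pdf import Pdf
from src.schemas import Bbox

pdf = Pdf("sample.pdf")  # hypothetical input file

# Iterate pdfminer layout objects page by page.
for page_layout in pdf.extract_layout_pages():
    print(page_layout.pageid)

# Draw one box (pdfminer coordinates, assumed field names) and export a marked-up copy.
box = Bbox(page=0, page_height=792, page_width=612, x0=72, y0=72, x1=300, y1=144)
pdf.export_with_bboxes([box], "annotated.pdf")
```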
2 changes: 1 addition & 1 deletion src/processing/__init__.py
@@ -1 +1 @@
from .ingest import ProcessingArgs, run_pipeline
from .ingest import run_pipeline
29 changes: 2 additions & 27 deletions src/processing/ingest.py
@@ -5,32 +5,7 @@
from src.processing.steps import ProcessingStep, default_pipeline


class ProcessingArgs(TypedDict, total=False):
min_tokens: int
max_tokens: int
processing_pipeline: List[ProcessingStep]


@dataclass
class ParsedProcessingArgs:
min_tokens: float = 128
max_tokens: float = 1024
processing_pipeline: List[ProcessingStep] = default_pipeline


def merge_with_defaults(user_args: Union[ProcessingArgs, None]) -> ParsedProcessingArgs:
args = ParsedProcessingArgs()

if user_args:
for key, value in user_args.items():
if hasattr(args, key):
setattr(args, key, value)

return args


def run_pipeline(nodes: List[Node], args: Union[ProcessingArgs, None]) -> List[Node]:
parsed_args = merge_with_defaults(args)
for transform in parsed_args.processing_pipeline:
def run_pipeline(nodes: List[Node], args: Union[dict, None]) -> List[Node]:
for transform in default_pipeline:
nodes = transform.process(nodes)
return nodes
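
With this simplification, `run_pipeline` ignores its `args` parameter and simply folds `default_pipeline` over the nodes. A minimal sketch of calling it, assuming `nodes` is a list of `Node` objects produced by ingestion:

```python
from src.processing import run_pipeline

# nodes: hypothetical List[Node] from text/table ingestion
processed = run_pipeline(nodes, args=None)  # args is currently unused
```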
28 changes: 22 additions & 6 deletions src/processing/steps.py
@@ -38,7 +38,7 @@ def process(self, nodes: List[Node]) -> List[Node]:


class RemoveMetadataElements(ProcessingStep):
def __init__(self, min_y0_pct: float = 0.12, max_y0_pct: float = 0.88):
def __init__(self, min_y0_pct: float = 0.10, max_y0_pct: float = 0.90):
self.min_y0_pct = min_y0_pct
self.max_y0_pct = max_y0_pct

@@ -140,13 +140,29 @@ def process(self, nodes: List[Node]) -> List[Node]:
raise NotImplementedError("Not yet implemented.")


# default_pipeline = [
# RemoveFullPageStubs(max_area_pct=0.5), # Adjust max_area_pct as needed
# CombineNodesSpatially(x_error_margin=4, y_error_margin=4, criteria="both_small"),
# CombineNodesSpatially(), # Default margins and criteria
# # CombineBullets(),
# RemoveMetadataElements(),
# CombineNodesSpatially(x_error_margin=4, y_error_margin=12, criteria="either_stub"),
# CombineNodesSpatially(criteria="either_stub"),
# # SplitLargeElements(), # Implement
# RemoveRepeatedElements(threshold=2),
# # CombineHeadingsWithClosestText(), # Implement
# ]

# optimized for pdfminer
default_pipeline = [
RemoveFullPageStubs(max_area_pct=0.5),
CombineNodesSpatially(x_error_margin=4, y_error_margin=4, criteria="both_small"),
CombineNodesSpatially(x_error_margin=0, y_error_margin=0, criteria="both_small"),
RemoveFullPageStubs(max_area_pct=0.5), # Adjust max_area_pct as needed
CombineNodesSpatially(x_error_margin=10, y_error_margin=4, criteria="both_small"),
CombineNodesSpatially(x_error_margin=0, y_error_margin=10, criteria="both_small"),
# CombineBullets(),
RemoveMetadataElements(),
# CombineHeadingsWithClosestText(),
RemoveStubs(),
CombineNodesSpatially(criteria="either_stub"),
# SplitLargeElements(), # Implement
RemoveRepeatedElements(threshold=2),
RemoveStubs(),
# CombineHeadingsWithClosestText(), # Implement
]