Merge pull request #111 from VikParuchuri/commercial

Add image extraction support

VikParuchuri authored May 7, 2024
2 parents 2f93800 + c8c1f06 commit fb738ef
Showing 27 changed files with 438 additions and 213 deletions.
31 changes: 19 additions & 12 deletions README.md
@@ -69,24 +69,31 @@ First, clone the repo:
- GPU only: run `pip install torch` to install other torch dependencies.
- CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions.

- Optional: Install system requirements, only needed if using `ocrmypdf` as the ocr backend
- Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
**Optional**

Only needed if using `ocrmypdf` as the ocr backend.

- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it

## Mac

- Install python requirements
- `poetry install`
- `poetry shell` to activate your poetry venv

- Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `brew list tesseract`
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
**Optional**

Only needed if using `ocrmypdf` as the ocr backend.

- Install system requirements from `scripts/install/brew-requirements.txt`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `brew list tesseract`
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it

# Usage

@@ -104,7 +111,7 @@ First, some configuration. Note that settings can be overridden with env vars,
Run `convert_single.py`, like this:

```
python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10 --langs English
python convert_single.py /path/to/file.pdf /path/to/output/folder --parallel_factor 2 --max_pages 10 --langs English
```

- `--parallel_factor` is how much to increase batch size and parallel OCR workers by. Higher numbers will take more VRAM and CPU, but process faster. Set to 1 by default.
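The headline API change: `convert_single_pdf` now returns extracted images alongside the text and metadata, and output writing moves into `marker.output.save_markdown`. A minimal sketch of the updated flow, mirroring the `convert_single.py` changes in this commit (paths are placeholders):

```
import os

from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.output import save_markdown

fname = "/path/to/file.pdf"  # placeholder input path
model_lst = load_all_models()

# The return value is now a 3-tuple: markdown text, extracted images, metadata.
full_text, images, out_meta = convert_single_pdf(
    fname, model_lst, max_pages=10, parallel_factor=2, langs=["English"]
)

# save_markdown writes the markdown, metadata, and images under the output folder.
save_markdown("/path/to/output/folder", os.path.basename(fname), full_text, images, out_meta)
```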
2 changes: 1 addition & 1 deletion benchmark.py
@@ -68,7 +68,7 @@ def main():
for method in methods:
start = time.time()
if method == "marker":
full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
elif method == "nougat":
full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
elif method == "naive":
13 changes: 4 additions & 9 deletions convert.py
@@ -7,6 +7,7 @@
import math

from marker.convert import convert_single_pdf
from marker.output import markdown_exists, save_markdown
from marker.pdf.utils import find_filetype
from marker.pdf.extract_text import get_length_of_text
from marker.models import load_all_models
@@ -20,10 +21,7 @@

@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
out_filename = fname.rsplit(".", 1)[0] + ".md"
out_filename = os.path.join(out_folder, os.path.basename(out_filename))
out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
if os.path.exists(out_filename):
if markdown_exists(out_folder, fname):
return
try:
# Skip trying to convert files that don't have a lot of embedded text
@@ -38,12 +36,9 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
if length < min_length:
return

full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
if len(full_text.strip()) > 0:
with open(out_filename, "w+", encoding='utf-8') as f:
f.write(full_text)
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_metadata, indent=4))
save_markdown(out_folder, fname, full_text, images, out_metadata)
else:
print(f"Empty file: {fname}. Could not convert.")
except Exception as e:
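`marker.output` is new in this commit but its source is not part of the hunks shown here. A plausible sketch of `markdown_exists` and `save_markdown`, inferred from the inline file writes they replace above plus the image filenames defined in `marker/images/save.py`. The exact folder layout is an assumption, not the repo's actual implementation:

```
import json
import os


def markdown_exists(out_folder, fname):
    # Assumed: checks for the markdown file the old inline code used to write.
    base = os.path.basename(fname).rsplit(".", 1)[0]
    return os.path.exists(os.path.join(out_folder, base + ".md"))


def save_markdown(out_folder, fname, full_text, images, out_metadata):
    # Assumed: replicates the old .md and _meta.json writes, and additionally
    # saves each extracted PIL image under its images_to_dict filename.
    base = os.path.basename(fname).rsplit(".", 1)[0]
    with open(os.path.join(out_folder, base + ".md"), "w+", encoding="utf-8") as f:
        f.write(full_text)
    with open(os.path.join(out_folder, base + "_meta.json"), "w+") as f:
        f.write(json.dumps(out_metadata, indent=4))
    for image_filename, image in images.items():
        image.save(os.path.join(out_folder, image_filename), "PNG")
```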
15 changes: 7 additions & 8 deletions convert_single.py
@@ -1,17 +1,20 @@
import argparse
import os

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
import json

from marker.output import save_markdown

configure_logging()


def main():
parser = argparse.ArgumentParser()
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output file name")
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
@@ -21,14 +24,10 @@ def main():

fname = args.filename
model_lst = load_all_models()
full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)

with open(args.output, "w+", encoding='utf-8') as f:
f.write(full_text)
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)

out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_meta, indent=4))
fname = os.path.basename(fname)
save_markdown(args.output, fname, full_text, images, out_meta)


if __name__ == "__main__":
21 changes: 13 additions & 8 deletions marker/convert.py
@@ -1,12 +1,10 @@
import warnings

from marker.cleaners.text import cleanup_text

warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

import pypdfium2 as pdfium
from PIL import Image

from marker.cleaners.table import arrange_table_rows
from marker.tables.table import format_tables
from marker.debug.data import dump_bbox_debug_data
from marker.layout.layout import surya_layout, annotate_block_types
from marker.layout.order import surya_order, sort_blocks_in_reading_order
@@ -23,9 +21,11 @@
from marker.cleaners.headings import split_heading_blocks
from marker.cleaners.fontstyle import find_bold_italic
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
from marker.cleaners.text import cleanup_text
from marker.images.extract import extract_images
from marker.images.save import images_to_dict

from typing import List, Dict, Tuple, Optional
import re
from marker.settings import settings


@@ -36,7 +36,7 @@ def convert_single_pdf(
metadata: Optional[Dict]=None,
parallel_factor: int = 1,
langs: Optional[List[str]] = None
) -> Tuple[str, Dict]:
) -> Tuple[str, Dict[str, Image.Image], Dict]:
# Set language needed for OCR
if langs is None:
langs = [settings.DEFAULT_LANG]
@@ -107,7 +107,7 @@ def convert_single_pdf(
indent_blocks(pages)

# Fix table blocks
table_count = arrange_table_rows(pages)
table_count = format_tables(pages)
out_meta["block_stats"]["table"] = table_count

for page in pages:
@@ -123,6 +123,10 @@ def convert_single_pdf(
)
out_meta["block_stats"]["equations"] = eq_stats

# Extract images and figures
if settings.EXTRACT_IMAGES:
extract_images(doc, pages)

# Split out headers
split_heading_blocks(pages)
find_bold_italic(pages)
@@ -146,5 +150,6 @@ def convert_single_pdf(
batch_size=settings.EDITOR_BATCH_SIZE * parallel_factor
)
out_meta["postprocess_stats"] = {"edit": edit_stats}
doc_images = images_to_dict(pages)

return full_text, out_meta
return full_text, doc_images, out_meta
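The new extraction step is gated by `settings.EXTRACT_IMAGES` (see the hunk above). The README notes that settings can be overridden with env vars, so, assuming `EXTRACT_IMAGES` follows the same convention as marker's other settings, it could be toggled roughly like this (the override mechanism and default value are assumptions):

```
import os

# Hypothetical override: set before marker.settings is first imported,
# assuming EXTRACT_IMAGES is env-configurable like marker's other settings.
os.environ["EXTRACT_IMAGES"] = "false"

from marker.settings import settings
print(settings.EXTRACT_IMAGES)
```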
28 changes: 18 additions & 10 deletions marker/equations/equations.py
@@ -3,11 +3,11 @@
from typing import List

from marker.debug.data import dump_equation_debug_data
from marker.equations.images import get_equation_image
from marker.equations.inference import get_total_texify_tokens, get_latex_batched
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block
from marker.settings import settings


@@ -30,21 +30,29 @@ def find_equation_blocks(page, processor):
if region_idx not in insert_points:
insert_points[region_idx] = (block_idx, line_idx)

# Account for regions where the lines were not detected
for region_idx, region in enumerate(equation_regions):
if region_idx in insert_points:
continue

insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)

block_lines_to_remove = defaultdict(set)
for region_idx, equation_region in enumerate(equation_regions):
if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
continue
equation_block = equation_lines[region_idx]
equation_insert = insert_points[region_idx]
block_text = " ".join([line.prelim_text for line in equation_block])
equation_bbox = bbox_from_lines(equation_block)
block_text = ""
total_tokens = 0
else:
equation_block = equation_lines[region_idx]
block_text = " ".join([line.prelim_text for line in equation_block])
total_tokens = get_total_texify_tokens(block_text, processor)

total_tokens = get_total_texify_tokens(block_text, processor)
equation_insert = insert_points[region_idx]
equation_insert_line_idx = equation_insert[1]
equation_insert_line_idx -= len(
[x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])

selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region]
if total_tokens < settings.TEXIFY_MODEL_MAX:
# Account for the lines we're about to remove
for item in lines_to_remove[region_idx]:
@@ -144,7 +152,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
for page_idx, page_equation_blocks in enumerate(equation_blocks):
page_obj = doc[page_idx]
for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox)

images.append(png_image)
token_counts.append(token_count)
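`find_insert_block` (newly imported from `marker.schema.block` above, and also used by `marker/images/extract.py` below) is not shown in this diff. A rough sketch of the behavior its call sites imply: choosing where a region with no detected lines should slot in, by vertical position. This is an assumption, not the repo's actual implementation:

```
def find_insert_block(blocks, region_bbox):
    # Assumed behavior: return the index of the first block that starts at or
    # below the region's top edge, so the region lands in reading order.
    region_top = region_bbox[1]
    for block_idx, block in enumerate(blocks):
        if block.bbox[1] >= region_top:
            return block_idx
    return max(len(blocks) - 1, 0)
```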
19 changes: 0 additions & 19 deletions marker/equations/images.py

This file was deleted.

62 changes: 62 additions & 0 deletions marker/images/extract.py
@@ -0,0 +1,62 @@
from marker.images.save import get_image_filename
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import find_insert_block, Span


def find_image_blocks(page):
image_blocks = []
image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]]
image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions]

insert_points = {}
for region_idx, region in enumerate(image_regions):
for block_idx, block in enumerate(page.blocks):
for line_idx, line in enumerate(block.lines):
if line.intersection_pct(region) > .8:
line.spans = [] # We will remove this line from the block

if region_idx not in insert_points:
insert_points[region_idx] = (block_idx, line_idx)

# Account for images with no detected lines
for region_idx, region in enumerate(image_regions):
if region_idx in insert_points:
continue

insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)

for region_idx, image_region in enumerate(image_regions):
image_insert = insert_points[region_idx]
image_blocks.append([image_insert[0], image_insert[1], image_region])

return image_blocks


def extract_page_images(page_obj, page):
page.images = []
image_blocks = find_image_blocks(page)

for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
block = page.blocks[block_idx]
image = render_bbox_image(page_obj, page, bbox)
image_filename = get_image_filename(page, image_idx)
image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
image_span = Span(
bbox=bbox,
text=image_markdown,
font="Image",
rotation=0,
font_weight=0,
font_size=0,
image=True,
span_id=f"image_{image_idx}"
)
block.lines[line_idx].spans.append(image_span)
page.images.append(image)


def extract_images(doc, pages):
for page_idx, page in enumerate(pages):
page_obj = doc[page_idx]
extract_page_images(page_obj, page)
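The net effect on the markdown output: each detected figure becomes a `Span` whose text is a relative image reference, so the rendered document links to image files saved alongside it. For illustration, the first image on a page with `pnum == 3` produces:

```
# Illustration only: span text for image 0 on a page with pnum == 3.
image_filename = "3_image_0.png"  # from get_image_filename(page, 0)
image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
```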
18 changes: 18 additions & 0 deletions marker/images/save.py
@@ -0,0 +1,18 @@
from typing import List

from marker.schema.page import Page


def get_image_filename(page: Page, image_idx):
return f"{page.pnum}_image_{image_idx}.png"


def images_to_dict(pages: List[Page]):
images = {}
for page in pages:
if page.images is None:
continue
for image_idx, image in enumerate(page.images):
image_filename = get_image_filename(page, image_idx)
images[image_filename] = image
return images
20 changes: 2 additions & 18 deletions marker/layout/order.py
@@ -4,6 +4,7 @@
from surya.ordering import batch_ordering

from marker.pdf.images import render_image
from marker.pdf.utils import sort_block_group
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.settings import settings
@@ -55,21 +56,4 @@ def sort_blocks_in_reading_order(pages: List[Page]):
block_group = sort_block_group(block_groups[position])
new_blocks.extend(block_group)

page.blocks = new_blocks


def sort_block_group(blocks, tolerance=1.25):
vertical_groups = {}
for block in blocks:
group_key = round(block.bbox[1] / tolerance) * tolerance
if group_key not in vertical_groups:
vertical_groups[group_key] = []
vertical_groups[group_key].append(block)

# Sort each group horizontally and flatten the groups into a single list
sorted_blocks = []
for _, group in sorted(vertical_groups.items()):
sorted_group = sorted(group, key=lambda x: x.bbox[0])
sorted_blocks.extend(sorted_group)

return sorted_blocks
page.blocks = new_blocks
