Merge pull request #111 from VikParuchuri/commercial

Add image extraction support

VikParuchuri authored May 7, 2024
2 parents 2f93800 + c8c1f06 commit fb738ef
Showing 27 changed files with 438 additions and 213 deletions.
31 changes: 19 additions & 12 deletions README.md
@@ -69,24 +69,31 @@ First, clone the repo:
- GPU only: run `pip install torch` to install other torch dependencies.
- CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions.

- Optional: Install system requirements, only needed if using `ocrmypdf` as the ocr backend
- Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
**Optional**

Only needed if using `ocrmypdf` as the ocr backend.

- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it

## Mac

- Install python requirements
- `poetry install`
- `poetry shell` to activate your poetry venv

- Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `brew list tesseract`
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
**Optional**

Only needed if using `ocrmypdf` as the ocr backend.

- Install system requirements from `scripts/install/brew-requirements.txt`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `brew list tesseract`
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it

# Usage

@@ -104,7 +111,7 @@ First, some configuration. Note that settings can be overridden with env vars,
Run `convert_single.py`, like this:

```
python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10 --langs English
python convert_single.py /path/to/file.pdf /path/to/output/folder --parallel_factor 2 --max_pages 10 --langs English
```

- `--parallel_factor` is how much to increase batch size and parallel OCR workers by. Higher numbers will take more VRAM and CPU, but process faster. Set to 1 by default.
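The headline API change: `convert_single_pdf` now returns extracted images alongside the text and metadata, and output writing moves into `marker.output.save_markdown`. A minimal sketch of the updated flow, mirroring the `convert_single.py` changes in this commit (paths are placeholders):

```
import os

from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.output import save_markdown

fname = "/path/to/file.pdf"  # placeholder input path
model_lst = load_all_models()

# The return value is now a 3-tuple: markdown text, extracted images, metadata.
full_text, images, out_meta = convert_single_pdf(
    fname, model_lst, max_pages=10, parallel_factor=2, langs=["English"]
)

# save_markdown writes the markdown, metadata, and images under the output folder.
save_markdown("/path/to/output/folder", os.path.basename(fname), full_text, images, out_meta)
```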
2 changes: 1 addition & 1 deletion benchmark.py
@@ -68,7 +68,7 @@ def main():
for method in methods:
start = time.time()
if method == "marker":
full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
elif method == "nougat":
full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
elif method == "naive":
13 changes: 4 additions & 9 deletions convert.py
@@ -7,6 +7,7 @@
import math

from marker.convert import convert_single_pdf
from marker.output import markdown_exists, save_markdown
from marker.pdf.utils import find_filetype
from marker.pdf.extract_text import get_length_of_text
from marker.models import load_all_models
@@ -20,10 +21,7 @@

@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
out_filename = fname.rsplit(".", 1)[0] + ".md"
out_filename = os.path.join(out_folder, os.path.basename(out_filename))
out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
if os.path.exists(out_filename):
if markdown_exists(out_folder, fname):
return
try:
# Skip trying to convert files that don't have a lot of embedded text
@@ -38,12 +36,9 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
if length < min_length:
return

full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
if len(full_text.strip()) > 0:
with open(out_filename, "w+", encoding='utf-8') as f:
f.write(full_text)
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_metadata, indent=4))
save_markdown(out_folder, fname, full_text, images, out_metadata)
else:
print(f"Empty file: {fname}. Could not convert.")
except Exception as e:
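`marker.output` is new in this commit but its source is not part of the hunks shown here. A plausible sketch of `markdown_exists` and `save_markdown`, inferred from the inline file writes they replace above plus the image filenames defined in `marker/images/save.py`. The exact folder layout is an assumption, not the repo's actual implementation:

```
import json
import os


def markdown_exists(out_folder, fname):
    # Assumed: checks for the markdown file the old inline code used to write.
    base = os.path.basename(fname).rsplit(".", 1)[0]
    return os.path.exists(os.path.join(out_folder, base + ".md"))


def save_markdown(out_folder, fname, full_text, images, out_metadata):
    # Assumed: replicates the old .md and _meta.json writes, and additionally
    # saves each extracted PIL image under its images_to_dict filename.
    base = os.path.basename(fname).rsplit(".", 1)[0]
    with open(os.path.join(out_folder, base + ".md"), "w+", encoding="utf-8") as f:
        f.write(full_text)
    with open(os.path.join(out_folder, base + "_meta.json"), "w+") as f:
        f.write(json.dumps(out_metadata, indent=4))
    for image_filename, image in images.items():
        image.save(os.path.join(out_folder, image_filename), "PNG")
```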
15 changes: 7 additions & 8 deletions convert_single.py
@@ -1,17 +1,20 @@
import argparse
import os

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
import json

from marker.output import save_markdown

configure_logging()


def main():
parser = argparse.ArgumentParser()
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output file name")
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
@@ -21,14 +24,10 @@ def main():

fname = args.filename
model_lst = load_all_models()
full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)

with open(args.output, "w+", encoding='utf-8') as f:
f.write(full_text)
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)

out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_meta, indent=4))
fname = os.path.basename(fname)
save_markdown(args.output, fname, full_text, images, out_meta)


if __name__ == "__main__":
21 changes: 13 additions & 8 deletions marker/convert.py
@@ -1,12 +1,10 @@
import warnings

from marker.cleaners.text import cleanup_text

warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

import pypdfium2 as pdfium
from PIL import Image

from marker.cleaners.table import arrange_table_rows
from marker.tables.table import format_tables
from marker.debug.data import dump_bbox_debug_data
from marker.layout.layout import surya_layout, annotate_block_types
from marker.layout.order import surya_order, sort_blocks_in_reading_order
@@ -23,9 +21,11 @@
from marker.cleaners.headings import split_heading_blocks
from marker.cleaners.fontstyle import find_bold_italic
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
from marker.cleaners.text import cleanup_text
from marker.images.extract import extract_images
from marker.images.save import images_to_dict

from typing import List, Dict, Tuple, Optional
import re
from marker.settings import settings


@@ -36,7 +36,7 @@ def convert_single_pdf(
metadata: Optional[Dict]=None,
parallel_factor: int = 1,
langs: Optional[List[str]] = None
) -> Tuple[str, Dict]:
) -> Tuple[str, Dict[str, Image.Image], Dict]:
# Set language needed for OCR
if langs is None:
langs = [settings.DEFAULT_LANG]
@@ -107,7 +107,7 @@ def convert_single_pdf(
indent_blocks(pages)

# Fix table blocks
table_count = arrange_table_rows(pages)
table_count = format_tables(pages)
out_meta["block_stats"]["table"] = table_count

for page in pages:
@@ -123,6 +123,10 @@ def convert_single_pdf(
)
out_meta["block_stats"]["equations"] = eq_stats

# Extract images and figures
if settings.EXTRACT_IMAGES:
extract_images(doc, pages)

# Split out headers
split_heading_blocks(pages)
find_bold_italic(pages)
@@ -146,5 +150,6 @@ def convert_single_pdf(
batch_size=settings.EDITOR_BATCH_SIZE * parallel_factor
)
out_meta["postprocess_stats"] = {"edit": edit_stats}
doc_images = images_to_dict(pages)

return full_text, out_meta
return full_text, doc_images, out_meta
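The new extraction step is gated by `settings.EXTRACT_IMAGES` (see the hunk above). The README notes that settings can be overridden with env vars, so, assuming `EXTRACT_IMAGES` follows the same convention as marker's other settings, it could be toggled roughly like this (the override mechanism and default value are assumptions):

```
import os

# Hypothetical override: set before marker.settings is first imported,
# assuming EXTRACT_IMAGES is env-configurable like marker's other settings.
os.environ["EXTRACT_IMAGES"] = "false"

from marker.settings import settings
print(settings.EXTRACT_IMAGES)
```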
28 changes: 18 additions & 10 deletions marker/equations/equations.py
@@ -3,11 +3,11 @@
from typing import List

from marker.debug.data import dump_equation_debug_data
from marker.equations.images import get_equation_image
from marker.equations.inference import get_total_texify_tokens, get_latex_batched
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block
from marker.settings import settings


@@ -30,21 +30,29 @@ def find_equation_blocks(page, processor):
if region_idx not in insert_points:
insert_points[region_idx] = (block_idx, line_idx)

# Account for regions where the lines were not detected
for region_idx, region in enumerate(equation_regions):
if region_idx in insert_points:
continue

insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)

block_lines_to_remove = defaultdict(set)
for region_idx, equation_region in enumerate(equation_regions):
if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
continue
equation_block = equation_lines[region_idx]
equation_insert = insert_points[region_idx]
block_text = " ".join([line.prelim_text for line in equation_block])
equation_bbox = bbox_from_lines(equation_block)
block_text = ""
total_tokens = 0
else:
equation_block = equation_lines[region_idx]
block_text = " ".join([line.prelim_text for line in equation_block])
total_tokens = get_total_texify_tokens(block_text, processor)

total_tokens = get_total_texify_tokens(block_text, processor)
equation_insert = insert_points[region_idx]
equation_insert_line_idx = equation_insert[1]
equation_insert_line_idx -= len(
[x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])

selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region]
if total_tokens < settings.TEXIFY_MODEL_MAX:
# Account for the lines we're about to remove
for item in lines_to_remove[region_idx]:
@@ -144,7 +152,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
for page_idx, page_equation_blocks in enumerate(equation_blocks):
page_obj = doc[page_idx]
for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox)

images.append(png_image)
token_counts.append(token_count)
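`find_insert_block` (newly imported from `marker.schema.block` above, and also used by `marker/images/extract.py` below) is not shown in this diff. A rough sketch of the behavior its call sites imply: choosing where a region with no detected lines should slot in, by vertical position. This is an assumption, not the repo's actual implementation:

```
def find_insert_block(blocks, region_bbox):
    # Assumed behavior: return the index of the first block that starts at or
    # below the region's top edge, so the region lands in reading order.
    region_top = region_bbox[1]
    for block_idx, block in enumerate(blocks):
        if block.bbox[1] >= region_top:
            return block_idx
    return max(len(blocks) - 1, 0)
```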
19 changes: 0 additions & 19 deletions marker/equations/images.py

This file was deleted.

62 changes: 62 additions & 0 deletions marker/images/extract.py
@@ -0,0 +1,62 @@
from marker.images.save import get_image_filename
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import find_insert_block, Span


def find_image_blocks(page):
image_blocks = []
image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]]
image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions]

insert_points = {}
for region_idx, region in enumerate(image_regions):
for block_idx, block in enumerate(page.blocks):
for line_idx, line in enumerate(block.lines):
if line.intersection_pct(region) > .8:
line.spans = [] # We will remove this line from the block

if region_idx not in insert_points:
insert_points[region_idx] = (block_idx, line_idx)

# Account for images with no detected lines
for region_idx, region in enumerate(image_regions):
if region_idx in insert_points:
continue

insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)

for region_idx, image_region in enumerate(image_regions):
image_insert = insert_points[region_idx]
image_blocks.append([image_insert[0], image_insert[1], image_region])

return image_blocks


def extract_page_images(page_obj, page):
page.images = []
image_blocks = find_image_blocks(page)

for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
block = page.blocks[block_idx]
image = render_bbox_image(page_obj, page, bbox)
image_filename = get_image_filename(page, image_idx)
image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
image_span = Span(
bbox=bbox,
text=image_markdown,
font="Image",
rotation=0,
font_weight=0,
font_size=0,
image=True,
span_id=f"image_{image_idx}"
)
block.lines[line_idx].spans.append(image_span)
page.images.append(image)


def extract_images(doc, pages):
for page_idx, page in enumerate(pages):
page_obj = doc[page_idx]
extract_page_images(page_obj, page)
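The net effect on the markdown output: each detected figure becomes a `Span` whose text is a relative image reference, so the rendered document links to image files saved alongside it. For illustration, the first image on a page with `pnum == 3` produces:

```
# Illustration only: span text for image 0 on a page with pnum == 3.
image_filename = "3_image_0.png"  # from get_image_filename(page, 0)
image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
```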
18 changes: 18 additions & 0 deletions marker/images/save.py
@@ -0,0 +1,18 @@
from typing import List

from marker.schema.page import Page


def get_image_filename(page: Page, image_idx):
return f"{page.pnum}_image_{image_idx}.png"


def images_to_dict(pages: List[Page]):
images = {}
for page in pages:
if page.images is None:
continue
for image_idx, image in enumerate(page.images):
image_filename = get_image_filename(page, image_idx)
images[image_filename] = image
return images
20 changes: 2 additions & 18 deletions marker/layout/order.py
@@ -4,6 +4,7 @@
from surya.ordering import batch_ordering

from marker.pdf.images import render_image
from marker.pdf.utils import sort_block_group
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.settings import settings
@@ -55,21 +56,4 @@ def sort_blocks_in_reading_order(pages: List[Page]):
block_group = sort_block_group(block_groups[position])
new_blocks.extend(block_group)

page.blocks = new_blocks


def sort_block_group(blocks, tolerance=1.25):
vertical_groups = {}
for block in blocks:
group_key = round(block.bbox[1] / tolerance) * tolerance
if group_key not in vertical_groups:
vertical_groups[group_key] = []
vertical_groups[group_key].append(block)

# Sort each group horizontally and flatten the groups into a single list
sorted_blocks = []
for _, group in sorted(vertical_groups.items()):
sorted_group = sorted(group, key=lambda x: x.bbox[0])
sorted_blocks.extend(sorted_group)

return sorted_blocks
page.blocks = new_blocks
