From 6198478597c4922b79b12cd494d55c0d9cd14a61 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 6 May 2024 11:09:27 -0700 Subject: [PATCH 1/4] Fix rotation issues --- marker/models.py | 2 +- marker/ocr/heuristics.py | 2 +- marker/ocr/lang.py | 4 ++-- marker/ocr/recognition.py | 2 +- marker/pdf/extract_text.py | 8 +++++++- marker/settings.py | 9 +-------- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 8 files changed, 18 insertions(+), 19 deletions(-) diff --git a/marker/models.py b/marker/models.py index 72cb2b62..a77fb7e1 100644 --- a/marker/models.py +++ b/marker/models.py @@ -50,7 +50,7 @@ def load_all_models(langs=None): layout = setup_layout_model() order = setup_order_model() edit = load_editing_model() - ocr = setup_recognition_model(langs) if settings.OCR_ENGINE_INTERNAL == "surya" else None + ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None texify = setup_texify_model() model_lst = [texify, layout, order, edit, detection, ocr] return model_lst diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index 2fdb9d8e..d7bca5bb 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3) total_intersection = 0 for block in page.blocks: for line in block.lines: - intersection_pct = box_intersection_pct(line.bbox, detected_bbox) + intersection_pct = box_intersection_pct(detected_bbox, line.bbox) total_intersection += intersection_pct if total_intersection > intersect_thresh: found_lines += 1 diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py index 8240b057..82d6cc0e 100644 --- a/marker/ocr/lang.py +++ b/marker/ocr/lang.py @@ -5,7 +5,7 @@ def replace_langs_with_codes(langs): - if settings.OCR_ENGINE_INTERNAL == "surya": + if settings.OCR_ENGINE == "surya": for i, lang in enumerate(langs): if lang in LANGUAGE_TO_CODE: langs[i] = LANGUAGE_TO_CODE[lang] @@ -17,7 +17,7 @@ def replace_langs_with_codes(langs): def validate_langs(langs): - if settings.OCR_ENGINE_INTERNAL == "surya": + if settings.OCR_ENGINE == "surya": for lang in langs: if lang not in CODE_TO_LANGUAGE: raise ValueError(f"Invalid language code {lang} for Surya OCR") diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index 6da62d8d..24dfbbc7 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -28,7 +28,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor ocr_idxs.append(pnum) ocr_pages += 1 - ocr_method = settings.OCR_ENGINE_INTERNAL + ocr_method = settings.OCR_ENGINE if ocr_method == "surya": new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages) else: diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index bf10e906..4d5aa317 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -57,12 +57,18 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: page_bbox = page["bbox"] page_width = abs(page_bbox[2] - page_bbox[0]) page_height = abs(page_bbox[3] - page_bbox[1]) + rotation = page["rotation"] + + # Flip width and height if rotated + if rotation == 90 or rotation == 270: + page_width, page_height = page_height, page_width + page_bbox = [0, 0, page_width, page_height] out_page = Page( blocks=page_blocks, pnum=page["page"], bbox=page_bbox, - rotation=page["rotation"], + rotation=rotation, char_blocks=page["blocks"] ) return out_page diff --git a/marker/settings.py b/marker/settings.py index 9bdc1490..5566e5fb 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -44,7 +44,7 @@ def TORCH_DEVICE_MODEL(self) -> str: # OCR INVALID_CHARS: List[str] = [chr(0xfffd), "�"] - OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU. + OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU. OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted ## Surya @@ -56,13 +56,6 @@ def TORCH_DEVICE_MODEL(self) -> str: TESSERACT_TIMEOUT: int = 20 # When to give up on OCR TESSDATA_PREFIX: str = "" - @computed_field - def OCR_ENGINE_INTERNAL(self) -> str: - if self.OCR_ENGINE is not None: - return self.OCR_ENGINE - - return "surya" - # Texify model TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify diff --git a/poetry.lock b/poetry.lock index 925699e8..50eb44e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2525,13 +2525,13 @@ image = ["Pillow"] [[package]] name = "pdftext" -version = "0.3.5" +version = "0.3.6" description = "Extract structured text from pdfs quickly" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" files = [ - {file = "pdftext-0.3.5-py3-none-any.whl", hash = "sha256:2a1649b1f2b8ea563fd4f2a3a7227afb0693622b5e3820bca390817d92f228c7"}, - {file = "pdftext-0.3.5.tar.gz", hash = "sha256:bd2c4c918889894488b18fa6395eff77138dcb8762fc3c44f08a402597618d41"}, + {file = "pdftext-0.3.6-py3-none-any.whl", hash = "sha256:82c6b0c1e3e1116446c9a5e31f1e15b078cf9195e1cff608e24f9fd5826a88df"}, + {file = "pdftext-0.3.6.tar.gz", hash = "sha256:91be26c76c2a496054d64875edf17349dbf5c17c40bb47f844dc0d9b95d4b7e2"}, ] [package.dependencies] @@ -4990,4 +4990,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13,!=3.9.7" -content-hash = "459483572dd8347587db50c0e627b839b6b061af2af022ab8d893c70905b04cb" +content-hash = "8759c2dc6b9d345ae966f2fe10bb8ee9a2bb93c2d6a07ec2a7d2ec4d57bd3b2c" diff --git a/pyproject.toml b/pyproject.toml index ff0c2ff4..219de389 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ texify = "^0.1.8" rapidfuzz = "^3.8.1" surya-ocr = "^0.4.0" filetype = "^1.2.0" -pdftext = "^0.3.4" +pdftext = "^0.3.6" regex = "^2024.4.28" [tool.poetry.group.dev.dependencies] From 77a99f37b35e7124915c230c5e86f9518349f7cd Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 6 May 2024 20:33:55 -0700 Subject: [PATCH 2/4] Work on tables --- marker/convert.py | 5 +- marker/equations/equations.py | 28 ++++--- marker/equations/images.py | 19 ----- marker/images/extract.py | 53 +++++++++++++ marker/layout/order.py | 20 +---- marker/ocr/heuristics.py | 2 +- marker/pdf/extract_text.py | 5 +- marker/pdf/images.py | 19 ++++- marker/pdf/utils.py | 22 ++++++ marker/schema/block.py | 21 +++++ marker/schema/page.py | 3 +- marker/settings.py | 1 + marker/tables/cells.py | 89 +++++++++++++++++++++ marker/{cleaners => tables}/table.py | 114 ++++++--------------------- marker/tables/utils.py | 37 +++++++++ 15 files changed, 295 insertions(+), 143 deletions(-) delete mode 100644 marker/equations/images.py create mode 100644 marker/images/extract.py create mode 100644 marker/tables/cells.py rename marker/{cleaners => tables}/table.py (64%) create mode 100644 marker/tables/utils.py diff --git a/marker/convert.py b/marker/convert.py index f3965be6..f475d3d4 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -6,7 +6,7 @@ import pypdfium2 as pdfium -from marker.cleaners.table import arrange_table_rows +from marker.tables.table import format_tables from marker.debug.data import dump_bbox_debug_data from marker.layout.layout import surya_layout, annotate_block_types from marker.layout.order import surya_order, sort_blocks_in_reading_order @@ -25,7 +25,6 @@ from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text from typing import List, Dict, Tuple, Optional -import re from marker.settings import settings @@ -107,7 +106,7 @@ def convert_single_pdf( indent_blocks(pages) # Fix table blocks - table_count = arrange_table_rows(pages) + table_count = format_tables(pages) out_meta["block_stats"]["table"] = table_count for page in pages: diff --git a/marker/equations/equations.py b/marker/equations/equations.py index da23b136..32df8d4d 100644 --- a/marker/equations/equations.py +++ b/marker/equations/equations.py @@ -3,11 +3,11 @@ from typing import List from marker.debug.data import dump_equation_debug_data -from marker.equations.images import get_equation_image from marker.equations.inference import get_total_texify_tokens, get_latex_batched +from marker.pdf.images import render_bbox_image from marker.schema.bbox import rescale_bbox from marker.schema.page import Page -from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines +from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block from marker.settings import settings @@ -30,21 +30,29 @@ def find_equation_blocks(page, processor): if region_idx not in insert_points: insert_points[region_idx] = (block_idx, line_idx) + # Account for regions where the lines were not detected + for region_idx, region in enumerate(equation_regions): + if region_idx in insert_points: + continue + + insert_points[region_idx] = (find_insert_block(page.blocks, region), 0) + block_lines_to_remove = defaultdict(set) for region_idx, equation_region in enumerate(equation_regions): if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0: - continue - equation_block = equation_lines[region_idx] - equation_insert = insert_points[region_idx] - block_text = " ".join([line.prelim_text for line in equation_block]) - equation_bbox = bbox_from_lines(equation_block) + block_text = "" + total_tokens = 0 + else: + equation_block = equation_lines[region_idx] + block_text = " ".join([line.prelim_text for line in equation_block]) + total_tokens = get_total_texify_tokens(block_text, processor) - total_tokens = get_total_texify_tokens(block_text, processor) + equation_insert = insert_points[region_idx] equation_insert_line_idx = equation_insert[1] equation_insert_line_idx -= len( [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]]) - selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox] + selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region] if total_tokens < settings.TEXIFY_MODEL_MAX: # Account for the lines we're about to remove for item in lines_to_remove[region_idx]: @@ -144,7 +152,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings. for page_idx, page_equation_blocks in enumerate(equation_blocks): page_obj = doc[page_idx] for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): - png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox) + png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox) images.append(png_image) token_counts.append(token_count) diff --git a/marker/equations/images.py b/marker/equations/images.py deleted file mode 100644 index f6d4644f..00000000 --- a/marker/equations/images.py +++ /dev/null @@ -1,19 +0,0 @@ -from pypdfium2 import PdfPage - -from marker.pdf.images import render_image -from marker.schema.bbox import rescale_bbox -from marker.schema.page import Page -from marker.settings import settings - - -def get_equation_image(page_obj: PdfPage, page: Page, bbox): - rescaled_bboxes = [] - png_image = render_image(page_obj, settings.TEXIFY_DPI) - # Rescale original pdf bbox bounds to match png image size - png_bbox = [0, 0, png_image.size[0], png_image.size[1]] - rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox) - - # Crop out only the equation image - png_image = png_image.crop(rescaled_merged) - png_image = png_image.convert("RGB") - return png_image diff --git a/marker/images/extract.py b/marker/images/extract.py new file mode 100644 index 00000000..ae7d3367 --- /dev/null +++ b/marker/images/extract.py @@ -0,0 +1,53 @@ +from marker.pdf.images import render_bbox_image +from marker.schema.bbox import rescale_bbox +from marker.schema.block import find_insert_block, Span + + +def find_image_blocks(page): + image_blocks = [] + image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]] + image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions] + + insert_points = {} + for region_idx, region in enumerate(image_regions): + for block_idx, block in enumerate(page.blocks): + for line_idx, line in enumerate(block.lines): + if line.intersection_pct(region) > .8: + line.spans = [] # We will remove this line from the block + + if region_idx not in insert_points: + insert_points[region_idx] = (block_idx, line_idx) + + # Account for images with no detected lines + for region_idx, region in enumerate(image_regions): + if region_idx in insert_points: + continue + + insert_points[region_idx] = (find_insert_block(page.blocks, region), 0) + + for region_idx, image_region in enumerate(image_regions): + image_insert = insert_points[region_idx] + image_blocks.append([image_insert[0], image_insert[1], image_region]) + + return image_blocks + + +def extract_images(page): + image_blocks = find_image_blocks(page) + + for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks): + block = page.blocks[block_idx] + image = render_bbox_image(page.page_obj, page, bbox) + image_filename = f"{page.pnum}_image_{image_idx}.png" + image_markdown = f"![{image_filename}]({image_filename})" + image_span = Span( + bbox=bbox, + text=image_markdown, + font="Image", + rotation=0, + font_weight=0, + font_size=0, + image=True + ) + block.lines[line_idx].spans.append(image_span) + page.images.append(image) diff --git a/marker/layout/order.py b/marker/layout/order.py index 9833f5de..76f9fbc0 100644 --- a/marker/layout/order.py +++ b/marker/layout/order.py @@ -4,6 +4,7 @@ from surya.ordering import batch_ordering from marker.pdf.images import render_image +from marker.pdf.utils import sort_block_group from marker.schema.bbox import rescale_bbox from marker.schema.page import Page from marker.settings import settings @@ -55,21 +56,4 @@ def sort_blocks_in_reading_order(pages: List[Page]): block_group = sort_block_group(block_groups[position]) new_blocks.extend(block_group) - page.blocks = new_blocks - - -def sort_block_group(blocks, tolerance=1.25): - vertical_groups = {} - for block in blocks: - group_key = round(block.bbox[1] / tolerance) * tolerance - if group_key not in vertical_groups: - vertical_groups[group_key] = [] - vertical_groups[group_key].append(block) - - # Sort each group horizontally and flatten the groups into a single list - sorted_blocks = [] - for _, group in sorted(vertical_groups.items()): - sorted_group = sorted(group, key=lambda x: x.bbox[0]) - sorted_blocks.extend(sorted_group) - - return sorted_blocks \ No newline at end of file + page.blocks = new_blocks \ No newline at end of file diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index d7bca5bb..ffe6e422 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]): return len(full_text.strip()) == 0 -def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3): +def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6): found_lines = 0 for detected_line in page.text_lines.bboxes: diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index 4d5aa317..ea9182e1 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -4,7 +4,7 @@ import pypdfium2 as pdfium import pypdfium2.internal as pdfium_i -from marker.pdf.utils import find_filetype, font_flags_decomposer +from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group from marker.ocr.heuristics import detect_bad_ocr from marker.settings import settings from marker.schema.block import Span, Line, Block @@ -63,13 +63,14 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: if rotation == 90 or rotation == 270: page_width, page_height = page_height, page_width + char_blocks = page["blocks"] page_bbox = [0, 0, page_width, page_height] out_page = Page( blocks=page_blocks, pnum=page["page"], bbox=page_bbox, rotation=rotation, - char_blocks=page["blocks"] + char_blocks=char_blocks ) return out_page diff --git a/marker/pdf/images.py b/marker/pdf/images.py index 2264c28c..1bf24b56 100644 --- a/marker/pdf/images.py +++ b/marker/pdf/images.py @@ -1,4 +1,9 @@ import pypdfium2 as pdfium +from pypdfium2 import PdfPage + +from marker.schema.page import Page +from marker.schema.bbox import rescale_bbox +from marker.settings import settings def render_image(page: pdfium.PdfPage, dpi): @@ -7,4 +12,16 @@ def render_image(page: pdfium.PdfPage, dpi): draw_annots=False ).to_pil() image = image.convert("RGB") - return image \ No newline at end of file + return image + + +def render_bbox_image(page_obj: PdfPage, page: Page, bbox): + png_image = render_image(page_obj, settings.IMAGE_DPI) + # Rescale original pdf bbox bounds to match png image size + png_bbox = [0, 0, png_image.size[0], png_image.size[1]] + rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox) + + # Crop out only the equation image + png_image = png_image.crop(rescaled_merged) + png_image = png_image.convert("RGB") + return png_image \ No newline at end of file diff --git a/marker/pdf/utils.py b/marker/pdf/utils.py index 1512c17b..e15e9f37 100644 --- a/marker/pdf/utils.py +++ b/marker/pdf/utils.py @@ -52,3 +52,25 @@ def font_flags_decomposer(flags: Optional[int]) -> str: flag_descriptions.append("use_extern_attr") return "_".join(flag_descriptions) + + +def sort_block_group(blocks, tolerance=1.25): + vertical_groups = {} + for block in blocks: + if hasattr(block, "bbox"): + bbox = block.bbox + else: + bbox = block["bbox"] + + group_key = round(bbox[1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(block) + + # Sort each group horizontally and flatten the groups into a single list + sorted_blocks = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) + sorted_blocks.extend(sorted_group) + + return sorted_blocks diff --git a/marker/schema/block.py b/marker/schema/block.py index 1220b698..50ae95c6 100644 --- a/marker/schema/block.py +++ b/marker/schema/block.py @@ -1,3 +1,4 @@ +import math from typing import List, Optional from pydantic import field_validator @@ -19,6 +20,7 @@ class Span(BboxElement): font_size: float bold: Optional[bool] = None italic: Optional[bool] = None + image: Optional[bool] = None @field_validator('text') @@ -98,3 +100,22 @@ def split_block_lines(block: Block, split_line_idx: int): new_blocks.append(Block(lines=block.lines[:split_line_idx], bbox=bbox_from_lines(block.lines[:split_line_idx]), pnum=block.pnum)) new_blocks.append(Block(lines=block.lines[split_line_idx:], bbox=bbox_from_lines(block.lines[split_line_idx:]), pnum=block.pnum)) return new_blocks + + +def find_insert_block(blocks: List[Block], bbox): + nearest_match = None + match_dist = None + for idx, block in enumerate(blocks): + try: + dist = math.sqrt((block.bbox[1] - bbox[1]) ** 2 + (block.bbox[0] - bbox[0]) ** 2) + except Exception as e: + continue + + if nearest_match is None or dist < match_dist: + nearest_match = idx + match_dist = dist + if nearest_match is None: + return 0 + return nearest_match + + diff --git a/marker/schema/page.py b/marker/schema/page.py index 407939eb..c4fca410 100644 --- a/marker/schema/page.py +++ b/marker/schema/page.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Any from marker.schema.bbox import BboxElement from marker.schema.block import Block, Span @@ -15,6 +15,7 @@ class Page(BboxElement): order: Optional[OrderResult] = None ocr_method: Optional[str] = None # One of "surya" or "tesseract" char_blocks: Optional[List[Dict]] = None # Blocks with character-level data from pdftext + images: Optional[List[Any]] = None # Images to save along with the page, need Any to avoid pydantic error def get_nonblank_lines(self): lines = self.get_all_lines() diff --git a/marker/settings.py b/marker/settings.py index 5566e5fb..5eb85bed 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -10,6 +10,7 @@ class Settings(BaseSettings): # General TORCH_DEVICE: Optional[str] = None + IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at @computed_field @property diff --git a/marker/tables/cells.py b/marker/tables/cells.py new file mode 100644 index 00000000..d4524314 --- /dev/null +++ b/marker/tables/cells.py @@ -0,0 +1,89 @@ +from marker.schema.bbox import rescale_bbox, box_intersection_pct +from marker.schema.page import Page + + +def find_row_separators(page: Page, table_box, round_factor=4): + top_edges = [] + bottom_edges = [] + + line_boxes = [p.bbox for p in page.text_lines.bboxes] + line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes] + line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8] + + min_count = len(line_boxes) / 3 + + for cell in line_boxes: + top_edges.append(cell[1] / round_factor * round_factor) + bottom_edges.append(cell[3] / round_factor * round_factor) + + top_edges = [t for t in top_edges if top_edges.count(t) > min_count] + bottom_edges = [b for b in bottom_edges if bottom_edges.count(b) > min_count] + + unique_top = sorted(list(set(top_edges))) + unique_bottom = sorted(list(set(bottom_edges))) + + separators = min([unique_top, unique_bottom], key=len) + + # Add the top and bottom of the page as separators, to grab all possible cells + separators.append(page.bbox[3]) + separators.insert(0, page.bbox[1]) + return separators + + +def find_column_separators(page: Page, table_box, round_factor=4): + left_edges = [] + right_edges = [] + centers = [] + + line_boxes = [p.bbox for p in page.text_lines.bboxes] + line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes] + line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8] + + min_count = len(line_boxes) / 3 + for cell in line_boxes: + left_edges.append(cell[0] / round_factor * round_factor) + right_edges.append(cell[2] / round_factor * round_factor) + centers.append((cell[0] + cell[2]) / 2 * round_factor / round_factor) + + left_edges = [l for l in left_edges if left_edges.count(l) > min_count] + right_edges = [r for r in right_edges if right_edges.count(r) > min_count] + centers = [c for c in centers if centers.count(c) > min_count] + + unique_left = sorted(list(set(left_edges))) + unique_right = sorted(list(set(right_edges))) + unique_center = sorted(list(set(centers))) + + # Find list with minimum length + separators = min([unique_left, unique_right, unique_center], key=len) + separators.append(page.bbox[2]) + separators.insert(0, page.bbox[0]) + return separators + + +def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4): + separators = find_column_separators(page, table_box, round_factor=round_factor) + new_rows = [] + additional_column_index = 0 + for row in rows: + new_row = {} + last_col_index = -1 + for cell in row: + left_edge = cell[0][0] + column_index = -1 + for i, separator in enumerate(separators): + if left_edge - tolerance < separator and last_col_index < i: + column_index = i + break + if column_index == -1: + column_index = len(separators) + additional_column_index + additional_column_index += 1 + new_row[column_index] = cell[1] + last_col_index = column_index + additional_column_index = 0 + + flat_row = [] + for cell_idx, cell in enumerate(sorted(new_row.items())): + flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]]) + new_rows.append(flat_row) + + return new_rows diff --git a/marker/cleaners/table.py b/marker/tables/table.py similarity index 64% rename from marker/cleaners/table.py rename to marker/tables/table.py index fe33e0e0..d99b758e 100644 --- a/marker/cleaners/table.py +++ b/marker/tables/table.py @@ -1,45 +1,13 @@ +from collections import defaultdict + from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox from marker.schema.block import Line, Span, Block from marker.schema.page import Page from tabulate import tabulate -from typing import List, Dict -import re - - -def sort_table_blocks(blocks, tolerance=5): - vertical_groups = {} - for block in blocks: - if hasattr(block, "bbox"): - bbox = block.bbox - else: - bbox = block["bbox"] - group_key = round(bbox[1] / tolerance) * tolerance - if group_key not in vertical_groups: - vertical_groups[group_key] = [] - vertical_groups[group_key].append(block) - - # Sort each group horizontally and flatten the groups into a single list - sorted_blocks = [] - for _, group in sorted(vertical_groups.items()): - sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) - sorted_blocks.extend(sorted_group) - - return sorted_blocks +from typing import List - -def replace_dots(text): - dot_pattern = re.compile(r'(\s*\.\s*){4,}') - dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL) - - if dot_multiline_pattern.match(text): - text = dot_pattern.sub(' ', text) - return text - - -def replace_newlines(text): - # Replace all newlines - newline_pattern = re.compile(r'[\r\n]+') - return newline_pattern.sub(' ', text.strip()) +from marker.tables.cells import assign_cells_to_columns, find_row_separators, find_column_separators +from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: @@ -73,77 +41,45 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: return table_rows -def assign_cells_to_columns(rows, round_factor=4, tolerance=4): - left_edges = [] - right_edges = [] - centers = [] - - for row in rows: - for cell in row: - left_edges.append(cell[0][0] / round_factor * round_factor) - right_edges.append(cell[0][2] / round_factor * round_factor) - centers.append((cell[0][0] + cell[0][2]) / 2 * round_factor / round_factor) - - unique_left = sorted(list(set(left_edges))) - unique_right = sorted(list(set(right_edges))) - unique_center = sorted(list(set(centers))) - - # Find list with minimum length - separators = min([unique_left, unique_right, unique_center], key=len) - - new_rows = [] - for row in rows: - new_row = {} - last_col_index = -1 - for cell in row: - left_edge = cell[0][0] - column_index = -1 - for i, separator in enumerate(separators): - if left_edge - tolerance < separator and last_col_index < i: - column_index = i - break - if column_index == -1: - column_index = cell[0][0] # Assign a new column - new_row[column_index] = cell[1] - last_col_index = column_index - - flat_row = [cell[1] for cell in sorted(new_row.items())] - min_column_index = min(new_row.keys()) - flat_row = [""] * min_column_index + flat_row - new_rows.append(flat_row) - - return new_rows - - def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: page_width = page.width table_rows = [] table_cell = "" cell_bbox = None - prev_end = None + prev_char = False table_row = [] sorted_char_blocks = sort_table_blocks(page.char_blocks) + for block_idx, block in enumerate(sorted_char_blocks): - sorted_block_lines = sort_table_blocks(block["lines"]) - for line_idx, line in enumerate(sorted_block_lines): + sorted_lines = sort_table_blocks(block["lines"]) + for line_idx, line in enumerate(sorted_lines): line_bbox = line["bbox"] intersect_pct = box_intersection_pct(line_bbox, table_box) - if intersect_pct < .5: + if intersect_pct < .7: continue for span in line["spans"]: for char in span["chars"]: x_start, y_start, x_end, y_end = char["bbox"] + if cell_bbox is None: cell_bbox = char["bbox"] else: + # Find boundaries of cell bbox before merging + cell_x_start, cell_y_start, cell_x_end, cell_y_end = cell_bbox + cell_x_start /= page_width + cell_x_end /= page_width + cell_bbox = merge_boxes(cell_bbox, char["bbox"]) x_start /= page_width x_end /= page_width + cell_content = replace_dots(replace_newlines(table_cell)) - if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell + if not prev_char: # First char + table_cell += char["char"] + elif cell_x_start - space_tol < x_start < cell_x_end + space_tol: # Check if we are in the same cell table_cell += char["char"] - elif x_start > prev_end - space_tol: # Check if we are on the same line + elif x_start > cell_x_end - space_tol: # Same line, new cell, check against cell bbox if len(table_cell) > 0: table_row.append((cell_bbox, cell_content)) table_cell = char["char"] @@ -156,16 +92,18 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: if len(table_row) > 0: table_rows.append(table_row) table_row = [] - prev_end = x_end + prev_char = True + if len(table_cell) > 0: table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell)))) if len(table_row) > 0: table_rows.append(table_row) - table_rows = assign_cells_to_columns(table_rows) + + table_rows = assign_cells_to_columns(page, table_box, table_rows) return table_rows -def arrange_table_rows(pages: List[Page]): +def format_tables(pages: List[Page]): # Formats tables nicely into github flavored markdown table_count = 0 for page in pages: diff --git a/marker/tables/utils.py b/marker/tables/utils.py new file mode 100644 index 00000000..b7efdabb --- /dev/null +++ b/marker/tables/utils.py @@ -0,0 +1,37 @@ +import re + + +def sort_table_blocks(blocks, tolerance=5): + vertical_groups = {} + for block in blocks: + if hasattr(block, "bbox"): + bbox = block.bbox + else: + bbox = block["bbox"] + group_key = round(bbox[1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(block) + + # Sort each group horizontally and flatten the groups into a single list + sorted_blocks = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) + sorted_blocks.extend(sorted_group) + + return sorted_blocks + + +def replace_dots(text): + dot_pattern = re.compile(r'(\s*\.\s*){4,}') + dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL) + + if dot_multiline_pattern.match(text): + text = dot_pattern.sub(' ', text) + return text + + +def replace_newlines(text): + # Replace all newlines + newline_pattern = re.compile(r'[\r\n]+') + return newline_pattern.sub(' ', text.strip()) From f7444f3a814ad6e38a16442592bd3bc9dc5463cc Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 6 May 2024 22:41:17 -0700 Subject: [PATCH 3/4] Improve sorting --- marker/ocr/heuristics.py | 2 +- marker/tables/cells.py | 18 +++++++++++++++++- marker/tables/table.py | 11 ++++------- marker/tables/utils.py | 2 +- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index ffe6e422..278d8295 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]): return len(full_text.strip()) == 0 -def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6): +def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65): found_lines = 0 for detected_line in page.text_lines.bboxes: diff --git a/marker/tables/cells.py b/marker/tables/cells.py index d4524314..1981bcd3 100644 --- a/marker/tables/cells.py +++ b/marker/tables/cells.py @@ -86,4 +86,20 @@ def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4): flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]]) new_rows.append(flat_row) - return new_rows + # Pad rows to have the same length + max_row_len = max([len(r) for r in new_rows]) + for row in new_rows: + while len(row) < max_row_len: + row.append("") + + cols_to_remove = set() + for idx, col in enumerate(zip(*new_rows)): + col_total = sum([len(cell.strip()) > 0 for cell in col]) + if col_total == 0: + cols_to_remove.add(idx) + + rows = [] + for row in new_rows: + rows.append([col for idx, col in enumerate(row) if idx not in cols_to_remove]) + + return rows diff --git a/marker/tables/table.py b/marker/tables/table.py index d99b758e..ef652634 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -37,11 +37,11 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: x_position = normed_x_end if len(table_row) > 0: table_rows.append(table_row) - table_rows = assign_cells_to_columns(table_rows) + table_rows = assign_cells_to_columns(page, table_box, table_rows) return table_rows -def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: +def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> List[List[str]]: page_width = page.width table_rows = [] table_cell = "" @@ -90,6 +90,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: table_cell = char["char"] cell_bbox = char["bbox"] if len(table_row) > 0: + table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor)) table_rows.append(table_row) table_row = [] prev_char = True @@ -97,6 +98,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: if len(table_cell) > 0: table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell)))) if len(table_row) > 0: + table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor)) table_rows.append(table_row) table_rows = assign_cells_to_columns(page, table_box, table_rows) @@ -139,11 +141,6 @@ def format_tables(pages: List[Page]): if len(table_rows) == 0: continue - max_row_len = max([len(r) for r in table_rows]) - for row in table_rows: - while len(row) < max_row_len: - row.append("") - table_text = tabulate(table_rows, headers="firstrow", tablefmt="github") table_block = Block( bbox=table_box, diff --git a/marker/tables/utils.py b/marker/tables/utils.py index b7efdabb..61b03403 100644 --- a/marker/tables/utils.py +++ b/marker/tables/utils.py @@ -8,7 +8,7 @@ def sort_table_blocks(blocks, tolerance=5): bbox = block.bbox else: bbox = block["bbox"] - group_key = round(bbox[1] / tolerance) * tolerance + group_key = round(bbox[1] / tolerance) if group_key not in vertical_groups: vertical_groups[group_key] = [] vertical_groups[group_key].append(block) From c8c1f06dbaaacc0bf390bb688e78b2e95b01021d Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 7 May 2024 10:53:07 -0700 Subject: [PATCH 4/4] Enable extracting and saving images --- README.md | 31 ++++++++++++--------- benchmark.py | 2 +- convert.py | 13 +++------ convert_single.py | 15 +++++------ marker/convert.py | 16 +++++++---- marker/images/extract.py | 19 +++++++++---- marker/images/save.py | 18 +++++++++++++ marker/output.py | 36 +++++++++++++++++++++++++ marker/postprocessors/images.py | 0 marker/settings.py | 1 + marker/tables/cells.py | 48 +++++++++++++-------------------- marker/tables/table.py | 36 ++++++++++++------------- 12 files changed, 148 insertions(+), 87 deletions(-) create mode 100644 marker/images/save.py create mode 100644 marker/output.py delete mode 100644 marker/postprocessors/images.py diff --git a/README.md b/README.md index f31beba6..83b51f5d 100644 --- a/README.md +++ b/README.md @@ -69,13 +69,16 @@ First, clone the repo: - GPU only: run `pip install torch` to install other torch dependencies. - CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions. -- Optional: Install system requirements, only needed if using `ocrmypdf` as the ocr backend - - Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`. - - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`. - - Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y` - - Set the tesseract data folder path - - Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple. - - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it +**Optional** + +Only needed if using `ocrmypdf` as the ocr backend. + +- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`. +- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`. +- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y` +- Set the tesseract data folder path + - Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple. + - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it ## Mac @@ -83,10 +86,14 @@ First, clone the repo: - `poetry install` - `poetry shell` to activate your poetry venv -- Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR - - Set the tesseract data folder path - - Find the tesseract data folder `tessdata` with `brew list tesseract` - - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it +**Optional** + +Only needed if using `ocrmypdf` as the ocr backend. + +- Install system requirements from `scripts/install/brew-requirements.txt` +- Set the tesseract data folder path + - Find the tesseract data folder `tessdata` with `brew list tesseract` + - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it # Usage @@ -104,7 +111,7 @@ First, some configuration. Note that settings can be overridden with env vars, Run `convert_single.py`, like this: ``` -python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10 --langs English +python convert_single.py /path/to/file.pdf /path/to/output/folder --parallel_factor 2 --max_pages 10 --langs English ``` - `--parallel_factor` is how much to increase batch size and parallel OCR workers by. Higher numbers will take more VRAM and CPU, but process faster. Set to 1 by default. diff --git a/benchmark.py b/benchmark.py index 3214e545..c2685f57 100644 --- a/benchmark.py +++ b/benchmark.py @@ -68,7 +68,7 @@ def main(): for method in methods: start = time.time() if method == "marker": - full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor) + full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor) elif method == "nougat": full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size) elif method == "naive": diff --git a/convert.py b/convert.py index c93e161a..9ddb6226 100755 --- a/convert.py +++ b/convert.py @@ -7,6 +7,7 @@ import math from marker.convert import convert_single_pdf +from marker.output import markdown_exists, save_markdown from marker.pdf.utils import find_filetype from marker.pdf.extract_text import get_length_of_text from marker.models import load_all_models @@ -20,10 +21,7 @@ @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0) def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None): - out_filename = fname.rsplit(".", 1)[0] + ".md" - out_filename = os.path.join(out_folder, os.path.basename(out_filename)) - out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json" - if os.path.exists(out_filename): + if markdown_exists(out_folder, fname): return try: # Skip trying to convert files that don't have a lot of embedded text @@ -38,12 +36,9 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option if length < min_length: return - full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata) + full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata) if len(full_text.strip()) > 0: - with open(out_filename, "w+", encoding='utf-8') as f: - f.write(full_text) - with open(out_meta_filename, "w+") as f: - f.write(json.dumps(out_metadata, indent=4)) + save_markdown(out_folder, fname, full_text, images, out_metadata) else: print(f"Empty file: {fname}. Could not convert.") except Exception as e: diff --git a/convert_single.py b/convert_single.py index 88990a29..b6af88db 100755 --- a/convert_single.py +++ b/convert_single.py @@ -1,17 +1,20 @@ import argparse +import os from marker.convert import convert_single_pdf from marker.logger import configure_logging from marker.models import load_all_models import json +from marker.output import save_markdown + configure_logging() def main(): parser = argparse.ArgumentParser() parser.add_argument("filename", help="PDF file to parse") - parser.add_argument("output", help="Output file name") + parser.add_argument("output", help="Output base folder path") parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse") parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.") parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None) @@ -21,14 +24,10 @@ def main(): fname = args.filename model_lst = load_all_models() - full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs) - - with open(args.output, "w+", encoding='utf-8') as f: - f.write(full_text) + full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs) - out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json" - with open(out_meta_filename, "w+") as f: - f.write(json.dumps(out_meta, indent=4)) + fname = os.path.basename(fname) + save_markdown(args.output, fname, full_text, images, out_meta) if __name__ == "__main__": diff --git a/marker/convert.py b/marker/convert.py index f475d3d4..8c29917c 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -1,10 +1,8 @@ import warnings - -from marker.cleaners.text import cleanup_text - warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings import pypdfium2 as pdfium +from PIL import Image from marker.tables.table import format_tables from marker.debug.data import dump_bbox_debug_data @@ -23,6 +21,9 @@ from marker.cleaners.headings import split_heading_blocks from marker.cleaners.fontstyle import find_bold_italic from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text +from marker.cleaners.text import cleanup_text +from marker.images.extract import extract_images +from marker.images.save import images_to_dict from typing import List, Dict, Tuple, Optional from marker.settings import settings @@ -35,7 +36,7 @@ def convert_single_pdf( metadata: Optional[Dict]=None, parallel_factor: int = 1, langs: Optional[List[str]] = None -) -> Tuple[str, Dict]: +) -> Tuple[str, Dict[str, Image.Image], Dict]: # Set language needed for OCR if langs is None: langs = [settings.DEFAULT_LANG] @@ -122,6 +123,10 @@ def convert_single_pdf( ) out_meta["block_stats"]["equations"] = eq_stats + # Extract images and figures + if settings.EXTRACT_IMAGES: + extract_images(doc, pages) + # Split out headers split_heading_blocks(pages) find_bold_italic(pages) @@ -145,5 +150,6 @@ def convert_single_pdf( batch_size=settings.EDITOR_BATCH_SIZE * parallel_factor ) out_meta["postprocess_stats"] = {"edit": edit_stats} + doc_images = images_to_dict(pages) - return full_text, out_meta \ No newline at end of file + return full_text, doc_images, out_meta \ No newline at end of file diff --git a/marker/images/extract.py b/marker/images/extract.py index ae7d3367..85464f2c 100644 --- a/marker/images/extract.py +++ b/marker/images/extract.py @@ -1,3 +1,4 @@ +from marker.images.save import get_image_filename from marker.pdf.images import render_bbox_image from marker.schema.bbox import rescale_bbox from marker.schema.block import find_insert_block, Span @@ -32,14 +33,15 @@ def find_image_blocks(page): return image_blocks -def extract_images(page): +def extract_page_images(page_obj, page): + page.images = [] image_blocks = find_image_blocks(page) for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks): block = page.blocks[block_idx] - image = render_bbox_image(page.page_obj, page, bbox) - image_filename = f"{page.pnum}_image_{image_idx}.png" - image_markdown = f"![{image_filename}]({image_filename})" + image = render_bbox_image(page_obj, page, bbox) + image_filename = get_image_filename(page, image_idx) + image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n" image_span = Span( bbox=bbox, text=image_markdown, @@ -47,7 +49,14 @@ def extract_images(page): rotation=0, font_weight=0, font_size=0, - image=True + image=True, + span_id=f"image_{image_idx}" ) block.lines[line_idx].spans.append(image_span) page.images.append(image) + + +def extract_images(doc, pages): + for page_idx, page in enumerate(pages): + page_obj = doc[page_idx] + extract_page_images(page_obj, page) diff --git a/marker/images/save.py b/marker/images/save.py new file mode 100644 index 00000000..8397d5cc --- /dev/null +++ b/marker/images/save.py @@ -0,0 +1,18 @@ +from typing import List + +from marker.schema.page import Page + + +def get_image_filename(page: Page, image_idx): + return f"{page.pnum}_image_{image_idx}.png" + + +def images_to_dict(pages: List[Page]): + images = {} + for page in pages: + if page.images is None: + continue + for image_idx, image in enumerate(page.images): + image_filename = get_image_filename(page, image_idx) + images[image_filename] = image + return images diff --git a/marker/output.py b/marker/output.py new file mode 100644 index 00000000..aa53c2f6 --- /dev/null +++ b/marker/output.py @@ -0,0 +1,36 @@ +import os +import json + + +def get_subfolder_path(out_folder, fname): + subfolder_name = fname.split(".")[0] + subfolder_path = os.path.join(out_folder, subfolder_name) + os.makedirs(subfolder_path, exist_ok=True) + return subfolder_path + + +def get_markdown_filepath(out_folder, fname): + subfolder_path = get_subfolder_path(out_folder, fname) + out_filename = fname.rsplit(".", 1)[0] + ".md" + out_filename = os.path.join(subfolder_path, out_filename) + return out_filename + + +def markdown_exists(out_folder, fname): + out_filename = get_markdown_filepath(out_folder, fname) + return os.path.exists(out_filename) + + +def save_markdown(out_folder, fname, full_text, images, out_metadata): + subfolder_path = get_subfolder_path(out_folder, fname) + markdown_filepath = get_markdown_filepath(out_folder, fname) + out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json" + + with open(markdown_filepath, "w+", encoding='utf-8') as f: + f.write(full_text) + with open(out_meta_filepath, "w+") as f: + f.write(json.dumps(out_metadata, indent=4)) + + for filename, image in images.items(): + image_filepath = os.path.join(subfolder_path, filename) + image.save(image_filepath, "PNG") \ No newline at end of file diff --git a/marker/postprocessors/images.py b/marker/postprocessors/images.py deleted file mode 100644 index e69de29b..00000000 diff --git a/marker/settings.py b/marker/settings.py index 5eb85bed..35d7ab81 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -11,6 +11,7 @@ class Settings(BaseSettings): # General TORCH_DEVICE: Optional[str] = None IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at + EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them @computed_field @property diff --git a/marker/tables/cells.py b/marker/tables/cells.py index 1981bcd3..8bf8f9f1 100644 --- a/marker/tables/cells.py +++ b/marker/tables/cells.py @@ -1,36 +1,27 @@ from marker.schema.bbox import rescale_bbox, box_intersection_pct from marker.schema.page import Page +from sklearn.cluster import DBSCAN +import numpy as np -def find_row_separators(page: Page, table_box, round_factor=4): - top_edges = [] - bottom_edges = [] +def cluster_coords(coords): + if len(coords) == 0: + return [] + coords = np.array(sorted(set(coords))).reshape(-1, 1) - line_boxes = [p.bbox for p in page.text_lines.bboxes] - line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes] - line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8] - - min_count = len(line_boxes) / 3 - - for cell in line_boxes: - top_edges.append(cell[1] / round_factor * round_factor) - bottom_edges.append(cell[3] / round_factor * round_factor) - - top_edges = [t for t in top_edges if top_edges.count(t) > min_count] - bottom_edges = [b for b in bottom_edges if bottom_edges.count(b) > min_count] - - unique_top = sorted(list(set(top_edges))) - unique_bottom = sorted(list(set(bottom_edges))) + clustering = DBSCAN(eps=5, min_samples=1).fit(coords) + clusters = clustering.labels_ - separators = min([unique_top, unique_bottom], key=len) + separators = [] + for label in set(clusters): + clustered_points = coords[clusters == label] + separators.append(np.mean(clustered_points)) - # Add the top and bottom of the page as separators, to grab all possible cells - separators.append(page.bbox[3]) - separators.insert(0, page.bbox[1]) + separators = sorted(separators) return separators -def find_column_separators(page: Page, table_box, round_factor=4): +def find_column_separators(page: Page, table_box, round_factor=4, min_count=1): left_edges = [] right_edges = [] centers = [] @@ -39,7 +30,6 @@ def find_column_separators(page: Page, table_box, round_factor=4): line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes] line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8] - min_count = len(line_boxes) / 3 for cell in line_boxes: left_edges.append(cell[0] / round_factor * round_factor) right_edges.append(cell[2] / round_factor * round_factor) @@ -49,12 +39,12 @@ def find_column_separators(page: Page, table_box, round_factor=4): right_edges = [r for r in right_edges if right_edges.count(r) > min_count] centers = [c for c in centers if centers.count(c) > min_count] - unique_left = sorted(list(set(left_edges))) - unique_right = sorted(list(set(right_edges))) - unique_center = sorted(list(set(centers))) + sorted_left = cluster_coords(left_edges) + sorted_right = cluster_coords(right_edges) + sorted_center = cluster_coords(centers) # Find list with minimum length - separators = min([unique_left, unique_right, unique_center], key=len) + separators = max([sorted_left, sorted_right, sorted_center], key=len) separators.append(page.bbox[2]) separators.insert(0, page.bbox[0]) return separators @@ -83,7 +73,7 @@ def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4): flat_row = [] for cell_idx, cell in enumerate(sorted(new_row.items())): - flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]]) + flat_row.append(cell[1]) new_rows.append(flat_row) # Pad rows to have the same length diff --git a/marker/tables/table.py b/marker/tables/table.py index ef652634..1de64437 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -6,7 +6,7 @@ from tabulate import tabulate from typing import List -from marker.tables.cells import assign_cells_to_columns, find_row_separators, find_column_separators +from marker.tables.cells import assign_cells_to_columns from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines @@ -46,10 +46,12 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L table_rows = [] table_cell = "" cell_bbox = None - prev_char = False table_row = [] sorted_char_blocks = sort_table_blocks(page.char_blocks) + table_width = table_box[2] - table_box[0] + new_line_start_x = table_box[0] + table_width * .2 + for block_idx, block in enumerate(sorted_char_blocks): sorted_lines = sort_table_blocks(block["lines"]) for line_idx, line in enumerate(sorted_lines): @@ -60,31 +62,25 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L for span in line["spans"]: for char in span["chars"]: x_start, y_start, x_end, y_end = char["bbox"] + x_start /= page_width + x_end /= page_width - if cell_bbox is None: - cell_bbox = char["bbox"] - else: + if cell_bbox is not None: # Find boundaries of cell bbox before merging cell_x_start, cell_y_start, cell_x_end, cell_y_end = cell_bbox cell_x_start /= page_width cell_x_end /= page_width - cell_bbox = merge_boxes(cell_bbox, char["bbox"]) - - x_start /= page_width - x_end /= page_width - cell_content = replace_dots(replace_newlines(table_cell)) - if not prev_char: # First char + if cell_bbox is None: # First char table_cell += char["char"] + cell_bbox = char["bbox"] elif cell_x_start - space_tol < x_start < cell_x_end + space_tol: # Check if we are in the same cell table_cell += char["char"] - elif x_start > cell_x_end - space_tol: # Same line, new cell, check against cell bbox - if len(table_cell) > 0: - table_row.append((cell_bbox, cell_content)) - table_cell = char["char"] - cell_bbox = char["bbox"] - else: # New line and cell + cell_bbox = merge_boxes(cell_bbox, char["bbox"]) + # New line and cell + # Use x_start < new_line_start_x to account for out-of-order cells in the pdf + elif x_start < cell_x_end - space_tol and x_start < new_line_start_x: if len(table_cell) > 0: table_row.append((cell_bbox, cell_content)) table_cell = char["char"] @@ -93,7 +89,11 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor)) table_rows.append(table_row) table_row = [] - prev_char = True + else: # Same line, new cell, check against cell bbox + if len(table_cell) > 0: + table_row.append((cell_bbox, cell_content)) + table_cell = char["char"] + cell_bbox = char["bbox"] if len(table_cell) > 0: table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))