# Patch overview (free-form preamble; everything before the first "diff --git" header is commentary).
#
# Purpose: wire the PDF provider's text lines into the layout blocks produced by LayoutBuilder.
#
# Per-file summary of what the hunks below change:
#   * builders/document.py   - DocumentBuilder now type-hints PdfProvider and calls
#                              layout_builder(document, provider) instead of layout_builder(document).
#   * builders/layout.py     - LayoutBuilder.__call__ takes a provider and calls the new merge_blocks().
#                              merge_blocks() walks pages and provider lines in lockstep and assigns
#                              each line in three passes:
#                                1. to the child block with the highest intersection_pct (if > 0.0),
#                                2. otherwise to the child with the smallest center_distance,
#                                3. otherwise to a freshly added Text block using the line's polygon.
#   * providers/pdf.py       - adds a bare `self.doc: pdfium.PdfDocument` annotation before setup().
#   * schema/__init__.py     - Block._id now reads page_id/block_id (was pnum/block_num); adds
#                              add_structure(), which appends the given block's _id to self.structure.
#   * schema/groups/page.py  - PageGroup gains incr_block_id(), add_child(), add_full_block();
#                              add_block() uses the page-level counter; get_block() indexes children.
#   * schema/polygon.py      - `center` takes self and reads self.bbox; adds `size`; rescale() no longer
#                              truncates to int; intersection_pct() calls intersection_area(other).
#
# NOTE(review): the line breaks and indentation of this patch have been collapsed, so the hunk
# line counts in the "@@" headers cannot be checked against the content here.
#
# Review notes on the added ("+") code - confirm before merging:
#   1. merge_blocks: line.polygon.rescale(page_size, document_page.polygon.size) sits inside the
#      inner `for block_idx, block in ...` loop, and rescale() assigns self.polygon. The same line
#      therefore appears to be rescaled once per block instead of once per line.
#   2. merge_blocks: `if block_idx == line_idx or ...` compares an index into
#      document_page.children with an index into provider_lines; these are unrelated sequences.
#   3. merge_blocks: the nearest-block pass iterates document_page.children after the first pass
#      has already appended lines via add_full_block(), so candidates may include those lines.
#      Presumably `block.block_type is None` is meant to filter them - TODO confirm.
#   4. merge_blocks: provider.doc[idx] uses the enumerate() position; build_document supports a
#      config.page_range, so idx may not be the real PDF page index - verify against the provider.
#   5. PageGroup.incr_block_id mutates the page's own block_id. PageGroup subclasses Block and
#      Block._id returns ".../block/{block_id}" whenever block_id is not None, so the page's own
#      _id changes once a block has been added.
#   6. PageGroup.add_block: ids previously started at max(..., default=0) + 1; the counter now
#      yields 0 for the first block.
#   7. PageGroup.get_block: `self.children[block_id]` raises when children is None or the index is
#      out of range, although the return annotation is `Block | None`.
#   8. PolygonBox.intersection_pct: the y_margin value computed just above the changed line is no
#      longer passed anywhere, so the margin arguments have no effect on the result.
#   9. `block: Block` (inside class Block) and `other: PolygonBox` (inside class PolygonBox) name
#      the class being defined; this needs postponed annotation evaluation
#      (`from __future__ import annotations`) or quoted annotations - not visible in this patch.
#  10. Block.add_structure stores block._id, which reads page_id; add_full_block() sets only
#      block_id, so lines presumably arrive with page_id already set - TODO confirm.
diff --git a/marker/v2/builders/document.py b/marker/v2/builders/document.py index 959ebc0..f260d7c 100644 --- a/marker/v2/builders/document.py +++ b/marker/v2/builders/document.py @@ -1,19 +1,19 @@ from marker.settings import settings from marker.v2.builders import BaseBuilder from marker.v2.builders.layout import LayoutBuilder -from marker.v2.providers import BaseProvider +from marker.v2.providers.pdf import PdfProvider from marker.v2.schema.document import Document from marker.v2.schema.groups.page import PageGroup from marker.v2.schema.polygon import PolygonBox class DocumentBuilder(BaseBuilder): - def __call__(self, provider: BaseProvider, layout_builder: LayoutBuilder): + def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder): document = self.build_document(provider) - layout_builder(document) + layout_builder(document, provider) return document - def build_document(self, provider: BaseProvider): + def build_document(self, provider: PdfProvider): if provider.config.page_range is None: page_range = range(len(provider)) else: diff --git a/marker/v2/builders/layout.py b/marker/v2/builders/layout.py index 3d514d2..f00aab8 100644 --- a/marker/v2/builders/layout.py +++ b/marker/v2/builders/layout.py @@ -5,10 +5,12 @@ from marker.settings import settings from marker.v2.builders import BaseBuilder -from marker.v2.schema.blocks import LAYOUT_BLOCK_REGISTRY +from marker.v2.providers.pdf import PdfProvider +from marker.v2.schema.blocks import LAYOUT_BLOCK_REGISTRY, Block, Text from marker.v2.schema.document import Document from marker.v2.schema.groups.page import PageGroup from marker.v2.schema.polygon import PolygonBox +from marker.v2.schema.text.line import Line class LayoutBuilder(BaseBuilder): @@ -17,9 +19,10 @@ def __init__(self, layout_model, config=None): super().__init__(config) - def __call__(self, document: Document): + def __call__(self, document: Document, provider: PdfProvider): layout_results = self.surya_layout(document.pages) 
self.add_blocks_to_pages(document.pages, layout_results) + self.merge_blocks(document.pages, provider) @classmethod def get_batch_size(cls): @@ -44,3 +47,51 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou for bbox in sorted(layout_result.bboxes, key=lambda x: x.position): block_cls = LAYOUT_BLOCK_REGISTRY[bbox.label] page.add_block(block_cls, PolygonBox(polygon=bbox.polygon)) + + def merge_blocks(self, document_pages: List[PageGroup], provider: PdfProvider): + provider_page_lines = provider.page_lines + for idx, (document_page, provider_lines) in enumerate(zip(document_pages, provider_page_lines.values())): + all_line_idxs = set(range(len(provider_lines))) + page_size = provider.doc[idx].get_size() + max_intersections = {} + for line_idx, line in enumerate(provider_lines): + for block_idx, block in enumerate(document_page.children): + line.polygon.rescale(page_size, document_page.polygon.size) + intersection_pct = line.polygon.intersection_pct(block.polygon) + if line_idx not in max_intersections: + max_intersections[line_idx] = (intersection_pct, block_idx) + elif intersection_pct > max_intersections[line_idx][0]: + max_intersections[line_idx] = (intersection_pct, block_idx) + + assigned_line_idxs = set() + for line_idx, line in enumerate(provider_lines): + if line_idx in max_intersections and max_intersections[line_idx][0] > 0.0: + document_page.add_full_block(line) + block_idx = max_intersections[line_idx][1] + block: Block = document_page.children[block_idx] + block.add_structure(line) + assigned_line_idxs.add(line_idx) + + for line_idx in all_line_idxs.difference(assigned_line_idxs): + min_dist = None + min_dist_idx = None + line: Line = provider_lines[line_idx] + for block_idx, block in enumerate(document_page.children): + if block_idx == line_idx or block.block_type is None: + continue + dist = line.polygon.center_distance(block.polygon) + if min_dist_idx is None or dist < min_dist: + min_dist = dist + min_dist_idx = 
block_idx + + if min_dist_idx is not None: + document_page.add_full_block(line) + nearest_block = document_page.children[min_dist_idx] + nearest_block.add_structure(line) + assigned_line_idxs.add(line_idx) + + for line_idx in all_line_idxs.difference(assigned_line_idxs): + line: Line = provider_lines[line_idx] + document_page.add_full_block(line) + text_block = document_page.add_block(Text, polygon=line.polygon) + text_block.add_structure(line) diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py index e01037e..7ee2340 100644 --- a/marker/v2/providers/pdf.py +++ b/marker/v2/providers/pdf.py @@ -17,6 +17,8 @@ def __init__(self, filepath: str, config: PdfProviderConfig): self.config = config self.page_lines: Dict[int, List[Line]] = {} + self.doc: pdfium.PdfDocument + self.setup() def __len__(self) -> int: diff --git a/marker/v2/schema/__init__.py b/marker/v2/schema/__init__.py index 18ef544..facada4 100644 --- a/marker/v2/schema/__init__.py +++ b/marker/v2/schema/__init__.py @@ -16,8 +16,14 @@ class Block(BaseModel): @property def _id(self): - page_path = f"/page/{self.pnum}" - if self.block_num is not None: - return f"{page_path}/block/{self.block_num}" + page_path = f"/page/{self.page_id}" + if self.block_id is not None: + return f"{page_path}/block/{self.block_id}" else: return page_path + + def add_structure(self, block: Block): + if self.structure is None: + self.structure = [block._id] + else: + self.structure.append(block._id) diff --git a/marker/v2/schema/groups/page.py b/marker/v2/schema/groups/page.py index 57cfde5..9625c7b 100644 --- a/marker/v2/schema/groups/page.py +++ b/marker/v2/schema/groups/page.py @@ -1,9 +1,8 @@ from typing import List -from marker.v2.schema import Block from PIL import Image -from marker.v2.schema.blocks import LAYOUT_BLOCK_REGISTRY +from marker.v2.schema import Block from marker.v2.schema.polygon import PolygonBox @@ -13,22 +12,33 @@ class PageGroup(Block): highres_image: Image.Image | None = None children: 
List[Block] | None = None - def add_block(self, block_cls: Block, polygon: PolygonBox) -> Block: - max_id = max([b.block_id for b in self.children or []], default=0) + def incr_block_id(self): + if self.block_id is None: + self.block_id = 0 + else: + self.block_id += 1 + def add_child(self, block: Block): + if self.children is None: + self.children = [block] + else: + self.children.append(block) + + def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Block: + self.incr_block_id() block = block_cls( polygon=polygon, - block_id=max_id + 1, + block_id=self.block_id, page_id=self.page_id, ) - if isinstance(self.children, list): - self.children.append(block) - else: - self.children = [block] + self.add_child(block) + return block + def add_full_block(self, block: Block) -> Block: + self.incr_block_id() + block.block_id = self.block_id + self.add_child(block) return block - def get_block(self, block_id: str) -> Block | None: - for block in self.children: - if block._id == block_id: - return block + def get_block(self, block_id: int) -> Block | None: + return self.children[block_id] diff --git a/marker/v2/schema/polygon.py b/marker/v2/schema/polygon.py index c172cc9..31d5241 100644 --- a/marker/v2/schema/polygon.py +++ b/marker/v2/schema/polygon.py @@ -32,8 +32,12 @@ def area(self): return self.width * self.height @property - def center(bbox): - return [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2] + def center(self): + return [(self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2] + + @property + def size(self): + return [self.width, self.height] @computed_field @property @@ -72,8 +76,8 @@ def rescale(self, processor_size, image_size): new_corners = copy.deepcopy(self.polygon) for corner in new_corners: - corner[0] = int(corner[0] * width_scaler) - corner[1] = int(corner[1] * height_scaler) + corner[0] = corner[0] * width_scaler + corner[1] = corner[1] * height_scaler self.polygon = new_corners def fit_to_bounds(self, bounds): @@ -83,23 
+87,23 @@ def fit_to_bounds(self, bounds): corner[1] = max(min(corner[1], bounds[3]), bounds[1]) self.polygon = new_corners - def merge(self, other): + def merge(self, other: PolygonBox): x1 = min(self.bbox[0], other.bbox[0]) y1 = min(self.bbox[1], other.bbox[1]) x2 = max(self.bbox[2], other.bbox[2]) y2 = max(self.bbox[3], other.bbox[3]) self.polygon = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] - def overlap_x(self, other): + def overlap_x(self, other: PolygonBox): return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0])) - def overlap_y(self, other): + def overlap_y(self, other: PolygonBox): return max(0, min(self.bbox[3], other.bbox[3]) - max(self.bbox[1], other.bbox[1])) - def intersection_area(self, other): + def intersection_area(self, other: PolygonBox): return self.overlap_x(other) * self.overlap_y(other) - def intersection_pct(self, other, x_margin=0, y_margin=0): + def intersection_pct(self, other: PolygonBox, x_margin=0, y_margin=0): assert 0 <= x_margin <= 1 assert 0 <= y_margin <= 1 if self.area == 0: @@ -110,7 +114,7 @@ def intersection_pct(self, other, x_margin=0, y_margin=0): if y_margin: y_margin = int(min(self.height, other.height) * y_margin) - intersection = self.intersection_area(other, x_margin, y_margin) + intersection = self.intersection_area(other) return intersection / self.area @classmethod