Skip to content

Commit

Permalink
Update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 14, 2024
1 parent a571886 commit f4cd1c4
Show file tree
Hide file tree
Showing 11 changed files with 116 additions and 37 deletions.
9 changes: 7 additions & 2 deletions marker/v2/builders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from typing import Optional

from pydantic import BaseModel


class BaseBuilder:
def __init__(self, config=None):
def __init__(self, config: Optional[BaseModel] = None):
if config:
for k in config:
for k in config.model_fields:
setattr(self, k, config[k])

def __call__(self, data, *args, **kwargs):
Expand Down
6 changes: 3 additions & 3 deletions marker/v2/builders/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def group_caption_blocks(self, page: PageGroup):
for j, prev_block in enumerate(page.children[:i][::-1]):
if all([
prev_block.block_type in ["Caption", "Footnote"],
prev_block.minimum_gap(block) < self.gap_threshold
prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
]):
block_structure.insert(prev_block._id, 0)
selected_polygons.append(prev_block.polygon)
Expand All @@ -42,7 +42,7 @@ def group_caption_blocks(self, page: PageGroup):
for j, next_block in enumerate(page.children[i + 1:]):
if all([
next_block.block_type in ["Caption", "Footnote"],
next_block.minimum_gap(block) < self.gap_threshold
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
]):
block_structure.append(next_block._id)
selected_polygons.append(next_block.polygon)
Expand Down Expand Up @@ -71,7 +71,7 @@ def group_lists(self, page: PageGroup):
for j, next_block in enumerate(page.children[i + 1:]):
if all([
next_block.block_type == "ListItem",
next_block.minimum_gap(block) < self.gap_threshold
next_block.polygon.minimum_gap(block) < self.gap_threshold
]):
block_structure.append(next_block._id)
selected_polygons.append(next_block.polygon)
Expand Down
13 changes: 5 additions & 8 deletions marker/v2/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
from typing import List
from typing import Optional

from pydantic import BaseModel


class ConverterConfig(BaseModel):
filepath: str
page_range: List[int] | None = None


class BaseConverter:
    """Base class for converters; concrete converters implement ``__call__``.

    When a pydantic config is supplied, each of its declared fields is copied
    onto the instance as an attribute of the same name. The config object
    itself is also kept on ``self.config``.
    """

    def __init__(self, config: Optional[BaseModel] = None):
        """Store ``config`` and mirror its fields as instance attributes.

        Args:
            config: Optional pydantic model. ``None`` leaves the instance
                without any copied attributes.
        """
        # Keep the config reachable: PdfConverter.__call__ reads
        # self.config.filepath.
        self.config = config
        if config is not None:
            # Pydantic models are not subscriptable, so each field is read with
            # getattr. model_fields is looked up on the class, not the instance.
            for k in type(config).model_fields:
                setattr(self, k, getattr(config, k))

    def __call__(self):
        """Run the conversion; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
5 changes: 5 additions & 0 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

from surya.model.layout.model import load_model
from surya.model.layout.processor import load_processor

Expand All @@ -10,6 +12,9 @@


class PdfConverter(BaseConverter):
filepath: str
page_range: List[int] | None = None

def __call__(self):
pdf_provider = PdfProvider(self.config.filepath, PdfProviderConfig())

Expand Down
10 changes: 9 additions & 1 deletion marker/v2/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
from typing import Optional

from pydantic import BaseModel


class BaseProcessor:
    """Base class for processors.

    When a pydantic config is supplied, each of its declared fields is copied
    onto the instance as an attribute of the same name.
    """

    def __init__(self, config: Optional[BaseModel] = None):
        """Mirror the fields of ``config`` as instance attributes.

        Args:
            config: Optional pydantic model. ``None`` copies nothing.
        """
        if config is not None:
            # Pydantic models are not subscriptable, so each field is read with
            # getattr. model_fields is looked up on the class, not the instance.
            for k in type(config).model_fields:
                setattr(self, k, getattr(config, k))
6 changes: 6 additions & 0 deletions marker/v2/processors/equation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from marker.v2.processors import BaseProcessor


class EquationProcessor(BaseProcessor):
    """BaseProcessor subclass that only sets ``block_type``; no other behavior is defined here."""

    # Presumably names the block type this processor applies to —
    # TODO confirm against the code that reads ``block_type``.
    block_type = "Equation"

5 changes: 5 additions & 0 deletions marker/v2/processors/table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from marker.v2.processors import BaseProcessor


class TableProcessor(BaseProcessor):
    """Placeholder subclass of BaseProcessor; it defines nothing of its own yet."""

    pass
44 changes: 31 additions & 13 deletions marker/v2/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ def check_elements(cls, v: List[List[float]]) -> List[List[float]]:
for corner in v:
if len(corner) != 2:
raise ValueError('corner must have 2 elements')

min_x = min([corner[0] for corner in v])
min_y = min([corner[1] for corner in v])

# Ensure corners are clockwise from top left
corner_error = f" .Corners are {v}"
assert v[2][1] >= min_y, f'bottom right corner should have a greater y value than top right corner' + corner_error
assert v[3][1] >= min_y, 'bottom left corner should have a greater y value than top left corner' + corner_error
assert v[1][0] >= min_x, 'top right corner should have a greater x value than top left corner' + corner_error
assert v[2][0] >= min_x, 'bottom right corner should have a greater x value than bottom left corner' + corner_error
return v

@property
Expand Down Expand Up @@ -45,8 +55,23 @@ def bbox(self) -> List[float]:
box[1], box[3] = box[3], box[1]
return box

def expand(self, x_margin: float, y_margin: float) -> PolygonBox:
    """Return a new PolygonBox grown outward by a fraction of this box's size.

    Args:
        x_margin: Fraction of ``self.width`` to add on the left and on the right.
        y_margin: Fraction of ``self.height`` to add on the top and on the bottom.

    Returns:
        A new PolygonBox built from the shifted corners; ``self`` is not modified.
    """
    dx = x_margin * self.width
    dy = y_margin * self.height
    # Corners run clockwise from top-left (top-left, top-right, bottom-right,
    # bottom-left) and y grows downward, so these offsets push each corner
    # away from the centre of the box.
    offsets = [(-dx, -dy), (dx, -dy), (dx, dy), (-dx, dy)]
    # Pair each corner with its offset. The previous loop unpacked the corner
    # itself into (idx, poly) and then failed when indexing a float.
    new_polygon = [
        [corner[0] + off_x, corner[1] + off_y]
        for corner, (off_x, off_y) in zip(self.polygon, offsets)
    ]
    return PolygonBox(polygon=new_polygon)

def minimum_gap(self, other: PolygonBox):
if self.intersection_pct(other.bbox) > 0:
if self.intersection_pct(other) > 0:
return 0

x_dist = min(abs(self.bbox[0] - other.bbox[2]), abs(self.bbox[2] - other.bbox[0]))
Expand All @@ -62,11 +87,11 @@ def minimum_gap(self, other: PolygonBox):
def center_distance(self, other: PolygonBox):
    """Return the straight-line (Euclidean) distance between the two centers.

    Args:
        other: Box whose ``center`` is compared with ``self.center``.
    """
    delta_x = self.center[0] - other.center[0]
    delta_y = self.center[1] - other.center[1]
    squared = delta_x ** 2 + delta_y ** 2
    return squared ** 0.5

def rescale(self, processor_size, image_size):
def rescale(self, old_size, new_size):
# Point is in x, y format
page_width, page_height = processor_size
page_width, page_height = old_size
img_width, img_height = new_size

img_width, img_height = image_size
width_scaler = img_width / page_width
height_scaler = img_height / page_height

Expand All @@ -92,18 +117,11 @@ def overlap_y(self, other):
def intersection_area(self, other):
    """Return the overlap area with ``other``: horizontal overlap times vertical overlap."""
    horizontal = self.overlap_x(other)
    vertical = self.overlap_y(other)
    return horizontal * vertical

def intersection_pct(self, other, x_margin=0, y_margin=0):
assert 0 <= x_margin <= 1
assert 0 <= y_margin <= 1
def intersection_pct(self, other):
if self.area == 0:
return 0

if x_margin:
x_margin = int(min(self.width, other.width) * x_margin)
if y_margin:
y_margin = int(min(self.height, other.height) * y_margin)

intersection = self.intersection_area(other, x_margin, y_margin)
intersection = self.intersection_area(other)
return intersection / self.area

def merge(self, others: List[PolygonBox]) -> PolygonBox:
Expand Down
41 changes: 41 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import tempfile

import datasets
import pytest
from surya.model.layout.model import load_model
from surya.model.layout.processor import load_processor

from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema.config.pdf import PdfProviderConfig
from marker.v2.schema.document import Document


@pytest.fixture(scope="session")
def layout_model():
    """Session-scoped fixture yielding the layout model with its processor attached.

    The model comes from ``load_model()`` and the result of ``load_processor()``
    is stored on its ``processor`` attribute before it is handed to tests.
    """
    model = load_model()
    model.processor = load_processor()
    yield model
    # Teardown: drop this fixture's reference to the model.
    del model


@pytest.fixture(scope="session")
def pdf_document(request, layout_model):
    """Build a Document from a PDF stored in the "datalab-to/pdfs" dataset.

    The PDF is selected by the first argument of a ``filename`` marker and
    defaults to "adversarial.pdf" when no marker is found. Its bytes are written
    to a temporary file that stays open until fixture teardown, so the file
    still exists on disk while tests use the document.

    Args:
        request: pytest request object, used to look up the ``filename`` marker.
        layout_model: Layout model fixture passed to LayoutBuilder.

    Yields:
        The Document returned by DocumentBuilder for the selected PDF.

    Raises:
        ValueError: If the requested filename is not in the dataset.
    """
    # NOTE(review): with scope="session", request.node is presumably the session
    # node rather than a test item, so a per-test ``filename`` marker may never
    # be found here — confirm, and narrow the scope if markers must be honoured.
    marker = request.node.get_closest_marker("filename")
    filename = "adversarial.pdf" if marker is None else marker.args[0]

    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    idx = dataset['filename'].index(filename)

    # The context manager closes and removes the temp file at teardown. Before,
    # the file was never closed explicitly and could vanish as soon as the
    # fixture returned and the handle was garbage collected.
    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
        temp_pdf.write(dataset['pdf'][idx])
        # Flush so the provider sees the full contents when opening by name.
        temp_pdf.flush()

        provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
        layout_builder = LayoutBuilder(layout_model)
        builder = DocumentBuilder()
        yield builder(provider, layout_builder)
7 changes: 1 addition & 6 deletions tests/test_document_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,20 @@
import tempfile

import datasets
from surya.model.layout.model import load_model
from surya.model.layout.processor import load_processor

from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.schema.config.pdf import PdfProviderConfig


def test_document_builder():
def test_document_builder(layout_model):
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index('adversarial.pdf')

temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()

layout_model = load_model()
layout_model.processor = load_processor()

provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
layout_builer = LayoutBuilder(layout_model)
builder = DocumentBuilder()
Expand Down
7 changes: 3 additions & 4 deletions tests/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from tests.utils import setup_pdf_document


def test_structure_builder(pdf_document):
    """StructureBuilder should leave a non-empty structure on the first page.

    Args:
        pdf_document: Document supplied by the ``pdf_document`` fixture.
    """
    structure = StructureBuilder()
    # The builder is called for its effect on the document; the assertion reads
    # the first page's ``structure`` afterwards.
    structure(pdf_document)
    assert len(pdf_document.pages[0].structure) > 0

0 comments on commit f4cd1c4

Please sign in to comment.