Skip to content

Commit

Permalink
Merge pull request #312 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Bugfixes
  • Loading branch information
VikParuchuri authored Oct 22, 2024
2 parents 6bee852 + 8598d63 commit 189d660
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 11 deletions.
2 changes: 1 addition & 1 deletion marker/layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def annotate_block_types(pages: List[Page]):

for i, block in enumerate(page.blocks):
if block.block_type is None:
- block.block_type = "Text"
+ block.block_type = settings.DEFAULT_BLOCK_TYPE

def get_layout_label(block_labels: List[str]):
counter = Counter(block_labels)
Expand Down
6 changes: 3 additions & 3 deletions marker/postprocessors/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
- block_type=prev_type,
+ block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
page_end=False
)
)
Expand All @@ -186,7 +186,7 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
- block_type=prev_type,
+ block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
page_end=True
)
)
Expand All @@ -197,7 +197,7 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type, prev_heading_level),
- block_type=block_type,
+ block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
page_end=False
)
)
Expand Down
1 change: 1 addition & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same
TABLE_INTERSECTION_THRESH: float = 0.7
LAYOUT_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
+ DEFAULT_BLOCK_TYPE: str = "Text"

# Ordering model
SURYA_ORDER_DPI: int = 96
Expand Down
12 changes: 6 additions & 6 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,19 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
pnum = page.pnum
# The bbox for the entire table
bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"]

if len(bbox) == 0:
table_counts.append(0)
img_sizes.append(None)
continue

highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI)

page_table_imgs = []
page_bboxes = []

# Merge tables that are next to each other
bbox = merge_tables(bbox)
bbox = list(filter(lambda b: b[3] - b[1] > 10 and b[2] - b[0] > 10, bbox))

if len(bbox) == 0:
table_counts.append(0)
img_sizes.append(None)
continue

# Number of tables per page
table_counts.append(len(bbox))
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
- version = "0.3.3"
+ version = "0.3.4"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 189d660

Please sign in to comment.