Skip to content

Commit

Permalink
Improve sorting
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed May 7, 2024
1 parent 77a99f3 commit f7444f3
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 10 deletions.
2 changes: 1 addition & 1 deletion marker/ocr/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
return len(full_text.strip()) == 0


- def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
+ def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65):
found_lines = 0
for detected_line in page.text_lines.bboxes:

Expand Down
18 changes: 17 additions & 1 deletion marker/tables/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,20 @@ def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4):
flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]])
new_rows.append(flat_row)

- return new_rows
+ # Pad rows to have the same length
+ max_row_len = max([len(r) for r in new_rows])
+ for row in new_rows:
+ while len(row) < max_row_len:
+ row.append("")
+
+ cols_to_remove = set()
+ for idx, col in enumerate(zip(*new_rows)):
+ col_total = sum([len(cell.strip()) > 0 for cell in col])
+ if col_total == 0:
+ cols_to_remove.add(idx)
+
+ rows = []
+ for row in new_rows:
+ rows.append([col for idx, col in enumerate(row) if idx not in cols_to_remove])
+
+ return rows
11 changes: 4 additions & 7 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
x_position = normed_x_end
if len(table_row) > 0:
table_rows.append(table_row)
- table_rows = assign_cells_to_columns(table_rows)
+ table_rows = assign_cells_to_columns(page, table_box, table_rows)
return table_rows


- def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
+ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> List[List[str]]:
page_width = page.width
table_rows = []
table_cell = ""
Expand Down Expand Up @@ -90,13 +90,15 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
table_cell = char["char"]
cell_bbox = char["bbox"]
if len(table_row) > 0:
+ table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor))
table_rows.append(table_row)
table_row = []
prev_char = True

if len(table_cell) > 0:
table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
if len(table_row) > 0:
+ table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor))
table_rows.append(table_row)

table_rows = assign_cells_to_columns(page, table_box, table_rows)
Expand Down Expand Up @@ -139,11 +141,6 @@ def format_tables(pages: List[Page]):
if len(table_rows) == 0:
continue

- max_row_len = max([len(r) for r in table_rows])
- for row in table_rows:
- while len(row) < max_row_len:
- row.append("")
-
table_text = tabulate(table_rows, headers="firstrow", tablefmt="github")
table_block = Block(
bbox=table_box,
Expand Down
2 changes: 1 addition & 1 deletion marker/tables/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def sort_table_blocks(blocks, tolerance=5):
bbox = block.bbox
else:
bbox = block["bbox"]
- group_key = round(bbox[1] / tolerance) * tolerance
+ group_key = round(bbox[1] / tolerance)
if group_key not in vertical_groups:
vertical_groups[group_key] = []
vertical_groups[group_key].append(block)
Expand Down

0 comments on commit f7444f3

Please sign in to comment.