diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index ffe6e422..278d8295 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]): return len(full_text.strip()) == 0 -def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6): +def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65): found_lines = 0 for detected_line in page.text_lines.bboxes: diff --git a/marker/tables/cells.py b/marker/tables/cells.py index d4524314..1981bcd3 100644 --- a/marker/tables/cells.py +++ b/marker/tables/cells.py @@ -86,4 +86,20 @@ def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4): flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]]) new_rows.append(flat_row) - return new_rows + # Pad rows to have the same length + max_row_len = max([len(r) for r in new_rows]) + for row in new_rows: + while len(row) < max_row_len: + row.append("") + + cols_to_remove = set() + for idx, col in enumerate(zip(*new_rows)): + col_total = sum([len(cell.strip()) > 0 for cell in col]) + if col_total == 0: + cols_to_remove.add(idx) + + rows = [] + for row in new_rows: + rows.append([col for idx, col in enumerate(row) if idx not in cols_to_remove]) + + return rows diff --git a/marker/tables/table.py b/marker/tables/table.py index d99b758e..ef652634 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -37,11 +37,11 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: x_position = normed_x_end if len(table_row) > 0: table_rows.append(table_row) - table_rows = assign_cells_to_columns(table_rows) + table_rows = assign_cells_to_columns(page, table_box, table_rows) return table_rows -def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: +def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> List[List[str]]: page_width = page.width table_rows = [] table_cell = "" @@ -90,6 +90,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: table_cell = char["char"] cell_bbox = char["bbox"] if len(table_row) > 0: + table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor)) table_rows.append(table_row) table_row = [] prev_char = True @@ -97,6 +98,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: if len(table_cell) > 0: table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell)))) if len(table_row) > 0: + table_row = sorted(table_row, key=lambda x: round(x[0][0] / round_factor)) table_rows.append(table_row) table_rows = assign_cells_to_columns(page, table_box, table_rows) @@ -139,11 +141,6 @@ def format_tables(pages: List[Page]): if len(table_rows) == 0: continue - max_row_len = max([len(r) for r in table_rows]) - for row in table_rows: - while len(row) < max_row_len: - row.append("") - table_text = tabulate(table_rows, headers="firstrow", tablefmt="github") table_block = Block( bbox=table_box, diff --git a/marker/tables/utils.py b/marker/tables/utils.py index b7efdabb..61b03403 100644 --- a/marker/tables/utils.py +++ b/marker/tables/utils.py @@ -8,7 +8,7 @@ def sort_table_blocks(blocks, tolerance=5): bbox = block.bbox else: bbox = block["bbox"] - group_key = round(bbox[1] / tolerance) * tolerance + group_key = round(bbox[1] / tolerance) if group_key not in vertical_groups: vertical_groups[group_key] = [] vertical_groups[group_key].append(block)