Skip to content

Commit

Permalink
Detect dotted gridlines for tables
Browse files Browse the repository at this point in the history
Addresses #3539

We previously did not detect dotted lines used as table gridlines.

If one of width / height is LE edge_min_length and LT the other dimension, the rectangle is treated as a vertical / horizontal line.
We incorrectly used the dimension-specific snap values for this.

We no longer ignore rectangles when both width and height are smaller than edge_min_length; instead, we leave them to the snapping and joining algorithms that run later.
  • Loading branch information
JorjMcKie committed Jun 3, 2024
1 parent 6683a18 commit d5fe5bd
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 8 deletions.
18 changes: 11 additions & 7 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1946,6 +1946,7 @@ def make_edges(page, clip=None, tset=None, add_lines=None):
global EDGES
snap_x = tset.snap_x_tolerance
snap_y = tset.snap_y_tolerance
min_length = tset.edge_min_length
lines_strict = (
tset.vertical_strategy == "lines_strict"
or tset.horizontal_strategy == "lines_strict"
Expand Down Expand Up @@ -2117,13 +2118,14 @@ def make_line(p, p1, p2, clip):
if line_dict:
EDGES.append(line_to_edge(line_dict))

elif i[0] == "re": # a rectangle: decompose into 4 lines
rect = i[1].normalize() # rectangle itself
# ignore minute rectangles
if rect.height <= snap_y and rect.width <= snap_x:
continue
elif i[0] == "re":
# A rectangle: decompose into 4 lines, but filter out
# the ones that simulate a line
rect = i[1].normalize() # normalize the rectangle

if rect.width <= snap_x: # simulates a vertical line
if (
rect.width <= min_length and rect.width < rect.height
): # simulates a vertical line
x = abs(rect.x1 + rect.x0) / 2 # take middle value for x
p1 = Point(x, rect.y0)
p2 = Point(x, rect.y1)
Expand All @@ -2132,7 +2134,9 @@ def make_line(p, p1, p2, clip):
EDGES.append(line_to_edge(line_dict))
continue

if rect.height <= snap_y: # simulates a horizontal line
if (
rect.height <= min_length and rect.height < rect.width
): # simulates a horizontal line
y = abs(rect.y1 + rect.y0) / 2 # take middle value for y
p1 = Point(rect.x0, y)
p2 = Point(rect.x1, y)
Expand Down
Binary file added tests/resources/dotted-gridlines.pdf
Binary file not shown.
18 changes: 17 additions & 1 deletion tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,9 @@ def test_2979():
assert len(lengths) == 1

# test 3001
assert pymupdf.TOOLS.set_small_glyph_heights() is False, f'{pymupdf.TOOLS.set_small_glyph_heights()=}'
assert (
pymupdf.TOOLS.set_small_glyph_heights() is False
), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"


def test_3062():
Expand Down Expand Up @@ -292,3 +294,17 @@ def test_markdown():
"|Col15|Col25 Col26||\n\n"
)
assert tab.to_markdown() == text


def test_dotted_grid():
    """Confirm dotted lines are detected as gridlines.

    Opens the first page of ``resources/dotted-gridlines.pdf``, runs table
    detection and checks that exactly three tables with the expected
    (row_count, col_count) dimensions are found.
    """
    filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 3  # must be 3 tables
    t0, t1, t2 = tabs  # extract them
    # Check that they have the expected dimensions.
    # The left-hand tuple must be parenthesised: ``assert a, b == c`` is
    # parsed as ``assert a`` with ``b == c`` as the failure message, so it
    # would only verify that ``a`` is truthy and never compare dimensions.
    assert (t0.row_count, t0.col_count) == (11, 12)
    assert (t1.row_count, t1.col_count) == (25, 11)
    assert (t2.row_count, t2.col_count) == (1, 10)

0 comments on commit d5fe5bd

Please sign in to comment.