Skip to content

Commit

Permalink
Merge pull request #22 from magdaaniol/fix/pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
ines authored Dec 24, 2024
2 parents b90cbb1 + 42fa2f2 commit 4168b64
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 1 deletion.
2 changes: 1 addition & 1 deletion spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def _result_to_doc(self, document: DoclingDocument) -> Doc:
inputs = []
pages = {
(page.page_no): PageLayout(
page_no=page.page_no + 1,
page_no=page.page_no,
width=page.size.width if page.size else 0,
height=page.size.height if page.size else 0,
)
Expand Down
16 changes: 16 additions & 0 deletions tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,22 @@ def test_general(path, nlp, span_labels):
assert span.label_ in span_labels
assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout)

@pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)])
def test_pages(path, pg_no, nlp):
    """Check the page count, first page number and first-page span count.

    Runs the layout pipeline on ``path`` and inspects what ``get_pages``
    returns: the number of entries must equal ``pg_no``, the first entry's
    page layout must report ``page_no == 1``, and the number of spans
    attached to that first entry must match the count known for the fixture.
    """
    # Known span counts on the first page, keyed by the fixture's page total.
    first_page_span_counts = {6: 18, 1: 4}

    pipeline = spaCyLayout(nlp)
    processed = pipeline(path)
    # Regression guard: `get_pages` must not raise a KeyError while looking
    # up entries in the `pages` dict. A KeyError here would mean the page
    # numbers on the document layout and on the span layouts disagree.
    pages = pipeline.get_pages(processed)
    assert len(pages) == pg_no

    first_page = pages[0]
    assert first_page[0].page_no == 1

    expected_spans = first_page_span_counts.get(pg_no)
    if expected_spans is not None:
        assert len(first_page[1]) == expected_spans


@pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE])
@pytest.mark.parametrize("separator", ["\n\n", ""])
Expand Down

0 comments on commit 4168b64

Please sign in to comment.