Skip to content

Commit

Permalink
TLDR-518: Fix partial reading in tabby (#372)
Browse files Browse the repository at this point in the history
* Fix partial reading in tabby

* Add more tests

* Fix tabby page slice parameters

* Fix table extraction in tabby with the page range parameter

---------

Co-authored-by: Nasty <[email protected]>
  • Loading branch information
sunveil and NastyBoget authored Nov 21, 2023
1 parent fa396ef commit 9c9c50e
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 6 deletions.
10 changes: 5 additions & 5 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

# in java tabby reader page numeration starts with 1, end_page is included
# first_tabby_page = first_page + 1 if first_page is not None else 1
# last_tabby_page = None if last_page is not None and last_page > page_count else last_page
# document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518
first_tabby_page = first_page + 1 if first_page is not None else 1
last_tabby_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page
self.logger.info(f"Reading PDF pages from {first_tabby_page} to {last_tabby_page}")
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)

document = self.__process_pdf(path=path)
pages = document.get("pages", [])
for page in pages[first_page:last_page]:
for page in pages:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
Expand Down
Binary file not shown.
14 changes: 13 additions & 1 deletion tests/api_tests/test_api_format_pdf_page_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_auto_tabby(self) -> None:

def __check_out_of_limit(self, reader: str) -> None:
text_expected = ""
for pages in ("10:11", ):
for pages in (":-1", "-1:0", "0:0", "10:11", "11:"):
self.__check(pages, text_expected, reader=reader)

def __check_limit(self, reader: str, check_partially: bool = False) -> None:
Expand All @@ -56,6 +56,18 @@ def __check_limit(self, reader: str, check_partially: bool = False) -> None:
text_expected = "\n".join(self.lines[0:2])
self.__check("1:2", text_expected, reader=reader, check_partially=check_partially)

text_expected = self.lines[0]
self.__check("1:1", text_expected, reader=reader, check_partially=check_partially)

text_expected = self.lines[1]
self.__check("2:2", text_expected, reader=reader, check_partially=check_partially)

text_expected = "\n".join(self.lines[1:3])
self.__check("2:3", text_expected, reader=reader, check_partially=check_partially)

text_expected = "\n".join(self.lines[4:8])
self.__check("5:8", text_expected, reader=reader, check_partially=check_partially)

text_expected = self.lines[8]
self.__check("9:", text_expected, reader=reader, check_partially=False)

Expand Down

0 comments on commit 9c9c50e

Please sign in to comment.