Skip to content

Commit

Permalink
Fix txtlayer classification tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
NastyBoget committed Nov 13, 2023
1 parent e48fb17 commit 163332a
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, *, config: dict) -> None:
self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config)
self.pdf_tabby_reader = PdfTabbyReader(config=config)
self.pdf_image_reader = PdfImageReader(config=config)
self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config)
self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=config)

self.config = config
self.logger = config.get("logger", logging.getLogger())
Expand Down
19 changes: 4 additions & 15 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,18 @@
from dedoc.data_structures import LineWithMeta
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
from dedoc.utils.pdf_utils import get_pdf_page_count

PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"])


class TxtLayerDetector:

def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None:
def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
self.config = config
self.logger = config.get("logger", logging.getLogger())

self.txtlayer_classifier = TxtlayerClassifier(config=config)
self.pdf_txtlayer_reader = pdf_txtlayer_reader
self.pdf_tabby_reader = pdf_tabby_reader
self.pdf_tabby_reader = pdf_reader

def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
Expand All @@ -42,16 +39,8 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:

def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithMeta]:
parameters_copy = deepcopy(parameters)
parameters_copy["pages"] = "1:8" # two batches for pdf_txtlayer_reader
parameters_copy["need_pdf_table_analysis"] = "false"
num_pages = get_pdf_page_count(path)
if num_pages is None or num_pages >= 50:
# TODO remove this when TLDR-404 is done
document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
else:
# tabby reader reads the whole document regardless of the "pages" parameter;
# still, it's faster to use tabby for documents with fewer than 50 pages
document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)
parameters_copy["pages"] = "1:10"
document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)

return document.lines

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ def __extract(self, path: str, parameters: dict, warnings: list)\

# in java tabby reader page numeration starts with 1, end_page is included
first_tabby_page = 1 if first_page is None else first_page + 1
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_page)
last_tabby_page = None if last_page is not None and last_page > page_count else last_page
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)

if first_page > 0 or last_page is not None and last_page < page_count:
warnings.append("The document is partially parsed")
Expand Down

0 comments on commit 163332a

Please sign in to comment.