diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
index 6ed650ef..a0eadb98 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -38,7 +38,7 @@ def __init__(self, *, config: dict) -> None:
         self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config)
         self.pdf_tabby_reader = PdfTabbyReader(config=config)
         self.pdf_image_reader = PdfImageReader(config=config)
-        self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config)
+        self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=config)
         self.config = config
         self.logger = config.get("logger", logging.getLogger())
 
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
index bbdf6f3e..6fd98130 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -6,21 +6,18 @@
 from dedoc.data_structures import LineWithMeta
 from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
-from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
-from dedoc.utils.pdf_utils import get_pdf_page_count
 
 
 PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"])
 
 
 class TxtLayerDetector:
 
-    def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None:
+    def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
         self.config = config
         self.logger = config.get("logger", logging.getLogger())
         self.txtlayer_classifier = TxtlayerClassifier(config=config)
-        self.pdf_txtlayer_reader = pdf_txtlayer_reader
-        self.pdf_tabby_reader = pdf_tabby_reader
+        self.pdf_tabby_reader = pdf_reader
 
     def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
         """
@@ -42,16 +39,8 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
 
     def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithMeta]:
         parameters_copy = deepcopy(parameters)
-        parameters_copy["pages"] = "1:8"  # two batches for pdf_txtlayer_reader
-        parameters_copy["need_pdf_table_analysis"] = "false"
-        num_pages = get_pdf_page_count(path)
-        if num_pages is None or num_pages >= 50:
-            # TODO remove this when TLDR-404 is done
-            document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
-        else:
-            # tabby reader reads the whole document regardless "pages" parameter
-            # still it's faster to use tabby for documents with <= 50 pages
-            document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)
+        parameters_copy["pages"] = "1:10"
+        document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)
 
         return document.lines
 
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
index 6e913e2a..0ee47bed 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -110,7 +110,8 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
 
         # in java tabby reader page numeration starts with 1, end_page is included
         first_tabby_page = 1 if first_page is None else first_page + 1
-        document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_page)
+        last_tabby_page = None if last_page is not None and last_page > page_count else last_page
+        document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)
 
         if first_page > 0 or last_page is not None and last_page < page_count:
             warnings.append("The document is partially parsed")