diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index c91cc779..e2c3ad37 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -36,7 +36,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.pdf_txtlayer_reader = PdfTxtlayerReader(config=self.config) self.pdf_tabby_reader = PdfTabbyReader(config=self.config) self.pdf_image_reader = PdfImageReader(config=self.config) - self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=self.config) + self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=self.config) def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index 5d3ac9a9..cfff918d 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -6,21 +6,18 @@ from dedoc.data_structures import LineWithMeta from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from dedoc.utils.pdf_utils import get_pdf_page_count PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"]) class TxtLayerDetector: - def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None: + def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None: self.config = config self.logger = config.get("logger", logging.getLogger()) self.txtlayer_classifier = TxtlayerClassifier(config=config) - self.pdf_txtlayer_reader = pdf_txtlayer_reader - self.pdf_tabby_reader = pdf_tabby_reader + self.pdf_reader = pdf_reader def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters: """ @@ -44,15 +41,7 @@ def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithM parameters_copy = deepcopy(parameters) parameters_copy["pages"] = "1:8" # two batches for pdf_txtlayer_reader parameters_copy["need_pdf_table_analysis"] = "false" - num_pages = get_pdf_page_count(path) - if num_pages is None or num_pages >= 50: - # TODO remove this when TLDR-518 is done - document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy) - else: - # tabby reader reads the whole document regardless "pages" parameter - # still it's faster to use tabby for documents with <= 50 pages - document = self.pdf_tabby_reader.read(path, parameters=parameters_copy) - + document = self.pdf_reader.read(path, parameters=parameters_copy) return document.lines def __is_first_page_correct(self, lines: List[LineWithMeta], is_txt_layer_correct: bool) -> bool: