Skip to content

Commit

Permalink
remove PdfTxtlayerReader from TxtLayerDetector (#395)
Browse files Browse the repository at this point in the history
  • Loading branch information
dronperminov authored Jan 9, 2024
1 parent f47b02f commit 9ef562b
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
self.pdf_txtlayer_reader = PdfTxtlayerReader(config=self.config)
self.pdf_tabby_reader = PdfTabbyReader(config=self.config)
self.pdf_image_reader = PdfImageReader(config=self.config)
self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=self.config)
self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=self.config)

def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
17 changes: 3 additions & 14 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,18 @@
from dedoc.data_structures import LineWithMeta
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
from dedoc.utils.pdf_utils import get_pdf_page_count

PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"])


class TxtLayerDetector:

def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None:
def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
self.config = config
self.logger = config.get("logger", logging.getLogger())

self.txtlayer_classifier = TxtlayerClassifier(config=config)
self.pdf_txtlayer_reader = pdf_txtlayer_reader
self.pdf_tabby_reader = pdf_tabby_reader
self.pdf_reader = pdf_reader

def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
Expand All @@ -44,15 +41,7 @@ def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithM
parameters_copy = deepcopy(parameters)
parameters_copy["pages"] = "1:8" # two batches for pdf_txtlayer_reader
parameters_copy["need_pdf_table_analysis"] = "false"
num_pages = get_pdf_page_count(path)
if num_pages is None or num_pages >= 50:
# TODO remove this when TLDR-518 is done
document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
else:
# tabby reader reads the whole document regardless of the "pages" parameter
# still, it's faster to use tabby for documents with < 50 pages
document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)

document = self.pdf_reader.read(path, parameters=parameters_copy)
return document.lines

def __is_first_page_correct(self, lines: List[LineWithMeta], is_txt_layer_correct: bool) -> bool:
Expand Down

0 comments on commit 9ef562b

Please sign in to comment.