diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index a0eadb98..6ed650ef 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -38,7 +38,7 @@ def __init__(self, *, config: dict) -> None: self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config) self.pdf_tabby_reader = PdfTabbyReader(config=config) self.pdf_image_reader = PdfImageReader(config=config) - self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=config) + self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config) self.config = config self.logger = config.get("logger", logging.getLogger()) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index 6fd98130..5d3ac9a9 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -6,18 +6,21 @@ from dedoc.data_structures import LineWithMeta from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader +from dedoc.utils.pdf_utils import get_pdf_page_count PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"]) class TxtLayerDetector: - def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None: + def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None: self.config = config self.logger = config.get("logger", logging.getLogger()) self.txtlayer_classifier = TxtlayerClassifier(config=config) - self.pdf_tabby_reader = pdf_reader + self.pdf_txtlayer_reader = pdf_txtlayer_reader + self.pdf_tabby_reader = pdf_tabby_reader def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters: """ @@ -39,8 +42,16 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters: def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithMeta]: parameters_copy = deepcopy(parameters) - parameters_copy["pages"] = "1:10" - document = self.pdf_tabby_reader.read(path, parameters=parameters_copy) + parameters_copy["pages"] = "1:8" # two batches for pdf_txtlayer_reader + parameters_copy["need_pdf_table_analysis"] = "false" + num_pages = get_pdf_page_count(path) + if num_pages is None or num_pages >= 50: + # TODO remove this when TLDR-518 is done + document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy) + else: + # tabby reader reads the whole document regardless "pages" parameter + # still it's faster to use tabby for documents with <= 50 pages + document = self.pdf_tabby_reader.read(path, parameters=parameters_copy) return document.lines diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 41fb88c2..1b4befc6 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -105,21 +105,25 @@ def __extract(self, path: str, parameters: dict, warnings: list)\ page_count = math.inf if page_count is None else page_count first_page, last_page = get_param_page_slice(parameters) - if (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page): - return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata - - # in java tabby reader page numeration starts with 1, end_page is included - first_tabby_page = first_page + 1 if first_page is not None else 1 - last_tabby_page = None if last_page is not None and last_page > page_count else last_page - document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) - - if (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count): + empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page) + partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count) + if empty_page_limit or partial_page_limit: warnings.append("The document is partially parsed") document_metadata = dict(first_page=first_page) if last_page is not None: document_metadata["last_page"] = last_page - for page in document.get("pages", []): + if empty_page_limit: + return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata + + # in java tabby reader page numeration starts with 1, end_page is included + # first_tabby_page = first_page + 1 if first_page is not None else 1 + # last_tabby_page = None if last_page is not None and last_page > page_count else last_page + # document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518 + + document = self.__process_pdf(path=path) + pages = document.get("pages", []) + for page in pages[first_page:last_page]: page_lines = self.__get_lines_with_location(page, file_hash) if page_lines: all_lines.extend(page_lines) diff --git a/tests/api_tests/test_api_format_pdf_page_limit.py b/tests/api_tests/test_api_format_pdf_page_limit.py index 18a86e69..0ed01fb3 100644 --- a/tests/api_tests/test_api_format_pdf_page_limit.py +++ b/tests/api_tests/test_api_format_pdf_page_limit.py @@ -34,6 +34,7 @@ def test_auto_text_layer(self) -> None: def test_tabby_layer(self) -> None: self.__check_limit("tabby", check_partially=True) + self.__check_out_of_limit("tabby") def test_auto_tabby(self) -> None: self.__check_limit("auto_tabby", check_partially=True) diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 1d194988..c070f176 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -2,6 +2,7 @@ import unittest from typing import List +from dedoc.data_structures import AttachAnnotation from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation @@ -219,6 +220,7 @@ def test_pdf_annotations(self) -> None: self.assertIn(BoldAnnotation.name, annotation_names) self.assertIn(SpacingAnnotation.name, annotation_names) self.assertIn(BBoxAnnotation.name, annotation_names) + self.assertIn(AttachAnnotation.name, annotation_names) def test_tables_with_merged_cells(self) -> None: file_name = "big_table_with_merged_cells.pdf" diff --git a/tests/api_tests/test_api_misc_with_attachments.py b/tests/api_tests/test_api_misc_with_attachments.py index 5babe30f..66422a7d 100644 --- a/tests/api_tests/test_api_misc_with_attachments.py +++ b/tests/api_tests/test_api_misc_with_attachments.py @@ -50,8 +50,10 @@ def test_attachments_pmi_document(self) -> None: attachments = result["attachments"] - self.assertEqual(attachments[0]["metadata"]["file_type"], "application/json") - self.assertEqual(attachments[1]["metadata"]["file_type"], "application/json") + self.assertEqual(attachments[0]["metadata"]["file_type"], "image/png") + self.assertEqual(attachments[1]["metadata"]["file_type"], "image/png") + self.assertEqual(attachments[2]["metadata"]["file_type"], "application/json") + self.assertEqual(attachments[3]["metadata"]["file_type"], "application/json") def test_need_content_analysis(self) -> None: file_name = "pdf_with_text_layer/Document635.pdf" diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py index d1b90a4a..3d024033 100644 --- a/tests/api_tests/test_api_misc_with_images_refs.py +++ b/tests/api_tests/test_api_misc_with_images_refs.py @@ -5,7 +5,7 @@ class TestApiImageRefs(AbstractTestApiDocReader): - data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx") + data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "with_attachments") def test_docx_with_images(self) -> None: file_name = "docx_with_images.docx" @@ -58,6 +58,46 @@ def test_docx_with_images_from_mac(self) -> None: image_paragraph = content["subparagraphs"][5] self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.png"]) + def test_pdf_pdfminer_images_refs(self) -> None: + file_name = "with_attachments_1.docx.pdf" + result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="true")) + structure = result["content"]["structure"] + + attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]} + self.assertEqual(len(attachment_uids), 3) + + attach_annotation = structure["subparagraphs"][0]["annotations"][-1] + self.assertEqual(attach_annotation["name"], "attachment") + self.assertIn(attach_annotation["value"], attachment_uids) + + attach_annotation = structure["subparagraphs"][3]["annotations"][-2] + self.assertEqual(attach_annotation["name"], "attachment") + self.assertIn(attach_annotation["value"], attachment_uids) + + attach_annotation = structure["subparagraphs"][3]["annotations"][-1] + self.assertEqual(attach_annotation["name"], "attachment") + self.assertIn(attach_annotation["value"], attachment_uids) + + def test_pdf_tabby_images_refs(self) -> None: + file_name = "with_attachments_1.docx.pdf" + result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="tabby")) + structure = result["content"]["structure"] + + attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]} + self.assertEqual(len(attachment_uids), 3) + + attach_annotation = structure["subparagraphs"][2]["annotations"][-1] + self.assertEqual(attach_annotation["name"], "attachment") + self.assertIn(attach_annotation["value"], attachment_uids) + + attach_annotation = structure["subparagraphs"][4]["annotations"][-2] + self.assertEqual(attach_annotation["name"], "attachment") + self.assertIn(attach_annotation["value"], attachment_uids) + + attach_annotation = structure["subparagraphs"][4]["annotations"][-1] + self.assertEqual(attach_annotation["name"], "attachment") + self.assertIn(attach_annotation["value"], attachment_uids) + def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None: text = image_paragraph["text"] image_annotations = image_paragraph["annotations"] diff --git a/tests/data/docx/doc_with_images.docx b/tests/data/with_attachments/doc_with_images.docx similarity index 100% rename from tests/data/docx/doc_with_images.docx rename to tests/data/with_attachments/doc_with_images.docx diff --git a/tests/data/docx/docx_with_images.docx b/tests/data/with_attachments/docx_with_images.docx similarity index 100% rename from tests/data/docx/docx_with_images.docx rename to tests/data/with_attachments/docx_with_images.docx diff --git a/tests/data/docx/odt_with_images.odt b/tests/data/with_attachments/odt_with_images.odt similarity index 100% rename from tests/data/docx/odt_with_images.odt rename to tests/data/with_attachments/odt_with_images.odt diff --git a/tests/data/with_attachments/with_attachments_1.docx.pdf b/tests/data/with_attachments/with_attachments_1.docx.pdf new file mode 100644 index 00000000..026e4e0a Binary files /dev/null and b/tests/data/with_attachments/with_attachments_1.docx.pdf differ