ispras · dronperminov · Nov 24, 2023 · Nov 23, 2023
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -46,7 +46,8 @@
     "first_page",
     "last_page",
     "need_binarization",
-    "table_type"
+    "table_type",
+    "attachments_dir"
 ])
 
 
@@ -75,6 +76,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         """
         parameters = {} if parameters is None else parameters
         first_page, last_page = param_utils.get_param_page_slice(parameters)
+        attachments_dir = parameters.get("attachments_dir", None)
+        attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir
+
         params_for_parse = ParametersForParseDoc(
             language=param_utils.get_param_language(parameters),
             orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
@@ -87,7 +91,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
             first_page=first_page,
             last_page=last_page,
             need_binarization=param_utils.get_param_need_binarization(parameters),
-            table_type=param_utils.get_param_table_type(parameters)
+            table_type=param_utils.get_param_table_type(parameters),
+            attachments_dir=attachments_dir
         )
 
         lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse)

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -2,6 +2,7 @@
 import logging
 import math
 import os
+import shutil
 import subprocess
 import uuid
 from typing import List, Optional, Tuple
@@ -35,7 +36,7 @@
 from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
 from dedoc.utils.parameter_utils import get_param_page_slice
 from dedoc.utils.pdf_utils import get_pdf_page_count
-from dedoc.utils.utils import calculate_file_hash
+from dedoc.utils.utils import calculate_file_hash, get_unique_name
 
 
 class PdfTabbyReader(PdfBaseReader):
@@ -100,6 +101,9 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
             -> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
         all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
         document_metadata = None
+        attachments_dir = parameters.get("attachments_dir", None)
+        attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir
+
         file_hash = calculate_file_hash(path=path)
         page_count = get_pdf_page_count(path)
         page_count = math.inf if page_count is None else page_count
@@ -133,7 +137,7 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
                 all_tables.extend(page_tables)
                 all_tables_on_images.extend(table_on_images)
 
-            attached_images = self.__get_attached_images(page=page)
+            attached_images = self.__get_attached_images(page=page, attachments_dir=attachments_dir)
             if attached_images:
                 all_attached_images.extend(attached_images)
 
@@ -180,16 +184,21 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
 
         return tables, tables_on_image
 
-    def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]:
+    def __get_attached_images(self, page: dict, attachments_dir: str) -> List[PdfImageAttachment]:
         image_attachment_list = []
         for image_dict in page["images"]:
             image_location = Location(
                 page_number=page["number"],
                 bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"])
             )
+
+            tmp_file_name = get_unique_name(image_dict["original_name"])
+            tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
+            shutil.move(image_dict["tmp_file_path"], tmp_file_path)
+
             image_attachment = PdfImageAttachment(
                 original_name=image_dict["original_name"],
-                tmp_file_path=image_dict["tmp_file_path"],
+                tmp_file_path=tmp_file_path,
                 need_content_analysis=False,
                 uid=f"attach_{uuid.uuid4()}",
                 location=image_location

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -57,7 +57,7 @@ def _process_one_page(self,
         else:
             tables = []
 
-        page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number)
+        page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, attachments_dir=parameters.attachments_dir)
         if page is None:
             return [], [], [], []
         unreadable_blocks = [location.bbox for table in tables for location in table.locations]

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -44,22 +44,22 @@ def __init__(self, *, config: dict) -> None:
         self.config = config
         self.logger = self.config.get("logger", logging.getLogger())
 
-    def extract_text_layer(self, path: str, page_number: int) -> Optional[PageWithBBox]:
+    def extract_text_layer(self, path: str, page_number: int, attachments_dir: str) -> Optional[PageWithBBox]:
         """
         Extract text information with metadata from pdf with help pdfminer.six
         :param path: path to pdf
         :param page_number: number of the page to read
+        :param attachments_dir: directory for saving attachments
         :return: pages_with_bbox - page with extracted text
         """
         with open(path, "rb") as fp:
             pages = PDFPage.get_pages(fp)
             for page_num, page in enumerate(pages):
                 if page_num != page_number:
                     continue
-                return self.__handle_page(page=page, page_number=page_number, path=path)
+                return self.__handle_page(page=page, page_number=page_number, path=path, attachments_dir=attachments_dir)
 
-    def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithBBox:
-        directory = os.path.dirname(path)
+    def __handle_page(self, page: PDFPage, page_number: int, path: str, attachments_dir: str) -> PageWithBBox:
         device, interpreter = self.__get_interpreter()
         try:
             interpreter.process_page(page)
@@ -95,7 +95,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB
                 lobjs_textline.append(lobj)
 
             elif isinstance(lobj, LTFigure) and not page_broken:
-                attachment = self.__extract_image(directory, height, image_page, k_h, k_w, lobj, page_number)
+                attachment = self.__extract_image(attachments_dir, height, image_page, k_h, k_w, lobj, page_number)
                 if attachment is not None:
                     images.append(attachment)
 

diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py
@@ -7,7 +7,7 @@
 from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
 from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
 from dedoc.dedoc_manager import DedocManager
-from dedoc.readers import ArchiveReader
+from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader
 from dedoc.readers.docx_reader.docx_reader import DocxReader
 from tests.test_utils import get_test_config
 
@@ -126,31 +126,28 @@ def test_manager_attachments_dir(self) -> None:
                     self.assertIn(attachment.metadata.temporary_file_name, attachment_names)
 
     def test_reader_attachments_dir(self) -> None:
-        file_name = "with_attachments_0.docx"
-        docx_reader = DocxReader(config=get_test_config())
+        file_name_reader_list = [
+            ("with_attachments_0.docx", DocxReader(config=get_test_config())),
+            ("with_attachments_1.docx.pdf", PdfTxtlayerReader(config=get_test_config())),
+            ("with_attachments_1.docx.pdf", PdfTabbyReader(config=get_test_config()))
+        ]
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            params = {
-                "with_attachments": True,
-                "attachments_dir": tmpdir
-            }
-            result = docx_reader.read(path=os.path.join(self.src_dir, file_name), parameters=params)
+        for file_name, reader in file_name_reader_list:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                result = reader.read(path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir))
 
-            attachment_names = os.listdir(tmpdir)
-            for attachment in result.attachments:
-                attachment_fname = attachment.tmp_file_path.split("/")[-1]
-                self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
-                self.assertIn(attachment_fname, attachment_names)
+                attachment_names = os.listdir(tmpdir)
+                for attachment in result.attachments:
+                    attachment_fname = attachment.tmp_file_path.split("/")[-1]
+                    self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
+                    self.assertIn(attachment_fname, attachment_names)
 
     def test_attachments_extractor_attachments_dir(self) -> None:
         file_name = "with_attachments_0.docx"
         docx_attachment_extractor = DocxAttachmentsExtractor()
 
         with tempfile.TemporaryDirectory() as tmpdir:
-            params = {
-                "with_attachments": True,
-                "attachments_dir": tmpdir
-            }
+            params = {"with_attachments": True, "attachments_dir": tmpdir}
             result = docx_attachment_extractor.get_attachments(tmpdir=self.src_dir, filename=file_name, parameters=params)
 
             attachment_names = os.listdir(tmpdir)