diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index a5a43d4d..66c2be25 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -46,7 +46,8 @@ "first_page", "last_page", "need_binarization", - "table_type" + "table_type", + "attachments_dir" ]) @@ -75,6 +76,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio """ parameters = {} if parameters is None else parameters first_page, last_page = param_utils.get_param_page_slice(parameters) + attachments_dir = parameters.get("attachments_dir", None) + attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters), @@ -87,7 +91,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio first_page=first_page, last_page=last_page, need_binarization=param_utils.get_param_need_binarization(parameters), - table_type=param_utils.get_param_table_type(parameters) + table_type=param_utils.get_param_table_type(parameters), + attachments_dir=attachments_dir ) lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 972cc37c..c204bf48 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -2,6 +2,7 @@ import logging import math import os +import shutil import subprocess import uuid from typing import List, Optional, Tuple @@ -35,7 +36,7 @@ from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth from dedoc.utils.parameter_utils import get_param_page_slice from dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import calculate_file_hash +from dedoc.utils.utils import calculate_file_hash, get_unique_name class PdfTabbyReader(PdfBaseReader): @@ -100,6 +101,9 @@ def __extract(self, path: str, parameters: dict, warnings: list)\ -> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]: all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], [] document_metadata = None + attachments_dir = parameters.get("attachments_dir", None) + attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + file_hash = calculate_file_hash(path=path) page_count = get_pdf_page_count(path) page_count = math.inf if page_count is None else page_count @@ -133,7 +137,7 @@ def __extract(self, path: str, parameters: dict, warnings: list)\ all_tables.extend(page_tables) all_tables_on_images.extend(table_on_images) - attached_images = self.__get_attached_images(page=page) + attached_images = self.__get_attached_images(page=page, attachments_dir=attachments_dir) if attached_images: all_attached_images.extend(attached_images) @@ -180,16 +184,21 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: return tables, tables_on_image - def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]: + def __get_attached_images(self, page: dict, attachments_dir: str) -> List[PdfImageAttachment]: image_attachment_list = [] for image_dict in page["images"]: image_location = Location( page_number=page["number"], bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"]) ) + + tmp_file_name = get_unique_name(image_dict["original_name"]) + tmp_file_path = os.path.join(attachments_dir, tmp_file_name) + shutil.move(image_dict["tmp_file_path"], tmp_file_path) + image_attachment = PdfImageAttachment( original_name=image_dict["original_name"], - tmp_file_path=image_dict["tmp_file_path"], + tmp_file_path=tmp_file_path, need_content_analysis=False, uid=f"attach_{uuid.uuid4()}", location=image_location diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index a8702768..16a49ca4 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -57,7 +57,7 @@ def _process_one_page(self, else: tables = [] - page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number) + page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, attachments_dir=parameters.attachments_dir) if page is None: return [], [], [], [] unreadable_blocks = [location.bbox for table in tables for location in table.locations] diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index 1d17f85f..c9cda801 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -44,11 +44,12 @@ def __init__(self, *, config: dict) -> None: self.config = config self.logger = self.config.get("logger", logging.getLogger()) - def extract_text_layer(self, path: str, page_number: int) -> Optional[PageWithBBox]: + def extract_text_layer(self, path: str, page_number: int, attachments_dir: str) -> Optional[PageWithBBox]: """ Extract text information with metadata from pdf with help pdfminer.six :param path: path to pdf :param page_number: number of the page to read + :param attachments_dir: directory for saving attachments :return: pages_with_bbox - page with extracted text """ with open(path, "rb") as fp: @@ -56,10 +57,9 @@ def extract_text_layer(self, path: str, page_number: int) -> Optional[PageWithBB for page_num, page in enumerate(pages): if page_num != page_number: continue - return self.__handle_page(page=page, page_number=page_number, path=path) + return self.__handle_page(page=page, page_number=page_number, path=path, attachments_dir=attachments_dir) - def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithBBox: - directory = os.path.dirname(path) + def __handle_page(self, page: PDFPage, page_number: int, path: str, attachments_dir: str) -> PageWithBBox: device, interpreter = self.__get_interpreter() try: interpreter.process_page(page) @@ -95,7 +95,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB lobjs_textline.append(lobj) elif isinstance(lobj, LTFigure) and not page_broken: - attachment = self.__extract_image(directory, height, image_page, k_h, k_w, lobj, page_number) + attachment = self.__extract_image(attachments_dir, height, image_page, k_h, k_w, lobj, page_number) if attachment is not None: images.append(attachment) diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index fa596819..eb79ebce 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -7,7 +7,7 @@ from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor from dedoc.dedoc_manager import DedocManager -from dedoc.readers import ArchiveReader +from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader from dedoc.readers.docx_reader.docx_reader import DocxReader from tests.test_utils import get_test_config @@ -126,31 +126,28 @@ def test_manager_attachments_dir(self) -> None: self.assertIn(attachment.metadata.temporary_file_name, attachment_names) def test_reader_attachments_dir(self) -> None: - file_name = "with_attachments_0.docx" - docx_reader = DocxReader(config=get_test_config()) + file_name_reader_list = [ + ("with_attachments_0.docx", DocxReader(config=get_test_config())), + ("with_attachments_1.docx.pdf", PdfTxtlayerReader(config=get_test_config())), + ("with_attachments_1.docx.pdf", PdfTabbyReader(config=get_test_config())) + ] - with tempfile.TemporaryDirectory() as tmpdir: - params = { - "with_attachments": True, - "attachments_dir": tmpdir - } - result = docx_reader.read(path=os.path.join(self.src_dir, file_name), parameters=params) + for file_name, reader in file_name_reader_list: + with tempfile.TemporaryDirectory() as tmpdir: + result = reader.read(path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir)) - attachment_names = os.listdir(tmpdir) - for attachment in result.attachments: - attachment_fname = attachment.tmp_file_path.split("/")[-1] - self.assertTrue(os.path.isfile(attachment.get_filename_in_path())) - self.assertIn(attachment_fname, attachment_names) + attachment_names = os.listdir(tmpdir) + for attachment in result.attachments: + attachment_fname = attachment.tmp_file_path.split("/")[-1] + self.assertTrue(os.path.isfile(attachment.get_filename_in_path())) + self.assertIn(attachment_fname, attachment_names) def test_attachments_extractor_attachments_dir(self) -> None: file_name = "with_attachments_0.docx" docx_attachment_extractor = DocxAttachmentsExtractor() with tempfile.TemporaryDirectory() as tmpdir: - params = { - "with_attachments": True, - "attachments_dir": tmpdir - } + params = {"with_attachments": True, "attachments_dir": tmpdir} result = docx_attachment_extractor.get_attachments(tmpdir=self.src_dir, filename=file_name, parameters=params) attachment_names = os.listdir(tmpdir)