diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 152ea31f..49d402e8 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -73,7 +73,7 @@ def __get_lines(self) -> List[LineWithMeta]: self.__handle_table_xml(paragraph_xml, table_refs) continue - if paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor + if self.attachment_name2uid and paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor self.__handle_diagram_xml(paragraph_xml, diagram_refs) continue @@ -84,9 +84,11 @@ def __get_lines(self) -> List[LineWithMeta]: continue self.paragraph_list.append(self.paragraph_maker.make_paragraph(paragraph_xml, self.paragraph_list)) - images = paragraph_xml.find_all("pic:pic") - if images: - self.__handle_images_xml(images, image_refs) + + if self.attachment_name2uid: + images = paragraph_xml.find_all("pic:pic") + if images: + self.__handle_images_xml(images, image_refs) return self.__paragraphs2lines(image_refs, table_refs, diagram_refs) diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index e91163fb..1e503738 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -35,7 +35,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) + + with_attachments = self.attachment_extractor.with_attachments(parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] docx_document = DocxDocument(path=file_path, attachments=attachments, logger=self.logger) lines = self.__fix_lines(docx_document.lines) diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index e6340251..e387de46 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -1,12 +1,16 @@ -from typing import Optional +from typing import Dict, List, Optional +from bs4 import BeautifulSoup from pptx import Presentation +from pptx.shapes.graphfrm import GraphicFrame +from pptx.shapes.picture import Picture +from pptx.slide import Slide from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor +from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes @@ -38,23 +42,60 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters + + with_attachments = self.attachments_extractor.with_attachments(parameters=parameters) + attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] + attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} + prs = Presentation(file_path) lines, tables = [], [] for page_id, slide in enumerate(prs.slides, start=1): + images_rels = self.__get_slide_images_rels(slide) + for paragraph_id, shape in enumerate(slide.shapes, start=1): if shape.has_text_frame: - lines.append(LineWithMeta(line=shape.text, metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) + lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) if shape.has_table: - cells = [ - [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] - for row in shape.table.rows - ] + self.__add_table(lines, tables, page_id, paragraph_id, shape) - tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id))) - - attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) + if with_attachments and hasattr(shape, "image"): + if len(lines) == 0: + lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) + self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[]) + + def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None: + cells = [ + [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] + for row in shape.table.rows + ] + table = Table(cells=cells, metadata=TableMetadata(page_id=page_id)) + + if len(lines) == 0: + lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) + lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid)) + tables.append(table) + + def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]: + rels = BeautifulSoup(slide.part.rels.xml, "xml") + images_dir = "../media/" + + images_rels = dict() + for rel in rels.find_all("Relationship"): + if rel["Target"].startswith(images_dir): + images_rels[rel["Id"]] = rel["Target"][len(images_dir):] + + return images_rels + + def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None: + try: + image_rels_id = shape.element.blip_rId + image_name = images_rels[image_rels_id] + image_uid = attachment_name2uid[image_name] + line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=image_uid)) + except KeyError as e: + self.logger.warning(f"Attachment key hasn't been found ({e})") diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py index 6c6ec17f..b2df8351 100644 --- a/tests/api_tests/test_api_format_pptx.py +++ b/tests/api_tests/test_api_format_pptx.py @@ -1,5 +1,6 @@ import os +from dedoc.data_structures import TableAnnotation from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -24,11 +25,15 @@ def test_odp(self) -> None: def __check_content(self, content: dict) -> None: subparagraphs = content["structure"]["subparagraphs"] - self.assertEqual("A long time ago in a galaxy far far away ", subparagraphs[0]["text"]) - self.assertEqual("Example", subparagraphs[1]["text"]) - self.assertEqual("Some author", subparagraphs[2]["text"]) - self.assertEqual("This is simple table", subparagraphs[3]["text"]) - - table = content["tables"][0]["cells"] - self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table[0])) - self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table[1])) + self.assertEqual("A long time ago in a galaxy far far away", subparagraphs[0]["text"].strip()) + self.assertEqual("Example", subparagraphs[1]["text"].strip()) + self.assertEqual("Some author", subparagraphs[2]["text"].strip()) + self.assertEqual("This is simple table", subparagraphs[3]["text"].strip()) + + table = content["tables"][0] + self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table["cells"][0])) + self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table["cells"][1])) + + table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name] + self.assertEqual(1, len(table_annotations)) + self.assertEqual(table_annotations[0]["value"], table["metadata"]["uid"]) diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py index 3d024033..cf6526df 100644 --- a/tests/api_tests/test_api_misc_with_images_refs.py +++ b/tests/api_tests/test_api_misc_with_images_refs.py @@ -1,5 +1,6 @@ import os +from dedoc.data_structures import AttachAnnotation from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -98,6 +99,22 @@ def test_pdf_tabby_images_refs(self) -> None: self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) + def test_pptx_images_refs(self) -> None: + file_name = "with_attachments_1.pptx" + result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) + + attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]} + self.assertEqual(len(attachment_uids), 5) + + subparagraphs = result["content"]["structure"]["subparagraphs"] + attach_annotations = [ann for ann in subparagraphs[1]["annotations"] if ann["name"] == AttachAnnotation.name] + self.assertEqual(len(attach_annotations), 1) + self.assertIn(attach_annotations[0]["value"], attachment_uids) + + attach_annotations = [ann for ann in subparagraphs[3]["annotations"] if ann["name"] == AttachAnnotation.name] + self.assertEqual(len(attach_annotations), 1) + self.assertIn(attach_annotations[0]["value"], attachment_uids) + def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None: text = image_paragraph["text"] image_annotations = image_paragraph["annotations"] diff --git a/tests/unit_tests/test_format_docx_reader.py b/tests/unit_tests/test_format_docx_reader.py index 49355997..11cbf719 100644 --- a/tests/unit_tests/test_format_docx_reader.py +++ b/tests/unit_tests/test_format_docx_reader.py @@ -245,7 +245,7 @@ def test_tables_with_merged_cells(self) -> None: def test_diagram_annotation(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("diagram_1.docx") - result = docx_reader.read(path) + result = docx_reader.read(path, parameters={"with_attachments": True}) for annotation in result.lines[0].annotations: if annotation.name == "attachment": @@ -253,7 +253,7 @@ def test_diagram_annotation(self) -> None: break path = self._get_path("diagram_2.docx") - result = docx_reader.read(path) + result = docx_reader.read(path, parameters={"with_attachments": True}) for i in [0, 22]: annotation_found = False diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index f9506ce0..ac816cb6 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -7,7 +7,7 @@ from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor from dedoc.dedoc_manager import DedocManager -from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader +from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader, PptxReader from dedoc.readers.docx_reader.docx_reader import DocxReader from tests.test_utils import get_test_config @@ -156,3 +156,14 @@ def test_attachments_extractor_attachments_dir(self) -> None: attachment_fname = attachment.tmp_file_path.split("/")[-1] self.assertTrue(os.path.isfile(attachment.get_filename_in_path())) self.assertIn(attachment_fname, attachment_names) + + def test_with_attachments_false(self) -> None: + files = ["with_attachments_0.docx", "with_attachments_1.pptx"] + readers = [DocxReader(), PptxReader()] + + with tempfile.TemporaryDirectory() as tmpdir: + for file_name, reader in zip(files, readers): + params = {"attachments_dir": tmpdir} + result = reader.read(file_path=os.path.join(self.src_dir, file_name), parameters=params) + self.assertEqual(len(result.attachments), 0) + self.assertEqual(len(os.listdir(tmpdir)), 0)