Add attach and table annotations to PPTX

ispras · Dec 22, 2023 · 97ef286 · 97ef286
1 parent 75ba843
commit 97ef286
Show file tree

Hide file tree

Showing 7 changed files with 104 additions and 26 deletions.
diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -73,7 +73,7 @@ def __get_lines(self) -> List[LineWithMeta]:
                 self.__handle_table_xml(paragraph_xml, table_refs)
                 continue
 
-            if paragraph_xml.pict:  # diagrams are saved using docx_attachments_extractor
+            if self.attachment_name2uid and paragraph_xml.pict:  # diagrams are saved using docx_attachments_extractor
                 self.__handle_diagram_xml(paragraph_xml, diagram_refs)
                 continue
 
@@ -84,9 +84,11 @@ def __get_lines(self) -> List[LineWithMeta]:
                 continue
 
             self.paragraph_list.append(self.paragraph_maker.make_paragraph(paragraph_xml, self.paragraph_list))
-            images = paragraph_xml.find_all("pic:pic")
-            if images:
-                self.__handle_images_xml(images, image_refs)
+
+            if self.attachment_name2uid:
+                images = paragraph_xml.find_all("pic:pic")
+                if images:
+                    self.__handle_images_xml(images, image_refs)
 
         return self.__paragraphs2lines(image_refs, table_refs, diagram_refs)
 

diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py
@@ -35,7 +35,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
-        attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
+
+        with_attachments = self.attachment_extractor.with_attachments(parameters=parameters)
+        attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
 
         docx_document = DocxDocument(path=file_path, attachments=attachments, logger=self.logger)
         lines = self.__fix_lines(docx_document.lines)

diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py
@@ -1,12 +1,16 @@
-from typing import Optional
+from typing import Dict, List, Optional
 
+from bs4 import BeautifulSoup
 from pptx import Presentation
+from pptx.shapes.graphfrm import GraphicFrame
+from pptx.shapes.picture import Picture
+from pptx.slide import Slide
 
 from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
+from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation
 from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.data_structures.table import Table
 from dedoc.data_structures.table_metadata import TableMetadata
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.extensions import recognized_extensions, recognized_mimes
@@ -38,23 +42,60 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
+
+        with_attachments = self.attachments_extractor.with_attachments(parameters=parameters)
+        attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
+        attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
+
         prs = Presentation(file_path)
         lines, tables = [], []
 
         for page_id, slide in enumerate(prs.slides, start=1):
+            images_rels = self.__get_slide_images_rels(slide)
+
             for paragraph_id, shape in enumerate(slide.shapes, start=1):
 
                 if shape.has_text_frame:
-                    lines.append(LineWithMeta(line=shape.text, metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
+                    lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
 
                 if shape.has_table:
-                    cells = [
-                        [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells]
-                        for row in shape.table.rows
-                    ]
+                    self.__add_table(lines, tables, page_id, paragraph_id, shape)
 
-                    tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id)))
-
-        attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters)
+                if with_attachments and hasattr(shape, "image"):
+                    if len(lines) == 0:
+                        lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
+                    self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels)
 
         return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])
+
+    def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None:
+        """
+        Build a Table from the table stored in `shape`, append it to `tables` and reference it from the last line of `lines`.
+
+        :param lines: lines collected so far; an empty line is appended when the list is empty so that the annotation has a line to live on
+        :param tables: tables collected so far, the new table is appended to this list
+        :param page_id: slide number, stored in the metadata of the table, of its cells and of the empty line (if one is created)
+        :param paragraph_id: shape number inside the slide, used as line_id of the empty line (if one is created)
+        :param shape: shape whose `table.rows` are converted into cells
+        """
+        rows = []
+        for row in shape.table.rows:
+            row_cells = []
+            for cell in row.cells:
+                cell_line = LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))
+                row_cells.append(CellWithMeta(lines=[cell_line]))
+            rows.append(row_cells)
+        new_table = Table(cells=rows, metadata=TableMetadata(page_id=page_id))
+
+        if not lines:
+            lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
+        # the table is referenced from the most recent line; the annotation spans the whole line
+        anchor_line = lines[-1]
+        anchor_line.annotations.append(TableAnnotation(start=0, end=len(anchor_line), name=new_table.metadata.uid))
+        tables.append(new_table)
+
+    def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]:
+        """
+        Map relationship ids of the slide to the names of the media files they point to.
+
+        Only relationships whose "Target" starts with "../media/" are kept, the prefix is removed from the stored value.
+
+        :param slide: slide whose relationships xml (`slide.part.rels.xml`) is parsed
+        :return: dictionary {relationship "Id": target without the "../media/" prefix}
+        """
+        media_prefix = "../media/"
+        parsed_rels = BeautifulSoup(slide.part.rels.xml, "xml")
+
+        return {
+            rel["Id"]: rel["Target"][len(media_prefix):]
+            for rel in parsed_rels.find_all("Relationship")
+            if rel["Target"].startswith(media_prefix)
+        }
+
+    def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None:
+        """
+        Annotate `line` with the uid of the attachment that corresponds to the picture `shape`.
+
+        The uid is resolved in two steps: relationship id of the picture -> image name (`images_rels`) -> attachment uid (`attachment_name2uid`).
+        If any of the lookups raises KeyError, no annotation is added and a warning is logged.
+
+        :param line: line that receives the annotation, the annotation spans the whole line
+        :param shape: picture shape, its `element.blip_rId` is used as the key for `images_rels`
+        :param attachment_name2uid: dictionary {image name: attachment uid}
+        :param images_rels: dictionary {relationship id: image name}
+        """
+        try:
+            attachment_uid = attachment_name2uid[images_rels[shape.element.blip_rId]]
+            annotation = AttachAnnotation(start=0, end=len(line), attach_uid=attachment_uid)
+            line.annotations.append(annotation)
+        except KeyError as e:
+            self.logger.warning(f"Attachment key hasn't been found ({e})")
diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py
@@ -1,5 +1,6 @@
 import os
 
+from dedoc.data_structures import TableAnnotation
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
 
 
@@ -24,11 +25,15 @@ def test_odp(self) -> None:
 
     def __check_content(self, content: dict) -> None:
+        """
+        Check the parsed presentation content: texts of the first four subparagraphs, the first two rows of the first table
+        and the table annotation of the third subparagraph.
+
+        :param content: the "content" part of the API response, "structure" and "tables" keys are read
+        """
         subparagraphs = content["structure"]["subparagraphs"]
-        self.assertEqual("A long time ago in a galaxy far far away ", subparagraphs[0]["text"])
-        self.assertEqual("Example", subparagraphs[1]["text"])
-        self.assertEqual("Some author", subparagraphs[2]["text"])
-        self.assertEqual("This is simple table", subparagraphs[3]["text"])
-
-        table = content["tables"][0]["cells"]
-        self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table[0]))
-        self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table[1]))
+        # texts are compared after strip(), so leading/trailing whitespace (e.g. a trailing line break) is ignored
+        self.assertEqual("A long time ago in a galaxy far far away", subparagraphs[0]["text"].strip())
+        self.assertEqual("Example", subparagraphs[1]["text"].strip())
+        self.assertEqual("Some author", subparagraphs[2]["text"].strip())
+        self.assertEqual("This is simple table", subparagraphs[3]["text"].strip())
+
+        table = content["tables"][0]
+        self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table["cells"][0]))
+        self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table["cells"][1]))
+
+        # the third subparagraph must hold exactly one table annotation, and its value must be the uid of the first table
+        table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name]
+        self.assertEqual(1, len(table_annotations))
+        self.assertEqual(table_annotations[0]["value"], table["metadata"]["uid"])
diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py
@@ -1,5 +1,6 @@
 import os
 
+from dedoc.data_structures import AttachAnnotation
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
 
 
@@ -98,6 +99,22 @@ def test_pdf_tabby_images_refs(self) -> None:
         self.assertEqual(attach_annotation["name"], "attachment")
         self.assertIn(attach_annotation["value"], attachment_uids)
 
+    def test_pptx_images_refs(self) -> None:
+        """
+        Check that subparagraphs 1 and 3 of "with_attachments_1.pptx" each hold exactly one attach annotation
+        whose value is the uid of one of the five attachments returned for the file.
+        """
+        result = self._send_request("with_attachments_1.pptx", dict(with_attachments=True, structure_type="linear"))
+
+        attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
+        self.assertEqual(len(attachment_uids), 5)
+
+        subparagraphs = result["content"]["structure"]["subparagraphs"]
+        for paragraph_index in (1, 3):
+            attach_annotations = [ann for ann in subparagraphs[paragraph_index]["annotations"] if ann["name"] == AttachAnnotation.name]
+            self.assertEqual(len(attach_annotations), 1)
+            self.assertIn(attach_annotations[0]["value"], attachment_uids)
+
+
     def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
         text = image_paragraph["text"]
         image_annotations = image_paragraph["annotations"]

diff --git a/tests/unit_tests/test_format_docx_reader.py b/tests/unit_tests/test_format_docx_reader.py
@@ -245,15 +245,15 @@ def test_tables_with_merged_cells(self) -> None:
     def test_diagram_annotation(self) -> None:
         docx_reader = DocxReader(config=get_config())
         path = self._get_path("diagram_1.docx")
-        result = docx_reader.read(path)
+        result = docx_reader.read(path, parameters={"with_attachments": True})
 
         for annotation in result.lines[0].annotations:
             if annotation.name == "attachment":
                 self.assertEqual("dee352a576cf5ffd27ee1574d4dc4431", annotation.value)
             break
 
         path = self._get_path("diagram_2.docx")
-        result = docx_reader.read(path)
+        result = docx_reader.read(path, parameters={"with_attachments": True})
 
         for i in [0, 22]:
             annotation_found = False

diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py
@@ -7,7 +7,7 @@
 from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
 from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
 from dedoc.dedoc_manager import DedocManager
-from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader
+from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader, PptxReader
 from dedoc.readers.docx_reader.docx_reader import DocxReader
 from tests.test_utils import get_test_config
 
@@ -156,3 +156,14 @@ def test_attachments_extractor_attachments_dir(self) -> None:
                 attachment_fname = attachment.tmp_file_path.split("/")[-1]
                 self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
                 self.assertIn(attachment_fname, attachment_names)
+
+    def test_with_attachments_false(self) -> None:
+        """
+        Read a docx and a pptx file without the "with_attachments" parameter (only "attachments_dir" is passed)
+        and check that no attachments are returned and nothing is written to the attachments directory.
+        """
+        cases = [("with_attachments_0.docx", DocxReader()), ("with_attachments_1.pptx", PptxReader())]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            for file_name, reader in cases:
+                file_path = os.path.join(self.src_dir, file_name)
+                result = reader.read(file_path=file_path, parameters={"attachments_dir": tmpdir})
+                self.assertEqual(len(result.attachments), 0)
+                self.assertEqual(len(os.listdir(tmpdir)), 0)