Skip to content

Commit

Permalink
Add attach and table annotations to PPTX
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Dec 22, 2023
1 parent 75ba843 commit 97ef286
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 26 deletions.
10 changes: 6 additions & 4 deletions dedoc/readers/docx_reader/data_structures/docx_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def __get_lines(self) -> List[LineWithMeta]:
self.__handle_table_xml(paragraph_xml, table_refs)
continue

if paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor
if self.attachment_name2uid and paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor
self.__handle_diagram_xml(paragraph_xml, diagram_refs)
continue

Expand All @@ -84,9 +84,11 @@ def __get_lines(self) -> List[LineWithMeta]:
continue

self.paragraph_list.append(self.paragraph_maker.make_paragraph(paragraph_xml, self.paragraph_list))
images = paragraph_xml.find_all("pic:pic")
if images:
self.__handle_images_xml(images, image_refs)

if self.attachment_name2uid:
images = paragraph_xml.find_all("pic:pic")
if images:
self.__handle_images_xml(images, image_refs)

return self.__paragraphs2lines(image_refs, table_refs, diagram_refs)

Expand Down
4 changes: 3 additions & 1 deletion dedoc/readers/docx_reader/docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters)

with_attachments = self.attachment_extractor.with_attachments(parameters=parameters)
attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []

docx_document = DocxDocument(path=file_path, attachments=attachments, logger=self.logger)
lines = self.__fix_lines(docx_document.lines)
Expand Down
61 changes: 51 additions & 10 deletions dedoc/readers/pptx_reader/pptx_reader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from typing import Optional
from typing import Dict, List, Optional

from bs4 import BeautifulSoup
from pptx import Presentation
from pptx.shapes.graphfrm import GraphicFrame
from pptx.shapes.picture import Picture
from pptx.slide import Slide

from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_extensions, recognized_mimes
Expand Down Expand Up @@ -38,23 +42,60 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters

with_attachments = self.attachments_extractor.with_attachments(parameters=parameters)
attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}

prs = Presentation(file_path)
lines, tables = [], []

for page_id, slide in enumerate(prs.slides, start=1):
images_rels = self.__get_slide_images_rels(slide)

for paragraph_id, shape in enumerate(slide.shapes, start=1):

if shape.has_text_frame:
lines.append(LineWithMeta(line=shape.text, metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))

if shape.has_table:
cells = [
[CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells]
for row in shape.table.rows
]
self.__add_table(lines, tables, page_id, paragraph_id, shape)

tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id)))

attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters)
if with_attachments and hasattr(shape, "image"):
if len(lines) == 0:
lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels)

return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])

def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None:
    """
    Turn the table stored in a slide shape into a Table and reference it from the most recent line.

    Each cell of ``shape.table`` is wrapped into a CellWithMeta holding a single line with the cell text.
    A TableAnnotation with the uid of the new table, covering the whole last line of ``lines``, is attached to that line,
    and the table itself is appended to ``tables``.
    When ``lines`` is empty, an empty line is appended first so that the annotation has a line to be attached to.
    """
    table_cells = []
    for row in shape.table.rows:
        row_cells = []
        for cell in row.cells:
            cell_line = LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))
            row_cells.append(CellWithMeta(lines=[cell_line]))
        table_cells.append(row_cells)
    new_table = Table(cells=table_cells, metadata=TableMetadata(page_id=page_id))

    if not lines:
        # no text has been read yet: create a placeholder line that will carry the table reference
        placeholder_metadata = LineMetadata(page_id=page_id, line_id=paragraph_id)
        lines.append(LineWithMeta(line="", metadata=placeholder_metadata))

    anchor_line = lines[-1]
    anchor_line.annotations.append(TableAnnotation(start=0, end=len(anchor_line), name=new_table.metadata.uid))
    tables.append(new_table)

def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]:
    """
    Build a mapping from relationship id to image file name for one slide.

    The slide relationships XML is parsed, and only the relationships whose target starts with "../media/" are kept.
    The stored value is the target with this leading "../media/" part cut off.
    """
    media_prefix = "../media/"
    relationships = BeautifulSoup(slide.part.rels.xml, "xml").find_all("Relationship")

    return {
        relationship["Id"]: relationship["Target"][len(media_prefix):]
        for relationship in relationships
        if relationship["Target"].startswith(media_prefix)
    }

def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None:
    """
    Attach to ``line`` an AttachAnnotation that refers to the image of the given shape.

    The relationship id of the shape (``shape.element.blip_rId``) is looked up in ``images_rels`` to get the image name,
    and the image name is looked up in ``attachment_name2uid`` to get the attachment uid.
    The annotation covers the whole line. If a KeyError occurs on the way, a warning is logged and the line is left without the annotation.
    """
    try:
        attachment_name = images_rels[shape.element.blip_rId]
        attachment_uid = attachment_name2uid[attachment_name]
        annotation = AttachAnnotation(start=0, end=len(line), attach_uid=attachment_uid)
        line.annotations.append(annotation)
    except KeyError as missing_key:
        self.logger.warning(f"Attachment key hasn't been found ({missing_key})")
21 changes: 13 additions & 8 deletions tests/api_tests/test_api_format_pptx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

from dedoc.data_structures import TableAnnotation
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader


Expand All @@ -24,11 +25,15 @@ def test_odp(self) -> None:

def __check_content(self, content: dict) -> None:
subparagraphs = content["structure"]["subparagraphs"]
self.assertEqual("A long time ago in a galaxy far far away ", subparagraphs[0]["text"])
self.assertEqual("Example", subparagraphs[1]["text"])
self.assertEqual("Some author", subparagraphs[2]["text"])
self.assertEqual("This is simple table", subparagraphs[3]["text"])

table = content["tables"][0]["cells"]
self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table[0]))
self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table[1]))
self.assertEqual("A long time ago in a galaxy far far away", subparagraphs[0]["text"].strip())
self.assertEqual("Example", subparagraphs[1]["text"].strip())
self.assertEqual("Some author", subparagraphs[2]["text"].strip())
self.assertEqual("This is simple table", subparagraphs[3]["text"].strip())

table = content["tables"][0]
self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table["cells"][0]))
self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table["cells"][1]))

table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name]
self.assertEqual(1, len(table_annotations))
self.assertEqual(table_annotations[0]["value"], table["metadata"]["uid"])
17 changes: 17 additions & 0 deletions tests/api_tests/test_api_misc_with_images_refs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

from dedoc.data_structures import AttachAnnotation
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader


Expand Down Expand Up @@ -98,6 +99,22 @@ def test_pdf_tabby_images_refs(self) -> None:
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

def test_pptx_images_refs(self) -> None:
    """
    Check that a PPTX file processed with attachments yields 5 distinct attachment uids
    and that subparagraphs 1 and 3 each carry exactly one attachment annotation pointing to one of these uids.
    """
    request_parameters = dict(with_attachments=True, structure_type="linear")
    result = self._send_request("with_attachments_1.pptx", request_parameters)

    attachment_uids = set()
    for attachment in result["attachments"]:
        attachment_uids.add(attachment["metadata"]["uid"])
    self.assertEqual(len(attachment_uids), 5)

    subparagraphs = result["content"]["structure"]["subparagraphs"]
    for paragraph_index in (1, 3):
        paragraph_annotations = subparagraphs[paragraph_index]["annotations"]
        attach_annotations = [annotation for annotation in paragraph_annotations if annotation["name"] == AttachAnnotation.name]
        self.assertEqual(len(attach_annotations), 1)
        self.assertIn(attach_annotations[0]["value"], attachment_uids)

def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
text = image_paragraph["text"]
image_annotations = image_paragraph["annotations"]
Expand Down
4 changes: 2 additions & 2 deletions tests/unit_tests/test_format_docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,15 @@ def test_tables_with_merged_cells(self) -> None:
def test_diagram_annotation(self) -> None:
docx_reader = DocxReader(config=get_config())
path = self._get_path("diagram_1.docx")
result = docx_reader.read(path)
result = docx_reader.read(path, parameters={"with_attachments": True})

for annotation in result.lines[0].annotations:
if annotation.name == "attachment":
self.assertEqual("dee352a576cf5ffd27ee1574d4dc4431", annotation.value)
break

path = self._get_path("diagram_2.docx")
result = docx_reader.read(path)
result = docx_reader.read(path, parameters={"with_attachments": True})

for i in [0, 22]:
annotation_found = False
Expand Down
13 changes: 12 additions & 1 deletion tests/unit_tests/test_module_attachment_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.dedoc_manager import DedocManager
from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader
from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader, PptxReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from tests.test_utils import get_test_config

Expand Down Expand Up @@ -156,3 +156,14 @@ def test_attachments_extractor_attachments_dir(self) -> None:
attachment_fname = attachment.tmp_file_path.split("/")[-1]
self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
self.assertIn(attachment_fname, attachment_names)

def test_with_attachments_false(self) -> None:
    """
    Check that DOCX and PPTX readers called without the "with_attachments" parameter
    return no attachments and leave the given attachments directory empty.
    """
    file_names = ["with_attachments_0.docx", "with_attachments_1.pptx"]
    readers = [DocxReader(), PptxReader()]

    with tempfile.TemporaryDirectory() as attachments_dir:
        for file_name, reader in zip(file_names, readers):
            document_path = os.path.join(self.src_dir, file_name)
            # only the output directory is passed, attachments extraction itself is not requested
            document = reader.read(file_path=document_path, parameters={"attachments_dir": attachments_dir})
            self.assertEqual(len(document.attachments), 0)
            self.assertEqual(len(os.listdir(attachments_dir)), 0)

0 comments on commit 97ef286

Please sign in to comment.