Skip to content

Commit

Permalink
TLDR-533 extract images from PDF to attachments_dir
Browse files (browse the repository at this point in the history)
  • Loading branch information
NastyBoget committed Nov 23, 2023
1 parent 2ee1485 commit 76e53eb
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 30 deletions.
9 changes: 7 additions & 2 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
"first_page",
"last_page",
"need_binarization",
"table_type"
"table_type",
"attachments_dir"
])


Expand Down Expand Up @@ -75,6 +76,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
"""
parameters = {} if parameters is None else parameters
first_page, last_page = param_utils.get_param_page_slice(parameters)
attachments_dir = parameters.get("attachments_dir", None)
attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir

params_for_parse = ParametersForParseDoc(
language=param_utils.get_param_language(parameters),
orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
Expand All @@ -87,7 +91,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
first_page=first_page,
last_page=last_page,
need_binarization=param_utils.get_param_need_binarization(parameters),
table_type=param_utils.get_param_table_type(parameters)
table_type=param_utils.get_param_table_type(parameters),
attachments_dir=attachments_dir
)

lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse)
Expand Down
17 changes: 13 additions & 4 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import math
import os
import shutil
import subprocess
import uuid
from typing import List, Optional, Tuple
Expand Down Expand Up @@ -35,7 +36,7 @@
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash
from dedoc.utils.utils import calculate_file_hash, get_unique_name


class PdfTabbyReader(PdfBaseReader):
Expand Down Expand Up @@ -100,6 +101,9 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
document_metadata = None
attachments_dir = parameters.get("attachments_dir", None)
attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir

file_hash = calculate_file_hash(path=path)
page_count = get_pdf_page_count(path)
page_count = math.inf if page_count is None else page_count
Expand Down Expand Up @@ -133,7 +137,7 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)

attached_images = self.__get_attached_images(page=page)
attached_images = self.__get_attached_images(page=page, attachments_dir=attachments_dir)
if attached_images:
all_attached_images.extend(attached_images)

Expand Down Expand Up @@ -180,16 +184,21 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:

return tables, tables_on_image

def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]:
def __get_attached_images(self, page: dict, attachments_dir: str) -> List[PdfImageAttachment]:
image_attachment_list = []
for image_dict in page["images"]:
image_location = Location(
page_number=page["number"],
bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"])
)

tmp_file_name = get_unique_name(image_dict["original_name"])
tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
shutil.move(image_dict["tmp_file_path"], tmp_file_path)

image_attachment = PdfImageAttachment(
original_name=image_dict["original_name"],
tmp_file_path=image_dict["tmp_file_path"],
tmp_file_path=tmp_file_path,
need_content_analysis=False,
uid=f"attach_{uuid.uuid4()}",
location=image_location
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _process_one_page(self,
else:
tables = []

page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number)
page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, attachments_dir=parameters.attachments_dir)
if page is None:
return [], [], [], []
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,22 @@ def __init__(self, *, config: dict) -> None:
self.config = config
self.logger = self.config.get("logger", logging.getLogger())

def extract_text_layer(self, path: str, page_number: int) -> Optional[PageWithBBox]:
def extract_text_layer(self, path: str, page_number: int, attachments_dir: str) -> Optional[PageWithBBox]:
"""
Extract text information with metadata from a PDF with the help of pdfminer.six
:param path: path to pdf
:param page_number: number of the page to read
:param attachments_dir: directory for saving attachments
:return: pages_with_bbox - page with extracted text
"""
with open(path, "rb") as fp:
pages = PDFPage.get_pages(fp)
for page_num, page in enumerate(pages):
if page_num != page_number:
continue
return self.__handle_page(page=page, page_number=page_number, path=path)
return self.__handle_page(page=page, page_number=page_number, path=path, attachments_dir=attachments_dir)

def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithBBox:
directory = os.path.dirname(path)
def __handle_page(self, page: PDFPage, page_number: int, path: str, attachments_dir: str) -> PageWithBBox:
device, interpreter = self.__get_interpreter()
try:
interpreter.process_page(page)
Expand Down Expand Up @@ -95,7 +95,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB
lobjs_textline.append(lobj)

elif isinstance(lobj, LTFigure) and not page_broken:
attachment = self.__extract_image(directory, height, image_page, k_h, k_w, lobj, page_number)
attachment = self.__extract_image(attachments_dir, height, image_page, k_h, k_w, lobj, page_number)
if attachment is not None:
images.append(attachment)

Expand Down
33 changes: 15 additions & 18 deletions tests/unit_tests/test_module_attachment_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.dedoc_manager import DedocManager
from dedoc.readers import ArchiveReader
from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from tests.test_utils import get_test_config

Expand Down Expand Up @@ -126,31 +126,28 @@ def test_manager_attachments_dir(self) -> None:
self.assertIn(attachment.metadata.temporary_file_name, attachment_names)

def test_reader_attachments_dir(self) -> None:
file_name = "with_attachments_0.docx"
docx_reader = DocxReader(config=get_test_config())
file_name_reader_list = [
("with_attachments_0.docx", DocxReader(config=get_test_config())),
("with_attachments_1.docx.pdf", PdfTxtlayerReader(config=get_test_config())),
("with_attachments_1.docx.pdf", PdfTabbyReader(config=get_test_config()))
]

with tempfile.TemporaryDirectory() as tmpdir:
params = {
"with_attachments": True,
"attachments_dir": tmpdir
}
result = docx_reader.read(path=os.path.join(self.src_dir, file_name), parameters=params)
for file_name, reader in file_name_reader_list:
with tempfile.TemporaryDirectory() as tmpdir:
result = reader.read(path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir))

attachment_names = os.listdir(tmpdir)
for attachment in result.attachments:
attachment_fname = attachment.tmp_file_path.split("/")[-1]
self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
self.assertIn(attachment_fname, attachment_names)
attachment_names = os.listdir(tmpdir)
for attachment in result.attachments:
attachment_fname = attachment.tmp_file_path.split("/")[-1]
self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
self.assertIn(attachment_fname, attachment_names)

def test_attachments_extractor_attachments_dir(self) -> None:
file_name = "with_attachments_0.docx"
docx_attachment_extractor = DocxAttachmentsExtractor()

with tempfile.TemporaryDirectory() as tmpdir:
params = {
"with_attachments": True,
"attachments_dir": tmpdir
}
params = {"with_attachments": True, "attachments_dir": tmpdir}
result = docx_attachment_extractor.get_attachments(tmpdir=self.src_dir, filename=file_name, parameters=params)

attachment_names = os.listdir(tmpdir)
Expand Down

0 comments on commit 76e53eb

Please sign in to comment.