Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-533 extract images from PDF to attachments_dir #374

Merged
merged 1 commit into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
"first_page",
"last_page",
"need_binarization",
"table_type"
"table_type",
"attachments_dir"
])


Expand Down Expand Up @@ -75,6 +76,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
"""
parameters = {} if parameters is None else parameters
first_page, last_page = param_utils.get_param_page_slice(parameters)
attachments_dir = parameters.get("attachments_dir", None)
attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir

params_for_parse = ParametersForParseDoc(
language=param_utils.get_param_language(parameters),
orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
Expand All @@ -87,7 +91,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
first_page=first_page,
last_page=last_page,
need_binarization=param_utils.get_param_need_binarization(parameters),
table_type=param_utils.get_param_table_type(parameters)
table_type=param_utils.get_param_table_type(parameters),
attachments_dir=attachments_dir
)

lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse)
Expand Down
17 changes: 13 additions & 4 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import math
import os
import shutil
import subprocess
import uuid
from typing import List, Optional, Tuple
Expand Down Expand Up @@ -35,7 +36,7 @@
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash
from dedoc.utils.utils import calculate_file_hash, get_unique_name


class PdfTabbyReader(PdfBaseReader):
Expand Down Expand Up @@ -100,6 +101,9 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
document_metadata = None
attachments_dir = parameters.get("attachments_dir", None)
attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir

file_hash = calculate_file_hash(path=path)
page_count = get_pdf_page_count(path)
page_count = math.inf if page_count is None else page_count
Expand Down Expand Up @@ -133,7 +137,7 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)

attached_images = self.__get_attached_images(page=page)
attached_images = self.__get_attached_images(page=page, attachments_dir=attachments_dir)
if attached_images:
all_attached_images.extend(attached_images)

Expand Down Expand Up @@ -180,16 +184,21 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:

return tables, tables_on_image

def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]:
def __get_attached_images(self, page: dict, attachments_dir: str) -> List[PdfImageAttachment]:
image_attachment_list = []
for image_dict in page["images"]:
image_location = Location(
page_number=page["number"],
bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"])
)

tmp_file_name = get_unique_name(image_dict["original_name"])
tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
shutil.move(image_dict["tmp_file_path"], tmp_file_path)

image_attachment = PdfImageAttachment(
original_name=image_dict["original_name"],
tmp_file_path=image_dict["tmp_file_path"],
tmp_file_path=tmp_file_path,
need_content_analysis=False,
uid=f"attach_{uuid.uuid4()}",
location=image_location
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _process_one_page(self,
else:
tables = []

page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number)
page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, attachments_dir=parameters.attachments_dir)
if page is None:
return [], [], [], []
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,22 @@ def __init__(self, *, config: dict) -> None:
self.config = config
self.logger = self.config.get("logger", logging.getLogger())

def extract_text_layer(self, path: str, page_number: int) -> Optional[PageWithBBox]:
def extract_text_layer(self, path: str, page_number: int, attachments_dir: str) -> Optional[PageWithBBox]:
"""
Extract text information with metadata from a PDF with the help of pdfminer.six
:param path: path to pdf
:param page_number: number of the page to read
:param attachments_dir: directory for saving attachments
:return: pages_with_bbox - page with extracted text
"""
with open(path, "rb") as fp:
pages = PDFPage.get_pages(fp)
for page_num, page in enumerate(pages):
if page_num != page_number:
continue
return self.__handle_page(page=page, page_number=page_number, path=path)
return self.__handle_page(page=page, page_number=page_number, path=path, attachments_dir=attachments_dir)

def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithBBox:
directory = os.path.dirname(path)
def __handle_page(self, page: PDFPage, page_number: int, path: str, attachments_dir: str) -> PageWithBBox:
device, interpreter = self.__get_interpreter()
try:
interpreter.process_page(page)
Expand Down Expand Up @@ -95,7 +95,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB
lobjs_textline.append(lobj)

elif isinstance(lobj, LTFigure) and not page_broken:
attachment = self.__extract_image(directory, height, image_page, k_h, k_w, lobj, page_number)
attachment = self.__extract_image(attachments_dir, height, image_page, k_h, k_w, lobj, page_number)
if attachment is not None:
images.append(attachment)

Expand Down
33 changes: 15 additions & 18 deletions tests/unit_tests/test_module_attachment_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.dedoc_manager import DedocManager
from dedoc.readers import ArchiveReader
from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from tests.test_utils import get_test_config

Expand Down Expand Up @@ -126,31 +126,28 @@ def test_manager_attachments_dir(self) -> None:
self.assertIn(attachment.metadata.temporary_file_name, attachment_names)

def test_reader_attachments_dir(self) -> None:
file_name = "with_attachments_0.docx"
docx_reader = DocxReader(config=get_test_config())
file_name_reader_list = [
("with_attachments_0.docx", DocxReader(config=get_test_config())),
("with_attachments_1.docx.pdf", PdfTxtlayerReader(config=get_test_config())),
("with_attachments_1.docx.pdf", PdfTabbyReader(config=get_test_config()))
]

with tempfile.TemporaryDirectory() as tmpdir:
params = {
"with_attachments": True,
"attachments_dir": tmpdir
}
result = docx_reader.read(path=os.path.join(self.src_dir, file_name), parameters=params)
for file_name, reader in file_name_reader_list:
with tempfile.TemporaryDirectory() as tmpdir:
result = reader.read(path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir))

attachment_names = os.listdir(tmpdir)
for attachment in result.attachments:
attachment_fname = attachment.tmp_file_path.split("/")[-1]
self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
self.assertIn(attachment_fname, attachment_names)
attachment_names = os.listdir(tmpdir)
for attachment in result.attachments:
attachment_fname = attachment.tmp_file_path.split("/")[-1]
self.assertTrue(os.path.isfile(attachment.get_filename_in_path()))
self.assertIn(attachment_fname, attachment_names)

def test_attachments_extractor_attachments_dir(self) -> None:
file_name = "with_attachments_0.docx"
docx_attachment_extractor = DocxAttachmentsExtractor()

with tempfile.TemporaryDirectory() as tmpdir:
params = {
"with_attachments": True,
"attachments_dir": tmpdir
}
params = {"with_attachments": True, "attachments_dir": tmpdir}
result = docx_attachment_extractor.get_attachments(tmpdir=self.src_dir, filename=file_name, parameters=params)

attachment_names = os.listdir(tmpdir)
Expand Down
Loading