TLDR-354 images attachments extraction from PDF #368

Merged: 7 commits, Nov 14, 2023
@@ -46,7 +46,7 @@ def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithM
parameters_copy["need_pdf_table_analysis"] = "false"
num_pages = get_pdf_page_count(path)
if num_pages is None or num_pages >= 50:
# TODO remove this when TLDR-404 is done
# TODO remove this when TLDR-518 is done
document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
else:
# tabby reader reads the whole document regardless "pages" parameter
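A standalone sketch of the fallback rule above (not part of the diff): because the tabby reader ignores the "pages" parameter and always parses the whole file, documents with an unknown or large page count are routed to the pdfminer-based txtlayer reader until TLDR-518 adds page slicing on the Java side. The helper name and the assertions are hypothetical; the 50-page threshold comes from the hunk above.

from typing import Optional

def choose_reader_name(num_pages: Optional[int]) -> str:
    # get_pdf_page_count may return None if the page count cannot be determined
    if num_pages is None or num_pages >= 50:
        return "pdf_txtlayer_reader"  # respects the "pages" parameter
    return "pdf_tabby_reader"         # reads the whole document

assert choose_reader_name(None) == "pdf_txtlayer_reader"
assert choose_reader_name(120) == "pdf_txtlayer_reader"
assert choose_reader_name(3) == "pdf_tabby_reader"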
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -58,6 +58,7 @@ def __init__(self, config: dict) -> None:
"""
:param config: configuration of the reader, e.g. logger for logging
"""
config["n_jobs"] = config.get("n_jobs", 1)
self.table_recognizer = TableRecognizer(config=config)
self.metadata_extractor = LineMetadataExtractor(config=config)
self.config = config
76 changes: 53 additions & 23 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -34,6 +34,7 @@
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash


@@ -79,24 +80,11 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, tables, tables_on_images = self.__extract(path=path)
warnings = []
document_metadata = None
lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=path, parameters=parameters, warnings=warnings)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=image_attachments)

first_page, last_page = get_param_page_slice(parameters)
first_page = 0 if first_page is None else first_page
last_page = math.inf if last_page is None else last_page
extracted_lines_length = len(lines)
lines = [line for line in lines if first_page <= line.metadata.page_id < last_page]
if len(lines) < extracted_lines_length:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page != math.inf:
document_metadata["last_page"] = last_page

lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=[])

attachments = []
attachments = image_attachments
if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters):
tmp_dir = os.path.dirname(path)
file_name = os.path.basename(path)
@@ -108,14 +96,34 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[Table], List[ScanTable]]:
def __extract(self, path: str, parameters: dict, warnings: list)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
document_metadata = None
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
page_count = get_pdf_page_count(path)
page_count = math.inf if page_count is None else page_count
first_page, last_page = get_param_page_slice(parameters)

all_lines = []
all_tables = []
all_tables_on_images = []
for page in document.get("pages", []):
empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page)
partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count)
if empty_page_limit or partial_page_limit:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page is not None:
document_metadata["last_page"] = last_page

if empty_page_limit:
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

# in java tabby reader page numeration starts with 1, end_page is included
# first_tabby_page = first_page + 1 if first_page is not None else 1
# last_tabby_page = None if last_page is not None and last_page > page_count else last_page
# document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518

document = self.__process_pdf(path=path)
pages = document.get("pages", [])
for page in pages[first_page:last_page]:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
@@ -125,7 +133,11 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) ->
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)

return all_lines, all_tables, all_tables_on_images
attached_images = self.__get_attached_images(page=page)
if attached_images:
all_attached_images.extend(attached_images)

return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
tables = []
@@ -168,6 +180,24 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:

return tables, tables_on_image

def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]:
image_attachment_list = []
for image_dict in page["images"]:
image_location = Location(
page_number=page["number"],
bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"])
)
image_attachment = PdfImageAttachment(
original_name=image_dict["original_name"],
tmp_file_path=image_dict["tmp_file_path"],
need_content_analysis=False,
uid=f"attach_{uuid.uuid4()}",
location=image_location
)
image_attachment_list.append(image_attachment)

return image_attachment_list

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
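A minimal, self-contained sketch of the page-limit guards added to __extract above (not part of the diff). The sample values and the helper name are assumptions; the real method additionally collects lines, tables, tables on images and image attachments per page before returning.

from typing import Optional, Tuple

def page_limit_flags(first_page: Optional[int], last_page: Optional[int], page_count: float) -> Tuple[bool, bool]:
    # mirrors empty_page_limit / partial_page_limit from the diff;
    # first_page is assumed non-None thanks to the clamping in parameter_utils
    empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page)
    partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count)
    return empty_page_limit, partial_page_limit

# a 10-page file with first_page=2, last_page=5 (0-based start, end used as a slice bound)
assert page_limit_flags(2, 5, 10) == (False, True)    # "The document is partially parsed"
# a slice that starts beyond the document yields no pages at all
assert page_limit_flags(10, None, 10) == (True, True)
# no page limits: the whole document is parsed and no warning is added
assert page_limit_flags(0, None, 10) == (False, False)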
134 changes: 134 additions & 0 deletions dedoc/scripts/benchmark_pdf_attachments.py
@@ -0,0 +1,134 @@
import json
import os
import shutil
import tempfile
import zipfile
from collections import OrderedDict
from typing import Tuple

import wget

from dedoc.attachments_extractors import AbstractAttachmentsExtractor, PDFAttachmentsExtractor
from dedoc.config import get_config
from dedoc.data_structures import AttachedFile
from dedoc.readers import BaseReader, PdfTabbyReader, PdfTxtlayerReader


def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: str) -> dict:
os.makedirs(attachments_dir)
result_dict = OrderedDict()

for file_name in sorted(os.listdir(input_dir)):
if not file_name.endswith("pdf") or file_name == "large.pdf":
continue

attachment_names = []
with tempfile.TemporaryDirectory() as tmp_dir:
file_path = os.path.join(tmp_dir, file_name)
shutil.copy(os.path.join(input_dir, file_name), file_path)
document = reader.read(file_path, parameters={"with_attachments": "true"})
os.remove(file_path)

file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
os.makedirs(file_attachments_dir)

png_files, json_files = 0, 0
for attachment in document.attachments:
if os.path.isfile(attachment.tmp_file_path):
attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
attachment_names.append(attachment_name)

print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result")
result_dict[file_name] = sorted(attachment_names)

return result_dict


def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_dir: str, attachments_dir: str) -> dict:
os.makedirs(attachments_dir)
result_dict = OrderedDict()

for file_name in sorted(os.listdir(input_dir)):
if not file_name.endswith("pdf"):
continue

attachment_names = []
with tempfile.TemporaryDirectory() as tmp_dir:
file_path = os.path.join(tmp_dir, file_name)
shutil.copy(os.path.join(input_dir, file_name), file_path)
attachments = attachments_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters={})
os.remove(file_path)

file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
os.makedirs(file_attachments_dir)

png_files, json_files = 0, 0
for attachment in attachments:
if os.path.isfile(attachment.tmp_file_path):
attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
attachment_names.append(attachment_name)

print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result")
result_dict[file_name] = sorted(attachment_names)

return result_dict


def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: int) -> Tuple[str, int, int]:
attachment_name = attachment.original_name
if attachment_name.endswith(".png"):
png_files += 1
attachment_name = f"{png_files}.png"
if attachment_name.endswith(".json"):
json_files += 1
attachment_name = f"{json_files}.json"
return attachment_name, png_files, json_files


if __name__ == "__main__":
data_url = "https://at.ispras.ru/owncloud/index.php/s/EoczXGwWzai8ztN/download"
data_dir = os.path.join(get_config()["intermediate_data_path"], "benchmark_pdf_attachments")

if not os.path.isdir(data_dir):
os.makedirs(data_dir)
archive_path = os.path.join(data_dir, "with_attachments.zip")
wget.download(data_url, archive_path)
with zipfile.ZipFile(archive_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(archive_path)

print(f"Benchmark data downloaded to {data_dir}")
else:
print(f"Use cached benchmark data from {data_dir}")

in_dir = os.path.join(data_dir, "with_attachments")
out_dir = os.path.join(in_dir, "extracted_attachments")

if os.path.exists(out_dir):
shutil.rmtree(out_dir)
os.makedirs(out_dir)

benchmarks_dict = {}

print("Get tabby attachments")
tabby_reader = PdfTabbyReader(config={})
tabby_out_dir = os.path.join(out_dir, "tabby")
benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir)

print("Get pdfminer attachments")
pdfminer_reader = PdfTxtlayerReader(config={})
pdfminer_out_dir = os.path.join(out_dir, "pdfminer")
benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir)

print("Get common attachments")
common_out_dir = os.path.join(out_dir, "common")
pdf_attachments_extractor = PDFAttachmentsExtractor(config={})
benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir)

json_out_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f:
json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2)

print(f"Attachments were extracted to {out_dir}")
3 changes: 3 additions & 0 deletions dedoc/utils/parameter_utils.py
@@ -117,6 +117,9 @@ def get_param_page_slice(parameters: Dict[str, Any]) -> Tuple[Optional[int], Opt
first_page = None if first_page == "" else int(first_page) - 1
last_page = None if last_page == "" else int(last_page)

first_page = 0 if first_page is None or first_page < 0 else first_page
last_page = 0 if last_page and last_page < 0 else last_page

return first_page, last_page
except Exception:
raise ValueError(f"Error input parameter 'pages'. Bad page limit {pages}")
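A small illustration of the clamping added above, separate from the diff and with hypothetical sample values; it assumes first_page and last_page have already been parsed to Optional[int] exactly as in the lines shown.

from typing import Optional, Tuple

def clamp_page_slice(first_page: Optional[int], last_page: Optional[int]) -> Tuple[int, Optional[int]]:
    # a missing or negative start becomes 0; a negative end becomes 0 (an empty slice);
    # last_page=None still means "up to the last page"
    first_page = 0 if first_page is None or first_page < 0 else first_page
    last_page = 0 if last_page and last_page < 0 else last_page
    return first_page, last_page

assert clamp_page_slice(None, None) == (0, None)  # no "pages" limits given
assert clamp_page_slice(-3, 7) == (0, 7)          # bad start is clamped to the first page
assert clamp_page_slice(2, -1) == (2, 0)          # bad end produces an empty slice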
3 changes: 2 additions & 1 deletion docs/source/tutorials/add_new_doc_type.rst
@@ -175,7 +175,8 @@ You should implement the following methods:
For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
We use PyPDF2 to extract the text and tabula to extract tables. They must be added to ``requirements.txt`` of the project.
We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
They must be added to ``requirements.txt`` of the project.
We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
It must be added to the reader's constructor and used in ``read`` method.
