TLDR-354 images attachments extraction from PDF (#368)
* Benchmarks before changes

* Add image extraction to tabby

* Fix document partial parsing

* Use start_page, end_page in java tabby execution

* Fix txtlayer classification tests

* Fixes in partial parsing

* Fix tests
NastyBoget authored Nov 14, 2023
1 parent e6abe72 commit fa396ef
Showing 14 changed files with 380 additions and 32 deletions.
@@ -46,7 +46,7 @@ def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithM
parameters_copy["need_pdf_table_analysis"] = "false"
num_pages = get_pdf_page_count(path)
if num_pages is None or num_pages >= 50:
# TODO remove this when TLDR-404 is done
# TODO remove this when TLDR-518 is done
document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
else:
# tabby reader reads the whole document regardless "pages" parameter
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -58,6 +58,7 @@ def __init__(self, config: dict) -> None:
"""
:param config: configuration of the reader, e.g. logger for logging
"""
config["n_jobs"] = config.get("n_jobs", 1)
self.table_recognizer = TableRecognizer(config=config)
self.metadata_extractor = LineMetadataExtractor(config=config)
self.config = config
76 changes: 53 additions & 23 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -34,6 +34,7 @@
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash


@@ -79,24 +80,11 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, tables, tables_on_images = self.__extract(path=path)
warnings = []
document_metadata = None
lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=path, parameters=parameters, warnings=warnings)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=image_attachments)

first_page, last_page = get_param_page_slice(parameters)
first_page = 0 if first_page is None else first_page
last_page = math.inf if last_page is None else last_page
extracted_lines_length = len(lines)
lines = [line for line in lines if first_page <= line.metadata.page_id < last_page]
if len(lines) < extracted_lines_length:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page != math.inf:
document_metadata["last_page"] = last_page

lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=[])

attachments = []
attachments = image_attachments
if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters):
tmp_dir = os.path.dirname(path)
file_name = os.path.basename(path)
@@ -108,14 +96,34 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[Table], List[ScanTable]]:
def __extract(self, path: str, parameters: dict, warnings: list)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
document_metadata = None
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
page_count = get_pdf_page_count(path)
page_count = math.inf if page_count is None else page_count
first_page, last_page = get_param_page_slice(parameters)

all_lines = []
all_tables = []
all_tables_on_images = []
for page in document.get("pages", []):
empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page)
partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count)
if empty_page_limit or partial_page_limit:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page is not None:
document_metadata["last_page"] = last_page

if empty_page_limit:
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

# in java tabby reader page numeration starts with 1, end_page is included
# first_tabby_page = first_page + 1 if first_page is not None else 1
# last_tabby_page = None if last_page is not None and last_page > page_count else last_page
# document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518

document = self.__process_pdf(path=path)
pages = document.get("pages", [])
for page in pages[first_page:last_page]:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
@@ -125,7 +133,11 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) ->
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)

return all_lines, all_tables, all_tables_on_images
attached_images = self.__get_attached_images(page=page)
if attached_images:
all_attached_images.extend(attached_images)

return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
tables = []
@@ -168,6 +180,24 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:

return tables, tables_on_image

def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]:
image_attachment_list = []
for image_dict in page["images"]:
image_location = Location(
page_number=page["number"],
bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"])
)
image_attachment = PdfImageAttachment(
original_name=image_dict["original_name"],
tmp_file_path=image_dict["tmp_file_path"],
need_content_analysis=False,
uid=f"attach_{uuid.uuid4()}",
location=image_location
)
image_attachment_list.append(image_attachment)

return image_attachment_list

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
134 changes: 134 additions & 0 deletions dedoc/scripts/benchmark_pdf_attachments.py
@@ -0,0 +1,134 @@
import json
import os
import shutil
import tempfile
import zipfile
from collections import OrderedDict
from typing import Tuple

import wget

from dedoc.attachments_extractors import AbstractAttachmentsExtractor, PDFAttachmentsExtractor
from dedoc.config import get_config
from dedoc.data_structures import AttachedFile
from dedoc.readers import BaseReader, PdfTabbyReader, PdfTxtlayerReader


def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: str) -> dict:
    os.makedirs(attachments_dir)
    result_dict = OrderedDict()

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith("pdf") or file_name == "large.pdf":
            continue

        attachment_names = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, file_name)
            shutil.copy(os.path.join(input_dir, file_name), file_path)
            document = reader.read(file_path, parameters={"with_attachments": "true"})
            os.remove(file_path)

            file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
            os.makedirs(file_attachments_dir)

            png_files, json_files = 0, 0
            for attachment in document.attachments:
                if os.path.isfile(attachment.tmp_file_path):
                    attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
                    shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
                    attachment_names.append(attachment_name)

        print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result")
        result_dict[file_name] = sorted(attachment_names)

    return result_dict


def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_dir: str, attachments_dir: str) -> dict:
    os.makedirs(attachments_dir)
    result_dict = OrderedDict()

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith("pdf"):
            continue

        attachment_names = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, file_name)
            shutil.copy(os.path.join(input_dir, file_name), file_path)
            attachments = attachments_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters={})
            os.remove(file_path)

            file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
            os.makedirs(file_attachments_dir)

            png_files, json_files = 0, 0
            for attachment in attachments:
                if os.path.isfile(attachment.tmp_file_path):
                    attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
                    shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
                    attachment_names.append(attachment_name)

        print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result")
        result_dict[file_name] = sorted(attachment_names)

    return result_dict


def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: int) -> Tuple[str, int, int]:
    attachment_name = attachment.original_name
    if attachment_name.endswith(".png"):
        png_files += 1
        attachment_name = f"{png_files}.png"
    if attachment_name.endswith(".json"):
        json_files += 1
        attachment_name = f"{json_files}.json"
    return attachment_name, png_files, json_files


if __name__ == "__main__":
    data_url = "https://at.ispras.ru/owncloud/index.php/s/EoczXGwWzai8ztN/download"
    data_dir = os.path.join(get_config()["intermediate_data_path"], "benchmark_pdf_attachments")

    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
        archive_path = os.path.join(data_dir, "with_attachments.zip")
        wget.download(data_url, archive_path)
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(data_dir)
        os.remove(archive_path)

        print(f"Benchmark data downloaded to {data_dir}")
    else:
        print(f"Use cached benchmark data from {data_dir}")

    in_dir = os.path.join(data_dir, "with_attachments")
    out_dir = os.path.join(in_dir, "extracted_attachments")

    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    benchmarks_dict = {}

    print("Get tabby attachments")
    tabby_reader = PdfTabbyReader(config={})
    tabby_out_dir = os.path.join(out_dir, "tabby")
    benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir)

    print("Get pdfminer attachments")
    pdfminer_reader = PdfTxtlayerReader(config={})
    pdfminer_out_dir = os.path.join(out_dir, "pdfminer")
    benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir)

    print("Get common attachments")
    common_out_dir = os.path.join(out_dir, "common")
    pdf_attachments_extractor = PDFAttachmentsExtractor(config={})
    benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir)

    json_out_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
    with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f:
        json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2)

    print(f"Attachments were extracted to {out_dir}")
3 changes: 3 additions & 0 deletions dedoc/utils/parameter_utils.py
@@ -117,6 +117,9 @@ def get_param_page_slice(parameters: Dict[str, Any]) -> Tuple[Optional[int], Opt
first_page = None if first_page == "" else int(first_page) - 1
last_page = None if last_page == "" else int(last_page)

first_page = 0 if first_page is None or first_page < 0 else first_page
last_page = 0 if last_page and last_page < 0 else last_page

return first_page, last_page
except Exception:
raise ValueError(f"Error input parameter 'pages'. Bad page limit {pages}")
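
The clamp added above feeds the Python slice pages[first_page:last_page] introduced in the tabby reader hunk. A minimal sketch of the intended semantics, assuming the "pages" parameter has the form "start:end" (1-based, end inclusive) as the surrounding code suggests; the helper name is illustrative, not the dedoc implementation:

from typing import Optional, Tuple

def page_slice_sketch(pages: str) -> Tuple[int, Optional[int]]:
    # assumed format: "3:7" keeps pages 3..7, ":" keeps the whole document
    first, last = pages.split(":")
    first_page = None if first == "" else int(first) - 1  # 1-based -> 0-based index
    last_page = None if last == "" else int(last)  # exclusive bound for Python slicing
    first_page = 0 if first_page is None or first_page < 0 else first_page  # clamp, as in the hunk above
    last_page = 0 if last_page and last_page < 0 else last_page
    return first_page, last_page

# page_slice_sketch("3:7") -> (2, 7), so pages[2:7] yields pages 3..7
# page_slice_sketch(":")   -> (0, None), i.e. no page limit
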
3 changes: 2 additions & 1 deletion docs/source/tutorials/add_new_doc_type.rst
@@ -175,7 +175,8 @@ You should implement the following methods:
For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
We use PyPDF2 to extract the text and tabula to extract tables. They must be added to ``requirements.txt`` of the project.
We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
They must be added to ``requirements.txt`` of the project.
We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
It must be added to the reader's constructor and used in ``read`` method.
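
For orientation, a minimal sketch of the extraction step the tutorial describes, assuming the current PyPDF2 PdfReader API and tabula.read_pdf; the helper function and its return shape are illustrative, not part of dedoc:

import tabula
from PyPDF2 import PdfReader

def extract_text_and_tables(path: str) -> tuple:
    reader = PdfReader(path)
    # one text string per page; a real reader would wrap these into LineWithMeta objects
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # tabula returns a list of pandas DataFrames, one per detected table
    tables = tabula.read_pdf(path, pages="all", multiple_tables=True)
    return page_texts, tables
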

136 changes: 136 additions & 0 deletions resources/benchmarks/benchmark_pdf_attachments.json
@@ -0,0 +1,136 @@
{
"tabby": {
"Document635.pdf": [
"1.json",
"1.png",
"2.json",
"2.png"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [
"1.png",
"2.png"
],
"with_attachments_0.docx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png"
],
"with_attachments_1.docx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_1.pptx.pdf": [
"1.png"
],
"with_attachments_2.docx.pdf": [],
"with_attachments_2.pptx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png"
],
"with_attachments_3.pdf": [
"1.png",
"10.png",
"11.png",
"12.png",
"13.png",
"14.png",
"15.png",
"16.png",
"17.png",
"18.png",
"19.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png",
"8.png",
"9.png"
]
},
"pdfminer": {
"Document635.pdf": [
"1.json",
"1.png",
"2.json",
"2.png"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [
"1.png",
"2.png"
],
"with_attachments_0.docx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png"
],
"with_attachments_1.docx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_1.pptx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_2.docx.pdf": [
"1.png",
"2.png"
],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": [
"1.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png"
]
},
"common": {
"Document635.pdf": [
"1.json",
"2.json"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [],
"large.pdf": [],
"with_attachments_0.docx.pdf": [],
"with_attachments_1.docx.pdf": [],
"with_attachments_1.pptx.pdf": [],
"with_attachments_2.docx.pdf": [],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": []
}
}
1 change: 1 addition & 0 deletions tests/api_tests/test_api_format_pdf_page_limit.py
@@ -34,6 +34,7 @@ def test_auto_text_layer(self) -> None:

def test_tabby_layer(self) -> None:
self.__check_limit("tabby", check_partially=True)
self.__check_out_of_limit("tabby")

def test_auto_tabby(self) -> None:
self.__check_limit("auto_tabby", check_partially=True)
14 changes: 8 additions & 6 deletions tests/api_tests/test_api_misc_with_attachments.py
@@ -50,21 +50,23 @@ def test_attachments_pmi_document(self) -> None:

attachments = result["attachments"]

self.assertEqual(attachments[0]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[1]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[0]["metadata"]["file_type"], "image/png")
self.assertEqual(attachments[1]["metadata"]["file_type"], "image/png")
self.assertEqual(attachments[2]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[3]["metadata"]["file_type"], "application/json")

def test_need_content_analysis(self) -> None:
file_name = "pdf_with_text_layer/Document635.pdf"
result = self._send_request(file_name, dict(with_attachments=True, need_content_analysis=False, pdf_with_text_layer="tabby"))

attachments = result["attachments"]
self.assertEqual(len(attachments[0]["content"]["structure"]["subparagraphs"]), 0)
self.assertEqual(len(attachments[1]["content"]["structure"]["subparagraphs"]), 0)
self.assertEqual(len(attachments[2]["content"]["structure"]["subparagraphs"]), 0)
self.assertEqual(len(attachments[3]["content"]["structure"]["subparagraphs"]), 0)

result = self._send_request(file_name, dict(with_attachments=True, need_content_analysis=True, pdf_with_text_layer="tabby"))
attachments = result["attachments"]
self.assertGreater(len(attachments[0]["content"]["structure"]["subparagraphs"]), 0)
self.assertGreater(len(attachments[1]["content"]["structure"]["subparagraphs"]), 0)
self.assertGreater(len(attachments[2]["content"]["structure"]["subparagraphs"]), 0)
self.assertGreater(len(attachments[3]["content"]["structure"]["subparagraphs"]), 0)

def test_get_without_attachments(self) -> None:
file_name = "with_attachments/example_with_attachments_depth_1.pdf"
42 changes: 41 additions & 1 deletion tests/api_tests/test_api_misc_with_images_refs.py
@@ -5,7 +5,7 @@

class TestApiImageRefs(AbstractTestApiDocReader):

data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx")
data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "with_attachments")

def test_docx_with_images(self) -> None:
file_name = "docx_with_images.docx"
@@ -58,6 +58,46 @@ def test_docx_with_images_from_mac(self) -> None:
image_paragraph = content["subparagraphs"][5]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.png"])

def test_pdf_pdfminer_images_refs(self) -> None:
file_name = "with_attachments_1.docx.pdf"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="true"))
structure = result["content"]["structure"]

attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
self.assertEqual(len(attachment_uids), 3)

attach_annotation = structure["subparagraphs"][0]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][3]["annotations"][-2]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][3]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

def test_pdf_tabby_images_refs(self) -> None:
file_name = "with_attachments_1.docx.pdf"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="tabby"))
structure = result["content"]["structure"]

attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
self.assertEqual(len(attachment_uids), 3)

attach_annotation = structure["subparagraphs"][2]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][4]["annotations"][-2]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][4]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
text = image_paragraph["text"]
image_annotations = image_paragraph["annotations"]
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file not shown.
