TLDR-354 images attachments extraction from PDF (#368)
* Benchmarks before changes

* Add image extraction to tabby

* Fix document partial parsing

* Use start_page, end_page in java tabby execution

* Fix txtlayer classification tests

* Fixes in partial parsing

* Fix tests
NastyBoget authored Nov 14, 2023
1 parent e6abe72 commit fa396ef
Showing 14 changed files with 380 additions and 32 deletions.
@@ -46,7 +46,7 @@ def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithM
parameters_copy["need_pdf_table_analysis"] = "false"
num_pages = get_pdf_page_count(path)
if num_pages is None or num_pages >= 50:
# TODO remove this when TLDR-404 is done
# TODO remove this when TLDR-518 is done
document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
else:
# tabby reader reads the whole document regardless "pages" parameter
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -58,6 +58,7 @@ def __init__(self, config: dict) -> None:
"""
:param config: configuration of the reader, e.g. logger for logging
"""
config["n_jobs"] = config.get("n_jobs", 1)
self.table_recognizer = TableRecognizer(config=config)
self.metadata_extractor = LineMetadataExtractor(config=config)
self.config = config
76 changes: 53 additions & 23 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -34,6 +34,7 @@
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash


@@ -79,24 +80,11 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, tables, tables_on_images = self.__extract(path=path)
warnings = []
document_metadata = None
lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=path, parameters=parameters, warnings=warnings)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=image_attachments)

first_page, last_page = get_param_page_slice(parameters)
first_page = 0 if first_page is None else first_page
last_page = math.inf if last_page is None else last_page
extracted_lines_length = len(lines)
lines = [line for line in lines if first_page <= line.metadata.page_id < last_page]
if len(lines) < extracted_lines_length:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page != math.inf:
document_metadata["last_page"] = last_page

lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=[])

attachments = []
attachments = image_attachments
if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters):
tmp_dir = os.path.dirname(path)
file_name = os.path.basename(path)
@@ -108,14 +96,34 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[Table], List[ScanTable]]:
def __extract(self, path: str, parameters: dict, warnings: list)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
document_metadata = None
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
page_count = get_pdf_page_count(path)
page_count = math.inf if page_count is None else page_count
first_page, last_page = get_param_page_slice(parameters)

all_lines = []
all_tables = []
all_tables_on_images = []
for page in document.get("pages", []):
empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page)
partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count)
if empty_page_limit or partial_page_limit:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page is not None:
document_metadata["last_page"] = last_page

if empty_page_limit:
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

# in java tabby reader page numeration starts with 1, end_page is included
# first_tabby_page = first_page + 1 if first_page is not None else 1
# last_tabby_page = None if last_page is not None and last_page > page_count else last_page
# document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518

document = self.__process_pdf(path=path)
pages = document.get("pages", [])
for page in pages[first_page:last_page]:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
@@ -125,7 +133,11 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) ->
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)

return all_lines, all_tables, all_tables_on_images
attached_images = self.__get_attached_images(page=page)
if attached_images:
all_attached_images.extend(attached_images)

return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
tables = []
@@ -168,6 +180,24 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:

return tables, tables_on_image

def __get_attached_images(self, page: dict) -> List[PdfImageAttachment]:
image_attachment_list = []
for image_dict in page["images"]:
image_location = Location(
page_number=page["number"],
bbox=BBox(x_top_left=image_dict["x_top_left"], y_top_left=image_dict["y_top_left"], width=image_dict["width"], height=image_dict["height"])
)
image_attachment = PdfImageAttachment(
original_name=image_dict["original_name"],
tmp_file_path=image_dict["tmp_file_path"],
need_content_analysis=False,
uid=f"attach_{uuid.uuid4()}",
location=image_location
)
image_attachment_list.append(image_attachment)

return image_attachment_list

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
134 changes: 134 additions & 0 deletions dedoc/scripts/benchmark_pdf_attachments.py
@@ -0,0 +1,134 @@
import json
import os
import shutil
import tempfile
import zipfile
from collections import OrderedDict
from typing import Tuple

import wget

from dedoc.attachments_extractors import AbstractAttachmentsExtractor, PDFAttachmentsExtractor
from dedoc.config import get_config
from dedoc.data_structures import AttachedFile
from dedoc.readers import BaseReader, PdfTabbyReader, PdfTxtlayerReader


def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: str) -> dict:
    os.makedirs(attachments_dir)
    result_dict = OrderedDict()

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith("pdf") or file_name == "large.pdf":
            continue

        attachment_names = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, file_name)
            shutil.copy(os.path.join(input_dir, file_name), file_path)
            document = reader.read(file_path, parameters={"with_attachments": "true"})
            os.remove(file_path)

            file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
            os.makedirs(file_attachments_dir)

            png_files, json_files = 0, 0
            for attachment in document.attachments:
                if os.path.isfile(attachment.tmp_file_path):
                    attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
                    shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
                    attachment_names.append(attachment_name)

        print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result")
        result_dict[file_name] = sorted(attachment_names)

    return result_dict


def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_dir: str, attachments_dir: str) -> dict:
    os.makedirs(attachments_dir)
    result_dict = OrderedDict()

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith("pdf"):
            continue

        attachment_names = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, file_name)
            shutil.copy(os.path.join(input_dir, file_name), file_path)
            attachments = attachments_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters={})
            os.remove(file_path)

            file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
            os.makedirs(file_attachments_dir)

            png_files, json_files = 0, 0
            for attachment in attachments:
                if os.path.isfile(attachment.tmp_file_path):
                    attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
                    shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
                    attachment_names.append(attachment_name)

        print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result")
        result_dict[file_name] = sorted(attachment_names)

    return result_dict


def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: int) -> Tuple[str, int, int]:
    attachment_name = attachment.original_name
    if attachment_name.endswith(".png"):
        png_files += 1
        attachment_name = f"{png_files}.png"
    if attachment_name.endswith(".json"):
        json_files += 1
        attachment_name = f"{json_files}.json"
    return attachment_name, png_files, json_files


if __name__ == "__main__":
    data_url = "https://at.ispras.ru/owncloud/index.php/s/EoczXGwWzai8ztN/download"
    data_dir = os.path.join(get_config()["intermediate_data_path"], "benchmark_pdf_attachments")

    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
        archive_path = os.path.join(data_dir, "with_attachments.zip")
        wget.download(data_url, archive_path)
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(data_dir)
        os.remove(archive_path)

        print(f"Benchmark data downloaded to {data_dir}")
    else:
        print(f"Use cached benchmark data from {data_dir}")

    in_dir = os.path.join(data_dir, "with_attachments")
    out_dir = os.path.join(in_dir, "extracted_attachments")

    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    benchmarks_dict = {}

    print("Get tabby attachments")
    tabby_reader = PdfTabbyReader(config={})
    tabby_out_dir = os.path.join(out_dir, "tabby")
    benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir)

    print("Get pdfminer attachments")
    pdfminer_reader = PdfTxtlayerReader(config={})
    pdfminer_out_dir = os.path.join(out_dir, "pdfminer")
    benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir)

    print("Get common attachments")
    common_out_dir = os.path.join(out_dir, "common")
    pdf_attachments_extractor = PDFAttachmentsExtractor(config={})
    benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir)

    json_out_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
    with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f:
        json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2)

    print(f"Attachments were extracted to {out_dir}")
3 changes: 3 additions & 0 deletions dedoc/utils/parameter_utils.py
@@ -117,6 +117,9 @@ def get_param_page_slice(parameters: Dict[str, Any]) -> Tuple[Optional[int], Opt
first_page = None if first_page == "" else int(first_page) - 1
last_page = None if last_page == "" else int(last_page)

first_page = 0 if first_page is None or first_page < 0 else first_page
last_page = 0 if last_page and last_page < 0 else last_page

return first_page, last_page
except Exception:
raise ValueError(f"Error input parameter 'pages'. Bad page limit {pages}")
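
The clamp added above feeds the Python slice pages[first_page:last_page] introduced in the tabby reader hunk. A minimal sketch of the intended semantics, assuming the "pages" parameter has the form "start:end" (1-based, end inclusive) as the surrounding code suggests; the helper name is illustrative, not the dedoc implementation:

from typing import Optional, Tuple

def page_slice_sketch(pages: str) -> Tuple[int, Optional[int]]:
    # assumed format: "3:7" keeps pages 3..7, ":" keeps the whole document
    first, last = pages.split(":")
    first_page = None if first == "" else int(first) - 1  # 1-based -> 0-based index
    last_page = None if last == "" else int(last)  # exclusive bound for Python slicing
    first_page = 0 if first_page is None or first_page < 0 else first_page  # clamp, as in the hunk above
    last_page = 0 if last_page and last_page < 0 else last_page
    return first_page, last_page

# page_slice_sketch("3:7") -> (2, 7), so pages[2:7] yields pages 3..7
# page_slice_sketch(":")   -> (0, None), i.e. no page limit
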
3 changes: 2 additions & 1 deletion docs/source/tutorials/add_new_doc_type.rst
@@ -175,7 +175,8 @@ You should implement the following methods:
For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
We use PyPDF2 to extract the text and tabula to extract tables. They must be added to ``requirements.txt`` of the project.
We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
They must be added to ``requirements.txt`` of the project.
We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
It must be added to the reader's constructor and used in ``read`` method.
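
For orientation, a minimal sketch of the extraction step the tutorial describes, assuming the current PyPDF2 PdfReader API and tabula.read_pdf; the helper function and its return shape are illustrative, not part of dedoc:

import tabula
from PyPDF2 import PdfReader

def extract_text_and_tables(path: str) -> tuple:
    reader = PdfReader(path)
    # one text string per page; a real reader would wrap these into LineWithMeta objects
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # tabula returns a list of pandas DataFrames, one per detected table
    tables = tabula.read_pdf(path, pages="all", multiple_tables=True)
    return page_texts, tables
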

136 changes: 136 additions & 0 deletions resources/benchmarks/benchmark_pdf_attachments.json
@@ -0,0 +1,136 @@
{
"tabby": {
"Document635.pdf": [
"1.json",
"1.png",
"2.json",
"2.png"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [
"1.png",
"2.png"
],
"with_attachments_0.docx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png"
],
"with_attachments_1.docx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_1.pptx.pdf": [
"1.png"
],
"with_attachments_2.docx.pdf": [],
"with_attachments_2.pptx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png"
],
"with_attachments_3.pdf": [
"1.png",
"10.png",
"11.png",
"12.png",
"13.png",
"14.png",
"15.png",
"16.png",
"17.png",
"18.png",
"19.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png",
"8.png",
"9.png"
]
},
"pdfminer": {
"Document635.pdf": [
"1.json",
"1.png",
"2.json",
"2.png"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [
"1.png",
"2.png"
],
"with_attachments_0.docx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png"
],
"with_attachments_1.docx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_1.pptx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_2.docx.pdf": [
"1.png",
"2.png"
],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": [
"1.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png"
]
},
"common": {
"Document635.pdf": [
"1.json",
"2.json"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [],
"large.pdf": [],
"with_attachments_0.docx.pdf": [],
"with_attachments_1.docx.pdf": [],
"with_attachments_1.pptx.pdf": [],
"with_attachments_2.docx.pdf": [],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": []
}
}
1 change: 1 addition & 0 deletions tests/api_tests/test_api_format_pdf_page_limit.py
@@ -34,6 +34,7 @@ def test_auto_text_layer(self) -> None:

def test_tabby_layer(self) -> None:
self.__check_limit("tabby", check_partially=True)
self.__check_out_of_limit("tabby")

def test_auto_tabby(self) -> None:
self.__check_limit("auto_tabby", check_partially=True)
14 changes: 8 additions & 6 deletions tests/api_tests/test_api_misc_with_attachments.py
@@ -50,21 +50,23 @@ def test_attachments_pmi_document(self) -> None:

attachments = result["attachments"]

self.assertEqual(attachments[0]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[1]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[0]["metadata"]["file_type"], "image/png")
self.assertEqual(attachments[1]["metadata"]["file_type"], "image/png")
self.assertEqual(attachments[2]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[3]["metadata"]["file_type"], "application/json")

def test_need_content_analysis(self) -> None:
file_name = "pdf_with_text_layer/Document635.pdf"
result = self._send_request(file_name, dict(with_attachments=True, need_content_analysis=False, pdf_with_text_layer="tabby"))

attachments = result["attachments"]
self.assertEqual(len(attachments[0]["content"]["structure"]["subparagraphs"]), 0)
self.assertEqual(len(attachments[1]["content"]["structure"]["subparagraphs"]), 0)
self.assertEqual(len(attachments[2]["content"]["structure"]["subparagraphs"]), 0)
self.assertEqual(len(attachments[3]["content"]["structure"]["subparagraphs"]), 0)

result = self._send_request(file_name, dict(with_attachments=True, need_content_analysis=True, pdf_with_text_layer="tabby"))
attachments = result["attachments"]
self.assertGreater(len(attachments[0]["content"]["structure"]["subparagraphs"]), 0)
self.assertGreater(len(attachments[1]["content"]["structure"]["subparagraphs"]), 0)
self.assertGreater(len(attachments[2]["content"]["structure"]["subparagraphs"]), 0)
self.assertGreater(len(attachments[3]["content"]["structure"]["subparagraphs"]), 0)

def test_get_without_attachments(self) -> None:
file_name = "with_attachments/example_with_attachments_depth_1.pdf"
42 changes: 41 additions & 1 deletion tests/api_tests/test_api_misc_with_images_refs.py
@@ -5,7 +5,7 @@

class TestApiImageRefs(AbstractTestApiDocReader):

data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx")
data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "with_attachments")

def test_docx_with_images(self) -> None:
file_name = "docx_with_images.docx"
@@ -58,6 +58,46 @@ def test_docx_with_images_from_mac(self) -> None:
image_paragraph = content["subparagraphs"][5]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.png"])

def test_pdf_pdfminer_images_refs(self) -> None:
file_name = "with_attachments_1.docx.pdf"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="true"))
structure = result["content"]["structure"]

attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
self.assertEqual(len(attachment_uids), 3)

attach_annotation = structure["subparagraphs"][0]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][3]["annotations"][-2]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][3]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

def test_pdf_tabby_images_refs(self) -> None:
file_name = "with_attachments_1.docx.pdf"
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="tabby"))
structure = result["content"]["structure"]

attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
self.assertEqual(len(attachment_uids), 3)

attach_annotation = structure["subparagraphs"][2]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][4]["annotations"][-2]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

attach_annotation = structure["subparagraphs"][4]["annotations"][-1]
self.assertEqual(attach_annotation["name"], "attachment")
self.assertIn(attach_annotation["value"], attachment_uids)

def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
text = image_paragraph["text"]
image_annotations = image_paragraph["annotations"]
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file not shown.
