From 188865952e585d7fed195393b3013ec9b296b13f Mon Sep 17 00:00:00 2001 From: Bogatenkova Anastasiya Date: Mon, 25 Dec 2023 16:34:32 +0300 Subject: [PATCH] update master (#391) * TLDR 531 pdf_txtlayer_reader table fix (#380) * TLDR-538 tesseract trustai (#377) * fixed training script (#383) * TLDR-521 Fix splittext for file names with several dots (#385) * TLDR-527 refactor methods and parameters for all main classes (#387) * Add attach and table annotations to PPTX (#389) * TLDR-544 docx bugs (#382) * TLDR-516 GPU in docker (#384) * new version 2.0 (#390) --------- Co-authored-by: raxtemur <31087838+raxtemur@users.noreply.github.com> Co-authored-by: Oksana Belyaeva Co-authored-by: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com> Co-authored-by: Alexander Golodkov Co-authored-by: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com> --- VERSION | 2 +- .../abstract_attachment_extractor.py | 28 ++- .../abstract_office_attachments_extractor.py | 5 +- .../docx_attachments_extractor.py | 15 +- .../excel_attachments_extractor.py | 17 +- .../json_attachment_extractor.py | 15 +- .../pdf_attachments_extractor.py | 40 +++-- .../pptx_attachments_extractor.py | 17 +- dedoc/attachments_extractors/utils.py | 17 -- .../attachments_handler.py | 16 +- dedoc/config.py | 16 +- dedoc/converters/__init__.py | 4 +- .../concrete_converters/abstract_converter.py | 54 +++--- .../concrete_converters/binary_converter.py | 16 +- .../concrete_converters/docx_converter.py | 24 ++- .../concrete_converters/excel_converter.py | 24 ++- .../concrete_converters/pdf_converter.py | 23 ++- .../concrete_converters/png_converter.py | 28 +-- .../concrete_converters/pptx_converter.py | 24 ++- .../concrete_converters/txt_converter.py | 22 ++- ..._converter.py => converter_composition.py} | 30 ++-- .../concrete_annotations/attach_annotation.py | 2 +- .../concrete_annotations/bbox_annotation.py | 2 +- .../linked_text_annotation.py | 2 +- 
.../concrete_annotations/table_annotation.py | 2 +- dedoc/dedoc_manager.py | 34 ++-- dedoc/extensions.py | 14 +- dedoc/manager_config.py | 107 +++++------ .../abstract_metadata_extractor.py | 34 ++-- .../base_metadata_extractor.py | 44 +++-- .../docx_metadata_extractor.py | 34 ++-- .../image_metadata_extractor.py | 44 +++-- .../note_metadata_extarctor.py | 37 ++-- .../pdf_metadata_extractor.py | 45 +++-- .../metadata_extractor_composition.py | 34 ++-- .../readers/archive_reader/archive_reader.py | 20 +-- dedoc/readers/base_reader.py | 38 ++-- dedoc/readers/csv_reader/csv_reader.py | 18 +- dedoc/readers/docx_reader/README.md | 3 +- .../data_structures/docx_document.py | 10 +- dedoc/readers/docx_reader/docx_reader.py | 24 +-- .../docx_reader/properties_extractor.py | 20 ++- dedoc/readers/email_reader/email_reader.py | 33 ++-- dedoc/readers/excel_reader/excel_reader.py | 17 +- .../html2pdf_reader/html2pdf_reader.py | 15 +- dedoc/readers/html_reader/html_reader.py | 20 +-- dedoc/readers/json_reader/json_reader.py | 18 +- dedoc/readers/mhtml_reader/mhtml_reader.py | 25 ++- dedoc/readers/note_reader/note_reader.py | 22 ++- .../data_classes/tables/table_tree.py | 6 +- .../pdf_auto_reader/pdf_auto_reader.py | 48 +++-- dedoc/readers/pdf_reader/pdf_base_reader.py | 40 ++--- .../columns_orientation_classifier.py | 13 +- .../ocr/ocr_cell_extractor.py | 14 +- .../pdf_image_reader/pdf_image_reader.py | 37 ++-- .../onepage_table_extractor.py | 5 +- .../table_recognizer/table_recognizer.py | 6 +- .../table_utils/img_processing.py | 44 ++--- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 34 ++-- .../pdf_txtlayer_reader.py | 21 +-- .../pdfminer_reader/pdfminer_extractor.py | 7 +- dedoc/readers/pptx_reader/pptx_reader.py | 75 ++++++-- dedoc/readers/reader_composition.py | 29 ++- dedoc/readers/txt_reader/raw_text_reader.py | 21 +-- dedoc/scripts/accsum | Bin 0 -> 39280 bytes dedoc/scripts/benchmark_pdf_attachments.py | 2 +- dedoc/scripts/calc_tesseract_benchmarks.py | 142 
++++++++++++--- dedoc/scripts/create_txtlayer_dataset.py | 8 +- .../train/train_acc_orientation_classifier.py | 11 +- .../abstract_structure_constructor.py | 6 +- .../linear_constructor.py | 2 +- .../tree_constructor.py | 2 +- .../structure_constructor_composition.py | 8 +- .../abstract_structure_extractor.py | 13 +- .../abstract_law_structure_extractor.py | 11 +- .../classifying_law_structure_extractor.py | 10 +- .../default_structure_extractor.py | 2 +- .../diploma_structure_extractor.py | 9 +- .../foiv_law_structure_extractor.py | 4 +- .../law_structure_excractor.py | 4 +- .../tz_structure_extractor.py | 10 +- .../structure_extractor_composition.py | 10 +- dedoc/train_dataset/train_dataset_utils.py | 7 +- dedoc/train_dataset/trainer/errors_saver.py | 2 +- dedoc/utils/annotation_merger.py | 30 +++- dedoc/utils/parameter_utils.py | 45 ++--- dedoc/utils/utils.py | 32 ++-- docker-compose.yml | 5 - docker_gpu/Dockerfile | 24 +++ docker_gpu/README.md | 17 ++ docker_gpu/docker-compose.yml | 33 ++++ .../dedoc_add_new_doc_type_tutorial.py | 32 ++-- .../dedoc_creating_dedoc_document.py | 2 +- .../code_examples/dedoc_usage_tutorial.py | 41 ++--- .../_static/code_examples/djvu_converter.py | 22 ++- .../code_examples/pdf_attachment_extractor.py | 15 +- .../_static/code_examples/pdf_reader.py | 21 ++- docs/source/changelog.rst | 12 ++ docs/source/getting_started/usage.rst | 62 +++---- docs/source/index.rst | 6 +- .../source/modules/attachments_extractors.rst | 6 +- docs/source/modules/converters.rst | 2 +- docs/source/modules/metadata_extractors.rst | 6 +- docs/source/modules/readers.rst | 16 -- docs/source/modules/structure_extractors.rst | 7 +- .../parameters/attachments_handling.rst | 59 ++++++ .../parameters/other_formats_handling.rst | 43 +++++ docs/source/parameters/parameters.rst | 21 +++ docs/source/parameters/pdf_handling.rst | 163 +++++++++++++++++ docs/source/parameters/structure_type.rst | 52 ++++++ docs/source/tutorials/add_new_doc_type.rst | 61 ++++--- 
examples/create_structured_document.py | 2 +- examples/create_unstructured_document.py | 2 +- examples/example_doc_parser.py | 2 +- examples/example_img_parser.py | 2 +- examples/example_pdf_parser.py | 4 +- resources/benchmarks/tesseract_benchmark.txt | 170 +++++++++++++++++- tests/api_tests/test_api_format_json.py | 2 +- tests/api_tests/test_api_format_pptx.py | 21 ++- .../test_api_misc_with_images_refs.py | 17 ++ tests/data/docx/size1.docx | Bin 0 -> 37083 bytes tests/data/docx/size2.docx | Bin 0 -> 9952 bytes tests/unit_tests/abstract_converter_test.py | 5 +- .../test_doctype_law_dynamic_classifier.py | 2 +- .../unit_tests/test_doctype_law_txt_reader.py | 7 +- tests/unit_tests/test_format_docx_reader.py | 25 ++- tests/unit_tests/test_format_pdf_reader.py | 23 +-- tests/unit_tests/test_format_txt_reader.py | 4 +- tests/unit_tests/test_misc_annotations.py | 28 +++ tests/unit_tests/test_misc_tasker.py | 4 +- .../test_misc_toc_feature_extractor.py | 2 +- .../test_module_attachment_extractor.py | 26 ++- tests/unit_tests/test_module_utils.py | 12 ++ 133 files changed, 1963 insertions(+), 1050 deletions(-) delete mode 100644 dedoc/attachments_extractors/utils.py rename dedoc/converters/{file_converter.py => converter_composition.py} (51%) create mode 100755 dedoc/scripts/accsum create mode 100644 docker_gpu/Dockerfile create mode 100644 docker_gpu/README.md create mode 100644 docker_gpu/docker-compose.yml create mode 100644 docs/source/parameters/attachments_handling.rst create mode 100644 docs/source/parameters/other_formats_handling.rst create mode 100644 docs/source/parameters/parameters.rst create mode 100644 docs/source/parameters/pdf_handling.rst create mode 100644 docs/source/parameters/structure_type.rst create mode 100644 tests/data/docx/size1.docx create mode 100644 tests/data/docx/size2.docx diff --git a/VERSION b/VERSION index 8cfbc905..415b19fc 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.1 \ No newline at end of file +2.0 \ No newline at end of 
file diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py index 32200e94..d62a192c 100644 --- a/dedoc/attachments_extractors/abstract_attachment_extractor.py +++ b/dedoc/attachments_extractors/abstract_attachment_extractor.py @@ -1,3 +1,4 @@ +import logging import os import uuid from abc import ABC, abstractmethod @@ -11,29 +12,40 @@ class AbstractAttachmentsExtractor(ABC): """ This class is responsible for extracting files attached to the documents of different formats. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the attachments extractor, e.g. logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ - Check if this attachments extractor can get attachments of the file with the given extension. + Check if this attachments extractor can get attachments of the file. + You should provide at least one of the following parameters: file_path, extension, mime. 
- :param extension: file extension, for example .doc or .pdf + :param file_path: the path of the file to extract attachments from + :param extension: file extension with a dot, for example .doc or .pdf :param mime: MIME type of file - :param parameters: any additional parameters for given document + :param parameters: any additional parameters for the given document :return: the indicator of possibility to get attachments of this file """ pass @abstractmethod - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Extract attachments from the given file. This method can only be called on appropriate files, ensure that \ :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` is True for the given file. - :param tmpdir: directory where file is located and where the attached files will be saved - :param filename: name of the file to extract attachments (not absolute path) - :param parameters: dict with different parameters for extracting + :param file_path: path of the file to extract attachments from + :param parameters: dict with different parameters for extracting, see :ref:`attachments_handling_parameters` for more details :return: list of file's attachments """ pass diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py index f8a7db1e..40fc0c62 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py @@ -1,7 +1,7 @@ import os import zipfile from abc import ABC -from typing import List, Tuple +from typing import List, Optional, Tuple import olefile from charset_normalizer import 
from_bytes @@ -14,6 +14,9 @@ class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC): """ Extract attachments from files of Microsoft Office format like docx, pptx, xlsx. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: """ Parse the binary content of olefile. diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index d39f288e..1c307409 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -11,25 +11,36 @@ from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes +from dedoc.utils.utils import get_mime_extension class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): """ Extract attachments from docx files. 
""" - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .docx extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given docx document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) result = [] try: with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py index bbcf1953..cf5cfefa 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py @@ -1,28 +1,39 @@ +import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.utils import splitext_ +from dedoc.utils.utils import get_mime_extension, splitext_ class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): """ Extracts attachments from xlsx files. 
""" - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .xlsx extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given xlsx document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) attachments = [] name, ext = splitext_(filename) if ext.lower() != ".xlsx": diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py index 25a204dd..39e11c69 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py @@ -4,19 +4,28 @@ from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile +from dedoc.utils.utils import get_mime_extension class JsonAttachmentsExtractor(AbstractAttachmentsExtractor): """ Extract attachments from json files. """ - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .json extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(".json") - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given json document. Attached files are html files if the option `html_fields` is given in the `parameters`. 
@@ -33,6 +42,8 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) attachments = [] with open(os.path.join(tmpdir, filename)) as f: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index 28b5f55f..0ae13fb4 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -1,4 +1,4 @@ -import logging +import json import os import uuid from typing import List, Optional, Tuple @@ -8,36 +8,39 @@ from PyPDF2.utils import PdfReadError from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor -from dedoc.attachments_extractors.utils import create_note from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.utils import convert_datetime +from dedoc.utils.utils import convert_datetime, get_mime_extension, get_unique_name class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): """ Extract attachments from pdf files. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the extractor, e.g. 
logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .pdf extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given pdf document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) + with open(os.path.join(tmpdir, filename), "rb") as handler: try: reader = PyPDF2.PdfFileReader(handler) @@ -74,7 +77,7 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: user = note.get("/T") data = note.get("/Contents", "") - name, content = create_note(content=data, modified_time=modified_time, created_time=created_time, author=user) + name, content = self.__create_note(content=data, modified_time=modified_time, created_time=created_time, author=user) attachments.append((name, bytes(content))) return attachments @@ -108,3 +111,16 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str attachments.append((name, data)) return attachments + + def __create_note(self, content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: + filename = get_unique_name("note.json") + note_dict = { + "content": content, + "modified_time": modified_time, + "created_time": created_time, + "size": size if size else len(content), + "author": author + } + encode_data = json.dumps(note_dict).encode("utf-8") + + return filename, encode_data diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py index f463b4aa..34acdef4 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py @@ -1,28 +1,39 @@ +import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from 
dedoc.utils.utils import splitext_ +from dedoc.utils.utils import get_mime_extension, splitext_ class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): """ Extract attachments from pptx files. """ - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .pptx extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given pptx document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) result = [] name, ext = splitext_(filename) diff --git a/dedoc/attachments_extractors/utils.py b/dedoc/attachments_extractors/utils.py deleted file mode 100644 index 7c99e9cf..00000000 --- a/dedoc/attachments_extractors/utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import json - -from dedoc.utils.utils import get_unique_name - - -def create_note(content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: - filename = get_unique_name("note.json") - note_dict = { - "content": content, - "modified_time": modified_time, - "created_time": created_time, - "size": size if size else len(content), - "author": author - } - encode_data = json.dumps(note_dict).encode("utf-8") - - return filename, encode_data diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 5fda5a91..1017ad45 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -2,7 +2,7 @@ import logging import os import time -from typing import List +from typing import List, Optional from dedoc.attachments_extractors import AbstractAttachmentsExtractor from dedoc.common.exceptions.dedoc_error import DedocError @@ -22,11 +22,11 @@ class AttachmentsHandler: the parsing recursion may be set via `recursion_deep_attachments` parameter. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: configuration of the handler, e.g. 
logger for logging """ - self.config = config + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa @@ -77,10 +77,10 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct return parsed_attachment_files def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa - attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path()) - metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir, - filename=attachment_name, converted_filename=attachment_name, - original_filename=attachment.get_original_filename(), - parameters=parameters) + metadata = document_parser.document_metadata_extractor.extract( + file_path=attachment.get_filename_in_path(), + original_filename=attachment.get_original_filename(), + parameters=parameters + ) metadata = DocumentMetadata(**metadata) return ParsedDocument(content=get_empty_content(), metadata=metadata) diff --git a/dedoc/config.py b/dedoc/config.py index 1e7cacc0..10711e25 100644 --- a/dedoc/config.py +++ b/dedoc/config.py @@ -22,7 +22,7 @@ # number of parallel jobs in some tasks as OCR n_jobs=1, - # --------------------------------------------GPU SETTINGS------------------------------------------------------- + # --------------------------------------------GPU SETTINGS---------------------------------------------------------- # set gpu in XGBoost and torch models on_gpu=False, @@ -36,19 +36,9 @@ logger=logging.getLogger(), import_path_init_api_args="dedoc.api.api_args", - # ----------------------------------------TABLE RECOGNIZER SETTINGS------------------------------------------------- - min_h_cell=8, - min_w_cell=20, - type_top_attr=1, - type_left_top_attr=2, - type_left_attr=3, - 
max_vertical_extended=20, - minimal_cell_cnt_line=5, - minimal_cell_avg_length_line=10, - - path_cells=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "cells"), + # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS------------------------------------------- + # path to save debug images for tables recognizer path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"), - rotate_threshold=0.3, # -------------------------------------------RECOGNIZE SETTINGS----------------------------------------------------- # TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value) diff --git a/dedoc/converters/__init__.py b/dedoc/converters/__init__.py index 2a29479e..b71ca457 100644 --- a/dedoc/converters/__init__.py +++ b/dedoc/converters/__init__.py @@ -6,7 +6,7 @@ from .concrete_converters.png_converter import PNGConverter from .concrete_converters.pptx_converter import PptxConverter from .concrete_converters.txt_converter import TxtConverter -from .file_converter import FileConverterComposition +from .converter_composition import ConverterComposition -__all__ = ["AbstractConverter", "BinaryConverter", "DocxConverter", "ExcelConverter", "FileConverterComposition", "PDFConverter", "PNGConverter", +__all__ = ["AbstractConverter", "BinaryConverter", "DocxConverter", "ExcelConverter", "ConverterComposition", "PDFConverter", "PNGConverter", "PptxConverter", "TxtConverter"] diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py index 14b7ace6..a0a5baf5 100644 --- a/dedoc/converters/concrete_converters/abstract_converter.py +++ b/dedoc/converters/concrete_converters/abstract_converter.py @@ -1,7 +1,6 @@ import logging import os import subprocess -import time from abc import ABC, abstractmethod from typing import List, Optional @@ -10,42 +9,48 @@ class 
AbstractConverter(ABC): """ - This class provides the common methods for all converters: can_convert() and do_convert(). + This class provides the common methods for all converters: can_convert() and convert(). """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: configuration of the converter, e.g. logger for logging """ self.timeout = 60 self.period_checking = 0.05 - self.config = config - self.logger = config.get("logger", logging.getLogger()) + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ - Convert the given file to another format if it's possible. - This method can only be called on appropriate files, ensure that :meth:`~dedoc.converters.AbstractConverter.can_convert` \ - is True for the given file. - If the file format is unsupported the ConversionException will be thrown. + Check if this converter can convert file. + You should provide at least one of the following parameters: file_path, extension, mime. 
- :param tmp_dir: directory where the original file is located and where result will be saved - :param filename: name of the original file without extension - :param extension: extension of the original file - :return: name of the converted file + :param file_path: path of the file to convert + :param extension: file extension, for example .doc or .pdf + :param mime: MIME type of file + :param parameters: any additional parameters for the given document + :return: the indicator of possibility to convert this file """ pass @abstractmethod - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ - Check if this converter can convert file with the given extension. + Convert the given file to another format if it's possible. + This method can only be called on appropriate files, ensure that :meth:`~dedoc.converters.AbstractConverter.can_convert` \ + is True for the given file. + If the file format is unsupported the ConversionException will be thrown. 
- :param extension: file extension, for example .doc or .pdf - :param mime: MIME type of file - :param parameters: any additional parameters for given document - :return: the indicator of possibility to convert this file + :param file_path: path of the file to convert + :param parameters: parameters of converting, see :ref:`parameters_description` for more details + :return: path of converted file if conversion was executed """ + pass def _run_subprocess(self, command: List[str], filename: str, expected_path: str) -> None: try: @@ -63,12 +68,3 @@ def _run_subprocess(self, command: List[str], filename: str, expected_path: str) message = f"Conversion of the {filename} hadn't terminated after {self.timeout} seconds" self.logger.error(message) raise ConversionError(msg=message) - - def _await_for_conversion(self, filename: str, tmp_dir: str) -> None: - t = 0 - while (not os.path.isfile(f"{tmp_dir}/{filename}")) and (t < self.timeout): - time.sleep(self.period_checking) - t += self.period_checking - - if t >= self.timeout: - raise ConversionError(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}") diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py index 2089d66c..46142cff 100644 --- a/dedoc/converters/concrete_converters/binary_converter.py +++ b/dedoc/converters/concrete_converters/binary_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.converters.concrete_converters.png_converter import PNGConverter from dedoc.utils import supported_image_types +from dedoc.utils.utils import get_mime_extension class BinaryConverter(AbstractConverter): @@ -10,18 +11,23 @@ class BinaryConverter(AbstractConverter): Converts image-like documents with `mime=application/octet-stream` into PNG. 
Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.png_converter = PNGConverter(config=config) + self.png_converter = PNGConverter(config=self.config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return mime == "application/octet-stream" and extension in supported_image_types - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the image-like and application/octet-stream documents into files with .png extension. 
""" - return self.png_converter.do_convert(tmp_dir, filename, extension) + return self.png_converter.convert(file_path, parameters=parameters) diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py index 2f0f30db..3b50416a 100644 --- a/dedoc/converters/concrete_converters/docx_converter.py +++ b/dedoc/converters/concrete_converters/docx_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class DocxConverter(AbstractConverter): @@ -10,23 +11,28 @@ class DocxConverter(AbstractConverter): Converts docx-like documents into DOCX using the soffice application. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the docx-like documents into files with .docx extension using the soffice application. 
""" - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", tmp_dir, path_in] - file_out = f"{filename}.docx" - expected_path = os.path.join(tmp_dir, file_out) - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", file_dir, file_path] + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.docx") + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return file_out + return converted_file_path diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py index 661fb5c2..1396a12b 100644 --- a/dedoc/converters/concrete_converters/excel_converter.py +++ b/dedoc/converters/concrete_converters/excel_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class ExcelConverter(AbstractConverter): @@ -10,23 +11,28 @@ class ExcelConverter(AbstractConverter): Converts xlsx-like documents into XLSX using the soffice application. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is xlsx-like, e.g. 
it has .xls or .ods extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.excel_like_format or mime in converted_mimes.excel_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the xlsx-like documents into files with .xlsx extension using the soffice application. """ - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", tmp_dir, path_in] - file_out = f"{filename}.xlsx" - expected_path = os.path.join(tmp_dir, file_out) - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", file_dir, file_path] + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.xlsx") + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return file_out + return converted_file_path diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index e1f1c00c..f0b929e8 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class PDFConverter(AbstractConverter): @@ -10,22 +11,28 @@ class PDFConverter(AbstractConverter): Converts pdf-like documents into PDF using the ddjvu application. 
Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is pdf-like, e.g. it has .djvu extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.pdf_like_format or mime in converted_mimes.pdf_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pdf-like documents into files with .pdf extension using the ddjvu application. 
""" - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - expected_path = os.path.join(tmp_dir, f"{filename}.pdf") - command = ["ddjvu", "--format=pdf", path_in, expected_path] - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pdf") + command = ["ddjvu", "--format=pdf", file_path, converted_file_path] + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return filename + ".pdf" + return converted_file_path diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py index 3fdcac26..cb50245d 100644 --- a/dedoc/converters/concrete_converters/png_converter.py +++ b/dedoc/converters/concrete_converters/png_converter.py @@ -6,6 +6,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class PNGConverter(AbstractConverter): @@ -13,26 +14,33 @@ class PNGConverter(AbstractConverter): Converts image-like documents into PNG. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is image-like, e.g. it has .bmp, .jpg, .tiff, etc. extension. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.image_like_format or mime in converted_mimes.image_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the image-like documents into files with .png extension. """ - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - path_out = os.path.join(tmp_dir, f"{filename}.png") + file_dir, file_name = os.path.split(file_path) + name_wo_ext, extension = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.png") + if extension in [".hdr", ".pic", ".sr", ".ras", ".j2k"]: - img = cv2.imread(path_in) - cv2.imwrite(path_out, img) + img = cv2.imread(file_path) + cv2.imwrite(converted_file_path, img) else: - img = Image.open(path_in) - img.save(path_out) + img = Image.open(file_path) + img.save(converted_file_path) - return f"{filename}.png" + return converted_file_path diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py index 312791fe..d1e7aec3 100644 --- a/dedoc/converters/concrete_converters/pptx_converter.py +++ b/dedoc/converters/concrete_converters/pptx_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class PptxConverter(AbstractConverter): @@ -10,23 +11,28 @@ class PptxConverter(AbstractConverter): Converts pptx-like documents into PPTX using the soffice application. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. 
""" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is pptx-like, e.g. it has .ppt or .odp extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.pptx_like_format or mime in converted_mimes.pptx_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pptx-like documents into files with .pptx extension using the soffice application. """ - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", tmp_dir, path_in] - file_out = f"{filename}.pptx" - expected_path = os.path.join(tmp_dir, file_out) - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", file_dir, file_path] + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pptx") + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return file_out + return converted_file_path diff --git a/dedoc/converters/concrete_converters/txt_converter.py b/dedoc/converters/concrete_converters/txt_converter.py index 5a8e85cc..b1543fa0 100644 --- a/dedoc/converters/concrete_converters/txt_converter.py +++ b/dedoc/converters/concrete_converters/txt_converter.py @@ -4,6 +4,7 @@ from 
dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class TxtConverter(AbstractConverter): @@ -11,20 +12,27 @@ class TxtConverter(AbstractConverter): Converts txt-like documents into TXT by simple renaming. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is txt-like, e.g. it has .xml extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.txt_like_format or mime in converted_mimes.txt_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the txt-like documents into files with .txt extension by renaming it. 
""" - file_path = os.path.join(tmp_dir, f"{filename}{extension}") - converted_file_name = f"{filename}.txt" - shutil.copy(file_path, os.path.join(tmp_dir, converted_file_name)) - return converted_file_name + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.txt") + shutil.copy(file_path, converted_file_path) + + return converted_file_path diff --git a/dedoc/converters/file_converter.py b/dedoc/converters/converter_composition.py similarity index 51% rename from dedoc/converters/file_converter.py rename to dedoc/converters/converter_composition.py index 7048d0ac..cf12c2ed 100644 --- a/dedoc/converters/file_converter.py +++ b/dedoc/converters/converter_composition.py @@ -3,10 +3,10 @@ from typing import List, Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.utils.utils import get_file_mime_type, splitext_ +from dedoc.utils.utils import get_mime_extension -class FileConverterComposition(object): +class ConverterComposition(object): """ This class allows to convert any document into the predefined list of formats according to the available list of converters. The list of converters is set via the class constructor. @@ -15,28 +15,26 @@ class FileConverterComposition(object): """ def __init__(self, converters: List[AbstractConverter]) -> None: """ - :param converters: the list of converters that have methods can_convert() and do_convert(), \ + :param converters: the list of converters that have methods can_convert() and convert(), \ they are used for files converting into specified formats """ self.converters = converters - def do_converting(self, tmp_dir: str, filename: str, parameters: Optional[dict] = None) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert file if there is the converter that can do it. 
If there isn't any converter that is able to convert the file, it isn't changed. - :param tmp_dir: the directory where the file is located and where the converted file will be saved - :param filename: the name of the file to convert - :param parameters: parameters of converting - :return: name of the converted file if conversion was executed else name of the original file + :param file_path: path of the file to convert + :param parameters: parameters of converting, see :ref:`parameters_description` for more details + :return: path of converted file if conversion was executed else path of the original file """ - name, extension = splitext_(filename) - mime = get_file_mime_type(os.path.join(tmp_dir, filename)) + extension, mime = get_mime_extension(file_path=file_path) + converted_file_path = file_path + for converter in self.converters: - can_convert = converter.can_convert(extension=extension, mime=mime, parameters=parameters) - if can_convert: - filename = converter.do_convert(tmp_dir, name, extension) + if converter.can_convert(file_path=file_path, extension=extension, mime=mime, parameters=parameters): + converted_file_path = converter.convert(file_path, parameters=parameters) break - file_path = os.path.join(tmp_dir, filename) - os.chmod(file_path, S_IREAD | S_IRGRP | S_IROTH) - return filename + os.chmod(converted_file_path, S_IREAD | S_IRGRP | S_IROTH) + return converted_file_path diff --git a/dedoc/data_structures/concrete_annotations/attach_annotation.py b/dedoc/data_structures/concrete_annotations/attach_annotation.py index 7c34be22..b3c33bbe 100644 --- a/dedoc/data_structures/concrete_annotations/attach_annotation.py +++ b/dedoc/data_structures/concrete_annotations/attach_annotation.py @@ -15,4 +15,4 @@ def __init__(self, attach_uid: str, start: int, end: int) -> None: :param start: start of the annotated text (usually zero) :param end: end of the annotated text (usually end of the line) """ - super().__init__(start=start, end=end, 
name=AttachAnnotation.name, value=attach_uid) + super().__init__(start=start, end=end, name=AttachAnnotation.name, value=attach_uid, is_mergeable=False) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index c08f359a..bd453d24 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -23,7 +23,7 @@ def __init__(self, start: int, end: int, value: BBox, page_width: int, page_heig if not isinstance(value, BBox): raise ValueError("the value of bounding box annotation should be instance of BBox") - super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height))) + super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height)), is_mergeable=False) @staticmethod def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]: diff --git a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py index 3c62b8f2..ae6c6899 100644 --- a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py +++ b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py @@ -14,4 +14,4 @@ def __init__(self, start: int, end: int, value: str) -> None: :param end: end of the annotated text (not included) :param value: text, linked to given one, for example text of the footnote """ - super().__init__(start=start, end=end, name=LinkedTextAnnotation.name, value=value) + super().__init__(start=start, end=end, name=LinkedTextAnnotation.name, value=value, is_mergeable=False) diff --git a/dedoc/data_structures/concrete_annotations/table_annotation.py b/dedoc/data_structures/concrete_annotations/table_annotation.py index 8575d564..1b052468 100644 --- 
a/dedoc/data_structures/concrete_annotations/table_annotation.py +++ b/dedoc/data_structures/concrete_annotations/table_annotation.py @@ -14,4 +14,4 @@ def __init__(self, name: str, start: int, end: int) -> None: :param start: start of the annotated text (usually zero) :param end: end of the annotated text (usually end of the line) """ - super().__init__(start=start, end=end, name=TableAnnotation.name, value=name) + super().__init__(start=start, end=end, name=TableAnnotation.name, value=name, is_mergeable=False) diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index d2be05bf..64d96306 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -32,7 +32,7 @@ def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] :param manager_config: dictionary with different stage document processors. The following keys should be in the `manager_config` dictionary: - - converter (optional) (:class:`~dedoc.converters.FileConverterComposition`) + - converter (optional) (:class:`~dedoc.converters.ConverterComposition`) - reader (:class:`~dedoc.readers.ReaderComposition`) - structure_extractor (:class:`~dedoc.structure_extractors.StructureExtractorComposition`) - structure_constructor (:class:`~dedoc.structure_constructors.StructureConstructorComposition`) @@ -63,10 +63,10 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> If some error occurred, file metadata are stored in the exception's metadata field. 
:param file_path: full path where the file is located - :param parameters: any parameters, specify how to parse file (see API parameters documentation for more details) + :param parameters: any parameters, specify how to parse file, see :ref:`parameters_description` for more details :return: parsed document """ - parameters = self.__init_parameters(parameters) + parameters = self.__init_parameters(file_path, parameters) self.logger.info(f"Get file {os.path.basename(file_path)} with parameters {parameters}") try: @@ -92,37 +92,32 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) unique_filename = get_unique_name(file_name) with tempfile.TemporaryDirectory() as tmp_dir: - shutil.copy(file_path, os.path.join(tmp_dir, unique_filename)) + tmp_file_path = os.path.join(tmp_dir, unique_filename) + shutil.copy(file_path, tmp_file_path) # Step 1 - Converting - converted_filename = self.converter.do_converting(tmp_dir, unique_filename, parameters=parameters) - self.logger.info(f"Finish conversion {file_name} -> {converted_filename}") + converted_file_path = self.converter.convert(tmp_file_path) + self.logger.info(f"Finish conversion {file_name} -> {os.path.basename(converted_file_path)}") # Step 2 - Reading content - unstructured_document = self.reader.parse_file(tmp_dir=tmp_dir, filename=converted_filename, parameters=parameters) + unstructured_document = self.reader.read(file_path=converted_file_path, parameters=parameters) self.logger.info(f"Finish parse file {file_name}") # Step 3 - Adding meta-information - metadata = self.document_metadata_extractor.extract_metadata(directory=tmp_dir, - filename=unique_filename, - converted_filename=converted_filename, - original_filename=file_name, - parameters=parameters, - other_fields=unstructured_document.metadata) + metadata = self.document_metadata_extractor.extract(file_path=tmp_file_path, converted_filename=os.path.basename(converted_file_path), + original_filename=file_name, 
parameters=parameters, other_fields=unstructured_document.metadata) unstructured_document.metadata = metadata self.logger.info(f"Add metadata of file {file_name}") # Step 4 - Extract structure - unstructured_document = self.structure_extractor.extract_structure(unstructured_document, parameters) + unstructured_document = self.structure_extractor.extract(unstructured_document, parameters) self.logger.info(f"Extract structure from file {file_name}") if self.config.get("labeling_mode", False): self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document) # Step 5 - Form the output structure - parsed_document = self.structure_constructor.structure_document(document=unstructured_document, - structure_type=parameters.get("structure_type"), - parameters=parameters) + parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters) self.logger.info(f"Get structured document {file_name}") # Step 6 - Get attachments @@ -133,13 +128,16 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) self.logger.info(f"Finish handle {file_name}") return parsed_document - def __init_parameters(self, parameters: Optional[dict]) -> dict: + def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict: parameters = {} if parameters is None else parameters result_parameters = {} for parameter_name, parameter_value in self.default_parameters.items(): result_parameters[parameter_name] = parameters.get(parameter_name, parameter_value) + attachments_dir = parameters.get("attachments_dir", None) + result_parameters["attachments_dir"] = os.path.dirname(file_path) if attachments_dir is None else attachments_dir + return result_parameters def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None: diff --git a/dedoc/extensions.py b/dedoc/extensions.py index d35e12bf..bddce5c8 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -45,14 +45,14 @@ ) 
recognized_extensions = Extensions( - excel_like_format=[], - docx_like_format=[], - pptx_like_format=[], + excel_like_format=[".xlsx"], + docx_like_format=[".docx"], + pptx_like_format=[".pptx"], archive_like_format=[".tar.gz"], - image_like_format=[], - pdf_like_format=[], + image_like_format=[".png"], + pdf_like_format=[".pdf"], csv_like_format=[".csv", ".tsv"], - txt_like_format=[] + txt_like_format=[".txt", ".txt.gz"] ) recognized_mimes = Extensions( @@ -62,7 +62,7 @@ archive_like_format=["application/zip", "application/x-tar", "application/x-rar-compressed", "application/rar", "application/x-7z-compressed"], image_like_format=["image/jpeg", "image/png", "image/tiff", "image/x-ms-bmp", "image/bmp"], pdf_like_format=["application/pdf"], - csv_like_format=[], + csv_like_format=["text/csv"], txt_like_format=["text/plain", "text/html"] ) diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py index b7993f53..6854c6f4 100644 --- a/dedoc/manager_config.py +++ b/dedoc/manager_config.py @@ -1,51 +1,52 @@ from typing import Optional -from dedoc.attachments_handler.attachments_handler import AttachmentsHandler -from dedoc.converters.concrete_converters.binary_converter import BinaryConverter -from dedoc.converters.concrete_converters.docx_converter import DocxConverter -from dedoc.converters.concrete_converters.excel_converter import ExcelConverter -from dedoc.converters.concrete_converters.pdf_converter import PDFConverter -from dedoc.converters.concrete_converters.png_converter import PNGConverter -from dedoc.converters.concrete_converters.pptx_converter import PptxConverter -from dedoc.converters.concrete_converters.txt_converter import TxtConverter -from dedoc.converters.file_converter import FileConverterComposition -from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor -from 
dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor -from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition -from dedoc.readers.archive_reader.archive_reader import ArchiveReader -from dedoc.readers.csv_reader.csv_reader import CSVReader -from dedoc.readers.docx_reader.docx_reader import DocxReader -from dedoc.readers.email_reader.email_reader import EmailReader -from dedoc.readers.excel_reader.excel_reader import ExcelReader -from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.readers.json_reader.json_reader import JsonReader -from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader -from dedoc.readers.note_reader.note_reader import NoteReader -from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader -from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from dedoc.readers.pptx_reader.pptx_reader import PptxReader -from dedoc.readers.reader_composition import ReaderComposition -from dedoc.readers.txt_reader.raw_text_reader import RawTextReader -from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor -from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor -from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition -from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import 
ClassifyingLawStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor -from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition - -"""MANAGER SETTINGS""" - def _get_manager_config(config: dict) -> dict: + """ + Imports are here in order not to do all of them when someone does `import dedoc` + """ + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler + from dedoc.converters.concrete_converters.binary_converter import BinaryConverter + from dedoc.converters.concrete_converters.docx_converter import DocxConverter + from dedoc.converters.concrete_converters.excel_converter import ExcelConverter + from dedoc.converters.concrete_converters.pdf_converter import PDFConverter + from dedoc.converters.concrete_converters.png_converter import PNGConverter + from dedoc.converters.concrete_converters.pptx_converter import PptxConverter + from dedoc.converters.concrete_converters.txt_converter import TxtConverter + from dedoc.converters.converter_composition import ConverterComposition + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor + from 
dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor + from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition + from dedoc.readers.archive_reader.archive_reader import ArchiveReader + from dedoc.readers.csv_reader.csv_reader import CSVReader + from dedoc.readers.docx_reader.docx_reader import DocxReader + from dedoc.readers.email_reader.email_reader import EmailReader + from dedoc.readers.excel_reader.excel_reader import ExcelReader + from dedoc.readers.html_reader.html_reader import HtmlReader + from dedoc.readers.json_reader.json_reader import JsonReader + from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader + from dedoc.readers.note_reader.note_reader import NoteReader + from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + from dedoc.readers.pptx_reader.pptx_reader import PptxReader + from dedoc.readers.reader_composition import ReaderComposition + from dedoc.readers.txt_reader.raw_text_reader import RawTextReader + from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor + from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor + from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition + from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor + from 
dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor + from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition + converters = [ DocxConverter(config=config), ExcelConverter(config=config), @@ -57,13 +58,13 @@ def _get_manager_config(config: dict) -> dict: ] readers = [ DocxReader(config=config), - ExcelReader(), - PptxReader(), - CSVReader(), + ExcelReader(config=config), + PptxReader(config=config), + CSVReader(config=config), HtmlReader(config=config), RawTextReader(config=config), NoteReader(config=config), - JsonReader(), + JsonReader(config=config), ArchiveReader(config=config), PdfAutoReader(config=config), PdfTabbyReader(config=config), @@ -74,11 +75,11 @@ def _get_manager_config(config: dict) -> dict: ] metadata_extractors = [ - DocxMetadataExtractor(), + DocxMetadataExtractor(config=config), PdfMetadataExtractor(config=config), ImageMetadataExtractor(config=config), - NoteMetadataExtractor(), - BaseMetadataExtractor() + NoteMetadataExtractor(config=config), + BaseMetadataExtractor(config=config) ] law_extractors = { @@ -86,14 +87,14 @@ def _get_manager_config(config: dict) -> dict: LawStructureExtractor.document_type: LawStructureExtractor(config=config) } structure_extractors = { - DefaultStructureExtractor.document_type: DefaultStructureExtractor(), + DefaultStructureExtractor.document_type: DefaultStructureExtractor(config=config), DiplomaStructureExtractor.document_type: 
DiplomaStructureExtractor(config=config), TzStructureExtractor.document_type: TzStructureExtractor(config=config), ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config) } return dict( - converter=FileConverterComposition(converters=converters), + converter=ConverterComposition(converters=converters), reader=ReaderComposition(readers=readers), structure_extractor=StructureExtractorComposition(extractors=structure_extractors, default_key="other"), structure_constructor=StructureConstructorComposition( diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 602ee68e..3aa74bfe 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -8,35 +8,33 @@ class AbstractMetadataExtractor(ABC): """ @abstractmethod def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" pass @abstractmethod - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True. - :param directory: path to the directory where the original and converted files are located - :param filename: name of the file after renaming (for example 23141.doc). \ - The file gets a new name during processing by the dedoc manager (if used) - :param converted_filename: name of the file after renaming and conversion (for example 23141.docx) - :param original_filename: name of the file before renaming - :param parameters: additional parameters for document parsing + :param file_path: path to the file to extract metadata. \ + If dedoc manager is used, the file gets a new name during processing - this name should be passed here (for example 23141.doc) + :param converted_filename: name of the file after renaming and conversion (if dedoc manager is used, for example 23141.docx), \ + by default it's a name from the file_path. Converted file should be located in the same directory as the file before converting. 
+ :param original_filename: name of the file before renaming (if dedoc manager is used), by default it's a name from the file_path + :param parameters: additional parameters for document parsing, see :ref:`parameters_description` for more details :param other_fields: other fields that should be added to the document's metadata :return: dict with metadata information about the document """ diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index e685becc..0e467760 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -1,6 +1,7 @@ +import logging import os from base64 import b64encode -from typing import Optional +from typing import Optional, Tuple from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.utils.utils import get_file_mime_type @@ -20,11 +21,17 @@ class BaseMetadataExtractor(AbstractMetadataExtractor): - time when the file was last modified. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the extractor, e.g. 
logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) + def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ @@ -33,24 +40,24 @@ def can_extract(self, """ return True - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Gets the basic meta-information about the file. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" parameters = {} if parameters is None else parameters - meta_info = self._get_base_meta_information(directory, filename, original_filename) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + meta_info = self._get_base_meta_information(file_dir, file_name, original_filename) if parameters.get("is_attached", False) and str(parameters.get("return_base64", "false")).lower() == "true": other_fields = {} if other_fields is None else other_fields - path = os.path.join(directory, filename) + path = os.path.join(file_dir, converted_filename) with open(path, "rb") as file: other_fields["base64_encode"] = b64encode(file.read()).decode("utf-8") @@ -72,3 +79,10 @@ def _get_base_meta_information(directory: str, filename: str, name_actual: str) } return meta + + def _get_names(self, file_path: str, converted_filename: Optional[str], original_filename: Optional[str]) -> Tuple[str, str, str, str]: + file_dir, file_name = os.path.split(file_path) + converted_filename = file_name if converted_filename is None else converted_filename + original_filename = file_name if original_filename is None else original_filename + + return file_dir, file_name, converted_filename, original_filename diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 49b87001..be0964c2 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -22,36 +22,40 @@ class DocxMetadataExtractor(BaseMetadataExtractor): - author who last modified the file; - created, modified and last printed date. 
""" + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has .docx extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. """ + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) return converted_filename.lower().endswith("docx") - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the docx documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" parameters = {} if parameters is None else parameters + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) - result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) - file_path = os.path.join(directory, converted_filename) + file_path = os.path.join(file_dir, converted_filename) docx_other_fields = self._get_docx_fields(file_path) result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields} diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index 31062c72..465c9dea 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -1,4 +1,3 @@ -import logging import math import os from typing import Optional, Union @@ -28,12 +27,9 @@ class ImageMetadataExtractor(BaseMetadataExtractor): - subject distance range; - user comment. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the extractor, e.g. 
logger for logging - """ - self.logger = config.get("logger", logging.getLogger()) - super().__init__() + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.keys = { "DateTime": ("date_time", self.__parse_date), "DateTimeDigitized": ("date_time_digitized", self.__parse_date), @@ -53,33 +49,33 @@ def __init__(self, *, config: dict) -> None: } def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has image-like extension (".png", ".jpg", ".jpeg"). Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. """ - return filename.lower().endswith((".png", ".jpg", ".jpeg")) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + return converted_filename.lower().endswith((".png", ".jpg", ".jpeg")) - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for images. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" - result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) - path = os.path.join(directory, filename) + path = os.path.join(file_dir, converted_filename) exif_fields = self._get_exif(path) if len(exif_fields) > 0: result["other_fields"] = {**result.get("other_fields", {}), **exif_fields} diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index 18b49d6b..e0dc4b6e 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -13,36 +13,37 @@ class NoteMetadataExtractor(BaseMetadataExtractor): In addition to them, the `author` field can be added to the metadata other fields. """ - def __init__(self) -> None: - super().__init__() + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has .note.pickle extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. 
""" - return filename.lower().endswith(".note.pickle") - - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + return converted_filename.lower().endswith(".note.pickle") + + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the .note.pickle documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) try: - file_path = os.path.join(directory, filename) + file_path = os.path.join(file_dir, converted_filename) with open(file_path, "rb") as infile: note_dict = pickle.load(infile) @@ -58,4 +59,4 @@ def extract_metadata(self, other_fields=other_fields) return meta_info except Exception: - raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(file_path)}. 
Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index 274a8d26..e3502e44 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -1,4 +1,3 @@ -import logging import os from typing import Optional @@ -24,11 +23,9 @@ class PdfMetadataExtractor(BaseMetadataExtractor): - creation date; - modification date. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the extractor, e.g. logger for logging - """ - super().__init__() + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.keys = { "/Producer": "producer", "/Creator": "creator", @@ -42,36 +39,34 @@ def __init__(self, *, config: dict) -> None: "/CreationDate": "creation_date", "/ModDate": "modification_date", } - self.config = config - self.logger = config.get("logger", logging.getLogger()) def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has .pdf extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. 
""" - return filename.lower().endswith(".pdf") + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + return converted_filename.lower().endswith(".pdf") - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the pdf documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ - result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) - path = os.path.join(directory, filename) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) + path = os.path.join(file_dir, converted_filename) pdf_fields = self._get_pdf_info(path) if len(pdf_fields) > 0: result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields} diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index e9c182d4..ba46c4b0 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ 
b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -1,3 +1,4 @@ +import os.path from typing import List, Optional from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor @@ -12,33 +13,24 @@ class MetadataExtractorComposition: """ def __init__(self, extractors: List[AbstractMetadataExtractor]) -> None: """ - :param extractors: the list of extractors with methods can_extract() and extract_metadata() to extract metadata from file + :param extractors: the list of extractors with methods can_extract() and extract() to extract metadata from file """ self.extractors = extractors - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Extract metadata using one of the extractors if suitable extractor was found. - Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` of the class + Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` of the class :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` documentation to get the information about method's parameters. 
""" for extractor in self.extractors: - if extractor.can_extract(directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, + if extractor.can_extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, other_fields=other_fields): - return extractor.extract_metadata(directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, - other_fields=other_fields) - raise Exception(f"Can't extract metadata from from file {filename}") + return extractor.extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) + raise Exception(f"Can't extract metadata from from file {os.path.basename(file_path)}") diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index fac1b86c..d8831b58 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -1,4 +1,3 @@ -import logging import os import tarfile import uuid @@ -14,7 +13,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_file_mime_type, save_data_to_unique_file +from dedoc.utils.utils import get_file_mime_type, get_mime_extension, save_data_to_unique_file class ArchiveReader(BaseReader): @@ -22,21 +21,18 @@ class ArchiveReader(BaseReader): This reader allows to get archived files as attachments of the :class:`~dedoc.data_structures.UnstructuredDocument`. Documents with the following extensions can be parsed: .zip, .tar, .tar.gz, .rar, .7z. 
""" - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.archive_like_format or mime in recognized_mimes.archive_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return empty content of archive, all content will be placed inside attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
@@ -48,10 +44,10 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return UnstructuredDocument(lines=[], tables=[], attachments=[]) attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" - attachments = self.__get_attachments(path=path, tmp_dir=attachments_dir, need_content_analysis=need_content_analysis) + attachments = self.__get_attachments(path=file_path, tmp_dir=attachments_dir, need_content_analysis=need_content_analysis) return UnstructuredDocument(lines=[], tables=[], attachments=attachments) def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool) -> List[AttachedFile]: diff --git a/dedoc/readers/base_reader.py b/dedoc/readers/base_reader.py index 247270e9..6d857107 100644 --- a/dedoc/readers/base_reader.py +++ b/dedoc/readers/base_reader.py @@ -1,3 +1,4 @@ +import logging from abc import ABC, abstractmethod from typing import Optional @@ -14,34 +15,39 @@ class BaseReader(ABC): Some of the readers can also extract information about line type and hierarchy level (for example, list item) - this information is stored in the `tag_hierarchy_level` attribute of the class :class:`~dedoc.data_structures.LineMetadata`. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the reader, e.g. 
logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ - Read file from disk and extract text with annotations, tables and attachments from the document. - The given file should have appropriate extension and type so it should be checked by the method - :meth:`~dedoc.readers.BaseReader.can_read`, which should return True beforehand. + Check if this reader can handle the given file. + You should provide at least one of the following parameters: file_path, extension, mime. - :param path: path to the file in the file system - :param document_type: type of the file, for example scientific article, presentation slides and so on - :param parameters: dict with additional parameters for document reader (as language for scans or delimiter for csv) + :param file_path: path to the file in the file system + :param mime: MIME type of a file + :param extension: file extension, for example .doc or .pdf + :param parameters: dict with additional parameters for document reader, see :ref:`parameters_description` for more details - :return: intermediate representation of the document with lines, tables and attachments + :return: True if this reader can handle the file, False otherwise """ pass @abstractmethod - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ - Check if this reader can handle the given file. + Read file from disk and extract text with annotations, tables and attachments from the document. 
+ The given file should have appropriate extension and mime type, so it should be checked by the method + :meth:`~dedoc.readers.BaseReader.can_read`, which should return True beforehand. - :param path: path to the file in the file system - :param mime: MIME type of a file - :param extension: file extension, for example .doc or .pdf - :param document_type: type of file, for example scientific article, presentation slides and so on - :param parameters: dict with additional parameters for document reader (as language for scans or delimiter for csv) + :param file_path: path to the file in the file system + :param parameters: dict with additional parameters for document reader, see :ref:`parameters_description` for more details - :return: True if this reader can handle the file, False otherwise + :return: intermediate representation of the document with lines, tables and attachments """ pass diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py index be02a6b8..d1de64ed 100644 --- a/dedoc/readers/csv_reader/csv_reader.py +++ b/dedoc/readers/csv_reader/csv_reader.py @@ -8,34 +8,38 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_encoding +from dedoc.utils.utils import get_encoding, get_mime_extension class CSVReader(BaseReader): """ This class allows to parse files with the following extensions: .csv, .tsv. 
""" - def __init__(self) -> None: + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.default_separator = "," - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.csv_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method will place all extracted content inside tables of the :class:`~dedoc.data_structures.UnstructuredDocument`. The lines and attachments remain empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + parameters = {} if parameters is None else parameters delimiter = parameters.get("delimiter") if delimiter is None: - delimiter = "\t" if path.endswith(".tsv") else self.default_separator - encoding, encoding_warning = self.__get_encoding(path, parameters) - with open(path, errors="ignore", encoding=encoding) as file: + delimiter = "\t" if file_path.endswith(".tsv") else self.default_separator + encoding, encoding_warning = self.__get_encoding(file_path, parameters) + with open(file_path, errors="ignore", encoding=encoding) as file: csv_reader = csv.reader(file, delimiter=delimiter) data = list(csv_reader) table_metadata = TableMetadata(page_id=0) diff --git a/dedoc/readers/docx_reader/README.md b/dedoc/readers/docx_reader/README.md index 35e2343c..ad28b7cc 100644 --- a/dedoc/readers/docx_reader/README.md +++ b/dedoc/readers/docx_reader/README.md @@ -1,6 +1,5 @@ # Docx reader documentation - -[стандарт Office Open XML File Formats с. 28-62; 167-1301](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-376,%20Fifth%20Edition,%20Part%201%20-%20Fundamentals%20And%20Markup%20Language%20Reference.zip) +[Стандарт Office Open XML File Formats с. 
28-62; 167-1301](https://ecma-international.org/wp-content/uploads/ECMA-376-1_5th_edition_december_2016.zip) ## Структура docx diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 152ea31f..49d402e8 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -73,7 +73,7 @@ def __get_lines(self) -> List[LineWithMeta]: self.__handle_table_xml(paragraph_xml, table_refs) continue - if paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor + if self.attachment_name2uid and paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor self.__handle_diagram_xml(paragraph_xml, diagram_refs) continue @@ -84,9 +84,11 @@ def __get_lines(self) -> List[LineWithMeta]: continue self.paragraph_list.append(self.paragraph_maker.make_paragraph(paragraph_xml, self.paragraph_list)) - images = paragraph_xml.find_all("pic:pic") - if images: - self.__handle_images_xml(images, image_refs) + + if self.attachment_name2uid: + images = paragraph_xml.find_all("pic:pic") + if images: + self.__handle_images_xml(images, image_refs) return self.__paragraphs2lines(image_refs, table_refs, diagram_refs) diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index 552b09c5..1e503738 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -1,5 +1,3 @@ -import logging -import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor @@ -9,6 +7,7 @@ from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument +from dedoc.utils.utils import get_mime_extension class 
DocxReader(BaseReader): @@ -16,30 +15,31 @@ class DocxReader(BaseReader): This class is used for parsing documents with .docx extension. Please use :class:`~dedoc.converters.DocxConverter` for getting docx file from similar formats. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.attachment_extractor = DocxAttachmentsExtractor() - self.logger = config.get("logger", logging.getLogger()) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = DocxAttachmentsExtractor(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) - docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger) + with_attachments = self.attachment_extractor.with_attachments(parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] + + docx_document = DocxDocument(path=file_path, attachments=attachments, logger=self.logger) lines = self.__fix_lines(docx_document.lines) return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[]) diff --git a/dedoc/readers/docx_reader/properties_extractor.py b/dedoc/readers/docx_reader/properties_extractor.py index 0e92b80a..b5349a27 100644 --- a/dedoc/readers/docx_reader/properties_extractor.py +++ b/dedoc/readers/docx_reader/properties_extractor.py @@ -1,8 +1,16 @@ +from typing import Union + from bs4 import Tag from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties +def spacing_to_float(spacing: Union[str, int, float]) -> float: + if str(spacing).endswith("pt"): + return float(spacing[:-2]) + return float(spacing) + + def check_if_true(value: str) -> bool: if value == "1" or value == "True" or value == "true": return True @@ -79,7 +87,7 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None: ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"] } for attribute in attributes: - attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0)) + attributes[attribute] = spacing_to_float(tree.ind.get(f"w:{attribute}", 0)) indentation = 0 if attributes["left"] != 0: @@ -109,7 +117,7 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None: :param tree: BeautifulSoup tree with properties """ if tree.sz: - new_size = 
float(tree.sz.get("w:val", old_properties.size)) + new_size = spacing_to_float(tree.sz.get("w:val", old_properties.size)) old_properties.size = int(new_size) @@ -180,19 +188,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None: if not before_autospacing: before_lines = tree.spacing.get("w:beforeLines", False) - before_lines = int(float(before_lines)) if before_lines else before_lines + before_lines = int(spacing_to_float(before_lines)) if before_lines else before_lines if not before_lines: before_tag = tree.spacing.get("w:before", False) - before = int(float(before_tag)) if before_tag else before + before = int(spacing_to_float(before_tag)) if before_tag else before else: before = before_lines if not after_autospacing: after_lines = tree.spacing.get("w:afterLines", False) - after_lines = int(float(after_lines)) if after_lines else after_lines + after_lines = int(spacing_to_float(after_lines)) if after_lines else after_lines if not after_lines: after_tag = tree.spacing.get("w:after", False) - after = int(float(after_tag)) if after_tag else after + after = int(spacing_to_float(after_tag)) if after_tag else after else: after = after_lines diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 93c02c4d..7a239e31 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -1,6 +1,5 @@ import email import json -import logging import mimetypes import os import re @@ -17,29 +16,27 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.utils.utils import get_unique_name, save_data_to_unique_file +from dedoc.utils.utils import get_mime_extension, get_unique_name, save_data_to_unique_file class EmailReader(BaseReader): """ This class is used for parsing documents with .eml extension (e-mail messages saved into 
files). """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - super().__init__() - self.logger = config.get("logger", logging.getLogger()) - self.html_reader = HtmlReader(config=config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.html_reader = HtmlReader(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension or mime is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ - return path.lower().endswith(".eml") or mime == "message/rfc822" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return file_path.lower().endswith(".eml") or mime == "message/rfc822" - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. 
@@ -50,9 +47,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio """ parameters = {} if parameters is None else parameters attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir - with open(path, "rb") as f: + with open(file_path, "rb") as f: msg = email.message_from_binary_file(f) tables, attachments = [], [] @@ -77,7 +74,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio if content_type == "text/plain": text_parts.append(msg) if content_type == "text/html": - self.__add_content_from_html(msg, lines, tables) + self.__add_content_from_html(msg, lines, tables, parameters) html_found = True for part in msg.walk(): @@ -87,7 +84,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio continue if content_type == "text/html": - self.__add_content_from_html(part, lines, tables) + self.__add_content_from_html(part, lines, tables, parameters) html_found = True continue @@ -131,7 +128,7 @@ def __add_attachment(self, message: Message, attachments_dir: str, attachments: uid=f"attach_{uuid.uuid1()}", need_content_analysis=need_content_analysis)) - def __add_content_from_html(self, message: Message, lines: list, tables: list) -> None: + def __add_content_from_html(self, message: Message, lines: list, tables: list, parameters: dict) -> None: payload = message.get_payload(decode=True) if payload is None: return @@ -143,7 +140,7 @@ def __add_content_from_html(self, message: Message, lines: list, tables: list) - file.write(payload) file.flush() - document = self.html_reader.read(path=file.name) + document = self.html_reader.read(file_path=file.name, parameters=parameters) part_messages = [line for line in document.lines if line.line is not None] for line in part_messages: line._line += "\n" diff --git 
a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py index e846c0ca..91501e97 100644 --- a/dedoc/readers/excel_reader/excel_reader.py +++ b/dedoc/readers/excel_reader/excel_reader.py @@ -1,4 +1,3 @@ -import os from typing import Optional import xlrd @@ -12,6 +11,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension xlrd.xlsx.ensure_elementtree_imported(False, None) xlrd.xlsx.Element_has_iter = True @@ -22,30 +22,33 @@ class ExcelReader(BaseReader): This class is used for parsing documents with .xlsx extension. Please use :class:`~dedoc.converters.ExcelConverter` for getting xlsx file from similar formats. """ - def __init__(self) -> None: - self.attachment_extractor = ExcelAttachmentsExtractor() - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = ExcelAttachmentsExtractor(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ This method extracts tables and attachments from the document, `lines` attribute remains empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - with xlrd.open_workbook(path) as book: + with xlrd.open_workbook(file_path) as book: sheets_num = book.nsheets tables = [] for sheet_num in range(sheets_num): sheet = book.sheet_by_index(sheet_num) tables.append(self.__parse_sheet(sheet_num, sheet)) if self.attachment_extractor.with_attachments(parameters=parameters): - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) else: attachments = [] return UnstructuredDocument(lines=[], tables=tables, attachments=attachments, warnings=[]) diff --git a/dedoc/readers/html2pdf_reader/html2pdf_reader.py b/dedoc/readers/html2pdf_reader/html2pdf_reader.py index 678ba53a..f18cbf16 100644 --- a/dedoc/readers/html2pdf_reader/html2pdf_reader.py +++ b/dedoc/readers/html2pdf_reader/html2pdf_reader.py @@ -1,4 +1,3 @@ -import logging import os import re from copy import deepcopy @@ -19,22 +18,20 @@ class Html2PdfReader(HtmlReader): - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.pdf_reader = PdfTxtlayerReader(config=config) - self.config = config - self.logger = 
config.get("logger", logging.getLogger()) + self.pdf_reader = PdfTxtlayerReader(config=self.config) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: parameters = {} if parameters is None else parameters with TemporaryDirectory() as tmp_dir: - modified_path, tables = self._modify_html(path, tmp_dir) - converted_path = os.path.join(tmp_dir, os.path.basename(path).replace(".html", ".pdf")) + modified_path, tables = self._modify_html(file_path, tmp_dir) + converted_path = os.path.join(tmp_dir, os.path.basename(file_path).replace(".html", ".pdf")) HTML(filename=modified_path).write_pdf(converted_path) self.logger.info(f"Convert {modified_path} to {converted_path}") parameters_new = deepcopy(parameters) parameters_new["pdf_with_text_layer"] = "true" - unstructured_document = self.pdf_reader.read(path=converted_path, document_type=document_type, parameters=parameters_new) + unstructured_document = self.pdf_reader.read(file_path=converted_path, parameters=parameters_new) return self._add_tables(document=unstructured_document, tables=tables) diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index fe97614a..0e4e0a45 100644 --- a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -1,5 +1,4 @@ import hashlib -import logging import string from typing import List, Optional, Union @@ -17,7 +16,7 @@ from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser from dedoc.readers.html_reader.html_tags import HtmlTags -from dedoc.utils.utils import calculate_file_hash +from dedoc.utils.utils import calculate_file_hash, get_mime_extension class HtmlReader(BaseReader): @@ -25,34 +24,31 @@ class HtmlReader(BaseReader): This 
reader allows to handle documents with the following extensions: .html, .shtml """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.postprocessor = HtmlLinePostprocessing() self.tag_annotation_parser = HtmlTagAnnotationParser() - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in [".html", ".shtml"] or mime in ["text/html"] - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines and tables, attachments remain empty. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - with open(path, "rb") as f: + with open(file_path, "rb") as f: soup = BeautifulSoup(f.read(), "html.parser") handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true" - path_hash = calculate_file_hash(path=path) + path_hash = calculate_file_hash(path=file_path) lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table) tables = [ self._read_table(table, path_hash) for table in soup.find_all("table") if self._visible_table(table, handle_invisible_table=handle_invisible_table) diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py index 7f3cb2f2..f408674f 100644 --- a/dedoc/readers/json_reader/json_reader.py +++ b/dedoc/readers/json_reader/json_reader.py @@ -1,4 +1,3 @@ -import os from json import JSONDecodeError from typing import Any, List, Optional @@ -12,24 +11,27 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class JsonReader(BaseReader): """ This reader allows handle json files. """ - def __init__(self) -> None: - super().__init__() - self.attachment_extractor = JsonAttachmentsExtractor() - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = JsonAttachmentsExtractor(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (it has .json extension). 
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(".json") - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines and attachments, tables remain empty. This reader considers json lists as list items and adds this information to the `tag_hierarchy_level` @@ -38,7 +40,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - with open(path) as file: + with open(file_path) as file: try: json_data = json.load(file) except (JSONDecodeError, ValueError): @@ -51,7 +53,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio except (JSONDecodeError, ValueError): raise BadParametersError(f"can't read html_fields {fields}") json_data = self.__exclude_html_fields(json_data, key_fields) - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) else: attachments = [] diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index 79b5d7bd..ea980dec 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -1,6 +1,5 @@ import email import gzip -import logging import os import uuid from typing import List, Optional, Tuple @@ -13,32 +12,30 @@ from 
dedoc.readers.base_reader import BaseReader from dedoc.readers.html_reader.html_reader import HtmlReader from dedoc.utils import supported_image_types -from dedoc.utils.utils import check_filename_length, get_encoding, save_data_to_unique_file +from dedoc.utils.utils import check_filename_length, get_encoding, get_mime_extension, save_data_to_unique_file class MhtmlReader(BaseReader): """ This reader can process files with the following extensions: .mhtml, .mht, .mhtml.gz, .mht.gz """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.mhtml_extensions = [".mhtml", ".mht"] self.mhtml_extensions += [f"{extension}.gz" for extension in self.mhtml_extensions] self.mhtml_extensions = tuple(self.mhtml_extensions) - self.html_reader = HtmlReader(config=config) + self.html_reader = HtmlReader(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(tuple(self.mhtml_extensions)) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. @@ -46,15 +43,15 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio """ parameters = {} if parameters is None else parameters attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir - names_list, original_names_list = self.__extract_files(path=path, save_dir=attachments_dir) + names_list, original_names_list = self.__extract_files(path=file_path, save_dir=attachments_dir) names_html = self.__find_html(names_list=names_list) lines = [] tables = [] for html_file in names_html: - result = self.html_reader.read(path=html_file, parameters=parameters, document_type=document_type) + result = self.html_reader.read(file_path=html_file, parameters=parameters) lines.extend(result.lines) tables.extend(result.tables) diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py index 350e3bf0..836a98bb 100644 --- a/dedoc/readers/note_reader/note_reader.py +++ b/dedoc/readers/note_reader/note_reader.py @@ -1,4 +1,3 @@ -import logging import os import pickle from typing import Optional @@ -7,34 +6,33 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument 
from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class NoteReader(BaseReader): """ This class is used for parsing documents with .note.pickle extension. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(".note.pickle") - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ try: - with open(path, "rb") as infile: + with open(file_path, "rb") as infile: note_dict = pickle.load(infile) text = note_dict["content"] if isinstance(text, bytes): @@ -44,5 +42,5 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return unstructured except Exception as e: - self.logger.warning(f"Can't handle {path}\n{e}") - raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(path)}. 
Seems note-format is broken") + self.logger.warning(f"Can't handle {file_path}\n{e}") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(file_path)}. Seems note-format is broken") diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py index 2bf9e9a5..5516bd71 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py @@ -21,6 +21,10 @@ class TableTree(object): Table which has cells as sorted childs of tree. Table has type of tree and was obtained with help contour analysis. """ + min_h_cell = 8 + min_w_cell = 20 + minimal_cell_cnt_line = 5 + minimal_cell_avg_length_line = 10 def __init__(self, *, config: dict) -> None: self.left = None @@ -94,7 +98,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> " if h[3] == cur.id_contours: bbox = cv2.boundingRect(contours[i]) # [x_begin, y_begin, width, height] # Эвристика №1 на ячейку - if bbox[2] < self.config["min_w_cell"] or bbox[3] < self.config["min_h_cell"]: + if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell: if self.config.get("debug_mode", False): self.logger.debug(f"Contour {i} isn't correct") continue diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index 6ed650ef..c91cc779 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -1,5 +1,4 @@ import copy -import logging import os from itertools import chain from typing import Optional @@ -14,6 +13,7 @@ from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer +from 
dedoc.utils.utils import get_mime_extension class PdfAutoReader(BaseReader): @@ -28,55 +28,49 @@ class PdfAutoReader(BaseReader): * if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction. - For more information, look to `pdf_with_text_layer` option description in the table :ref:`table_parameters`. + For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config) - self.pdf_tabby_reader = PdfTabbyReader(config=config) - self.pdf_image_reader = PdfImageReader(config=config) - self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config) - - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.pdf_txtlayer_reader = PdfTxtlayerReader(config=self.config) + self.pdf_tabby_reader = PdfTabbyReader(config=self.config) + self.pdf_image_reader = PdfImageReader(config=self.config) + self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (PDF format is supported only). This method returns `True` only when the key `pdf_with_text_layer` with value `auto` or `auto_tabby` is set in the dictionary `parameters`. 
It is recommended to use `pdf_with_text_layer=auto_tabby` because it's faster and allows to get better results. - You can look to the table :ref:`table_parameters` to get more information about `parameters` dictionary possible arguments. - - Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. + You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. """ - if mime not in recognized_mimes.pdf_like_format: + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + if not (mime in recognized_mimes.pdf_like_format or extension.lower() == ".pdf"): return False parameters = {} if parameters is None else parameters - pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters) - return pdf_with_txt_layer in ("auto", "auto_tabby") + return get_param_pdf_with_txt_layer(parameters) in ("auto", "auto_tabby") - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" warnings = [] - txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=path, parameters=parameters) + txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters) if txtlayer_parameters.is_correct_text_layer: result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct, parameters=parameters, - path=path, + path=file_path, warnings=warnings) else: - result = self.__handle_incorrect_text_layer(parameters, path, warnings) + result = self.__handle_incorrect_text_layer(parameters, file_path, warnings) result.warnings.extend(warnings) return result @@ -84,7 +78,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument: self.logger.info(f"Assume document {os.path.basename(path)} has incorrect textual layer") warnings.append("Assume document has incorrect textual layer") - result = self.pdf_image_reader.read(path=path, parameters=parameters_copy) + result = self.pdf_image_reader.read(file_path=path, parameters=parameters_copy) return result def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: dict, path: str, warnings: list) -> UnstructuredDocument: @@ -99,14 +93,14 @@ def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: d # GET THE FIRST PAGE: recognize the first page like a scanned page scan_parameters = self.__preparing_first_page_parameters(parameters) - recognized_first_page = self.pdf_image_reader.read(path=path, parameters=scan_parameters) + recognized_first_page = self.pdf_image_reader.read(file_path=path, parameters=scan_parameters) # PREPARE PARAMETERS: from the second page we recognize the content like PDF with a textual layer parameters = self.__preparing_other_pages_parameters(parameters) pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters) reader = 
self.pdf_txtlayer_reader if pdf_with_txt_layer == "auto" else self.pdf_tabby_reader - result = reader.read(path=path, parameters=parameters) + result = reader.read(file_path=path, parameters=parameters) result = self.__merge_documents(recognized_first_page, result) if recognized_first_page is not None else result return result diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 66c2be25..d52e0d3c 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -1,4 +1,3 @@ -import logging import math import os from abc import abstractmethod @@ -39,7 +38,6 @@ "orient_cell_angle", "is_one_column_document", "document_orientation", - "document_type", "language", "need_header_footers_analysis", "need_pdf_table_analysis", @@ -55,29 +53,28 @@ class PdfBaseReader(BaseReader): """ Base class for pdf documents parsing. """ - def __init__(self, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. 
logger for logging - """ - config["n_jobs"] = config.get("n_jobs", 1) - self.table_recognizer = TableRecognizer(config=config) - self.metadata_extractor = LineMetadataExtractor(config=config) - self.config = config - self.logger = config.get("logger", logging.getLogger()) - self.attachment_extractor = PDFAttachmentsExtractor(config=config) - self.linker = LineObjectLinker(config=config) - self.paragraph_extractor = ScanParagraphClassifierExtractor(config=config) - - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.config["n_jobs"] = config.get("n_jobs", 1) + self.table_recognizer = TableRecognizer(config=self.config) + self.metadata_extractor = LineMetadataExtractor(config=self.config) + self.attachment_extractor = PDFAttachmentsExtractor(config=self.config) + self.linker = LineObjectLinker(config=self.config) + self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config) + + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. + + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" parameters = {} if parameters is None else parameters first_page, last_page = param_utils.get_param_page_slice(parameters) attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), @@ -85,7 +82,6 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), is_one_column_document=param_utils.get_param_is_one_column_document(parameters), document_orientation=param_utils.get_param_document_orientation(parameters), - document_type=document_type, need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), need_pdf_table_analysis=param_utils.get_param_need_pdf_table_analysis(parameters), first_page=first_page, @@ -95,7 +91,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio attachments_dir=attachments_dir ) - lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse) + lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse) tables = [] for scan_table in scan_tables: metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle) @@ -103,10 +99,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio table = Table(metadata=metadata, cells=cells_with_meta) tables.append(table) - if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters): - tmp_dir = os.path.dirname(path) - file_name = os.path.basename(path) - attachments += self.attachment_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, 
parameters=parameters) + if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): + attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields) return self._postprocess(result) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py index 5897ecb0..bcd1fd68 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py @@ -19,28 +19,25 @@ class ColumnsOrientationClassifier(object): Class Classifier for work with Orientation Network. This class set device, preprocessing (transform) input data, weights of model """ - - _nets = {} - def __init__(self, on_gpu: bool, checkpoint_path: Optional[str], *, config: dict) -> None: self.logger = config.get("logger", logging.getLogger()) self._set_device(on_gpu) self._set_transform_image() self.checkpoint_path = path.abspath(checkpoint_path) self.classes = [1, 2, 0, 90, 180, 270] + self._net = None @property def net(self) -> ClassificationModelTorch: - # lazy loading and net sharing, comrade - if self.checkpoint_path not in self._nets: + if self._net is None: if self.checkpoint_path is not None: net = ClassificationModelTorch(path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth")) self._load_weights(net) else: net = ClassificationModelTorch(None) - net.to(self.device) - self._nets[self.checkpoint_path] = net - return self._nets[self.checkpoint_path] + self._net = net + self._net.to(self.device) + return self._net @staticmethod def my_resize(image: Image) -> Image: diff 
--git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index 04fdd2d1..c9ef35a8 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -11,6 +11,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells from dedoc.utils.image_utils import get_highest_pixel_frequency +from dedoc.utils.parameter_utils import get_path_param class OCRCellExtractor: @@ -30,7 +31,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], for num_batch, nodes_batch in enumerate(batches): if self.config.get("debug_mode", False): - tmp_dir = os.path.join(self.config.get("path_debug"), "debug_tables/batches/") + tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables/batches/") os.makedirs(tmp_dir, exist_ok=True) for i, table_tree_node in enumerate(nodes_batch): cv2.imwrite(os.path.join(tmp_dir, f"image_{num_batch}_{i}.png"), BBox.crop_image_by_box(page_image, table_tree_node.cell_box)) @@ -64,7 +65,9 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes) if self.config.get("debug_mode", False): - image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", f"stacked_batch_image_{num_batch}.png") + debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches") + os.makedirs(debug_dir, exist_ok=True) + image_path = os.path.join(debug_dir, f"stacked_batch_image_{num_batch}.png") 
cv2.imwrite(image_path, concatenated) ocr_result = get_text_with_bbox_from_cells(concatenated, language, ocr_conf_threshold=0.0) @@ -82,8 +85,11 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr for tree_node in tree_table_nodes: x_coord = space cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box) - image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png") - cv2.imwrite(image_path, cell_image) + if self.config.get("debug_mode", False): + debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches") + os.makedirs(debug_dir, exist_ok=True) + image_path = os.path.join(debug_dir, "cell_croped.png") + cv2.imwrite(image_path, cell_image) cell_height, cell_width = cell_image.shape[0], cell_image.shape[1] stacked_image[y_prev:y_prev + cell_height, x_coord:x_coord + cell_width] = cell_image diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 72498e70..2daede1b 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -1,4 +1,3 @@ -import logging import os from datetime import datetime from typing import List, Optional, Tuple @@ -17,6 +16,8 @@ from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox from dedoc.utils import supported_image_types +from dedoc.utils.parameter_utils import get_path_param +from dedoc.utils.utils import get_mime_extension class PdfImageReader(PdfBaseReader): @@ -41,27 +42,23 @@ class PdfImageReader(PdfBaseReader): It isn't recommended to use this reader for extracting content from PDF documents with a correct textual layer, use other PDF readers instead. 
""" - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.scew_corrector = SkewCorrector() + self.skew_corrector = SkewCorrector() self.column_orientation_classifier = ColumnsOrientationClassifier(on_gpu=self.config.get("on_gpu", False), - checkpoint_path=get_config()["resources_path"], config=config) + checkpoint_path=get_config()["resources_path"], config=self.config) self.binarizer = AdaptiveBinarizer() - self.ocr = OCRLineExtractor(config=config) - self.logger = config.get("logger", logging.getLogger()) - if self.config.get("debug_mode") and not os.path.exists(self.config["path_debug"]): - os.makedirs(self.config["path_debug"]) + self.ocr = OCRLineExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader, i.e. it has .pdf extension, or it is an image. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return mime in recognized_mimes.pdf_like_format or mime in recognized_mimes.image_like_format or \ - path.lower().endswith(tuple(recognized_extensions.image_like_format)) or extension.lower().replace(".", "") in supported_image_types + file_path.lower().endswith(tuple(recognized_extensions.image_like_format)) or extension.lower().replace(".", "") in supported_image_types def _process_one_page(self, image: np.ndarray, @@ -70,14 +67,15 @@ def _process_one_page(self, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: # --- Step 1: correct orientation and detect column count --- rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters) - if self.config.get("debug_mode"): + if self.config.get("debug_mode", False): self.logger.info(f"Angle page rotation = {angle}") # --- Step 2: do binarization --- if parameters.need_binarization: rotated_image, _ = self.binarizer.preprocess(rotated_image) - if self.config.get("debug_mode"): - cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image) + if self.config.get("debug_mode", False): + debug_dir = get_path_param(self.config, "path_debug") + cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image) # --- Step 3: table detection and recognition --- if parameters.need_pdf_table_analysis: @@ -119,11 +117,12 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa angle = angle if parameters.document_orientation is None else 0 self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}") - rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle}) + rotated_image, result_angle = 
self.skew_corrector.preprocess(image, {"orientation_angle": angle}) result_angle = result_angle["rotated_angle"] - if self.config.get("debug_mode"): - img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg") + if self.config.get("debug_mode", False): + debug_dir = get_path_param(self.config, "path_debug") + img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg") self.logger.info(f"Save image to {img_path}") cv2.imwrite(img_path, rotated_image) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index de623863..c946cccf 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -20,6 +20,7 @@ class OnePageTableExtractor(BaseTableExtractor): def __init__(self, *, config: dict, logger: logging.Logger) -> None: super().__init__(config=config, logger=logger) + self.image = None self.page_number = 0 self.attribute_selector = TableAttributeExtractor(logger=self.logger) @@ -77,8 +78,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool: avg_len_part = np.average(len_parts) # Эвристика: считаем что ячейка повернута, если у нас большое количество строк и строки короткие - if len(parts) > self.config["minimal_cell_cnt_line"] \ - and avg_len_part < self.config["minimal_cell_avg_length_line"]: + if len(parts) > TableTree.minimal_cell_cnt_line \ + and avg_len_part < TableTree.minimal_cell_avg_length_line: return True return False diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py 
b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index e07a3171..4a61530a 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -19,6 +19,7 @@ class TableRecognizer(object): + def __init__(self, *, config: dict = None) -> None: self.logger = config.get("logger", logging.getLogger(__name__)) @@ -27,11 +28,6 @@ def __init__(self, *, config: dict = None) -> None: self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger) self.config = config self.table_type = TableTypeAdditionalOptions() - if config.get("debug", False): - if not os.path.exists(self.config["path_cells"]): - os.makedirs(self.config["path_cells"]) - if not os.path.exists(self.config["path_detect"]): - os.makedirs(self.config["path_detect"]) def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index b24c1a53..4aad36fa 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -10,16 +10,19 @@ from dedoc.config import get_config from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions +from dedoc.utils.parameter_utils import get_path_param logger = get_config().get("logger", logging.getLogger()) logger = logger if logger else logging.getLogger("TableRecognizer.detect_tables_by_contours") table_options = TableTypeAdditionalOptions() +ROTATE_THRESHOLD = 0.3 + def rotate_with_threshold(img: np.ndarray, 
angle: float, threshold: float = None, *, config: dict) -> np.ndarray: """rotates a table image and saving image.shape during rotation. It is important for word bounding box extraction""" if threshold is None: - threshold = config["rotate_threshold"] + threshold = ROTATE_THRESHOLD rotated = img if abs(angle) > threshold: if config.get("debug_mode", False): @@ -79,9 +82,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An img_bin = 255 - img_bin if config.get("debug_mode", False): - os.makedirs(config["path_cells"], exist_ok=True) - os.makedirs(config["path_detect"], exist_ok=True) - cv2.imwrite(os.path.join(config["path_detect"], "image_bin.jpg"), img_bin) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin) # step 2 img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables") # step 3 @@ -89,33 +90,33 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An (thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin.jpg"), img_final_bin) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin.jpg"), img_final_bin) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin_houph.jpg"), img_final_bin_houph) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin_houph.jpg"), img_final_bin_houph) # step 4 - rotating img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config) img = rotate_with_threshold(img, angle_alignment, config=config) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "aligned_img.jpg"), img) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "aligned_img.jpg"), img) img_final_bin_houph = 
__paint_bounds(img_final_bin_houph) # step 5 - detect contours contours, hierarchy = cv2.findContours(img_final_bin_houph, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph) img_w_contour = img.copy() cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8) - cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_contours.jpg"), img_w_contour) # Draw external contours for tables without external contours. It is a rare case, but important for invoices if table_options.table_wo_external_bounds in table_type: - contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy) + contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, config) return contours, hierarchy, img, angle_alignment -def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]: +def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List, config: dict) -> [Any, Any]: # get children (get table counters) contours = np.array(contours) list_contours, table_contours = __get_table_contours(contours, hierarchy) @@ -137,8 +138,8 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou x, y, w, h = cv2.boundingRect(c) cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5) - if get_config().get("debug_mode", False): - 
cv2.imwrite(os.path.join(get_config()["path_detect"], "img_with_external_bounds.jpg"), img_with_contours) + if config.get("debug_mode", False): + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_external_bounds.jpg"), img_with_contours) contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS) return contours, hierarchy @@ -172,7 +173,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg) else: gap_avg = 5 - if config["debug_mode"]: + if config.get("debug_mode", False): config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}") # ----- image alignment ----- @@ -191,8 +192,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta elif task == "tables": length_div = 55 height_div = 100 - kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"]) # 35 - kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"]) # 100 + + kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, TableTree.min_w_cell) # 35 + kernel_length_height = max(np.array(img_bin).shape[0] // height_div, TableTree.min_h_cell) # 100 # A verticle kernel of (1 X kernel_length), which will detect all the verticle lines from the image. 
verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height)) @@ -211,8 +213,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=iterations) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img) - cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "verticle_lines.jpg"), verticle_lines_img) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "horizontal_lines.jpg"), horizontal_lines_img) """Now we will add these two images. This will have only boxes and the information written in the box will be erased. @@ -228,7 +230,7 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta img_bin_with_lines = cv2.erode(~img_bin_with_lines, kernel, iterations=2) (thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "img_bin_with_lines.jpg"), img_bin_with_lines) + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_bin_with_lines.jpg"), img_bin_with_lines) return img_bin_with_lines @@ -265,8 +267,8 @@ def detect_tables_by_contours(img: np.ndarray, if config.get("debug_mode", False): config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}") tree_table.print_tree(depth=0) - if config.get("debug_mode", False): - cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img) + + cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_draw_counters.jpg"), img) tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config) diff --git 
a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index c204bf48..0edc191c 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -1,5 +1,4 @@ import json -import logging import math import os import shutil @@ -27,6 +26,7 @@ from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument +from dedoc.extensions import recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -34,9 +34,9 @@ from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth -from dedoc.utils.parameter_utils import get_param_page_slice +from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer from dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import calculate_file_hash, get_unique_name +from dedoc.utils.utils import calculate_file_hash, get_mime_extension, get_unique_name class PdfTabbyReader(PdfBaseReader): @@ -46,50 +46,46 @@ class PdfTabbyReader(PdfBaseReader): It is recommended to use this class as a handler for PDF documents with a correct textual layer if you don't need to check textual layer correctness. - For more information, look to `pdf_with_text_layer` option description in the table :ref:`table_parameters`. 
+ For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.config = config - self.logger = config.get("logger", logging.getLogger()) self.tabby_java_version = "2.0.0" self.jar_name = "ispras_tbl_extr.jar" self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) self.java_not_found_error = "`java` command is not found from this Python process. Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (PDF format is supported only). This method returns `True` only when the key `pdf_with_text_layer` with value `tabby` is set in the dictionary `parameters`. - You can look to the table :ref:`table_parameters` to get more information about `parameters` dictionary possible arguments. + You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - return extension.endswith("pdf") and (str(parameters.get("pdf_with_text_layer", "false")).lower() == "tabby") + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return (mime in recognized_mimes.pdf_like_format or extension.lower().endswith("pdf")) and get_param_pdf_with_txt_layer(parameters) == "tabby" - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. + + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" parameters = {} if parameters is None else parameters warnings = [] - lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=path, parameters=parameters, warnings=warnings) + lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings) lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=image_attachments) attachments = image_attachments - if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters): - tmp_dir = os.path.dirname(path) - file_name = os.path.basename(path) - attachments += self.attachment_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters=parameters) + if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): + attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) lines = [line for line_group in lines for line in line_group.split("\n")] lines = self.paragraph_extractor.extract(lines) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 16a49ca4..c0e99c43 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -4,12 +4,15 @@ import numpy as np from dedocutils.data_structures import BBox +from dedoc.extensions import recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import 
PdfminerExtractor from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox +from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer +from dedoc.utils.utils import get_mime_extension class PdfTxtlayerReader(PdfBaseReader): @@ -17,27 +20,25 @@ class PdfTxtlayerReader(PdfBaseReader): This class allows to extract content (text, tables, attachments) from the .pdf documents with a textual layer (copyable documents). It uses a pdfminer library for content extraction. - For more information, look to `pdf_with_text_layer` option description in the table :ref:`table_parameters`. + For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.extractor_layer = PdfminerExtractor(config=config) + self.extractor_layer = PdfminerExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (PDF format is supported only). This method returns `True` only when the key `pdf_with_text_layer` with value `true` is set in the dictionary `parameters`. - You can look to the table :ref:`table_parameters` to get more information about `parameters` dictionary possible arguments. + You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - return extension.lower().endswith("pdf") and (str(parameters.get("pdf_with_text_layer", "false")).lower() == "true") + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return (mime in recognized_mimes.pdf_like_format or extension.lower().endswith("pdf")) and get_param_pdf_with_txt_layer(parameters) == "true" def _process_one_page(self, image: np.ndarray, @@ -72,7 +73,7 @@ def _process_one_page(self, def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None: """ - Change table boxes's width height into pdf space like textual lines + Change table boxes' width height into pdf space like textual lines """ for table in tables: diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index c9cda801..41196f00 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -28,6 +28,7 @@ from dedoc.readers.pdf_reader.data_classes.tables.location import Location from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_utils import cleaning_text_from_hieroglyphics, create_bbox, draw_annotation +from dedoc.utils.parameter_utils import get_path_param from dedoc.utils.pdf_utils import get_page_image logging.getLogger("pdfminer").setLevel(logging.ERROR) @@ -251,7 +252,7 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag :param layout: container of layout element :return: None """ - tmp_dir = os.path.join(self.config.get("path_debug"), "pdfminer") + tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "pdfminer") os.makedirs(tmp_dir, 
exist_ok=True) file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt") @@ -268,10 +269,10 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag for lobj in lobjs: if isinstance(lobj, LTTextBoxHorizontal): - annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)) + annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width)) lobjs_textline.extend(lobj) elif isinstance(lobj, LTTextLineHorizontal): - annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)) + annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width)) lobjs_textline.append(lobj) elif isinstance(lobj, LTRect): lobjs_box.append(lobj) diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index 0428ae56..e387de46 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -1,17 +1,21 @@ -import os -from typing import Optional +from typing import Dict, List, Optional +from bs4 import BeautifulSoup from pptx import Presentation +from pptx.shapes.graphfrm import GraphicFrame +from pptx.shapes.picture import Picture +from pptx.slide import Slide from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor +from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class 
PptxReader(BaseReader): @@ -20,39 +24,78 @@ class PptxReader(BaseReader): Please use :class:`~dedoc.converters.PptxConverter` for getting pptx file from similar formats. """ - def __init__(self) -> None: - self.attachments_extractor = PptxAttachmentsExtractor() + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachments_extractor = PptxAttachmentsExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: str = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - prs = Presentation(path) + + with_attachments = self.attachments_extractor.with_attachments(parameters=parameters) + attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] + attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} + + prs = Presentation(file_path) lines, tables = [], [] for page_id, slide in enumerate(prs.slides, start=1): + images_rels = self.__get_slide_images_rels(slide) + for paragraph_id, shape in enumerate(slide.shapes, start=1): if shape.has_text_frame: - lines.append(LineWithMeta(line=shape.text, metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) + lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) if shape.has_table: - cells = [ - [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] - for row in shape.table.rows - ] + self.__add_table(lines, tables, page_id, paragraph_id, shape) - tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id))) - - attachments = self.attachments_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + if with_attachments and hasattr(shape, "image"): + if len(lines) == 0: + lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) + self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[]) + + def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None: + cells = [ + [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] + for row in 
shape.table.rows + ] + table = Table(cells=cells, metadata=TableMetadata(page_id=page_id)) + + if len(lines) == 0: + lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) + lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid)) + tables.append(table) + + def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]: + rels = BeautifulSoup(slide.part.rels.xml, "xml") + images_dir = "../media/" + + images_rels = dict() + for rel in rels.find_all("Relationship"): + if rel["Target"].startswith(images_dir): + images_rels[rel["Id"]] = rel["Target"][len(images_dir):] + + return images_rels + + def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None: + try: + image_rels_id = shape.element.blip_rId + image_name = images_rels[image_rels_id] + image_uid = attachment_name2uid[image_name] + line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=image_uid)) + except KeyError as e: + self.logger.warning(f"Attachment key hasn't been found ({e})") diff --git a/dedoc/readers/reader_composition.py b/dedoc/readers/reader_composition.py index 7b1c9bcd..9cf0aec3 100644 --- a/dedoc/readers/reader_composition.py +++ b/dedoc/readers/reader_composition.py @@ -1,11 +1,10 @@ import os -from typing import Dict, List +from typing import List, Optional from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_file_mime_type, splitext_ +from dedoc.utils.utils import get_mime_extension class ReaderComposition(object): @@ -21,30 +20,24 @@ def __init__(self, readers: List[BaseReader]) -> None: """ self.readers = readers - def parse_file(self, tmp_dir: str, filename: str, parameters: 
Dict[str, str]) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Get intermediate representation for the document of any format which one of the available readers can parse. If there is no suitable reader for the given document, the BadFileFormatException will be raised. - :param tmp_dir: the directory where the file is located - :param filename: name of the given file - :param parameters: dict with additional parameters for document reader (as language for scans or delimiter for csv) + :param file_path: path of the file to be parsed + :param parameters: dict with additional parameters for document readers, see :ref:`parameters_description` for more details :return: intermediate representation of the document with lines, tables and attachments """ - name, extension = splitext_(filename) - file_path = os.path.join(tmp_dir, filename) - mime = get_file_mime_type(file_path) - document_type = parameters.get("document_type") + file_name = os.path.basename(file_path) + extension, mime = get_mime_extension(file_path=file_path) for reader in self.readers: - can_read = reader.can_read(path=file_path, mime=mime, extension=extension, document_type=document_type, parameters=parameters) - - if can_read: - unstructured_document = reader.read(path=file_path, document_type=document_type, parameters=parameters) - assert len(unstructured_document.lines) == 0 or isinstance(unstructured_document.lines[0], LineWithMeta) + if reader.can_read(file_path=file_path, mime=mime, extension=extension, parameters=parameters): + unstructured_document = reader.read(file_path=file_path, parameters=parameters) return unstructured_document raise BadFileFormatError( - msg=f"No one can read file: name = {filename}, extension = {extension}, mime = {mime}, document type = {document_type}", - msg_api=f"Unsupported file format {mime} of the input file {filename}" + msg=f"No one can read file: name = {file_name}, extension = 
{extension}, mime = {mime}", + msg_api=f"Unsupported file format {mime} of the input file {file_name}" ) diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py index d52ae567..33ffe656 100644 --- a/dedoc/readers/txt_reader/raw_text_reader.py +++ b/dedoc/readers/txt_reader/raw_text_reader.py @@ -1,6 +1,5 @@ import codecs import gzip -import logging import re import time from typing import Iterable, List, Optional, Tuple @@ -14,36 +13,34 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.utils.utils import calculate_file_hash, get_encoding +from dedoc.utils.utils import calculate_file_hash, get_encoding, get_mime_extension class RawTextReader(BaseReader): """ This class allows to parse files with the following extensions: .txt, .txt.gz """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.space_regexp = re.compile(r"^\s+") - self.config = config - self.logger = config.get("logger", logging.getLogger()) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith((".txt", "txt.gz")) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - encoding = self.__get_encoding(path=path, parameters=parameters) - lines = self._get_lines_with_meta(path=path, encoding=encoding) + encoding = self.__get_encoding(path=file_path, parameters=parameters) + lines = self._get_lines_with_meta(path=file_path, encoding=encoding) encoding_warning = f"encoding is {encoding}" result = UnstructuredDocument(lines=lines, tables=[], attachments=[], warnings=[encoding_warning]) return self._postprocess(result) diff --git a/dedoc/scripts/accsum b/dedoc/scripts/accsum new file mode 100755 index 0000000000000000000000000000000000000000..1efd2a47ac37b8f777fd58074895dc33ea0cc6f1 GIT binary patch literal 39280 zcmeHwdwf*Ywf{~+G(=3|14KnVs8~>l0Y!s?CV_!L6Ny3;EjkRzOfoW=8Rr2(3B(54 zj3FAWT5h%1M{lY1Ra&i5st`~idbu@PAF0wdK07g3qqP`YWbSwEz4nHZp%El_Bj2VNsa{#1Fl!}q~f3i49 z9ESJ^i79@WBH*fHh*fKv$9N7Xr7O)=fSrzGIZbs5iRLnIII?l6LNyh_kN~BlvTj~3 zqo9TiXgV=p;a$tdb5@E^#iMjxe&+iX-o;dK`8Cz`qq@<#i1~IpN2wEGoPnln9C5bax9q%g&-T|%@%sX?rYTLO z=S?~9+~P>Ec$%y?=_lEFm(|FesJ*qRNae9O2_t^@r;*ifEeV#5`DMwMqi^26eEkEr ztS8=SICZ3^?g9hSBq*4Dz**9WLKnGT;|w&=bvo zFU^1(8Su&s_^J%$31pCeJ_Eim1O8eD{a0s@e>MaDlMHw<^cy(s$M-VG?}XrR?eca8 z`6n~r`5Ev78T5ai0skZe-kCxFEXW%;?Z@T@p|A#iQ`3I%#d6? 
z-n|FBNQ@P|jO(QPC4RgJ{k|Nb)KSWM25oviVtKosYmxM5afB!xlhQ$Vsj6 zY9GoEP@s;oa_0$86ABawg?)jjS9npBfQUrHf!f9p>gJ0CYiG=GMIyECfLD0J;b0gj zDqFi8rCILsx_y2T3t&Q6j#PENfIy8xzB*D-7Ym87XQeMtC%ix-qSo#A2WyGtmzmTB zV^QG^#lZ5``hyVY&%SHxEzaQlUKWf-Lv-mu3FOc{oInSjy6jBlt8X;3COsCHhSwK_&Npluv#}7dDJbj$ccOV5xG!KGO-hUBS~K&WrjnqYjl4oJ6_>i*uOQ7t}KtdkfErG3LT$l!`mvAK&cJ? zA>&RP{&U6`+i)%4V8gY1lMU};J#9Ap4aPUxa4r9k4S$d2x7ctk-)+PH#qzy2T+1hH zxUScb4L_zz*)8t|Lk6_{Co(?PhF35?(S~2mc&QE7@=hDB|x@UIvjvf*PcWBZ5G?SBg6V{Q0E z#wXhF3m7l8;qw@G+VHCxUu?s@j5pZu6^u98@S7QLv*B^ZH`?$!7=Ory-^2J88~y;} z-8TH^jQ85`#~Dx9@TV9bvf*8f=S9-({{rJ0(14L_OjCL2DD@irTN8RHvm_)^9nvf*uvZ?WO`Fy3v$A7Z@MhX0cBgbm-t_>c|% zGvj&Dbo+n8_*ff$WVN!-L>qoO~Wyvc?K8E>=U?Tl}< z;de6rkPZI{<6CU_CdRvMco*ZnHoTYdgbg2Je8`69U#{$#XIt+c#rRkoK9TW>HvD47 zOKtd-j5}?3J>!dQ_&UZLZ1{tWH`(xQjJMhFKQO-0h7U0QkPS~VzQu;0c!jc0w+)}c zcvn#QE$wmWJr7FJZNUo(;G@TakJcb)uLVEFg7;Z)>vfO59-uv^$yRwTC*iawp$~mM zLpbeIm3J-pZu$q7;XFFd6TA&NIp%KpA2jtSROoLs%XN5+>?oLs!WOUAFEIJtKJW*NVN z;?(8)@0amQC{8Zk-zno26erj2Z;|l}C{8ZjACmED6em~iUn=8gQk-14zgosmr8v26 zf0>M*Kyh-}{t_8Kn&RZD{f3O^Q=DA1zevWjDPBNvA>;qN3h_dU4;}4%lW&9%GX5aN$<_MHWcui00uzFEtNA4|Cg>ZHpd5k2pK7>?q^mWkjdEqH@h%?K5ts!ch9l2M7 zI%f#UcN03}0tMmgJOeS}!zNVH(W ziGRCt|G2~uSyY@BD%jtjgG$tg#INo5LV`l*6-K;ypEDlobGGB`aP0dnNjp16KT-k` z-@S#RiHp%C+a3E_hqBflg&OSwPcdEwW4Ge~2@Rax?l|Dw=_VE;JX1N%dUS?X$AK&;oZ&dI`o(Qhykl0DDRBNz@baZZ%8N#|$o;Ou_{_-7aEtsISc}PS{WYOBI;Z=^_lr<V$BMu}Imv8hlp3+SxrL74Pf3Wx?v z74UB;V5(|G)EKff@txb~)&d-);u`pG)xVI@`mVyex=b?|Qr6n`y4`*WbFJ%&&W0j5 z#!mFHxy4e|7grlxkq+pC*TznfMPsLdOG>|iaf`N4r3Z3lwaqNckYzV4-&N;qA2ES! 
zL&lie1toyFPfV4$7p~hRiw}c#gvO!+k;@b*3U5Swe*DwDSznKxGuNvL(>V$Xf~4($vu03kV)-t)#QPYN*b6&L~-} zXl~-3FOx~L;zmYW&&F5^TTeq%QwM~-xjuizDif`@4zr5#ofBkzvSWu^b2W1)Bh%W< zy4$fAwxs?9GeI@f7d|Cs-W1K>LU=#jOLDe42D3D0X5sq(BCAt5TD!7HxH&coYGwlM z|DKJtBz6)@W|L$fRsczwKuKrLK)F)gQH9~JX0Md&n0`FfO1HEVCW>S9Yu=Zoi}VPR zQ{UL=%t{sbBF*h!iIHNn}lE%nWkoovqO%5@Vnoe`M%A`2e2B0UdT|7tLjyGq~xAp zx#yG|bZ{3AFymo6qlOu`r!=Ea!$YL!r!-&9a=w%t#(+cRrm|dlO3oNgZY0YcZ!ggE z=u~7W?3kyU@DXPGmHLP*9oOv+X8hjHSjCLrCU}{$JRX9Z5x`HhJvf0A~EZ8)ZpCDoP}pP20tq- zI`T2t=Ci`9cMwvQ_^j}{9h~7b&X5u(R(?%})LiXNG~qPPd4j3?J@heb-s_H9_CN3^>Op?T+VvOBRXm z+K&I+*6p}BA|@w2)0%V^l6HcT*6I&^YH9T%v(?irh+ah&R0}+1xIbcMG7a}0l%$Mi z)q;hcOoh~CW1Ta0(!~v_DjUlyNQ5TsjQ0#wQNbY0S(O=9VZ#(Qq_8eekxEpC98=@f zLvS>6i9rsTkn3iep*yBewHEtyTuJlF7JhxC=XA2&=<_6*KGG_gjS3(WY$lmkXg@g46c4wsUSlZ-nv1Ro*Zrx zGBd5S_+v|JELACx+s!-GmAAb?k=skm_G7bU zP2_gO4lF3nbPzfsOO1Pph5KlM9D5wf+APXqO8~3{;GHG1VzgN^3XP-|1*c*O0PfY` z;;P9T&s`uhLL2d(M@Q}pbIhz!qERnM6LjRxCBgWybOB}&?u2mri0pV54X6VrQZ3>r z>FBYvNFp0%^|a-o#zVGrOU!gL97AimA(2oLtwYxoe*a}mh?a>Z-h2RM{@Ep{5vrT7 zYH~MZ+G`HLDvw@)ecPu-WSgUZQGBS)K}o{tNotX#@TnT&ECy%ejGe2=IwLV#DxD*a zpk1J}Ev$~*Du}}9DfK1h>Ojup!W38$E95!I9iz~jrvFSr zC{0H-9E6I{mjjQ&aum_*0rESxvCjV{mA_x*j|5+n|Jqdk=OX`7mA|T|%722&A3jr! 
z!Gelr&0m+kaKo3h+G)?-Uo9=gBLeT?Wosi1!brVqFptTu*lV5WS5SIOU3!{HtVd>G z8BZ2KEagLnxx^vMV23Pg4Gm2glye(pL14Fd?)55&FTJxHJ>vPn!56 zz*AjZwr6{8AB~6@LgaS*b2@^(a2mdMCm#4PQdT-~spG5ITfiI#%hIlIXHnG5rYw=nh!oWB{pK21VM>?m?5`P7nId=+_XI7EHc6a0cRg zh}WL$nGIK@-q|s|1lgKR`Yi0D41NceO?)eY<`C7XXk)*%xmOgfdkxD3v;gJ`HV947 zsIas|92>%2v~r?RHBQ52ueE!l8_8YJyl;>$-4Xk*ZxEq98G5w->%`>M3;M>odtfrz zO08YAvkL1Uf!g$;Ha8RJYl<#a$mMb2`Y-KTqWMy3M|JkV2sxw_Zm+^!vu(i%zw4U|P()4YO)vjd-%#ixRn5w865Az|lMGzgIWVFVbBD}E zj&%m^Vk?h$6F!*EJI3Vw4m{Jt)BOcEf|R3>&UQg}H^#yLpC~hE#yCnEM)%)yzDgz` zd>~8oU#nUb>l?k$loLr0u2N_~pHjH~AIfIKN$@T^dB@9NBZUEB<~ual?0|uW)f0b7 zjrbSfzFuN;pIpMG7r6zO8JJt-c0|1Fzc9)E_wFvG*qLPF1CMiz>FJiCH8PARWh{u`4TzL^kf>^w?#~>`H zH|0Zbp@CE5Y!QY+3`S*0OU_+@@#$QxC;6&MaU}HPMlG7^0hpggB+FLPlNbqkkmA8_ z^uXP6fLnKw0kg^hZXI1MY7g7{7pTj<09=iP>^kVyUHj;tDm^o$9zv>y)UMJjtGW)U zZeUhQ-T2|vEmG<}hNLozR`)kN8eo?0BB)E5hispS55{Zy+8pn18w*FV-10PVlS0pu zxK4P#P2J{%C&jCeh3(bC%(BM(4?IJmb>F1Wrq9sbQg~LM(ame&KW%z0Xt?j znVO5C8JnY6YdJnQ@4>zv_6Wuo9^=|MvIo_XGF9Bv&{_0me^sa zZPvia?KQoH+p69~A-hr+YKgm%mDL(qPz7tWHt$UquKyV0K__)x>-PI8D2HiidJqGT zM4${?-lj4*hU60$N3K1pb?AuZajiomqsO%luOY1rw;WDl$ zl@lH^Xmil6Ij$DCN`>psL&bHk#=|i*Vn8B6%`tEq$x0(7)ePe72adz20jW(`@u-nMFL#oqR^5g2N4)h2_OHLsIx22XzKimbwIuhPYa zvmKAf>`)56lGBbw<63oB1M+1f;TqI5gF>#=P{6W0-1L$~hPlxfEl+-|qaG+rS(cA< z6EddLMW@HYQua$dpHzdsjz^W=KvQn&<=T_AVb5VwjHQ;$*m!Z|LU} zWTVPnGnBO&U2V*o+}5F!G}rcE2u-kR@*MF;x`H|EYx-Fn_QX^Z?~ebot!7`FV=pFx z{n8uZ3XY+c=6xc1Iu#Vg+D?t3?4An;^cHUG!SKF6s|VKr`!OQs$IfqQ-it1e@$|2_ zwI}Z|zDIWxu{Pchi_7JX#pu65P5Y4>hT(*^D-A)kf}`t43rrKNY)>p#pbF)|g$9a5 z&mpKHg^@75NVM8w*`3tt)F?`J?jf>N5xtPwkAywQgLWr&p&D1++?+_FI4dV=jeS_d zWmD7*-8a#F&lpzjUHP~YKbo}_aw+$z*3%c-VvB9+LP4;qoBoJQ&E+3K zw-zw1z)QD?s85y^w;=Fvpt;fgA9(mN5k)S;F`p#nRhrq32N&2fN%SzNwdq<<;uDp4 zw_FA~dI#o0vbQ~ixtd0cvLc!|aewc(D1*7mqRB?Kdk2OvwcnEUhAa{W897L+E6dmu zDqF7JR_xDr0!f>OHok6|hG@L%rTO7dW5F#j8#wk+Ie2>;sw6rvA0?!ba?rVZ7aWS_ z&_pNns^@0tQ5Q-zFikFE9lhHqrcIMrg0&a*?TwbFX>c|9sGQgAtCNpjvy|4L(N$dz z4NR6h1RHUIn5!<5<(|HF%wOQf3taJ_S~4B<-rRg}f7Tkh2cK~6Zt1&-3M6|cMH3v= 
zencigW5@xQZ~N#C{sD};vgXupFt*c!g!D|1+N88{P@~sSnL+2)7tT84iKUla@;$n! z{o*5M$B2CAj-fm%G=LpL7ds~*)Z_GZeL)v(xMv*iHurmEjpckI-HF@vN|;=At%S4o zRct`#QZL38HqBdVN6b1wMQO7hbBCNo`?UYSuvjUQpQZ-SA{dw^8JyBcv|c^c8b>>K zzbYMX$YMurIP5D)1@amIR%xgEwp`{1F;r|so15(g3(}N2Xq`QhxDb}v=~Q*(T&g_O zO1vY4L9~;i|4zPV{yX`)=sq1~qx0Q|RO$Jus6F2j*?)Ln7YX1v+Hi&IA5ktwm@GW) zqWmZw+lGx|G#lIKmVW9=WED7HCwZxPFX>999BUqHYbOS#DZEVzQw`KDfg!nZ(!uv| z;@ObW3k+V8?}pFfCI#*=(o67b?jLr`oBP^1!}K*OC44FM%%^YIF86}WSy%8bwK^ES z(<#T>T}CERt)ZUqMXMA+3mmQt&%!cmxB3Z~L$>e8b>VgJTe(1H7fc+}qn%diNO_K7 zdMw>8P?zX16$~wNr68(_N6q8{2p8Gp*Cbh;H$Y1*eYMZbMDM-Q3y~R8@Ga zV~Cv(Rg@kUTM0!J8B6YVy^f@J>8EMaS$1i18Oj0g$jQ_JTzJE^qlk()T zd!4P%;S&8%v5)B8o9|v*zNGxx^6Sc7&vmBWm%w`p=DQU1CIzneo$-TmpQiu*|ABhS zOTM{7wx@T{FhSsO*Ve|u?%GwZuqPA@N2eM#V46`}Tx?uqLq#AMH9Ub}tiHkU1@L1I zhS%r!M4)eizm87Ba_YF4&@>?+!ZKSCtPOisxcx>s=Wkpg^Ea;agd@ISz&O9SB%dUw zeDjgdKSJRPK%uYBpdZX|*WxEHP&A=IZ!p~GjuJAV6vZ=)36XqZ7)m>^;%Q#Rk?hnz zC76mM4JS2jnh|i(yMX(|y@=;#Hm>-A`Sr}UuiBiBY z=6L-0ugaD$WS(`RCeTQk*NH{JsM{~o&JM-`l%&cRiFl;lCS2jGZ-~mw(LT_xl#q-} zi~yHl$~;d%{_uz@Dx|M2s|nz@O7IgO$fSJX?D_K{8;FL3e$YBkgt3Jocdai_Z?%pR zUI+{M;57?ZH7*PKwfG!gy$?SpqC{#!_+6D+H%wRQiNYKjucRMWNy97MQD0!HtpE%T zQv#EdUIc~@&6XJ%@--YyQT+Ibr*2BQkdi5&Ld>r&S5fK;5upoH6|D1leQ04_v)RGM zW%y|sSZa>XU5g(~@!{82*kvl*5nrt_4?lpiT*&|0Xofpbhf0SaEM~6?`}}?c%EOJG zfX^KePR}y@d`qe@$Fo5OCMI;g^2*Dx8gCQG?qDLj}b=PcU4M zlBP1@nldh&?m&Iaf4#dgXfv!)URMVb1?lHyltpwKKq)z@->8lSYNIg}Di{z8V{oJh ztQv|&q|K=JsHm77Ll+CwuF^IY;%l11z}RC7s;2Y%mU~R-vRLCXH0oT8P;gd^7AqsD zc(lQT0ZTUM0u;m@r9lL}u-4=Ey91tJEMhG3)HVc20=cVBRP96Y!{|o~sCiK#bI`!} zS{tm#&k#}790B>uJ9S36Kh)q3#2P(eUu{HG1e=UGVK;ufr(RUxM|+kV4!;L}fxZQ+ zH+rIwG8Q(tL&zY#B)UJ&LDFSz$j`pyQe8i3JF8)xzA#)O;6^J)-ODLZ5ex+zgWe#w z9lWzKgt{;D`F+t<#wFC0q%bW{njCPLRUrZH7c$2xn501!BuyRXp|e6Ohg2oq;Z^91 z%6l+UTHy+reNBLRU!5_(&J(N;yF<`k9rmrj4=Wip7;eh(Q$HLE3F@%M*Mx#5h$}+#oAD)6U=t;I#&w2*`BvpWZEz5`o4f!Kb zWT{3^qx@+qlY?~t;k16hZ%S!-`b8>*ogHlqon4GcwQkCUIzd0r<%vX$h0iFBU6US>e~T z+;y&~rzvWtq@UD+QL&V&4aTsTQHwG9u`oj_6*HFvE)w}-=7f@Ii!UN_F33uebz;ti 
zEm-#9`BYK4Gnw3j@MJvI@j2*B(0f7WzK~3g!u97Tpp!w1u`5*x+6U?f{oU?las%jJ zK_3OZrW<-d7rvNG?gRY|=qNn0;rwr;2d#b?=|O)0>IWSH-2i$OE+ZcWo%;&XgKhxb z2RaseBBSuw$-UT=n+*B^_V_A6$6#aD4;lmA06Gab5*`Iz1-b`x8|XgJI@|*qb(j!e zf=&i4#;u}C&{j}C=wW}twHN4RpqoJ-2JHbo{LjhcAZRIQ5gwXygO-5)7PJzSUTXJ) z{^nie4|?=_$RG4l&~DJ{K;H*_5LDp0`%}zJXv=$G2?FHQgdI0o8Y+#JTGxrleLVC~$=n~NX1YHd}0gtiX3wkA<;e7^l1U4x5 zf;NI40NoEd4vRDo_GM>+ZUS8bdh#IBgWe5#FQ^|+>puhfF6drR19vM9fG!3dhnqm3 zfX)Ox4exs_0bLBb8uT{MdqH=DJ_C9qUQpQ!I{F_-59$XUhv#PZg4TiV16>V@ooRVw z-FSt_YAVV)an#{?omqz$5k3y*#vRGzSxQ9yA6{OF(^vC51<< z%xe)BpK#%s=bU~jDWLRoaF#$X%xWH#*Akr5&<}n=Xn8@=_p@go2^WwD@mJ&g9rza$ zqWL$Xz=#t6UYu`%|7%c_pZ%B=ApSEr{m&(nZ>RC!F!_6de+T_;vXx(-(tiN>IpEWE zRZ4%rOg|3w{yz9$rtv>F`7^H6i|nneOs{?)*bML*9& zzfAFSMri)M;2(G?nLOXBKP=VD^v{5Q490;Qto&@hng3q!>%e!V@vkxY2f+U+_@AWl zKQj5_FkXdm={z@$UvBbeg1-^`4Qc!znEWN+kHEF@y0rSOHS4z;{059m^cG&K{&P(G z+zbBg7_aCihsjsfQ}qY`LGUxxZ!h@ofKR_IYNpTmrK$e__%UoQS6lUGUuu?r9LC$9 zf$vD;mzn&T;J*a^rZoNoCVvU|)vqU$cct+=P5x@|UjTo)l^>F9<^SNn1%8c{pIt2l z3}DZIUx;zM)yj9a=47u(>F9$@05VRiO!jQE4D=&FcZ0tqjenKNH^6@t{A<(ri%otR z_=T85GPTiC@D1=MS<~04@{I-70{$fM7hCxVQ~$UZJP&v!Xs1;Bos~_giT@1vG2HLm zlh(ew&Gy|3{&>t&!``ah zV;}fuVZNh{3&VaCz|PLzF+&7WOBAO{hCzyX&vw&_+@G7FErEB`d}yK z`3h_L=2ZF(;J*+4rB;6ST(f+Sf?xO!&tG~x^`_GA0sjQ>e~^~{_ssnFfqygB7xbId zX4~XUHu~PN)?1}kzAB#VuYU09`Ek0ll(O#~rv44!KLY-_R(>$0 z|55O_gP&=<+ynk=;AfgI_klkEex~w`8j0Uh$NKOIYyR1fo8_Af{@viy^Tnz1-)-_M z!9NP?$}7|OmrFj46@KuY;CqN)QBZVSc16M1TXP%*M#qSXf{7b)odqS|Kg?NB+Io0Z zL0Pz=nhg;=wi6YvJ z`g<98c}gC*=>qJFg=gyT&7nt^)g+-^!rTjX1a;#Q%rX; zeS_(TO!qTAj2&bQ(=)Wg*y{a%8qEtCUz=2a_l1PW5%hOXsF|rx>F>7SB|dSAg>N8C zr{<4G_-veAmH3CF1m_H#`a3QcAY6u1^UD#w2B-dR%Xbl8ic|C52;YcPf2ZXpgx6U3 zYZ0bXm$wDs`*7;-vD}X^o%%a04ptg+xq({)d>F*r~YorW`t=2MSm{^p9vB{occQ{ zA%tJXslSiXgK!e3{w|8Z-fhGX*BK$H={r!_3gg?Tmzk4!(a1y8f z-U$tGSKz$*GE@r3LWGkz_4iHip-8b7r~a-<3&Kg9H?ur$M~k&M;~Z~6IEhn#zeHew zT#HkGx1=?s`MBo8IoIOU-zC8(!Ngje`gAQG{4?t>L)e8=e+OhK!bzO^`yXNg?1fW*_oD^j`*7;-ecX?55~u#o2YvJ) 
ziBo^yLwpDQ38((92X*+1aq91Rlp%Z{PW>H^`w>p!)ZgzAQ!!rP)ZgtWLHIVD`g2pi|AqF#slTsLjd1cS++Se6aS-*vslTUDg75;Ie_(z! z!a$;d*j=$tou_65eonq_!OGlqv|rx%w@ zIVUD#Q`bx_EiO5Cl0u~szTsaEH0tlOT1DwgXZpLQ>3EKaVvMpMIT(EOI?2kzSImT7 zAEo1m3B8U=#}5~JeUXmm3B4{!$43gi{z=F4&2>^be&}}wiJybjWHqk`(&dj7CBw|e zIT&ozOB7aV(o=ZI_Xm~!B2jcnx~#&FGQR_B<>uf^hHY0`5f#o6$C%%LvvPB=3$NF) z>A2;)hZaVT7=!QL8WuzabFj+P>rpE+hgOlH;p9`_i%ge4!5Woe^4$^QQ+Raxfgv5< zRv)9b`v3KF3M&?Tp_bQH)uPA7I?YcpeN+ z^54ZAL&qnIh+`SQ1ak=;)E;!4!gwd+8x$-i10T-LrNF6u?`M1JVd&#rg?BNoFL1UV z#`qdTYSwq5=14tbg~9&38R_Y`gv)dPnJO-y0YT7}p*%m!(fXp2yaB`(;goZV~E zaWCs%#SJT;0YS=82K_66lb#;dqwCwsdVYYpl#W(K1bvT~?2n(;lgARqA7cCn%$;;F zThPzrlKxj&Kcez@h2>|m{3OQTVcf&`V#fC|{w(A2*$D8yWc*`!^Az_25vOAWDnjKP z!;UsX!D1xv;o7|jIF<7cwsM}tdTuXO3jWA?CNX|JE0)h5fOkIQf4D#i*0X#W;}=}0 z0Qsx|f|oO1KU3keS^m3>PrgV2mogq=d?n*&G5!PK!`16HmcOAw3C3Ce9>#r~VL9WQ zGU(Crr*p?*bBM=S&l1Le#CoV1hAYo+A)keL=gy@pXd%PNUjYB7b{w3q1dCXHH}v2t z?2!4WSL#1>-g$@R^*#-&5%i2IeYsbEFJ0q@Bj2oH=Bbl_(|3XUxI8jB5=~+JtFx5` zd9NP9Qo^$jE&s(*Ugwsgm$RN9&QqASGyRtTQJ5clVuoa}n~N10^xTpGzfbDHe9C^_ zw4g|;`Z;i=S>>z$Kd0q6Up@Z3gz?N6X1;z?=35}9@widLir!=VBdnL{s8B?FDshfj z$cPbC-{Xb;9-7YgIN(QHI~e@;_$$aULMAWt@J+m;CDC*$QZ4{5#SF{qZj1SQ+1hSz zu>J=2&$~EHUj{w&jM#8?UJh@ea!x5$3aVL85&VJhQ|Bl^m;WS*vyg=pXTZx@&yQIT z?VZpOljXs>;ye}CcB^Lj?{I(7{mz|1e@h1ZhZ*qyV*S^Yt9&06z8g6Ck9{5c zdItG^)_(%e*Ls~ZltG>zBqlrmll?HriPoBA&>7$d7U~%S7gB3q@LqNgRMLpGsr(CUgYhZ7( zPd^(>uMh8%@|@U09s*AD-D_N5?Qg$k{mon+uD1AH20c9)@OLxd`+-xxu-iEs1F2yt zpAL=#KAitdmUw~K#Qv~=6U@jUU&ZqOVEHnZU&8p3c?w{*2r~XY`_J1f-vs8x8GGqFC)6#u_nBL)QH#T@Xk{4`RC0j z9bU-g#XG7lcR1``<-+?^;Z?#Lb~k!lb+N|ARY+oqxgZ%$6UB??kys-Ng+MqIMIaWc z!z*1V7G5M)FGu-(0gnpd1@IULgCXWdSK>`6dWVYMP7^EfmaNC68m za4_JN^>l}ktBgRAlt5|l@}w#XOcchOryjheg_kVxrt6K6)`B8}ck3a2L(EsZoWdd& z_4xrrJ&hrH_08q0LkrNGaCp^AK>^qZO+nT*v53Iynsp>j@1%uep{N4z+GPqVo_5}O z=ZYxaP)uPF5orjnbj9d>IT68YaZwk)6BlxaBOaK%K0>97qC`zpQW)10j=CbK2HUX# zHuQwqlz6#Pz8C6?xR&8{)aBAZc&9Pqil}$DgpV^H77rB=?j!v0N^psQNhO5;%)O$#%HeWcHpk^cEX{-Jglo>#mz7t| 
zpPkN$MBOs!yvu4_4kv4H&bdOk=2b4JD6e!am^*i&W07l7c}1mzwEBI^Q2Izr`m&I3 z&mxFd7F{mripnyUmCdW1UoqP?t$14TbZZ%1Zh9|YNaMR!Hh2QAKu}pc7J%z=553hF%*Zf6Qu2lbAR5hD==OVHe{R%=b7EMnleaq7Xdqo6ZSjH=Yu3*?z z=krjB-F4~0P~Z!sL$Rp0l-{{?)qA2Y1n?$uR5b(JA65uQ$eme^Gm(7_X+l0*9xjip zhi+fpRfJdCG(B^8crYxM;az^JhU^bIoosqo3UGQFW#ljnL4yLlMrvuN+U0PyI%GA(U((_!6 zZo8qqFqTcDnJY#ty#MVHwWu)l^k`7M4@&QW25ZxcfngecLQV+>hx{R)WR6KLX;5zn zDx=&69t4PqL1|;Fe&n^UbjCIXl0CD&d#v8n( zi!QP%vI0X%J>I-e&&aGIy`@`EF9z58J$NBK;M_E8V zYqL)3U0MtjkznnN8LmjA)*Z0a|9p&q;YMG846f{e@fbeuu0!qQoTH`%H6NjJ%V^M( z=H_9^^l36RHI`~No?7Iaywv^}4`lP`k%ApZ7Lk>E$$!(Ujy^&5h(&~cr5eNeFinU) zZ!?-#mOEXW%Y_j{51ll$=!rjVU@=<>nKwr2h3uEQR;UEtvrbpL(jBH>rO+al;o7y# zgI`1NC~MMO>T=aKxoK3DGiJIPnNC?>2DvxFwNQd|QBfRO)rjT-jfNF%&>=ZjhD0%5 zllBysSInOhb=PyOJ`gKj7W4V*reOXprdWeJ(jbcKRt2C>(P&t4R;XX5NsG9^3w!)- zO2DCzKPrl)3l<|-TpvVKws|ovgCG=?17NYIffqgvbx@*XDk-~yOrgV+it>U2SuXrs zMlAv~DHOv6MKPvR`mF}Np8LO~bbEm=y>NLY4}D)y({YSzLbBH5G(@mOH`D9;hngBz zVTAQ{gNA8&YE7^2GirLe7G!EqKL_}JJd>o$zmWr)R&qLACdxyvk2y-WgXxh3v;6wL zrKY8vA(rdXPC7kbZVFnTV@t?PukUYas-Fvkh&=4}yACKlbEV6#?}KW34X4xP*K(RJ z2cMqd(s6yiRMRp}Z!bT?@>v_IqE4^xt7@vBE7SSg>%R_ZDSZVi)X%ADx`h=}`r~lg z)87h=Y(nLuLqFH2DW$O<+#DtpN64C9-_O;w&t`DFU#;n#w)FbGuco@ZIzt`T^e49T zdLKg5E`4Fm3Ut0&-%oAng}z_Rbfgw!YOg=-3s4nx`|115ntpCmpv$e*{>qkK@4IWd zPAldF_WZZo((C)znil2joD{X^|03W5oYaTt(Dws2lVjts%j>wNuR{jAv1WRGKYYu- znf@K5CHv|8yOtY>VLyx0OQuBBKf@BH;NNhP@0#iP{(6_6xH#B;+P`(VwI6JRNLu~@ zgHqvsyd+{GF)fEqujv=yS<{>QoFSE{S9e_2VKCL?D+EZFE`J~I@ASr$MCao1X(cce zZ2r#%mjp^oL$1y){Lg4pf$cg0 Y<2qk0dlUl0rT List: _get_avg(statistics[dataset]["Accuracy"])] -if __name__ == "__main__": - base_zip = "data_tesseract_benchmarks" - output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) - cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") - os.makedirs(cache_dir, exist_ok=True) - benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") +def __parse_symbol_info(lines: List[str]) -> Tuple[List, 
int]: + symbols_info = [] + matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] + start_block_line = matched_symbols[0] - if not os.path.isfile(benchmark_data_path): - wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") - else: - print(f"Use cached benchmark data from {benchmark_data_path}") - assert os.path.isfile(benchmark_data_path) + for line in lines[start_block_line + 1:]: + # example line: "1187 11 99.07 {<\n>}" + row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] + row_values[-1] = row_values[-1][1:-1] # get symbol value + symbols_info.append(row_values) + # Sort errors + symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), reverse=True) # by missed + + return symbols_info, start_block_line + + +def __parse_ocr_errors(lines: List[str]) -> List: + ocr_errors = [] + matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] + for num, line in enumerate(lines[matched_errors[0] + 1:]): + # example line: " 2 0 { 6}-{б}" + errors = re.findall(r"(\d+)", line)[0] + chars = re.findall(r"{(.*)}-{(.*)}", line)[0] + ocr_errors.append([errors, chars[0], chars[1]]) + + return ocr_errors + +def __get_summary_symbol_error(path_reports: str) -> Texttable: + # 1 - call accsum for get summary of all reports + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accsum")) + + if os.path.exists(f"{path_reports}/../accsum_report.txt"): + os.remove(f"{path_reports}/../accsum_report.txt") + + file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) + + command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" + os.system(command) + accsum_report_path = 
os.path.join(path_reports, "../accsum_report.txt") + + # 2 - parse report info + with open(accsum_report_path, "r") as f: + lines = f.readlines() + + symbols_info, start_symbol_block_line = __parse_symbol_info(lines) + ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1]) + + # 3 - calculate ocr errors according to a symbol + ocr_errors_by_symbol = {} + for symbol_info in symbols_info: + ocr_errors_by_symbol[symbol_info[-1]] = [] + for ocr_err in ocr_errors: + if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text + continue + if symbol_info[-1] in ocr_err[-2]: + ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") + + # 4 - create table with OCR errors + ocr_err_by_symbol_table = Texttable() + title = [["Symbol", "Cnt Errors & Correct-Generated"]] + ocr_err_by_symbol_table.add_rows(title) + for symbol, value in ocr_errors_by_symbol.items(): + if len(value) != 0: + ocr_err_by_symbol_table.add_row([symbol, value]) + + return ocr_err_by_symbol_table + + +def __create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]] + + table_accuracy_per_image = Texttable() + accs.extend(accuracy_values) + table_accuracy_per_image.add_rows(accs) + + # calculating average accuracy for each data set + table_common = Texttable() + + for dataset_name in sorted(statistics.keys()): + row = [dataset_name] + row.extend(_get_avg_by_dataset(statistics, dataset_name)) + accs_common.append(row) + table_common.add_rows(accs_common) + + return table_common, table_accuracy_per_image + + +def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: 
str) -> Tuple[Texttable, Texttable]: statistics = {} + accuracy_values = [] with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -115,7 +190,7 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: gt_path = os.path.join(base_zip, dataset_name, "gts", f"{base_name}.txt") imgs_path = os.path.join(base_zip, dataset_name, "imgs", img_name) - accuracy_path = os.path.join(cache_dir, f"{dataset_name}_{base_name}_accuracy.txt") + accuracy_path = os.path.join(cache_dir_accuracy, f"{dataset_name}_{base_name}_accuracy.txt") with TemporaryDirectory() as tmpdir: tmp_gt_path = os.path.join(tmpdir, "tmp_gt.txt") @@ -145,30 +220,45 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: os.system(command) statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) - accs.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) + accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: print(ex) print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") - table_aacuracy_per_image = Texttable() - table_aacuracy_per_image.add_rows(accs) + table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) + return table_common, table_accuracy_per_image - # calculating average accuracy for each data set - table_common = Texttable() - for dataset_name in sorted(statistics.keys()): - row = [dataset_name] - row.extend(_get_avg_by_dataset(statistics, dataset_name)) - accs_common.append(row) - table_common.add_rows(accs_common) +if __name__ == "__main__": + base_zip = "data_tesseract_benchmarks" + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) + cache_dir = 
os.path.join(get_config()["intermediate_data_path"], "tesseract_data") + os.makedirs(cache_dir, exist_ok=True) + cache_dir_accuracy = os.path.join(cache_dir, "accuracy") + os.makedirs(cache_dir_accuracy, exist_ok=True) + + benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") + if not os.path.isfile(benchmark_data_path): + wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) + print(f"Benchmark data downloaded to {benchmark_data_path}") + else: + print(f"Use cached benchmark data from {benchmark_data_path}") + assert os.path.isfile(benchmark_data_path) + + table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path) + + table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy) with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\nTable 1 - Accuracy for each file\n") - res_file.write(table_aacuracy_per_image.draw()) + res_file.write(table_accuracy_per_image.draw()) res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) + res_file.write(f"\n\nTable 3 -OCR error by symbol:\n") + res_file.write(table_errors.draw()) print(f"Tesseract version is {pytesseract.get_tesseract_version()}") - print(table_aacuracy_per_image.draw()) + print(table_accuracy_per_image.draw()) print(table_common.draw()) + print(table_errors.draw()) diff --git a/dedoc/scripts/create_txtlayer_dataset.py b/dedoc/scripts/create_txtlayer_dataset.py index 744307b6..84fafa28 100644 --- a/dedoc/scripts/create_txtlayer_dataset.py +++ b/dedoc/scripts/create_txtlayer_dataset.py @@ -128,10 +128,10 @@ def corrupt(self, text: str, lang: str) -> str: # 2 - read text from the image using OCR with another language lines = [] for image_path in images_path_list: - document = self.image_reader.read(image_path, document_type=None, 
parameters=dict(language=ocr_lang, - need_pdf_table_analysis="false", - document_orientation="no_change", - is_one_column_document="true")) + document = self.image_reader.read(image_path, parameters=dict(language=ocr_lang, + need_pdf_table_analysis="false", + document_orientation="no_change", + is_one_column_document="true")) lines.extend(document.lines) return "".join([line.line for line in lines]) diff --git a/dedoc/scripts/train/train_acc_orientation_classifier.py b/dedoc/scripts/train/train_acc_orientation_classifier.py index fc43db40..825e5707 100644 --- a/dedoc/scripts/train/train_acc_orientation_classifier.py +++ b/dedoc/scripts/train/train_acc_orientation_classifier.py @@ -13,9 +13,9 @@ parser = argparse.ArgumentParser() checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), - "../../resources/efficient_net_b0_fixed.pth")) + "../../../resources/efficient_net_b0_fixed.pth")) checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), - "../../../resources/efficient_net_b0_fixed_tmp.pth")) + "../../../resources/efficient_net_b0_fixed.pth")) checkpoint_path = "../../../resources" parser.add_argument("-t", "--train", type=bool, help="run for train model", default=False) @@ -39,6 +39,7 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri :param net_executor: Classifier :return: """ + net_executor.net.eval() testloader = data_executor.load_dataset( csv_path=os.path.join(args.input_data_folder, 'test/labels.csv'), image_path=args.input_data_folder, @@ -157,6 +158,7 @@ def train_model(trainloader: DataLoader, def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientationClassifier) -> None: + classifier.net.train() # Part 1 - load datas trainloader = data_executor.load_dataset( csv_path=os.path.join(args.input_data_folder, 'train/labels.csv'), @@ -178,12 +180,11 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat if __name__ == "__main__": 
- from config import _config as config + from dedoc.config import _config as config data_executor = DataLoaderImageOrient() net = ColumnsOrientationClassifier(on_gpu=True, - checkpoint_path=checkpoint_path if not args.train else None, + checkpoint_path=checkpoint_path if not args.train else '', config=config) - if args.train: train_step(data_executor, net) else: diff --git a/dedoc/structure_constructors/abstract_structure_constructor.py b/dedoc/structure_constructors/abstract_structure_constructor.py index bda3e927..063e6682 100644 --- a/dedoc/structure_constructors/abstract_structure_constructor.py +++ b/dedoc/structure_constructors/abstract_structure_constructor.py @@ -15,17 +15,17 @@ class AbstractStructureConstructor(ABC): that are retrieved with the help of some structure extractor. The order of the document lines and their hierarchy can be represented in different ways, e.g. standard tree of lines hierarchy. - Also some other custom structure can be defined by the specific constructor. + Also, some other custom structure can be defined by the specific constructor. """ @abstractmethod - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Process unstructured document and build parsed document representation on this basis. 
:param document: intermediate representation of the document received from some structure extractor \ (there should be filled hierarchy levels for all lines) - :param structure_type: type of the structure that should be retrieved for the document + :param parameters: additional parameters for document parsing, see :ref:`structure_type_parameters` for more details :return: the structured representation of the given document """ pass diff --git a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py index c7160d1f..86e2522e 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py @@ -14,7 +14,7 @@ class LinearConstructor(AbstractStructureConstructor): The result contains the empty root node with the consecutive list of all document lines as its children. """ - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Build the linear structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. 
diff --git a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py index ed1f3277..5c986c1b 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py @@ -33,7 +33,7 @@ class TreeConstructor(AbstractStructureConstructor): - **second child line (1, 0)** """ - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Build the tree structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. diff --git a/dedoc/structure_constructors/structure_constructor_composition.py b/dedoc/structure_constructors/structure_constructor_composition.py index d516fe72..eaf62e55 100644 --- a/dedoc/structure_constructors/structure_constructor_composition.py +++ b/dedoc/structure_constructors/structure_constructor_composition.py @@ -20,16 +20,18 @@ def __init__(self, constructors: Dict[str, AbstractStructureConstructor], defaul self.constructors = constructors self.default_constructor = default_constructor - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None, parameters: Optional[dict] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Construct the result document structure according to the `structure_type` parameter. If `structure_type` is empty string or None the default constructor will be used. 
To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. """ + structure_type = parameters.get("structure_type") + if structure_type in self.constructors: - return self.constructors[structure_type].structure_document(document) + return self.constructors[structure_type].construct(document) if structure_type is None or structure_type == "": - return self.default_constructor.structure_document(document) + return self.default_constructor.construct(document) raise StructureExtractorError(f"Bad structure type {structure_type}, available structure types is: {' '.join(self.constructors.keys())}") diff --git a/dedoc/structure_extractors/abstract_structure_extractor.py b/dedoc/structure_extractors/abstract_structure_extractor.py index 83155800..4514d892 100644 --- a/dedoc/structure_extractors/abstract_structure_extractor.py +++ b/dedoc/structure_extractors/abstract_structure_extractor.py @@ -1,6 +1,7 @@ +import logging from abc import ABC, abstractmethod from copy import deepcopy -from typing import List +from typing import List, Optional from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation @@ -21,15 +22,21 @@ class AbstractStructureExtractor(ABC): The paragraph type of the line should be one of the predefined types for some certain document domain, e.g. header, list_item, raw_text, etc. Each concrete structure extractor defines the rules of structuring: the levels and possible types of the lines. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the extractor, e.g. 
logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ This method extracts structure for the document content received from some reader: it finds lines types and their hierarchy levels and adds them to the lines' metadata. :param document: document content that has been received from some of the readers - :param parameters: additional parameters for document parsing + :param parameters: additional parameters for document parsing, see :ref:`structure_type_parameters` for more details :return: document content with added additional information about lines types and hierarchy levels """ pass diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py index 0e4eba00..142982c2 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py @@ -1,6 +1,6 @@ import os from abc import ABC, abstractmethod -from typing import List, Tuple +from typing import List, Optional, Tuple from dedoc.config import get_config from dedoc.data_structures.hierarchy_level import HierarchyLevel @@ -20,19 +20,20 @@ class AbstractLawStructureExtractor(AbstractStructureExtractor, ABC): You can find the description of this type of structure in the section :ref:`law_structure`. 
""" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: some configuration for document parsing """ + super().__init__(config=config) path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=config) - self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=config) + self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config) + self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config) self.hierarchy_level_builders = [StubHierarchyLevelBuilder()] self.hl_type = "law" self.init_hl_depth = 1 self.except_words = {"приказ", "положение", "требования", "постановление", "перечень", "регламент", "закон"} - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract law structure from the given document and add additional information to the lines' metadata. 
To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py index 8a9dc4f8..324f4622 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py @@ -1,4 +1,3 @@ -import logging import re from abc import ABC from collections import OrderedDict @@ -56,13 +55,13 @@ class ClassifyingLawStructureExtractor(AbstractStructureExtractor, ABC): """ document_type = "law" - def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config: dict) -> None: + def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config: Optional[dict] = None) -> None: """ :param extractors: mapping law_type -> structure extractor, defined for certain law types :param config: configuration of the extractor, e.g. logger for logging """ + super().__init__(config=config) self.extractors = extractors - self.logger = config.get("logger", logging.getLogger()) self.hat_batch_size = 3 self.hat_batch_count = 7 @@ -106,14 +105,15 @@ def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config: instruction_ws = self.__add_whitespace_match("инструкция") self.main_templates[LawDocType.instruction] = {rf"\b{instruction_ws}\b"} - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Classify law kind and extract structure according to the specific law format. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. 
""" + parameters = {} if parameters is None else parameters selected_extractor = self._predict_extractor(lines=document.lines) - result = selected_extractor.extract_structure(document, parameters) + result = selected_extractor.extract(document, parameters) warning = f"Use {selected_extractor.document_type} classifier" result.warnings = result.warnings + [warning] return result diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index ed65170a..92a39fb0 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -22,7 +22,7 @@ class DefaultStructureExtractor(AbstractStructureExtractor): prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, LetterPrefix, BulletPrefix] - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract basic structure from the given document and add additional information to the lines' metadata. 
To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py index d1ce8818..c08674c6 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py @@ -1,6 +1,6 @@ import os import re -from typing import List +from typing import List, Optional from dedoc.config import get_config from dedoc.data_structures.line_with_meta import LineWithMeta @@ -21,19 +21,20 @@ class DiplomaStructureExtractor(AbstractStructureExtractor): """ document_type = "diploma" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: some configuration for document parsing """ + super().__init__(config=config) self.toc_extractor = TOCFeatureExtractor() self.header_builder = HeaderHierarchyLevelBuilder() self.toc_builder = TocBuilder() self.body_builder = DiplomaBodyBuilder() path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=config) + self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config) self.footnote_start_regexp = re.compile(r"^\d+ ") - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract diploma structure from the given document and add additional information to the lines' metadata. 
To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py index 47c1bb2c..549603dc 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor @@ -18,7 +18,7 @@ class FoivLawStructureExtractor(AbstractLawStructureExtractor): """ document_type = "foiv_law" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) self.hierarchy_level_builders = [ HeaderHierarchyLevelBuilder(), diff --git a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py index f2bed5eb..f360011a 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py @@ -1,5 +1,5 @@ import re -from typing import List +from typing import List, Optional from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor @@ -19,7 +19,7 @@ class LawStructureExtractor(AbstractLawStructureExtractor): """ document_type = "law" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: 
super().__init__(config=config) self.hierarchy_level_builders = [ HeaderHierarchyLevelBuilder(), diff --git a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py index 8dd76083..4c9c0993 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py @@ -1,4 +1,5 @@ import os +from typing import Optional from dedoc.config import get_config from dedoc.data_structures.unstructured_document import UnstructuredDocument @@ -20,18 +21,19 @@ class TzStructureExtractor(AbstractStructureExtractor): """ document_type = "tz" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: some configuration for document parsing """ + super().__init__(config=config) self.header_builder = HeaderHierarchyLevelBuilder() self.body_builder = TzBodyBuilder() self.toc_builder = TocBuilder() path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=config) - self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=config) + self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config) + self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config) - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract technical task structure from the given document and add 
additional information to the lines' metadata. To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/structure_extractor_composition.py b/dedoc/structure_extractors/structure_extractor_composition.py index 85453132..6160a35e 100644 --- a/dedoc/structure_extractors/structure_extractor_composition.py +++ b/dedoc/structure_extractors/structure_extractor_composition.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Optional from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor @@ -10,23 +10,25 @@ class StructureExtractorComposition(AbstractStructureExtractor): The list of structure extractors and names of document types for them is set via the class constructor. Each document type defines some specific document domain, those structure is extracted via the corresponding structure extractor. """ - def __init__(self, extractors: Dict[str, AbstractStructureExtractor], default_key: str) -> None: + def __init__(self, extractors: Dict[str, AbstractStructureExtractor], default_key: str, *, config: Optional[dict] = None) -> None: """ :param extractors: mapping document_type -> structure extractor, defined for certain document domains :param default_key: the document_type of the structure extractor, that will be used by default if the wrong parameters are given. \ default_key should exist as a key in the extractors' dictionary. 
""" + super().__init__(config=config) assert default_key in extractors self.extractors = extractors self.default_extractor_key = default_key - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Adds information about the document structure according to the document type received from parameters (the key `document_type`). If there isn't `document_type` key in parameters or this document_type isn't found in the supported types, the default extractor will be used. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. """ + parameters = {} if parameters is None else parameters document_type = parameters.get("document_type", self.default_extractor_key) extractor = self.extractors.get(document_type, self.extractors[self.default_extractor_key]) - return extractor.extract_structure(document, parameters) + return extractor.extract(document, parameters) diff --git a/dedoc/train_dataset/train_dataset_utils.py b/dedoc/train_dataset/train_dataset_utils.py index 70ca3725..aec0a9d5 100644 --- a/dedoc/train_dataset/train_dataset_utils.py +++ b/dedoc/train_dataset/train_dataset_utils.py @@ -8,9 +8,6 @@ import numpy as np from PIL.Image import Image -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox - def __to_pil(image: np.ndarray) -> Image: return PIL.Image.fromarray(image) @@ -33,7 +30,7 @@ def _get_images_path(config: dict, document_name: str) -> str: return os.path.join(get_path_original_documents(config), document_name.split(".")[0]) -def save_page_with_bbox(page: PageWithBBox, document_name: str, *, config: dict) -> None: +def save_page_with_bbox(page: "PageWithBBox", document_name: str, *, config: dict) -> None: # 
noqa __create_images_path(config) uid = document_name images_path = _get_images_path(config=config, document_name=document_name) @@ -63,7 +60,7 @@ def _convert2zip(config: dict, document_name: str) -> str: return archive_filename -def save_line_with_meta(lines: List[LineWithMeta], original_document: str, *, config: dict) -> None: +def save_line_with_meta(lines: List["LineWithMeta"], original_document: str, *, config: dict) -> None: # noqa __create_images_path(config) if original_document.endswith((".jpg", ".png", ".pdf")): diff --git a/dedoc/train_dataset/trainer/errors_saver.py b/dedoc/train_dataset/trainer/errors_saver.py index ae7fd26e..1d591a96 100644 --- a/dedoc/train_dataset/trainer/errors_saver.py +++ b/dedoc/train_dataset/trainer/errors_saver.py @@ -46,7 +46,7 @@ def save_errors(self, error_cnt: Counter, errors_uids: List[str], csv_path: str, with open(path_file) as file: lines = file.readlines() lines_cnt = Counter(lines) - lines.sort(key=lambda l: (-lines_cnt[l], l)) + lines.sort(key=lambda value: (-lines_cnt[value], value)) path_out = os.path.join(self.errors_path, f"{int(1000 * len(lines) / errors_total_num):04d}_{file_name}") with open(path_out, "w") as file_out: diff --git a/dedoc/utils/annotation_merger.py b/dedoc/utils/annotation_merger.py index 8bf0c299..572e9ef8 100644 --- a/dedoc/utils/annotation_merger.py +++ b/dedoc/utils/annotation_merger.py @@ -71,6 +71,7 @@ def merge_annotations(self, annotations: List[Annotation], text: str) -> List[An """ if not annotations: return [] + annotations_group_by_name_value = self._group_annotations(annotations).values() spaces = [Space(m.start(), m.end()) for m in self.spaces.finditer(text)] @@ -78,11 +79,13 @@ def merge_annotations(self, annotations: List[Annotation], text: str) -> List[An for annotation_group in annotations_group_by_name_value: group = self._merge_one_group(annotations=annotation_group, spaces=spaces) merged.extend(group) - return merged + + filtered = 
self.__filter_contradicting_annotations(merged, text) + return filtered def _merge_one_group(self, annotations: List[Annotation], spaces: List[Space]) -> List[Annotation]: """ - Merge one group annotations, assume that all annotations has the same name and value + Merge one group annotations, assume that all annotations have the same name and value """ if len(annotations) <= 1 or not annotations[0].is_mergeable: return annotations @@ -118,6 +121,29 @@ def _group_annotations(annotations: List[Annotation]) -> Dict[str, List[Annotati annotations_group_by_value[(annotation.name, annotation.value)].append(annotation) return annotations_group_by_value + def __filter_contradicting_annotations(self, annotations: List[Annotation], text: str) -> List[Annotation]: + annotations_by_type = defaultdict(list) + for annotation in annotations: + annotations_by_type[annotation.name].append(annotation) + + filtered = [] + for annotation_list in annotations_by_type.values(): + if not annotation_list[0].is_mergeable: # there may be different values of the same annotation type on the text + filtered.extend(annotation_list) + continue + + sorted_annotations = sorted(annotation_list, key=lambda x: x.start) + prev_end = 0 + for annotation in sorted_annotations: + if annotation.start >= prev_end: + filtered.append(annotation) + prev_end = annotation.end + elif self.spaces.match(text[filtered[-1].start:filtered[-1].end]): + filtered[-1] = annotation + prev_end = annotation.end + + return filtered + @staticmethod def delete_previous_merged(merged: List[Annotation], new_annotations: Annotation) -> List[Annotation]: """ diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index 66b00218..f7f0a090 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -1,7 +1,10 @@ +import os import subprocess from logging import Logger from typing import Any, Dict, Optional, Tuple +from dedoc.config import RESOURCES_PATH, get_config + def 
get_param_language(parameters: Optional[dict]) -> str: if parameters is None: @@ -19,28 +22,28 @@ def get_param_language(parameters: Optional[dict]) -> str: def get_param_orient_analysis_cells(parameters: Optional[dict]) -> bool: if parameters is None: return False - orient_analysis_cells = parameters.get("orient_analysis_cells", "False").lower() == "true" + orient_analysis_cells = str(parameters.get("orient_analysis_cells", "False")).lower() == "true" return orient_analysis_cells def get_param_need_header_footers_analysis(parameters: Optional[dict]) -> bool: if parameters is None: return False - need_header_footers_analysis = parameters.get("need_header_footer_analysis", "False").lower() == "true" + need_header_footers_analysis = str(parameters.get("need_header_footer_analysis", "False")).lower() == "true" return need_header_footers_analysis def get_param_need_pdf_table_analysis(parameters: Optional[dict]) -> bool: if parameters is None: return False - need_pdf_table_analysis = parameters.get("need_pdf_table_analysis", "True").lower() == "true" + need_pdf_table_analysis = str(parameters.get("need_pdf_table_analysis", "True")).lower() == "true" return need_pdf_table_analysis def get_param_need_binarization(parameters: Optional[dict]) -> bool: if parameters is None: return False - need_binarization = parameters.get("need_binarization", "False").lower() == "true" + need_binarization = str(parameters.get("need_binarization", "False")).lower() == "true" return need_binarization @@ -48,7 +51,7 @@ def get_param_orient_cell_angle(parameters: Optional[dict]) -> int: if parameters is None: return 90 - orient_cell_angle = parameters.get("orient_cell_angle", "90") + orient_cell_angle = str(parameters.get("orient_cell_angle", "90")) if orient_cell_angle == "": orient_cell_angle = "90" return int(orient_cell_angle) @@ -75,28 +78,13 @@ def get_param_document_orientation(parameters: Optional[dict]) -> Optional[bool] return None -def get_param_project(parameters: Optional[dict]) 
-> str: - if parameters is None: - return "docreader_project" - project = str(parameters.get("project", "docreader_project")).lower() - return project - - def get_param_pdf_with_txt_layer(parameters: Optional[dict]) -> str: if parameters is None: - return "false" - pdf_with_txt_layer = str(parameters.get("pdf_with_text_layer", "false")).lower() + return "auto_tabby" + pdf_with_txt_layer = str(parameters.get("pdf_with_text_layer", "auto_tabby")).lower() return pdf_with_txt_layer -def get_param_image_document_page(parameters: Optional[dict]) -> str: - if parameters is None: - return "" - - image_document_page = str(parameters.get("image_document_page", "")) - return image_document_page - - def get_param_table_type(parameters: Optional[dict]) -> str: if parameters is None: return "" @@ -136,6 +124,7 @@ def get_param_gpu_available(parameters: Optional[dict], logger: Logger) -> bool: Returns: bool: True if GPU is available, False otherwise. """ + parameters = {} if parameters is None else parameters if not parameters.get("on_gpu", False): return False @@ -148,3 +137,15 @@ def get_param_gpu_available(parameters: Optional[dict], logger: Logger) -> bool: return False return True + + +def get_path_param(parameters: Optional[dict], path_key: str) -> str: + parameters = {} if parameters is None else parameters + path_value = parameters.get(path_key) + + if path_value is None: + default_config = get_config() + path_value = default_config.get(path_key, RESOURCES_PATH) + + os.makedirs(path_value, exist_ok=True) + return path_value diff --git a/dedoc/utils/utils.py b/dedoc/utils/utils.py index 5ab6521a..003e6829 100644 --- a/dedoc/utils/utils.py +++ b/dedoc/utils/utils.py @@ -9,7 +9,6 @@ import re import shutil import time -from os.path import splitext from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar import requests @@ -24,6 +23,7 @@ from dedoc.data_structures.tree_node import TreeNode T = TypeVar("T") +double_dot_extensions = (".txt.gz", 
".tar.gz", ".mht.gz", ".mhtml.gz", ".note.pickle") def list_get(ls: List[T], index: int, default: Optional[T] = None) -> Optional[T]: @@ -63,16 +63,26 @@ def splitext_(path: str) -> Tuple[str, str]: """ get extensions with several dots """ - if len(path.split()) > 1: - first, second = path.rsplit(maxsplit=1) - sep = path[len(first)] - name, ext = splitext(second) - if len(ext) == 0: - name, ext = ext, name - return first + sep + name, ext - if len(path.split(".")) > 2: - return path.split(".")[0], "." + ".".join(path.split(".")[-2:]) - return splitext(path) + if not path.endswith(double_dot_extensions): + return os.path.splitext(path) + + name, *ext_list = path.rsplit(".", maxsplit=2) + return name, f".{'.'.join(ext_list)}" + + +def get_mime_extension(file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None) -> Tuple[str, str]: + if mime is not None and extension is not None: + return mime, extension + + if file_path: + name, extension = splitext_(file_path) + mime = get_file_mime_type(file_path) + else: + assert mime is not None or extension is not None, "When file_path is None, mime or extension should be provided" + mime = "" if mime is None else mime + extension = "" if extension is None else extension + + return mime, extension def _text_from_item(item: dict) -> str: diff --git a/docker-compose.yml b/docker-compose.yml index 3cfe4b62..7c7be0d0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,8 +28,3 @@ services: PYTHONPATH: $PYTHONPATH:/dedoc_root/tests:/dedoc_root command: bash dedoc_root/tests/run_tests_in_docker.sh - -## By default this config uses default local driver, -## For custom volumes replace with volume driver configuration. 
-volumes: - data1-1: \ No newline at end of file diff --git a/docker_gpu/Dockerfile b/docker_gpu/Dockerfile new file mode 100644 index 00000000..95fb012a --- /dev/null +++ b/docker_gpu/Dockerfile @@ -0,0 +1,24 @@ +ARG REPOSITORY="docker.io" +FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 + +ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root" +ENV RESOURCES_PATH "/dedoc_root/resources" + +ADD requirements.txt . +RUN pip3 install --no-cache-dir -r requirements.txt +RUN pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html + +RUN mkdir /dedoc_root +RUN mkdir /dedoc_root/dedoc +ADD dedoc/config.py /dedoc_root/dedoc/config.py +ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py +RUN python3 /dedoc_root/dedoc/download_models.py + +ADD dedoc /dedoc_root/dedoc +ADD VERSION /dedoc_root +RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py + +ADD tests /dedoc_root/tests +ADD resources /dedoc_root/resources + +CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"] \ No newline at end of file diff --git a/docker_gpu/README.md b/docker_gpu/README.md new file mode 100644 index 00000000..3e5a0f8c --- /dev/null +++ b/docker_gpu/README.md @@ -0,0 +1,17 @@ +To run Dedoc on CUDA with Docker use files from `docker_gpu` directory +([CUDA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) +should be installed on your machine): + +1. Set `on_gpu` to `True` in `config.py` +2. 
Run the application: +```shell +cd docker_gpu +docker-compose up --build +``` + +You can change the index of the CUDA device in `docker-compose.yml`: +``` +NVIDIA_VISIBLE_DEVICES: 0 +NVIDIA_VISIBLE_DEVICES: 0, 3 +NVIDIA_VISIBLE_DEVICES: all +``` diff --git a/docker_gpu/docker-compose.yml b/docker_gpu/docker-compose.yml new file mode 100644 index 00000000..2434b5f0 --- /dev/null +++ b/docker_gpu/docker-compose.yml @@ -0,0 +1,33 @@ +version: '2.4' + +services: + dedoc: + mem_limit: 16G + build: + context: .. + dockerfile: docker_gpu/Dockerfile + restart: always + tty: true + ports: + - 1231:1231 + environment: + DOCREADER_PORT: 1231 + NVIDIA_VISIBLE_DEVICES: 0 + runtime: nvidia + + test: + depends_on: + - dedoc + build: + context: .. + dockerfile: docker_gpu/Dockerfile + tty: true + environment: + DOC_READER_HOST: "dedoc" + DOCREADER_PORT: 1231 + is_test: $test + PYTHONPATH: $PYTHONPATH:/dedoc_root/tests:/dedoc_root + NVIDIA_VISIBLE_DEVICES: 0 + runtime: nvidia + command: + bash dedoc_root/tests/run_tests_in_docker.sh diff --git a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py index d23f9967..4ca9a336 100644 --- a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py @@ -1,4 +1,3 @@ -import mimetypes import os from djvu_converter import DjvuConverter @@ -6,33 +5,23 @@ from dedoc import DedocManager from dedoc.attachments_handler import AttachmentsHandler -from dedoc.converters import FileConverterComposition +from dedoc.converters import ConverterComposition from dedoc.metadata_extractors import BaseMetadataExtractor, DocxMetadataExtractor, MetadataExtractorComposition from dedoc.readers import ReaderComposition from dedoc.structure_constructors import LinearConstructor, StructureConstructorComposition, TreeConstructor from dedoc.structure_extractors import DefaultStructureExtractor, 
StructureExtractorComposition -file_dir, file_name = "test_dir", "The_New_Yorker_Case_Study.djvu" -file_path = os.path.join(file_dir, file_name) +file_path = "test_dir/The_New_Yorker_Case_Study.djvu" +djvu_converter = DjvuConverter() +djvu_converter.can_convert(file_path) # True +djvu_converter.convert(file_path) # 'test_dir/The_New_Yorker_Case_Study.pdf' -djvu_converter = DjvuConverter(config=dict()) pdf_reader = PdfReader() -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] - -djvu_converter.can_convert(file_extension, file_mime) # True -djvu_converter.do_convert(file_dir, name_wo_extension, file_extension) # 'The_New_Yorker_Case_Study.pdf' - -file_dir, file_name = "test_dir", "pdf_with_attachment.pdf" -file_path = os.path.join(file_dir, file_name) - -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] -pdf_reader.can_read(file_path, file_mime, file_extension) # True - +file_path = "test_dir/pdf_with_attachment.pdf" +pdf_reader.can_read(file_path) # True pdf_reader.read(file_path, parameters={"with_attachments": "true"}) # document = pdf_reader.read(file_path, parameters={"with_attachments": "true"}) @@ -41,9 +30,8 @@ len(document.lines) # 11 """Adding the implemented handlers to the manager config""" -config = {} manager_config = dict( - converter=FileConverterComposition(converters=[DjvuConverter(config=config)]), + converter=ConverterComposition(converters=[DjvuConverter()]), reader=ReaderComposition(readers=[PdfReader()]), structure_extractor=StructureExtractorComposition(extractors={DefaultStructureExtractor.document_type: DefaultStructureExtractor()}, default_key="other"), structure_constructor=StructureConstructorComposition( @@ -51,10 +39,10 @@ default_constructor=LinearConstructor() ), document_metadata_extractor=MetadataExtractorComposition(extractors=[DocxMetadataExtractor(), BaseMetadataExtractor()]), - 
attachments_handler=AttachmentsHandler(config=config), + attachments_handler=AttachmentsHandler(), ) -manager = DedocManager(config=config, manager_config=manager_config) +manager = DedocManager(manager_config=manager_config) result = manager.parse(file_path=file_path, parameters={"with_attachments": "true"}) result # diff --git a/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py b/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py index 48c9eab7..b0069517 100644 --- a/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py +++ b/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py @@ -110,6 +110,6 @@ } structure_constructor = TreeConstructor() -parsed_document = structure_constructor.structure_document(document=unstructured_document, structure_type="tree") +parsed_document = structure_constructor.construct(document=unstructured_document) parsed_document.to_api_schema().model_dump() diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 122ea40e..671a5ee6 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -1,6 +1,3 @@ -import mimetypes -import os - from dedoc import DedocManager from dedoc.attachments_extractors import DocxAttachmentsExtractor from dedoc.converters import DocxConverter @@ -10,27 +7,17 @@ from dedoc.structure_extractors import DefaultStructureExtractor """Using converters.""" -converter = DocxConverter(config={}) - -file_dir, file_name = "test_dir", "example.odt" -file_path = os.path.join(file_dir, file_name) - -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] +converter = DocxConverter() +file_path = "test_dir/example.odt" -converter.can_convert(file_extension, file_mime) # True -converter.do_convert(file_dir, name_wo_extension, file_extension) # 'example.docx' 
+converter.can_convert(file_path) # True +converter.convert(file_path) # 'test_dir/example.docx' """Using readers.""" -reader = DocxReader(config={}) - -file_dir, file_name = "test_dir", "example.docx" -file_path = os.path.join(file_dir, file_name) - -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] -reader.can_read(file_path, file_mime, file_extension) # True +reader = DocxReader() +file_path = "test_dir/example.docx" +reader.can_read(file_path) # True reader.read(file_path, parameters={"with_attachments": "true"}) # document = reader.read(file_path, parameters={"with_attachments": "true"}) @@ -75,8 +62,8 @@ """Using metadata extractors""" metadata_extractor = DocxMetadataExtractor() -metadata_extractor.can_extract(file_dir, file_name, file_name, file_name) # True -document.metadata = metadata_extractor.extract_metadata(file_dir, file_name, file_name, file_name) +metadata_extractor.can_extract(file_path) # True +document.metadata = metadata_extractor.extract(file_path) document.metadata # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, # 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', # 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, @@ -85,21 +72,21 @@ """Using attachments extractors""" attachments_extractor = DocxAttachmentsExtractor() -attachments_extractor.can_extract(file_extension, file_mime) # True -attachments = attachments_extractor.get_attachments(file_dir, file_name, {}) +attachments_extractor.can_extract(file_path) # True +attachments = attachments_extractor.extract(file_path) attachments[0] # """Using structure extractors""" structure_extractor = DefaultStructureExtractor() document.lines[0].metadata.hierarchy_level # None -document = 
structure_extractor.extract_structure(document, {}) +document = structure_extractor.extract(document) document.lines[0].metadata.hierarchy_level # HierarchyLevel(level_1=1, level_2=1, can_be_multiline=False, line_type=header) """Using structure constructors""" constructor = TreeConstructor() -parsed_document = constructor.structure_document(document) +parsed_document = constructor.construct(document) parsed_document # list(vars(parsed_document)) # ['metadata', 'content', 'attachments', 'version', 'warnings'] @@ -110,7 +97,7 @@ """Run the whole pipeline""" manager = DedocManager() -result = manager.parse(file_path=file_path, parameters={}) +result = manager.parse(file_path=file_path) result # result.to_api_schema().model_dump() # {'content': {'structure': {'node_id': '0', 'text': '', 'annotations': [], 'metadata': {'paragraph_type': 'root', ... diff --git a/docs/source/_static/code_examples/djvu_converter.py b/docs/source/_static/code_examples/djvu_converter.py index eb31b5fe..192f889f 100644 --- a/docs/source/_static/code_examples/djvu_converter.py +++ b/docs/source/_static/code_examples/djvu_converter.py @@ -2,17 +2,27 @@ from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter +from dedoc.utils.utils import get_mime_extension, splitext_ class DjvuConverter(AbstractConverter): - def __init__(self, config: dict) -> None: + def __init__(self, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension == ".djvu" - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: - os.system(f"ddjvu -format=pdf 
{tmp_dir}/{filename}{extension} {tmp_dir}/{filename}.pdf") - self._await_for_conversion(filename + ".pdf", tmp_dir) - return filename + ".pdf" + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pdf") + command = ["ddjvu", "--format=pdf", file_path, converted_file_path] + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) + + return converted_file_path diff --git a/docs/source/_static/code_examples/pdf_attachment_extractor.py b/docs/source/_static/code_examples/pdf_attachment_extractor.py index 1d966549..e28a7a2e 100644 --- a/docs/source/_static/code_examples/pdf_attachment_extractor.py +++ b/docs/source/_static/code_examples/pdf_attachment_extractor.py @@ -6,14 +6,21 @@ from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes +from dedoc.utils.utils import get_mime_extension class PdfAttachmentsExtractor(AbstractAttachmentsExtractor): - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: - handler = open(os.path.join(tmpdir, filename), "rb") + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: + parameters = {} if parameters is None else parameters + handler 
= open(os.path.join(file_path), "rb") reader = PyPDF2.PdfFileReader(handler) catalog = reader.trailer["/Root"] attachments = [] @@ -27,5 +34,5 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ f_dict = filenames[data_index].getObject() f_data = f_dict["/EF"]["/F"].getData() attachments.append((name, f_data)) - attachments = self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=False, parameters=parameters) + attachments = self._content2attach_file(content=attachments, tmpdir=os.path.dirname(file_path), need_content_analysis=False, parameters=parameters) return attachments diff --git a/docs/source/_static/code_examples/pdf_reader.py b/docs/source/_static/code_examples/pdf_reader.py index f8d032bc..b588ae65 100644 --- a/docs/source/_static/code_examples/pdf_reader.py +++ b/docs/source/_static/code_examples/pdf_reader.py @@ -1,4 +1,3 @@ -import os from typing import List, Optional import tabula @@ -12,20 +11,24 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class PdfReader(BaseReader): - def __init__(self) -> None: - self.attachment_extractor = PdfAttachmentsExtractor() + def __init__(self, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = PdfAttachmentsExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: - return (extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format) and not document_type + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + extension, mime = 
get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: - lines = self.__process_lines(path) - tables = self.__process_tables(path) - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + parameters = {} if parameters is None else parameters + lines = self.__process_lines(file_path) + tables = self.__process_tables(file_path) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments) def __process_tables(self, path: str) -> List[Table]: diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index a224cbe2..3dd5a20a 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,18 @@ Changelog ========= +v2.0 (2023-12-25) +----------------- +Release note: `v2.0 `_ + +* Fix table extraction from PDF using empty config (see `issue `_) +* Add more benchmarks for Tesseract +* Fix extension extraction for file names with several dots +* Change names of some methods and their parameters for all main classes (attachments extractors, converters, readers, metadata extractors, structure extractors, structure constructors). 
+ Please refer to the `Package reference` of `documentation `_ for more details +* Add `AttachAnnotation` and `TableAnnotation` to `PPTX` (see `discussion `_) +* Fix bugs in `DOCX` handling (see issues `378 `_, `379 `_) + v1.1.1 (2023-11-24) ------------------- Release note: `v1.1.1 `_ diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 1114cb87..9164721b 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -11,7 +11,7 @@ In the context of this tutorial, you'll need to include certain import statement .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 1-10 + :lines: 1-7 Using converters @@ -22,20 +22,20 @@ For this purpose one can use :class:`~dedoc.converters.DocxConverter` class: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 13 + :lines: 10 Method :meth:`~dedoc.converters.DocxConverter.can_convert` allows to check if the converter can convert the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 15-21 + :lines: 11-13 Since we have checked if the converter is able to convert the file, -we can convert it using :meth:`~dedoc.converters.DocxConverter.do_convert` method: +we can convert it using :meth:`~dedoc.converters.DocxConverter.convert` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 22 + :lines: 14 To get the information about available converters, their methods and parameters see :ref:`dedoc_converters`. The supported document formats that can be converted to another formats (which can be parsed by readers) are enlisted in the table :ref:`table_formats`. @@ -61,30 +61,30 @@ Assume we need to parse file :download:`example.docx <../_static/code_examples/t As we see, the file contains text of different styles, two tables and an attached image. 
To read the contents of this file in the intermediate representation (see :class:`~dedoc.data_structures.UnstructuredDocument`) -one can use :class:`~dedoc.converters.DocxReader` class: +one can use :class:`~dedoc.readers.DocxReader` class: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 25 + :lines: 17 Method :meth:`~dedoc.readers.DocxReader.can_read` allows to check if the reader can parse the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 27-32 + :lines: 18-20 Since we have checked if the reader is able to read the file, we can get its content (:class:`~dedoc.data_structures.UnstructuredDocument`) using :meth:`~dedoc.readers.DocxReader.read` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 34 + :lines: 21 Let's save the document in the variable and look at it in more detail: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 36-38 + :lines: 23-25 As we see, the document object has the following attributes: ``lines``, ``tables``, ``attachments``, ``metadata`` and ``warnings``. Document metadata is the empty dict on this stage, because it should be filled by one of the metadata extractors (see :ref:`dedoc_metadata_extractors` and :ref:`using_metadata_extractors`). @@ -100,20 +100,20 @@ We can get the text of any line: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 40 + :lines: 27 Also some of the readers can detect line types based of their styles, e.g.: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 41 + :lines: 28 Formatting of each line is stored in the ``annotations`` attribute: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 42-49 + :lines: 29-36 See :ref:`dedoc_data_structures` to get more information about main classes forming a document line. @@ -126,20 +126,20 @@ Each table is represented as a list of table rows, each row is a list of cells w .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 51-54 + :lines: 38-41 It also has metadata, containing table's unique identifier, rotation angle (if table has been rotated - for images) and so on. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 55-57 + :lines: 42-44 All tables have rectangular form, so if the cells are merged, in the intermediate representation they aren't and have the same contents. Use cells metadata for getting information about merged cells. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 58-63 + :lines: 45-50 As we see in the :ref:`docx_example_image`, the second table has some merged cells, e.g. in the first row. In the intermediate representation this row consists of two cells, and the second cell @@ -150,7 +150,7 @@ The unique identifier links the table with the previous non-empty line in the do .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 64-66 + :lines: 51-53 In the current example (:ref:`docx_example_image`), the line with the text "Bold, italic, small text." is the first non-empty line before the first table, so the table uid is linked to this line using :class:`~dedoc.data_structures.TableAnnotation`. @@ -164,7 +164,7 @@ In the :ref:`docx_example_image` there is an image attached to the file: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 68-71 + :lines: 55-58 The ``tmp_file_path`` contains the path to the image saved on disk, the image is saved in the same directory as the parent docx file. @@ -174,7 +174,7 @@ In our :ref:`docx_example_image` it is a line with text "More text.". .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 72-74 + :lines: 59-61 The annotation uid is linked to the line using :class:`~dedoc.data_structures.AttachAnnotation`. @@ -191,20 +191,20 @@ we can add some metadata using :class:`~dedoc.metadata_extractors.DocxMetadataEx .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 77 + :lines: 64 Method :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.can_extract` allows to check if the metadata extractor can extract metadata from the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 78 + :lines: 65 -To extract metadata, one can add them to the document using :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.extract_metadata` method. +To extract metadata, one can add them to the document using :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.extract` method. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 79-83 + :lines: 66-70 As we see, the attribute ``metadata`` has been filled with some metadata fields. The list of common fields for any metadata extractor along with the specific fields @@ -221,20 +221,20 @@ For example, in the :ref:`docx_example_image` we can use :class:`~dedoc.attachme .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 87 + :lines: 74 Method :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 88 + :lines: 75 Since we have checked if the extractor can extract attachments from the file, -we can extract them it using :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.get_attachments` method: +we can extract them using :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.extract` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 89-90 + :lines: 76-77 As we see, attachment extractors return the same list of :class:`~dedoc.data_structures.AttachedFile`, as in the attribute ``attachments`` of the :class:`~dedoc.data_structures.UnstructuredDocument`, @@ -256,7 +256,7 @@ Let's extract the default structure based on the document styles: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 94-97 + :lines: 81-84 As we see, the ``hierarchy_level`` has been filled. @@ -274,14 +274,14 @@ Let's construct the tree structure of the document: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 101-104 + :lines: 88-91 As we see, parsed document has similar attributes as :class:`~dedoc.data_structures.UnstructuredDocument`. The main difference is in the ``content`` attribute, that contains hierarchical document structure and tables. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 106-108 + :lines: 93-95 To get more information about :class:`~dedoc.data_structures.ParsedDocument`, :class:`~dedoc.data_structures.DocumentContent` and other classes, that form the output format, see :ref:`dedoc_data_structures`. @@ -298,7 +298,7 @@ one may use manager class (see :ref:`dedoc_manager` for more details). .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 112-116 + :lines: 99-103 Manager allows to run workflow (see :ref:`dedoc_workflow`) for a file of any format supported by dedoc (see :ref:`table_formats`). One can also make a custom ``config`` and ``manager_config`` (parameters of the manager constructor) for more flexible usage of the library. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 0c4cc32e..b8ab6264 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -208,11 +208,15 @@ For a document of unknown or unsupported domain there is an option to use defaul .. toctree:: :maxdepth: 1 - :caption: Getting started: + :caption: Getting started getting_started/installation getting_started/usage +.. toctree:: + :maxdepth: 1 + + parameters/parameters .. toctree:: :maxdepth: 1 diff --git a/docs/source/modules/attachments_extractors.rst b/docs/source/modules/attachments_extractors.rst index aa6f0579..b5d55e17 100644 --- a/docs/source/modules/attachments_extractors.rst +++ b/docs/source/modules/attachments_extractors.rst @@ -4,6 +4,7 @@ dedoc.attachments_extractors ============================ .. autoclass:: dedoc.attachments_extractors.AbstractAttachmentsExtractor + :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.AbstractOfficeAttachmentsExtractor @@ -12,25 +13,20 @@ dedoc.attachments_extractors .. autoclass:: dedoc.attachments_extractors.DocxAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.ExcelAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.JsonAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.PptxAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. 
autoclass:: dedoc.attachments_extractors.PDFAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: diff --git a/docs/source/modules/converters.rst b/docs/source/modules/converters.rst index 5556df22..1a387d73 100644 --- a/docs/source/modules/converters.rst +++ b/docs/source/modules/converters.rst @@ -7,7 +7,7 @@ dedoc.converters :special-members: __init__ :members: -.. autoclass:: dedoc.converters.FileConverterComposition +.. autoclass:: dedoc.converters.ConverterComposition :special-members: __init__ :members: diff --git a/docs/source/modules/metadata_extractors.rst b/docs/source/modules/metadata_extractors.rst index 30424706..0c1f49e7 100644 --- a/docs/source/modules/metadata_extractors.rst +++ b/docs/source/modules/metadata_extractors.rst @@ -8,29 +8,25 @@ dedoc.metadata_extractors :members: .. autoclass:: dedoc.metadata_extractors.AbstractMetadataExtractor + :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.BaseMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.DocxMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.ImageMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.NoteMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.PdfMetadataExtractor :show-inheritance: - :special-members: __init__ :members: diff --git a/docs/source/modules/readers.rst b/docs/source/modules/readers.rst index 2250e5f2..7666f8bf 100644 --- a/docs/source/modules/readers.rst +++ b/docs/source/modules/readers.rst @@ -12,80 +12,64 @@ dedoc.readers .. autoclass:: dedoc.readers.ArchiveReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.CSVReader :show-inheritance: - :special-members: __init__ :members: .. 
autoclass:: dedoc.readers.DocxReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.EmailReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.ExcelReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.HtmlReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.JsonReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.MhtmlReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.NoteReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PptxReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfBaseReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfImageReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfTabbyReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfTxtlayerReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfAutoReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.RawTextReader :show-inheritance: - :special-members: __init__ :members: diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst index 441fcfb4..79d80f0f 100644 --- a/docs/source/modules/structure_extractors.rst +++ b/docs/source/modules/structure_extractors.rst @@ -4,6 +4,7 @@ dedoc.structure_extractors ========================== .. autoclass:: dedoc.structure_extractors.AbstractStructureExtractor + :special-members: __init__ :members: .. autoclass:: dedoc.structure_extractors.StructureExtractorComposition @@ -13,14 +14,12 @@ dedoc.structure_extractors .. 
autoclass:: dedoc.structure_extractors.DefaultStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.AbstractLawStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.structure_extractors.ClassifyingLawStructureExtractor @@ -32,28 +31,24 @@ dedoc.structure_extractors .. autoclass:: dedoc.structure_extractors.LawStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.FoivLawStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.DiplomaStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.TzStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type diff --git a/docs/source/parameters/attachments_handling.rst b/docs/source/parameters/attachments_handling.rst new file mode 100644 index 00000000..589a49a2 --- /dev/null +++ b/docs/source/parameters/attachments_handling.rst @@ -0,0 +1,59 @@ +.. _attachments_handling_parameters: + +Attachments handling +==================== + +.. flat-table:: Parameters for attachments handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - with_attachments + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * method :meth:`~dedoc.readers.BaseReader.read` of inheritors of :class:`~dedoc.readers.BaseReader` + * :meth:`dedoc.readers.ReaderComposition.read` + - The option to enable attached files extraction. + If the option is ``False``, all attached files will be ignored. 
+ + * - need_content_analysis + - True, False + - False + - :meth:`dedoc.DedocManager.parse` + - The option to enable file's attachments parsing along with the given file. + The content of the parsed attachments will be represented as :class:`~dedoc.data_structures.ParsedDocument`. + Use ``True`` value to enable this behaviour. + + * - recursion_deep_attachments + - integer value >= 0 + - 10 + - :meth:`dedoc.DedocManager.parse` + - If the attached files of the given file contain some attachments, they can also be extracted. + The level of this recursion can be set via this parameter. + + * - return_base64 + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` for inheritors of :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` + * :meth:`dedoc.metadata_extractors.MetadataExtractorComposition.extract` + - Attached files can be encoded in base64 and their contents will be saved instead of saving attached file on disk. + The encoded contents will be saved in the attachment's metadata in the ``base64_encode`` field. + Use ``True`` value to enable this behaviour. + + * - attachments_dir + - optional string with a valid path + - None + - * :meth:`dedoc.DedocManager.parse` + * method :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract` of inheritors of :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` + * method :meth:`~dedoc.readers.BaseReader.read` of inheritors of :class:`~dedoc.readers.BaseReader` + * :meth:`dedoc.readers.ReaderComposition.read` + - The path to the directory where document's attached files can be saved. + By default, attachments are saved into the directory where the given file is located. 
diff --git a/docs/source/parameters/other_formats_handling.rst b/docs/source/parameters/other_formats_handling.rst new file mode 100644 index 00000000..85a02ecc --- /dev/null +++ b/docs/source/parameters/other_formats_handling.rst @@ -0,0 +1,43 @@ +.. _other_handling_parameters: + +Other formats handling +====================== + +.. flat-table:: Parameters for other formats handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - delimiter + - any string + - None + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.CSVReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - A column separator for files in CSV and TSV format. + By default "," (comma) is used for CSV and "\\t" (tabulation) for TSV. + + * - encoding + - any string + - None + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.CSVReader.read`, :meth:`dedoc.readers.RawTextReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - The encoding of documents of textual formats like TXT, CSV, TSV. + Look `here `_ to get the list of possible values for the ``encoding`` parameter. + By default the encoding of the document is detected automatically. + + * - handle_invisible_table + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.HtmlReader.read`, :meth:`dedoc.readers.EmailReader.read`, :meth:`dedoc.readers.MhtmlReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - Handle tables without visible borders as tables for HTML documents. + By default tables without visible borders are parsed as usual textual lines. diff --git a/docs/source/parameters/parameters.rst b/docs/source/parameters/parameters.rst new file mode 100644 index 00000000..430e7f43 --- /dev/null +++ b/docs/source/parameters/parameters.rst @@ -0,0 +1,21 @@ +.. 
_parameters_description: + +Parameters description +====================== + +This page contains parameters description for main classes of `dedoc` -- when it is used as a library. +If you want to use `dedoc` as a service, the section :ref:`api_parameters` may be useful. + +Below are some groups of parameters that can be used during document handling. +These parameters can be passed to specific classes like :class:`dedoc.DedocManager`, :class:`dedoc.readers.PdfImageReader`, etc. + +**Note:** all parameters work for :class:`dedoc.DedocManager`, but for other classes, only some subset of the supported options works. +In the pages below, we list the configurable classes for each supported parameter. + +.. toctree:: + :maxdepth: 1 + + attachments_handling + pdf_handling + other_formats_handling + structure_type diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst new file mode 100644 index 00000000..b3781b2d --- /dev/null +++ b/docs/source/parameters/pdf_handling.rst @@ -0,0 +1,163 @@ +.. _pdf_handling_parameters: + +PDF and images handling +======================= + +.. flat-table:: Parameters for PDF and images handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - pdf_with_text_layer + - true, false, tabby, auto, auto_tabby + - auto_tabby + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.can_read`, :meth:`dedoc.readers.PdfTxtlayerReader.can_read`, :meth:`dedoc.readers.PdfTabbyReader.can_read` + * :meth:`dedoc.readers.PdfAutoReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used for choosing a specific reader of PDF documents for :class:`dedoc.DedocManager` or :class:`dedoc.readers.ReaderComposition`. + For readers, the option is used to check if the reader is able to parse the file. 
+ The following options are available: + + * **true** -- parsing PDF files with a textual layer (text is copyable). + This option is used to choose :class:`dedoc.readers.PdfTxtlayerReader` for parsing. + + * **false** -- parsing scanned documents (images, PDF without a textual layer) + even if the document has a textual layer (is copyable). + This option is used to choose :class:`dedoc.readers.PdfImageReader` for parsing. + Note: :class:`dedoc.readers.PdfImageReader` doesn't check the option because it can handle both scanned and copyable documents. + + * **tabby** -- parsing PDF files with a textual layer (text is copyable). + This option is used to choose :class:`dedoc.readers.PdfTabbyReader` for parsing. + + * **auto** -- automatic detection of textual layer presence in the PDF document. + This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing. + If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTxtlayerReader` will be used for parsing. + If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used. + + + * **auto_tabby** -- automatic detection of textual layer presence in the PDF document. + This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing. + If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTabbyReader` will be used for parsing. + If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used. + It is highly recommended to use this option value for any PDF document parsing. + + * - language + - rus, eng, rus+eng + - rus+eng + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - Language of the document without a textual layer. 
The following values are available: + + * **rus** -- Russian; + * **eng** -- English; + * **rus+eng** -- both Russian and English. + + * - pages + - :, start:, :end, start:end + - : + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - If you need to read a part of the PDF document, you can use page slice to define the reading range. + If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page`` + (both ``start_page`` and ``end_page`` are included in the range). + + * using **:** means reading all document pages; + * using empty ``end`` -- **start:** (e.g. 5:) means reading the document from ``start`` up to the end of the document; + * using empty ``start`` -- **:end** (e.g. :5) means reading the document from the beginning up to the ``end`` page; + * using **start:end** means reading document pages from ``start`` to ``end`` inclusively. + + If ``start`` > ``end`` or ``start`` > the number of pages in the document, an empty document will be returned. + If ``end`` > the number of pages in the document, the document will be read up to its end. + For example, if ``1:3`` is given, document pages 1, 2 and 3 will be processed. + + * - is_one_column_document + - true, false, auto + - auto + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to set the number of columns if the PDF document is without a textual layer in case it's known beforehand. 
+ The following values are available: + + * **true** -- the document is single column; + * **false** -- the document is multi-column (two columns parsing is supported); + * **auto** -- automatic detection of the number of columns in the document. + + If you are not sure about the number of columns in the documents you need to parse, it is recommended to use ``auto``. + + * - document_orientation + - auto, no_change + - auto + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to control document orientation analysis for PDF documents without a textual layer. + The following values are available: + + * **auto** -- automatic detection of rotated document pages (rotation angle 0, 90, 180, 270 degrees) and rotation of document pages; + * **no_change** -- parse document pages as they are without rotated pages detection. + + If you are sure that the documents you need to parse consist of vertical (not rotated) pages, you can use ``no_change``. + + * - need_header_footer_analysis + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to **remove** headers and footers of PDF documents from the output result. + If ``need_header_footer_analysis=False``, header and footer lines will be present in the output as well as all other document lines. + + * - need_binarization + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to clean background (binarize) for pages of PDF documents without a textual layer. 
+ If the document's background is heterogeneous, this option may help to improve the result of document text recognition. + By default ``need_binarization=False`` because its usage may decrease the quality of the document page (and the recognised text on it). + + * - need_pdf_table_analysis + - True, False + - True + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to enable table recognition for PDF documents or images. + The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. + If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`, + in this case tables will be parsed much more easily and quickly. + + * - orient_analysis_cells + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used for table recognition for PDF documents or images. + It is ignored when ``need_pdf_table_analysis=False``. + When set to ``True``, it enables analysis of rotated cells in table headers. + Use this option if you are sure that the cells of the table header are rotated. + + * - orient_cell_angle + - 90, 270 + - 90 + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used for table recognition for PDF documents or images. + It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``. 
+ The option is used to set orientation of cells in table headers: + + * **270** -- cells are rotated 90 degrees clockwise; + * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst new file mode 100644 index 00000000..546ddbfc --- /dev/null +++ b/docs/source/parameters/structure_type.rst @@ -0,0 +1,52 @@ +.. _structure_type_parameters: + +Structure type configuring +========================== + +.. flat-table:: Parameters for structure type configuring + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - document_type + - other, law, tz, diploma + - other + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract` + - Type of the document structure according to specific domain. + If you use default manager config for :class:`~dedoc.DedocManager`, then the following options are available: + + * **other** -- structure for document of any domain (:ref:`other_structure`) + In this case, :class:`~dedoc.structure_extractors.DefaultStructureExtractor` is used. + * **law** -- Russian laws (:ref:`law_structure`) + In this case, :class:`~dedoc.structure_extractors.ClassifyingLawStructureExtractor` is used. + * **tz** -- Russian technical specifications (:ref:`tz_structure`) + In this case, :class:`~dedoc.structure_extractors.TzStructureExtractor` is used. + * **diploma** -- Russian thesis (:ref:`diploma_structure`) + In this case, :class:`~dedoc.structure_extractors.DiplomaStructureExtractor` is used. 
+ + If you use your custom configuration, refer to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition` + + * - structure_type + - tree, linear + - tree + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.structure_constructors.StructureConstructorComposition.construct` + - The type of output document representation. + If you use default manager config for :class:`~dedoc.DedocManager`, then the following options are available: + + * **tree** -- the document is represented as a hierarchical structure where nodes are document lines/paragraphs + and child nodes have a greater hierarchy level than their parents according to the level found by structure extractor. + In this case, :class:`~dedoc.structure_constructors.TreeConstructor` is used to construct structure. + + * **linear** -- the document is represented as a tree where the root is an empty node, + and all document lines are children of the root. + In this case, :class:`~dedoc.structure_constructors.LinearConstructor` is used to construct structure. 
+ + If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_constructors.StructureConstructorComposition` diff --git a/docs/source/tutorials/add_new_doc_type.rst b/docs/source/tutorials/add_new_doc_type.rst index fe2f1569..c8a0d1dd 100644 --- a/docs/source/tutorials/add_new_doc_type.rst +++ b/docs/source/tutorials/add_new_doc_type.rst @@ -23,20 +23,24 @@ You should call the constructor of the base class in the constructor of the curr from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter class NewtypeConverter(AbstractConverter): - def __init__(self, config): - super().__init__(config=config) - - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: pass # some code here - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: pass # some code here 2. Implement converter methods to convert other formats to this format: * :meth:`~dedoc.converters.AbstractConverter.can_convert` method checks if the new converter can process the file, for example, you can return True for the list of some specific file extensions. -* :meth:`~dedoc.converters.AbstractConverter.do_convert` method performs the required file conversion. Don't worry about the file name containing spaces or other unwanted characters because the file has been renamed by the manager. +* :meth:`~dedoc.converters.AbstractConverter.convert` method performs the required file conversion. 3. Add the converter to manager config, see :ref:`adding_handlers_to_manager_config`. 
@@ -52,15 +56,15 @@ General scheme of adding Reader class NewtypeReader(BaseReader): - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: pass # some code here - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: pass # some code here 2. You should implement reader methods according to specific file format processing. -* :meth:`~dedoc.readers.BaseReader.can_read` method checks if the given file can be processed. For processing the following information is required: the path to the file, file extension, mime and document type (for example, you can process only articles). It is better to make this method fast because it will be called frequently. +* :meth:`~dedoc.readers.BaseReader.can_read` method checks if the given file can be processed. For processing the following information is required: the path to the file, file extension or mime. It is better to make this method fast because it will be called frequently. * :meth:`~dedoc.readers.BaseReader.read` method must form :class:`~dedoc.data_structures.unstructured_document.UnstructuredDocument` (document lines, tables and attachments). 3. Add the reader to manager config, see :ref:`adding_handlers_to_manager_config`. 
@@ -78,17 +82,21 @@ General scheme of adding AttachmentExtractor from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor class NewtypeAttachmentsExtractor(AbstractAttachmentsExtractor): - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: pass # some code here - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: pass # some code here 2. You should implement methods according to the specifics of extracting attachments for this format. * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract()` method checks if the new extractor can process the file, for example, you can return True for the list of some specific file extensions. -* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.get_attachments()` method should return a list of attachments that were extracted from the document: for each attachment :class:`~dedoc.data_structures.attached_file.AttachedFile` is returned, you can see its code in ``dedoc/data_structures/attached_file.py``. +* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract()` method should return a list of attachments that were extracted from the document: for each attachment :class:`~dedoc.data_structures.attached_file.AttachedFile` is returned, you can see its code in ``dedoc/data_structures/attached_file.py``. 3. Add attachments extractor to the reader's code. @@ -99,12 +107,13 @@ General scheme of adding AttachmentExtractor .. 
code-block:: python class NewtypeReader(BaseReader): - def __init__(self) -> None: - self.attachment_extractor = PdfAttachmentsExtractor() + def __init__(self, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = PdfAttachmentsExtractor(config=self.config) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: # some code - attachments = self.attachment_extractor.get_attachments(tmpdir, filename, parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) # some code Example of adding pdf/djvu handlers @@ -114,9 +123,9 @@ Suppose we want to add the ability to handle pdf/djvu documents with a text laye We don't want to deal with two formats, because we can convert djvu to pdf. The following steps are proposed: -1. Implementing the converter from djvu to pdf DjvuConverter. -2. Implementing of PdfAttachmentsExtractor. -3. Implementing of PdfReader. +1. Implementing the converter from djvu to pdf ``DjvuConverter``. +2. Implementing of ``PdfAttachmentsExtractor``. +3. Implementing of ``PdfReader``. 4. Adding the implemented handlers to the manager config. Let's describe each step in more detail. @@ -132,13 +141,13 @@ Implement class ``DjvuConverter``. You should implement the following methods: * :meth:`~dedoc.converters.AbstractConverter.can_convert`: return True if file extension is `.djvu`. You can see the file ``dedoc/extensions.py`` for more accurate work with extensions. -* :meth:`~dedoc.converters.AbstractConverter.do_convert`: use `ddjvu` utility and run it using ``os.system``. ``._await_for_conversion()`` method ensures that the converted file was saved. 
+* :meth:`~dedoc.converters.AbstractConverter.convert`: use `ddjvu` utility and run it using the ``._run_subprocess`` method, which ensures that the converted file was saved. You can use the converter in your code: .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 20, 16-17, 22-27 + :lines: 15-19 Implementing of PdfAttachmentsExtractor ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -151,7 +160,7 @@ Implement ``PdfAttachmentsExtractor``. You should implement the following methods: * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract()`: use file extension or mime to check if we could read the given file. You can learn more about extensions and mime using file ``dedoc/extensions.py`` -* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.get_attachments()` : use information about file path and file name to extract attachments from the given file. +* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract()` : use information about file path and file name to extract attachments from the given file. The method returns the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` using :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor._content2attach_file` method. @@ -184,7 +193,7 @@ You can use the reader in your code: .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 21, 29-41 + :lines: 21-30 .. _adding_handlers_to_manager_config: @@ -199,18 +208,18 @@ your custom handlers directly in your code. Example of a manager config with the .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 1-15, 44-55 + :lines: 1-14, 33-43 Then create an object of :class:`~dedoc.DedocManager` and use :meth:`~dedoc.DedocManager.parse` method: ..
literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 16-17, 57-58 + :lines: 15, 45-46 Result is :class:`~dedoc.data_structures.ParsedDocument`: .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 60-61 + :lines: 48-49 Adding support for a new document type is completed. diff --git a/examples/create_structured_document.py b/examples/create_structured_document.py index 434563c5..dced3ae5 100644 --- a/examples/create_structured_document.py +++ b/examples/create_structured_document.py @@ -5,6 +5,6 @@ # to create structured document you can use TreeConstructor and apply it to unstructured document # in this example we'll use unstructured_document from create_unstructured_document.py structure_constructor = TreeConstructor() -parsed_document = structure_constructor.structure_document(document=unstructured_document, structure_type="tree") +parsed_document = structure_constructor.construct(document=unstructured_document) print(parsed_document.to_api_schema().model_dump()) diff --git a/examples/create_unstructured_document.py b/examples/create_unstructured_document.py index cf724e1a..f9a026c9 100644 --- a/examples/create_unstructured_document.py +++ b/examples/create_unstructured_document.py @@ -58,5 +58,5 @@ # HierarchyLevel(1, 1) for 1. # HierarchyLevel(1, 2) for 1.1. # HierarchyLevel(1, 4) for 1.2.1.1. 
and so on -metadata = BaseMetadataExtractor().extract_metadata(directory="./", filename="example.docx", converted_filename="example.doc", original_filename="example.docx") +metadata = BaseMetadataExtractor().extract(file_path="example.docx", converted_filename="example.doc", original_filename="example.docx") unstructured_document.metadata = metadata diff --git a/examples/example_doc_parser.py b/examples/example_doc_parser.py index bb1cc9f8..bf9f059d 100644 --- a/examples/example_doc_parser.py +++ b/examples/example_doc_parser.py @@ -8,7 +8,7 @@ file_name = "example.docx" # we get unstructured file with lines and tables -unstructured_document = docx_reader.read(path=file_name, document_type="example") +unstructured_document = docx_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) diff --git a/examples/example_img_parser.py b/examples/example_img_parser.py index 3f136cf1..009e2708 100644 --- a/examples/example_img_parser.py +++ b/examples/example_img_parser.py @@ -9,7 +9,7 @@ file_name = "example.jpg" # we get unstructured file with lines and tables -unstructured_document = img_reader.read(path=file_name, document_type="example") +unstructured_document = img_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) diff --git a/examples/example_pdf_parser.py b/examples/example_pdf_parser.py index 36568546..7fe44a99 100644 --- a/examples/example_pdf_parser.py +++ b/examples/example_pdf_parser.py @@ -9,7 +9,7 @@ file_name = "example_with_text_layer.pdf" # we get unstructured file with lines and tables -unstructured_document = pdf_txt_layer_reader.read(path=file_name, document_type="example") +unstructured_document = pdf_txt_layer_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it 
consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) @@ -38,7 +38,7 @@ file_name = "example_without_text_layer.pdf" # we get unstructured file with lines and tables -unstructured_document = pdf_image_reader.read(path=file_name, document_type="example") +unstructured_document = pdf_image_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) diff --git a/resources/benchmarks/tesseract_benchmark.txt b/resources/benchmarks/tesseract_benchmark.txt index 6a59d51a..fd980a45 100644 --- a/resources/benchmarks/tesseract_benchmark.txt +++ b/resources/benchmarks/tesseract_benchmark.txt @@ -1,4 +1,5 @@ Tesseract version is 5.0.0 +Table 1 - Accuracy for each file +---------------+---------------------+-------+-----------------+--------------+ | Dataset | Image name | --psm | Amount of words | Accuracy OCR | +===============+=====================+=======+=================+==============+ @@ -18,7 +19,7 @@ Tesseract version is 5.0.0 | others | Zaklyuchenie_nevrol | 4 | 241 | 88.800 | | | oga_01 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 85.500 | +| others | napalm_doc_2_2_6 | 4 | 124 | 86.100 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | 1.620e+14 | 4 | 695 | 99.800 | +---------------+---------------------+-------+-----------------+--------------+ @@ -74,6 +75,8 @@ Tesseract version is 5.0.0 +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ТЗ_09 | 4 | 154 | 97.500 | +---------------+---------------------+-------+-----------------+--------------+ + +Table 2 - AVG by each type of symbols: +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG 
A | | t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | @@ -84,9 +87,170 @@ Tesseract version is 5.0.0 | h- | | | | | | | | 0 | | words | | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.967 | 79.867 | 89.533 | 0 | 0 | 86.133 | 890 | 86.03 | +| others | 90.967 | 77.400 | 89.533 | 0 | 0 | 86.433 | 890 | 86.23 | | | | | | | | | | 3 | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | tz-npa | 99.268 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.39 | | | | | | | | | | 6 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ \ No newline at end of file ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['3 & -> ', '2 & < 6> -> <б>', '2 & < > -> <__>', "2 & | +| | <1 > -> <'>", '2 & <и > -> <н>'] | ++--------+---------------------------------------------------------------------+ +| . 
| ['5 & <.> -> <,>', '3 & <3.> -> < De>', '3 & -> ', '2 & | +| | <6.> -> ', '2 & <г.> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| , | ['66 & <,> -> <.>', '3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['6 & <1> -> <|>', '4 & <1С> -> ', "3 & <1> -> <'>", '3 & <№1> | +| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | +| | ', '2 & <1C> -> <С>', '2 & <1> -> ', '1 & <1> -> <Г>', '1 & | +| | <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| е | ['6 & <е> -> <с>', '2 & <не> -> ', '2 & <ре> -> <с>', '1 & <е> | +| | -> <а>'] | ++--------+---------------------------------------------------------------------+ +| н | ['2 & <н> -> <и>', '2 & <не> -> ', '1 & <н> -> <й>', '1 & <н> | +| | -> <п>'] | ++--------+---------------------------------------------------------------------+ +| и | ['3 & <ти> -> < TH>', '3 & <тип> -> ', '2 & <и > -> <н>', '2 & | +| | <ис> -> <не>'] | ++--------+---------------------------------------------------------------------+ +| а | ['3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| о | ['2 & <то> -> ', '1 & <о> -> <0>'] | ++--------+---------------------------------------------------------------------+ +| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ти> -> < TH>', '3 & | +| | <тип> -> ', '2 & <то> -> '] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> ', '2 & <28> -> <ИР>', '2 & <28> -> <Я >'] | ++--------+---------------------------------------------------------------------+ +| л | ['2 & <л> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '4 & <ОС> -> ', '3 & | +| | <С> -> ', '2 & <СА> -> ', '1 & <С> -> <—>'] | 
++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.> -> < De>', '1 & <3> -> '] | ++--------+---------------------------------------------------------------------+ +| г | ['2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & <г> -> <т>', '2 & <гр> | +| | -> ', '2 & <гр> -> <тв>'] | ++--------+---------------------------------------------------------------------+ +| N | ['22 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <ва,> -> <нь>', '1 & <в> -> <В>', '1 & <в> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| р | ['2 & <гр> -> ', '2 & <гр> -> <тв>', '2 & <ре> -> <с>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['6 & <Н> -> <* П>', '6 & <Н> -> <° >', '3 & <Н> -> <¢ П>', '2 & | +| | <ЕН> -> <ек>', '2 & <Н> -> <. >', '2 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| с | ['2 & <ис> -> <не>', '1 & <с> -> ', '1 & <с> -> <©>', '1 & <с> | +| | -> <е>'] | ++--------+---------------------------------------------------------------------+ +| А | ['2 & <СА> -> '] | ++--------+---------------------------------------------------------------------+ +| И | ['3 & <И> -> ', '1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> | +| | -> <П>'] | ++--------+---------------------------------------------------------------------+ +| д | ['3 & <д> -> <л>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕН> -> <ек>'] | ++--------+---------------------------------------------------------------------+ +| О | ['4 & <ОС> -> ', '2 & <ВО> -> <Ю>', '2 & <Об> -> <06>', '1 & | +| | <О> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| П | ['1 & <П> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['4 & <Т> -> <Г>', '3 & 
<МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| п | ['3 & <тип> -> ', '2 & <п> -> <и>', '2 & <п> -> <н>'] | ++--------+---------------------------------------------------------------------+ +| В | ['6 & <СЗВ> -> ', '2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| - | ['3 & <-> -> <=>', '1 & <-> -> <|>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['3 & <608> -> ', '2 & < 6> -> <б>', '2 & <6.> -> '] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> ', '3 & -> <Ш>', '3 & -> <УП>', '1 | +| | & -> <|>'] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МРТ> -> '] | ++--------+---------------------------------------------------------------------+ +| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <Об> -> <06>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <#2>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['8 & <;> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| ь | ['2 & <ь> -> < Ь>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | +| | <28> -> <Я >'] | ++--------+---------------------------------------------------------------------+ +| E | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | +| | '2 & <ВЗ> -> <Ръ>'] | 
++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <#2>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['1 & <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['1 & <ч> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['3 & <БЗ> -> <653>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| й | ['1 & <й> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| P | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| R | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '1 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| m | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| № | ['3 & <№1> -> ', '3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| Ю | 
['2 & <Ю> -> <1О>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> < >'] | ++--------+---------------------------------------------------------------------+ +| c | ['1 & -> <с>'] | ++--------+---------------------------------------------------------------------+ +| d | ['1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| o | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| y | ['1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| щ | ['1 & <щ> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & <‚> -> <_,>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index 72128afc..8b81cf93 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -41,7 +41,7 @@ def test_dict_with_list(self) -> None: result = self._send_request(file_name)["content"]["structure"] first_list_items = result["subparagraphs"][0]["subparagraphs"][0]["subparagraphs"] second_list_items = result["subparagraphs"][1]["subparagraphs"][0]["subparagraphs"] - first_list_items, second_list_items = sorted([first_list_items, second_list_items], key=lambda l: -len(l)) + first_list_items, second_list_items = sorted([first_list_items, second_list_items], key=lambda value: -len(value)) nodes = result["subparagraphs"][1]["subparagraphs"] self.assertEqual("list", nodes[0]["metadata"]["paragraph_type"]) diff --git 
a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py index 6c6ec17f..b2df8351 100644 --- a/tests/api_tests/test_api_format_pptx.py +++ b/tests/api_tests/test_api_format_pptx.py @@ -1,5 +1,6 @@ import os +from dedoc.data_structures import TableAnnotation from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -24,11 +25,15 @@ def test_odp(self) -> None: def __check_content(self, content: dict) -> None: subparagraphs = content["structure"]["subparagraphs"] - self.assertEqual("A long time ago in a galaxy far far away ", subparagraphs[0]["text"]) - self.assertEqual("Example", subparagraphs[1]["text"]) - self.assertEqual("Some author", subparagraphs[2]["text"]) - self.assertEqual("This is simple table", subparagraphs[3]["text"]) - - table = content["tables"][0]["cells"] - self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table[0])) - self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table[1])) + self.assertEqual("A long time ago in a galaxy far far away", subparagraphs[0]["text"].strip()) + self.assertEqual("Example", subparagraphs[1]["text"].strip()) + self.assertEqual("Some author", subparagraphs[2]["text"].strip()) + self.assertEqual("This is simple table", subparagraphs[3]["text"].strip()) + + table = content["tables"][0] + self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table["cells"][0])) + self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table["cells"][1])) + + table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name] + self.assertEqual(1, len(table_annotations)) + self.assertEqual(table_annotations[0]["value"], table["metadata"]["uid"]) diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py index 3d024033..cf6526df 100644 --- a/tests/api_tests/test_api_misc_with_images_refs.py +++ 
b/tests/api_tests/test_api_misc_with_images_refs.py @@ -1,5 +1,6 @@ import os +from dedoc.data_structures import AttachAnnotation from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -98,6 +99,22 @@ def test_pdf_tabby_images_refs(self) -> None: self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) + def test_pptx_images_refs(self) -> None: + file_name = "with_attachments_1.pptx" + result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) + + attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]} + self.assertEqual(len(attachment_uids), 5) + + subparagraphs = result["content"]["structure"]["subparagraphs"] + attach_annotations = [ann for ann in subparagraphs[1]["annotations"] if ann["name"] == AttachAnnotation.name] + self.assertEqual(len(attach_annotations), 1) + self.assertIn(attach_annotations[0]["value"], attachment_uids) + + attach_annotations = [ann for ann in subparagraphs[3]["annotations"] if ann["name"] == AttachAnnotation.name] + self.assertEqual(len(attach_annotations), 1) + self.assertIn(attach_annotations[0]["value"], attachment_uids) + def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None: text = image_paragraph["text"] image_annotations = image_paragraph["annotations"] diff --git a/tests/data/docx/size1.docx b/tests/data/docx/size1.docx new file mode 100644 index 0000000000000000000000000000000000000000..7008b2f82720e1fb620f9d52f7bfbb82b998049e GIT binary patch literal 37083 zcmeFZV~}RuvMyS-U0pW1Y}>YN+qP|Y+3K=w+h&(-o%+@~``&e9ef!2)zxNq2=ln5q z#Ef@jJem1q<`}P>BrpgH02lxS00001z`?E|)c_y>KqnLc05SjskfxxmjgzsBldh7x zow1`fjhnRKhaQDw7OqS4uuY!& zALaO=@Y1e0QtP`bS1Yn&r~2RO%SGsc+>BRmQN#!7+TCi2-275&?E%?`iYfkyI_Axk zD_4V)2NA}Q!HRuu;GpXQ2FrKziK_-!_{K^X3cX8VyI9BK)x8l(ZvZibMX@UQokOVb z0z46@*LNrPKU22|*s}DBdXMVyYUh zIYYB3&$1ep2)y$^hTu7r;(f8sPkmN-&3--pwql?DgTP6TybW=10 zGA@~TJkq?zO@Zw<s 
z;fncNgrlu&Yfwb|%LtM=6<( z5M$*)1KRn05VC^ky_!@_Mex29G8j_eW*|Np=iJown zKFdt_82ma`I}kLnUF8|bN3Q<~U7BJrtsaQ#C_z?;P%Z%Uu5IujxF-v}H;_(cEI<^G zzMC~&wx2a~+pU_8-VLKetkH0F0uP+U-R=(hcZL!`8fy9u0sz2{3IKrktsPfe2P0Y| zTSI5-?@;+yj67GHwAp7t=tBR)3DGu;^kAx~=J{LZoRAM1BXbj6k9@#yfkqA1{OB>rC`Ehal5bCJG z8}l{_@_;1W7lCeNfPiW=#6Hc%VnNWeLJk_@+e!#ptqZrz#j2bn#G%6unF2>K_bF1Y zM~+n@UfPX)z?>R=?=#)io4I1GS#E*8xRp?!VoV!*Ku+Cz0{hP=l0TiZ88A0_rlg3rWxu#Dy5yr7O zl>7GQh6Dq4Ze~}yDp@P!Tn9Le{bI8e`Cnj}Lq#0R8tWF^M0EBos|Z2|OZKXt3!bZ< z>s#8tZaz!i;HOu1zq~oz4Dn>g_h?rxWUt>aA7A@P#Wk1viQQa=Pa%1K4U@uL`I7Sh z^>+hI^Y%YiKQ}&iKATz1>w0b7eB1=Y#s!j^&GF{${F3EbP__+GvEI&}ZS3^C1g$Oz zcQ0?^g|QP;xN& z!KI|l*&T&IxtT>z3cp|{5nW~yeuV=av6ri5y;HlZCH^cm*V7Au1{Sr zL=p?9FUq7VL2aQ?>ctZff&~R7CGvX&xue=3YmZ%-awZR&l1z&|?q_;6GgVE!P~&^J z4#gI{)TWy%=t5cBb&$VsM^dE(z1@e=VM0XWB6BmDMMY)Js)8C_m0E?;MzMD4Z>c{{ z)x(HH+jb9SNThaQCBKQ|5UY`u+ztWf`fm#c*4~fV-?9GARWjfjUAcZI6x`qW1vUT# z;6GgDKZ(V^I?Vqf8GzrJ#rN$0*`p(AT&AB6LHJ4RUGRY0elF!!Zr%n#p;?Z&shZIp zf|t+}=>F=r1~a{!x|oReRJ6nN#_!?C#S)ZJPK(Nz`J2E1QZI?!XDFO0KyZsS;SDGs(rpIQzDh zU>u`$?95%@>OOZ;OUC>YZBFl#)q%b3XVGHJ=w7xZjdvw&+zd{!A-G|uOMc&jsX>~6;GK#?{FprG6sFe2(eEifDV;;(ml4F-7;w?^ zg?%ptDd7%3RM9Bly%5X}&7Ga-mg02Vdf{pUbvgqh^3lZ|(7S)fo(;qiB5Q6z-qEf1 z#(}}fe!Z6fchnhXP^N9x+W?AQ;fE@YCRNqg4Y3Y(?FOR|n=%;9l{j2l=G-hm+MAlP zFEh=9yZqcoeT(;s3m}S}@_GHh)}cLBytsw_JK(B|Ik5+S6Jg5lOc?3EV!Vm1t&@$d zlddijBnI8bP8? 
z0~g!$oZa!pZDenkJYpvhb39oq-wY$)PrRIg##QqE&6zPRuZM$>QGt9k<<9_>hnn|S z6I-EXQI=B7Dgee@@qLR>t0TwY%3KztL1=(tLFHN-fih6VMT%kwqad8$usudJ-DJ4^ zd6vZ1`gUl$JGNsPX^oP5cH)?4-6S}-5H9C~x zZL$kaAs=wgU0lFqO1{(ehAO-sXDLJGCZcte=SNN%$CDpH2~zvLY5VWK$9sl}5B zy_~EnW{v#2X}Nwsi;?qmSpU2{={E@ZJ1dbe%}3bIBx~N+etcSc$Fxc$xom*>$+4KE z!b)V18%9w}widcjfikY07GN<_|K`C3-Djh}hjGG=w~J7hh~7t6V(iVXBHP;L=~UEBj1P3<(uG_f%M4=e{JYU-0B3$Ri%R_ z8ecphD^U%|?JJU3n)=gLM_d;!1ygIJU&It9t+P>@(9T=lN$r4Ju&o=|X`3ZMh5a(_ zLRBqikG7_IaWnDJ_538yN&~g~#tMM>XnIp_!{glsRFYsOljaZDvo~o|w2_n7-4j`bd{Gv^E39~P17Bl2Ge$N|o(bB;Y%+sK9I`VnmB!O&{j|8uGb-j4~3%#e+ZagZjDgFr|rpI4rrVO!@)|p8%PT3-`9xurKN{rsa7U z%6eg1zv!dP#^h64(7Vn|V?}o~s0QZq6TCPX`BX_OxV!5}(54VsWPH)4apmL6$p*}Y ztkJ}#`R@_SgA9p;^qpiWVgLXj{Fj?GGuAgUcA)>u$u6jB+U^UX_=xWDs`=P@MP{?D znJf}hS1p8EP>kO#IrKsE^%kwyZ+Q5m%N>S?~8yV+JyK*>h|VYw6c ztnZb~~)M1|&lpeLCkszK7GeWLGO@<@Jo1NEh0qr4(*hF4-+^5hR6M+ld z*l7SFx~4?%n%u(3BX}7Y$}u@6H`2tO%T0c?(~vBTu*X`*P}x zMMFVrR`CqB!AWXt5%^P`OdF0p$w~E-{!11p!a9Z}7(%iL0y%EkAn zj&|uYt^~5)1g)!cfu_mpvHxt^Jwo{9wz?l?58XVA{nA`9>JuROf0F6 zAKX71F3~c>YdseZA@~i(mxurclb|$+3INvxcNU!v);4GaNH@KadJnPGLp$ADfxP8m zrlsu-GTd-a^=a6v3N?EZ-@*F5gVIk4>LA=Ttk`H|FJ7eMN~TtO;5{x*@y}^HQ^YUC zCVrS@!ARSVddZGFJgR$IOrW}L{|b|vM=sx=Y^WYDL2WFSa66C~WmGWzN{TX)Cd#Uf z>_V5AihqJVYd&C1;u>=d%Aqn-GGoXYe`Ne#{u3Zy)+``23rJ-6r)_nf;pYaMfr+(| zFMALDlj>Q-+M?p&*vHA#zbq7t5KHd%Xs`4J(vh8^wWEKhp^ZcuPL~L zz5L#Yh-M}{Ab+C=pLMnpUx^eIQ!cx@d<9Z^ z@%<%rSuKD80A^wT#ee=qQ~$UB zoJgH?++#uMxynBI%DpMOkRlw{-%4!^l)0JEPy;i0MY9-0MLdB%#!z}0V8~y#Ev-mN zv+KqJtn~-d9sw>BKR-0WjXhsR>C|E=u8R&c-=+%wbvZxOdi)k-^>PdfllX^Ap-du^ z*pdnB?u3>2(*!IQIe|kvy0LOKua!@x{j+}Q}4ACN|5oTGgvfN8Ci&|D`RF@ zV36x-0981;BRNIB%vbS@bGe|?=cSa{%_m)*j&;Dx(9!UxwzehO&+Ki=lzQ#9q=0S7 z&N73ay+wl=$8nq_(iy15GGELfD=`fUUYzKEGDj+$&jPeo6fBDj(F?n=P&MdnU=G~d zWS;=tmJS^p^_+49-l1K_2rQ!OO=P3}`S^r!qZ%c8<&j2%L&ZTMe)!?;qvL*}8p<)y zaXMA(8=*Ie51uS$oySQz?501;n#KT-Hracil9b*?WYaV<6&2H%i5@Joqb{{nv5m>c ze>4a9vSUkFx+d%P-c}fSiq-xxrbsT1#QtWh5akdT`A*xln3un?ZYO0fZ^L^C9k|b; 
zcP9ab4h@Sp`BG!aNUHeH!hWRi*E(iU@^d2;BAd&AOe`>Rb@em%smE@SV|yFT+Yl8AQK3;=C~&d@%MX^_N8#3g=t1iVnzcdK~W zzNSUR<_GiwgMg4_WunGK+9=6M>*me?&-vna@@z25Ks}{6&&44yoH&Ql2)GeN>Wb!E zxfUB?#Az_i%dq_c+x^hrOd@kiUN1Q0O!VZ9;5}zKR*sUDP_KUK^)gIO8h4|8Ev@Ql zbpNP6#%sS2$K~z$h&`S4(&Tfdc}9{McQ}l_B}p7QWk3Jp#^UZH1o82L_4@w&ecc~E z+!iVw6eml_VAt>WOn%~>kZbB+h5y_y zuQ)T=lFgLfyx4H<7k9N=TllzjtS-}huw680jJ*!EdT0GxCN6l+Z{hpR#C0J5#V-HJ zYZ(6GHH9hbF`0jL&wf7OAUuWBuVSNF*GVL6I~VCxHeCR_MMGY``dJ;{KjI+co0k#h zw)PmEzi@qWI(@9v-8 zm4pf=l#l}08Y6pXg|s>f2gDVzCl17rl!$0%x6&^nolxy3+7 z%qF(>cj`rPGFcPb+o@S46rUKb#sF-=kLh;o-Goe=tsFwA<^dA|(ShZcncE6tPv9^% zNLWJYgmD!iN&YImdCDU*{VF%-O8dh?E#(@!G7EWTp*!z}#z{1k0i`ik#C)&ngC*^n zKoJUqoSUt}avW@Za3$z>F=9FVQ5AeyR62|UwQ(bc(;)sqSa4fKm%(7ke>|yv34Luc z@?HL2w}y)0I8PE~!euS_aAcqp8yOil3M6hpLUjEu|Ki&Knj|?>eKH%$(<5|dsZ8(U z#EMU*Gh_Q8BD+?a`+|R`j%AW}I!P@{sP4LZVlq~VGdUx#pv=~GWbQ@sy>>}+5o@`bbeG&W6G~&Rw1@ysxDL)I@16qFz@NGs&wm9;37D{ zfbJ2T+yO%J2?@1861KLw1cehe%jt+0!Q*jXzf7JTf8*lHN{E2lWYc*>k1&ZKE>9Q< zbXU#R{AwSC5}s1bmN#e8Ensy0V}JKD-%mf;8O>LvJtkwR=yr~3f63y89 zi=m)))UlUsii#TugN(klzcP2}4%q_Jh|VbH>ZTl8Lc~uj3VBX=6087?O=%HuGa86r z8E4K5`vfmhGm3~rd(Hvde2JuLD4t_o{5RZ4l2*T-O=UiA*BWbOsrPm&GC=A=q$)u$ zFEFDH69(n|TUZ3&BGLjCmXw--F$0aMx*)w}Lo-L!L!t_vge>p9^E_qCc`Dl6rlHU6 z+>3~%rg3YtgGHKAvi4AqTh;oHCRHs@t5%S!D3@SzZObnNAS%&W2>51F54pOR&ELe_ zu?w>+R8kDp9!1PioZvjRyysm2wkcY)aQ8NfVjG45lWQg5aV5acWIsYV4Men&tVNGWp1_q_(?quHD(|-Dmd#S+W6GzvunU$4* zI$wmq3Gq~xx<_(W{EU^j@x}UlaiGBI*L?v{*+`2f9Gp95wWjh;?(w6>=%tA$R}Ueh z5-}^M;Z?rbF$KXaurrwj+rpLz+b$w$yvIcIe!OpZyJcR@ql9v8fuA-0@Hd09C0l)B zj@VyK*&r)~wRV?znsXJ^5g;cpl1agX)U{fCzcz(*E(C{4-JimkrtZ@hG9%}l40is^ zgg6=~f+uQO3Tg7_lbk;KdtA=Ql%ETN0RWi)008)Iwf(ct=457UZA|-*=RY#UbM?)L zbrF;<+IyU!+*BROrh{K3TYBRzb0(~itI`rQ!*gmGZiVz2HYGP*dp?}PXSATdmWjf(Nkg%u~FmAu3JDPN1N}|)?hN!KF(hCz? 
zoE(MZS<*R#hZ1%dZEu{V{%8}y4CC?>2hn^*QTW(fdb`pF$fWWkSZMJPV&#{F$H1~p z8Q&L=VhxMjghhHY7I5=jjqC+v9nmDK`JO`mI?jxiaoFR34{-#2}7f1~L3A$97EPuKdB#KE@o2xxP%Q6UQVxI-|{ z3`4$;L%oTB*V*Bm$EDhLjm52T35U++m-U zeytDGcjW#05jrl_KY+B=eY>RRyfcxjc=;-WzVu7wKE*Bg7E5%)BxhtlSkNnl) z)32|OKV6T@)7u?h&%r-rBcIWEeQx)fc|Rw)68m0np9lB%k9ECYkKy4_uU%lkgr9Mt z2o|gRfG@xxUjR_ycJO_;qdNg|^F=ssDZ=>j3;3SY@bhY90KuTNz{lJ>P#s_80BNN! z#Q?9sZh{CvuO$#9dT<2nLI=j^wZ*+q>*>P18t>c`1~viobPw0n%N=YPj?Mteq&NVR z`Z{9VNB!Z6Mj8rZqwtkVH$}w8&fpvj1xDk{o8y-N#nKbFnc__51x^^C+_j6#0o*%_ zHqJNZI6x{UTz9*o4lIH7lEuS{m~?=U!32y4@CSqsSoHiUM>fsRB_K@wt2y}@M#F)k42>tqp(0k`Pr_v7bG6B&FhSx~ z#T=&30P+>by`lNbHcV?SGP6P!kcm&y2g*qR!#i+G+#qu4baeI}5QI$=-jO=jZDY3B z!+E}5<1-wW`A#f$+($Zahz>0@im&JG#s@CqRs?N%ZC!hb+u&Xl$y@VF9j)>O0Al;ksJWe#z4u&Q>jPRw^kN~!>K-802;oz_;} zmjV2;01%8#C7z##ZvD(JXC#kpu4AYxZcNFSm^x;ZaN+r73V5AMrDds}>HbleO9}P# zF!VZ}AvZV|F&ytzf^$8n)YyuN9;Nvj_v^!r5!=y|Nnh0OqqL}#G;(A@f*W&J6H{Z}ZM}|^X7ILe7oqJOWppp)sYoQL^B9DT z)u>N!3$_E;+K=o8wrAdGd6L*0H{U8p5`Hk)ymMq1i440v$qA)YzLj%9R=<+9MH^jU%v>9aLjTyqIqgS2@+kZcOl zLtZXWsyMwwq2XL(YoE@(Q5vW z3+0oWfEq%E%W3=cSHh0@qmJ;%@s)}YDZnPgUE*zsWFG@!jqI2J`@ms@zDnT})yWyE zHSNhV^8lk`v&so;#iUHtLS@q%NyWN(Qie>G;^|VBNAtGxxk$aQ?D8Y%iiIjp#gb)` zMe%-8Wk!6^wpq%lO3_@?>4nI&#rC+vq-c)r-F-w$)92r&)PJnrqXWKsLVxJ~rHJlm z?Brx_WBSj=kLIck_Ash9e4H;n!~r#mr7p&}NR^J>pg#rC^kjPR%FM)lZV5#O-0f8> zH?+893GB~s-GLkqZRO@E?TWVhK~>%wRa3k*{pE+u<>Pl$PF_paNS{^ftxle=6YEa9 zW7GEEt5liF-R8Q#c$?bqYP#Osbh>%_u*R%cjE`^bRco`h zt9~?XBDk%?pP3Z?>Te%+X6)$R)}eZRXu@pIv{{UfO%0%T^uaFn_4eVls*VxVs%#&e zxq4lf4juO~MPk+er6=cAW*ms%t2 z=(F8}&9)_5 zxOl48Bn&m}x@)!SKCazZoImXG)W5(XeI)%{7$@I>(;wG|tu@z8I>;J7_!BRv3#!{L zGHn>gO-()?~2_wBEZEkrU z_g8AV-8Z%nm%qTJb-I{zDd|$$=lf$1C`wc@_az1~;1Xp1Fbe2Xt{15LM5&n|3$ekg z7zjbJ&WpBpJax+={v3=Sj78`ram0vU+L{@9ak)x zjzD#*3aQ~1j3kYr5GdpPa2&nB0Hi$qvVUlRab%*Sp%6RdMYJgcRTK&hwMmIX*)^B) zV$g$PRahJjeL6&zsGUBOktjrk{WrO+g&;wQhy-cGPl7>uEtAtA21cAL3ei9iK|7D` zlR&@bk^zfZwNH}NJ3!7^KBYGZ3Vuc8hlo9fTsy%WNO;9haEqG}5mrc=HiNs3v 
zuapYe*vSl>LB0R?5yMl9M}n|Q2(xN*xE#=+6}cU^gqeR;s>59rh=d~5uy<5B=4mJr zemDT4EogrMsLpTJ@?4^7Y$YSqWv>Vih*HqxAVfNiav|a!_XG(r#*E}XL={vt`;TIZ zK6(s&NO3fW++dg%nT0tA_<}l>B+ERX24)RG-I*;8>>B5~YQ3^Zy_T*eVRkFqASJEc zvvTj?pCrfWHzNIH-hx6Bu|%zTX(J{Gl-Rk!7}B%YwmpXeCHABwD_=0n5*b>K>Q}FWNXRKC)x_xO4Y$gF zyXne?4WF~E$$P8}LmtvC$0XPV)v+6cR*z=qxMX6`b4}p0h>kQeZWz$oi-^Ftx+EDf zMSV_Xq~k<77IXZ8o>Et#vyHihKblm04xXI@MI_E4Bd^MvZq= zb2|3{tY4u|0ZWB2Ws*Gd=aruDfU=Wan}JARJ?`=+!HiPc*$8_z80748624I0$AnoM zshKi?JU3JUF9E4Rbs}X864dA#NTR;ke37$I(F2A2ipdtkQ8p*z4Gm%4Y}CMddU33$8@kq6A>5nw;Bv@n z17y<_4w4iI637Adp+__J9aofE`i{fPAR@al5fxyXvh$&;!H_Qo?1QA0?IWwoGt3AI z+}G{|AsJF*1ffT@(BX{71MHat$zz=}f>~MuCgzB-;p^rAf2kU;QIn6zETtG?O^T!e zFsD{MBCcsZI&Ra~m1O^1aDk)T<)A04Lr?=fpn9J0h!As=&8)~CWe7?FY1GDGWz#^5 z@M4=sf--E=|`S| z9+X}RL%h7k?PNo|QB@GI=sklx=%wpQCWqMyPbD|?swT$0#(`O@hVbIis6%0_GgiLN zX;L!2P7UR-=F9C=CdK(o&<$2@5y|f%<1#Wu;uHKT%K-QCk!zH98`h0Be$L8QSm;$R z*1$3)wj&I}43ZEtvt6_d=?+%0<^E(*4s>GsTA-rl$s-xqP5dRXi!}VKVWtY}9TV3> zUy$tBfn1|ZZ&lEk;c@+I9QeEb5&5v_r0(7(?&g+JeV$zf&#h43h1qs=KIxSG;^47w z;)d5g3E6FK`JeNKNK{qRHu>z`%+q*vX>}JP)&9v%f zdwaYr`nJexqgBY$8y@S`03x!d6n5=LtOVb8omb>z4Y<|f z#n73{I?Ut>JlVOeXH-RnarbjdRC3Pa1&grp`;7qV@>^uv{wETl{F|XCR;G44mqAJO z>&%00nM9%^40?z~HFUC9M_ATyqy&HRt$*SB*{An?;JmN=^DXb+*4jIR42a^sCF|ce zG5|S%p|hitt+kr96|K3GvGqTSMELo{0OYy&-xL1-ht(7}Tpt}w$ZkdMp>|VA-8v=da&HyH)$8Jg0IAS1>tj^E(uiDxFq3Orq>ei56?X zVVPX&<`gXxeFaT1kVbW@WN`AkzXNThEn$b&5{C5f3Lhg*RyWJ`*tO~>jnf6532<%s zmt}bZ(7&{fK@mQ2S#7_6=D#=lh6lD)Zoegh-+2qhzb#bG!Pd_4KU%F;Y_G-lMnj?R zBRqkt?bIdq$Yd*U=0<|mMgBU$9TA`@OJWXY6dzAxC{QQn>Dk6xR?g3Kh7N+q%&R|C z)ZrjC#5EM7Z3hsD{7biEqOn}R`fm9=VRkZ-2$fFU|BjC>Q)qZ;JxRQZ zfVk@@P&|SOUs<-oJCc8eYwPp3`t&Gua2ouUFnsIt-+}Xw()a%ga;`?SECHfPVbeKXJ4g z7_!euaj)7x&6eO|WCiNc>07F_&ZtyANw#5Acc`SqaAquyHgW1fxr$GaTSqe!*fdyV zb@?(P-UX{X?tJe4b~CAzgD%d&_kj@%%)bK~<6jNI|5KoKsB7Bpv7z|rnturu!^J37M*;SgX5Q@ihRQi67VMGwro2HTTr?VvaRw zrtAydci&*NGrdP0h%ATQLdQEx`s1>EJ@D%6Xmz{DV)JM;ijqpb{KEY18ZZJZ)XVZcN>)1OL_!h$$j}<$)biAiLR1jtwtrrlkS1 
zrffl#IzHtvgXmxr^cdiq({=HP3n-D>q!Hk{$oBF((uscl`*ic zbZH&CQ0$UyV0Hu7Yan=!jjkKqbLv8eU#IG&Wgn&{$L^Yo|sUb808+k`zfKCaz)ujbxVsyH*A9tAzlvo3ia8E9E)b?fpg}KqL;vKyVC34xz8^fU1U+nZIpP9 zeM&zHoOvj?4mvUc0i}qy-8*P>3IdPJiuZ}itw7*iK)W^G`8|?@zZ5DrwWL`XBO+_I z|9DUk)6?Fu$MBTOL_Ko_@|>W!inX-Hoi()`SPe?KU^0krrvqy$mt{d)ze%+!^87h0 zcHKF1?oE1%*^Bas)17D$bN2w8B#;!Kg>;n(mo^0U!+{qgz87LM z)R*FRTj`(*Mu$lPX_5em$bIIHxXvd$J71>81I$o4XkTAqF%8u|9uhktd*;&9Iiam8 z=w2L52xt<<@&Ht1D0v;pj%PgyR8;nkuwZs%#0^I|33ksL>SavGP&C)bLAengDlrh@ zVZqBPp2F;R-_5+(;SzKfWJfdQWVAEP+LDPuku}MD+z30*h8%b4y3DoowI$ zZa=nP=g%}YWY|s=06#diS*xp`m@DVR30{VA20Kp4f&XIgY*CfaGZ~88*3X);JhlT? z?X_!(V#Ez-17Y|u!`D+^lq4ZSmqWGgRoLVNbY(a&u40nLjg$p3<(EtqaO4T%q?lxx zaBs+1!h?9$2hC)vhVk^;A6al(3}@n0-(YZcbZZYuc1)6aNfc4~5q>E(5Y4)L<&)6sdy_le!jux}(&dg1)^#rIa~rJNA0QuQ#uCRota~CU=~VXu z?AscJ+-T4koUt(;4q5FHvE?<&WUhz?50yag+ACiy*RStL^Y5(Zz(4=d=3A2ez0K*r zVLktm2W?Q?mi!LYKIrdw5?kvHGRG*K=F6~PPVOjbtCt2{(~ zI_=_B=JNX^2*Y1tGp`13;wykA!#`dpzM)?lj?_leAu-|ffgLP&z04_u`Twl(|vs%_J3JuB>^ z|1IP4jTG$f`IZp=E#qSPPXb}oG_PepA57?Oje*VdsCh2+kV{B4BubkTs%f@&Tw+Y3 z!ts7H%?GQ?{4Edf0-o*5T!;K3F;`2ct+pVxkpvg~rG@>|pBO{kHiaU&Mxiw?luB*P z4FeSp09a^arC*Ar`AC!Cqxkg2e0IxCPjRDm0)o%V8|aSn_rO(Wr^|t<9JZ^tI@2Yo zvYx4Gu*j4FoZvVAIt`8MZdkjqk5rbb!j?Rw|CSIzsJf2)`_ zD=0;cr?SS1B|_|L#cmEn=XlZ~y?5?g9VbBRw|G)&|B7->sd$uy?LK9)mTSya`YD zg%2@Pr+(uDVS^P0}JtZt0Q*%{c`!?bTW0&rFmn+_X};RXPq_jZGCd{_Vf5^ z`*T`$)phS8bJNGimsQvL5`DY-qiLn{W7GEQW83HM;p5}@A#_k>`&lH`zT4CH?d-{x z_u>iu%GSoE`4`&Ri%pZxmJMtB*LzEc_QrVyYBmI z6E1KJZqU{U)+{cBIt)lJY`{g{^`E>I=})C9|FA{A*D?(v`Sy4Y(&2Lvfs6Q-O|cg4 zgj%e!BAK)GCBDqi!OqX?eUtW_s|Vg@-tEfcbzL4_uM%*dw5wFSl#0X8OkCvJe!9EE z4_Y+Y!fxf*+9B5h(H+*a$xVacE4TuyF5fqI&#$i6PuGX1J5gU6lXO*A`eme;#e&(WjDI}iKr?o}Ft%hVbk!`Qz2516Lj1t*^UoaB2Uz&ww@CwL^BGDn#5^>4v2QUstPZC| z8fve`J#~vH#bzhV>611D(BA&BSt4JM3tY|m;(Gk{Qu0!*^2cjnuq8DA!ESpAHVQZ8 znBpBiHgj(Ioez&5jm_`laq{)?1DuIn?eJjg;Ax31d}Kd2J6MIVeQSqxw~rbQ z&wzM@S;oL2Sq1G2xa7z1?r=mX`v%N-I}_~K(!*f;;DrNsO^DC){^xr6=OJEdy9MFH zHSY<|)AbD}TSd*+vGqsG3$04Q$8wXKPPEd)@h<5H#E2-#fSCTOQF; 
zjIvj)ueo-cg~eZOzG~(_@DqbZ)6_<9#1xFcOs~pqA0W;k@3E&KF8IIB-mD))n_9Q8ZZ%V% z-rn7LrkFt%s|LBj`MI+lXhJQ<{P-a6TXnQ%MJsu7yQywDL-=dgytJ)+wa%YPP|#o% zS4vL7-FP8383n7-g$+$^w=*B!yF3Y7ED0muIQy9(Y-9!D(m{E0Ku4$!4Hafx2Six3 z*2Y4#WX?(lClCso7z=PTT!$uLG)6=$3G#6`$MC6*QfCEhh<@sK3SN(!=hdcRir{1p z)ux9>n&)9x1*f@KVUac15tofLzuy!FL}C!(Cd*n$QW^~<1K1CeWY(F*LG>W!@#Xuk z_&#@NP*Z}k_kH~GeS5icsw0p68QbU3E?2|4kc-X83_03q=T^25vGCJ6nsevrA91?1Br3C zT#;BgsMBDyO6n7Y_Q`_gNs=ocfDaY1c7)UM~5s5|jarMs zh`A)u`9;HDD$%l0SSc|80)ttvZmNe6hOz&vy|)aGTUon>&CC$9V`gSMW@d_+nVDl| zW@e_C8Di#`Ddw1&V~Qb$JIUGSB->}d_f~!XzN&Ys z&D11t2N~mgMM%_Y;z!uEmsyG^VN~MTsl)oez^R7|k_t)4sKNu1oa_OI zmloBOK%$I9wHiyOtIDp4NsNaAp2BKolireZ`0SFy=UqYigP}{?$)b14>X6)^5@9K3 z^wj2Mn`zQ}NLeRWp{~F>wf^SuT}10Re^LOei+j@6^Topw9V(kzl;U6s6)BeTYl}qd5UXu!FZb5l-+xMvXs%w~*8T~!97N4gZ zAKEV?YxK|LULHD6fZs_tdG(F+jzZbq)$HG5^t?X3M&mOMm zWab{PQq^snm{d|($Eq@LEw5H)Fe55IhkJV-q@r6+Fl_`V?_9Z3T^D!BEa1q-6JzY& z`nu>tgV`sXevFeBSZv=PlD>o>ytLnX`(|#-ec!s~2+lIhevfdX7rE@2NL{O7Irl4C zNoc**r_zT=or}*i`e5sxb;wRrc1gK5@`U zq(s>?RlLOSxtC!%A|(84L(I@rTihc>?V~LFrjU@oG)T%~s>UEw06w2es0I86noFB^ zgZXG@D_KfMZK2me0_sga*9=KqEG2rJ26iL9lW_V59p!C%lRe>H@R)37enC)X#xb42 zaY$w0v#Mq_Hd*;Wl5A>l@W*Qc5s#**tt&FcD0(-M5vtH#G({om`CIEcLp(4*5%{Rr z*yyne9X&8ZH4c;RG=r=>&Z*%M;;CIg`p{JB0(|DWJQn%lFn)~a#Rt^3K@40&SLSG4 z40VESXcxGvX2yB23=?6(FRRzNq*exmCjx?A?s|P#JjpYg=?e1u^@bVH8muA)W`mlJ zC%dHkR~k5|!)%YA%pRS%ho2KJNabfruKJB2Iwr4L7k6eqIw**iv6UQg6J242ZM?6> z=snFUeQ9@7z&-oy1vBESP>t&C@^Irc5kIC;n0rq&IbK-8pOu|mrO;guQI53tex;AFSV*wAjM?o6uQek`J6EP850F|QNr#JNW3+CgHVDX9mO{>9q6r71jBllqv$ci?Ew2PP=Xjt@wdlG zg3!<_VAN_@69`nzD-I0tDl}M^(e>;6$~L$+g8J$<(j^*JA6BV$K&~jfaGrFmJL#JE z=)P-^tfJP3IZzVupP0;-HGmX2ny65@?)P&EX}-C24K_#qnw^ZxR1N`Ioit&iB1+m2 zZm>C{q5?mEtO;xFuYyzogB_itUcME|PJ0q5$7liQ8LSKv&|0NHUE~L;AXD_Z51A={ z|5g&jW>1FcY=zwiI7<7b4D35liAar5^PwhtLD-?oU{BrJ4S47tdjmWq91%6-^nO;P zMod9#!`X_}6=UkHz_OtZq@c1P1e$R98#g1ZLexaTxotmwKoM~bjB*U5;Z1%S zzEC+}^f?F^5PDL}B3G+YJOkZs;V8c0nyef4NoTwN$j}{}o8W~Z!KCk`Gq=jhFniX+ 
zkP}LI%`QWxG_(=NlQG9G)#rVy%c_*kL|DR{`W0Ov+8xKHD|rO0toqTMF1R5k)ABbi z9H>wq12X25IY9-mCSuD3ss>i{ChTRWvEUUNCi2aoqy4rlKAn2#dZN)rLv0xJ#@SaB zgsTit3Yg>OoAHQF4?=?~iV}v;8=}!lD2|K2M>H-ILOTUE1TOM57(vda=L>&xDNOKj zWR+NW+C3~k3QLJZ0UrfU2+cYs&1gQ(u9QYJfeI0F+1V0dAySc> z?^}`k6R(B8ieJ4pLcUaDNVyJSR93EXW_av^0v^J*@{b5&Q|QVeqwBM}%FY}VOE=0P zec!0c`v_G;76eX~IzaQ_`4 zCtv6$laoJGc-~7j60^me)xPQFJDOn`BNC+%k)##8J23?4QlbyyF{P|>0Rf>R&4iVV z4TgmDvsZ2jrfL*OaICzZwro$Re?~SULVIG#XSFDh?yyilXLu%J7@K7dj&H=-MCPn8 zC<@?P`Ak!eq)PX<*I5QOtS}(NV2Ve^A}xot8{HVL;UjYtv61!C8mckO`5VQW{-CyL zw6*mX~< ze0>No`e=%@reW(4m!|YnRYgr9ZFW5h=|#?+n&q~Zp&{Y#H>dLrO$k%FpyAV zO0yximn8}fij|#56n0SDRsbv2W~ZMz^i85KieDWP?E#}ktm6QQtkAej&a^*~5N5*R z(FI_4VXVbOWO3WuE`Kut*+zWJ5pMNJUF!ydx}!2cE#f5WPy{uZDdUj?ILa~`Wai9L zC6Ggv1?{=;l-&+ZP`1VOK}w{ybiEaoQ4+us4wI3*;VYrdAmSqd^8vt^577qUnJJ|V z{ZH4WApW%lUBoh=D6JZ!>z4*5&`(ke7!hGILm7!4FJ&>po%cEYjf0h<;h;tvp{ldf`0ZZwE^`*(MYO+zifx z(2{^{{#-%eO=Ptokluh=cLnf3g)=Z5IT3JKd!3AmZewnAWn`1^x1H;W=U2P8eJ&cO zo9R;EXL?z4oqQ^OLz8i?U$DHa@1KySkHx>4e$zmCM6Qtix~HUT?aybRn!FJM-1-U3 zr()?lE7=_?6nWf6jKk!m+cI$hnsM(U?iXPXpb1P-2T9{7d1D;*UHg_jMbvj@F{IR( zd}3=Nr*!-tCn?K!oEFQD@R^Ver1ut_lQ&5C-qwMTx;QKP*3i1(D`_BewHC-CL{-hX zW2v9s=&)rE_+&pBZ^J2N61LmIo#W&<+6Uf70UDv(s7c&QeN;pGuItFhsyiol0JAnW`)}7cE1zn4hHhIu{npRRFHd}1O+O#XV zjGOj58}xW}-Oo6sti0apbf4Nm=2+X)`gbfOV~&{*!q=pUJhh{_3ez6vlJHY@sMe(o zXi6?jX392Uz+vPes4lIw-7_VNn9QnJ(5133;oy0s;mwKGqf$_W@f7F-5_oldgj~hn zEqlWak~=x2+`-*FH?${5pFs^v(DZSK8-|Y}F>(PKz18S5%*^`HHl5(v(}U;>&bFuH z^MmMn&XxN)S10m~Ozr1KV~%7_P1e&qitSJey<(6oS5PeOYI!e~#nVE| z;9%HZD4kVJV5r5@Dzxzwb1c}S%$j&eV1{u<_V5n0YacmsL$1X>b*mtUIuMNbQ9$P7 zhSo(GV}@SxXM!jo2y-IshEo`pXX`1EMC!Q|cX1X7lQOYkw53S&8cA!hgpo6Ic&K_= zMv9*vT(%nBvAW%3UOj`(QDomN`ol5>9ZTYo< zbU@|F#cYg$#6K~{N=g7zB%Ii|n4m{>K|s#G+6HMGy7SXUIqZM_ICjAWRG%qyRCTMG z&e;bQ!yOvFjHN8&GW7zpn<~$>4PBkQ@uPw)JF#(uQu;t3`*R())FF4|=L$DgStj+W zS=ks68O_LqXQhn1C=jYiSp|ojKrimK-CL@OY#6O)_m;M`0Gvk+5&2xS4+n!MP|7}D zE7}Fi1Y5DSYLum95oMD)r}$y-iqphJPV)BasI_>$o4Q6>2Aq|Lmc%D029Sr94`rVK 
zx}9nuXa$&z8bmxIT3+AvZ_PLhX&}=2XICLTzv}=1eQW&Y6>p_wQ{eATn$NHoOk$i< z2PL--&p>%SDsHi%FWy%5|9VZ0DTC!eob zgmMC=_PYrs^Ckrj95+Bq%TFAAD?iyVepU=MYecZ_I6$6$>B-(<$Se#+W&9FhGeKTw zP(a9j;vv?mpZy-6OTC0~S6{q%k<#Cdr9AeTRTft0VPU&L@L?fAJ?Ogns?kp4P*-I< z5p#qAnCY%9Qvoig9-)#aNA65FV3fM0J^N&(XV*DW(eA@;vVh$O_#vv6b_l>VtEJQG z#h4w%ge2smmoT4IjwbsaQ#nmThLSx3nZS=@KxlW(^ZxnU&{MPW6Ze&@HYT>m8o3iu zhN*P(6yhyQ5O0ttt+j#%ONd7!>z12AfV3Fm*Y`P9M)UKb=M*-9m&m{^#dn zHqvo^7tSjhUv3CXf)8iFu!M`SDqfS|-A9+mTtJq&ay5uFGT!bzTwfi5Yd&ekAar9N zn4Ex0ZF$!q+-d<;(AgG))f=)qUZ*9Nb@aS}Z!Wtf}d@!iRL>1j)em5eBJP=qQ zt7}hdh`{8ld_<6U3`oGVWsDctZr2p8S+r%cr`>LMRfe%k7bc@pOnxXK9E_g2PWikG zGj5VL$02|sfd~x&C`s$yg_(p95Q{_75}uut%?Z=2`etS4mMZCIq>&w)&_8R~n@r7( z!}3-zn+ZLsWE$<_I;|7{2`1&QQ!4cBpB(2>Af`(!8G^5iCv#8+8efpZ-0le8fpME8xuV6j6h70Aq zK&IZNZy~a9a{qW--QohliMTsYrp~Bu(KvsybaPDK-~yqHpggZ5p9Nwt#~JyaqD9!0 zEJPp;Ig#Jd0TU&37?T5L4VOfEo$g)-B~u|dqU(jfS(`BB()1|0!4)n|v|cmh>N-WI$8`W251AsBgBq z1-?oVL5~C`hMj8G!uES`*d2d&5D|vT2U+U$Xyqpe0X4OvKQG{?koK#Dya7SRM$js4 z>Ypfki&KJJD6dYwQ*@t#e0uHH6sWkmo866f$|(kffq@OwJbXu5ceeb#Ub5QIY%oSH z8A8CBS)lyOrwM3{-6`}W=xB0@w;0J85*nmKS?Pt^Dtu(j2VU7{vl~{$BO+-wUoy+C z8nEdPt=Ke?dYRq1HSZN?W~|7h1=4P|oqO?TN3Hf~hAUDH94BOOrpF7Pio?~%a17Op z)p*m`xnAI~rQ9kWEG`0}d^sui*|WcWhAQC*sBQ)k*iC&je>Shr#$clBh>yVtm){(U zi5Ymfc?#KjseHV-7vlB4icj9ZOu_zsw57d2H@^QQDYxb3D|gj4zEBVmaT79rc2|;o z3|WcGnm;#3fAwScZ1y78hgXX~Dcw`u>eiz15pbRMgxuT2@yWhiX6Xe%P`vFs%n^DQ zO?stxQ&>+BV0e>e5(ld@I}d#{$@q+4;X6&rMAz)4+slE;=S7{d$podg1g8R}mWGl7 z+%u9`1Ku;z+X9C$X`_GLg5jad^{okOFgL;5vU%>%i>69H6p&i+fHuZ5JCkQ;D zh=mM&t?7I#fr3~92LHJ1OP#gJkVslQ3L+_CG1P9?V_%T`lkpJVKJQ+D(wD+*)ML8^ zO!@e!&tlR3IDt>bWB0->$YQ>J;oxDOM$@eVi>7a~0QoTmn7GddC?3Ob_JOrSV_Z3y z>;?otvWh{omm?&M-&3vkQMXM3!NFi)IaXH}O(C-7A~OWKiI9m5@RvPWdlwB=03|rn zgdk+nMxMRyCyehW$p@nrQ4*Gy-dzq2kwfE$yPD;roGJHp!?CBxO}*MKD%;x z)^DVs6NK?KicfB|%vMu`WEtsi%P`2DFCAFP2Mm4?q^LzI!m)cS4FX@lPLk+FMs8_$g(M%gqSsF_^gDH7_%jt0Bu(kUolCR&iijwYz! 
zP$U}3uG2eZpf(H)KP3D9NE|==#k08qj@(epT2{oB)^?T#I6{QVw@r>8pg*QmHb6Ex zIv-kRypN94rzR;T)w~Wfi=v`LisYXa1(JaUcBZ2zAsB^aXTCY-3`PP zP$ONFKQTvIOCp%V5S#^s;p{6HwcOego89S~s+iDFW4<^sK$?aG#jKCn2v^Dh04@tE ziF*wK1++xY9BFAl=?U2BgbdeMr@7dC@p2-B0``lfP#s1ggIb#h-mK_MSJo95ZskjCw{_-Rh(AlOSX-(I(o+W>N zde9yQ_9ZdVgn}O@GZrF28)FS7G6gR0g+_wbKKTvDGFC2LcBl{4TR?FzXN65coj{v} zQp7EPXBr+YtSj_zwnqRbU34PQBW_q7l`@NX17zc_mM{{JIuUU3LUo^>1{!OzLWBwL^MwWISoUIbJ8SfOnpWdjT-vw5`v_j}@L5xo1bZ>1l< z+>{X972aS2Vfq`)Q!PEUD>q?Z24uQ~H&dwI@Nge>T8i~oJjM`RyK>gcyxKL!&iVLo z&kCXpt2~8XR$l&buwc2?-Zv(IP9j1f5YSI`@n1Thj!qw}{-u?;qb*>&^Vzu*@|zFA z!w#;fV)G}>nQjfC8Syj$6O$ogneS#w33C=_HR{&!JlMNN@Aw@)6Mi1X0ELd{s{wBR zZh-OK_TXOZBh{Ow0dJc4YPA`f6YZ0-jGZ54M_%V0?$zNk&nNqDQ{ry@$jm=)&Gy)x z>upRgnhId9XCthwYVmO5yP1n_`3ln%mjrmEBX~a#$UUE#y2{O-@aBIU0^YhloZ-Wx zsk6Fq^5MI|aFUqeY078h<1`(PH+r;mXzL;w{&8~@HknKvA5_l(Q~%>>x~In}S1%sL zxKTErXHTpX%_uTw-Sq0i!K~?+F1>l?DLKm)1&u7mCKWBx88-5n8cGGq-Ym;5@eX7% zNu<>DCMKK#)&sQE68Fk>mI_`hMf*x0=)F0XH82^6FEUW-!!x^;-`6 zY_K;k``Z{kJ7NzC_!vT%o;}vQde6*U;s4-#yaE_5t$=Nu*~L5q{)pMS5#7d1qdX$BP(XS3|EM{gRgR$e%ykW2S`Lio{cj z8zJtgi7PcPx7Io9;lP!9k>I+pH5Iv?jUaxx^B@g%R_?d!)3{mX(ls2c+K*`b&L~^q zY+|Tf+E+9$6u)nJZP>_T<5UBQe;(^SyhQAyEp!KB!$e1R+lw>4_nkVI&+y|dOUuMv z*ldrV?>^*uhg(7*%_iZVlbu68g3he*9XA{Zwk5`31VgSat=S`8QP1}~`YNLXMM%%4 z===S?f{zV>6!3I>4^E^YNPIY`J)@R{NMYp*n@#qV&*dOirHiEry7ELs)$AvTj3_7{ z*_PeMZYCCYl}m#WG;!iTov%;b)ncb}s|d{wSsWbHVsm*{1e1&jv7rqDBR*fcFT(L# z_c6K=V-Iaunc)V|=h?$m`42L>fnX2m9RLdO3hd!b!n=o4ZR-R9qvGM`*CJPV#TGZ4 zbQm0b8425t@12S2mjT%(XpfwzrQzb_;KT}&(}2;=E@xELCsl>1x| z=szqVpcMNY0#;NJuuZF!3a4<*CG$8HQw1Fho4@jEAIju$&SU^K*n{SEZQ1qT3?o&` zne?4eJ7OTCW*~I@u*X{)3^WCMD#2+iJ?bEYZk73Lu^Co$3{XQITXdOFw>w{LNu8A> zQBCV&iG$@mV$iu&H3J_{a!hiI{#xGexT8m^!}vuqCRj!zHjO{S+Av4@o8R;l9X*LA;7z4 zGI;i2*T1`e3MXr)@mR+>3>QIeEAV^Qe@vs_T6svv7oX_la6h_hAuuC9FNq#{_TI}r z!v8kB!X5EIM!0to-Zn)JaJ6&0V)b8TsM2@fG@%$5%SYmLJ0=@j9m2h@M1Q`f=KA5m zj>o+%V&{2o)?gYvA3?Gxa-bJK9H+Jc6;_g_kIR)NM7qW|^8<@mVguq-M(|2Ak?9EP 
zBArI{!^~!Pvzu1?_Qj9@8_C6U7)hQQhQqEByKHxZbH5YMkMxHpp;gO0PIasYhpgcn zv9u#eb62~k*+3Yr7c_yo;SxywE(8cozhk~dn9TAX9bpm~p>6zf*6i}lH*Qr!D_wW* z-DnJkJ@H!K>j57~2Y}}?qt}^rM7~cUPIg0pq`A}dA_|c(|fYr8}7j^0>VzEe&@*b^lo`~x?5{04b0Kl6lE+a zOM*ew#pQ0S;EGeue^rJNyO26#mS9ze#a=XpIyrV{4SB~{*fuNxdzcNjhJ)~LE!u7w5=tcGc%=Fnvo6Ed$y#yTM-v=9rqZec>ae`@iiw@yE0Nu3LbaO0bPs8h_N0%r4DUEg$nQjD&}gTCsqeQ?J{-mC zVdJucYagZz$WVH6f%MAgi=yIV_h;lck|8L6MJr$xDBUdnTmdZz&OTr$W49Hy(N4{C zSXyHK!H1IHj7bcB^K;~sQF3fK5w6hCfn+*X9<_%_U=>R`vb-m@{qM0&ECTnmHt3sYWk5Hexjec)gR4!SeG;LKN@H zHkE>lu!;0O1_MwYvY$mUJ<13ZS?7>OqAgOt>i;Sx1WC3C)e~X$tu4V8RDTZ-I2433 zQ@TZ{?_Wq2CD4Dv%EByK!xi(e2WrTorFvNez6Mr6Z~51gnJLhk&m_;q2kWWR3RJ;N z*w4kw{*Nazs4V5IwDPVD3s{P^N`px;syrF{BcL!!X}gS+d9KCcW3+`m+4r&+L*O2tnvlkQ>tl29$G*Bgi#8Q+Sw)@q<(3Et!P^)2_v+qBhn0GebItx~ zI?yh+ERo=sTHJ}3MeTQXb%1Qc@$ORfQ~nax=o^c>vhn5S+AIW{UQ(Hkh3g5A(PUwV z1}{jBafy4jre$Vt7{pN>^J>HQ9_>jBwdg-O?~fOoqV~^?Mu+6v?PZu|di96XmjmB| ztTk1mC(uqpv8T`nr482$55%i+Th|4fZyFC=MZ0jrqJyMaXJsXv$ zyv;|cOo#;8Ag$$s+f{JG5lXq6YSS zCmE2lT43mwhzEr8eS1m2DqGWYBM;6B!udZbDh>E?}%p(P#%9{!1 z4m&8n%@|3k9zXVPhm57(t#5I~0raEl4L-U-eVblu7~^X=ZjnmDhzd#Q5ZY87NkThf z(^Sa4KRoi>iy&!hIlJS3)THR*;8MB@kQMW;(B8#Vl`kuI1WRxE=I=l8wgB;ZYm@Nn z*4`*=>e8+)q}3Jl`*@<6lV%pAnEj$lAh^zfdP^%Uu?c0(NvvdEg?)~=21s7e+RD#v z4lV3K-eC2g&HXY`VCy!mBYJ^2@bcV>TIXRm+!;L*7}(Bxrj?&X8?}?aM9eefQHw1< zq<+wFLjE#X(4hdz&WSa&U3W<}%wHT24=jfBt?J?Oy%%Wo)}Tg)5A%%?s05g3?)W+T zy7$BB9(@L%Rp*au+pe#V=X*cK*)#YqUY^GgA0bB$!pfR8Buc^dL7yk*(B+eQ%REpY zK5-DN4JDZV_?+hWPA>UazAbzy`_R&1a9Sg(kE1Q~%LH9n0-Xs)hWL$Lecy%AIe#Ft zTn1jaA!q*6$rS4Wa?AD9mfYH&>ti2##<^wtzL=JRp7dRu;1aOXlaJTK&0gC$qSn37 zS5NBej{EN^88694FQ9zi`bvC;f|zesVpZB8BQoD%g$GPWZ<@V?ARhjZ>$< z&v@AeeL0)$a>3j-j4CA|sp09~b81Zs{sLNr`7nqIhYlra{k6Xcr>jy&Qt!>AzFA7|pd?wL6}W)Zd_aaU8ux=Mf7^)SeilcatYpNN zM&tE|dQ)OX#5f-6M(y%jFt$|33#uNjaNo+u6j>6j6)%{HeR%Z zo^s1EHeDQ;F?XS(lc573F7lj7ypf^#b0*d7Sa^DtANB7a?l*Ui%05Aucxal&xxa|_ zE*N6G4D`jjV9mZQRN6O~!wHd=0xlit+B_Q{24wT+n!_t@6RUKtRCXCJM9|bn9yDq6 z&xPk~rZ6oc>OXm4>{4h*i*<_aJJpcr*6*}3z^po5C!`#yne#Npds9WYIs@*21x2Jk 
zmOOeCUB1=EkDA<_B)A^Q`RuW0{FoF{jX|=}y|1@@S!1%z#AjmZV6dMd7v8}3gnCOB znsB!x7vdFL_s!GuzG89q0e6@HbEE1k95%Bs06p2R%xZ2$*Vx@mLuhQyjP6!4qM7zP z{+E8fDUx>)y(@KZ58Xu)zR2}DSkJkv`L)p+OsiWf@}A@~t1v=t7xbCZb`gHm4Zx53 zan9PAe9HuCJsc(KTqR?AX0>zV;4aT?(rBMyLqxy4AnUpkNyA&D`}ogOyy*&fwP64< z@w~=BKxqF?FS!~Uy#GU%=_JeAc7+XTXdQ9UmEp0IEe6rmXsVE3#R+qYNALK%PF#{& zDhYNx3j5P_0nLCtZfnHC*i1zIra0L+b5YKVYoR!m?Yh@Q{jj#8%18Q%5(d;+3B`Wz zxHs(kvwVDBZiXcZWzpdxiT6aDIcwx|*Q*1!>mAxTl?mm1@t+WKD#&^y6|#0O?w!;Z z%Iojj!&4;im6^LF!(%_3j&hgvtz4R9i|wvAm$TPSsw!mZ6sc>hPZl9@oQ@@GQkSnV zW!!T_u!DCX-aGnuWepB{yj|auP?#JgfmQKTpq*8|(bIoYhPBkCQz?o`D1%8tg#G}S zW0{|ugD0FlSXBM!%mbTrMpaXbQU-5OpJEyVF=h|3!wiZ^o)c5A{|@I$-eE!Z6UPLC z&1qo--n*Adrgso!A{}b@#v;6#3m@LIl@%B((Yn%-4JnUUjj!&HbB=7+=h(pLimIyD zpq0vjv3;26-&qw= z|C~9I@#S0nORCla;zt8z+gTqu+vE5MJhiWeuwR2; zQM73yTp$xgxXsJ%y_Z`e+6YvhhU-9{ky0LsnON#KiwxcMPywdkY6G`c6P8#J!zAPJ zNLglzN*Ha4g}VE73af$s_$Abk_mUFU@SvDbK&4Z_jOp@>etuA~0(Ja%RQ4Ir?UCJG zL)M^g3azUWns4@6c(Pk)3Ar;Ji-YPz1ZJeML>UupNLV%l)z}02-2F+N;ED(eutx8` zrrD)8kQ`VC*1p4;$;um;VgrS*$e2Eas9>4^auL$3$Ur^}o`Q)`X{3vCW{x-UCH^AI zHONFqL(n@5lkWl)-x^6ZmJ1{%*fbl}83%$`fm@A0oQPT~a!vyc!Jrfh=@$wj@Qu4& zVHi9R9ibaVt{*3Iv`0^n3=D;`^i-~n{#^%`IHgeFsxLPx2M@66XZ0d*%Z)%V9h6Bt z4yP&m%@)Qt9Y7ipd7!;n-+NkcBljU1NRO3X#`Jk+tHnUl`hkNu@@mpJ#*iSK-wzV{ zn^3|@5^|6b(#qRjB~xaWCFDUju;lyTfxgGAOM_+OMy8qkj2}7Qvxk;DosRpxI&_evI4!Yj_LAmvU>=6N&y}mY*(258ZhuS#H~Z0arj{5K#djk&H4EWHkM#=PpmUbUG(*_Lfaq%Cs<0ag zI;u*Y!9~llheH`1R9>;(qaN>t2^zVX%h zW;Bv!1Pjdl1~S}yf|+BNIc-D&RLzT2Rn9d&OFs;$tCQ@lO=_$eLeloqcz3)Xli{k6e1Bux^|(Ndf(UG(gTCbBuNZ`GankIbT)RaH8 z0auv;3YTLMc#KQAMIQPl1lxTl=Q9IL5zbuUuF*ArC1k?#irvQ!XyQJN-s0t ze^;Mw)td=UPwLJuq;xZ$hr5Y@Yn)LpLN^}d_XmRN-8HdDJRu?Em>Vhm>Oye91~dtx zFSgh$d@nr36HTy2c%ab49c?(}5Xj(|5n?)Yam~ify|Q)RsMiD~ZBoBQk?dA!d9a5f z1!E9yE$X5rzY4!9$q@yHw>DllW26Cs>UjgitF;lfig)SC*EyVw?@F0lK7T2|lPKIh z28}TWrmK$km0FxpN`p~|@l#^J>i+``hYqGnRl zajv2ZKd6O$T&|YEn1E_SnT3opa65XzXSNzzGs5K&%VyFc_F%^67Sl<)s7$wNvhKS<9>q*c|llxqg|}&u>8+;T>-PGlBnm zdS6XoPx>do#6M2JUSxpP9Kc`hmm~_kp{>KOFb9~}{;$6rpz{Y(>- 
zmY{*QD?quR(yrE%Jc124TGQMU*9dW4J@hP`(+8Q{gIFN5V&oewcV$;_&gR0y?1Ljp zXUN-4%10fau=P7cpHH;e=#99rh;?3cyT=oZ#eMiGDpZL1V^CQ|iG@o&{AFrdj7Z`7n9Z-0kjo^g@Pj)YcC? zb?bgxoAS@zA1?D43_E~;9|Z(F(w_mZZ)f+fnbiLq?|^UzN_;#UJ*(CZ_+yrry~X)h}I>OihcI zrD;(}daw;VL#DU_sKm?ot4WW|Xfr*XTAX~Vr24sD4?^r>k>|5?MLr_ZG(S~nZxQ$k zQ84GeKE`RkWjQ&oP0KCXZf8|gFm<&nJjc#A-wmqLY>q{KruFTYZ%R6@XopqYU|X^I zfn;hy1blak6z@KAv-!v9D7Y&%vA};;-NA>iRm2f@AiD$V?}$NPGWACkr)j35iuywS zL7*s&axG9VN<|z#@jNP;_Z~YUL&tnN2{yX~O0f%~BLV`^fy&J$+;tG13xfYs{<1K7 zQ_PmAR7GQ5P2uRM#uAu6A7?*nsf{c9tF8NU_5{$)M-A6>Bj^&S0rf&BGSn*1pT1_!b6L+@PANjLF? zVK8)7it?IG5~2-TOhVdvba6y&y|Xx^7PsiF7vB+LS5NudkpcTJQ>?TU;mPC}81TB= zhr4)6$f)Goobkap=p1m9a!*0+aqdbgDdA<3G}tO}*M4dzl0AvRr~~O^nE`QJd*BJR zs`W^nO{X|}FA#9=-e-}+-*Y?X4r=D}y|Z3ox9ae+8=Di$HZ?u`>;j8sJej{5Zg+?9 z%)!u~hKXCuFYvG)k`M?ourLV)(m5$Im=#}$bv#X!llmdv#61?Xwt*My1~V0>ODp9kaV4vp9($ieJ}fqo zYd_;m*-ATO!2>LRfLOVio|it*<`H6iO&~7oEH=MHSRbxLPOwrxRe2M>&gl83jMk#Y zSd?)CeB064stowk`So7OGeCpnzuShwicdMCfb;vmwGF@YWd1iA{>Nz0O?3I$HjGkS ztBC$;8-7NEtok2qLo}dmpfp@dfyZ@UZ^N>yQo@`e zp3=?agGpJ-@9>(oZ*d)O;!1|qAN1NF!i`7swTphX4e_l_r9C@o;Sj16+$!`ktUx~y z*!gZ;0jKld;V_$#U-kbAhhNGo{~HefmvBgC{Qq${0E5s1lFAT30E==ULZF>ZLz;d- zzo7%*p^N&*DgSF#Q^?lF$=Jq8SLuVDv7`3SZnnJC-w85-A^PqBpJoN*TmWi-f4rUm z9rb^8Kz;?&YoI?_i0LChC6E)~b@S7`_xFnokl6V(qViAA^w;peTdn^F0|6-jGC#lI z|AX=RYo^!U*T1=70Ao9UG5vIh`j4ydYy4{);@|i#z;Hq!cy{{HrF$Noq3 zYrCW0=xCz9(67ypURU6?cg1fCFv`Cu{&cc<4gb56?r$&(#zy`MWObpDbr=uUY=_Klyj*(mx5{I9?O{s>VOE{9U#4PZm|q*DSw!46o1Q z@5*(55+w4zCiqp2e`NW)X4;=Ddi<|he*IFtuEyWB#r`DtAn=;tS2g~T<+Xa&Z=O}b zze4M8T3WAb@p^>#Z#<{ozeDpsLf589B+`LBr{eJdO^pNkL-{3#p-M;4f z``yr=Tyy^a%=PQy_{U$YzxSp8(}-Adq?a~uFK$m=K9?&la~Sq*u?$3 Q*T)Cy2mDq7hyC38KboQ4tpET3 literal 0 HcmV?d00001 diff --git a/tests/data/docx/size2.docx b/tests/data/docx/size2.docx new file mode 100644 index 0000000000000000000000000000000000000000..ec5c69484e9a6a678d72801179dc5585080527a0 GIT binary patch literal 9952 zcmeHtWmsIzvhHBP-QC^Y-6goYySux)yC%3h0fIXOcXziyus{fxPxihcXMg9pfA0Bp 
z);zPOr{{TVO?7wmTU}N1(jcHH00002uwWdcCat1+v-J9n7Z?D52A}}+98Ij9=;{8r z0|5|Te|URUCBByi5rhf7d`1<%ShuDQmvHk!EMQM~1nHG=bu&K(RuB9tVk`WT%)f@f~5-Ez)4D);V&xp=u1&iGJO|IRMsWpJ=tDmHnFY}+K& zllG)lnPX&lB|!SFU~vEAv@+9{N5So7$P9z~nA->+c&z6{l-%faU3;Ynryf(=qufW2 zU9AJw2;qB7zvZnn=*IzW`8R_CO1Oa9_Nq8G5CDMqIw;0=M)Hn!_D=Lhc8(@=?l#u{ zX=rR>-&+L>Ac$Q4gvw}dwj}`7xJ^|;1D4dU5ni1($bm2B#`N-XA37GH|$Xn&_W0UElUzJ~ceIxv;yn(&_ zKk1$`F%H$sfB?GXl}EINGTMbqRX)1s;iXn`Fq;MEVZ%R?Ev;$ZJ3HP@;cuE04Mr? zO{JTiqcQ#Ke7e|}*gF4{I_ElCPP1YtzK3e&EVC_GBaulQcsQZbtzWufg!6f7ikdpA z3g;^qTziq_rMQKm;m?)uM;kA5Bq=EgDYxk*UP{f>*jG6sPC2S0*g+ER949V(56|50 zI1F0t9cG=H=u8OqKxVMqWe8rWkTj}xzzGS$V(P#n@~tkmUbuz(W>uD&C~hq#ys}4{<=JE$ zz*gUwudn8|>560#`=!X|!vgOqN+3NJZGv zVvYa+#lH-@pVV7gD68xkFT|q7KQK1H=X#i&qfAT4NNbU&-L+v~DHMyY`cz-a2UojG zFo*UEX0L@?hL7zbHL6LsDr&9G7t`}7v=ko17{qeQKNF{qE&z7EqO~8~N0pR*?K~Vv zK5SMb`XSc5=e_PCwH0;h(IN@WUQ1zN#*N{~(izsR2Sq4^jnK&>?_4n+5|QhYo~7P( zUw~<}kS`?(MU8av4+gDJl@ZJ8;2;+BT$rj* zV$mgXj8QF6u6JzmPwT#CN@)ZKU@+hqE(|<+-cb3XoQpk83*NXjB6oz{hh;~w%9p}dIcVoSr~K1wjh`bnh9-CI zn#|FE4qw~eVEycj=nA-deh4dkrZDBqP7t%puwXd&5zq;)-x1yg;;l#S4~#Fc?|Or_ zv<<0DJ~hNYD7RbkwGBK4*wUbIXh#RboZq)>VDfrm8hk4icBQ#u5mo z`DVmMU{$wbzOk43abw`@)v#=oeA6;m-7dPz8j^e;hn6qT3K>zas9$k*J@xYL?cOx3 zJoK#D0)=5}H&MR6cts{IetqsN`OCWy;-~DKC}tRszW4ZL3Ro8)@xhhOCN4HAlW~d@ zWL!MOl66_v0)Rl11}-5uPwh}|SR{9i4}!&Fd$x3`Q6+R>{#n=3%E9obZF4DH5&c}& z#9?-*95xVRxq+Is_$3>Z6sCpqd@+#7_SoQ?9IWGdvN{cnX9NbTBhOz3!f#^; zThe)K+NMs@;ifb$j9RA9&cCibXBs1O4**)@^vwDZYiQkd3v5q5jZstX!YXUclUgae z3{;|IGKXaEuO3H@L!ukPP;6o9)MJ0z)T?BTFy!9H^_7knCd(aJ8o95$(bw05t7-GX zZcv>l5E#a(W0A1;`Z8`cTJr^nYIfywQU$N+H`(fkqjBysT_cFoy7dwWp0&;xb%PQE zFE#KhzDmxa^IEv&NAVVHpKnLHj*?^e8BiwGoU#;`i2i3#BC=v(a#;BLwPvisr%vE` zpNu&ANR|iWuJd2uX6g?cxMwEEAI0;e`14*w9LYNC(#X>NBH;5yA~|-9ZBYUaAHFtN z(^jEOyATU-DyUQ4lumsVniQHIo7^!HzG|JNpRPEH1B>QEFz4gWR*d24;b681g>@zI z1*fimX&mm%6O4Jm_!bqdxvT}|hPsAM=7#YgsNd)`vqty?A$t|RgE&A8-A5eu^Q2oL z+L0`FBJu!boH8@(+KzJ~upw1?e;zr{Re7P%TwuDpL>93IIq8_PMdS|900{IrNUD?)3M2)Rx;0X1Ig2P`k|EiE%r`e4$}}43BFac` 
z6nGjIc$zX6(FA{@fB~`qm6T!%Q^i+N$Hr=n<9q08esR7ry5o0HT^f}Y%53v;F({jq zol3{)QbT`;l|NomI8IO47q((J|1`xHw< z`cVipNdW;&v+%Dn5W|8=IHY^2B*Ics2?L}GG8CoevUKcpHJK)yp&|v|t%<^pEtXE^W8VhR}IggW!Bbn{dvxT-`p=>wriikonHzMphwtAd?6zv3ckD7FRL3 zG7Z&j8f?bdiSfGwM<(WkxA4KYw6tUoEh1D@?~6D&@lJ6LG{pUi)|Vg7F4s{n`#XJy z-o-bui2DH?Y7O2OtWa5sLY=COyV1T=0q_ zt(S9!4;Z|ZiKp*f14T0l-wThHZfNKfWqR|RA4yWC3Pq^r^>zyMGY<4?f^rWLb4z7h z7P-4N>OQq+tQq2wFpq4r((-g*btM%f|_S!94Fat3$aBkp|V(|N{=TEWz*znX9_0# zsM+hmi64hkPnOsc%v4Yss%+==*aS+3S|6t!VI$=aEqexp{dAHVqF_Q-^2I}}P?w>t zyc3g1RHio4B=iC5U4V3DevYk%m#Vv1J@@qt+=l==K_~7#Ow9yo$*IRg0fQzv%2W!{ z`$7AD2co^IZmR|efeT;c`*xq;sU)OIIo--Ar@RF9tnbw*o=fSYMD!3^-tD!U=}+?W z)cS!tN9H8gq;p9qm0Sac5q2}ZCZo-jCLERPBCs^2Hu70CN!`3;1a4U@ps`kA`%LwT zn#MUw%)ZXu^5(QNa5TqM$B1NHrc9Bw4VGP6)%?&Wcj3(oj_P%t0)(sQL z?HsP^6Vk@R%h=-dbQ{_3ZHfhvg&`fpq)vJCSxd$2>kW$07<{sD+pw4f>>*_;A1f6O z=$GE25jMY*WXE>?J2{hD?SeUJ;-lJT8}(Y1OQr|;P!dhaWYR(ZQUqqn1J0f-^NY=l z;y!EeEqSDUGhFRL?W&^_j&!8COyL;0VqR~i^)%lIMBnQlOJ_i+C^Ha`V2R{$&|Q+K z3pdpb96)xA*=zm{bjlk(m>82Rjj`Xr`?-2JXxs`jQVinzM0!VRxSP+^PU@=Fm)Dpu zIO&8|SbhW?@npDM3J9^FaVH#Ar;Zc|tER&QVQUePcA<%h(hqBNr6up{epb6SuWSs~ zIcvDRA#i+&2MG6_s`e<>3W5or67Ebt$eB9_AtkAi4lWA$;w~^!^o%6d_L*LL+iSpH zs?J~Zc?{Tm3=qL7#Jo(UTJK{9+Yn}H3L(mX)0!e^^HD%VK|FpAo;g!1l`)8{R?_8V z!qtL<@`=kD>F(IV#v+~j?AW!hx|$MAvj$zz6-~BJ`ytt*$6A6$=V zG1w-e7^7#`Sa>u{=1{cWZ^W7Xb4!@aE+JFBpRinlou@mDHcBRZoa@glt-aG_{vR-y za%NcNcX7Q>z*{>|&P&n9T5SG^6sA%PSo6h>sC1aYFoRY3V6<2HWoNe=>y!2l{AB~+ z?leDZl+el6*0X8c-cR!TRvN=ktxlSm4b84Tbj++GF=9!1g4WEF_#v$DlQ&Aj7rYOB zNt&tM+cfz!XeDhci=LY+z{pkB7dFgh|;x z2AHr*uSg2F)Orx1`4tNxA?>a1HgJ0-x29x%mF%Xo^O3f5P$7w+b)*-*C#9YJlRhj) zef{g|`1@}%P|hptd#)jCs*`SUjw%zWMjM{!8YC+Yd}#Gib(1NE>jhxBPk1_`n)o#R z&am99k5e~2R>rb@l?qWbUh-(eBiB4~(XAj+A(g{KQS3Ma?Jf4e&fo!8!!Y_#877MF zee_3g=;j%=m8O*D){FQA2ufk`h+PtW5^Uj`?+&yUcWQJ_gVQ(pqr?q6#YmfDXV4Tr z+g87Pd8f6M^^kM}HtwImQb>AnMK(&#pbXv?Q4vVT`Vx2Iz2$fZ`Icz3rCr=^zyJVw zRR93>UjyiDZen9X|L2|YAL&MW)p3;#!RJ!V{vlUBSwi|+WMWB`GX7YzPTVi+sjAM**M9Bg 
zS(#Buq?Vs_9}K7c7b!{v3qJFz!6Q)-<|)00Jo#E6jug6@-b6j8zZ9|c;J zp-G;Sc>oZ59e~5wFC+(>x&fNSvyDl0n%A|{AJXfXUB-_hSS-hq2eyHs-VZiblToh< z)W@&(R)H~ENEe>OsoPAjk$`wqC)B2jE?QZblz~PraX+z5kQ|h&TNE(^SQNU740L%c zxgyH3leET)^Aj$%>Uza?Ymg5Py2YiOo_#HTcFbglvKkbqi`ZdbY%rkPjQBHPS+*bI zCeL06D(%5xIW-azo>SUJPvXjM-<~bpbV)=UGG6B!8JDzYV%Qy4PBMF)!27I1Nw75# z+atxxFdfNVH`yRIhaLtB?1Ol_({5m1HRh>Ybg!D+riRN%u#!6_wJ@pdrw&^^hjo^m zf8_zO&n+J?-|-cv&2Cqc{uutwQd}}8r0sLk#>yc$w@+r(LKObev6q}?V-vxpG)~yz zJ!kCk&8f0`^f$|J?7KMkqo+6*B}@fuX+OZR5n`>U6~COE!Clp;xTd9PSLU9i0&2Cu zp`?+0TZLUykjy*!u1mIP-W7who-LE;^ZTWhuFW}Fw+OD7FwUgY>`ycE=;sdl} z(#G#vdFoKsRXV?M%m?mIEa1kjPTUceDBwhOcGsPdYlwyn>=bVG=kMQU>B_=eprHAn z2Wb^;-zu8hT3%={L8CEBa1kofMi69{JZ@F-^@{f^Vkm1CT3aPWW_3Eogf%eFtt^-o zfYZOMAqm<k6eC?idvBw<1^O(bC;~_xFw)vZ%`7#Z(T2N6RFs>Zu|?(Kx`Is=kz}iYU78 zd?Yg(o+yA!uP#*nSta5v=(3(6DLC@}g5O|plni~9M0Q5bL%MpGhNiRN(n{-CF84%q z+W7kg=}KgDqyBkH-Tc5n{3V`XoM1#sxYtB(RpvrLg9>p?(od?AM&k@^v@f5HZSQ)W zofBgDL2y@5D8m)(jl*#SV{aUc)t!uk|7s~a^$W)mh(B`d%;Q`o5DzEt#Xhi#-^Ksg zI-AGABo>b+_QAni#K|D|vz0N2lR@-X%haR|aU;bf@Jb0w!@*qzgQF|plU6G9DM8N{ zl?+CK0y7s2^>thZ-$SGVCg2U&H7|Amwey@GyZju_v#o%O?K}5!yLTTheVP5M56GQa zF{;EP^&9ax1E)RrAjD1+h+jKm-*gNkMlTSH`|g?hpcbmB$we-(%f|?`-*>$rcJ}6N zeN5}l+uAJ-5%k%Ecz4TK8i&)9&*uml^f-(Nb3!c6@V0~BO};C9pL@OQj`HQ_;a1uU z;4O8e&T7cMe~q>JzjE&%A;&+@DCQ;x#wL#c$QngSBd^Sh5PHcYx(Cw2Mkxf5QQV;n zW-Ac_ZjVb|SIfJwBzd&DdnP17L4{7=;LZHg#_nEEQ?yBdyrF zXr6fa(zY3vmN{Ce(g8(D(bLWXhg%nFd=0CTp(hQ*@}Lhb3gcSa(E*@qVc_kZMgq?W zHQ-&kD`kO;X&$2!I2O-ZKw9Og2$HS=N|z12Abgn}F73|`Nvy_fLH31AQZghIE;%I? 
zr$_KY4Tt3%UP5ak+u^0+CbPkPg3>J zH5;yd*sil9ddcBfn#?JQ=sipU)3md9lw82>0VKA46HG})@-vw`5&nJE*nMFu>)yXb zfd8B6VEQK#|KI8O|27>J_8nS#uh!uL{)=^(+Sxh(H`XzpGGe>>hjn;B1-NNBo`6;% za6%-KQA`Q@e}WfKB94G%|1zbzRzlyjNQ;DVOxS6kv7a?9C)L*GJ599$l-NjS6y?!N z#L#;%lZ^RP*O}*Ix>(Ur!9~j>Mz|m!iCOX4%>pN>RKO`y3!Ll$Dk}4-c7Ul_ln`u9 zqi$Z`+dG#ejWKYCZ>~U!%aFIqb!f|R&&6D_B34=J3$7-RNRQ_tr}z$Iw{UOwymlsH zgTyN~=`J}V6_|FpcQSb1;@;n)FM}!dB^fX2fUr!hh!ziuF447Uz7VVfoPl>pX#iKI z@%qIa-n_*&-@+lfGIRIa0xNK<{Ty~#;bzMWT>83J<^>t$8RClD?9!H3jfuS~%?#1A zDpr*DM*^d5ie;)b&qcVnC0RM&8iMj!Ts2JU%3FULTb^9I(mZhX(fVtevus$iM^X*l z*aG2j`ECcO@6Ha>!Trb=B^-q=8=YCUM~n9h?od71!O56f-$pGr@++`~lM2rLePQRm z?d!J|b=e%v>q|ZM`qT;Sx1bK0YggUVQ8WkCd$?(yx>4u83_ne|77@l-xnGU5N{%=n zX1YLtegbKZ#IwWe6np;z!yWj2*9XRc-C^GB>@Ou$80kNP8o89oY`aJtp3rr2p5MLQ zLv1^I!T!r0`qwJ?A2ix=-q-cP5lKne(ZUjoOCMrt3#*#GH3r1`%^*q?#)Gbv15ne* zz}G3A7={sI$T#Ou)EEoXI7ruHGMqV;sLERg^o0f(eFE&Xz)iNFv^y_2p(29?*)*xJ zP)HbjR7yH_i{PFbF7pn&kk)MC(~F9ylxA3=U`n++#V5(O6x9Z_3#@MMSf!E5bc=x( z^GS#lT#HQl1Qq5VmDbepuxdW3QjgN)=iy~9GmW`L{$MNjvvXTblS~}aNEWhs&>+JT zQiUYiq{|^{cJ14+?Z(j zYzv>;b^^Vf4zynOBahdd>g;Ri?T-}VUtu>BXJ-prvwsY{#VC*2A~T|_-%}IS=ugWg z@|Av&txzpG2?SPz!tU?Qa4@ZOw;sDA5h+bX6yga+fEK7Cz(%uHCuR~tMaoP3?*AQ( zXV*bELfM72^Kk9d%jZI}*)^A>kWoa)P|Rma(<1QQ2Owz181JuJ5BeU+XV&yKFL*2jQy@k6NabM^J8^1*g+sj3^;z#$#t+_= z))??ffSu2&(U5Z>$kcaINpn?h4gr<(R~x#k3@{MV**WKY7n2Frt_4V1Y^26pvN5w3 z>#u(H2%j!B;ER9ohn7M8D#g(w#NYrb8eHb=J?qHp^ijbnZ{4IkEAw-5Xcz?68CC>% zbXiD0g0$?GJN&z#($SN__>YlD=HX)lyp2wpP2*?*kkL4hAEXuvGMw=fxN4C~Er+V1 zPx_Kejh0TS2G=24GP6=c%M8Pe@Q5w%hD&#~QbmPD+$`MbEA!eYps;ZmkT&R9S#(}L zQnm+Ct{_9e?b&Wp3FvdK*-eIzIc}or_k# z_8d>%Ys*C*#~D60q?-=hs4?==7k5RSX(aH{*QbfPsv74-Su1Ay>6~?iMROcQ0!^5D zIx;Wfs~Bl^9x5$i&0bEx&|^lzyf|b?+%aH`Qe%&As*6pN{%jbz5KL zRz{AvUXO<>IAOD@J9iD#qflAyLX8|Uh^;aewZghs{i%7s9PlonE_$RI-l_7u*6&&s zx0!2b0jHU5XaTpGZ-@=2S;6osomh|o7ejPy`Y{=SO=x>NThHH;cWra_Zbt9z+GPTT z{PE*86q5G3c5(i-cK<32_%(C-k3|Aa$?yMg%|h;ZLs-3bNCe8{b8Hb5TPSe4Ig;=b zmQr~MJiRK#8q1V5Xx8?zS1-Mou{_$JGQ{p;Cg8=ODe7@BEKM>6-GDbv7u#Xa%IEK3 
z#U-~RFTqbW+&^~s=MM8ZU8uw*-DK32_DNc;kx)u3I#AD-oW-YkbRO6q1I>q!ud)NL zbyE7zgGaqwvDt7_cr%_@3B6ZCSR+I0WM<{|;|OQCTG>cT6K!7O0SwEWJv!`WAc z0_~9J4CjpDz!~x}b{k)D$154OjoEiWH2THe%NKLEX7<9rfYfhc`e&Z)WZYes1BWXsWtx)?)-CYm>0P^cdY6-svlwK3hddeR5CQds4dymx{juecTCRlel0&;LIVfr3kp=wQ2z!&9Cf#== zGS=D4WmUI6vv}s}nkBISYZ!rJzi4`mPjv4H|LjS!OZtv|#M@&Y$33a){O~D16_NH+`*qbFq1dild z-tr)^NIu<94N4G7$OLfCFBs%YXnZPoGs9Uy_csg{Gpmt1ikIhoAVdy7fB0A@-Z#s~ zJCE>Na`?fo9z8pCyTGQU3uU#d=}lJvUp)T!N%tV|aHWvt&aOQ+UXa+<@blbC5vC-Cco4o$}IPua!Lj0Nm@#?KO0uMEu9M{{ujN88QF> literal 0 HcmV?d00001 diff --git a/tests/unit_tests/abstract_converter_test.py b/tests/unit_tests/abstract_converter_test.py index cd8e16fb..a05daa6e 100644 --- a/tests/unit_tests/abstract_converter_test.py +++ b/tests/unit_tests/abstract_converter_test.py @@ -29,6 +29,5 @@ def _convert(self, filename: str, extension: str, converter: AbstractConverter) tmp_file = os.path.join(self.tmp_dir.name, filename_with_extension) self.assertTrue(os.path.isfile(file), f"no such file {file}") shutil.copy(file, tmp_file) - result = converter.do_convert(tmp_dir=self.tmp_dir.name, filename=filename, extension=extension) - path = os.path.join(self.tmp_dir.name, result) - self.assertTrue(os.path.isfile(path), f"no such file {path}") + result = converter.convert(file_path=tmp_file) + self.assertTrue(os.path.isfile(result), f"no such file {result}") diff --git a/tests/unit_tests/test_doctype_law_dynamic_classifier.py b/tests/unit_tests/test_doctype_law_dynamic_classifier.py index 2ce8bdc9..8fd89960 100644 --- a/tests/unit_tests/test_doctype_law_dynamic_classifier.py +++ b/tests/unit_tests/test_doctype_law_dynamic_classifier.py @@ -21,7 +21,7 @@ def _get_abs_path(self, file_name: str) -> str: def _test_document_type(self, file_name: str, expected_type: str) -> None: config = {} base_reader = RawTextReader(config=config) - unstructured_document = base_reader.read(path=self._get_abs_path(file_name), document_type=None, parameters=None) + unstructured_document = 
base_reader.read(file_path=self._get_abs_path(file_name), parameters=None) result = self.structure_extractor._predict_extractor(unstructured_document.lines) self.assertEqual(result.document_type, expected_type) diff --git a/tests/unit_tests/test_doctype_law_txt_reader.py b/tests/unit_tests/test_doctype_law_txt_reader.py index 62d3e739..9a802723 100644 --- a/tests/unit_tests/test_doctype_law_txt_reader.py +++ b/tests/unit_tests/test_doctype_law_txt_reader.py @@ -18,10 +18,9 @@ def _get_abs_path(self, file_name: str) -> str: def test_law_document_spaces_correctness(self) -> None: path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt") - directory, filename = os.path.split(path) - document = self.txt_reader.read(path=path, document_type="law", parameters={}) - document.metadata = self.metadata_extractor.extract_metadata(directory, filename, filename, filename) - document = self.law_extractor.extract_structure(document, {}) + document = self.txt_reader.read(file_path=path) + document.metadata = self.metadata_extractor.extract(path) + document = self.law_extractor.extract(document) self.assertListEqual([], document.attachments) self.assertListEqual([], document.tables) diff --git a/tests/unit_tests/test_format_docx_reader.py b/tests/unit_tests/test_format_docx_reader.py index 49355997..f26c1e27 100644 --- a/tests/unit_tests/test_format_docx_reader.py +++ b/tests/unit_tests/test_format_docx_reader.py @@ -4,6 +4,7 @@ from tempfile import TemporaryDirectory from dedoc.config import get_config +from dedoc.data_structures import SizeAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor from dedoc.readers.docx_reader.docx_reader import DocxReader @@ -245,7 +246,7 @@ def test_tables_with_merged_cells(self) -> None: def test_diagram_annotation(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("diagram_1.docx") 
- result = docx_reader.read(path) + result = docx_reader.read(path, parameters={"with_attachments": True}) for annotation in result.lines[0].annotations: if annotation.name == "attachment": @@ -253,7 +254,7 @@ def test_diagram_annotation(self) -> None: break path = self._get_path("diagram_2.docx") - result = docx_reader.read(path) + result = docx_reader.read(path, parameters={"with_attachments": True}) for i in [0, 22]: annotation_found = False @@ -283,6 +284,26 @@ def test_docx_metadata_broken_file(self) -> None: path = os.path.abspath(path) self.assertDictEqual({"broken_docx": True}, extractor._get_docx_fields(path)) + def test_annotations(self) -> None: + docx_reader = DocxReader(config=get_config()) + path = self._get_path("size1.docx") + # test 'pt' ending in size and check font size value + document = docx_reader.read(path) + for i in range(len(document.lines)): + for annotation in document.lines[i].annotations: + if annotation.name == SizeAnnotation.name: + self.assertEqual(12.0, float(annotation.value)) + + # test that different annotations of one type don't overlap + path = self._get_path("size2.docx") + document = docx_reader.read(path) + size_annotations = [annotation for annotation in document.lines[2].annotations if annotation.name == SizeAnnotation.name] + size_annotations = sorted(size_annotations, key=lambda x: x.start) + prev_end = size_annotations[0].end + for annotation in size_annotations[1:]: + self.assertGreaterEqual(annotation.start, prev_end, "Annotations of one type with different values shouldn't overlap") + prev_end = annotation.end + def _get_path(self, file_name: str) -> str: path_in = os.path.join(self.directory, file_name) path_out = os.path.join(self.tmpdir.name, file_name) diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py index 1226cd01..5a21ae32 100644 --- a/tests/unit_tests/test_format_pdf_reader.py +++ b/tests/unit_tests/test_format_pdf_reader.py @@ -60,9 +60,7 @@ def 
test_header_footer_search(self) -> None: filename = "prospectus.pdf" path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename) shutil.copy(path, os.path.join(tmpdir, filename)) - result = any_doc_reader.read(os.path.join(tmpdir, filename), - document_type=None, - parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(os.path.join(tmpdir, filename), parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) lines_by_page = self._split_lines_on_pages(result.lines) @@ -79,9 +77,7 @@ def test_header_footer_search_2(self) -> None: filename = "with_changed_header_footer.pdf" path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename) shutil.copy(path, os.path.join(tmpdir, filename)) - result = any_doc_reader.read(os.path.join(tmpdir, filename), - document_type=None, - parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(os.path.join(tmpdir, filename), parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) lines_by_page = self._split_lines_on_pages(result.lines) @@ -98,9 +94,7 @@ def test_header_footer_search_3(self) -> None: filename = "with_header_footer_2.pdf" path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename) shutil.copy(path, os.path.join(tmpdir, filename)) - result = any_doc_reader.read(os.path.join(tmpdir, filename), - document_type=None, - parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(os.path.join(tmpdir, filename), parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) lines_by_page = self._split_lines_on_pages(result.lines) @@ -114,7 +108,7 @@ def test_long_list_in_pdf(self) -> None: config = get_test_config() any_doc_reader = PdfImageReader(config=config) path = 
os.path.join(os.path.dirname(__file__), "../data/scanned/doc_with_long_list.pdf") - result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(path, parameters={"need_pdf_table_analysis": "False"}) list_elements = result.lines[1:] self.assertEqual(list_elements[0].line.lower().strip(), "1. январь") self.assertEqual(list_elements[1].line.lower().strip(), "2. февраль") @@ -134,9 +128,16 @@ def test_pdf_text_layer(self) -> None: config = get_test_config() any_doc_reader = PdfTxtlayerReader(config=config) path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf") - result = any_doc_reader.read(path, document_type=None, parameters={}) + result = any_doc_reader.read(path, parameters={}) for line in result.lines: # check that annotations not duplicated annotations = line.annotations annotations_set = {(a.name, a.value, a.start, a.end) for a in annotations} self.assertEqual(len(annotations_set), len(annotations)) + + def test_table_extractor(self) -> None: + config = {} # Has to work without config + any_doc_reader = PdfTxtlayerReader(config=config) + path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf") + result = any_doc_reader.read(path, parameters={"need_pdf_table_analysis": "True"}) + self.assertEqual(len(result.tables), 1) diff --git a/tests/unit_tests/test_format_txt_reader.py b/tests/unit_tests/test_format_txt_reader.py index e7249651..17e7482d 100644 --- a/tests/unit_tests/test_format_txt_reader.py +++ b/tests/unit_tests/test_format_txt_reader.py @@ -15,7 +15,7 @@ def test_read_law(self) -> None: file = os.path.join(self.path, "laws", "коап_москвы_8_7_2015_utf.txt") uids_set = set() prefix = "txt_6210f1fb59150aae33a09f49c8724baf" # это строка, содержащая хэш файла, который обратаывается ридером - document = self.reader.read(file, None, {}) + document = self.reader.read(file, {}) for line in document.lines: 
self.assertNotIn(line.uid, uids_set) uids_set.add(line.uid) @@ -25,7 +25,7 @@ def test_read_tz(self) -> None: file = os.path.join(self.path, "tz", "tz.txt") uids_set = set() prefix = "txt_0e576a9e0008225ac27f961af60c0bee" - document = self.reader.read(file, None, {}) + document = self.reader.read(file, {}) for line in document.lines: self.assertNotIn(line.uid, uids_set) uids_set.add(line.uid) diff --git a/tests/unit_tests/test_misc_annotations.py b/tests/unit_tests/test_misc_annotations.py index f9b76470..088b98dc 100644 --- a/tests/unit_tests/test_misc_annotations.py +++ b/tests/unit_tests/test_misc_annotations.py @@ -173,6 +173,34 @@ def test_merge_1000_no_intersection(self) -> None: result = self.merge(annotations, text) self.assertSetEqual({(a.start, a.end, a.name, a.value) for a in annotations}, result) + def test_merge_space(self) -> None: + annotations = [ + Annotation(start=0, end=6, name="size", value="12.0"), + Annotation(start=7, end=11, name="size", value="12.0"), + Annotation(start=6, end=7, name="size", value="1"), + Annotation(start=6, end=7, name="bold", value="True") + ] + text = "normal text" + result = self.merge(annotations, text) + self.assertEqual(2, len(result)) + self.assertIn((0, 11, "size", "12.0"), result) + self.assertIn((6, 7, "bold", "True"), result) + + def test_merge_only_spaces(self) -> None: + annotations = [ + Annotation(start=0, end=1, name="size", value="12.0"), + Annotation(start=0, end=1, name="bold", value="True"), + Annotation(start=1, end=2, name="italic", value="True"), + Annotation(start=2, end=3, name="bold", value="False"), + Annotation(start=3, end=4, name="size", value="1"), + Annotation(start=4, end=5, name="size", value="5") + ] + text = " \t \t\n" + result = self.merge(annotations, text) + self.assertEqual(6, len(result)) + actual_result = {(ann.start, ann.end, ann.name, ann.value) for ann in annotations} + self.assertSetEqual(actual_result, result) + class TestAbstractStructureExtractor(unittest.TestCase): def 
test_annotation_extractor_left(self) -> None: diff --git a/tests/unit_tests/test_misc_tasker.py b/tests/unit_tests/test_misc_tasker.py index 8724f046..8fd3c66c 100644 --- a/tests/unit_tests/test_misc_tasker.py +++ b/tests/unit_tests/test_misc_tasker.py @@ -8,7 +8,7 @@ from PIL import Image from dedoc.attachments_handler.attachments_handler import AttachmentsHandler -from dedoc.converters.file_converter import FileConverterComposition +from dedoc.converters.converter_composition import ConverterComposition from dedoc.dedoc_manager import DedocManager from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition @@ -157,7 +157,7 @@ def __create_test_manager_config(self, config: dict) -> dict: } return dict( - converter=FileConverterComposition(converters=[]), + converter=ConverterComposition(converters=[]), reader=ReaderComposition(readers=readers), structure_extractor=StructureExtractorComposition(extractors=structure_extractors, default_key="other"), structure_constructor=StructureConstructorComposition(default_constructor=TreeConstructor(), constructors={"tree": TreeConstructor()}), diff --git a/tests/unit_tests/test_misc_toc_feature_extractor.py b/tests/unit_tests/test_misc_toc_feature_extractor.py index 7389c15d..bd45393b 100644 --- a/tests/unit_tests/test_misc_toc_feature_extractor.py +++ b/tests/unit_tests/test_misc_toc_feature_extractor.py @@ -19,7 +19,7 @@ class TestTOCFeatureExtractor(unittest.TestCase): @property def document(self) -> UnstructuredDocument: if self._document is None: - self._document = self.reader.read(path=self.path, parameters={}, document_type=None) + self._document = self.reader.read(file_path=self.path, parameters={}) return self._document def test_toc_extractor(self) -> None: diff --git a/tests/unit_tests/test_module_attachment_extractor.py 
b/tests/unit_tests/test_module_attachment_extractor.py index eb79ebce..ac816cb6 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -7,7 +7,7 @@ from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor from dedoc.dedoc_manager import DedocManager -from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader +from dedoc.readers import ArchiveReader, PdfTabbyReader, PdfTxtlayerReader, PptxReader from dedoc.readers.docx_reader.docx_reader import DocxReader from tests.test_utils import get_test_config @@ -41,7 +41,7 @@ def test_docx_attachments_extractor(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: shutil.copy(os.path.join(self.src_dir, filename), os.path.join(tmpdir, filename)) - attachments = docx_attachment_extractor.get_attachments(tmpdir, filename, {}) + attachments = docx_attachment_extractor.extract(file_path=os.path.join(tmpdir, filename)) for _, file in enumerate(attachments): self.assertIn(file.original_name, attachments_name_list) @@ -72,7 +72,7 @@ def test_pptx_attachments_extractor(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: shutil.copy(os.path.join(self.src_dir, filename), os.path.join(tmpdir, filename)) - attachments = pptx_attachment_extractor.get_attachments(tmpdir, filename, {}) + attachments = pptx_attachment_extractor.extract(file_path=os.path.join(tmpdir, filename)) for _, file in enumerate(attachments): self.assertIn(file.original_name, attachments_name_list) @@ -86,7 +86,8 @@ def test_docx_diagrams_extraction(self) -> None: files = [("diagram_1.docx", 1), ("diagram_2.docx", 5)] with tempfile.TemporaryDirectory() as tmp_dir: for file, num_attachments in files: - attachments = docx_attachment_extractor.get_attachments(tmp_dir, 
os.path.join(docx_dir, file), {}) + shutil.copy(os.path.join(docx_dir, file), os.path.join(tmp_dir, file)) + attachments = docx_attachment_extractor.extract(file_path=os.path.join(tmp_dir, file)) self.assertEqual(num_attachments, len(attachments)) def test_archive_with_slash(self) -> None: @@ -106,7 +107,7 @@ def __get_list_of_files_in_archive(self, file_name: str) -> List[str]: file_path = os.path.join(tmp_dir, file_name) shutil.copyfile(os.path.join(self.src_dir, file_name), file_path) config = get_test_config() - document = ArchiveReader(config=config).read(path=file_path, parameters={"with_attachments": True}) + document = ArchiveReader(config=config).read(file_path=file_path, parameters={"with_attachments": True}) files = [file.original_name for file in document.attachments] return files @@ -134,7 +135,7 @@ def test_reader_attachments_dir(self) -> None: for file_name, reader in file_name_reader_list: with tempfile.TemporaryDirectory() as tmpdir: - result = reader.read(path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir)) + result = reader.read(file_path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir)) attachment_names = os.listdir(tmpdir) for attachment in result.attachments: @@ -148,10 +149,21 @@ def test_attachments_extractor_attachments_dir(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: params = {"with_attachments": True, "attachments_dir": tmpdir} - result = docx_attachment_extractor.get_attachments(tmpdir=self.src_dir, filename=file_name, parameters=params) + result = docx_attachment_extractor.extract(file_path=os.path.join(self.src_dir, file_name), parameters=params) attachment_names = os.listdir(tmpdir) for attachment in result: attachment_fname = attachment.tmp_file_path.split("/")[-1] self.assertTrue(os.path.isfile(attachment.get_filename_in_path())) self.assertIn(attachment_fname, attachment_names) + + def 
test_with_attachments_false(self) -> None: + files = ["with_attachments_0.docx", "with_attachments_1.pptx"] + readers = [DocxReader(), PptxReader()] + + with tempfile.TemporaryDirectory() as tmpdir: + for file_name, reader in zip(files, readers): + params = {"attachments_dir": tmpdir} + result = reader.read(file_path=os.path.join(self.src_dir, file_name), parameters=params) + self.assertEqual(len(result.attachments), 0) + self.assertEqual(len(os.listdir(tmpdir)), 0) diff --git a/tests/unit_tests/test_module_utils.py b/tests/unit_tests/test_module_utils.py index 5f27e98c..839e1ed9 100644 --- a/tests/unit_tests/test_module_utils.py +++ b/tests/unit_tests/test_module_utils.py @@ -22,3 +22,15 @@ def test_splitext_space_name(self) -> None: name, extension = splitext_(name_extension) self.assertEqual("some file ", name) self.assertEqual(".doc", extension) + + def test_splitext_dots_name(self) -> None: + name_extension = "1700134420_941.23_to_csv.csv" + name, extension = splitext_(name_extension) + self.assertEqual("1700134420_941.23_to_csv", name) + self.assertEqual(".csv", extension) + + def test_splitext_double_dot_extension(self) -> None: + name_extension = "some_name.tar.gz" + name, extension = splitext_(name_extension) + self.assertEqual("some_name", name) + self.assertEqual(".tar.gz", extension)