From 75ba84343fc5c2812535437e8b3bb0daf4f4959b Mon Sep 17 00:00:00 2001 From: Bogatenkova Anastasiya Date: Thu, 21 Dec 2023 12:20:01 +0300 Subject: [PATCH] TLDR-527 refactor methods and parameters for all main classes (#387) * Refactor attachments extractors * Refactor converters * Refactor readers * Refactor metadata extractors and other fixes * Refactor structure extractors * Refactor structure constructors * Fix documentation * Add parameters description * Move imports of manager_config inside the _get_manager_config function --- .../abstract_attachment_extractor.py | 28 ++- .../abstract_office_attachments_extractor.py | 5 +- .../docx_attachments_extractor.py | 15 +- .../excel_attachments_extractor.py | 17 +- .../json_attachment_extractor.py | 15 +- .../pdf_attachments_extractor.py | 40 +++-- .../pptx_attachments_extractor.py | 17 +- dedoc/attachments_extractors/utils.py | 17 -- .../attachments_handler.py | 16 +- dedoc/converters/__init__.py | 4 +- .../concrete_converters/abstract_converter.py | 54 +++--- .../concrete_converters/binary_converter.py | 16 +- .../concrete_converters/docx_converter.py | 24 ++- .../concrete_converters/excel_converter.py | 24 ++- .../concrete_converters/pdf_converter.py | 23 ++- .../concrete_converters/png_converter.py | 28 +-- .../concrete_converters/pptx_converter.py | 24 ++- .../concrete_converters/txt_converter.py | 22 ++- ..._converter.py => converter_composition.py} | 30 ++-- dedoc/dedoc_manager.py | 34 ++-- dedoc/extensions.py | 14 +- dedoc/manager_config.py | 107 ++++++------ .../abstract_metadata_extractor.py | 34 ++-- .../base_metadata_extractor.py | 44 +++-- .../docx_metadata_extractor.py | 34 ++-- .../image_metadata_extractor.py | 44 +++-- .../note_metadata_extarctor.py | 37 ++-- .../pdf_metadata_extractor.py | 45 +++-- .../metadata_extractor_composition.py | 34 ++-- .../readers/archive_reader/archive_reader.py | 20 +-- dedoc/readers/base_reader.py | 38 ++-- dedoc/readers/csv_reader/csv_reader.py | 18 +- 
dedoc/readers/docx_reader/docx_reader.py | 22 ++- dedoc/readers/email_reader/email_reader.py | 33 ++-- dedoc/readers/excel_reader/excel_reader.py | 17 +- .../html2pdf_reader/html2pdf_reader.py | 15 +- dedoc/readers/html_reader/html_reader.py | 20 +-- dedoc/readers/json_reader/json_reader.py | 18 +- dedoc/readers/mhtml_reader/mhtml_reader.py | 25 ++- dedoc/readers/note_reader/note_reader.py | 22 ++- .../pdf_auto_reader/pdf_auto_reader.py | 48 +++--- dedoc/readers/pdf_reader/pdf_base_reader.py | 40 ++--- .../pdf_image_reader/pdf_image_reader.py | 22 ++- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 34 ++-- .../pdf_txtlayer_reader.py | 21 +-- dedoc/readers/pptx_reader/pptx_reader.py | 16 +- dedoc/readers/reader_composition.py | 29 ++-- dedoc/readers/txt_reader/raw_text_reader.py | 21 +-- dedoc/scripts/benchmark_pdf_attachments.py | 2 +- dedoc/scripts/create_txtlayer_dataset.py | 8 +- .../abstract_structure_constructor.py | 6 +- .../linear_constructor.py | 2 +- .../tree_constructor.py | 2 +- .../structure_constructor_composition.py | 8 +- .../abstract_structure_extractor.py | 13 +- .../abstract_law_structure_extractor.py | 11 +- .../classifying_law_structure_extractor.py | 10 +- .../default_structure_extractor.py | 2 +- .../diploma_structure_extractor.py | 9 +- .../foiv_law_structure_extractor.py | 4 +- .../law_structure_excractor.py | 4 +- .../tz_structure_extractor.py | 10 +- .../structure_extractor_composition.py | 10 +- dedoc/train_dataset/train_dataset_utils.py | 7 +- dedoc/utils/parameter_utils.py | 29 +--- dedoc/utils/utils.py | 15 ++ .../dedoc_add_new_doc_type_tutorial.py | 32 ++-- .../dedoc_creating_dedoc_document.py | 2 +- .../code_examples/dedoc_usage_tutorial.py | 41 ++--- .../_static/code_examples/djvu_converter.py | 22 ++- .../code_examples/pdf_attachment_extractor.py | 15 +- .../_static/code_examples/pdf_reader.py | 21 ++- docs/source/getting_started/usage.rst | 62 +++---- docs/source/index.rst | 6 +- .../source/modules/attachments_extractors.rst | 6 +- 
docs/source/modules/converters.rst | 2 +- docs/source/modules/metadata_extractors.rst | 6 +- docs/source/modules/readers.rst | 16 -- docs/source/modules/structure_extractors.rst | 7 +- .../parameters/attachments_handling.rst | 59 +++++++ .../parameters/other_formats_handling.rst | 43 +++++ docs/source/parameters/parameters.rst | 21 +++ docs/source/parameters/pdf_handling.rst | 163 ++++++++++++++++++ docs/source/parameters/structure_type.rst | 52 ++++++ docs/source/tutorials/add_new_doc_type.rst | 61 ++++--- examples/create_structured_document.py | 2 +- examples/create_unstructured_document.py | 2 +- examples/example_doc_parser.py | 2 +- examples/example_img_parser.py | 2 +- examples/example_pdf_parser.py | 4 +- tests/unit_tests/abstract_converter_test.py | 5 +- .../test_doctype_law_dynamic_classifier.py | 2 +- .../unit_tests/test_doctype_law_txt_reader.py | 7 +- tests/unit_tests/test_format_pdf_reader.py | 18 +- tests/unit_tests/test_format_txt_reader.py | 4 +- tests/unit_tests/test_misc_tasker.py | 4 +- .../test_misc_toc_feature_extractor.py | 2 +- .../test_module_attachment_extractor.py | 13 +- 98 files changed, 1285 insertions(+), 896 deletions(-) delete mode 100644 dedoc/attachments_extractors/utils.py rename dedoc/converters/{file_converter.py => converter_composition.py} (51%) create mode 100644 docs/source/parameters/attachments_handling.rst create mode 100644 docs/source/parameters/other_formats_handling.rst create mode 100644 docs/source/parameters/parameters.rst create mode 100644 docs/source/parameters/pdf_handling.rst create mode 100644 docs/source/parameters/structure_type.rst diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py index 32200e94..d62a192c 100644 --- a/dedoc/attachments_extractors/abstract_attachment_extractor.py +++ b/dedoc/attachments_extractors/abstract_attachment_extractor.py @@ -1,3 +1,4 @@ +import logging import os import uuid from abc import ABC, 
abstractmethod @@ -11,29 +12,40 @@ class AbstractAttachmentsExtractor(ABC): """ This class is responsible for extracting files attached to the documents of different formats. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the attachments extractor, e.g. logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ - Check if this attachments extractor can get attachments of the file with the given extension. + Check if this attachments extractor can get attachments of the file. + You should provide at least one of the following parameters: file_path, extension, mime. - :param extension: file extension, for example .doc or .pdf + :param file_path: the path of the file to extract attachments from + :param extension: file extension with a dot, for example .doc or .pdf :param mime: MIME type of file - :param parameters: any additional parameters for given document + :param parameters: any additional parameters for the given document :return: the indicator of possibility to get attachments of this file """ pass @abstractmethod - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Extract attachments from the given file. This method can only be called on appropriate files, ensure that \ :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` is True for the given file. 
- :param tmpdir: directory where file is located and where the attached files will be saved - :param filename: name of the file to extract attachments (not absolute path) - :param parameters: dict with different parameters for extracting + :param file_path: path of the file to extract attachments from + :param parameters: dict with different parameters for extracting, see :ref:`attachments_handling_parameters` for more details :return: list of file's attachments """ pass diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py index f8a7db1e..40fc0c62 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py @@ -1,7 +1,7 @@ import os import zipfile from abc import ABC -from typing import List, Tuple +from typing import List, Optional, Tuple import olefile from charset_normalizer import from_bytes @@ -14,6 +14,9 @@ class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC): """ Extract attachments from files of Microsoft Office format like docx, pptx, xlsx. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: """ Parse the binary content of olefile. 
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index d39f288e..1c307409 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -11,25 +11,36 @@ from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes +from dedoc.utils.utils import get_mime_extension class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): """ Extract attachments from docx files. """ - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .docx extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given docx document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) result = [] try: with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py index bbcf1953..cf5cfefa 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py @@ -1,28 +1,39 @@ +import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.utils import splitext_ +from dedoc.utils.utils import get_mime_extension, splitext_ class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): """ Extracts attachments from xlsx files. 
""" - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .xlsx extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given xlsx document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) attachments = [] name, ext = splitext_(filename) if ext.lower() != ".xlsx": diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py index 25a204dd..39e11c69 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py @@ -4,19 +4,28 @@ from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile +from dedoc.utils.utils import get_mime_extension class JsonAttachmentsExtractor(AbstractAttachmentsExtractor): """ Extract attachments from json files. """ - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .json extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(".json") - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given json document. Attached files are html files if the option `html_fields` is given in the `parameters`. 
@@ -33,6 +42,8 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) attachments = [] with open(os.path.join(tmpdir, filename)) as f: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index 28b5f55f..0ae13fb4 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -1,4 +1,4 @@ -import logging +import json import os import uuid from typing import List, Optional, Tuple @@ -8,36 +8,39 @@ from PyPDF2.utils import PdfReadError from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor -from dedoc.attachments_extractors.utils import create_note from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from dedoc.utils.utils import convert_datetime +from dedoc.utils.utils import convert_datetime, get_mime_extension, get_unique_name class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): """ Extract attachments from pdf files. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the extractor, e.g. 
logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .pdf extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given pdf document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) + with open(os.path.join(tmpdir, filename), "rb") as handler: try: reader = PyPDF2.PdfFileReader(handler) @@ -74,7 +77,7 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: user = note.get("/T") data = note.get("/Contents", "") - name, content = create_note(content=data, modified_time=modified_time, created_time=created_time, author=user) + name, content = self.__create_note(content=data, modified_time=modified_time, created_time=created_time, author=user) attachments.append((name, bytes(content))) return attachments @@ -108,3 +111,16 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str attachments.append((name, data)) return attachments + + def __create_note(self, content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: + filename = get_unique_name("note.json") + note_dict = { + "content": content, + "modified_time": modified_time, + "created_time": created_time, + "size": size if size else len(content), + "author": author + } + encode_data = json.dumps(note_dict).encode("utf-8") + + return filename, encode_data diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py index f463b4aa..34acdef4 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py @@ -1,28 +1,39 @@ +import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes -from 
dedoc.utils.utils import splitext_ +from dedoc.utils.utils import get_mime_extension, splitext_ class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor): """ Extract attachments from pptx files. """ - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if this extractor can get attachments from the document (it should have .pptx extension) """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: """ Get attachments from the given pptx document. Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. 
""" + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) result = [] name, ext = splitext_(filename) diff --git a/dedoc/attachments_extractors/utils.py b/dedoc/attachments_extractors/utils.py deleted file mode 100644 index 7c99e9cf..00000000 --- a/dedoc/attachments_extractors/utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import json - -from dedoc.utils.utils import get_unique_name - - -def create_note(content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: - filename = get_unique_name("note.json") - note_dict = { - "content": content, - "modified_time": modified_time, - "created_time": created_time, - "size": size if size else len(content), - "author": author - } - encode_data = json.dumps(note_dict).encode("utf-8") - - return filename, encode_data diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 5fda5a91..1017ad45 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -2,7 +2,7 @@ import logging import os import time -from typing import List +from typing import List, Optional from dedoc.attachments_extractors import AbstractAttachmentsExtractor from dedoc.common.exceptions.dedoc_error import DedocError @@ -22,11 +22,11 @@ class AttachmentsHandler: the parsing recursion may be set via `recursion_deep_attachments` parameter. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: configuration of the handler, e.g. 
logger for logging """ - self.config = config + self.config = {} if config is None else config self.logger = self.config.get("logger", logging.getLogger()) def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa @@ -77,10 +77,10 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct return parsed_attachment_files def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa - attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path()) - metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir, - filename=attachment_name, converted_filename=attachment_name, - original_filename=attachment.get_original_filename(), - parameters=parameters) + metadata = document_parser.document_metadata_extractor.extract( + file_path=attachment.get_filename_in_path(), + original_filename=attachment.get_original_filename(), + parameters=parameters + ) metadata = DocumentMetadata(**metadata) return ParsedDocument(content=get_empty_content(), metadata=metadata) diff --git a/dedoc/converters/__init__.py b/dedoc/converters/__init__.py index 2a29479e..b71ca457 100644 --- a/dedoc/converters/__init__.py +++ b/dedoc/converters/__init__.py @@ -6,7 +6,7 @@ from .concrete_converters.png_converter import PNGConverter from .concrete_converters.pptx_converter import PptxConverter from .concrete_converters.txt_converter import TxtConverter -from .file_converter import FileConverterComposition +from .converter_composition import ConverterComposition -__all__ = ["AbstractConverter", "BinaryConverter", "DocxConverter", "ExcelConverter", "FileConverterComposition", "PDFConverter", "PNGConverter", +__all__ = ["AbstractConverter", "BinaryConverter", "DocxConverter", "ExcelConverter", "ConverterComposition", "PDFConverter", "PNGConverter", "PptxConverter", 
"TxtConverter"] diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py index 14b7ace6..a0a5baf5 100644 --- a/dedoc/converters/concrete_converters/abstract_converter.py +++ b/dedoc/converters/concrete_converters/abstract_converter.py @@ -1,7 +1,6 @@ import logging import os import subprocess -import time from abc import ABC, abstractmethod from typing import List, Optional @@ -10,42 +9,48 @@ class AbstractConverter(ABC): """ - This class provides the common methods for all converters: can_convert() and do_convert(). + This class provides the common methods for all converters: can_convert() and convert(). """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: configuration of the converter, e.g. logger for logging """ self.timeout = 60 self.period_checking = 0.05 - self.config = config - self.logger = config.get("logger", logging.getLogger()) + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ - Convert the given file to another format if it's possible. - This method can only be called on appropriate files, ensure that :meth:`~dedoc.converters.AbstractConverter.can_convert` \ - is True for the given file. - If the file format is unsupported the ConversionException will be thrown. + Check if this converter can convert file. + You should provide at least one of the following parameters: file_path, extension, mime. 
- :param tmp_dir: directory where the original file is located and where result will be saved - :param filename: name of the original file without extension - :param extension: extension of the original file - :return: name of the converted file + :param file_path: path of the file to convert + :param extension: file extension, for example .doc or .pdf + :param mime: MIME type of file + :param parameters: any additional parameters for the given document + :return: the indicator of possibility to convert this file """ pass @abstractmethod - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ - Check if this converter can convert file with the given extension. + Convert the given file to another format if it's possible. + This method can only be called on appropriate files, ensure that :meth:`~dedoc.converters.AbstractConverter.can_convert` \ + is True for the given file. + If the file format is unsupported the ConversionError will be thrown. 
- :param extension: file extension, for example .doc or .pdf - :param mime: MIME type of file - :param parameters: any additional parameters for given document - :return: the indicator of possibility to convert this file + :param file_path: path of the file to convert + :param parameters: parameters of converting, see :ref:`parameters_description` for more details + :return: path of converted file if conversion was executed """ + pass def _run_subprocess(self, command: List[str], filename: str, expected_path: str) -> None: try: @@ -63,12 +68,3 @@ def _run_subprocess(self, command: List[str], filename: str, expected_path: str) message = f"Conversion of the {filename} hadn't terminated after {self.timeout} seconds" self.logger.error(message) raise ConversionError(msg=message) - - def _await_for_conversion(self, filename: str, tmp_dir: str) -> None: - t = 0 - while (not os.path.isfile(f"{tmp_dir}/{filename}")) and (t < self.timeout): - time.sleep(self.period_checking) - t += self.period_checking - - if t >= self.timeout: - raise ConversionError(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}") diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py index 2089d66c..46142cff 100644 --- a/dedoc/converters/concrete_converters/binary_converter.py +++ b/dedoc/converters/concrete_converters/binary_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.converters.concrete_converters.png_converter import PNGConverter from dedoc.utils import supported_image_types +from dedoc.utils.utils import get_mime_extension class BinaryConverter(AbstractConverter): @@ -10,18 +11,23 @@ class BinaryConverter(AbstractConverter): Converts image-like documents with `mime=application/octet-stream` into PNG. 
Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.png_converter = PNGConverter(config=config) + self.png_converter = PNGConverter(config=self.config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return mime == "application/octet-stream" and extension in supported_image_types - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the image-like and application/octet-stream documents into files with .png extension. 
""" - return self.png_converter.do_convert(tmp_dir, filename, extension) + return self.png_converter.convert(file_path, parameters=parameters) diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py index 2f0f30db..3b50416a 100644 --- a/dedoc/converters/concrete_converters/docx_converter.py +++ b/dedoc/converters/concrete_converters/docx_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class DocxConverter(AbstractConverter): @@ -10,23 +11,28 @@ class DocxConverter(AbstractConverter): Converts docx-like documents into DOCX using the soffice application. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the docx-like documents into files with .docx extension using the soffice application. 
""" - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", tmp_dir, path_in] - file_out = f"{filename}.docx" - expected_path = os.path.join(tmp_dir, file_out) - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", file_dir, file_path] + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.docx") + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return file_out + return converted_file_path diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py index 661fb5c2..1396a12b 100644 --- a/dedoc/converters/concrete_converters/excel_converter.py +++ b/dedoc/converters/concrete_converters/excel_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class ExcelConverter(AbstractConverter): @@ -10,23 +11,28 @@ class ExcelConverter(AbstractConverter): Converts xlsx-like documents into XLSX using the soffice application. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is xlsx-like, e.g. 
it has .xls or .ods extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.excel_like_format or mime in converted_mimes.excel_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the xlsx-like documents into files with .xlsx extension using the soffice application. """ - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", tmp_dir, path_in] - file_out = f"{filename}.xlsx" - expected_path = os.path.join(tmp_dir, file_out) - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", file_dir, file_path] + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.xlsx") + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return file_out + return converted_file_path diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index e1f1c00c..f0b929e8 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class PDFConverter(AbstractConverter): @@ -10,22 +11,28 @@ class PDFConverter(AbstractConverter): Converts pdf-like documents into PDF using the ddjvu application. 
Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is pdf-like, e.g. it has .djvu extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.pdf_like_format or mime in converted_mimes.pdf_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pdf-like documents into files with .pdf extension using the ddjvu application. 
""" - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - expected_path = os.path.join(tmp_dir, f"{filename}.pdf") - command = ["ddjvu", "--format=pdf", path_in, expected_path] - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pdf") + command = ["ddjvu", "--format=pdf", file_path, converted_file_path] + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return filename + ".pdf" + return converted_file_path diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py index 3fdcac26..cb50245d 100644 --- a/dedoc/converters/concrete_converters/png_converter.py +++ b/dedoc/converters/concrete_converters/png_converter.py @@ -6,6 +6,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class PNGConverter(AbstractConverter): @@ -13,26 +14,33 @@ class PNGConverter(AbstractConverter): Converts image-like documents into PNG. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is image-like, e.g. it has .bmp, .jpg, .tiff, etc. extension. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.image_like_format or mime in converted_mimes.image_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the image-like documents into files with .png extension. """ - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - path_out = os.path.join(tmp_dir, f"{filename}.png") + file_dir, file_name = os.path.split(file_path) + name_wo_ext, extension = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.png") + if extension in [".hdr", ".pic", ".sr", ".ras", ".j2k"]: - img = cv2.imread(path_in) - cv2.imwrite(path_out, img) + img = cv2.imread(file_path) + cv2.imwrite(converted_file_path, img) else: - img = Image.open(path_in) - img.save(path_out) + img = Image.open(file_path) + img.save(converted_file_path) - return f"{filename}.png" + return converted_file_path diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py index 312791fe..d1e7aec3 100644 --- a/dedoc/converters/concrete_converters/pptx_converter.py +++ b/dedoc/converters/concrete_converters/pptx_converter.py @@ -3,6 +3,7 @@ from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class PptxConverter(AbstractConverter): @@ -10,23 +11,28 @@ class PptxConverter(AbstractConverter): Converts pptx-like documents into PPTX using the soffice application. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. 
""" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is pptx-like, e.g. it has .ppt or .odp extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.pptx_like_format or mime in converted_mimes.pptx_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the pptx-like documents into files with .pptx extension using the soffice application. """ - path_in = os.path.join(tmp_dir, f"{filename}{extension}") - command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", tmp_dir, path_in] - file_out = f"{filename}.pptx" - expected_path = os.path.join(tmp_dir, file_out) - self._run_subprocess(command=command, filename=filename, expected_path=expected_path) + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", file_dir, file_path] + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pptx") + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) - return file_out + return converted_file_path diff --git a/dedoc/converters/concrete_converters/txt_converter.py b/dedoc/converters/concrete_converters/txt_converter.py index 5a8e85cc..b1543fa0 100644 --- a/dedoc/converters/concrete_converters/txt_converter.py +++ b/dedoc/converters/concrete_converters/txt_converter.py @@ -4,6 +4,7 @@ from 
dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.extensions import converted_extensions, converted_mimes +from dedoc.utils.utils import get_mime_extension, splitext_ class TxtConverter(AbstractConverter): @@ -11,20 +12,27 @@ class TxtConverter(AbstractConverter): Converts txt-like documents into TXT by simple renaming. Look to the :class:`~dedoc.converters.AbstractConverter` documentation to get the information about the methods' parameters. """ - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: """ Checks if the document is txt-like, e.g. it has .xml extension. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in converted_extensions.txt_like_format or mime in converted_mimes.txt_like_format - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert the txt-like documents into files with .txt extension by renaming it. 
""" - file_path = os.path.join(tmp_dir, f"{filename}{extension}") - converted_file_name = f"{filename}.txt" - shutil.copy(file_path, os.path.join(tmp_dir, converted_file_name)) - return converted_file_name + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.txt") + shutil.copy(file_path, converted_file_path) + + return converted_file_path diff --git a/dedoc/converters/file_converter.py b/dedoc/converters/converter_composition.py similarity index 51% rename from dedoc/converters/file_converter.py rename to dedoc/converters/converter_composition.py index 7048d0ac..cf12c2ed 100644 --- a/dedoc/converters/file_converter.py +++ b/dedoc/converters/converter_composition.py @@ -3,10 +3,10 @@ from typing import List, Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.utils.utils import get_file_mime_type, splitext_ +from dedoc.utils.utils import get_mime_extension -class FileConverterComposition(object): +class ConverterComposition(object): """ This class allows to convert any document into the predefined list of formats according to the available list of converters. The list of converters is set via the class constructor. @@ -15,28 +15,26 @@ class FileConverterComposition(object): """ def __init__(self, converters: List[AbstractConverter]) -> None: """ - :param converters: the list of converters that have methods can_convert() and do_convert(), \ + :param converters: the list of converters that have methods can_convert() and convert(), \ they are used for files converting into specified formats """ self.converters = converters - def do_converting(self, tmp_dir: str, filename: str, parameters: Optional[dict] = None) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: """ Convert file if there is the converter that can do it. 
If there isn't any converter that is able to convert the file, it isn't changed. - :param tmp_dir: the directory where the file is located and where the converted file will be saved - :param filename: the name of the file to convert - :param parameters: parameters of converting - :return: name of the converted file if conversion was executed else name of the original file + :param file_path: path of the file to convert + :param parameters: parameters of converting, see :ref:`parameters_description` for more details + :return: path of converted file if conversion was executed else path of the original file """ - name, extension = splitext_(filename) - mime = get_file_mime_type(os.path.join(tmp_dir, filename)) + extension, mime = get_mime_extension(file_path=file_path) + converted_file_path = file_path + for converter in self.converters: - can_convert = converter.can_convert(extension=extension, mime=mime, parameters=parameters) - if can_convert: - filename = converter.do_convert(tmp_dir, name, extension) + if converter.can_convert(file_path=file_path, extension=extension, mime=mime, parameters=parameters): + converted_file_path = converter.convert(file_path, parameters=parameters) break - file_path = os.path.join(tmp_dir, filename) - os.chmod(file_path, S_IREAD | S_IRGRP | S_IROTH) - return filename + os.chmod(converted_file_path, S_IREAD | S_IRGRP | S_IROTH) + return converted_file_path diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index d2be05bf..64d96306 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -32,7 +32,7 @@ def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] :param manager_config: dictionary with different stage document processors. 
The following keys should be in the `manager_config` dictionary: - - converter (optional) (:class:`~dedoc.converters.FileConverterComposition`) + - converter (optional) (:class:`~dedoc.converters.ConverterComposition`) - reader (:class:`~dedoc.readers.ReaderComposition`) - structure_extractor (:class:`~dedoc.structure_extractors.StructureExtractorComposition`) - structure_constructor (:class:`~dedoc.structure_constructors.StructureConstructorComposition`) @@ -63,10 +63,10 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> If some error occurred, file metadata are stored in the exception's metadata field. :param file_path: full path where the file is located - :param parameters: any parameters, specify how to parse file (see API parameters documentation for more details) + :param parameters: any parameters, specify how to parse file, see :ref:`parameters_description` for more details :return: parsed document """ - parameters = self.__init_parameters(parameters) + parameters = self.__init_parameters(file_path, parameters) self.logger.info(f"Get file {os.path.basename(file_path)} with parameters {parameters}") try: @@ -92,37 +92,32 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) unique_filename = get_unique_name(file_name) with tempfile.TemporaryDirectory() as tmp_dir: - shutil.copy(file_path, os.path.join(tmp_dir, unique_filename)) + tmp_file_path = os.path.join(tmp_dir, unique_filename) + shutil.copy(file_path, tmp_file_path) # Step 1 - Converting - converted_filename = self.converter.do_converting(tmp_dir, unique_filename, parameters=parameters) - self.logger.info(f"Finish conversion {file_name} -> {converted_filename}") + converted_file_path = self.converter.convert(tmp_file_path) + self.logger.info(f"Finish conversion {file_name} -> {os.path.basename(converted_file_path)}") # Step 2 - Reading content - unstructured_document = self.reader.parse_file(tmp_dir=tmp_dir, filename=converted_filename, 
parameters=parameters) + unstructured_document = self.reader.read(file_path=converted_file_path, parameters=parameters) self.logger.info(f"Finish parse file {file_name}") # Step 3 - Adding meta-information - metadata = self.document_metadata_extractor.extract_metadata(directory=tmp_dir, - filename=unique_filename, - converted_filename=converted_filename, - original_filename=file_name, - parameters=parameters, - other_fields=unstructured_document.metadata) + metadata = self.document_metadata_extractor.extract(file_path=tmp_file_path, converted_filename=os.path.basename(converted_file_path), + original_filename=file_name, parameters=parameters, other_fields=unstructured_document.metadata) unstructured_document.metadata = metadata self.logger.info(f"Add metadata of file {file_name}") # Step 4 - Extract structure - unstructured_document = self.structure_extractor.extract_structure(unstructured_document, parameters) + unstructured_document = self.structure_extractor.extract(unstructured_document, parameters) self.logger.info(f"Extract structure from file {file_name}") if self.config.get("labeling_mode", False): self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document) # Step 5 - Form the output structure - parsed_document = self.structure_constructor.structure_document(document=unstructured_document, - structure_type=parameters.get("structure_type"), - parameters=parameters) + parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters) self.logger.info(f"Get structured document {file_name}") # Step 6 - Get attachments @@ -133,13 +128,16 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) self.logger.info(f"Finish handle {file_name}") return parsed_document - def __init_parameters(self, parameters: Optional[dict]) -> dict: + def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict: parameters = {} if parameters is None else parameters result_parameters 
= {} for parameter_name, parameter_value in self.default_parameters.items(): result_parameters[parameter_name] = parameters.get(parameter_name, parameter_value) + attachments_dir = parameters.get("attachments_dir", None) + result_parameters["attachments_dir"] = os.path.dirname(file_path) if attachments_dir is None else attachments_dir + return result_parameters def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None: diff --git a/dedoc/extensions.py b/dedoc/extensions.py index d35e12bf..bddce5c8 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -45,14 +45,14 @@ ) recognized_extensions = Extensions( - excel_like_format=[], - docx_like_format=[], - pptx_like_format=[], + excel_like_format=[".xlsx"], + docx_like_format=[".docx"], + pptx_like_format=[".pptx"], archive_like_format=[".tar.gz"], - image_like_format=[], - pdf_like_format=[], + image_like_format=[".png"], + pdf_like_format=[".pdf"], csv_like_format=[".csv", ".tsv"], - txt_like_format=[] + txt_like_format=[".txt", ".txt.gz"] ) recognized_mimes = Extensions( @@ -62,7 +62,7 @@ archive_like_format=["application/zip", "application/x-tar", "application/x-rar-compressed", "application/rar", "application/x-7z-compressed"], image_like_format=["image/jpeg", "image/png", "image/tiff", "image/x-ms-bmp", "image/bmp"], pdf_like_format=["application/pdf"], - csv_like_format=[], + csv_like_format=["text/csv"], txt_like_format=["text/plain", "text/html"] ) diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py index b7993f53..6854c6f4 100644 --- a/dedoc/manager_config.py +++ b/dedoc/manager_config.py @@ -1,51 +1,52 @@ from typing import Optional -from dedoc.attachments_handler.attachments_handler import AttachmentsHandler -from dedoc.converters.concrete_converters.binary_converter import BinaryConverter -from dedoc.converters.concrete_converters.docx_converter import DocxConverter -from dedoc.converters.concrete_converters.excel_converter import ExcelConverter -from 
dedoc.converters.concrete_converters.pdf_converter import PDFConverter -from dedoc.converters.concrete_converters.png_converter import PNGConverter -from dedoc.converters.concrete_converters.pptx_converter import PptxConverter -from dedoc.converters.concrete_converters.txt_converter import TxtConverter -from dedoc.converters.file_converter import FileConverterComposition -from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor -from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor -from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition -from dedoc.readers.archive_reader.archive_reader import ArchiveReader -from dedoc.readers.csv_reader.csv_reader import CSVReader -from dedoc.readers.docx_reader.docx_reader import DocxReader -from dedoc.readers.email_reader.email_reader import EmailReader -from dedoc.readers.excel_reader.excel_reader import ExcelReader -from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.readers.json_reader.json_reader import JsonReader -from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader -from dedoc.readers.note_reader.note_reader import NoteReader -from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader -from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader -from 
dedoc.readers.pptx_reader.pptx_reader import PptxReader -from dedoc.readers.reader_composition import ReaderComposition -from dedoc.readers.txt_reader.raw_text_reader import RawTextReader -from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor -from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor -from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition -from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor -from dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor -from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition - -"""MANAGER SETTINGS""" - def _get_manager_config(config: dict) -> dict: + """ + Imports are here in order not to do all of them when someone does `import dedoc` + """ + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler + from dedoc.converters.concrete_converters.binary_converter import BinaryConverter + from dedoc.converters.concrete_converters.docx_converter import DocxConverter + from dedoc.converters.concrete_converters.excel_converter import ExcelConverter + from dedoc.converters.concrete_converters.pdf_converter import PDFConverter + from dedoc.converters.concrete_converters.png_converter import 
PNGConverter + from dedoc.converters.concrete_converters.pptx_converter import PptxConverter + from dedoc.converters.concrete_converters.txt_converter import TxtConverter + from dedoc.converters.converter_composition import ConverterComposition + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor + from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition + from dedoc.readers.archive_reader.archive_reader import ArchiveReader + from dedoc.readers.csv_reader.csv_reader import CSVReader + from dedoc.readers.docx_reader.docx_reader import DocxReader + from dedoc.readers.email_reader.email_reader import EmailReader + from dedoc.readers.excel_reader.excel_reader import ExcelReader + from dedoc.readers.html_reader.html_reader import HtmlReader + from dedoc.readers.json_reader.json_reader import JsonReader + from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader + from dedoc.readers.note_reader.note_reader import NoteReader + from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + from dedoc.readers.pptx_reader.pptx_reader import PptxReader + from dedoc.readers.reader_composition import ReaderComposition + from 
dedoc.readers.txt_reader.raw_text_reader import RawTextReader + from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor + from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor + from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition + from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor + from dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor + from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition + converters = [ DocxConverter(config=config), ExcelConverter(config=config), @@ -57,13 +58,13 @@ def _get_manager_config(config: dict) -> dict: ] readers = [ DocxReader(config=config), - ExcelReader(), - PptxReader(), - CSVReader(), + ExcelReader(config=config), + PptxReader(config=config), + CSVReader(config=config), HtmlReader(config=config), RawTextReader(config=config), NoteReader(config=config), - JsonReader(), + JsonReader(config=config), ArchiveReader(config=config), PdfAutoReader(config=config), PdfTabbyReader(config=config), @@ -74,11 +75,11 @@ def _get_manager_config(config: dict) -> dict: ] metadata_extractors = [ - DocxMetadataExtractor(), + DocxMetadataExtractor(config=config), PdfMetadataExtractor(config=config), 
ImageMetadataExtractor(config=config), - NoteMetadataExtractor(), - BaseMetadataExtractor() + NoteMetadataExtractor(config=config), + BaseMetadataExtractor(config=config) ] law_extractors = { @@ -86,14 +87,14 @@ def _get_manager_config(config: dict) -> dict: LawStructureExtractor.document_type: LawStructureExtractor(config=config) } structure_extractors = { - DefaultStructureExtractor.document_type: DefaultStructureExtractor(), + DefaultStructureExtractor.document_type: DefaultStructureExtractor(config=config), DiplomaStructureExtractor.document_type: DiplomaStructureExtractor(config=config), TzStructureExtractor.document_type: TzStructureExtractor(config=config), ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config) } return dict( - converter=FileConverterComposition(converters=converters), + converter=ConverterComposition(converters=converters), reader=ReaderComposition(readers=readers), structure_extractor=StructureExtractorComposition(extractors=structure_extractors, default_key="other"), structure_constructor=StructureConstructorComposition( diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 602ee68e..3aa74bfe 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -8,35 +8,33 @@ class AbstractMetadataExtractor(ABC): """ @abstractmethod def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise. 
- Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ pass @abstractmethod - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True. - :param directory: path to the directory where the original and converted files are located - :param filename: name of the file after renaming (for example 23141.doc). \ - The file gets a new name during processing by the dedoc manager (if used) - :param converted_filename: name of the file after renaming and conversion (for example 23141.docx) - :param original_filename: name of the file before renaming - :param parameters: additional parameters for document parsing + :param file_path: path to the file to extract metadata. \ + If dedoc manager is used, the file gets a new name during processing - this name should be passed here (for example 23141.doc) + :param converted_filename: name of the file after renaming and conversion (if dedoc manager is used, for example 23141.docx), \ + by default it's a name from the file_path. Converted file should be located in the same directory as the file before converting. 
+ :param original_filename: name of the file before renaming (if dedoc manager is used), by default it's a name from the file_path + :param parameters: additional parameters for document parsing, see :ref:`parameters_description` for more details :param other_fields: other fields that should be added to the document's metadata :return: dict with metadata information about the document """ diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index e685becc..0e467760 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -1,6 +1,7 @@ +import logging import os from base64 import b64encode -from typing import Optional +from typing import Optional, Tuple from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.utils.utils import get_file_mime_type @@ -20,11 +21,17 @@ class BaseMetadataExtractor(AbstractMetadataExtractor): - time when the file was last modified. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the extractor, e.g. 
logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) + def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ @@ -33,24 +40,24 @@ def can_extract(self, """ return True - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Gets the basic meta-information about the file. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" parameters = {} if parameters is None else parameters - meta_info = self._get_base_meta_information(directory, filename, original_filename) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + meta_info = self._get_base_meta_information(file_dir, file_name, original_filename) if parameters.get("is_attached", False) and str(parameters.get("return_base64", "false")).lower() == "true": other_fields = {} if other_fields is None else other_fields - path = os.path.join(directory, filename) + path = os.path.join(file_dir, converted_filename) with open(path, "rb") as file: other_fields["base64_encode"] = b64encode(file.read()).decode("utf-8") @@ -72,3 +79,10 @@ def _get_base_meta_information(directory: str, filename: str, name_actual: str) } return meta + + def _get_names(self, file_path: str, converted_filename: Optional[str], original_filename: Optional[str]) -> Tuple[str, str, str, str]: + file_dir, file_name = os.path.split(file_path) + converted_filename = file_name if converted_filename is None else converted_filename + original_filename = file_name if original_filename is None else original_filename + + return file_dir, file_name, converted_filename, original_filename diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 49b87001..be0964c2 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -22,36 +22,40 @@ class DocxMetadataExtractor(BaseMetadataExtractor): - author who last modified the file; - created, modified and last printed date. 
""" + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has .docx extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. """ + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) return converted_filename.lower().endswith("docx") - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the docx documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" parameters = {} if parameters is None else parameters + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) - result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) - file_path = os.path.join(directory, converted_filename) + file_path = os.path.join(file_dir, converted_filename) docx_other_fields = self._get_docx_fields(file_path) result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields} diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index 31062c72..465c9dea 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -1,4 +1,3 @@ -import logging import math import os from typing import Optional, Union @@ -28,12 +27,9 @@ class ImageMetadataExtractor(BaseMetadataExtractor): - subject distance range; - user comment. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the extractor, e.g. 
logger for logging - """ - self.logger = config.get("logger", logging.getLogger()) - super().__init__() + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.keys = { "DateTime": ("date_time", self.__parse_date), "DateTimeDigitized": ("date_time_digitized", self.__parse_date), @@ -53,33 +49,33 @@ def __init__(self, *, config: dict) -> None: } def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has image-like extension (".png", ".jpg", ".jpeg"). Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. """ - return filename.lower().endswith((".png", ".jpg", ".jpeg")) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + return converted_filename.lower().endswith((".png", ".jpg", ".jpeg")) - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for images. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. 
""" - result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) - path = os.path.join(directory, filename) + path = os.path.join(file_dir, converted_filename) exif_fields = self._get_exif(path) if len(exif_fields) > 0: result["other_fields"] = {**result.get("other_fields", {}), **exif_fields} diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index 18b49d6b..e0dc4b6e 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -13,36 +13,37 @@ class NoteMetadataExtractor(BaseMetadataExtractor): In addition to them, the `author` field can be added to the metadata other fields. """ - def __init__(self) -> None: - super().__init__() + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has .note.pickle extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. 
""" - return filename.lower().endswith(".note.pickle") - - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + return converted_filename.lower().endswith(".note.pickle") + + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the .note.pickle documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) try: - file_path = os.path.join(directory, filename) + file_path = os.path.join(file_dir, converted_filename) with open(file_path, "rb") as infile: note_dict = pickle.load(infile) @@ -58,4 +59,4 @@ def extract_metadata(self, other_fields=other_fields) return meta_info except Exception: - raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(file_path)}. 
Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index 274a8d26..e3502e44 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -1,4 +1,3 @@ -import logging import os from typing import Optional @@ -24,11 +23,9 @@ class PdfMetadataExtractor(BaseMetadataExtractor): - creation date; - modification date. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the extractor, e.g. logger for logging - """ - super().__init__() + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.keys = { "/Producer": "producer", "/Creator": "creator", @@ -42,36 +39,34 @@ def __init__(self, *, config: dict) -> None: "/CreationDate": "creation_date", "/ModDate": "modification_date", } - self.config = config - self.logger = config.get("logger", logging.getLogger()) def can_extract(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> bool: """ Check if the document has .pdf extension. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` documentation to get the information about parameters. 
""" - return filename.lower().endswith(".pdf") + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + return converted_filename.lower().endswith(".pdf") - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the pdf documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` documentation to get the information about parameters. """ - result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) - path = os.path.join(directory, filename) + file_dir, file_name, converted_filename, original_filename = self._get_names(file_path, converted_filename, original_filename) + result = super().extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) + path = os.path.join(file_dir, converted_filename) pdf_fields = self._get_pdf_info(path) if len(pdf_fields) > 0: result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields} diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index e9c182d4..ba46c4b0 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ 
b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -1,3 +1,4 @@ +import os.path from typing import List, Optional from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor @@ -12,33 +13,24 @@ class MetadataExtractorComposition: """ def __init__(self, extractors: List[AbstractMetadataExtractor]) -> None: """ - :param extractors: the list of extractors with methods can_extract() and extract_metadata() to extract metadata from file + :param extractors: the list of extractors with methods can_extract() and extract() to extract metadata from file """ self.extractors = extractors - def extract_metadata(self, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> dict: + def extract(self, + file_path: str, + converted_filename: Optional[str] = None, + original_filename: Optional[str] = None, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Extract metadata using one of the extractors if suitable extractor was found. - Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` of the class + Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` of the class :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` documentation to get the information about method's parameters. 
""" for extractor in self.extractors: - if extractor.can_extract(directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, + if extractor.can_extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, other_fields=other_fields): - return extractor.extract_metadata(directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, - other_fields=other_fields) - raise Exception(f"Can't extract metadata from from file {filename}") + return extractor.extract(file_path=file_path, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, + other_fields=other_fields) + raise Exception(f"Can't extract metadata from from file {os.path.basename(file_path)}") diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index fac1b86c..d8831b58 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -1,4 +1,3 @@ -import logging import os import tarfile import uuid @@ -14,7 +13,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_file_mime_type, save_data_to_unique_file +from dedoc.utils.utils import get_file_mime_type, get_mime_extension, save_data_to_unique_file class ArchiveReader(BaseReader): @@ -22,21 +21,18 @@ class ArchiveReader(BaseReader): This reader allows to get archived files as attachments of the :class:`~dedoc.data_structures.UnstructuredDocument`. Documents with the following extensions can be parsed: .zip, .tar, .tar.gz, .rar, .7z. 
""" - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.archive_like_format or mime in recognized_mimes.archive_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return empty content of archive, all content will be placed inside attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
@@ -48,10 +44,10 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return UnstructuredDocument(lines=[], tables=[], attachments=[]) attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" - attachments = self.__get_attachments(path=path, tmp_dir=attachments_dir, need_content_analysis=need_content_analysis) + attachments = self.__get_attachments(path=file_path, tmp_dir=attachments_dir, need_content_analysis=need_content_analysis) return UnstructuredDocument(lines=[], tables=[], attachments=attachments) def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool) -> List[AttachedFile]: diff --git a/dedoc/readers/base_reader.py b/dedoc/readers/base_reader.py index 247270e9..6d857107 100644 --- a/dedoc/readers/base_reader.py +++ b/dedoc/readers/base_reader.py @@ -1,3 +1,4 @@ +import logging from abc import ABC, abstractmethod from typing import Optional @@ -14,34 +15,39 @@ class BaseReader(ABC): Some of the readers can also extract information about line type and hierarchy level (for example, list item) - this information is stored in the `tag_hierarchy_level` attribute of the class :class:`~dedoc.data_structures.LineMetadata`. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the reader, e.g. 
logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ - Read file from disk and extract text with annotations, tables and attachments from the document. - The given file should have appropriate extension and type so it should be checked by the method - :meth:`~dedoc.readers.BaseReader.can_read`, which should return True beforehand. + Check if this reader can handle the given file. + You should provide at least one of the following parameters: file_path, extension, mime. - :param path: path to the file in the file system - :param document_type: type of the file, for example scientific article, presentation slides and so on - :param parameters: dict with additional parameters for document reader (as language for scans or delimiter for csv) + :param file_path: path to the file in the file system + :param mime: MIME type of a file + :param extension: file extension, for example .doc or .pdf + :param parameters: dict with additional parameters for document reader, see :ref:`parameters_description` for more details - :return: intermediate representation of the document with lines, tables and attachments + :return: True if this reader can handle the file, False otherwise """ pass @abstractmethod - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ - Check if this reader can handle the given file. + Read file from disk and extract text with annotations, tables and attachments from the document. 
+ The given file should have appropriate extension and mime type, so it should be checked by the method + :meth:`~dedoc.readers.BaseReader.can_read`, which should return True beforehand. - :param path: path to the file in the file system - :param mime: MIME type of a file - :param extension: file extension, for example .doc or .pdf - :param document_type: type of file, for example scientific article, presentation slides and so on - :param parameters: dict with additional parameters for document reader (as language for scans or delimiter for csv) + :param file_path: path to the file in the file system + :param parameters: dict with additional parameters for document reader, see :ref:`parameters_description` for more details - :return: True if this reader can handle the file, False otherwise + :return: intermediate representation of the document with lines, tables and attachments """ pass diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py index be02a6b8..d1de64ed 100644 --- a/dedoc/readers/csv_reader/csv_reader.py +++ b/dedoc/readers/csv_reader/csv_reader.py @@ -8,34 +8,38 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_encoding +from dedoc.utils.utils import get_encoding, get_mime_extension class CSVReader(BaseReader): """ This class allows to parse files with the following extensions: .csv, .tsv. 
""" - def __init__(self) -> None: + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.default_separator = "," - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.csv_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method will place all extracted content inside tables of the :class:`~dedoc.data_structures.UnstructuredDocument`. The lines and attachments remain empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" + parameters = {} if parameters is None else parameters delimiter = parameters.get("delimiter") if delimiter is None: - delimiter = "\t" if path.endswith(".tsv") else self.default_separator - encoding, encoding_warning = self.__get_encoding(path, parameters) - with open(path, errors="ignore", encoding=encoding) as file: + delimiter = "\t" if file_path.endswith(".tsv") else self.default_separator + encoding, encoding_warning = self.__get_encoding(file_path, parameters) + with open(file_path, errors="ignore", encoding=encoding) as file: csv_reader = csv.reader(file, delimiter=delimiter) data = list(csv_reader) table_metadata = TableMetadata(page_id=0) diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index 552b09c5..e91163fb 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -1,5 +1,3 @@ -import logging -import os from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor @@ -9,6 +7,7 @@ from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument +from dedoc.utils.utils import get_mime_extension class DocxReader(BaseReader): @@ -16,30 +15,29 @@ class DocxReader(BaseReader): This class is used for parsing documents with .docx extension. Please use :class:`~dedoc.converters.DocxConverter` for getting docx file from similar formats. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. 
logger for logging - """ - self.attachment_extractor = DocxAttachmentsExtractor() - self.logger = config.get("logger", logging.getLogger()) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = DocxAttachmentsExtractor(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) - docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger) + docx_document = DocxDocument(path=file_path, attachments=attachments, logger=self.logger) lines = self.__fix_lines(docx_document.lines) return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[]) diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 93c02c4d..7a239e31 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -1,6 +1,5 @@ import email import json -import logging import mimetypes import os import re @@ -17,29 +16,27 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader from dedoc.readers.html_reader.html_reader import HtmlReader -from dedoc.utils.utils import get_unique_name, save_data_to_unique_file +from dedoc.utils.utils import get_mime_extension, get_unique_name, save_data_to_unique_file class EmailReader(BaseReader): """ This class is used for parsing documents with .eml extension (e-mail messages saved into files). """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. 
logger for logging - """ - super().__init__() - self.logger = config.get("logger", logging.getLogger()) - self.html_reader = HtmlReader(config=config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.html_reader = HtmlReader(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension or mime is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ - return path.lower().endswith(".eml") or mime == "message/rfc822" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return file_path.lower().endswith(".eml") or mime == "message/rfc822" - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. 
@@ -50,9 +47,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio """ parameters = {} if parameters is None else parameters attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir - with open(path, "rb") as f: + with open(file_path, "rb") as f: msg = email.message_from_binary_file(f) tables, attachments = [], [] @@ -77,7 +74,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio if content_type == "text/plain": text_parts.append(msg) if content_type == "text/html": - self.__add_content_from_html(msg, lines, tables) + self.__add_content_from_html(msg, lines, tables, parameters) html_found = True for part in msg.walk(): @@ -87,7 +84,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio continue if content_type == "text/html": - self.__add_content_from_html(part, lines, tables) + self.__add_content_from_html(part, lines, tables, parameters) html_found = True continue @@ -131,7 +128,7 @@ def __add_attachment(self, message: Message, attachments_dir: str, attachments: uid=f"attach_{uuid.uuid1()}", need_content_analysis=need_content_analysis)) - def __add_content_from_html(self, message: Message, lines: list, tables: list) -> None: + def __add_content_from_html(self, message: Message, lines: list, tables: list, parameters: dict) -> None: payload = message.get_payload(decode=True) if payload is None: return @@ -143,7 +140,7 @@ def __add_content_from_html(self, message: Message, lines: list, tables: list) - file.write(payload) file.flush() - document = self.html_reader.read(path=file.name) + document = self.html_reader.read(file_path=file.name, parameters=parameters) part_messages = [line for line in document.lines if line.line is not None] for line in part_messages: line._line += "\n" diff --git 
a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py index e846c0ca..91501e97 100644 --- a/dedoc/readers/excel_reader/excel_reader.py +++ b/dedoc/readers/excel_reader/excel_reader.py @@ -1,4 +1,3 @@ -import os from typing import Optional import xlrd @@ -12,6 +11,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension xlrd.xlsx.ensure_elementtree_imported(False, None) xlrd.xlsx.Element_has_iter = True @@ -22,30 +22,33 @@ class ExcelReader(BaseReader): This class is used for parsing documents with .xlsx extension. Please use :class:`~dedoc.converters.ExcelConverter` for getting xlsx file from similar formats. """ - def __init__(self) -> None: - self.attachment_extractor = ExcelAttachmentsExtractor() - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = ExcelAttachmentsExtractor(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ This method extracts tables and attachments from the document, `lines` attribute remains empty. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - with xlrd.open_workbook(path) as book: + with xlrd.open_workbook(file_path) as book: sheets_num = book.nsheets tables = [] for sheet_num in range(sheets_num): sheet = book.sheet_by_index(sheet_num) tables.append(self.__parse_sheet(sheet_num, sheet)) if self.attachment_extractor.with_attachments(parameters=parameters): - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) else: attachments = [] return UnstructuredDocument(lines=[], tables=tables, attachments=attachments, warnings=[]) diff --git a/dedoc/readers/html2pdf_reader/html2pdf_reader.py b/dedoc/readers/html2pdf_reader/html2pdf_reader.py index 678ba53a..f18cbf16 100644 --- a/dedoc/readers/html2pdf_reader/html2pdf_reader.py +++ b/dedoc/readers/html2pdf_reader/html2pdf_reader.py @@ -1,4 +1,3 @@ -import logging import os import re from copy import deepcopy @@ -19,22 +18,20 @@ class Html2PdfReader(HtmlReader): - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.pdf_reader = PdfTxtlayerReader(config=config) - self.config = config - self.logger = 
config.get("logger", logging.getLogger()) + self.pdf_reader = PdfTxtlayerReader(config=self.config) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: parameters = {} if parameters is None else parameters with TemporaryDirectory() as tmp_dir: - modified_path, tables = self._modify_html(path, tmp_dir) - converted_path = os.path.join(tmp_dir, os.path.basename(path).replace(".html", ".pdf")) + modified_path, tables = self._modify_html(file_path, tmp_dir) + converted_path = os.path.join(tmp_dir, os.path.basename(file_path).replace(".html", ".pdf")) HTML(filename=modified_path).write_pdf(converted_path) self.logger.info(f"Convert {modified_path} to {converted_path}") parameters_new = deepcopy(parameters) parameters_new["pdf_with_text_layer"] = "true" - unstructured_document = self.pdf_reader.read(path=converted_path, document_type=document_type, parameters=parameters_new) + unstructured_document = self.pdf_reader.read(file_path=converted_path, parameters=parameters_new) return self._add_tables(document=unstructured_document, tables=tables) diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index fe97614a..0e4e0a45 100644 --- a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -1,5 +1,4 @@ import hashlib -import logging import string from typing import List, Optional, Union @@ -17,7 +16,7 @@ from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser from dedoc.readers.html_reader.html_tags import HtmlTags -from dedoc.utils.utils import calculate_file_hash +from dedoc.utils.utils import calculate_file_hash, get_mime_extension class HtmlReader(BaseReader): @@ -25,34 +24,31 @@ class HtmlReader(BaseReader): This 
reader allows to handle documents with the following extensions: .html, .shtml """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.postprocessor = HtmlLinePostprocessing() self.tag_annotation_parser = HtmlTagAnnotationParser() - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in [".html", ".shtml"] or mime in ["text/html"] - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines and tables, attachments remain empty. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - with open(path, "rb") as f: + with open(file_path, "rb") as f: soup = BeautifulSoup(f.read(), "html.parser") handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true" - path_hash = calculate_file_hash(path=path) + path_hash = calculate_file_hash(path=file_path) lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table) tables = [ self._read_table(table, path_hash) for table in soup.find_all("table") if self._visible_table(table, handle_invisible_table=handle_invisible_table) diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py index 7f3cb2f2..f408674f 100644 --- a/dedoc/readers/json_reader/json_reader.py +++ b/dedoc/readers/json_reader/json_reader.py @@ -1,4 +1,3 @@ -import os from json import JSONDecodeError from typing import Any, List, Optional @@ -12,24 +11,27 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class JsonReader(BaseReader): """ This reader allows handle json files. """ - def __init__(self) -> None: - super().__init__() - self.attachment_extractor = JsonAttachmentsExtractor() - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = JsonAttachmentsExtractor(config=self.config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (it has .json extension). 
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(".json") - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines and attachments, tables remain empty. This reader considers json lists as list items and adds this information to the `tag_hierarchy_level` @@ -38,7 +40,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - with open(path) as file: + with open(file_path) as file: try: json_data = json.load(file) except (JSONDecodeError, ValueError): @@ -51,7 +53,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio except (JSONDecodeError, ValueError): raise BadParametersError(f"can't read html_fields {fields}") json_data = self.__exclude_html_fields(json_data, key_fields) - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) else: attachments = [] diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index 79b5d7bd..ea980dec 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -1,6 +1,5 @@ import email import gzip -import logging import os import uuid from typing import List, Optional, Tuple @@ -13,32 +12,30 @@ from 
dedoc.readers.base_reader import BaseReader from dedoc.readers.html_reader.html_reader import HtmlReader from dedoc.utils import supported_image_types -from dedoc.utils.utils import check_filename_length, get_encoding, save_data_to_unique_file +from dedoc.utils.utils import check_filename_length, get_encoding, get_mime_extension, save_data_to_unique_file class MhtmlReader(BaseReader): """ This reader can process files with the following extensions: .mhtml, .mht, .mhtml.gz, .mht.gz """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.mhtml_extensions = [".mhtml", ".mht"] self.mhtml_extensions += [f"{extension}.gz" for extension in self.mhtml_extensions] self.mhtml_extensions = tuple(self.mhtml_extensions) - self.html_reader = HtmlReader(config=config) + self.html_reader = HtmlReader(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(tuple(self.mhtml_extensions)) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. @@ -46,15 +43,15 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio """ parameters = {} if parameters is None else parameters attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir - names_list, original_names_list = self.__extract_files(path=path, save_dir=attachments_dir) + names_list, original_names_list = self.__extract_files(path=file_path, save_dir=attachments_dir) names_html = self.__find_html(names_list=names_list) lines = [] tables = [] for html_file in names_html: - result = self.html_reader.read(path=html_file, parameters=parameters, document_type=document_type) + result = self.html_reader.read(file_path=html_file, parameters=parameters) lines.extend(result.lines) tables.extend(result.tables) diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py index 350e3bf0..836a98bb 100644 --- a/dedoc/readers/note_reader/note_reader.py +++ b/dedoc/readers/note_reader/note_reader.py @@ -1,4 +1,3 @@ -import logging import os import pickle from typing import Optional @@ -7,34 +6,33 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument 
from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class NoteReader(BaseReader): """ This class is used for parsing documents with .note.pickle extension. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ - self.config = config - self.logger = config.get("logger", logging.getLogger()) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith(".note.pickle") - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ try: - with open(path, "rb") as infile: + with open(file_path, "rb") as infile: note_dict = pickle.load(infile) text = note_dict["content"] if isinstance(text, bytes): @@ -44,5 +42,5 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return unstructured except Exception as e: - self.logger.warning(f"Can't handle {path}\n{e}") - raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(path)}. 
Seems note-format is broken") + self.logger.warning(f"Can't handle {file_path}\n{e}") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(file_path)}. Seems note-format is broken") diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index 6ed650ef..c91cc779 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -1,5 +1,4 @@ import copy -import logging import os from itertools import chain from typing import Optional @@ -14,6 +13,7 @@ from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer +from dedoc.utils.utils import get_mime_extension class PdfAutoReader(BaseReader): @@ -28,55 +28,49 @@ class PdfAutoReader(BaseReader): * if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction. - For more information, look to `pdf_with_text_layer` option description in the table :ref:`table_parameters`. + For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. 
logger for logging - """ - self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config) - self.pdf_tabby_reader = PdfTabbyReader(config=config) - self.pdf_image_reader = PdfImageReader(config=config) - self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config) - - self.config = config - self.logger = config.get("logger", logging.getLogger()) + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.pdf_txtlayer_reader = PdfTxtlayerReader(config=self.config) + self.pdf_tabby_reader = PdfTabbyReader(config=self.config) + self.pdf_image_reader = PdfImageReader(config=self.config) + self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (PDF format is supported only). This method returns `True` only when the key `pdf_with_text_layer` with value `auto` or `auto_tabby` is set in the dictionary `parameters`. It is recommended to use `pdf_with_text_layer=auto_tabby` because it's faster and allows to get better results. - You can look to the table :ref:`table_parameters` to get more information about `parameters` dictionary possible arguments. - - Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. + You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" - if mime not in recognized_mimes.pdf_like_format: + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + if not (mime in recognized_mimes.pdf_like_format or extension.lower() == ".pdf"): return False parameters = {} if parameters is None else parameters - pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters) - return pdf_with_txt_layer in ("auto", "auto_tabby") + return get_param_pdf_with_txt_layer(parameters) in ("auto", "auto_tabby") - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" warnings = [] - txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=path, parameters=parameters) + txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters) if txtlayer_parameters.is_correct_text_layer: result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct, parameters=parameters, - path=path, + path=file_path, warnings=warnings) else: - result = self.__handle_incorrect_text_layer(parameters, path, warnings) + result = self.__handle_incorrect_text_layer(parameters, file_path, warnings) result.warnings.extend(warnings) return result @@ -84,7 +78,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument: self.logger.info(f"Assume document {os.path.basename(path)} has incorrect textual layer") warnings.append("Assume document has incorrect textual layer") - result = self.pdf_image_reader.read(path=path, parameters=parameters_copy) + result = self.pdf_image_reader.read(file_path=path, parameters=parameters_copy) return result def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: dict, path: str, warnings: list) -> UnstructuredDocument: @@ -99,14 +93,14 @@ def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: d # GET THE FIRST PAGE: recognize the first page like a scanned page scan_parameters = self.__preparing_first_page_parameters(parameters) - recognized_first_page = self.pdf_image_reader.read(path=path, parameters=scan_parameters) + recognized_first_page = self.pdf_image_reader.read(file_path=path, parameters=scan_parameters) # PREPARE PARAMETERS: from the second page we recognize the content like PDF with a textual layer parameters = self.__preparing_other_pages_parameters(parameters) pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters) reader = 
self.pdf_txtlayer_reader if pdf_with_txt_layer == "auto" else self.pdf_tabby_reader - result = reader.read(path=path, parameters=parameters) + result = reader.read(file_path=path, parameters=parameters) result = self.__merge_documents(recognized_first_page, result) if recognized_first_page is not None else result return result diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 66c2be25..d52e0d3c 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -1,4 +1,3 @@ -import logging import math import os from abc import abstractmethod @@ -39,7 +38,6 @@ "orient_cell_angle", "is_one_column_document", "document_orientation", - "document_type", "language", "need_header_footers_analysis", "need_pdf_table_analysis", @@ -55,29 +53,28 @@ class PdfBaseReader(BaseReader): """ Base class for pdf documents parsing. """ - def __init__(self, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. 
logger for logging - """ - config["n_jobs"] = config.get("n_jobs", 1) - self.table_recognizer = TableRecognizer(config=config) - self.metadata_extractor = LineMetadataExtractor(config=config) - self.config = config - self.logger = config.get("logger", logging.getLogger()) - self.attachment_extractor = PDFAttachmentsExtractor(config=config) - self.linker = LineObjectLinker(config=config) - self.paragraph_extractor = ScanParagraphClassifierExtractor(config=config) - - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.config["n_jobs"] = config.get("n_jobs", 1) + self.table_recognizer = TableRecognizer(config=self.config) + self.metadata_extractor = LineMetadataExtractor(config=self.config) + self.attachment_extractor = PDFAttachmentsExtractor(config=self.config) + self.linker = LineObjectLinker(config=self.config) + self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config) + + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. + + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" parameters = {} if parameters is None else parameters first_page, last_page = param_utils.get_param_page_slice(parameters) attachments_dir = parameters.get("attachments_dir", None) - attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir + attachments_dir = os.path.dirname(file_path) if attachments_dir is None else attachments_dir params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), @@ -85,7 +82,6 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), is_one_column_document=param_utils.get_param_is_one_column_document(parameters), document_orientation=param_utils.get_param_document_orientation(parameters), - document_type=document_type, need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), need_pdf_table_analysis=param_utils.get_param_need_pdf_table_analysis(parameters), first_page=first_page, @@ -95,7 +91,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio attachments_dir=attachments_dir ) - lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse) + lines, scan_tables, attachments, warnings, other_fields = self._parse_document(file_path, params_for_parse) tables = [] for scan_table in scan_tables: metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle) @@ -103,10 +99,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio table = Table(metadata=metadata, cells=cells_with_meta) tables.append(table) - if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters): - tmp_dir = os.path.dirname(path) - file_name = os.path.basename(path) - attachments += self.attachment_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, 
parameters=parameters) + if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): + attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=other_fields) return self._postprocess(result) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index ad5f1335..2daede1b 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -1,4 +1,3 @@ -import logging import os from datetime import datetime from typing import List, Optional, Tuple @@ -18,6 +17,7 @@ from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox from dedoc.utils import supported_image_types from dedoc.utils.parameter_utils import get_path_param +from dedoc.utils.utils import get_mime_extension class PdfImageReader(PdfBaseReader): @@ -42,25 +42,23 @@ class PdfImageReader(PdfBaseReader): It isn't recommended to use this reader for extracting content from PDF documents with a correct textual layer, use other PDF readers instead. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. 
logger for logging - """ + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.scew_corrector = SkewCorrector() + self.skew_corrector = SkewCorrector() self.column_orientation_classifier = ColumnsOrientationClassifier(on_gpu=self.config.get("on_gpu", False), - checkpoint_path=get_config()["resources_path"], config=config) + checkpoint_path=get_config()["resources_path"], config=self.config) self.binarizer = AdaptiveBinarizer() - self.ocr = OCRLineExtractor(config=config) - self.logger = config.get("logger", logging.getLogger()) + self.ocr = OCRLineExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader, i.e. it has .pdf extension, or it is an image. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return mime in recognized_mimes.pdf_like_format or mime in recognized_mimes.image_like_format or \ - path.lower().endswith(tuple(recognized_extensions.image_like_format)) or extension.lower().replace(".", "") in supported_image_types + file_path.lower().endswith(tuple(recognized_extensions.image_like_format)) or extension.lower().replace(".", "") in supported_image_types def _process_one_page(self, image: np.ndarray, @@ -119,7 +117,7 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa angle = angle if parameters.document_orientation is None else 0 self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}") - rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle}) + rotated_image, result_angle = self.skew_corrector.preprocess(image, {"orientation_angle": angle}) result_angle = result_angle["rotated_angle"] if self.config.get("debug_mode", False): diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index c204bf48..0edc191c 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -1,5 +1,4 @@ import json -import logging import math import os import shutil @@ -27,6 +26,7 @@ from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument +from dedoc.extensions import recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -34,9 +34,9 @@ from 
dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth -from dedoc.utils.parameter_utils import get_param_page_slice +from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer from dedoc.utils.pdf_utils import get_pdf_page_count -from dedoc.utils.utils import calculate_file_hash, get_unique_name +from dedoc.utils.utils import calculate_file_hash, get_mime_extension, get_unique_name class PdfTabbyReader(PdfBaseReader): @@ -46,50 +46,46 @@ class PdfTabbyReader(PdfBaseReader): It is recommended to use this class as a handler for PDF documents with a correct textual layer if you don't need to check textual layer correctness. - For more information, look to `pdf_with_text_layer` option description in the table :ref:`table_parameters`. + For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.config = config - self.logger = config.get("logger", logging.getLogger()) self.tabby_java_version = "2.0.0" self.jar_name = "ispras_tbl_extr.jar" self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) self.java_not_found_error = "`java` command is not found from this Python process. 
Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (PDF format is supported only). This method returns `True` only when the key `pdf_with_text_layer` with value `tabby` is set in the dictionary `parameters`. - You can look to the table :ref:`table_parameters` to get more information about `parameters` dictionary possible arguments. + You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - return extension.endswith("pdf") and (str(parameters.get("pdf_with_text_layer", "false")).lower() == "tabby") + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return (mime in recognized_mimes.pdf_like_format or extension.lower().endswith("pdf")) and get_param_pdf_with_txt_layer(parameters) == "tabby" - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
+ + You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. """ parameters = {} if parameters is None else parameters warnings = [] - lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=path, parameters=parameters, warnings=warnings) + lines, tables, tables_on_images, image_attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings) lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=image_attachments) attachments = image_attachments - if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters): - tmp_dir = os.path.dirname(path) - file_name = os.path.basename(path) - attachments += self.attachment_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters=parameters) + if self._can_contain_attachements(file_path) and self.attachment_extractor.with_attachments(parameters): + attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) lines = [line for line_group in lines for line in line_group.split("\n")] lines = self.paragraph_extractor.extract(lines) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 16a49ca4..c0e99c43 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -4,12 +4,15 @@ import numpy as np from dedocutils.data_structures import BBox +from dedoc.extensions import recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import 
ParametersForParseDoc, PdfBaseReader from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox +from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer +from dedoc.utils.utils import get_mime_extension class PdfTxtlayerReader(PdfBaseReader): @@ -17,27 +20,25 @@ class PdfTxtlayerReader(PdfBaseReader): This class allows to extract content (text, tables, attachments) from the .pdf documents with a textual layer (copyable documents). It uses a pdfminer library for content extraction. - For more information, look to `pdf_with_text_layer` option description in the table :ref:`table_parameters`. + For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`. """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) - self.extractor_layer = PdfminerExtractor(config=config) + self.extractor_layer = PdfminerExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader (PDF format is supported only). This method returns `True` only when the key `pdf_with_text_layer` with value `true` is set in the dictionary `parameters`. - You can look to the table :ref:`table_parameters` to get more information about `parameters` dictionary possible arguments. + You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - return extension.lower().endswith("pdf") and (str(parameters.get("pdf_with_text_layer", "false")).lower() == "true") + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return (mime in recognized_mimes.pdf_like_format or extension.lower().endswith("pdf")) and get_param_pdf_with_txt_layer(parameters) == "true" def _process_one_page(self, image: np.ndarray, @@ -72,7 +73,7 @@ def _process_one_page(self, def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None: """ - Change table boxes's width height into pdf space like textual lines + Change table boxes' width height into pdf space like textual lines """ for table in tables: diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index 0428ae56..e6340251 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -1,4 +1,3 @@ -import os from typing import Optional from pptx import Presentation @@ -12,6 +11,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class PptxReader(BaseReader): @@ -20,23 +20,25 @@ class PptxReader(BaseReader): Please use :class:`~dedoc.converters.PptxConverter` for getting pptx file from similar formats. 
""" - def __init__(self) -> None: - self.attachments_extractor = PptxAttachmentsExtractor() + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachments_extractor = PptxAttachmentsExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: str = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters - prs = Presentation(path) + prs = Presentation(file_path) lines, tables = [], [] for page_id, slide in enumerate(prs.slides, start=1): @@ -53,6 +55,6 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id))) - attachments = self.attachments_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[]) diff --git a/dedoc/readers/reader_composition.py b/dedoc/readers/reader_composition.py index 7b1c9bcd..9cf0aec3 100644 --- a/dedoc/readers/reader_composition.py +++ b/dedoc/readers/reader_composition.py @@ -1,11 +1,10 @@ import os -from typing import Dict, List +from typing import List, Optional from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import get_file_mime_type, splitext_ +from dedoc.utils.utils import get_mime_extension class ReaderComposition(object): @@ -21,30 +20,24 @@ def __init__(self, readers: List[BaseReader]) -> None: """ self.readers = readers - def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Get intermediate representation for the document of any format which one of the available readers can parse. If there is no suitable reader for the given document, the BadFileFormatException will be raised. 
- :param tmp_dir: the directory where the file is located - :param filename: name of the given file - :param parameters: dict with additional parameters for document reader (as language for scans or delimiter for csv) + :param file_path: path of the file to be parsed + :param parameters: dict with additional parameters for document readers, see :ref:`parameters_description` for more details :return: intermediate representation of the document with lines, tables and attachments """ - name, extension = splitext_(filename) - file_path = os.path.join(tmp_dir, filename) - mime = get_file_mime_type(file_path) - document_type = parameters.get("document_type") + file_name = os.path.basename(file_path) + extension, mime = get_mime_extension(file_path=file_path) for reader in self.readers: - can_read = reader.can_read(path=file_path, mime=mime, extension=extension, document_type=document_type, parameters=parameters) - - if can_read: - unstructured_document = reader.read(path=file_path, document_type=document_type, parameters=parameters) - assert len(unstructured_document.lines) == 0 or isinstance(unstructured_document.lines[0], LineWithMeta) + if reader.can_read(file_path=file_path, mime=mime, extension=extension, parameters=parameters): + unstructured_document = reader.read(file_path=file_path, parameters=parameters) return unstructured_document raise BadFileFormatError( - msg=f"No one can read file: name = {filename}, extension = {extension}, mime = {mime}, document type = {document_type}", - msg_api=f"Unsupported file format {mime} of the input file {filename}" + msg=f"No one can read file: name = {file_name}, extension = {extension}, mime = {mime}", + msg_api=f"Unsupported file format {mime} of the input file {file_name}" ) diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py index d52ae567..33ffe656 100644 --- a/dedoc/readers/txt_reader/raw_text_reader.py +++ b/dedoc/readers/txt_reader/raw_text_reader.py @@ -1,6 +1,5 @@ 
import codecs import gzip -import logging import re import time from typing import Iterable, List, Optional, Tuple @@ -14,36 +13,34 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor -from dedoc.utils.utils import calculate_file_hash, get_encoding +from dedoc.utils.utils import calculate_file_hash, get_encoding, get_mime_extension class RawTextReader(BaseReader): """ This class allows to parse files with the following extensions: .txt, .txt.gz """ - def __init__(self, *, config: dict) -> None: - """ - :param config: configuration of the reader, e.g. logger for logging - """ + + def __init__(self, *, config: Optional[dict] = None) -> None: + super().__init__(config=config) self.space_regexp = re.compile(r"^\s+") - self.config = config - self.logger = config.get("logger", logging.getLogger()) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension.lower().endswith((".txt", "txt.gz")) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions. 
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - encoding = self.__get_encoding(path=path, parameters=parameters) - lines = self._get_lines_with_meta(path=path, encoding=encoding) + encoding = self.__get_encoding(path=file_path, parameters=parameters) + lines = self._get_lines_with_meta(path=file_path, encoding=encoding) encoding_warning = f"encoding is {encoding}" result = UnstructuredDocument(lines=lines, tables=[], attachments=[], warnings=[encoding_warning]) return self._postprocess(result) diff --git a/dedoc/scripts/benchmark_pdf_attachments.py b/dedoc/scripts/benchmark_pdf_attachments.py index 5dcafd7e..5b433117 100644 --- a/dedoc/scripts/benchmark_pdf_attachments.py +++ b/dedoc/scripts/benchmark_pdf_attachments.py @@ -57,7 +57,7 @@ def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_d with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, file_name) shutil.copy(os.path.join(input_dir, file_name), file_path) - attachments = attachments_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters={}) + attachments = attachments_extractor.extract(file_path=file_path) os.remove(file_path) file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_")) diff --git a/dedoc/scripts/create_txtlayer_dataset.py b/dedoc/scripts/create_txtlayer_dataset.py index 744307b6..84fafa28 100644 --- a/dedoc/scripts/create_txtlayer_dataset.py +++ b/dedoc/scripts/create_txtlayer_dataset.py @@ -128,10 +128,10 @@ def corrupt(self, text: str, lang: str) -> str: # 2 - read text from the image using OCR with another language lines = [] for image_path in images_path_list: - document = self.image_reader.read(image_path, document_type=None, parameters=dict(language=ocr_lang, - need_pdf_table_analysis="false", - document_orientation="no_change", - is_one_column_document="true")) + 
document = self.image_reader.read(image_path, parameters=dict(language=ocr_lang, + need_pdf_table_analysis="false", + document_orientation="no_change", + is_one_column_document="true")) lines.extend(document.lines) return "".join([line.line for line in lines]) diff --git a/dedoc/structure_constructors/abstract_structure_constructor.py b/dedoc/structure_constructors/abstract_structure_constructor.py index bda3e927..063e6682 100644 --- a/dedoc/structure_constructors/abstract_structure_constructor.py +++ b/dedoc/structure_constructors/abstract_structure_constructor.py @@ -15,17 +15,17 @@ class AbstractStructureConstructor(ABC): that are retrieved with the help of some structure extractor. The order of the document lines and their hierarchy can be represented in different ways, e.g. standard tree of lines hierarchy. - Also some other custom structure can be defined by the specific constructor. + Also, some other custom structure can be defined by the specific constructor. """ @abstractmethod - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Process unstructured document and build parsed document representation on this basis. 
:param document: intermediate representation of the document received from some structure extractor \ (there should be filled hierarchy levels for all lines) - :param structure_type: type of the structure that should be retrieved for the document + :param parameters: additional parameters for document parsing, see :ref:`structure_type_parameters` for more details :return: the structured representation of the given document """ pass diff --git a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py index c7160d1f..86e2522e 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py @@ -14,7 +14,7 @@ class LinearConstructor(AbstractStructureConstructor): The result contains the empty root node with the consecutive list of all document lines as its children. """ - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Build the linear structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. 
diff --git a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py index ed1f3277..5c986c1b 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py @@ -33,7 +33,7 @@ class TreeConstructor(AbstractStructureConstructor): - **second child line (1, 0)** """ - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Build the tree structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. diff --git a/dedoc/structure_constructors/structure_constructor_composition.py b/dedoc/structure_constructors/structure_constructor_composition.py index d516fe72..eaf62e55 100644 --- a/dedoc/structure_constructors/structure_constructor_composition.py +++ b/dedoc/structure_constructors/structure_constructor_composition.py @@ -20,16 +20,18 @@ def __init__(self, constructors: Dict[str, AbstractStructureConstructor], defaul self.constructors = constructors self.default_constructor = default_constructor - def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None, parameters: Optional[dict] = None) -> ParsedDocument: + def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> ParsedDocument: """ Construct the result document structure according to the `structure_type` parameter. If `structure_type` is empty string or None the default constructor will be used. 
To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. """ + structure_type = parameters.get("structure_type") + if structure_type in self.constructors: - return self.constructors[structure_type].structure_document(document) + return self.constructors[structure_type].construct(document) if structure_type is None or structure_type == "": - return self.default_constructor.structure_document(document) + return self.default_constructor.construct(document) raise StructureExtractorError(f"Bad structure type {structure_type}, available structure types is: {' '.join(self.constructors.keys())}") diff --git a/dedoc/structure_extractors/abstract_structure_extractor.py b/dedoc/structure_extractors/abstract_structure_extractor.py index 83155800..4514d892 100644 --- a/dedoc/structure_extractors/abstract_structure_extractor.py +++ b/dedoc/structure_extractors/abstract_structure_extractor.py @@ -1,6 +1,7 @@ +import logging from abc import ABC, abstractmethod from copy import deepcopy -from typing import List +from typing import List, Optional from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation @@ -21,15 +22,21 @@ class AbstractStructureExtractor(ABC): The paragraph type of the line should be one of the predefined types for some certain document domain, e.g. header, list_item, raw_text, etc. Each concrete structure extractor defines the rules of structuring: the levels and possible types of the lines. """ + def __init__(self, *, config: Optional[dict] = None) -> None: + """ + :param config: configuration of the extractor, e.g. 
logger for logging + """ + self.config = {} if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) @abstractmethod - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ This method extracts structure for the document content received from some reader: it finds lines types and their hierarchy levels and adds them to the lines' metadata. :param document: document content that has been received from some of the readers - :param parameters: additional parameters for document parsing + :param parameters: additional parameters for document parsing, see :ref:`structure_type_parameters` for more details :return: document content with added additional information about lines types and hierarchy levels """ pass diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py index 0e4eba00..142982c2 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py @@ -1,6 +1,6 @@ import os from abc import ABC, abstractmethod -from typing import List, Tuple +from typing import List, Optional, Tuple from dedoc.config import get_config from dedoc.data_structures.hierarchy_level import HierarchyLevel @@ -20,19 +20,20 @@ class AbstractLawStructureExtractor(AbstractStructureExtractor, ABC): You can find the description of this type of structure in the section :ref:`law_structure`. 
""" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: some configuration for document parsing """ + super().__init__(config=config) path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=config) - self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=config) + self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config) + self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config) self.hierarchy_level_builders = [StubHierarchyLevelBuilder()] self.hl_type = "law" self.init_hl_depth = 1 self.except_words = {"приказ", "положение", "требования", "постановление", "перечень", "регламент", "закон"} - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract law structure from the given document and add additional information to the lines' metadata. 
To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py index 8a9dc4f8..324f4622 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py @@ -1,4 +1,3 @@ -import logging import re from abc import ABC from collections import OrderedDict @@ -56,13 +55,13 @@ class ClassifyingLawStructureExtractor(AbstractStructureExtractor, ABC): """ document_type = "law" - def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config: dict) -> None: + def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config: Optional[dict] = None) -> None: """ :param extractors: mapping law_type -> structure extractor, defined for certain law types :param config: configuration of the extractor, e.g. logger for logging """ + super().__init__(config=config) self.extractors = extractors - self.logger = config.get("logger", logging.getLogger()) self.hat_batch_size = 3 self.hat_batch_count = 7 @@ -106,14 +105,15 @@ def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config: instruction_ws = self.__add_whitespace_match("инструкция") self.main_templates[LawDocType.instruction] = {rf"\b{instruction_ws}\b"} - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Classify law kind and extract structure according to the specific law format. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. 
""" + parameters = {} if parameters is None else parameters selected_extractor = self._predict_extractor(lines=document.lines) - result = selected_extractor.extract_structure(document, parameters) + result = selected_extractor.extract(document, parameters) warning = f"Use {selected_extractor.document_type} classifier" result.warnings = result.warnings + [warning] return result diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index ed65170a..92a39fb0 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -22,7 +22,7 @@ class DefaultStructureExtractor(AbstractStructureExtractor): prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, LetterPrefix, BulletPrefix] - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract basic structure from the given document and add additional information to the lines' metadata. 
To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py index d1ce8818..c08674c6 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py @@ -1,6 +1,6 @@ import os import re -from typing import List +from typing import List, Optional from dedoc.config import get_config from dedoc.data_structures.line_with_meta import LineWithMeta @@ -21,19 +21,20 @@ class DiplomaStructureExtractor(AbstractStructureExtractor): """ document_type = "diploma" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: some configuration for document parsing """ + super().__init__(config=config) self.toc_extractor = TOCFeatureExtractor() self.header_builder = HeaderHierarchyLevelBuilder() self.toc_builder = TocBuilder() self.body_builder = DiplomaBodyBuilder() path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=config) + self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config) self.footnote_start_regexp = re.compile(r"^\d+ ") - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract diploma structure from the given document and add additional information to the lines' metadata. 
To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py index 47c1bb2c..549603dc 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor @@ -18,7 +18,7 @@ class FoivLawStructureExtractor(AbstractLawStructureExtractor): """ document_type = "foiv_law" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) self.hierarchy_level_builders = [ HeaderHierarchyLevelBuilder(), diff --git a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py index f2bed5eb..f360011a 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py @@ -1,5 +1,5 @@ import re -from typing import List +from typing import List, Optional from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor @@ -19,7 +19,7 @@ class LawStructureExtractor(AbstractLawStructureExtractor): """ document_type = "law" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: 
super().__init__(config=config) self.hierarchy_level_builders = [ HeaderHierarchyLevelBuilder(), diff --git a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py index 8dd76083..4c9c0993 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py @@ -1,4 +1,5 @@ import os +from typing import Optional from dedoc.config import get_config from dedoc.data_structures.unstructured_document import UnstructuredDocument @@ -20,18 +21,19 @@ class TzStructureExtractor(AbstractStructureExtractor): """ document_type = "tz" - def __init__(self, *, config: dict) -> None: + def __init__(self, *, config: Optional[dict] = None) -> None: """ :param config: some configuration for document parsing """ + super().__init__(config=config) self.header_builder = HeaderHierarchyLevelBuilder() self.body_builder = TzBodyBuilder() self.toc_builder = TocBuilder() path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=config) - self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=config) + self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config) + self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config) - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract technical task structure from the given document and add 
additional information to the lines' metadata. To get the information about the method's parameters look at the documentation of the class \ diff --git a/dedoc/structure_extractors/structure_extractor_composition.py b/dedoc/structure_extractors/structure_extractor_composition.py index 85453132..6160a35e 100644 --- a/dedoc/structure_extractors/structure_extractor_composition.py +++ b/dedoc/structure_extractors/structure_extractor_composition.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Optional from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor @@ -10,23 +10,25 @@ class StructureExtractorComposition(AbstractStructureExtractor): The list of structure extractors and names of document types for them is set via the class constructor. Each document type defines some specific document domain, those structure is extracted via the corresponding structure extractor. """ - def __init__(self, extractors: Dict[str, AbstractStructureExtractor], default_key: str) -> None: + def __init__(self, extractors: Dict[str, AbstractStructureExtractor], default_key: str, *, config: Optional[dict] = None) -> None: """ :param extractors: mapping document_type -> structure extractor, defined for certain document domains :param default_key: the document_type of the structure extractor, that will be used by default if the wrong parameters are given. \ default_key should exist as a key in the extractors' dictionary. 
""" + super().__init__(config=config) assert default_key in extractors self.extractors = extractors self.default_extractor_key = default_key - def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: + def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Adds information about the document structure according to the document type received from parameters (the key `document_type`). If there isn't `document_type` key in parameters or this document_type isn't found in the supported types, the default extractor will be used. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. """ + parameters = {} if parameters is None else parameters document_type = parameters.get("document_type", self.default_extractor_key) extractor = self.extractors.get(document_type, self.extractors[self.default_extractor_key]) - return extractor.extract_structure(document, parameters) + return extractor.extract(document, parameters) diff --git a/dedoc/train_dataset/train_dataset_utils.py b/dedoc/train_dataset/train_dataset_utils.py index 70ca3725..aec0a9d5 100644 --- a/dedoc/train_dataset/train_dataset_utils.py +++ b/dedoc/train_dataset/train_dataset_utils.py @@ -8,9 +8,6 @@ import numpy as np from PIL.Image import Image -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox - def __to_pil(image: np.ndarray) -> Image: return PIL.Image.fromarray(image) @@ -33,7 +30,7 @@ def _get_images_path(config: dict, document_name: str) -> str: return os.path.join(get_path_original_documents(config), document_name.split(".")[0]) -def save_page_with_bbox(page: PageWithBBox, document_name: str, *, config: dict) -> None: +def save_page_with_bbox(page: "PageWithBBox", document_name: str, *, config: dict) -> None: # 
noqa __create_images_path(config) uid = document_name images_path = _get_images_path(config=config, document_name=document_name) @@ -63,7 +60,7 @@ def _convert2zip(config: dict, document_name: str) -> str: return archive_filename -def save_line_with_meta(lines: List[LineWithMeta], original_document: str, *, config: dict) -> None: +def save_line_with_meta(lines: List["LineWithMeta"], original_document: str, *, config: dict) -> None: # noqa __create_images_path(config) if original_document.endswith((".jpg", ".png", ".pdf")): diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index 126e1d6a..f7f0a090 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -22,28 +22,28 @@ def get_param_language(parameters: Optional[dict]) -> str: def get_param_orient_analysis_cells(parameters: Optional[dict]) -> bool: if parameters is None: return False - orient_analysis_cells = parameters.get("orient_analysis_cells", "False").lower() == "true" + orient_analysis_cells = str(parameters.get("orient_analysis_cells", "False")).lower() == "true" return orient_analysis_cells def get_param_need_header_footers_analysis(parameters: Optional[dict]) -> bool: if parameters is None: return False - need_header_footers_analysis = parameters.get("need_header_footer_analysis", "False").lower() == "true" + need_header_footers_analysis = str(parameters.get("need_header_footer_analysis", "False")).lower() == "true" return need_header_footers_analysis def get_param_need_pdf_table_analysis(parameters: Optional[dict]) -> bool: if parameters is None: return False - need_pdf_table_analysis = parameters.get("need_pdf_table_analysis", "True").lower() == "true" + need_pdf_table_analysis = str(parameters.get("need_pdf_table_analysis", "True")).lower() == "true" return need_pdf_table_analysis def get_param_need_binarization(parameters: Optional[dict]) -> bool: if parameters is None: return False - need_binarization = parameters.get("need_binarization", 
"False").lower() == "true" + need_binarization = str(parameters.get("need_binarization", "False")).lower() == "true" return need_binarization @@ -51,7 +51,7 @@ def get_param_orient_cell_angle(parameters: Optional[dict]) -> int: if parameters is None: return 90 - orient_cell_angle = parameters.get("orient_cell_angle", "90") + orient_cell_angle = str(parameters.get("orient_cell_angle", "90")) if orient_cell_angle == "": orient_cell_angle = "90" return int(orient_cell_angle) @@ -78,28 +78,13 @@ def get_param_document_orientation(parameters: Optional[dict]) -> Optional[bool] return None -def get_param_project(parameters: Optional[dict]) -> str: - if parameters is None: - return "docreader_project" - project = str(parameters.get("project", "docreader_project")).lower() - return project - - def get_param_pdf_with_txt_layer(parameters: Optional[dict]) -> str: if parameters is None: - return "false" - pdf_with_txt_layer = str(parameters.get("pdf_with_text_layer", "false")).lower() + return "auto_tabby" + pdf_with_txt_layer = str(parameters.get("pdf_with_text_layer", "auto_tabby")).lower() return pdf_with_txt_layer -def get_param_image_document_page(parameters: Optional[dict]) -> str: - if parameters is None: - return "" - - image_document_page = str(parameters.get("image_document_page", "")) - return image_document_page - - def get_param_table_type(parameters: Optional[dict]) -> str: if parameters is None: return "" diff --git a/dedoc/utils/utils.py b/dedoc/utils/utils.py index 4b9e2d40..003e6829 100644 --- a/dedoc/utils/utils.py +++ b/dedoc/utils/utils.py @@ -70,6 +70,21 @@ def splitext_(path: str) -> Tuple[str, str]: return name, f".{'.'.join(ext_list)}" +def get_mime_extension(file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None) -> Tuple[str, str]: + if mime is not None and extension is not None: + return mime, extension + + if file_path: + name, extension = splitext_(file_path) + mime = get_file_mime_type(file_path) + else: + 
assert mime is not None or extension is not None, "When file_path is None, mime or extension should be provided" + mime = "" if mime is None else mime + extension = "" if extension is None else extension + + return mime, extension + + def _text_from_item(item: dict) -> str: res = item.get("text", "") if "subparagraphs" in item: diff --git a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py index d23f9967..4ca9a336 100644 --- a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py @@ -1,4 +1,3 @@ -import mimetypes import os from djvu_converter import DjvuConverter @@ -6,33 +5,23 @@ from dedoc import DedocManager from dedoc.attachments_handler import AttachmentsHandler -from dedoc.converters import FileConverterComposition +from dedoc.converters import ConverterComposition from dedoc.metadata_extractors import BaseMetadataExtractor, DocxMetadataExtractor, MetadataExtractorComposition from dedoc.readers import ReaderComposition from dedoc.structure_constructors import LinearConstructor, StructureConstructorComposition, TreeConstructor from dedoc.structure_extractors import DefaultStructureExtractor, StructureExtractorComposition -file_dir, file_name = "test_dir", "The_New_Yorker_Case_Study.djvu" -file_path = os.path.join(file_dir, file_name) +file_path = "test_dir/The_New_Yorker_Case_Study.djvu" +djvu_converter = DjvuConverter() +djvu_converter.can_convert(file_path) # True +djvu_converter.convert(file_path) # 'test_dir/The_New_Yorker_Case_Study.pdf' -djvu_converter = DjvuConverter(config=dict()) pdf_reader = PdfReader() -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] - -djvu_converter.can_convert(file_extension, file_mime) # True -djvu_converter.do_convert(file_dir, name_wo_extension, file_extension) # 
'The_New_Yorker_Case_Study.pdf' - -file_dir, file_name = "test_dir", "pdf_with_attachment.pdf" -file_path = os.path.join(file_dir, file_name) - -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] -pdf_reader.can_read(file_path, file_mime, file_extension) # True - +file_path = "test_dir/pdf_with_attachment.pdf" +pdf_reader.can_read(file_path) # True pdf_reader.read(file_path, parameters={"with_attachments": "true"}) # document = pdf_reader.read(file_path, parameters={"with_attachments": "true"}) @@ -41,9 +30,8 @@ len(document.lines) # 11 """Adding the implemented handlers to the manager config""" -config = {} manager_config = dict( - converter=FileConverterComposition(converters=[DjvuConverter(config=config)]), + converter=ConverterComposition(converters=[DjvuConverter()]), reader=ReaderComposition(readers=[PdfReader()]), structure_extractor=StructureExtractorComposition(extractors={DefaultStructureExtractor.document_type: DefaultStructureExtractor()}, default_key="other"), structure_constructor=StructureConstructorComposition( @@ -51,10 +39,10 @@ default_constructor=LinearConstructor() ), document_metadata_extractor=MetadataExtractorComposition(extractors=[DocxMetadataExtractor(), BaseMetadataExtractor()]), - attachments_handler=AttachmentsHandler(config=config), + attachments_handler=AttachmentsHandler(), ) -manager = DedocManager(config=config, manager_config=manager_config) +manager = DedocManager(manager_config=manager_config) result = manager.parse(file_path=file_path, parameters={"with_attachments": "true"}) result # diff --git a/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py b/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py index 48c9eab7..b0069517 100644 --- a/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py +++ b/docs/source/_static/code_examples/dedoc_creating_dedoc_document.py @@ -110,6 +110,6 @@ } structure_constructor = 
TreeConstructor() -parsed_document = structure_constructor.structure_document(document=unstructured_document, structure_type="tree") +parsed_document = structure_constructor.construct(document=unstructured_document) parsed_document.to_api_schema().model_dump() diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 122ea40e..671a5ee6 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -1,6 +1,3 @@ -import mimetypes -import os - from dedoc import DedocManager from dedoc.attachments_extractors import DocxAttachmentsExtractor from dedoc.converters import DocxConverter @@ -10,27 +7,17 @@ from dedoc.structure_extractors import DefaultStructureExtractor """Using converters.""" -converter = DocxConverter(config={}) - -file_dir, file_name = "test_dir", "example.odt" -file_path = os.path.join(file_dir, file_name) - -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] +converter = DocxConverter() +file_path = "test_dir/example.odt" -converter.can_convert(file_extension, file_mime) # True -converter.do_convert(file_dir, name_wo_extension, file_extension) # 'example.docx' +converter.can_convert(file_path) # True +converter.convert(file_path) # 'test_dir/example.docx' """Using readers.""" -reader = DocxReader(config={}) - -file_dir, file_name = "test_dir", "example.docx" -file_path = os.path.join(file_dir, file_name) - -name_wo_extension, file_extension = os.path.splitext(file_name) -file_mime = mimetypes.guess_type(file_path)[0] -reader.can_read(file_path, file_mime, file_extension) # True +reader = DocxReader() +file_path = "test_dir/example.docx" +reader.can_read(file_path) # True reader.read(file_path, parameters={"with_attachments": "true"}) # document = reader.read(file_path, parameters={"with_attachments": "true"}) @@ -75,8 +62,8 @@ """Using 
metadata extractors""" metadata_extractor = DocxMetadataExtractor() -metadata_extractor.can_extract(file_dir, file_name, file_name, file_name) # True -document.metadata = metadata_extractor.extract_metadata(file_dir, file_name, file_name, file_name) +metadata_extractor.can_extract(file_path) # True +document.metadata = metadata_extractor.extract(file_path) document.metadata # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, # 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', # 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, @@ -85,21 +72,21 @@ """Using attachments extractors""" attachments_extractor = DocxAttachmentsExtractor() -attachments_extractor.can_extract(file_extension, file_mime) # True -attachments = attachments_extractor.get_attachments(file_dir, file_name, {}) +attachments_extractor.can_extract(file_path) # True +attachments = attachments_extractor.extract(file_path) attachments[0] # """Using structure extractors""" structure_extractor = DefaultStructureExtractor() document.lines[0].metadata.hierarchy_level # None -document = structure_extractor.extract_structure(document, {}) +document = structure_extractor.extract(document) document.lines[0].metadata.hierarchy_level # HierarchyLevel(level_1=1, level_2=1, can_be_multiline=False, line_type=header) """Using structure constructors""" constructor = TreeConstructor() -parsed_document = constructor.structure_document(document) +parsed_document = constructor.construct(document) parsed_document # list(vars(parsed_document)) # ['metadata', 'content', 'attachments', 'version', 'warnings'] @@ -110,7 +97,7 @@ """Run the whole pipeline""" manager = DedocManager() -result = manager.parse(file_path=file_path, parameters={}) +result = 
manager.parse(file_path=file_path) result # result.to_api_schema().model_dump() # {'content': {'structure': {'node_id': '0', 'text': '', 'annotations': [], 'metadata': {'paragraph_type': 'root', ... diff --git a/docs/source/_static/code_examples/djvu_converter.py b/docs/source/_static/code_examples/djvu_converter.py index eb31b5fe..192f889f 100644 --- a/docs/source/_static/code_examples/djvu_converter.py +++ b/docs/source/_static/code_examples/djvu_converter.py @@ -2,17 +2,27 @@ from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter +from dedoc.utils.utils import get_mime_extension, splitext_ class DjvuConverter(AbstractConverter): - def __init__(self, config: dict) -> None: + def __init__(self, config: Optional[dict] = None) -> None: super().__init__(config=config) - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension == ".djvu" - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: - os.system(f"ddjvu -format=pdf {tmp_dir}/{filename}{extension} {tmp_dir}/{filename}.pdf") - self._await_for_conversion(filename + ".pdf", tmp_dir) - return filename + ".pdf" + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: + file_dir, file_name = os.path.split(file_path) + name_wo_ext, _ = splitext_(file_name) + converted_file_path = os.path.join(file_dir, f"{name_wo_ext}.pdf") + command = ["ddjvu", "--format=pdf", file_path, converted_file_path] + self._run_subprocess(command=command, filename=file_name, expected_path=converted_file_path) + + return converted_file_path diff --git a/docs/source/_static/code_examples/pdf_attachment_extractor.py
b/docs/source/_static/code_examples/pdf_attachment_extractor.py index 1d966549..e28a7a2e 100644 --- a/docs/source/_static/code_examples/pdf_attachment_extractor.py +++ b/docs/source/_static/code_examples/pdf_attachment_extractor.py @@ -6,14 +6,21 @@ from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor from dedoc.data_structures import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes +from dedoc.utils.utils import get_mime_extension class PdfAttachmentsExtractor(AbstractAttachmentsExtractor): - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: - handler = open(os.path.join(tmpdir, filename), "rb") + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: + parameters = {} if parameters is None else parameters + handler = open(os.path.join(file_path), "rb") reader = PyPDF2.PdfFileReader(handler) catalog = reader.trailer["/Root"] attachments = [] @@ -27,5 +34,5 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ f_dict = filenames[data_index].getObject() f_data = f_dict["/EF"]["/F"].getData() attachments.append((name, f_data)) - attachments = self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=False, parameters=parameters) + attachments = self._content2attach_file(content=attachments, tmpdir=os.path.dirname(file_path), need_content_analysis=False, parameters=parameters) return attachments diff
--git a/docs/source/_static/code_examples/pdf_reader.py b/docs/source/_static/code_examples/pdf_reader.py index f8d032bc..b588ae65 100644 --- a/docs/source/_static/code_examples/pdf_reader.py +++ b/docs/source/_static/code_examples/pdf_reader.py @@ -1,4 +1,3 @@ -import os from typing import List, Optional import tabula @@ -12,20 +11,24 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.utils.utils import get_mime_extension class PdfReader(BaseReader): - def __init__(self) -> None: - self.attachment_extractor = PdfAttachmentsExtractor() + def __init__(self, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = PdfAttachmentsExtractor(config=self.config) - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: - return (extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format) and not document_type + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension) + return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: - lines = self.__process_lines(path) - tables = self.__process_tables(path) - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + parameters = {} if parameters is None else parameters + lines =
self.__process_lines(file_path) + tables = self.__process_tables(file_path) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments) def __process_tables(self, path: str) -> List[Table]: diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 1114cb87..9164721b 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -11,7 +11,7 @@ In the context of this tutorial, you'll need to include certain import statement .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 1-10 + :lines: 1-7 Using converters @@ -22,20 +22,20 @@ For this purpose one can use :class:`~dedoc.converters.DocxConverter` class: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 13 + :lines: 10 Method :meth:`~dedoc.converters.DocxConverter.can_convert` allows to check if the converter can convert the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 15-21 + :lines: 11-13 Since we have checked if the converter is able to convert the file, -we can convert it using :meth:`~dedoc.converters.DocxConverter.do_convert` method: +we can convert it using :meth:`~dedoc.converters.DocxConverter.convert` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 22 + :lines: 14 To get the information about available converters, their methods and parameters see :ref:`dedoc_converters`. The supported document formats that can be converted to another formats (which can be parsed by readers) are enlisted in the table :ref:`table_formats`. @@ -61,30 +61,30 @@ Assume we need to parse file :download:`example.docx <../_static/code_examples/t As we see, the file contains text of different styles, two tables and an attached image. 
To read the contents of this file in the intermediate representation (see :class:`~dedoc.data_structures.UnstructuredDocument`) -one can use :class:`~dedoc.converters.DocxReader` class: +one can use :class:`~dedoc.readers.DocxReader` class: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 25 + :lines: 17 Method :meth:`~dedoc.readers.DocxReader.can_read` allows to check if the reader can parse the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 27-32 + :lines: 18-20 Since we have checked if the reader is able to read the file, we can get its content (:class:`~dedoc.data_structures.UnstructuredDocument`) using :meth:`~dedoc.readers.DocxReader.read` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 34 + :lines: 21 Let's save the document in the variable and look at it in more detail: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 36-38 + :lines: 23-25 As we see, the document object has the following attributes: ``lines``, ``tables``, ``attachments``, ``metadata`` and ``warnings``. Document metadata is the empty dict on this stage, because it should be filled by one of the metadata extractors (see :ref:`dedoc_metadata_extractors` and :ref:`using_metadata_extractors`). @@ -100,20 +100,20 @@ We can get the text of any line: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 40 + :lines: 27 Also some of the readers can detect line types based of their styles, e.g.: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 41 + :lines: 28 Formatting of each line is stored in the ``annotations`` attribute: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 42-49 + :lines: 29-36 See :ref:`dedoc_data_structures` to get more information about main classes forming a document line. @@ -126,20 +126,20 @@ Each table is represented as a list of table rows, each row is a list of cells w .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 51-54 + :lines: 38-41 It also has metadata, containing table's unique identifier, rotation angle (if table has been rotated - for images) and so on. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 55-57 + :lines: 42-44 All tables have rectangular form, so if the cells are merged, in the intermediate representation they aren't and have the same contents. Use cells metadata for getting information about merged cells. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 58-63 + :lines: 45-50 As we see in the :ref:`docx_example_image`, the second table has some merged cells, e.g. in the first row. In the intermediate representation this row consists of two cells, and the second cell @@ -150,7 +150,7 @@ The unique identifier links the table with the previous non-empty line in the do .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 64-66 + :lines: 51-53 In the current example (:ref:`docx_example_image`), the line with the text "Bold, italic, small text." is the first non-empty line before the first table, so the table uid is linked to this line using :class:`~dedoc.data_structures.TableAnnotation`. @@ -164,7 +164,7 @@ In the :ref:`docx_example_image` there is an image attached to the file: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 68-71 + :lines: 55-58 The ``tmp_file_path`` contains the path to the image saved on disk, the image is saved in the same directory as the parent docx file. @@ -174,7 +174,7 @@ In our :ref:`docx_example_image` it is a line with text "More text.". .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 72-74 + :lines: 59-61 The annotation uid is linked to the line using :class:`~dedoc.data_structures.AttachAnnotation`. @@ -191,20 +191,20 @@ we can add some metadata using :class:`~dedoc.metadata_extractors.DocxMetadataEx .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 77 + :lines: 64 Method :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.can_extract` allows to check if the metadata extractor can extract metadata from the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 78 + :lines: 65 -To extract metadata, one can add them to the document using :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.extract_metadata` method. +To extract metadata, one can add them to the document using :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.extract` method. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 79-83 + :lines: 66-70 As we see, the attribute ``metadata`` has been filled with some metadata fields. The list of common fields for any metadata extractor along with the specific fields @@ -221,20 +221,20 @@ For example, in the :ref:`docx_example_image` we can use :class:`~dedoc.attachme .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 87 + :lines: 74 Method :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 88 + :lines: 75 Since we have checked if the extractor can extract attachments from the file, -we can extract them it using :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.get_attachments` method: +we can extract them using :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.extract` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 89-90 + :lines: 76-77 As we see, attachment extractors return the same list of :class:`~dedoc.data_structures.AttachedFile`, as in the attribute ``attachments`` of the :class:`~dedoc.data_structures.UnstructuredDocument`, @@ -256,7 +256,7 @@ Let's extract the default structure based on the document styles: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 94-97 + :lines: 81-84 As we see, the ``hierarchy_level`` has been filled. @@ -274,14 +274,14 @@ Let's construct the tree structure of the document: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 101-104 + :lines: 88-91 As we see, parsed document has similar attributes as :class:`~dedoc.data_structures.UnstructuredDocument`. The main difference is in the ``content`` attribute, that contains hierarchical document structure and tables. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 106-108 + :lines: 93-95 To get more information about :class:`~dedoc.data_structures.ParsedDocument`, :class:`~dedoc.data_structures.DocumentContent` and other classes, that form the output format, see :ref:`dedoc_data_structures`. @@ -298,7 +298,7 @@ one may use manager class (see :ref:`dedoc_manager` for more details). ..
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 112-116 + :lines: 99-103 Manager allows to run workflow (see :ref:`dedoc_workflow`) for a file of any format supported by dedoc (see :ref:`table_formats`). One can also make a custom ``config`` and ``manager_config`` (parameters of the manager constructor) for more flexible usage of the library. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 0c4cc32e..b8ab6264 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -208,11 +208,15 @@ For a document of unknown or unsupported domain there is an option to use defaul .. toctree:: :maxdepth: 1 - :caption: Getting started: + :caption: Getting started getting_started/installation getting_started/usage +.. toctree:: + :maxdepth: 1 + + parameters/parameters .. toctree:: :maxdepth: 1 diff --git a/docs/source/modules/attachments_extractors.rst b/docs/source/modules/attachments_extractors.rst index aa6f0579..b5d55e17 100644 --- a/docs/source/modules/attachments_extractors.rst +++ b/docs/source/modules/attachments_extractors.rst @@ -4,6 +4,7 @@ dedoc.attachments_extractors ============================ .. autoclass:: dedoc.attachments_extractors.AbstractAttachmentsExtractor + :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.AbstractOfficeAttachmentsExtractor @@ -12,25 +13,20 @@ dedoc.attachments_extractors .. autoclass:: dedoc.attachments_extractors.DocxAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.ExcelAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.JsonAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.attachments_extractors.PptxAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: .. 
autoclass:: dedoc.attachments_extractors.PDFAttachmentsExtractor :show-inheritance: - :special-members: __init__ :members: diff --git a/docs/source/modules/converters.rst b/docs/source/modules/converters.rst index 5556df22..1a387d73 100644 --- a/docs/source/modules/converters.rst +++ b/docs/source/modules/converters.rst @@ -7,7 +7,7 @@ dedoc.converters :special-members: __init__ :members: -.. autoclass:: dedoc.converters.FileConverterComposition +.. autoclass:: dedoc.converters.ConverterComposition :special-members: __init__ :members: diff --git a/docs/source/modules/metadata_extractors.rst b/docs/source/modules/metadata_extractors.rst index 30424706..0c1f49e7 100644 --- a/docs/source/modules/metadata_extractors.rst +++ b/docs/source/modules/metadata_extractors.rst @@ -8,29 +8,25 @@ dedoc.metadata_extractors :members: .. autoclass:: dedoc.metadata_extractors.AbstractMetadataExtractor + :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.BaseMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.DocxMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.ImageMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.NoteMetadataExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.metadata_extractors.PdfMetadataExtractor :show-inheritance: - :special-members: __init__ :members: diff --git a/docs/source/modules/readers.rst b/docs/source/modules/readers.rst index 2250e5f2..7666f8bf 100644 --- a/docs/source/modules/readers.rst +++ b/docs/source/modules/readers.rst @@ -12,80 +12,64 @@ dedoc.readers .. autoclass:: dedoc.readers.ArchiveReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.CSVReader :show-inheritance: - :special-members: __init__ :members: .. 
autoclass:: dedoc.readers.DocxReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.EmailReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.ExcelReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.HtmlReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.JsonReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.MhtmlReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.NoteReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PptxReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfBaseReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfImageReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfTabbyReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfTxtlayerReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.PdfAutoReader :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.readers.RawTextReader :show-inheritance: - :special-members: __init__ :members: diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst index 441fcfb4..79d80f0f 100644 --- a/docs/source/modules/structure_extractors.rst +++ b/docs/source/modules/structure_extractors.rst @@ -4,6 +4,7 @@ dedoc.structure_extractors ========================== .. autoclass:: dedoc.structure_extractors.AbstractStructureExtractor + :special-members: __init__ :members: .. autoclass:: dedoc.structure_extractors.StructureExtractorComposition @@ -13,14 +14,12 @@ dedoc.structure_extractors .. 
autoclass:: dedoc.structure_extractors.DefaultStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.AbstractLawStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.structure_extractors.ClassifyingLawStructureExtractor @@ -32,28 +31,24 @@ dedoc.structure_extractors .. autoclass:: dedoc.structure_extractors.LawStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.FoivLawStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.DiplomaStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type .. autoclass:: dedoc.structure_extractors.TzStructureExtractor :show-inheritance: - :special-members: __init__ :members: .. autoattribute:: document_type diff --git a/docs/source/parameters/attachments_handling.rst b/docs/source/parameters/attachments_handling.rst new file mode 100644 index 00000000..589a49a2 --- /dev/null +++ b/docs/source/parameters/attachments_handling.rst @@ -0,0 +1,59 @@ +.. _attachments_handling_parameters: + +Attachments handling +==================== + +.. flat-table:: Parameters for attachments handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - with_attachments + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * method :meth:`~dedoc.readers.BaseReader.read` of inheritors of :class:`~dedoc.readers.BaseReader` + * :meth:`dedoc.readers.ReaderComposition.read` + - The option to enable attached files extraction. + If the option is ``False``, all attached files will be ignored. 
+ + * - need_content_analysis + - True, False + - False + - :meth:`dedoc.DedocManager.parse` + - The option to enable file's attachments parsing along with the given file. + The content of the parsed attachments will be represented as :class:`~dedoc.data_structures.ParsedDocument`. + Use ``True`` value to enable this behaviour. + + * - recursion_deep_attachments + - integer value >= 0 + - 10 + - :meth:`dedoc.DedocManager.parse` + - If the attached files of the given file contain some attachments, they can also be extracted. + The level of this recursion can be set via this parameter. + + * - return_base64 + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract` for inheritors of :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` + * :meth:`dedoc.metadata_extractors.MetadataExtractorComposition.extract` + - Attached files can be encoded in base64 and their contents will be saved instead of saving attached file on disk. + The encoded contents will be saved in the attachment's metadata in the ``base64_encode`` field. + Use ``True`` value to enable this behaviour. + + * - attachments_dir + - optional string with a valid path + - None + - * :meth:`dedoc.DedocManager.parse` + * method :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract` of inheritors of :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` + * method :meth:`~dedoc.readers.BaseReader.read` of inheritors of :class:`~dedoc.readers.BaseReader` + * :meth:`dedoc.readers.ReaderComposition.read` + - The path to the directory where document's attached files can be saved. + By default, attachments are saved into the directory where the given file is located. 
diff --git a/docs/source/parameters/other_formats_handling.rst b/docs/source/parameters/other_formats_handling.rst new file mode 100644 index 00000000..85a02ecc --- /dev/null +++ b/docs/source/parameters/other_formats_handling.rst @@ -0,0 +1,43 @@ +.. _other_handling_parameters: + +Other formats handling +====================== + +.. flat-table:: Parameters for other formats handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - delimiter + - any string + - None + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.CSVReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - A column separator for files in CSV and TSV format. + By default "," (comma) is used for CSV and "\\t" (tabulation) for TSV. + + * - encoding + - any string + - None + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.CSVReader.read`, :meth:`dedoc.readers.RawTextReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - The encoding of documents of textual formats like TXT, CSV, TSV. + Look `here <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ to get the list of possible values for the ``encoding`` parameter. + By default the encoding of the document is detected automatically. + + * - handle_invisible_table + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.HtmlReader.read`, :meth:`dedoc.readers.EmailReader.read`, :meth:`dedoc.readers.MhtmlReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - Handle tables without visible borders as tables for HTML documents. + By default tables without visible borders are parsed as usual textual lines. diff --git a/docs/source/parameters/parameters.rst b/docs/source/parameters/parameters.rst new file mode 100644 index 00000000..430e7f43 --- /dev/null +++ b/docs/source/parameters/parameters.rst @@ -0,0 +1,21 @@ +..
_parameters_description: + +Parameters description +====================== + +This page describes the parameters of the main classes of `dedoc` when it is used as a library. +If you want to use `dedoc` as a service, the section :ref:`api_parameters` may be useful. + +The parameters are divided into several groups that can be used during document handling. +These parameters can be passed to specific classes like :class:`dedoc.DedocManager`, :class:`dedoc.readers.PdfImageReader`, etc. + +**Note:** all parameters work for :class:`dedoc.DedocManager`, but for other classes, only some subset of the supported options works. +In the pages below, we list the configurable classes for each supported parameter. + +.. toctree:: + :maxdepth: 1 + + attachments_handling + pdf_handling + other_formats_handling + structure_type diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst new file mode 100644 index 00000000..b3781b2d --- /dev/null +++ b/docs/source/parameters/pdf_handling.rst @@ -0,0 +1,163 @@ +.. _pdf_handling_parameters: + +PDF and images handling +======================= + +.. flat-table:: Parameters for PDF and images handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - pdf_with_text_layer + - true, false, tabby, auto, auto_tabby + - auto_tabby + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.can_read`, :meth:`dedoc.readers.PdfTxtlayerReader.can_read`, :meth:`dedoc.readers.PdfTabbyReader.can_read` + * :meth:`dedoc.readers.PdfAutoReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used for choosing a specific reader of PDF documents for :class:`dedoc.DedocManager` or :class:`dedoc.readers.ReaderComposition`. + For readers, the option is used to check if the reader is able to parse the file.
+ The following options are available: + + * **true** -- parsing PDF files with a textual layer (text is copyable). + This option is used to choose :class:`dedoc.readers.PdfTxtlayerReader` for parsing. + + * **false** -- parsing scanned documents (images, PDF without a textual layer) + even if the document has a textual layer (is copyable). + This option is used to choose :class:`dedoc.readers.PdfImageReader` for parsing. + Note: :class:`dedoc.readers.PdfImageReader` doesn't check the option because it can handle both scanned and copyable documents. + + * **tabby** -- parsing PDF files with a textual layer (text is copyable). + This option is used to choose :class:`dedoc.readers.PdfTabbyReader` for parsing. + + * **auto** -- automatic detection of textual layer presence in the PDF document. + This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing. + If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTxtlayerReader` will be used for parsing. + If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used. + + + * **auto_tabby** -- automatic detection of textual layer presence in the PDF document. + This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing. + If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTabbyReader` will be used for parsing. + If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used. + It is highly recommended to use this option value for any PDF document parsing. + + * - language + - rus, eng, rus+eng + - rus+eng + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - Language of the document without a textual layer.
The following values are available: + + * **rus** -- Russian; + * **eng** -- English; + * **rus+eng** -- both Russian and English. + + * - pages + - :, start:, :end, start:end + - : + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - If you need to read a part of the PDF document, you can use page slice to define the reading range. + If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page`` + (both ``start_page`` and ``end_page`` are included in the range). + + * using **:** means reading all document pages; + * using empty ``end`` -- **start:** (e.g. 5:) means reading the document from ``start`` up to the end of the document; + * using empty ``start`` -- **:end** (e.g. :5) means reading the document from the beginning up to the ``end`` page; + * using **start:end** means reading document pages from ``start`` to ``end`` inclusively. + + If ``start`` > ``end`` or ``start`` > the number of pages in the document, the empty document will be returned. + If ``end`` > the number of pages in the document, the document will be read up to its end. + For example, if ``1:3`` is given, 1, 2 and 3 document pages will be processed. + + * - is_one_column_document + - true, false, auto + - auto + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to set the number of columns if the PDF document is without a textual layer in case it's known beforehand.
+ The following values are available: + + * **true** -- the document is single column; + * **false** -- the document is multi-column (two columns parsing is supported); + * **auto** -- automatic detection of the number of columns in the document. + + If you are not sure about the number of columns in the documents you need to parse, it is recommended to use ``auto``. + + * - document_orientation + - auto, no_change + - auto + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to control document orientation analysis for PDF documents without a textual layer. + The following values are available: + + * **auto** -- automatic detection of rotated document pages (rotation angle 0, 90, 180, 270 degrees) and rotation of document pages; + * **no_change** -- parse document pages as they are without rotated pages detection. + + If you are sure that the documents you need to parse consist of vertical (not rotated) pages, you can use ``no_change``. + + * - need_header_footer_analysis + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to **remove** headers and footers of PDF documents from the output result. + If ``need_header_footer_analysis=False``, header and footer lines will be present in the output as well as all other document lines. + + * - need_binarization + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to clean background (binarize) for pages of PDF documents without a textual layer.
+ If the document's background is heterogeneous, this option may help to improve the result of document text recognition. + By default ``need_binarization=False`` because its usage may decrease the quality of the document page (and the recognised text on it). + + * - need_pdf_table_analysis + - True, False + - True + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to enable table recognition for PDF documents or images. + The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. + If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`, + in this case tables will be parsed much more easily and faster. + + * - orient_analysis_cells + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used for table recognition for PDF documents or images. + It is ignored when ``need_pdf_table_analysis=False``. + When set to ``True``, it enables analysis of rotated cells in table headers. + Use this option if you are sure that the cells of the table header are rotated. + + * - orient_cell_angle + - 90, 270 + - 90 + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used for table recognition for PDF documents or images. + It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``.
+ The option is used to set orientation of cells in table headers: + + * **270** -- cells are rotated 90 degrees clockwise; + * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst new file mode 100644 index 00000000..546ddbfc --- /dev/null +++ b/docs/source/parameters/structure_type.rst @@ -0,0 +1,52 @@ +.. _structure_type_parameters: + +Structure type configuring +========================== + +.. flat-table:: Parameters for structure type configuring + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - document_type + - other, law, tz, diploma + - other + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract` + - Type of the document structure according to specific domain. + If you use default manager config for :class:`~dedoc.DedocManager`, then the following options are available: + + * **other** -- structure for document of any domain (:ref:`other_structure`) + In this case, :class:`~dedoc.structure_extractors.DefaultStructureExtractor` is used. + * **law** -- Russian laws (:ref:`law_structure`) + In this case, :class:`~dedoc.structure_extractors.ClassifyingLawStructureExtractor` is used. + * **tz** -- Russian technical specifications (:ref:`tz_structure`) + In this case, :class:`~dedoc.structure_extractors.TzStructureExtractor` is used. + * **diploma** -- Russian thesis (:ref:`diploma_structure`) + In this case, :class:`~dedoc.structure_extractors.DiplomaStructureExtractor` is used. 
+ + If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition` + + * - structure_type + - tree, linear + - tree + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.structure_constructors.StructureConstructorComposition.construct` + - The type of output document representation. + If you use default manager config for :class:`~dedoc.DedocManager`, then the following options are available: + + * **tree** -- the document is represented as a hierarchical structure where nodes are document lines/paragraphs + and child nodes have a greater hierarchy level than their parents according to the level found by structure extractor. + In this case, :class:`~dedoc.structure_constructors.TreeConstructor` is used to construct structure. + + * **linear** -- the document is represented as a tree where the root is an empty node, + and all document lines are children of the root. + In this case, :class:`~dedoc.structure_constructors.LinearConstructor` is used to construct structure.
+ + If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_constructors.StructureConstructorComposition` diff --git a/docs/source/tutorials/add_new_doc_type.rst b/docs/source/tutorials/add_new_doc_type.rst index fe2f1569..c8a0d1dd 100644 --- a/docs/source/tutorials/add_new_doc_type.rst +++ b/docs/source/tutorials/add_new_doc_type.rst @@ -23,20 +23,24 @@ You should call the constructor of the base class in the constructor of the curr from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter class NewtypeConverter(AbstractConverter): - def __init__(self, config): - super().__init__(config=config) - - def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def __init__(self, config: Optional[dict] = None) -> None: + super().__init__(config=config) + + def can_convert(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: pass # some code here - def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: + def convert(self, file_path: str, parameters: Optional[dict] = None) -> str: pass # some code here 2. Implement converter methods to convert other formats to this format: * :meth:`~dedoc.converters.AbstractConverter.can_convert` method checks if the new converter can process the file, for example, you can return True for the list of some specific file extensions. -* :meth:`~dedoc.converters.AbstractConverter.do_convert` method performs the required file conversion. Don't worry about the file name containing spaces or other unwanted characters because the file has been renamed by the manager. +* :meth:`~dedoc.converters.AbstractConverter.convert` method performs the required file conversion. 3. Add the converter to manager config, see :ref:`adding_handlers_to_manager_config`. 
@@ -52,15 +56,15 @@ General scheme of adding Reader class NewtypeReader(BaseReader): - def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: + def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: pass # some code here - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: pass # some code here 2. You should implement reader methods according to specific file format processing. -* :meth:`~dedoc.readers.BaseReader.can_read` method checks if the given file can be processed. For processing the following information is required: the path to the file, file extension, mime and document type (for example, you can process only articles). It is better to make this method fast because it will be called frequently. +* :meth:`~dedoc.readers.BaseReader.can_read` method checks if the given file can be processed. For processing the following information is required: the path to the file, file extension or mime. It is better to make this method fast because it will be called frequently. * :meth:`~dedoc.readers.BaseReader.read` method must form :class:`~dedoc.data_structures.unstructured_document.UnstructuredDocument` (document lines, tables and attachments). 3. Add the reader to manager config, see :ref:`adding_handlers_to_manager_config`. 
@@ -78,17 +82,21 @@ General scheme of adding AttachmentExtractor from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor class NewtypeAttachmentsExtractor(AbstractAttachmentsExtractor): - def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: + def can_extract(self, + file_path: Optional[str] = None, + extension: Optional[str] = None, + mime: Optional[str] = None, + parameters: Optional[dict] = None) -> bool: pass # some code here - def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: pass # some code here 2. You should implement methods according to the specifics of extracting attachments for this format. * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract()` method checks if the new extractor can process the file, for example, you can return True for the list of some specific file extensions. -* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.get_attachments()` method should return a list of attachments that were extracted from the document: for each attachment :class:`~dedoc.data_structures.attached_file.AttachedFile` is returned, you can see its code in ``dedoc/data_structures/attached_file.py``. +* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract()` method should return a list of attachments that were extracted from the document: for each attachment :class:`~dedoc.data_structures.attached_file.AttachedFile` is returned, you can see its code in ``dedoc/data_structures/attached_file.py``. 3. Add attachments extractor to the reader's code. @@ -99,12 +107,13 @@ General scheme of adding AttachmentExtractor .. 
code-block:: python class NewtypeReader(BaseReader): - def __init__(self) -> None: - self.attachment_extractor = PdfAttachmentsExtractor() + def __init__(self, config: Optional[dict] = None) -> None: + super().__init__(config=config) + self.attachment_extractor = PdfAttachmentsExtractor(config=self.config) - def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: # some code - attachments = self.attachment_extractor.get_attachments(tmpdir, filename, parameters) + attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) # some code Example of adding pdf/djvu handlers @@ -114,9 +123,9 @@ Suppose we want to add the ability to handle pdf/djvu documents with a text laye We don't want to deal with two formats, because we can convert djvu to pdf. The following steps are proposed: -1. Implementing the converter from djvu to pdf DjvuConverter. -2. Implementing of PdfAttachmentsExtractor. -3. Implementing of PdfReader. +1. Implementing the converter from djvu to pdf ``DjvuConverter``. +2. Implementing of ``PdfAttachmentsExtractor``. +3. Implementing of ``PdfReader``. 4. Adding the implemented handlers to the manager config. Let's describe each step in more detail. @@ -132,13 +141,13 @@ Implement class ``DjvuConverter``. You should implement the following methods: * :meth:`~dedoc.converters.AbstractConverter.can_convert`: return True if file extension is `.djvu`. You can see the file ``dedoc/extensions.py`` for more accurate work with extensions. -* :meth:`~dedoc.converters.AbstractConverter.do_convert`: use `ddjvu` utility and run it using ``os.system``. ``._await_for_conversion()`` method ensures that the converted file was saved. 
+* :meth:`~dedoc.converters.AbstractConverter.convert`: use the `ddjvu` utility and run it using the ``._run_subprocess`` method, which ensures that the converted file was saved. You can use the converter in your code: .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 20, 16-17, 22-27 + :lines: 15-19 Implementing of PdfAttachmentsExtractor ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -151,7 +160,7 @@ Implement ``PdfAttachmentsExtractor``. You should implement the following methods: * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract()`: use file extension or mime to check if we could read the given file. You can learn more about extensions and mime using file ``dedoc/extensions.py`` -* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.get_attachments()` : use information about file path and file name to extract attachments from the given file. +* :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract()` : use the file path to extract attachments from the given file. The method returns the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` using :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor._content2attach_file` method. @@ -184,7 +193,7 @@ You can use the reader in your code: .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 21, 29-41 + :lines: 21-30 .. _adding_handlers_to_manager_config: @@ -199,18 +208,18 @@ your custom handlers directly in your code. Example of a manager config with the .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 1-15, 44-55 + :lines: 1-14, 33-43 Then create an object of :class:`~dedoc.DedocManager` and use :meth:`~dedoc.DedocManager.parse` method: .. 
literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 16-17, 57-58 + :lines: 15, 45-46 Result is :class:`~dedoc.data_structures.ParsedDocument`: .. literalinclude:: ../_static/code_examples/dedoc_add_new_doc_type_tutorial.py :language: python - :lines: 60-61 + :lines: 48-49 Adding support for a new document type is completed. diff --git a/examples/create_structured_document.py b/examples/create_structured_document.py index 434563c5..dced3ae5 100644 --- a/examples/create_structured_document.py +++ b/examples/create_structured_document.py @@ -5,6 +5,6 @@ # to create structured document you can use TreeConstructor and apply it to unstructured document # in this example we'll use unstructured_document from create_unstructured_document.py structure_constructor = TreeConstructor() -parsed_document = structure_constructor.structure_document(document=unstructured_document, structure_type="tree") +parsed_document = structure_constructor.construct(document=unstructured_document) print(parsed_document.to_api_schema().model_dump()) diff --git a/examples/create_unstructured_document.py b/examples/create_unstructured_document.py index cf724e1a..f9a026c9 100644 --- a/examples/create_unstructured_document.py +++ b/examples/create_unstructured_document.py @@ -58,5 +58,5 @@ # HierarchyLevel(1, 1) for 1. # HierarchyLevel(1, 2) for 1.1. # HierarchyLevel(1, 4) for 1.2.1.1. 
and so on -metadata = BaseMetadataExtractor().extract_metadata(directory="./", filename="example.docx", converted_filename="example.doc", original_filename="example.docx") +metadata = BaseMetadataExtractor().extract(file_path="example.docx", converted_filename="example.doc", original_filename="example.docx") unstructured_document.metadata = metadata diff --git a/examples/example_doc_parser.py b/examples/example_doc_parser.py index bb1cc9f8..bf9f059d 100644 --- a/examples/example_doc_parser.py +++ b/examples/example_doc_parser.py @@ -8,7 +8,7 @@ file_name = "example.docx" # we get unstructured file with lines and tables -unstructured_document = docx_reader.read(path=file_name, document_type="example") +unstructured_document = docx_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) diff --git a/examples/example_img_parser.py b/examples/example_img_parser.py index 3f136cf1..009e2708 100644 --- a/examples/example_img_parser.py +++ b/examples/example_img_parser.py @@ -9,7 +9,7 @@ file_name = "example.jpg" # we get unstructured file with lines and tables -unstructured_document = img_reader.read(path=file_name, document_type="example") +unstructured_document = img_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) diff --git a/examples/example_pdf_parser.py b/examples/example_pdf_parser.py index 36568546..7fe44a99 100644 --- a/examples/example_pdf_parser.py +++ b/examples/example_pdf_parser.py @@ -9,7 +9,7 @@ file_name = "example_with_text_layer.pdf" # we get unstructured file with lines and tables -unstructured_document = pdf_txt_layer_reader.read(path=file_name, document_type="example") +unstructured_document = pdf_txt_layer_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it 
consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) @@ -38,7 +38,7 @@ file_name = "example_without_text_layer.pdf" # we get unstructured file with lines and tables -unstructured_document = pdf_image_reader.read(path=file_name, document_type="example") +unstructured_document = pdf_image_reader.read(file_path=file_name) # let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) diff --git a/tests/unit_tests/abstract_converter_test.py b/tests/unit_tests/abstract_converter_test.py index cd8e16fb..a05daa6e 100644 --- a/tests/unit_tests/abstract_converter_test.py +++ b/tests/unit_tests/abstract_converter_test.py @@ -29,6 +29,5 @@ def _convert(self, filename: str, extension: str, converter: AbstractConverter) tmp_file = os.path.join(self.tmp_dir.name, filename_with_extension) self.assertTrue(os.path.isfile(file), f"no such file {file}") shutil.copy(file, tmp_file) - result = converter.do_convert(tmp_dir=self.tmp_dir.name, filename=filename, extension=extension) - path = os.path.join(self.tmp_dir.name, result) - self.assertTrue(os.path.isfile(path), f"no such file {path}") + result = converter.convert(file_path=tmp_file) + self.assertTrue(os.path.isfile(result), f"no such file {result}") diff --git a/tests/unit_tests/test_doctype_law_dynamic_classifier.py b/tests/unit_tests/test_doctype_law_dynamic_classifier.py index 2ce8bdc9..8fd89960 100644 --- a/tests/unit_tests/test_doctype_law_dynamic_classifier.py +++ b/tests/unit_tests/test_doctype_law_dynamic_classifier.py @@ -21,7 +21,7 @@ def _get_abs_path(self, file_name: str) -> str: def _test_document_type(self, file_name: str, expected_type: str) -> None: config = {} base_reader = RawTextReader(config=config) - unstructured_document = base_reader.read(path=self._get_abs_path(file_name), document_type=None, parameters=None) + unstructured_document = 
base_reader.read(file_path=self._get_abs_path(file_name), parameters=None) result = self.structure_extractor._predict_extractor(unstructured_document.lines) self.assertEqual(result.document_type, expected_type) diff --git a/tests/unit_tests/test_doctype_law_txt_reader.py b/tests/unit_tests/test_doctype_law_txt_reader.py index 62d3e739..9a802723 100644 --- a/tests/unit_tests/test_doctype_law_txt_reader.py +++ b/tests/unit_tests/test_doctype_law_txt_reader.py @@ -18,10 +18,9 @@ def _get_abs_path(self, file_name: str) -> str: def test_law_document_spaces_correctness(self) -> None: path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt") - directory, filename = os.path.split(path) - document = self.txt_reader.read(path=path, document_type="law", parameters={}) - document.metadata = self.metadata_extractor.extract_metadata(directory, filename, filename, filename) - document = self.law_extractor.extract_structure(document, {}) + document = self.txt_reader.read(file_path=path) + document.metadata = self.metadata_extractor.extract(path) + document = self.law_extractor.extract(document) self.assertListEqual([], document.attachments) self.assertListEqual([], document.tables) diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py index eb6af291..5a21ae32 100644 --- a/tests/unit_tests/test_format_pdf_reader.py +++ b/tests/unit_tests/test_format_pdf_reader.py @@ -60,9 +60,7 @@ def test_header_footer_search(self) -> None: filename = "prospectus.pdf" path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename) shutil.copy(path, os.path.join(tmpdir, filename)) - result = any_doc_reader.read(os.path.join(tmpdir, filename), - document_type=None, - parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(os.path.join(tmpdir, filename), parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) lines_by_page = 
self._split_lines_on_pages(result.lines) @@ -79,9 +77,7 @@ def test_header_footer_search_2(self) -> None: filename = "with_changed_header_footer.pdf" path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename) shutil.copy(path, os.path.join(tmpdir, filename)) - result = any_doc_reader.read(os.path.join(tmpdir, filename), - document_type=None, - parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(os.path.join(tmpdir, filename), parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) lines_by_page = self._split_lines_on_pages(result.lines) @@ -98,9 +94,7 @@ def test_header_footer_search_3(self) -> None: filename = "with_header_footer_2.pdf" path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename) shutil.copy(path, os.path.join(tmpdir, filename)) - result = any_doc_reader.read(os.path.join(tmpdir, filename), - document_type=None, - parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(os.path.join(tmpdir, filename), parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"}) lines_by_page = self._split_lines_on_pages(result.lines) @@ -114,7 +108,7 @@ def test_long_list_in_pdf(self) -> None: config = get_test_config() any_doc_reader = PdfImageReader(config=config) path = os.path.join(os.path.dirname(__file__), "../data/scanned/doc_with_long_list.pdf") - result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "False"}) + result = any_doc_reader.read(path, parameters={"need_pdf_table_analysis": "False"}) list_elements = result.lines[1:] self.assertEqual(list_elements[0].line.lower().strip(), "1. январь") self.assertEqual(list_elements[1].line.lower().strip(), "2. 
февраль") @@ -134,7 +128,7 @@ def test_pdf_text_layer(self) -> None: config = get_test_config() any_doc_reader = PdfTxtlayerReader(config=config) path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf") - result = any_doc_reader.read(path, document_type=None, parameters={}) + result = any_doc_reader.read(path, parameters={}) for line in result.lines: # check that annotations not duplicated annotations = line.annotations @@ -145,5 +139,5 @@ def test_table_extractor(self) -> None: config = {} # Has to work without config any_doc_reader = PdfTxtlayerReader(config=config) path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf") - result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "True"}) + result = any_doc_reader.read(path, parameters={"need_pdf_table_analysis": "True"}) self.assertEqual(len(result.tables), 1) diff --git a/tests/unit_tests/test_format_txt_reader.py b/tests/unit_tests/test_format_txt_reader.py index e7249651..17e7482d 100644 --- a/tests/unit_tests/test_format_txt_reader.py +++ b/tests/unit_tests/test_format_txt_reader.py @@ -15,7 +15,7 @@ def test_read_law(self) -> None: file = os.path.join(self.path, "laws", "коап_москвы_8_7_2015_utf.txt") uids_set = set() prefix = "txt_6210f1fb59150aae33a09f49c8724baf" # это строка, содержащая хэш файла, который обратаывается ридером - document = self.reader.read(file, None, {}) + document = self.reader.read(file, {}) for line in document.lines: self.assertNotIn(line.uid, uids_set) uids_set.add(line.uid) @@ -25,7 +25,7 @@ def test_read_tz(self) -> None: file = os.path.join(self.path, "tz", "tz.txt") uids_set = set() prefix = "txt_0e576a9e0008225ac27f961af60c0bee" - document = self.reader.read(file, None, {}) + document = self.reader.read(file, {}) for line in document.lines: self.assertNotIn(line.uid, uids_set) uids_set.add(line.uid) diff --git a/tests/unit_tests/test_misc_tasker.py 
b/tests/unit_tests/test_misc_tasker.py index 8724f046..8fd3c66c 100644 --- a/tests/unit_tests/test_misc_tasker.py +++ b/tests/unit_tests/test_misc_tasker.py @@ -8,7 +8,7 @@ from PIL import Image from dedoc.attachments_handler.attachments_handler import AttachmentsHandler -from dedoc.converters.file_converter import FileConverterComposition +from dedoc.converters.converter_composition import ConverterComposition from dedoc.dedoc_manager import DedocManager from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition @@ -157,7 +157,7 @@ def __create_test_manager_config(self, config: dict) -> dict: } return dict( - converter=FileConverterComposition(converters=[]), + converter=ConverterComposition(converters=[]), reader=ReaderComposition(readers=readers), structure_extractor=StructureExtractorComposition(extractors=structure_extractors, default_key="other"), structure_constructor=StructureConstructorComposition(default_constructor=TreeConstructor(), constructors={"tree": TreeConstructor()}), diff --git a/tests/unit_tests/test_misc_toc_feature_extractor.py b/tests/unit_tests/test_misc_toc_feature_extractor.py index 7389c15d..bd45393b 100644 --- a/tests/unit_tests/test_misc_toc_feature_extractor.py +++ b/tests/unit_tests/test_misc_toc_feature_extractor.py @@ -19,7 +19,7 @@ class TestTOCFeatureExtractor(unittest.TestCase): @property def document(self) -> UnstructuredDocument: if self._document is None: - self._document = self.reader.read(path=self.path, parameters={}, document_type=None) + self._document = self.reader.read(file_path=self.path, parameters={}) return self._document def test_toc_extractor(self) -> None: diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index eb79ebce..f9506ce0 100644 --- 
a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -41,7 +41,7 @@ def test_docx_attachments_extractor(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: shutil.copy(os.path.join(self.src_dir, filename), os.path.join(tmpdir, filename)) - attachments = docx_attachment_extractor.get_attachments(tmpdir, filename, {}) + attachments = docx_attachment_extractor.extract(file_path=os.path.join(tmpdir, filename)) for _, file in enumerate(attachments): self.assertIn(file.original_name, attachments_name_list) @@ -72,7 +72,7 @@ def test_pptx_attachments_extractor(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: shutil.copy(os.path.join(self.src_dir, filename), os.path.join(tmpdir, filename)) - attachments = pptx_attachment_extractor.get_attachments(tmpdir, filename, {}) + attachments = pptx_attachment_extractor.extract(file_path=os.path.join(tmpdir, filename)) for _, file in enumerate(attachments): self.assertIn(file.original_name, attachments_name_list) @@ -86,7 +86,8 @@ def test_docx_diagrams_extraction(self) -> None: files = [("diagram_1.docx", 1), ("diagram_2.docx", 5)] with tempfile.TemporaryDirectory() as tmp_dir: for file, num_attachments in files: - attachments = docx_attachment_extractor.get_attachments(tmp_dir, os.path.join(docx_dir, file), {}) + shutil.copy(os.path.join(docx_dir, file), os.path.join(tmp_dir, file)) + attachments = docx_attachment_extractor.extract(file_path=os.path.join(tmp_dir, file)) self.assertEqual(num_attachments, len(attachments)) def test_archive_with_slash(self) -> None: @@ -106,7 +107,7 @@ def __get_list_of_files_in_archive(self, file_name: str) -> List[str]: file_path = os.path.join(tmp_dir, file_name) shutil.copyfile(os.path.join(self.src_dir, file_name), file_path) config = get_test_config() - document = ArchiveReader(config=config).read(path=file_path, parameters={"with_attachments": True}) + document = 
ArchiveReader(config=config).read(file_path=file_path, parameters={"with_attachments": True}) files = [file.original_name for file in document.attachments] return files @@ -134,7 +135,7 @@ def test_reader_attachments_dir(self) -> None: for file_name, reader in file_name_reader_list: with tempfile.TemporaryDirectory() as tmpdir: - result = reader.read(path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir)) + result = reader.read(file_path=os.path.join(self.src_dir, file_name), parameters=dict(with_attachments=True, attachments_dir=tmpdir)) attachment_names = os.listdir(tmpdir) for attachment in result.attachments: @@ -148,7 +149,7 @@ def test_attachments_extractor_attachments_dir(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: params = {"with_attachments": True, "attachments_dir": tmpdir} - result = docx_attachment_extractor.get_attachments(tmpdir=self.src_dir, filename=file_name, parameters=params) + result = docx_attachment_extractor.extract(file_path=os.path.join(self.src_dir, file_name), parameters=params) attachment_names = os.listdir(tmpdir) for attachment in result: