diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 6346d155..90f8b5f4 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Dict from dedoc.data_structures.unstructured_document import UnstructuredDocument @@ -10,7 +10,6 @@ class AbstractMetadataExtractor(ABC): """ @abstractmethod def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -19,23 +18,21 @@ def can_extract(self, other_fields: Optional[dict] = None) -> bool: """ Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ pass @abstractmethod - def add_metadata(self, - document: UnstructuredDocument, + def extract_metadata(self, directory: str, filename: str, converted_filename: str, original_filename: str, parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + other_fields: Optional[dict] = None) -> dict: """ - Add metadata to the document if possible, i.e. method :meth:`can_extract` returned True. + Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True. - :type document: document content that has been received from some of the readers :type directory: path to the directory where the original and converted files are located :type filename: name of the file after renaming (for example 23141.doc). 
\ The file gets a new name during processing by the dedoc manager (if used) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index f545ee1e..6f4c180c 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -1,6 +1,6 @@ import os from base64 import b64encode -from typing import Optional +from typing import Optional, Dict from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor @@ -22,7 +22,6 @@ class BaseMetadataExtractor(AbstractMetadataExtractor): """ def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -35,17 +34,16 @@ def can_extract(self, """ return True - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Gets the basic meta-information about the file. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. 
""" parameters = {} if parameters is None else parameters meta_info = self._get_base_meta_information(directory, filename, original_filename) @@ -59,8 +57,7 @@ def add_metadata(self, if other_fields is not None and len(other_fields) > 0: meta_info["other_fields"] = other_fields - document.metadata = meta_info - return document + return meta_info @staticmethod def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 377cba55..57401b3c 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -1,6 +1,6 @@ import os from datetime import datetime -from typing import Optional +from typing import Optional, Dict import docx from docx.opc.exceptions import PackageNotFoundError @@ -24,7 +24,6 @@ class DocxMetadataExtractor(BaseMetadataExtractor): - created, modified and last printed date. """ def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -37,27 +36,26 @@ def can_extract(self, """ return converted_filename.lower().endswith("docx") - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the docx documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. 
+ Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ parameters = {} if parameters is None else parameters - result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) file_path = os.path.join(directory, converted_filename) docx_other_fields = self._get_docx_fields(file_path) - result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **docx_other_fields} + result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields} return result def __convert_date(self, date: Optional[datetime]) -> Optional[int]: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index ac573d02..4ab08aed 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -54,7 +54,6 @@ def __init__(self, *, config: dict) -> None: } def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -67,8 +66,7 @@ def can_extract(self, """ return filename.lower().endswith((".png", ".jpg", ".jpeg")) - def add_metadata(self, - document: UnstructuredDocument, + def extract_metadata(self, directory: str, filename: str, converted_filename: str, @@ -77,15 +75,15 @@ def add_metadata(self, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for images. 
- Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ - result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) path = os.path.join(directory, filename) exif_fields = self._get_exif(path) if len(exif_fields) > 0: - result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **exif_fields} + result["other_fields"] = {**result.get("other_fields", {}), **exif_fields} return result def __encode_exif(self, exif: Union[str, bytes]) -> Optional[str]: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index 2708e5e6..7001d6c5 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -1,6 +1,6 @@ import os import pickle -from typing import Optional +from typing import Optional, Dict from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.unstructured_document import UnstructuredDocument @@ -18,7 +18,6 @@ def __init__(self) -> None: super().__init__() def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -31,17 +30,16 @@ def can_extract(self, """ return filename.lower().endswith(".note.pickle") - def 
add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the .note.pickle documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ try: @@ -59,7 +57,6 @@ def add_metadata(self, created_time=note_dict["created_time"], modified_time=note_dict["modified_time"], other_fields=other_fields) - document.metadata = meta_info - return document + return meta_info except Exception: raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. 
Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index 96682fc0..a6ae9b72 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -1,6 +1,6 @@ import logging import os -from typing import Optional +from typing import Optional, Dict from PyPDF2 import PdfFileReader from PyPDF2.utils import PdfReadError @@ -47,7 +47,6 @@ def __init__(self, *, config: dict) -> None: self.logger = config.get("logger", logging.getLogger()) def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -60,24 +59,23 @@ def can_extract(self, """ return filename.lower().endswith(".pdf") - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the pdf documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. 
""" - result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, + result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, other_fields=other_fields) path = os.path.join(directory, filename) pdf_fields = self._get_pdf_info(path) if len(pdf_fields) > 0: - result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **pdf_fields} + result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields} return result def _get_pdf_info(self, path: str) -> dict: