Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
change add_metadata to extract_metadata in metadata readers
Browse files Browse the repository at this point in the history
Nikita Shevtsov committed Oct 19, 2023
1 parent e7c1067 commit 4137827
Showing 6 changed files with 52 additions and 67 deletions.
13 changes: 5 additions & 8 deletions dedoc/metadata_extractors/abstract_metadata_extractor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Optional
from typing import Optional, Dict

from dedoc.data_structures.unstructured_document import UnstructuredDocument

@@ -10,7 +10,6 @@ class AbstractMetadataExtractor(ABC):
"""
@abstractmethod
def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
@@ -19,23 +18,21 @@ def can_extract(self,
other_fields: Optional[dict] = None) -> bool:
"""
Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""
pass

@abstractmethod
def add_metadata(self,
document: UnstructuredDocument,
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
other_fields: Optional[dict] = None) -> Dict[str]:
"""
Add metadata to the document if possible, i.e. method :meth:`can_extract` returned True.
Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True.
:type document: document content that has been received from some of the readers
:type directory: path to the directory where the original and converted files are located
:type filename: name of the file after renaming (for example 23141.doc). \
The file gets a new name during processing by the dedoc manager (if used)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from base64 import b64encode
from typing import Optional
from typing import Optional, Dict

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor
@@ -22,7 +22,6 @@ class BaseMetadataExtractor(AbstractMetadataExtractor):
"""

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
@@ -35,17 +34,16 @@ def can_extract(self,
"""
return True

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> Dict[str]:
"""
Gets the basic meta-information about the file.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""
parameters = {} if parameters is None else parameters
meta_info = self._get_base_meta_information(directory, filename, original_filename)
@@ -59,8 +57,7 @@ def add_metadata(self,

if other_fields is not None and len(other_fields) > 0:
meta_info["other_fields"] = other_fields
document.metadata = meta_info
return document
return meta_info

@staticmethod
def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict:
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from datetime import datetime
from typing import Optional
from typing import Optional, Dict

import docx
from docx.opc.exceptions import PackageNotFoundError
@@ -24,7 +24,6 @@ class DocxMetadataExtractor(BaseMetadataExtractor):
- created, modified and last printed date.
"""
def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
@@ -37,27 +36,26 @@ def can_extract(self,
"""
return converted_filename.lower().endswith("docx")

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> Dict[str]:
"""
Add the predefined list of metadata for the docx documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""
parameters = {} if parameters is None else parameters

result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)

file_path = os.path.join(directory, converted_filename)
docx_other_fields = self._get_docx_fields(file_path)

result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **docx_other_fields}
result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields}
return result

def __convert_date(self, date: Optional[datetime]) -> Optional[int]:
Original file line number Diff line number Diff line change
@@ -54,7 +54,6 @@ def __init__(self, *, config: dict) -> None:
}

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
@@ -67,8 +66,7 @@ def can_extract(self,
"""
return filename.lower().endswith((".png", ".jpg", ".jpeg"))

def add_metadata(self,
document: UnstructuredDocument,
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
@@ -77,15 +75,15 @@ def add_metadata(self,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
"""
Add the predefined list of metadata for images.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""
result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)

path = os.path.join(directory, filename)
exif_fields = self._get_exif(path)
if len(exif_fields) > 0:
result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **exif_fields}
result["other_fields"] = {**result.get("other_fields", {}), **exif_fields}
return result

def __encode_exif(self, exif: Union[str, bytes]) -> Optional[str]:
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import pickle
from typing import Optional
from typing import Optional, Dict

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.unstructured_document import UnstructuredDocument
@@ -18,7 +18,6 @@ def __init__(self) -> None:
super().__init__()

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
@@ -31,17 +30,16 @@ def can_extract(self,
"""
return filename.lower().endswith(".note.pickle")

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> Dict[str]:
"""
Add the predefined list of metadata for the .note.pickle documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""

try:
@@ -59,7 +57,6 @@ def add_metadata(self,
created_time=note_dict["created_time"],
modified_time=note_dict["modified_time"],
other_fields=other_fields)
document.metadata = meta_info
return document
return meta_info
except Exception:
raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken")
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import os
from typing import Optional
from typing import Optional, Dict

from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
@@ -47,7 +47,6 @@ def __init__(self, *, config: dict) -> None:
self.logger = config.get("logger", logging.getLogger())

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
@@ -60,24 +59,23 @@ def can_extract(self,
"""
return filename.lower().endswith(".pdf")

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> Dict[str]:
"""
Add the predefined list of metadata for the pdf documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""
result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename,
result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
path = os.path.join(directory, filename)
pdf_fields = self._get_pdf_info(path)
if len(pdf_fields) > 0:
result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **pdf_fields}
result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields}
return result

def _get_pdf_info(self, path: str) -> dict:

0 comments on commit 4137827

Please sign in to comment.