Skip to content

Commit

Permalink
fix usage of extract_metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikita Shevtsov committed Oct 19, 2023
1 parent 4137827 commit 020f5ff
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 37 deletions.
5 changes: 2 additions & 3 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,10 @@ def _handle_attachments(self, document: UnstructuredDocument, parameters: dict)
attachment.tmp_file_path = new_path

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,
filename=attachment_name, converted_filename=attachment_name,
original_filename=attachment.get_original_filename(),
parameters=parameters)
metadata = DocumentMetadata(**unstructured_document.metadata)
metadata = DocumentMetadata(**metadata)
return ParsedDocument(content=get_empty_content(), metadata=metadata)
14 changes: 7 additions & 7 deletions dedoc/dedoc_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,13 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
self.logger.info(f"Finish parse file {file_name}")

# Step 3 - Adding meta-information
unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document,
directory=tmp_dir,
filename=unique_filename,
converted_filename=converted_filename,
original_filename=file_name,
parameters=parameters,
other_fields=unstructured_document.metadata)
metadata = self.document_metadata_extractor.extract_metadata(directory=tmp_dir,
filename=unique_filename,
converted_filename=converted_filename,
original_filename=file_name,
parameters=parameters,
other_fields=unstructured_document.metadata)
unstructured_document.metadata = metadata
self.logger.info(f"Add metadata of file {file_name}")

# Step 4 - Extract structure
Expand Down
38 changes: 17 additions & 21 deletions dedoc/metadata_extractors/metadata_extractor_composition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import List, Optional
from typing import Dict, List, Optional

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor


Expand All @@ -13,36 +12,33 @@ class MetadataExtractorComposition:
"""
def __init__(self, extractors: List[AbstractMetadataExtractor]) -> None:
"""
:param extractors: the list of extractors with methods can_extract() and add_metadata() to extract metadata from file
:param extractors: the list of extractors, each providing the methods can_extract() and extract_metadata(), used to extract metadata from a file
"""
self.extractors = extractors

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> Dict[str]:
"""
Extract metadata using the first extractor that can handle the file; an exception is raised if no suitable extractor is found.
Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` of the class
See the documentation of the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` of the class
:class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` for information about the method's parameters.
"""
for extractor in self.extractors:
if extractor.can_extract(document=document,
directory=directory,
if extractor.can_extract(directory=directory,
filename=filename,
converted_filename=converted_filename,
original_filename=original_filename,
parameters=parameters,
other_fields=other_fields):
return extractor.add_metadata(document=document,
directory=directory,
filename=filename,
converted_filename=converted_filename,
original_filename=original_filename,
parameters=parameters,
other_fields=other_fields)
return extractor.extract_metadata(directory=directory,
filename=filename,
converted_filename=converted_filename,
original_filename=original_filename,
parameters=parameters,
other_fields=other_fields)
raise Exception(f"Can't extract metadata from from file {filename}")
10 changes: 5 additions & 5 deletions examples/create_unstructured_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@
# HierarchyLevel(1, 1) for 1.
# HierarchyLevel(1, 2) for 1.1.
# HierarchyLevel(1, 4) for 1.2.1.1. and so on
unstructured_document = BaseMetadataExtractor().add_metadata(document=unstructured_document,
directory="./",
filename="example.docx",
converted_filename="example.doc",
original_filename="example.docx")
metadata = BaseMetadataExtractor().extract_metadata(directory="./",
filename="example.docx",
converted_filename="example.doc",
original_filename="example.docx")
unstructured_document.metadata = metadata
2 changes: 1 addition & 1 deletion tests/unit_tests/test_doctype_law_txt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_law_document_spaces_correctness(self) -> None:
path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt")
directory, filename = os.path.split(path)
document = self.txt_reader.read(path=path, document_type="law", parameters={})
document = self.metadata_extractor.add_metadata(document, directory, filename, filename, filename)
document.metadata = self.metadata_extractor.extract_metadata(directory, filename, filename, filename)
document = self.law_extractor.extract_structure(document, {})

self.assertListEqual([], document.attachments)
Expand Down

0 comments on commit 020f5ff

Please sign in to comment.