From 020f5ff580a6c26e6068acc8e1ffa05c8cd80d44 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Thu, 19 Oct 2023 14:09:39 +0300
Subject: [PATCH] fix usage of extract_metadata

---
Notes (placed below the "---" marker, so not part of the commit message):

* Call sites move from add_metadata(document=...) to extract_metadata(...) and
  store the returned metadata themselves (unstructured_document.metadata = ...
  or DocumentMetadata(**metadata)).
* MetadataExtractorComposition.extract_metadata is annotated
  "-> Dict[str, Any]". typing.Dict takes a key type and a value type; the
  one-argument form "Dict[str]" raises TypeError when the def statement is
  executed, i.e. when the module is imported. "Any" is added to the typing
  import for this.
* Left as unchanged context, candidates for a follow-up: the docstring of
  extract_metadata still opens with "Add metadata to the document", and the
  exception text reads "from from file".
* The postimage blob id on the metadata_extractor_composition.py "index" line
  was recorded before the annotation change described above.

 .../attachments_handler.py                    |  5 +--
 dedoc/dedoc_manager.py                        | 14 +++----
 .../metadata_extractor_composition.py         | 38 +++++++++----------
 examples/create_unstructured_document.py      | 10 ++---
 .../unit_tests/test_doctype_law_txt_reader.py |  2 +-
 5 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py
index 37f4f98c..ecd40de1 100644
--- a/dedoc/attachments_handler/attachments_handler.py
+++ b/dedoc/attachments_handler/attachments_handler.py
@@ -95,11 +95,10 @@ def _handle_attachments(self, document: UnstructuredDocument, parameters: dict)
                 attachment.tmp_file_path = new_path
 
     def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument:  # noqa
-        unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
         attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
-        unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
+        metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,
                                                                                          filename=attachment_name, converted_filename=attachment_name,
                                                                                          original_filename=attachment.get_original_filename(),
                                                                                          parameters=parameters)
-        metadata = DocumentMetadata(**unstructured_document.metadata)
+        metadata = DocumentMetadata(**metadata)
         return ParsedDocument(content=get_empty_content(), metadata=metadata)
diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py
index ee308f1a..1ee0b947 100644
--- a/dedoc/dedoc_manager.py
+++ b/dedoc/dedoc_manager.py
@@ -103,13 +103,13 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
         self.logger.info(f"Finish parse file {file_name}")
 
         # Step 3 - Adding meta-information
-        unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document,
-                                                                              directory=tmp_dir,
-                                                                              filename=unique_filename,
-                                                                              converted_filename=converted_filename,
-                                                                              original_filename=file_name,
-                                                                              parameters=parameters,
-                                                                              other_fields=unstructured_document.metadata)
+        metadata = self.document_metadata_extractor.extract_metadata(directory=tmp_dir,
+                                                                     filename=unique_filename,
+                                                                     converted_filename=converted_filename,
+                                                                     original_filename=file_name,
+                                                                     parameters=parameters,
+                                                                     other_fields=unstructured_document.metadata)
+        unstructured_document.metadata = metadata
         self.logger.info(f"Add metadata of file {file_name}")
 
         # Step 4 - Extract structure
diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py
index 68e308c9..296c3f60 100644
--- a/dedoc/metadata_extractors/metadata_extractor_composition.py
+++ b/dedoc/metadata_extractors/metadata_extractor_composition.py
@@ -1,6 +1,5 @@
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
-from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor
 
 
@@ -13,36 +12,33 @@ class MetadataExtractorComposition:
     """
     def __init__(self, extractors: List[AbstractMetadataExtractor]) -> None:
         """
-        :param extractors: the list of extractors with methods can_extract() and add_metadata() to extract metadata from file
+        :param extractors: the list of extractors with methods can_extract() and extract_metadata() to extract metadata from file
         """
         self.extractors = extractors
 
-    def add_metadata(self,
-                     document: UnstructuredDocument,
-                     directory: str,
-                     filename: str,
-                     converted_filename: str,
-                     original_filename: str,
-                     parameters: Optional[dict] = None,
-                     other_fields: Optional[dict] = None) -> UnstructuredDocument:
+    def extract_metadata(self,
+                         directory: str,
+                         filename: str,
+                         converted_filename: str,
+                         original_filename: str,
+                         parameters: Optional[dict] = None,
+                         other_fields: Optional[dict] = None) -> Dict[str, Any]:
         """
         Add metadata to the document using one of the extractors if suitable extractor was found.
-        Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` of the class
+        Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` of the class
         :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` documentation to get the information about method's parameters.
         """
         for extractor in self.extractors:
-            if extractor.can_extract(document=document,
-                                     directory=directory,
+            if extractor.can_extract(directory=directory,
                                      filename=filename,
                                      converted_filename=converted_filename,
                                      original_filename=original_filename,
                                      parameters=parameters,
                                      other_fields=other_fields):
-                return extractor.add_metadata(document=document,
-                                              directory=directory,
-                                              filename=filename,
-                                              converted_filename=converted_filename,
-                                              original_filename=original_filename,
-                                              parameters=parameters,
-                                              other_fields=other_fields)
+                return extractor.extract_metadata(directory=directory,
+                                                  filename=filename,
+                                                  converted_filename=converted_filename,
+                                                  original_filename=original_filename,
+                                                  parameters=parameters,
+                                                  other_fields=other_fields)
         raise Exception(f"Can't extract metadata from from file {filename}")
diff --git a/examples/create_unstructured_document.py b/examples/create_unstructured_document.py
index 0da38dc9..59e4d02a 100644
--- a/examples/create_unstructured_document.py
+++ b/examples/create_unstructured_document.py
@@ -58,8 +58,8 @@
 # HierarchyLevel(1, 1) for 1.
 # HierarchyLevel(1, 2) for 1.1.
 # HierarchyLevel(1, 4) for 1.2.1.1. and so on
-unstructured_document = BaseMetadataExtractor().add_metadata(document=unstructured_document,
-                                                             directory="./",
-                                                             filename="example.docx",
-                                                             converted_filename="example.doc",
-                                                             original_filename="example.docx")
+metadata = BaseMetadataExtractor().extract_metadata(directory="./",
+                                                    filename="example.docx",
+                                                    converted_filename="example.doc",
+                                                    original_filename="example.docx")
+unstructured_document.metadata = metadata
diff --git a/tests/unit_tests/test_doctype_law_txt_reader.py b/tests/unit_tests/test_doctype_law_txt_reader.py
index 391075b5..62d3e739 100644
--- a/tests/unit_tests/test_doctype_law_txt_reader.py
+++ b/tests/unit_tests/test_doctype_law_txt_reader.py
@@ -20,7 +20,7 @@ def test_law_document_spaces_correctness(self) -> None:
         path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt")
         directory, filename = os.path.split(path)
         document = self.txt_reader.read(path=path, document_type="law", parameters={})
-        document = self.metadata_extractor.add_metadata(document, directory, filename, filename, filename)
+        document.metadata = self.metadata_extractor.extract_metadata(directory, filename, filename, filename)
         document = self.law_extractor.extract_structure(document, {})
 
         self.assertListEqual([], document.attachments)