diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
index 591f8cc3..5cf46cc3 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
@@ -119,6 +119,17 @@ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMe
             self.logger.info(f"Got automatic TOC from {os.path.basename(file_path)}")
             return toc
 
+        lines = self.__read_one_column_lines(file_path)
+        return self.toc_extractor.get_toc(lines)
+
+    def __read_one_column_lines(self, file_path: str) -> List[LineWithMeta]:
+        """
+        As a rule, the TOC is one-column even in two-column documents, so we handle TOC line extraction separately:
+        1. save the first 10 pages of the document to a temporary directory;
+        2. read lines from these pages in one-column mode, without headers and footers.
+
+        Later these lines are analysed to extract TOC lines.
+        """
         pdf_reader = PdfFileReader(file_path)
         writer = PdfFileWriter()
@@ -130,14 +141,12 @@ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMe
             writer.write(write_file)
 
         lines = self.pdf_reader.read(file_path=tmp_path, parameters={"is_one_column_document": "True", "need_header_footer_analysis": "True"}).lines
-        return self.toc_extractor.get_toc(lines)
+        return lines
 
     def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]:
         result = []
         with os.popen(f'pdftocio -p "{path}"') as out:
             toc = out.readlines()
-        if len(toc) == 0:
-            return result
 
         for line in toc:
             match = self.toc_item_regexp.match(line.strip())
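Note: to make the docstring's two steps concrete, here is a minimal self-contained sketch of the page-truncation step, assuming the legacy PyPDF2 PdfFileReader/PdfFileWriter API that the patch already uses (the function name and temporary-file handling are illustrative, not the extractor's actual code):

import os
import tempfile

from PyPDF2 import PdfFileReader, PdfFileWriter

def save_first_pages(file_path: str, page_count: int = 10) -> str:
    """Copy at most the first `page_count` pages of a PDF into a temporary file."""
    pdf_reader = PdfFileReader(file_path)
    writer = PdfFileWriter()
    for page_id in range(min(page_count, pdf_reader.getNumPages())):
        writer.addPage(pdf_reader.getPage(page_id))
    tmp_path = os.path.join(tempfile.mkdtemp(), os.path.basename(file_path))
    with open(tmp_path, "wb") as write_file:
        writer.write(write_file)
    return tmp_path

The truncated copy is then read in one-column mode, which is why the TOC can be found even in two-column documents.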
diff --git a/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
index 79ca919e..82e53111 100644
--- a/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
@@ -85,14 +85,13 @@ def __look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[
         for line_id, _ in enumerate(document):
             if line_id >= n:
                 prev_line = document[line_id - n]
-                is_prev_line_ends = prev_line.line.endswith((".", ";"))
-                res["prev_line_ends"].append(1 if is_prev_line_ends else 0)
+                res["prev_line_ends"].append(prev_line.line.endswith((".", ";")))
                 res["prev_ends_with_colon"].append(prev_line.line.endswith(":"))
                 res["prev_is_space"].append(prev_line.line.lower().isspace())
             else:
-                res["prev_line_ends"].append(0)
-                res["prev_ends_with_colon"].append(0)
-                res["prev_is_space"].append(0)
+                res["prev_line_ends"].append(False)
+                res["prev_ends_with_colon"].append(False)
+                res["prev_is_space"].append(False)
         return res
 
     def __get_line_relative_length(self, lines: List[LineWithMeta]) -> List[float]:
diff --git a/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
index cafb4c10..130a5560 100755
--- a/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
@@ -13,6 +13,22 @@
 
 class PairedFeatureExtractor(AbstractFeatureExtractor):
+    """
+    This class is used as an auxiliary feature extractor along with the main feature extractor.
+    It adds "raw" features related to the importance of lines.
+    Based on a single line property (size or indentation), it computes the raw depth of a line inside the document tree.
+
+    Example:
+        For the lines
+            line1 (size=16)
+            line2 (size=14)
+            line3 (size=12)
+            line4 (size=12)
+            line5 (size=14)
+            line6 (size=12)
+        we obtain the feature vector (raw_depth_size)
+            [0, 1, 2, 2, 1, 2]
+    """
 
     def parameters(self) -> dict:
         return {}
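Note: the docstring's example can be reproduced with a simple stack walk over one line property. This is an illustrative sketch, not the extractor's actual implementation — a line's raw depth is the number of currently "open" lines with a strictly larger size:

from typing import List

def raw_depth(sizes: List[float]) -> List[int]:
    stack: List[float] = []  # sizes of currently open "parent" lines
    depths = []
    for size in sizes:
        while stack and stack[-1] <= size:
            stack.pop()  # close parents that are not larger than this line
        depths.append(len(stack))
        stack.append(size)
    return depths

assert raw_depth([16, 14, 12, 12, 14, 12]) == [0, 1, 2, 2, 1, 2]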
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
index 15516c75..9e00e819 100755
--- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import pickle
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -47,7 +47,11 @@ def predict(self, features: pd.DataFrame) -> List[int]:
         # return list [1, 2, 3, -1, -1, ...], where positive values mean header depth and -1 means a non-header line
         return list(result)
 
-    def fit(self, binary_classifier_parameters: dict, target_classifier_parameters: dict, features: pd.DataFrame, features_names: list[str]) -> None:
+    def fit(self,
+            binary_classifier_parameters: Dict[str, Union[int, float, str]],
+            target_classifier_parameters: Dict[str, Union[int, float, str]],
+            features: pd.DataFrame,
+            features_names: List[str]) -> None:
         self.classifiers["binary"] = XGBClassifier(**binary_classifier_parameters)
         self.classifiers["target"] = XGBClassifier(**target_classifier_parameters)
         self.binary_classifier.fit(features[features_names], features.label != -1)
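Note: fit() above trains two XGBoost models (the binary one on `features.label != -1`), and the comment in predict() suggests a two-stage scheme along these lines. A hedged sketch of how such a predict() can work, not the verbatim method body:

import numpy as np
import pandas as pd

def predict_two_stage(binary_clf, target_clf, features: pd.DataFrame, features_names: list) -> list:
    # stage 1: flag header lines with the binary classifier
    is_header = binary_clf.predict(features[features_names]).astype(bool)
    # stage 2: assign a depth to the flagged lines; non-header lines keep -1
    result = np.full(len(features), -1)
    if is_header.any():
        result[is_header] = target_clf.predict(features.loc[is_header, features_names])
    return list(result)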
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index 8db61804..ee68c29f 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -70,7 +70,7 @@ Api parameters description
    * - :cspan:`3` **Type of document structure parsing**
 
    * - document_type
-     - other, law, tz, diploma
+     - other, law, tz, diploma, article, fintoc
      - other
      - Type of the document structure according to specific domain.
 
@@ -216,13 +216,15 @@ Api parameters description
        It is highly recommended to use this option value for any PDF document parsing.
 
    * - language
-     - rus, eng, rus+eng
+     - rus, eng, rus+eng, fra, spa
      - rus+eng
      - Language of the parsed PDF document without a textual layer. The following values are available:
 
        * **rus** -- Russian;
        * **eng** -- English;
-       * **rus+eng** -- both Russian and English.
+       * **rus+eng** -- both Russian and English;
+       * **fra** -- French (for fintoc structure type);
+       * **spa** -- Spanish (for fintoc structure type).
 
    * - pages
      - :, start:, :end, start:end
diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
index b3781b2d..60ad0096 100644
--- a/docs/source/parameters/pdf_handling.rst
+++ b/docs/source/parameters/pdf_handling.rst
@@ -49,16 +49,19 @@ PDF and images handling
        It is highly recommended to use this option value for any PDF document parsing.
 
    * - language
-     - rus, eng, rus+eng
+     - rus, eng, rus+eng, fra, spa
      - rus+eng
      - * :meth:`dedoc.DedocManager.parse`
        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
        * :meth:`dedoc.readers.ReaderComposition.read`
+       * :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract`
      - Language of the document without a textual layer. The following values are available:
 
        * **rus** -- Russian;
        * **eng** -- English;
-       * **rus+eng** -- both Russian and English.
+       * **rus+eng** -- both Russian and English;
+       * **fra** -- French (for :class:`~dedoc.structure_extractors.FintocStructureExtractor`);
+       * **spa** -- Spanish (for :class:`~dedoc.structure_extractors.FintocStructureExtractor`).
 
    * - pages
      - :, start:, :end, start:end
diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst
index dfd09803..09b592c2 100644
--- a/docs/source/parameters/structure_type.rst
+++ b/docs/source/parameters/structure_type.rst
@@ -15,7 +15,7 @@ Structure type configuring
      - Description
 
    * - document_type
-     - other, law, tz, diploma
+     - other, law, tz, diploma, article, fintoc
      - other
      - * :meth:`dedoc.DedocManager.parse`
        * :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract`
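Note: a hedged usage example for the parameter values documented above, assuming dedoc's standard HTTP API (the host, port, and /upload endpoint follow a default deployment; adjust to yours):

import requests

with open("prospectus_fr.pdf", "rb") as pdf_file:
    response = requests.post(
        "http://localhost:1231/upload",
        files={"file": ("prospectus_fr.pdf", pdf_file)},
        data={"document_type": "fintoc", "language": "fra", "pdf_with_text_layer": "true"},
    )
print(response.json()["content"]["structure"])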
diff --git a/scripts/fintoc2022/dataset_loader.py b/scripts/fintoc2022/dataset_loader.py
index f2beb730..9d9560e2 100755
--- a/scripts/fintoc2022/dataset_loader.py
+++ b/scripts/fintoc2022/dataset_loader.py
@@ -25,29 +25,29 @@ class FintocLineWithLabelExtractor:
     def __init__(self) -> None:
         self.readers = {"tabby": PdfTabbyReader(), "txt_layer": PdfTxtlayerReader()}
 
-    def get_lines(self, file_name: str, file_path: str, annotation_path: str, reader_name: str) -> List[LineWithLabel]:
+    def get_lines(self, file_name: str, file_path: str, gt_path: str, reader_name: str) -> List[LineWithLabel]:
         """
-        Extract lines from PDF document, create labels for lines from annotation file given by FinTOC.
-        Annotations are matched to lines using Levenshtein distance (threshold=0.8).
+        Extract lines from a PDF document and create labels for them from the ground-truth file given by FinTOC.
+        Labeled lines are matched to the lines extracted by dedoc using Levenshtein distance (threshold=0.8).
 
         :param file_name: name of the file (PDF, json)
         :param file_path: path to the PDF document
-        :param annotation_path: path to the JSON file with annotations
+        :param gt_path: path to the ground-truth JSON file with labels
         :param reader_name: ("tabby", "txt_layer") - type of PDF reader used for lines extraction
         :return: document in form of list of lines with labels
         """
         document = self.readers[reader_name].read(file_path, parameters={"need_header_footer_analysis": "True"})
 
-        annotations = defaultdict(list)
-        with open(annotation_path) as annotations_file:
-            for annotation in json.load(annotations_file):
-                annotations[annotation["page"] - 1].append(annotation)
+        labeled_lines = defaultdict(list)
+        with open(gt_path) as gt_file:
+            for labeled_line in json.load(gt_file):
+                labeled_lines[labeled_line["page"] - 1].append(labeled_line)
 
         result = []
         for line in document.lines:
-            annotations_page = [(ratio(line.line, annotation["text"]), annotation) for annotation in annotations[line.metadata.page_id]]
-            best_annotation = max(annotations_page, key=lambda t: t[0], default=(0, {}))
-            depth = best_annotation[1]["depth"] if len(annotations_page) > 0 and best_annotation[0] > 0.8 else "-1"
+            page_candidates = [(ratio(line.line, labeled_line["text"]), labeled_line) for labeled_line in labeled_lines[line.metadata.page_id]]
+            best_line = max(page_candidates, key=lambda t: t[0], default=(0, {}))
+            depth = best_line[1]["depth"] if len(page_candidates) > 0 and best_line[0] > 0.8 else "-1"
             result.append(LineWithLabel(line=line.line, metadata=line.metadata, annotations=line.annotations, label=str(depth), group=file_name, uid=line.uid))
 
         return sorted(result, key=lambda x: (x.metadata.page_id, x.metadata.line_id))
@@ -89,21 +89,27 @@ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> D
         if os.path.isfile(pkl_path) and use_cache:
             with gzip.open(pkl_path) as input_file:
-                lines = pickle.load(input_file)
-            self.logger.info("Data were loaded from the local disk")
-            return lines
+                parsed_files = pickle.load(input_file)
+            self.logger.info(f"Data were loaded from the local disk: {len(parsed_files)} files")
+            return parsed_files
 
+        result = self.__read_pdf_lines(archive_path, language, reader_name)
+
+        with gzip.open(pkl_path, "wb") as out:
+            pickle.dump(obj=result, file=out)
+        self.logger.info(Counter([line.label for document in result.values() for line in document]))
+        return result
+
+    def __read_pdf_lines(self, archive_path: str, language: str, reader_name: str) -> Dict[str, List[LineWithLabel]]:
         with zipfile.ZipFile(archive_path, "r") as zip_ref:
             zip_ref.extractall(self.dataset_dir)
+
         data_dir = os.path.join(self.dataset_dir, "data", language)
         pdf_dir = os.path.join(data_dir, "pdf")
-        annotations_dir = os.path.join(data_dir, "annots")
+        gt_dir = os.path.join(data_dir, "annots")
         pdf_files = {pdf_file[:-len(".pdf")]: os.path.join(pdf_dir, pdf_file) for pdf_file in os.listdir(pdf_dir) if pdf_file.endswith(".pdf")}
-        annotations_files = {
-            ann_file[:-len(".pdf.fintoc4.json")]: os.path.join(annotations_dir, ann_file)
-            for ann_file in os.listdir(annotations_dir) if ann_file.endswith(".json")
-        }
-        assert set(pdf_files) == set(annotations_files)
+        gt_files = {gt_file[:-len(".pdf.fintoc4.json")]: os.path.join(gt_dir, gt_file) for gt_file in os.listdir(gt_dir) if gt_file.endswith(".json")}
+        assert set(pdf_files) == set(gt_files)
 
         result = {}
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -111,17 +117,8 @@ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> D
             pdf_tmp_path = os.path.join(tmp_dir, file_name) + ".pdf"
             shutil.copy(pdf_files[file_name], pdf_tmp_path)
             try:
-                document = self.line_extractor.get_lines(
-                    file_name=file_name,
-                    file_path=pdf_tmp_path,
-                    annotation_path=annotations_files[file_name],
-                    reader_name=reader_name
-                )
+                document = self.line_extractor.get_lines(file_name=file_name, file_path=pdf_tmp_path, gt_path=gt_files[file_name], reader_name=reader_name)
                 result[pdf_files[file_name]] = document
             except Exception as e:
                 self.logger.warning(f"Failed to read {file_name} by {reader_name}, error: {e}")
-
-        with gzip.open(pkl_path, "wb") as out:
-            pickle.dump(obj=result, file=out)
-        self.logger.info(Counter([line.label for document in result.values() for line in document]))
         return result
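Note: a small illustration of the matching criterion described in the docstring above, assuming `ratio` comes from the python-Levenshtein package; a ground-truth label is attached to a dedoc-extracted line only if the best similarity among the page's candidates exceeds 0.8:

from Levenshtein import ratio

print(ratio("1. Introduction", "1 Introduction"))  # ~0.97 -> matched, depth taken from ground truth
print(ratio("1. Introduction", "Risk factors"))    # well below 0.8 -> line labeled "-1"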
"pdf_with_text_layer/prospectus_sp.pdf" + file_name = "fintoc/prospectus_sp.pdf" result = self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true", language="sp")) tree = result["content"]["structure"] diff --git a/tests/data/fintoc/prospectus_en.pdf b/tests/data/fintoc/prospectus_en.pdf new file mode 100644 index 00000000..cf54b365 Binary files /dev/null and b/tests/data/fintoc/prospectus_en.pdf differ diff --git a/tests/data/pdf_with_text_layer/prospectus_fr.pdf b/tests/data/fintoc/prospectus_fr.pdf similarity index 100% rename from tests/data/pdf_with_text_layer/prospectus_fr.pdf rename to tests/data/fintoc/prospectus_fr.pdf diff --git a/tests/data/pdf_with_text_layer/prospectus_sp.pdf b/tests/data/fintoc/prospectus_sp.pdf similarity index 100% rename from tests/data/pdf_with_text_layer/prospectus_sp.pdf rename to tests/data/fintoc/prospectus_sp.pdf