From cc5f45cb22687995354bf571c6d8bb1a23388b0f Mon Sep 17 00:00:00 2001
From: Nasty
Date: Thu, 18 Apr 2024 18:04:53 +0300
Subject: [PATCH 1/8] Add fintoc scripts (work in progress)
---
.flake8 | 9 +-
.pre-commit-config.yaml | 2 +-
.../fintoc_structure_extractor.py | 23 +
.../fintoc_feature_extractor.py | 245 +++++++
.../paired_feature_extractor.py | 71 ++
.../fintoc_classifier.py | 213 ++++++
scripts/fintoc2022/__init__.py | 0
scripts/fintoc2022/benchmark_fintoc.py | 0
scripts/fintoc2022/dataset_loader.py | 128 ++++
scripts/fintoc2022/metric.py | 624 ++++++++++++++++++
scripts/fintoc2022/toc_extractor.py | 93 +++
scripts/fintoc2022/train_fintoc_classifier.py | 0
scripts/fintoc2022/utils.py | 43 ++
13 files changed, 1447 insertions(+), 4 deletions(-)
create mode 100644 dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
create mode 100644 dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
create mode 100755 dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
create mode 100755 dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
create mode 100644 scripts/fintoc2022/__init__.py
create mode 100644 scripts/fintoc2022/benchmark_fintoc.py
create mode 100755 scripts/fintoc2022/dataset_loader.py
create mode 100755 scripts/fintoc2022/metric.py
create mode 100755 scripts/fintoc2022/toc_extractor.py
create mode 100644 scripts/fintoc2022/train_fintoc_classifier.py
create mode 100755 scripts/fintoc2022/utils.py
diff --git a/.flake8 b/.flake8
index d7afb7d1..555b4381 100644
--- a/.flake8
+++ b/.flake8
@@ -16,12 +16,15 @@ exclude =
resources,
venv,
build,
- dedoc.egg-info
- docs/_build
+ dedoc.egg-info,
+ docs/_build,
+ scripts/fintoc2022/metric.py
# ANN101 - type annotations for self
+# T201 - prints found
+# JS101 - Multi-line container not broken after opening character
ignore =
ANN101
per-file-ignores =
scripts/*:T201
- scripts/benchmark_pdf_performance*:JS101,T201
+ scripts/benchmark_pdf_performance*:JS101
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 76ee04b4..0f439368 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
- exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
+ exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
args:
- "--config=.flake8"
additional_dependencies: [
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
new file mode 100644
index 00000000..517fe13f
--- /dev/null
+++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
@@ -0,0 +1,23 @@
+from typing import Optional
+
+from dedoc.data_structures import UnstructuredDocument
+from dedoc.structure_extractors import AbstractStructureExtractor
+
+
+class FintocStructureExtractor(AbstractStructureExtractor):
+ """
+ This class is an implementation of the TOC extractor for the `FinTOC 2022 Shared task`_.
+ The code is a modification of the winner's solution (ISP RAS team).
+
+ You can find the description of this type of structure in the section :ref:`fintoc_structure`.
+ """
+ document_type = "fintoc"
+
+ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
+ """
+
+ To get the information about the method's parameters look at the documentation of the class \
+ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
+ """
+
+ return document
diff --git a/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
new file mode 100644
index 00000000..b5719f44
--- /dev/null
+++ b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
@@ -0,0 +1,245 @@
+import gzip
+import json
+import os
+import pickle
+import re
+import zipfile
+from collections import defaultdict
+from typing import Dict, Iterator, List, Optional, Tuple
+
+import pandas as pd
+import wget
+from Levenshtein import ratio
+from tqdm import tqdm
+
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
+from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import ListFeaturesExtractor
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.empty_prefix import EmptyPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.roman_prefix import RomanPrefix
+from dedoc.structure_extractors.feature_extractors.paired_feature_extractor import PairedFeatureExtractor
+from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
+from dedoc.structure_extractors.feature_extractors.utils_feature_extractor import normalization_by_min_max
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_year
+from dedoc.utils.utils import flatten
+from train_dataset.data_structures.line_with_label import LineWithLabel
+
+
+class FintocFeatureExtractor(AbstractFeatureExtractor):
+
+    def __init__(self, tocs: dict) -> None:
+ self.paired_feature_extractor = PairedFeatureExtractor()
+ self.prefix_list = [BulletPrefix, AnyLetterPrefix, LetterPrefix, BracketPrefix, BracketRomanPrefix, DottedPrefix, RomanPrefix]
+ self.list_feature_extractors = [
+ ListFeaturesExtractor(window_size=10, prefix_list=self.prefix_list),
+ ListFeaturesExtractor(window_size=25, prefix_list=self.prefix_list),
+ ListFeaturesExtractor(window_size=100, prefix_list=self.prefix_list)
+ ]
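+        # encode the type of the line prefix as an integer feature; 0 is reserved for the empty prefix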
+ self.prefix2number = {prefix.name: i for i, prefix in enumerate(self.prefix_list, start=1)}
+ self.prefix2number[EmptyPrefix.name] = 0
+ self.tocs = tocs
+
+ def parameters(self) -> dict:
+ return {}
+
+ def fit(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> "AbstractFeatureExtractor":
+ return self
+
+ def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> pd.DataFrame:
+ assert len(documents) > 0
+ result_matrix = pd.concat([self.__process_document(document) for document in tqdm(documents)], ignore_index=True)
+ result_matrix = pd.concat([result_matrix, self.paired_feature_extractor.transform(documents)], axis=1)
+ features = sorted(result_matrix.columns)
+ result_matrix = result_matrix[features].astype(float)
+ result_matrix["text"] = [line.line for line in flatten(documents)]
+ features.append("text")
+ return result_matrix[features]
+
+ def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame:
+ features_df = pd.DataFrame(self.__look_at_prev_line(document=lines, n=1))
+ features_df["line_relative_length"] = self.__get_line_relative_length(lines)
+
+ list_features = pd.concat([f_e.one_document(lines)[1] for f_e in self.list_feature_extractors], axis=1)
+
+ page_ids = [line.metadata.page_id for line in lines]
+ start_page, finish_page = (min(page_ids), max(page_ids)) if page_ids else (0, 0)
+
+ total_lines = len(lines)
+ one_line_features_dict = defaultdict(list)
+ for line_id, line in enumerate(lines):
+ for item in self.__one_line_features(line, total_lines, start_page=start_page, finish_page=finish_page):
+ feature_name, feature = item[0], item[1]
+ one_line_features_dict[feature_name].append(feature)
+
+ one_line_features_df = pd.DataFrame(one_line_features_dict)
+ one_line_features_df["font_size"] = self._normalize_features(one_line_features_df.font_size)
+
+ one_line_features_df = self.prev_next_line_features(one_line_features_df, 3, 3)
+ result_matrix = pd.concat([one_line_features_df, features_df, list_features], axis=1)
+ return result_matrix
+
+ def __look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[str, List]:
+ """
+ Look at previous line and compare with current line
+
+ :param document: list of lines
+ :param n: previous line number to look
+ :return: dict of features
+ """
+ res = defaultdict(list)
+ for line_id, line in enumerate(document):
+ if line_id >= n:
+ prev_line = document[line_id - n]
+ is_prev_line_ends = prev_line.line.endswith(('.', ';'))
+ res["prev_line_ends"].append(1 if is_prev_line_ends else 0)
+ res["prev_ends_with_colon"].append(prev_line.line.endswith(":"))
+ res["prev_is_space"].append(prev_line.line.lower().isspace())
+ else:
+ res["prev_line_ends"].append(0)
+ res["prev_ends_with_colon"].append(0)
+ res["prev_is_space"].append(0)
+ return res
+
+ def __get_line_relative_length(self, lines: List[LineWithMeta]) -> List[float]:
+ max_len = max([len(line.line) for line in lines])
+ relative_lengths = [len(line.line) / max_len for line in lines]
+ return relative_lengths
+
+ def __one_line_features(self, line: LineWithMeta, total_lines: int, start_page: int, finish_page: int) -> Iterator[Tuple[str, int]]:
+ yield "normalized_page_id", normalization_by_min_max(line.metadata.page_id, min_v=start_page, max_v=finish_page)
+ yield "indentation", self._get_indentation(line)
+ yield "spacing", self._get_spacing(line)
+ yield "bold", self._get_bold(line)
+ yield "italic", self._get_italic(line)
+ yield from self._get_color(line)
+ yield "font_size", self._get_size(line)
+
+ yield "line_id", normalization_by_min_max(line.metadata.line_id, min_v=0, max_v=total_lines)
+ yield "num_year_regexp", len(regexps_year.findall(line.line))
+ yield "endswith_dot", line.line.endswith(".")
+ yield "endswith_semicolon", line.line.endswith(";")
+ yield "endswith_colon", line.line.endswith(":")
+ yield "endswith_comma", line.line.endswith(",")
+ yield "startswith_bracket", line.line.strip().startswith(('(', '{'))
+
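+        # count opening parentheses left unclosed within the line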
+ bracket_cnt = 0
+ for char in line.line:
+ if char == '(':
+ bracket_cnt += 1
+ elif char == ')':
+ bracket_cnt = max(0, bracket_cnt - 1)
+ yield "bracket_num", bracket_cnt
+
+ probable_toc_title = re.sub(r"[\s:]", "", line.line).lower()
+ yield "is_toc_title", probable_toc_title in TOCFeatureExtractor.titles
+ yield from self.__find_in_toc(line)
+
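+        # +1 in the denominator guards against division by zero on empty lines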
+ line_length = len(line.line) + 1
+ yield "supper_percent", sum((1 for letter in line.line if letter.isupper())) / line_length
+ yield "letter_percent", sum((1 for letter in line.line if letter.isalpha())) / line_length
+ yield "number_percent", sum((1 for letter in line.line if letter.isnumeric())) / line_length
+ yield "words_number", len(line.line.split())
+
+ def __find_in_toc(self, line: LineWithMeta) -> Iterator[Tuple[str, int]]:
+ if not hasattr(line, "group"):
+ yield "is_toc", 0
+ yield "in_toc", 0
+ yield "toc_exists", 0
+ else:
+ toc = self.tocs.get(line.group, [])
+ is_toc = 0
+ in_toc = 0
+ toc_exists = int(len(toc) > 0)
+ line_text = line.line.lower().strip()
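+            # the line matches a TOC entry if the Levenshtein ratio of their texts exceeds 0.8:
+            # "in_toc" - the line lies on the page the entry points to (the title itself),
+            # "is_toc" - it lies on another page (most likely inside the table of contents)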
+ for item in toc:
+ if ratio(line_text, item["text"].lower()) > 0.8:
+ is_toc = 0 if line.metadata.page_id + 1 == int(item["page"]) else 1
+ in_toc = 1 if line.metadata.page_id + 1 == int(item["page"]) else 0
+ break
+ yield "is_toc", is_toc
+ yield "in_toc", in_toc
+ yield "toc_exists", toc_exists
+
+
+def handle_file(file: str, dir_out: str, extractor: AbstractFeatureExtractor) -> None:
+ file_name = os.path.split(file)[-1].split(".")[0]
+ with gzip.open(file) as f_in:
+ lines = pickle.load(file=f_in)
+ df = lines2dataframe(lines, extractor)
+ df.to_csv(os.path.join(dir_out, file_name + "_df.csv.gz"), index=False)
+ df.to_pickle(os.path.join(dir_out, file_name + "_df.pkl.gz"))
+
+
+def lines2dataframe(lines: List[LineWithLabel], extractor: AbstractFeatureExtractor) -> pd.DataFrame:
+    assert len(lines) > 0
+ lines2docs = []
+ current_document = None
+ reg_empty_string = re.compile(r"^\s*\n$")
+ special_unicode_symbols = [u"\uf0b7", u"\uf0d8", u"\uf084", u"\uf0a7", u"\uf0f0", u"\x83"]
+
+ lines = [line for line in lines if not reg_empty_string.match(line.line)]
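+    # group consecutive lines with the same group id (document name) into separate documents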
+ for line in lines:
+ for ch in special_unicode_symbols:
+ line.set_line(line.line.replace(ch, ""))
+ if line.group == current_document:
+ lines2docs[-1].append(line)
+ else:
+ current_document = line.group
+ lines2docs.append([line])
+ df = extractor.transform(lines2docs)
+
+ df["label"] = [int(line.label) for line in lines]
+ df["group"] = [line.group for line in lines]
+ df["uid"] = [line.uid for line in lines]
+ df["page_id"] = [line.metadata.page_id for line in lines]
+ return df
+
+
+def main(dir_out: str, train: bool) -> None:
+ os.makedirs(dir_out, exist_ok=True)
+
+ root = "/tmp/.fintoc/train" if train else "/tmp/.fintoc/test"
+ lines_dir = os.path.join(root, "lines")
+ if train:
+ lines_url = "https://at.ispras.ru/owncloud/index.php/s/yvYn491d6Du8ZuV/download" # train
+ else:
+ lines_url = "https://at.ispras.ru/owncloud/index.php/s/h3TdYfQipiVAxpE/download" # test
+
+ toc_dir = os.path.join(root, "toc")
+ if train:
+ toc_url = "https://at.ispras.ru/owncloud/index.php/s/0VJbQWrD11R98Sy/download" # train
+ else:
+ toc_url = "https://at.ispras.ru/owncloud/index.php/s/GCoZitUsfCLPLVI/download" # test
+
+ if not os.path.isdir(root):
+ os.makedirs(root)
+
+ if not os.path.isdir(lines_dir):
+ archive = os.path.join(root, "lines.zip")
+ wget.download(lines_url, archive)
+ with zipfile.ZipFile(archive, 'r') as zip_ref:
+ zip_ref.extractall(root)
+
+ if not os.path.isdir(toc_dir):
+ archive = os.path.join(root, "toc.zip")
+ wget.download(toc_url, archive)
+ with zipfile.ZipFile(archive, 'r') as zip_ref:
+ zip_ref.extractall(root)
+
+ for lang in tqdm(["en", "fr", "sp"]):
+ lines_file = os.path.join(lines_dir, f"lines_{lang}_txt_layer.pkg.gz")
+ tocs_file = os.path.join(toc_dir, f"{lang}_toc.json")
+ with open(tocs_file) as f:
+ tocs = json.load(f)
+ extractor = FintocFeatureExtractor(tocs)
+ handle_file(file=lines_file, extractor=extractor, dir_out=dir_out)
+
+
+if __name__ == '__main__':
+ stage = "test"
+ main(dir_out=f"/home/nasty/fintoc2022/{stage}/pandas", train=stage == "train")
diff --git a/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
new file mode 100755
index 00000000..cafb4c10
--- /dev/null
+++ b/dedoc/structure_extractors/feature_extractors/paired_feature_extractor.py
@@ -0,0 +1,71 @@
+import json
+from typing import Callable, List, Optional
+
+import numpy as np
+import pandas as pd
+
+from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
+from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
+from dedoc.utils.utils import flatten
+
+
+class PairedFeatureExtractor(AbstractFeatureExtractor):
+
+ def parameters(self) -> dict:
+ return {}
+
+ def fit(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> "AbstractFeatureExtractor":
+ return self
+
+ def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> pd.DataFrame:
+ df = pd.DataFrame()
+ df["raw_depth_size"] = list(flatten([self._handle_one_document(document, self.__get_size) for document in documents]))
+ df["raw_depth_indentation"] = list(flatten([self._handle_one_document(document, self._get_indentation) for document in documents]))
+ return df
+
+    def _handle_one_document(self, document: List[LineWithMeta], get_feature: Callable) -> List[int]:
+ if len(document) == 0:
+ return []
+ if len(document) == 1:
+ return [0]
+
+ features = [get_feature(line) for line in document]
+ std = np.std(features)
+ result = []
+ stack = []
+
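+        # monotonic stack: pop the lines whose feature value does not exceed the current one (within std);
+        # the lines left on the stack dominate the current line, so its depth equals the stack size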
+ for line in document:
+ while len(stack) > 0 and self.__compare_lines(stack[-1], line, get_feature, std) <= 0: # noqa
+ stack.pop()
+ result.append(len(stack))
+ stack.append(line)
+
+ return result
+
+ def __get_size(self, line: LineWithMeta) -> float:
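+        # prefer the explicit font size annotation and fall back to the bbox height, then to 0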
+ annotations = line.annotations
+ size_annotation = [annotation for annotation in annotations if annotation.name == SizeAnnotation.name]
+ if len(size_annotation) > 0:
+ return float(size_annotation[0].value)
+
+ bbox_annotation = [annotation for annotation in annotations if annotation.name == BBoxAnnotation.name]
+ if len(bbox_annotation) > 0:
+ bbox = json.loads(bbox_annotation[0].value)
+ return bbox["height"]
+
+ return 0
+
+    def __compare_lines(self, first_line: LineWithMeta, second_line: LineWithMeta, get_feature: Callable, threshold: float = 0) -> int:
+ first_feature = get_feature(first_line)
+ second_feature = get_feature(second_line)
+
+ if first_feature > second_feature + threshold:
+ return 1
+
+ if second_feature > first_feature + threshold:
+ return -1
+
+ return 0
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
new file mode 100755
index 00000000..1e0b1541
--- /dev/null
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -0,0 +1,213 @@
+import gzip
+import json
+import os
+import pickle
+import shutil
+from statistics import mean
+from typing import List, Optional, Union
+
+import pandas as pd
+import xgbfir
+import xgboost as xgb
+from sklearn.model_selection import GroupKFold
+from tqdm import tqdm
+
+from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor, lines2dataframe
+from scripts.fintoc2022.metric import score
+from scripts.fintoc2022.utils import create_json_result, get_values_from_csv
+from train_dataset.data_structures.line_with_label import LineWithLabel
+
+
+class FintocClassifier:
+
+ def __init__(self,
+ tocs_path: str,
+ save_path: str,
+ binary_classifier_params: Optional[dict] = None,
+ target_classifier_params: Optional[dict] = None,
+ load_trained: bool = False,
+ lang: str = "en"):
+ self.save_path = save_path
+ self.lang = lang
+ if load_trained:
+ with gzip.open(os.path.join(self.save_path, f"binary_classifier_{self.lang}.pkg.gz"), "rb") as input_file:
+ self.binary_classifier = pickle.load(file=input_file)
+ with gzip.open(os.path.join(self.save_path, f"target_classifier_{self.lang}.pkg.gz"), "rb") as input_file:
+ self.target_classifier = pickle.load(file=input_file)
+ else:
+            assert binary_classifier_params is not None and target_classifier_params is not None
+ self.binary_classifier = xgb.XGBClassifier(**binary_classifier_params)
+ self.target_classifier = xgb.XGBClassifier(**target_classifier_params)
+ with open(tocs_path) as f:
+ tocs = json.load(f)
+ self.features_extractor = FintocFeatureExtractor(tocs)
+
+ def fit(self, data: Union[pd.DataFrame, List[LineWithLabel]],
+ cross_val: bool = True,
+ save: bool = False,
+ gt_dir: Optional[str] = None,
+ n_splits: int = 3) -> None:
+ if isinstance(data, pd.DataFrame):
+ features_df = data
+ else:
+ features_df = lines2dataframe(data, self.features_extractor)
+ print("Features shape: {}".format(features_df.shape))
+ results = None
+
+ if cross_val:
+            assert gt_dir is not None
+ results = self.evaluate_fintoc_metric(features_df=features_df, gt_dir=gt_dir, n_splits=n_splits)
+
+ if not save:
+ return
+
+ features_names = self.__get_features_names(features_df)
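+        # two-stage training: the binary classifier learns to detect title lines (label != -1),
+        # while the target classifier learns to predict the depth of the detected titles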
+ self.binary_classifier.fit(features_df[features_names], features_df.label != -1)
+ self.target_classifier.fit(features_df[features_names][features_df.label != -1],
+ features_df.label[features_df.label != -1])
+ self._save(features_names, results)
+
+ def _save(self, features_names: list, scores: Optional[dict]) -> None:
+ os.makedirs(self.save_path, exist_ok=True)
+ if scores is not None:
+ with open(os.path.join(self.save_path, f"scores_{self.lang}.json"), "w") as f:
+ json.dump(scores, f)
+ print("Scores were saved in {}".format(os.path.join(self.save_path, f"scores_{self.lang}.json")))
+
+ with gzip.open(os.path.join(self.save_path, F"binary_classifier_{self.lang}.pkg.gz"), "wb") as output_file:
+ pickle.dump(self.binary_classifier, output_file)
+ with gzip.open(os.path.join(self.save_path, f"target_classifier_{self.lang}.pkg.gz"), "wb") as output_file:
+ pickle.dump(self.target_classifier, output_file)
+ print("Classifiers were saved in {} directory".format(self.save_path))
+
+ xgbfir.saveXgbFI(self.binary_classifier, feature_names=features_names,
+ OutputXlsxFile=os.path.join(self.save_path, f"feature_importances_binary_{self.lang}.xlsx"))
+ xgbfir.saveXgbFI(self.target_classifier, feature_names=features_names,
+ OutputXlsxFile=os.path.join(self.save_path, f"feature_importances_target_{self.lang}.xlsx"))
+ print("Features importances were saved in {} directory".format(self.save_path))
+
+ def predict(self, data: Union[pd.DataFrame, List[LineWithLabel]]) -> dict:
+ """
+        :param data: list of document lines (labels unknown) or a dataframe with line features
+ :return: dict with TOC of the documents in the required format
+ """
+ if isinstance(data, pd.DataFrame):
+ features_df = data
+ else:
+ features_df = lines2dataframe(data, self.features_extractor)
+ features_names = self.__get_features_names(features_df)
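+        # two-stage prediction: detect title lines first, then predict the depth of the found titles only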
+ binary_predictions = self.binary_classifier.predict(features_df[features_names])
+ features_df["label"] = binary_predictions
+ target_predictions = self.target_classifier.predict(features_df[features_names][features_df.label])
+ result_dict = create_json_result(features_df[features_df.label], target_predictions)
+ return result_dict
+
+ def evaluate_fintoc_metric(self,
+ features_df: pd.DataFrame,
+ gt_dir: str,
+ n_splits: int = 3) -> dict:
+
+ features_names = self.__get_features_names(features_df)
+ results_path = os.path.join(self.save_path, "results")
+ os.makedirs(results_path, exist_ok=True)
+
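+        # split by document (group) so that lines of one document never get into both train and validation parts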
+ kf = GroupKFold(n_splits=n_splits)
+
+ result_scores = {"td_scores": [], "toc_scores": []}
+ for i, (train_index, val_index) in tqdm(enumerate(kf.split(features_df, groups=features_df.group)),
+ total=n_splits):
+            df_train = features_df.iloc[train_index]
+            df_val = features_df.iloc[val_index]
+ self.binary_classifier.fit(df_train[features_names], df_train.label != -1)
+ self.target_classifier.fit(
+ df_train[features_names][df_train.label != -1], df_train.label[df_train.label != -1])
+ result_dict = self.predict(df_val)
+
+ tmpdir = "/tmp/fintoc/eval"
+ if os.path.isdir(tmpdir):
+ shutil.rmtree(tmpdir)
+ os.makedirs(tmpdir)
+ tmp_gt_dir, predictions_dir = os.path.join(tmpdir, "groundtruth"), os.path.join(tmpdir, "predictions")
+ os.makedirs(tmp_gt_dir)
+ os.makedirs(predictions_dir)
+
+ for doc_name, result in result_dict.items():
+ gt_doc_name = doc_name + ".pdf.fintoc4.json"
+ if gt_doc_name not in os.listdir(gt_dir):
+ print(f"\n{gt_doc_name} not found in groundtruth")
+ continue
+ with open(os.path.join(predictions_dir, gt_doc_name), "w") as json_file:
+ json.dump(result, json_file, indent=2)
+ shutil.copy(os.path.join(gt_dir, gt_doc_name), os.path.join(tmp_gt_dir, gt_doc_name))
+ score(tmp_gt_dir, predictions_dir)
+ shutil.rmtree(tmpdir)
+
+ path_scores = os.path.join(results_path, str(i))
+ os.makedirs(path_scores, exist_ok=True)
+ for file_name in ['td.log', 'toc.log', 'td_report.csv', 'toc_report.csv']:
+ shutil.move(file_name, os.path.join(path_scores, file_name))
+ f1, inex_f1 = get_values_from_csv(path_scores)
+ result_scores["td_scores"].append(f1)
+ result_scores["toc_scores"].append(inex_f1)
+ print(f"it {i}:\ntd {result_scores['td_scores'][-1]}\ntoc {result_scores['toc_scores'][-1]}")
+ result_scores["td_mean"] = mean(result_scores["td_scores"])
+ result_scores["toc_mean"] = mean(result_scores["toc_scores"])
+ return result_scores
+
+ def __get_features_names(self, features_df: pd.DataFrame) -> list:
+ features_names = [col for col in features_df.columns if col not in ("text", "label", "group", "uid")]
+ return features_names
+
+
+def train_classifier(train_dir: str) -> None:
+ clf_params = {
+ "en_binary": dict(random_state=42, learning_rate=0.25, max_depth=5, n_estimators=400,
+ colsample_bynode=0.8, colsample_bytree=0.5, tree_method="hist"),
+ "fr_binary": dict(random_state=42, learning_rate=0.1, max_depth=5, n_estimators=800,
+ colsample_bynode=0.5, colsample_bytree=0.8, tree_method="approx"),
+ "sp_binary": dict(random_state=42, learning_rate=0.25, max_depth=4, n_estimators=600,
+ colsample_bynode=0.5, colsample_bytree=0.5, tree_method="approx"),
+ "en_target": dict(random_state=42, learning_rate=0.07, max_depth=4, n_estimators=800,
+ colsample_bynode=1, colsample_bytree=1, tree_method="hist"),
+ "fr_target": dict(random_state=42, learning_rate=0.4, max_depth=5, n_estimators=800,
+ colsample_bynode=1, colsample_bytree=0.5, tree_method="exact"),
+ "sp_target": dict(random_state=42, learning_rate=0.25, max_depth=3, n_estimators=600,
+ colsample_bynode=0.5, colsample_bytree=1, tree_method="hist")
+ }
+ for lang in ("en", "fr", "sp"):
+ pandas_path = os.path.join(train_dir, "pandas", f"lines_{lang}_txt_layer_df.csv.gz")
+ cls = FintocClassifier(binary_classifier_params=clf_params[f"{lang}_binary"],
+ target_classifier_params=clf_params[f"{lang}_target"],
+ tocs_path=os.path.join(train_dir, "toc", f"{lang}_toc.json"),
+ save_path="resources",
+ load_trained=False,
+ lang=lang)
+ features_df = pd.read_csv(pandas_path, index_col=False)
+ cls.fit(data=features_df,
+ cross_val=True,
+ save=True,
+ gt_dir=os.path.join(train_dir, "data", lang, "annots"))
+
+
+def get_results(test_dir: str) -> None:
+ for lang in ("en", "fr", "sp"):
+ pandas_path = os.path.join(test_dir, "pandas", f"lines_{lang}_txt_layer_df.csv.gz")
+ cls = FintocClassifier(tocs_path=os.path.join(test_dir, "toc", f"{lang}_toc.json"),
+ save_path="resources",
+ load_trained=True,
+ lang=lang)
+ features_df = pd.read_csv(pandas_path, index_col=False)
+ result_dict = cls.predict(features_df)
+ results_dir = os.path.join(test_dir, "results", lang)
+ os.makedirs(results_dir, exist_ok=True)
+ for doc_name, result in result_dict.items():
+ json_doc_name = doc_name + ".pdf.fintoc4.json"
+ with open(os.path.join(results_dir, json_doc_name), "w") as json_file:
+ json.dump(result, json_file, indent=2)
+
+
+if __name__ == "__main__":
+ train = False
+ fintoc_dir = "/home/nasty/fintoc2022"
+ if train:
+ train_classifier(os.path.join(fintoc_dir, "train"))
+ else:
+ get_results(os.path.join(fintoc_dir, "test"))
diff --git a/scripts/fintoc2022/__init__.py b/scripts/fintoc2022/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/fintoc2022/benchmark_fintoc.py b/scripts/fintoc2022/benchmark_fintoc.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/fintoc2022/dataset_loader.py b/scripts/fintoc2022/dataset_loader.py
new file mode 100755
index 00000000..1a271b41
--- /dev/null
+++ b/scripts/fintoc2022/dataset_loader.py
@@ -0,0 +1,128 @@
+import gzip
+import json
+import logging
+import os
+import pickle
+import shutil
+import tempfile
+import zipfile
+from collections import Counter, defaultdict
+from typing import List
+
+import wget
+from Levenshtein import ratio
+
+from dedoc.config import get_config
+from dedoc.readers import PdfTabbyReader, PdfTxtlayerReader
+from dedoc.utils.utils import flatten
+from train_dataset.data_structures.line_with_label import LineWithLabel
+
+
+class FintocLineWithLabelExtractor:
+ """
+ Create LineWithLabel from documents and their annotations
+ """
+    def __init__(self) -> None:
+ self.readers = {
+ "tabby": PdfTabbyReader(),
+ "txt_layer": PdfTxtlayerReader()
+ }
+
+ def get_lines(self, file_name: str, file_path: str, annotation_path: str, reader_name: str) -> List[LineWithLabel]:
+ """
+        Extract lines from the PDF document and create labels for them using the annotation file provided by FinTOC.
+        Annotations are matched to lines using the Levenshtein ratio (threshold=0.8).
+
+        :param file_name: name of the file without its extension (shared by the PDF document and its JSON annotation)
+ :param file_path: path to the PDF document
+ :param annotation_path: path to the JSON file with annotations
+ :param reader_name: ("tabby", "txt_layer") - type of PDF reader used for lines extraction
+ :return: document in form of list of lines with labels
+ """
+ document = self.readers[reader_name].read(file_path, parameters={"need_header_footer_analysis": "True"})
+
+ annotations = defaultdict(list)
+ with open(annotation_path) as annotations_file:
+ for annotation in json.load(annotations_file):
+ annotations[annotation["page"] - 1].append(annotation)
+
+ result = []
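+        # match every extracted line with the closest annotation on its page; lines without a good match get depth "-1"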
+ for line in document.lines:
+ annotations_page = [(ratio(line.line, annotation["text"]), annotation) for annotation in annotations[line.metadata.page_id]]
+ best_annotation = max(annotations_page, key=lambda t: t[0], default=(0, {}))
+ depth = best_annotation[1]["depth"] if len(annotations_page) > 0 and best_annotation[0] > 0.8 else "-1"
+ result.append(LineWithLabel(line=line.line, metadata=line.metadata, annotations=line.annotations, label=depth, group=file_name, uid=line.uid))
+
+ return sorted(result, key=lambda l: (l.metadata.page_id, l.metadata.line_id))
+
+
+class FintocDatasetLoader:
+ """
+ Class for downloading data from the cloud, distributing lines into document groups and sorting them.
+ Returns data in form of document lines with their labels.
+ """
+ def __init__(self, dataset_dir: str, logger: logging.Logger) -> None:
+ """
+ :param dataset_dir: path to the directory where to store downloaded dataset
+ :param logger: logger for logging details of dataset loading
+ """
+ self.dataset_dir = dataset_dir
+ self.logger = logger
+ self.config = get_config()
+ self.data_url = "https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download"
+ self.line_extractor = FintocLineWithLabelExtractor()
+
+ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> List[List[LineWithLabel]]:
+ """
+        Download data from the cloud storage at `self.data_url` and sort document lines.
+
+ :param language: ("en", "fr", "sp") - language group
+ :param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF
+ :param use_cache: whether to use cached data (if dataset is already downloaded) or download it anyway
+ :return: list of documents, which are lists of lines with labels of the training dataset
+ """
+ archive_path = os.path.join(self.dataset_dir, "dataset.zip")
+ if not os.path.isfile(archive_path):
+ os.makedirs(self.dataset_dir, exist_ok=True)
+ self.logger.info("Start download dataset")
+ wget.download(self.data_url, archive_path)
+ self.logger.info(f"Finish download dataset to {archive_path}")
+
+ pkl_path = os.path.join(self.dataset_dir, f"lines_{language}_{reader_name}.pkl.gz")
+
+ if os.path.isfile(pkl_path) and use_cache:
+ with gzip.open(pkl_path) as input_file:
+ lines = pickle.load(input_file)
+ self.logger.info("Data were loaded from the local disk")
+ return lines
+
+ with zipfile.ZipFile(archive_path, "r") as zip_ref:
+ zip_ref.extractall(self.dataset_dir)
+ data_dir = os.path.join(self.dataset_dir, "data", language)
+ pdf_dir = os.path.join(data_dir, "pdf")
+ annotations_dir = os.path.join(data_dir, "annots")
+ pdf_files = {pdf_file[:-len(".pdf")]: os.path.join(pdf_dir, pdf_file) for pdf_file in os.listdir(pdf_dir) if pdf_file.endswith(".pdf")}
+ annotations_files = {ann_file[:-len(".pdf.fintoc4.json")]: os.path.join(annotations_dir, ann_file)
+ for ann_file in os.listdir(annotations_dir) if ann_file.endswith(".json")}
+ assert set(pdf_files) == set(annotations_files)
+
+ result = []
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ for file_name in pdf_files:
+ pdf_tmp_path = os.path.join(tmp_dir, file_name) + ".pdf"
+ shutil.copy(pdf_files[file_name], pdf_tmp_path)
+ try:
+ document = self.line_extractor.get_lines(
+ file_name=file_name,
+ file_path=pdf_tmp_path,
+ annotation_path=annotations_files[file_name],
+ reader_name=reader_name
+ )
+ result.append(document)
+ except Exception as e:
+ self.logger.warning(f"Failed to read {file_name} by {reader_name}, error: {e}")
+
+ with gzip.open(pkl_path, "wb") as out:
+ pickle.dump(obj=result, file=out)
+ self.logger.info(Counter([line.label for line in flatten(result)]))
+ return result
diff --git a/scripts/fintoc2022/metric.py b/scripts/fintoc2022/metric.py
new file mode 100755
index 00000000..eb82fd4a
--- /dev/null
+++ b/scripts/fintoc2022/metric.py
@@ -0,0 +1,624 @@
+"""
+This is a Python 3 script that rewrites the score function used in the Book Structure
+Extraction Competition @ ICDAR2013
+(https://www.cs.helsinki.fi/u/doucet/papers/ICDAR2013.pdf).
+It uses the classic Levenshtein distance implemented by
+https://pypi.org/project/python-Levenshtein/ instead of a customized Levenshtein
+distance.
+It is used to score participants in the FinTOC2020 shared task.
+
+------
+INSTALL
+------
+pip install python-Levenshtein
+
+------
+USAGE
+------
+python metric.py --gt_folder <gt_folder> --submission_folder <submission_folder>
+
+<gt_folder> and <submission_folder> are paths to folders containing JSON files:
+[
+ {
+ "text": String, # text of the TOC item/entry
+ "id": Int # identifer of the item/entry corresponding to its order in the TOC
+ "depth": Int # hierarchical level of the item
+ "page": Int # the (physical) page number where the item appears
+ }
+
+]
+"""
+
+import argparse
+import csv
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+from glob import glob
+from operator import itemgetter
+
+import Levenshtein
+import numpy as np
+
+JSON_EXTENSION = ".fintoc4.json"
+VERBOSE = True
+STRING_THRESHOLD = 0.85
+
+
+class TOCJson:
+ def __init__(self, json_file):
+ self.parse(json_file)
+
+ def parse(self, json_file):
+ with open(json_file, "r", encoding="utf-8") as infile:
+ content = json.load(infile)
+ self.entries = []
+ for dict_entry in content:
+ self.entries.append(Title(dict_entry["text"], dict_entry["page"], dict_entry["id"], dict_entry["depth"]))
+
+
+class Title:
+ def __init__(self, text, page_nb, id_, depth):
+ self.text = text
+ self.page_nb = page_nb
+ self.id_ = id_
+ self.depth = depth
+ self.matched = False
+
+ def __repr__(self):
+ return f"page={self.page_nb} title={repr(self.text)}"
+
+ def compare_page_nb(self, entry):
+ if isinstance(entry.page_nb, str):
+ entry.page_nb = int(entry.page_nb)
+ if self.page_nb == entry.page_nb:
+ return 0
+ if self.page_nb > entry.page_nb:
+ return 1
+ return -1
+
+ def compare_depth(self, entry):
+ if str(self.depth) == entry.depth:
+ return 0
+ if str(self.depth) > entry.depth:
+ return 1
+ return -1
+
+
+class ICDARMetric(ABC):
+
+ def __init__(self):
+ self.correct = 0
+ self.added = 0
+ self.missed = 0
+ self.mismatch = 0
+ self.p_per_doc = {}
+ self.r_per_doc = {}
+ self.f_per_doc = {}
+ self.title_acc_per_doc = {}
+
+ def compute_prf(self):
+ self.compute_p()
+ self.compute_r()
+ try:
+ self.f_score = 2 * self.prec * self.reca / (self.prec + self.reca)
+ except ZeroDivisionError:
+ self.f_score = 0
+ return self.prec, self.reca, self.f_score
+
+ def compute_p(self):
+ try:
+ self.prec = self.correct / (self.correct + self.added + self.mismatch)
+ except ZeroDivisionError:
+ self.prec = 0
+
+ def compute_r(self):
+ try:
+ self.reca = self.correct / (self.correct + self.missed + self.mismatch)
+ except ZeroDivisionError:
+ self.reca = 0
+
+ @abstractmethod
+ def initialize_stats(self):
+ self.correct = 0
+ self.added = 0
+ self.missed = 0
+ self.mismatch = 0
+ self.prec = 0.0
+ self.reca = 0.0
+ self.f_score = 0.0
+ self.title_acc = 0.0
+
+ @abstractmethod
+ def get_title_acc(self, *args):
+ pass
+
+ def format_float_percent(self, float_nb):
+ return "%.1f" % (100 * float_nb)
+
+ def format_res(self):
+ out = ["%6s" % self.format_float_percent(self.prec)]
+ out.append("%6s" % self.format_float_percent(self.reca))
+ out.append("%6s" % self.format_float_percent(self.f_score))
+ out.append("%6s" % self.format_float_percent(self.title_acc))
+ return out
+
+ def compute_avg_p(self):
+ return np.mean(list(self.p_per_doc.values()))
+
+ def compute_std_p(self):
+ return np.std(list(self.p_per_doc.values()))
+
+ def compute_avg_r(self):
+ return np.mean(list(self.r_per_doc.values()))
+
+ def compute_std_r(self):
+ return np.std(list(self.r_per_doc.values()))
+
+ def compute_avg_f(self):
+ return np.mean(list(self.f_per_doc.values()))
+
+ def compute_std_f(self):
+ return np.std(list(self.f_per_doc.values()))
+
+ def compute_avg_title_acc(self):
+ return np.mean(list(self.title_acc_per_doc.values()))
+
+ def compute_std_title_acc(self):
+ return np.std(list(self.title_acc_per_doc.values()))
+
+
+class InexMetric(ICDARMetric):
+
+ def __init__(self):
+ super().__init__()
+ self.level_correct = 0
+ self.level_acc = 0.0
+ self.level_acc_per_doc = {}
+
+ def initialize_stats(self):
+ super().initialize_stats()
+ self.level_correct = 0
+ self.level_acc = 0.0
+
+ def get_level_acc(self, nb_valid_links):
+ try:
+ self.level_acc = self.level_correct / nb_valid_links
+ except ZeroDivisionError:
+ self.level_acc = 0.0
+ return self.level_acc
+
+ def get_title_acc(self, nb_valid_links):
+ try:
+ self.title_acc = self.correct / nb_valid_links
+ except ZeroDivisionError:
+ self.title_acc = 0.0
+ return self.title_acc
+
+ def format_res(self):
+ out = super().format_res()
+ out.append("%6s" % self.format_float_percent(self.level_acc))
+ return out
+
+ def compute_avg_level_acc(self):
+ return np.mean(list(self.level_acc_per_doc.values()))
+
+ def compute_std_level_acc(self):
+ return np.std(list(self.level_acc_per_doc.values()))
+
+
+class XeroxMetric(ICDARMetric):
+
+ def __init__(self):
+ super().__init__()
+ self.text_sim = 0
+
+ def initialize_stats(self):
+ super().initialize_stats()
+ self.text_sim = 0
+
+ def get_title_acc(self):
+ try:
+ self.title_acc = self.text_sim / float(self.correct)
+ except ZeroDivisionError:
+ self.title_acc = 0.0
+ return self.title_acc
+
+
+class Stats:
+
+ def __init__(self):
+ self.ok_per_doc = {}
+ self.pbttl_per_doc = {}
+ self.pblvl_per_doc = {}
+ self.err_per_doc = {}
+ self.miss_per_doc = {}
+
+ def compute_sum_ok(self):
+ return sum(list(self.ok_per_doc.values()))
+
+ def compute_sum_pbttl(self):
+ return sum(list(self.pbttl_per_doc.values()))
+
+ def compute_sum_pblvl(self):
+ return sum(list(self.pblvl_per_doc.values()))
+
+ def compute_sum_err(self):
+ return sum(list(self.err_per_doc.values()))
+
+ def compute_sum_miss(self):
+ return sum(list(self.miss_per_doc.values()))
+
+
+class Writer:
+
+ def __init__(self):
+ self.toc_rows = self.format_icdar_heading()
+ self.td_rows = self.format_td_heading()
+
+ @classmethod
+    def format_icdar_heading(cls):
+ out = [
+ "Doc", "Xrx-P", "Xrx-R", "Xrx-F1", "Xrx-Title acc", "Inex08-P", "Inex08-R",
+ "Inex08-F1", "Inex08-Title acc", "Inex08-Level acc", "Ok", "PbTtl",
+ "PbLvl", "Err", "Miss", "book id"
+ ]
+ return [out]
+
+ @classmethod
+    def format_td_heading(cls):
+ out = ["Doc", "Prec", "Rec", "F1", "Book id"]
+ return [out]
+
+ def dump_all(self):
+ self.dump_toc()
+ self.dump_td()
+
+ def dump_toc(self):
+ with open("toc_report.csv", "w", encoding="utf-8") as outfile:
+ writer = csv.writer(outfile, dialect=csv.excel, delimiter="\t")
+ writer.writerows(self.toc_rows)
+
+ def dump_td(self):
+ with open("td_report.csv", "w", encoding="utf-8") as outfile:
+ writer = csv.writer(outfile, dialect=csv.excel, delimiter="\t")
+ writer.writerows(self.td_rows)
+
+
+def score_title_detection(toc1, toc2, log):
+ correct = 0
+ for entry1 in toc1.entries:
+ res = find_matching_entry(entry1.text, toc2)
+ if res is not None:
+ index, match_score = res
+ matched_text = toc2.entries[index].text
+ msg = "Gt title %s is matched to %s (score=%.4g)" % (repr(entry1.text), repr(matched_text), match_score)
+ log.info(msg)
+ entry1.matched = True
+ toc2.entries[index].matched = True
+ correct += 1
+ else:
+ log.info(f"Gt title {repr(entry1.text)} is not matched to any submission title")
+ for entry in toc2.entries:
+ if not entry.matched:
+ log.info(f"{entry} in submission not matched")
+ added = len([entry for entry in toc2.entries if not entry.matched])
+ missed = len([entry for entry in toc1.entries if not entry.matched])
+ log.info("nb of added titles: %i", added)
+ log.info("nb of missed titles: %i", missed)
+ log.info("nb of correct titles: %i", correct)
+ # return score
+ try:
+ prec = correct / (correct + added)
+ except ZeroDivisionError:
+ prec = 0.0
+ try:
+ reca = correct / (correct + missed)
+ except ZeroDivisionError:
+ reca = 0.0
+ try:
+ f1_score = 2 * prec * reca / (prec + reca)
+ except ZeroDivisionError:
+ f1_score = 0.0
+ return prec, reca, f1_score
+
+
+def find_matching_entry(text, toc):
+ if len(toc.entries) == 0:
+ return None
+ similarities = []
+ for entry in toc.entries:
+ if not entry.matched:
+ similarities.append(Levenshtein.ratio(text, entry.text))
+ else:
+ similarities.append(0)
+ index, match_score = max(enumerate(similarities), key=itemgetter(1))
+ if match_score > STRING_THRESHOLD:
+ return index, match_score
+ return None
+
+
+def update_icdar_stats(toc1, toc2, inex_metric, xerox_metric, log):
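+    # walk both TOCs in parallel: entries with equal page numbers are matched against each other,
+    # a smaller groundtruth page counts as a missed entry, a smaller submission page as an added one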
+ i1, i2 = 0, 0
+ if len(toc1.entries) > 0 and len(toc2.entries) > 0:
+ entry1 = toc1.entries[i1]
+ entry2 = toc2.entries[i2]
+ while True:
+ link_result = entry1.compare_page_nb(entry2)
+ try:
+ if link_result == 0:
+ xerox_metric.correct += 1
+ text_similarity = Levenshtein.ratio(entry1.text, entry2.text)
+ xerox_metric.text_sim += text_similarity
+ if text_similarity > STRING_THRESHOLD:
+ inex_metric.correct += 1
+ else:
+ if VERBOSE:
+ log.info(f"TITLE ERROR: {entry1} <--> {repr(entry2.text)}")
+ inex_metric.mismatch += 1
+ depth_result = entry1.compare_depth(entry2)
+ if depth_result == 0:
+ inex_metric.level_correct += 1
+ i1 += 1
+ i2 += 1
+ entry1 = toc1.entries[i1]
+ entry2 = toc2.entries[i2]
+ elif link_result < 0:
+ inex_metric.missed += 1
+ xerox_metric.missed += 1
+ if VERBOSE:
+ log.info(f"MISS: {entry1}")
+ i1 += 1
+ entry1 = toc1.entries[i1]
+ else:
+ inex_metric.added += 1
+ xerox_metric.added += 1
+ if VERBOSE:
+ log.info(f"ADDED: {entry2}")
+ i2 += 1
+ entry2 = toc2.entries[i2]
+ except IndexError:
+ break
+ # take into account remaining entries in gt
+ while i1 < len(toc1.entries):
+ if VERBOSE:
+ entry1 = toc1.entries[i1]
+ log.info(f"MISS: {entry1}")
+ i1 += 1
+ inex_metric.missed += 1
+ xerox_metric.missed += 1
+ # take into account remaining entries in submission
+ while i2 < len(toc2.entries):
+ if VERBOSE:
+ entry2 = toc2.entries[i2]
+ log.info(f"ADDED: {entry2}")
+ i2 += 1
+ inex_metric.added += 1
+ xerox_metric.added += 1
+
+
+def score(folder1, folder2):
+ def get_docnames(folder, ext):
+ out = []
+ for ele in ls(folder, ext):
+ out.append(basename(ele, ext))
+ return out
+
+ docnames1 = get_docnames(folder1, JSON_EXTENSION)
+ docnames2 = get_docnames(folder2, JSON_EXTENSION)
+ docnames = list(set(docnames1) & set(docnames2))
+ n_missing_docs = len([ele for ele in docnames1 if ele not in docnames2])
+ n_added_docs = len([ele for ele in docnames2 if ele not in docnames1])
+ writer = Writer()
+ doc_id = 0
+ # TOC generation metrics
+ inex = InexMetric()
+ xerox = XeroxMetric()
+ count = Stats()
+ # Title detection metrics
+ td_prec = dict(zip(docnames, [None] * len(docnames)))
+ td_reca = dict(zip(docnames, [None] * len(docnames)))
+ td_f1 = dict(zip(docnames, [None] * len(docnames)))
+ # loggers
+ toc_logger = get_logger("toc", "toc.log")
+ td_logger = get_logger("td", "td.log")
+ for json1 in ls(folder1, JSON_EXTENSION):
+ xerox.initialize_stats()
+ inex.initialize_stats()
+ toc1 = TOCJson(json1)
+ docname = basename(json1, JSON_EXTENSION)
+ if VERBOSE:
+ toc_logger.info(f"\n\nCOMPARING {docname}")
+ td_logger.info(f"\n\nCOMPARING {docname}")
+ json2 = os.path.join(folder2, docname + JSON_EXTENSION)
+ if not os.path.isfile(json2):
+ toc_logger.info(f"{docname} missing from submission")
+ td_logger.info(f"{docname} missing from submission")
+ else:
+ # Title detection
+ toc2 = TOCJson(json2)
+ td_prec[docname], td_reca[docname], td_f1[docname] = score_title_detection(toc1, toc2, td_logger)
+ writer.td_rows.append([doc_id, td_prec[docname], td_reca[docname], td_f1[docname], docname])
+ # TOC generation
+ update_icdar_stats(toc1, toc2, inex, xerox, toc_logger)
+ # compute stats
+ count.ok_per_doc[docname] = xerox.correct
+ count.pbttl_per_doc[docname] = xerox.correct - inex.correct
+ count.pblvl_per_doc[docname] = xerox.correct - inex.level_correct
+ count.err_per_doc[docname] = xerox.added
+ count.miss_per_doc[docname] = xerox.missed
+ # compute Xerox score
+ xerox.compute_prf()
+ xerox.p_per_doc[docname] = xerox.prec
+ xerox.r_per_doc[docname] = xerox.reca
+ xerox.f_per_doc[docname] = xerox.f_score
+ xerox.title_acc_per_doc[docname] = xerox.get_title_acc()
+ # compute Inex score
+ inex.compute_prf()
+ inex.p_per_doc[docname] = inex.prec
+ inex.r_per_doc[docname] = inex.reca
+ inex.f_per_doc[docname] = inex.f_score
+ inex.title_acc_per_doc[docname] = inex.get_title_acc(xerox.correct)
+ inex.level_acc_per_doc[docname] = inex.get_level_acc(xerox.correct)
+ # result row
+ writer.toc_rows.append(get_row_result(doc_id, docname, xerox, inex))
+ doc_id += 1
+ # get avg and std scores
+ writer.toc_rows.append(get_avg_row(xerox, inex, count))
+ writer.toc_rows.append(get_std_row(xerox, inex))
+ writer.td_rows.append(get_avg_row(td_prec, td_reca, td_f1))
+ writer.td_rows.append(get_std_row(td_prec, td_reca, td_f1))
+ # get stats about missing and added docs
+ writer.toc_rows.append(
+ [f"Done: {len(docnames)} comparisons for {len(docnames1)} in groundtruth and {len(docnames2)} in submission"])
+ if n_missing_docs:
+ writer.toc_rows.append([f"{n_missing_docs} docs missing from submission"])
+ if n_added_docs:
+ writer.toc_rows.append([f"{n_added_docs} additional docs in submission (ignored)"])
+ # dump
+ writer.dump_all()
+
+
+def get_row_result(doc_id, doc, xerox, inex):
+ out = ["%4s " % doc_id]
+ out.extend(xerox.format_res())
+ out.extend(inex.format_res())
+ out.append("%7s" % xerox.correct)
+ out.append("%7s" % (xerox.correct - inex.correct))
+ out.append("%7s" % (xerox.correct - inex.level_correct))
+ out.append("%7s" % xerox.added)
+ out.append("%7s" % xerox.missed)
+ out.append("%s" % doc)
+ return out
+
+
+"""
+https://medium.com/practo-engineering/function-overloading-in-python-94a8b10d1e08
+"""
+registry = {}
+
+
+class MultiMethod(object):
+ def __init__(self, name):
+ self.name = name
+ self.typemap = {}
+
+ def __call__(self, *args):
+ types = tuple(arg.__class__ for arg in args)
+ function = self.typemap.get(types)
+ if function is None:
+ raise TypeError("no match")
+ return function(*args)
+
+ def register(self, types, function):
+ self.typemap[types] = function
+
+
+def overload(*types):
+ def register(function):
+ name = function.__name__
+ mm = registry.get(name)
+ if mm is None:
+ mm = registry[name] = MultiMethod(name)
+ mm.register(types, function)
+ return mm
+
+ return register
+
+
+"""
+https://medium.com/practo-engineering/function-overloading-in-python-94a8b10d1e08
+"""
+
+
+@overload(XeroxMetric, InexMetric, Stats)
+def get_avg_row(xerox, inex, count):
+ out = []
+ out.append("%4s " % "AVG")
+ # xerox
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_p()))
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_r()))
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_f()))
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_avg_title_acc()))
+ # inex
+ out.append("%6s" % inex.format_float_percent(inex.compute_avg_p()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_avg_r()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_avg_f()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_avg_title_acc()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_avg_level_acc()))
+ # count stats
+ out.append("%7s" % (count.compute_sum_ok()))
+ out.append("%7s" % (count.compute_sum_pbttl()))
+ out.append("%7s" % (count.compute_sum_pblvl()))
+ out.append("%7s" % (count.compute_sum_err()))
+ out.append("%7s" % (count.compute_sum_miss()))
+ return out
+
+
+@overload(XeroxMetric, InexMetric)
+def get_std_row(xerox, inex):
+ out = ["%4s " % "sdev"]
+ # xerox
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_std_p()))
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_std_r()))
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_std_f()))
+ out.append("%6s" % xerox.format_float_percent(xerox.compute_std_title_acc()))
+ # inex
+ out.append("%6s" % inex.format_float_percent(inex.compute_std_p()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_std_r()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_std_f()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_std_title_acc()))
+ out.append("%6s" % inex.format_float_percent(inex.compute_std_level_acc()))
+ return out
+
+
+@overload(dict, dict, dict)
+def get_avg_row(td_prec, td_reca, td_f1):
+ return [
+ "AVG",
+ np.mean(list(td_prec.values())),
+ np.mean(list(td_reca.values())),
+ np.mean(list(td_f1.values()))
+ ]
+
+
+@overload(dict, dict, dict)
+def get_std_row(td_prec, td_reca, td_f1):
+ return [
+ "stdev",
+ np.std(list(td_prec.values())),
+ np.std(list(td_reca.values())),
+ np.std(list(td_f1.values()))
+ ]
+
+
+def get_logger(name, path_to_log, level=logging.INFO):
+ handler = logging.FileHandler(path_to_log, mode="w")
+ formatter = logging.Formatter("%(message)s")
+ handler.setFormatter(formatter)
+ logger = logging.getLogger(name)
+ logger.setLevel(level)
+ logger.addHandler(handler)
+ return logger
+
+
+def basename(path, ext):
+ return os.path.basename(path).split(ext)[0]
+
+
+def ls(folder, ext):
+ pattern = os.path.join(folder, "*" + ext)
+ return glob(pattern)
+
+
+if __name__ == "__main__":
+ PARSER = argparse.ArgumentParser(description="This is the scoring script used for FincTOC2021. It outputs two csv "
+ "reports, one for title detection, and another for toc generation. It also logs "
+ "information in two separate log files.")
+ PARSER.add_argument("--gt_folder", required=True, type=str,
+ help="path to folder containing groundtruth files (one groundtruth file in json format per document")
+ PARSER.add_argument("--submission_folder", required=True, type=str,
+ help="path to folder containing submission files (one submission file in json format per document")
+ ARGS = PARSER.parse_args()
+ score(ARGS.gt_folder, ARGS.submission_folder)
diff --git a/scripts/fintoc2022/toc_extractor.py b/scripts/fintoc2022/toc_extractor.py
new file mode 100755
index 00000000..6c7eb826
--- /dev/null
+++ b/scripts/fintoc2022/toc_extractor.py
@@ -0,0 +1,93 @@
+import json
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+import zipfile
+from typing import List
+
+import wget
+from PyPDF2 import PdfFileReader, PdfFileWriter
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from tqdm import tqdm
+
+from config import _config as config
+from doc_reader.line_type_classifier.feature_extractor.toc_feature_extractor import TOCFeatureExtractor
+from doc_reader.readers.scanned_reader.pdftxtlayer_reader.pdf_with_text_reader import PdfWithTextReader
+
+toc_item = re.compile(r'"([^"]+)" (\d+)')
+reader = PdfWithTextReader(config=config)
+
+
+def get_one_columns_lines(path: str) -> List[LineWithMeta]:
+ if path.startswith("file://"):
+ path = path[len("file://"):]
+ with tempfile.TemporaryDirectory() as tmpdir:
+ path_tmp = os.path.join(tmpdir, os.path.basename(path))
+ pdf_reader = PdfFileReader(path)
+ writer = PdfFileWriter()
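+        # keep only the first pages of the document, where the table of contents is expected to appear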
+ for page_id in range(0, min(9, pdf_reader.getNumPages())):
+ writer.addPage(pdf_reader.getPage(page_id))
+ with open(path_tmp, 'wb') as write_file:
+ writer.write(write_file)
+ return reader.read(path=path_tmp, document_type=None, parameters={"is_one_column_document": True,
+ "need_header_footer_analysis": "True"}).lines
+
+
+def get_automatic_toc(path: str) -> List[dict]:
+ result = []
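+    # pdftocio (from the pdf.tocgen package) prints the embedded PDF outline, if the document has one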
+ cmd = "pdftocio -p {}".format(path)
+ with os.popen(cmd) as out:
+ toc = out.readlines()
+ if toc:
+ for line in toc:
+ match = toc_item.match(line.strip())
+ if match:
+ result.append({"text": match.group(1), "page": match.group(2)})
+ return result
+
+
+def main(dir_out: str) -> None:
+ toc_extractor = TOCFeatureExtractor()
+ os.makedirs(dir_out, exist_ok=True)
+ data_url = "https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download"
+ print("use 'pip install -U pdf.tocgen' to install tool for automatic toc extraction")
+ subprocess.run("pip install -U pdf.tocgen".split(" "))
+
+ root = "/tmp/.fintoc/"
+ if os.path.isdir(root):
+ shutil.rmtree(root)
+ os.makedirs(root)
+ archive = os.path.join(root, "dataset.zip")
+ wget.download(data_url, archive)
+ with zipfile.ZipFile(archive, 'r') as zip_ref:
+ zip_ref.extractall(root)
+ data_dir = os.path.join(root, "data")
+
+ for lang in ("en", "fr", "sp"):
+ pdf_dir = os.path.join(data_dir, lang, "pdf")
+ lang_dir_out = os.path.join(dir_out, lang)
+ if os.path.isdir(lang_dir_out):
+ shutil.rmtree(lang_dir_out)
+ os.makedirs(lang_dir_out)
+
+ tocs = {}
+ for file in tqdm(os.listdir(pdf_dir)):
+ if not file.endswith(".pdf"):
+ continue
+ path = os.path.join(pdf_dir, file)
+ toc = get_automatic_toc(path)
+ if len(toc) == 0:
+ lines = get_one_columns_lines(path)
+ toc = toc_extractor.get_toc(lines)
+ doc_name = file[: -len(".pdf")]
+ tocs[doc_name] = toc
+ with open(os.path.join(lang_dir_out, f"{doc_name}_toc.json"), "w") as out:
+ json.dump(obj=toc, fp=out, indent=4, ensure_ascii=False)
+
+ with open(os.path.join(dir_out, f"{lang}_toc.json"), "w") as out:
+ json.dump(tocs, out)
+
+
+main(dir_out="/home/nasty/fintoc2022/toc")
diff --git a/scripts/fintoc2022/train_fintoc_classifier.py b/scripts/fintoc2022/train_fintoc_classifier.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/fintoc2022/utils.py b/scripts/fintoc2022/utils.py
new file mode 100755
index 00000000..9a4ab457
--- /dev/null
+++ b/scripts/fintoc2022/utils.py
@@ -0,0 +1,43 @@
+import os
+from collections import defaultdict
+from typing import List, Tuple
+
+import pandas as pd
+
+
+def create_json_result(data: pd.DataFrame, predictions: List[int]) -> dict:
+ """
+ Creates dictionary with TOCs for each document: {"doc_name": TOC}.
+ TOC is a following list of dictionaries:
+ [
+ {
+ "text": String, # text of the TOC item/entry
+ "id": Int # identifier of the item/entry corresponding to its order in the TOC
+ "depth": Int # hierarchical level of the item
+ "page": Int # the (physical) page number where the item appears
+ }
+ ]
+ """
+    uid2line = {row.uid: {"text": row.text.strip() if isinstance(row.text, str) else "",
+                          "page": row.page_id + 1,
+                          "group": row.group} for _, row in data.iterrows()}
+ result = defaultdict(list)
+    assert data.shape[0] == len(predictions)
+ for i, (line_uid, prediction) in enumerate(zip(data.uid, predictions)):
+ line = uid2line[line_uid]
+ if line["text"] == "":
+ continue
+ # TODO crop text lines containing colon
+ result[line["group"]].append({"id": i, "depth": str(prediction),
+ "text": line["text"], "page": line["page"]})
+ return result
+
+
+def get_values_from_csv(dir_path: str) -> Tuple[float, float]:
+ td_name = "td_report.csv"
+ toc_name = "toc_report.csv"
+ td_df = pd.read_csv(os.path.join(dir_path, td_name), delimiter="\t")
+ toc_df = pd.read_csv(os.path.join(dir_path, toc_name), delimiter="\t")
+ f1 = td_df[td_df["Doc"] == "AVG"]["F1"].item()
+ inex_f1 = toc_df[toc_df["Doc"] == " AVG "]["Inex08-F1"].item()
+ return f1, inex_f1
From be31fca00756ba6362c5469b2acdea54308741ea Mon Sep 17 00:00:00 2001
From: Nasty
Date: Tue, 23 Apr 2024 17:13:01 +0300
Subject: [PATCH 2/8] Big refactoring of fintoc scripts
---
dedoc/readers/pdf_reader/pdf_base_reader.py | 2 +-
.../fintoc_structure_extractor.py | 95 ++++++-
.../fintoc_feature_extractor.py | 136 ++-------
.../toc_feature_extractor.py | 6 +-
.../fintoc_classifier.py | 269 +++++-------------
requirements.txt | 1 +
scripts/benchmark_pdf_performance.py | 2 +-
scripts/fintoc2022/benchmark_fintoc.py | 0
scripts/fintoc2022/dataset_loader.py | 31 +-
scripts/fintoc2022/toc_extractor.py | 93 ------
scripts/fintoc2022/train_fintoc_classifier.py | 45 +++
scripts/fintoc2022/trainer.py | 173 +++++++++++
scripts/fintoc2022/utils.py | 15 +-
13 files changed, 433 insertions(+), 435 deletions(-)
delete mode 100644 scripts/fintoc2022/benchmark_fintoc.py
delete mode 100755 scripts/fintoc2022/toc_extractor.py
create mode 100644 scripts/fintoc2022/trainer.py
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
index 8372fb92..688d82c7 100644
--- a/dedoc/readers/pdf_reader/pdf_base_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -53,7 +53,7 @@ class PdfBaseReader(BaseReader):
def __init__(self, *, config: Optional[dict] = None) -> None:
super().__init__(config=config)
- self.config["n_jobs"] = config.get("n_jobs", 1)
+ self.config["n_jobs"] = self.config.get("n_jobs", 1)
self.table_recognizer = TableRecognizer(config=self.config)
self.metadata_extractor = LineMetadataExtractor(config=self.config)
self.attachment_extractor = PDFAttachmentsExtractor(config=self.config)
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
index 517fe13f..3d69ccad 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
@@ -1,7 +1,17 @@
-from typing import Optional
+import os
+import re
+import tempfile
+from typing import Dict, List, Optional, Tuple, Union
-from dedoc.data_structures import UnstructuredDocument
+import pandas as pd
+from PyPDF2 import PdfFileReader, PdfFileWriter
+
+from dedoc.data_structures import HierarchyLevel, LineWithMeta, UnstructuredDocument
+from dedoc.readers import PdfTxtlayerReader
from dedoc.structure_extractors import AbstractStructureExtractor
+from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
+from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
+from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier
class FintocStructureExtractor(AbstractStructureExtractor):
@@ -13,11 +23,90 @@ class FintocStructureExtractor(AbstractStructureExtractor):
"""
document_type = "fintoc"
- def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
+ def __init__(self, *, config: Optional[dict] = None) -> None:
+ super().__init__(config=config)
+ self.pdf_reader = PdfTxtlayerReader(config=self.config)
+ self.toc_extractor = TOCFeatureExtractor()
+ self.features_extractor = FintocFeatureExtractor()
+ self.languages = ("en", "fr", "sp")
+ self.classifiers = {language: FintocClassifier(language=language) for language in self.languages}
+ self.toc_item_regexp = re.compile(r'"([^"]+)" (\d+)')
+ self.empty_string_regexp = re.compile(r"^\s*\n$")
+
+ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None, file_path: Optional[str] = None) -> UnstructuredDocument:
"""
To get the information about the method's parameters look at the documentation of the class \
:class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
"""
+ parameters = {} if parameters is None else parameters
+ language = parameters.get("language", "en")
+ if language not in self.languages:
+ raise ValueError(f"Language {language} is not supported by this extractor. Supported languages: {self.languages}")
+
+ features, documents = self.get_features(documents_dict={file_path: document.lines})
+ predictions = self.classifiers[language].predict(features)
+ lines: List[LineWithMeta] = documents[0]
+ assert len(lines) == len(predictions)
+
+ for line, prediction in zip(lines, predictions):
+ if prediction > 0:
+ line.metadata.hierarchy_level = HierarchyLevel(level_1=1, level_2=prediction, line_type=HierarchyLevel.header, can_be_multiline=True)
+ else:
+ line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
+ document.lines = lines
return document
+
+ def get_features(self, documents_dict: Dict[str, List[LineWithMeta]]) -> Tuple[pd.DataFrame, List[List[LineWithMeta]]]:
+ toc_lines, documents = [], []
+ for file_path, document_lines in documents_dict.items():
+ toc_lines.append(self.__get_toc(file_path=file_path))
+ documents.append(self.__filter_lines(document_lines))
+ features = self.features_extractor.transform(documents=documents, toc_lines=toc_lines)
+ return features, documents
+
+ def __filter_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
+ special_unicode_symbols = [u"\uf0b7", u"\uf0d8", u"\uf084", u"\uf0a7", u"\uf0f0", u"\x83"]
+
+ lines = [line for line in lines if not self.empty_string_regexp.match(line.line)]
+ for line in lines:
+ for ch in special_unicode_symbols:
+ line.set_line(line.line.replace(ch, ""))
+
+ return lines
+
+ def __get_toc(self, file_path: Optional[str]) -> Optional[List[Dict[str, Union[LineWithMeta, str]]]]:
+ if file_path is None:
+ return
+
+ toc = self.__get_automatic_toc(path=file_path)
+ if len(toc) > 0:
+ return toc
+
+ pdf_reader = PdfFileReader(file_path)
+ writer = PdfFileWriter()
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ tmp_path = os.path.join(tmpdir, os.path.basename(file_path))
+ for page_id in range(0, min(9, pdf_reader.getNumPages())):
+ writer.addPage(pdf_reader.getPage(page_id))
+ with open(tmp_path, "wb") as write_file:
+ writer.write(write_file)
+ lines = self.pdf_reader.read(file_path=tmp_path, parameters={"is_one_column_document": "True", "need_header_footer_analysis": "True"}).lines
+
+ return self.toc_extractor.get_toc(lines)
+
+ def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]:
+ result = []
+ with os.popen(f"pdftocio -p {path}") as out:
+ toc = out.readlines()
+ if len(toc) == 0:
+ return result
+
+ for line in toc:
+ match = self.toc_item_regexp.match(line.strip())
+ if match:
+ result.append({"line": LineWithMeta(match.group(1)), "page": match.group(2)})
+
+ return result
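The automatic TOC extraction above parses the textual output of the pdftocio tool with the regexp compiled in __init__. A self-contained sketch of that parsing step (the sample output lines are hypothetical):

import re

toc_item_regexp = re.compile(r'"([^"]+)" (\d+)')
sample_output = ['"1. Introduction" 3\n', '"2. Risk factors" 12\n', 'unparseable line\n']

for line in sample_output:
    match = toc_item_regexp.match(line.strip())
    if match:
        print({"text": match.group(1), "page": match.group(2)})
# {'text': '1. Introduction', 'page': '3'}
# {'text': '2. Risk factors', 'page': '12'}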
diff --git a/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
index b5719f44..79ca919e 100644
--- a/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/fintoc_feature_extractor.py
@@ -1,16 +1,9 @@
-import gzip
-import json
-import os
-import pickle
import re
-import zipfile
from collections import defaultdict
from typing import Dict, Iterator, List, Optional, Tuple
import pandas as pd
-import wget
from Levenshtein._levenshtein import ratio
-from tqdm import tqdm
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
@@ -27,12 +20,11 @@
from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
from dedoc.structure_extractors.feature_extractors.utils_feature_extractor import normalization_by_min_max
from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_year
-from dedoc.utils.utils import flatten
class FintocFeatureExtractor(AbstractFeatureExtractor):
- def __init__(self, tocs: dict):
+ def __init__(self) -> None:
self.paired_feature_extractor = PairedFeatureExtractor()
self.prefix_list = [BulletPrefix, AnyLetterPrefix, LetterPrefix, BracketPrefix, BracketRomanPrefix, DottedPrefix, RomanPrefix]
self.list_feature_extractors = [
@@ -42,7 +34,6 @@ def __init__(self, tocs: dict):
]
self.prefix2number = {prefix.name: i for i, prefix in enumerate(self.prefix_list, start=1)}
self.prefix2number[EmptyPrefix.name] = 0
- self.tocs = tocs
def parameters(self) -> dict:
return {}
@@ -50,17 +41,15 @@ def parameters(self) -> dict:
def fit(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> "AbstractFeatureExtractor":
return self
- def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> pd.DataFrame:
+ def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None, toc_lines: Optional[List[List[dict]]] = None) -> pd.DataFrame:
assert len(documents) > 0
- result_matrix = pd.concat([self.__process_document(document) for document in tqdm(documents)], ignore_index=True)
+        toc_lines = [None] * len(documents) if toc_lines is None else toc_lines  # keep the default usable: features can be built without TOC info
+        result_matrix = pd.concat([self.__process_document(document, d_toc_lines) for document, d_toc_lines in zip(documents, toc_lines)], ignore_index=True)
result_matrix = pd.concat([result_matrix, self.paired_feature_extractor.transform(documents)], axis=1)
features = sorted(result_matrix.columns)
result_matrix = result_matrix[features].astype(float)
- result_matrix["text"] = [line.line for line in flatten(documents)]
- features.append("text")
return result_matrix[features]
- def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame:
+ def __process_document(self, lines: List[LineWithMeta], toc: Optional[list] = None) -> pd.DataFrame:
features_df = pd.DataFrame(self.__look_at_prev_line(document=lines, n=1))
features_df["line_relative_length"] = self.__get_line_relative_length(lines)
@@ -71,8 +60,8 @@ def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame:
total_lines = len(lines)
one_line_features_dict = defaultdict(list)
- for line_id, line in enumerate(lines):
- for item in self.__one_line_features(line, total_lines, start_page=start_page, finish_page=finish_page):
+ for line in lines:
+ for item in self.__one_line_features(line, total_lines, start_page=start_page, finish_page=finish_page, toc=toc):
feature_name, feature = item[0], item[1]
one_line_features_dict[feature_name].append(feature)
@@ -81,6 +70,7 @@ def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame:
one_line_features_df = self.prev_next_line_features(one_line_features_df, 3, 3)
result_matrix = pd.concat([one_line_features_df, features_df, list_features], axis=1)
+ result_matrix["page_id"] = [line.metadata.page_id for line in lines]
return result_matrix
def __look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[str, List]:
@@ -92,10 +82,10 @@ def __look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[
:return: dict of features
"""
res = defaultdict(list)
- for line_id, line in enumerate(document):
+ for line_id, _ in enumerate(document):
if line_id >= n:
prev_line = document[line_id - n]
- is_prev_line_ends = prev_line.line.endswith(('.', ';'))
+ is_prev_line_ends = prev_line.line.endswith((".", ";"))
res["prev_line_ends"].append(1 if is_prev_line_ends else 0)
res["prev_ends_with_colon"].append(prev_line.line.endswith(":"))
res["prev_is_space"].append(prev_line.line.lower().isspace())
@@ -110,7 +100,7 @@ def __get_line_relative_length(self, lines: List[LineWithMeta]) -> List[float]:
relative_lengths = [len(line.line) / max_len for line in lines]
return relative_lengths
- def __one_line_features(self, line: LineWithMeta, total_lines: int, start_page: int, finish_page: int) -> Iterator[Tuple[str, int]]:
+ def __one_line_features(self, line: LineWithMeta, total_lines: int, start_page: int, finish_page: int, toc: Optional[list]) -> Iterator[tuple]:
yield "normalized_page_id", normalization_by_min_max(line.metadata.page_id, min_v=start_page, max_v=finish_page)
yield "indentation", self._get_indentation(line)
yield "spacing", self._get_spacing(line)
@@ -125,19 +115,19 @@ def __one_line_features(self, line: LineWithMeta, total_lines: int, start_page:
yield "endswith_semicolon", line.line.endswith(";")
yield "endswith_colon", line.line.endswith(":")
yield "endswith_comma", line.line.endswith(",")
- yield "startswith_bracket", line.line.strip().startswith(('(', '{'))
+ yield "startswith_bracket", line.line.strip().startswith(("(", "{"))
bracket_cnt = 0
for char in line.line:
- if char == '(':
+ if char == "(":
bracket_cnt += 1
- elif char == ')':
+ elif char == ")":
bracket_cnt = max(0, bracket_cnt - 1)
yield "bracket_num", bracket_cnt
probable_toc_title = re.sub(r"[\s:]", "", line.line).lower()
yield "is_toc_title", probable_toc_title in TOCFeatureExtractor.titles
- yield from self.__find_in_toc(line)
+ yield from self.__find_in_toc(line, toc)
line_length = len(line.line) + 1
yield "supper_percent", sum((1 for letter in line.line if letter.isupper())) / line_length
@@ -145,101 +135,25 @@ def __one_line_features(self, line: LineWithMeta, total_lines: int, start_page:
yield "number_percent", sum((1 for letter in line.line if letter.isnumeric())) / line_length
yield "words_number", len(line.line.split())
- def __find_in_toc(self, line: LineWithMeta) -> Iterator[Tuple[str, int]]:
- if not hasattr(line, "group"):
+ def __find_in_toc(self, line: LineWithMeta, toc: Optional[List[dict]]) -> Iterator[Tuple[str, int]]:
+ if toc is None:
yield "is_toc", 0
yield "in_toc", 0
yield "toc_exists", 0
else:
- toc = self.tocs.get(line.group, [])
- is_toc = 0
- in_toc = 0
- toc_exists = int(len(toc) > 0)
+ is_toc, in_toc, toc_exists = 0, 0, int(len(toc) > 0)
line_text = line.line.lower().strip()
for item in toc:
- if ratio(line_text, item["text"].lower()) > 0.8:
+ if ratio(line_text, item["line"].line.lower()) < 0.8:
+ continue
+ # toc entry found
+ try:
is_toc = 0 if line.metadata.page_id + 1 == int(item["page"]) else 1
in_toc = 1 if line.metadata.page_id + 1 == int(item["page"]) else 0
- break
+            except (TypeError, ValueError):  # item["page"] may be missing or a non-numeric string
+ pass
+ break
+
yield "is_toc", is_toc
yield "in_toc", in_toc
yield "toc_exists", toc_exists
-
-
-def handle_file(file: str, dir_out: str, extractor: AbstractFeatureExtractor):
- file_name = os.path.split(file)[-1].split(".")[0]
- with gzip.open(file) as f_in:
- lines = pickle.load(file=f_in)
- df = lines2dataframe(lines, extractor)
- df.to_csv(os.path.join(dir_out, file_name + "_df.csv.gz"), index=False)
- df.to_pickle(os.path.join(dir_out, file_name + "_df.pkl.gz"))
-
-
-def lines2dataframe(lines: List[LineWithLabel], extractor: AbstractFeatureExtractor) -> pd.DataFrame:
- assert(len(lines) > 0)
- lines2docs = []
- current_document = None
- reg_empty_string = re.compile(r"^\s*\n$")
- special_unicode_symbols = [u"\uf0b7", u"\uf0d8", u"\uf084", u"\uf0a7", u"\uf0f0", u"\x83"]
-
- lines = [line for line in lines if not reg_empty_string.match(line.line)]
- for line in lines:
- for ch in special_unicode_symbols:
- line.set_line(line.line.replace(ch, ""))
- if line.group == current_document:
- lines2docs[-1].append(line)
- else:
- current_document = line.group
- lines2docs.append([line])
- df = extractor.transform(lines2docs)
-
- df["label"] = [int(line.label) for line in lines]
- df["group"] = [line.group for line in lines]
- df["uid"] = [line.uid for line in lines]
- df["page_id"] = [line.metadata.page_id for line in lines]
- return df
-
-
-def main(dir_out: str, train: bool):
- os.makedirs(dir_out, exist_ok=True)
-
- root = "/tmp/.fintoc/train" if train else "/tmp/.fintoc/test"
- lines_dir = os.path.join(root, "lines")
- if train:
- lines_url = "https://at.ispras.ru/owncloud/index.php/s/yvYn491d6Du8ZuV/download" # train
- else:
- lines_url = "https://at.ispras.ru/owncloud/index.php/s/h3TdYfQipiVAxpE/download" # test
-
- toc_dir = os.path.join(root, "toc")
- if train:
- toc_url = "https://at.ispras.ru/owncloud/index.php/s/0VJbQWrD11R98Sy/download" # train
- else:
- toc_url = "https://at.ispras.ru/owncloud/index.php/s/GCoZitUsfCLPLVI/download" # test
-
- if not os.path.isdir(root):
- os.makedirs(root)
-
- if not os.path.isdir(lines_dir):
- archive = os.path.join(root, "lines.zip")
- wget.download(lines_url, archive)
- with zipfile.ZipFile(archive, 'r') as zip_ref:
- zip_ref.extractall(root)
-
- if not os.path.isdir(toc_dir):
- archive = os.path.join(root, "toc.zip")
- wget.download(toc_url, archive)
- with zipfile.ZipFile(archive, 'r') as zip_ref:
- zip_ref.extractall(root)
-
- for lang in tqdm(["en", "fr", "sp"]):
- lines_file = os.path.join(lines_dir, f"lines_{lang}_txt_layer.pkg.gz")
- tocs_file = os.path.join(toc_dir, f"{lang}_toc.json")
- with open(tocs_file) as f:
- tocs = json.load(f)
- extractor = FintocFeatureExtractor(tocs)
- handle_file(file=lines_file, extractor=extractor, dir_out=dir_out)
-
-
-if __name__ == '__main__':
- stage = "test"
- main(dir_out=f"/home/nasty/fintoc2022/{stage}/pandas", train=stage == "train")
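The is_toc/in_toc features above rely on fuzzy matching: a line is considered to correspond to a TOC entry when the Levenshtein ratio between them exceeds 0.8. A tiny illustration with invented strings:

from Levenshtein import ratio  # the same ratio imported from Levenshtein._levenshtein above

print(ratio("1. introduction", "1. introduction"))  # 1.0: identical, counts as a TOC hit
print(ratio("1. introduction", "1 introduction"))   # close to 1.0: still above the 0.8 threshold
print(ratio("1. introduction", "risk factors"))     # far below 0.8: no match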
diff --git a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py
index 28fab042..a0000e0a 100644
--- a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py
@@ -1,5 +1,5 @@
import re
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from Levenshtein._levenshtein import ratio
@@ -17,11 +17,11 @@ class TOCFeatureExtractor:
"indice", "índice", "contenidos", "tabladecontenido" # spanish
)
- def get_toc(self, document: List[LineWithMeta]) -> List[dict]:
+ def get_toc(self, document: List[LineWithMeta]) -> List[Dict[str, Union[LineWithMeta, str]]]:
"""
Finds the table of contents in the given document
Returns:
- list of dictionaries with toc item and page number where it is located: {"line", "page"}
+ list of dictionaries with toc item (LineWithMeta) and page number where it is located: {"line", "page"}
"""
corrected_lines, marks = self.__get_probable_toc(document)
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
index 1e0b1541..0c761ee0 100755
--- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -1,213 +1,80 @@
import gzip
-import json
+import logging
import os
import pickle
-import shutil
-from statistics import mean
-from typing import List, Optional, Union
+from typing import List, Optional
+import numpy as np
import pandas as pd
import xgbfir
-import xgboost as xgb
-from sklearn.model_selection import GroupKFold
-from tqdm import tqdm
-
-from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
-from train_dataset.data_structures.line_with_label import LineWithLabel
+from xgboost import XGBClassifier
class FintocClassifier:
-
- def __init__(self,
- tocs_path: str,
- save_path: str,
- binary_classifier_params: Optional[dict] = None,
- target_classifier_params: Optional[dict] = None,
- load_trained: bool = False,
- lang: str = "en"):
- self.save_path = save_path
- self.lang = lang
- if load_trained:
- with gzip.open(os.path.join(self.save_path, f"binary_classifier_{self.lang}.pkg.gz"), "rb") as input_file:
- self.binary_classifier = pickle.load(file=input_file)
- with gzip.open(os.path.join(self.save_path, f"target_classifier_{self.lang}.pkg.gz"), "rb") as input_file:
- self.target_classifier = pickle.load(file=input_file)
- else:
- assert(binary_classifier_params is not None and target_classifier_params is not None)
- self.binary_classifier = xgb.XGBClassifier(**binary_classifier_params)
- self.target_classifier = xgb.XGBClassifier(**target_classifier_params)
- with open(tocs_path) as f:
- tocs = json.load(f)
- self.features_extractor = FintocFeatureExtractor(tocs)
-
- def fit(self, data: Union[pd.DataFrame, List[LineWithLabel]],
- cross_val: bool = True,
- save: bool = False,
- gt_dir: Optional[str] = None,
- n_splits: int = 3) -> None:
- if isinstance(data, pd.DataFrame):
- features_df = data
- else:
- features_df = lines2dataframe(data, self.features_extractor)
- print("Features shape: {}".format(features_df.shape))
- results = None
-
- if cross_val:
- assert(gt_dir is not None)
- results = self.evaluate_fintoc_metric(features_df=features_df, gt_dir=gt_dir, n_splits=n_splits)
-
- if not save:
- return
-
- features_names = self.__get_features_names(features_df)
- self.binary_classifier.fit(features_df[features_names], features_df.label != -1)
- self.target_classifier.fit(features_df[features_names][features_df.label != -1],
- features_df.label[features_df.label != -1])
- self._save(features_names, results)
-
- def _save(self, features_names: list, scores: Optional[dict]) -> None:
- os.makedirs(self.save_path, exist_ok=True)
- if scores is not None:
- with open(os.path.join(self.save_path, f"scores_{self.lang}.json"), "w") as f:
- json.dump(scores, f)
- print("Scores were saved in {}".format(os.path.join(self.save_path, f"scores_{self.lang}.json")))
-
- with gzip.open(os.path.join(self.save_path, F"binary_classifier_{self.lang}.pkg.gz"), "wb") as output_file:
- pickle.dump(self.binary_classifier, output_file)
- with gzip.open(os.path.join(self.save_path, f"target_classifier_{self.lang}.pkg.gz"), "wb") as output_file:
- pickle.dump(self.target_classifier, output_file)
- print("Classifiers were saved in {} directory".format(self.save_path))
-
- xgbfir.saveXgbFI(self.binary_classifier, feature_names=features_names,
- OutputXlsxFile=os.path.join(self.save_path, f"feature_importances_binary_{self.lang}.xlsx"))
- xgbfir.saveXgbFI(self.target_classifier, feature_names=features_names,
- OutputXlsxFile=os.path.join(self.save_path, f"feature_importances_target_{self.lang}.xlsx"))
- print("Features importances were saved in {} directory".format(self.save_path))
-
- def predict(self, data: Union[pd.DataFrame, List[LineWithLabel]]) -> dict:
+ """
+    Classifier of lines in financial documents for the FinTOC 2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/).
+    Lines are classified in two stages:
+    1. Binary classification: title/not title (title detection task).
+    2. Classification of title lines into title depth classes 1-6 (TOC generation task).
+
+    More important lines have a smaller depth.
+    As a result:
+    1. For non-title lines, the classifier returns -1.
+    2. For title lines, the classifier returns their depth (from 1 to 6).
+ """
+
+ def __init__(self, language: str, weights_dir_path: Optional[str] = None) -> None:
"""
- param lines: list of documents lines, label isn't known or dataframe with lines features
- :return: dict with TOC of the documents in the required format
+ :param language: language of data ("en", "fr", "sp")
+ :param weights_dir_path: path to directory with trained models weights
"""
- if isinstance(data, pd.DataFrame):
- features_df = data
- else:
- features_df = lines2dataframe(data, self.features_extractor)
- features_names = self.__get_features_names(features_df)
- binary_predictions = self.binary_classifier.predict(features_df[features_names])
- features_df["label"] = binary_predictions
- target_predictions = self.target_classifier.predict(features_df[features_names][features_df.label])
- result_dict = create_json_result(features_df[features_df.label], target_predictions)
- return result_dict
-
- def evaluate_fintoc_metric(self,
- features_df: pd.DataFrame,
- gt_dir: str,
- n_splits: int = 3) -> dict:
-
- features_names = self.__get_features_names(features_df)
- results_path = os.path.join(self.save_path, "results")
- os.makedirs(results_path, exist_ok=True)
-
- kf = GroupKFold(n_splits=n_splits)
+ self.weights_dir_path = weights_dir_path
+ self.language = language
+ self.classifiers = {"binary": None, "target": None}
- result_scores = {"td_scores": [], "toc_scores": []}
- for i, (train_index, val_index) in tqdm(enumerate(kf.split(features_df, groups=features_df.group)),
- total=n_splits):
- df_train = features_df.loc[train_index]
- df_val = features_df.loc[val_index]
- self.binary_classifier.fit(df_train[features_names], df_train.label != -1)
- self.target_classifier.fit(
- df_train[features_names][df_train.label != -1], df_train.label[df_train.label != -1])
- result_dict = self.predict(df_val)
-
- tmpdir = "/tmp/fintoc/eval"
- if os.path.isdir(tmpdir):
- shutil.rmtree(tmpdir)
- os.makedirs(tmpdir)
- tmp_gt_dir, predictions_dir = os.path.join(tmpdir, "groundtruth"), os.path.join(tmpdir, "predictions")
- os.makedirs(tmp_gt_dir)
- os.makedirs(predictions_dir)
-
- for doc_name, result in result_dict.items():
- gt_doc_name = doc_name + ".pdf.fintoc4.json"
- if gt_doc_name not in os.listdir(gt_dir):
- print(f"\n{gt_doc_name} not found in groundtruth")
- continue
- with open(os.path.join(predictions_dir, gt_doc_name), "w") as json_file:
- json.dump(result, json_file, indent=2)
- shutil.copy(os.path.join(gt_dir, gt_doc_name), os.path.join(tmp_gt_dir, gt_doc_name))
- score(tmp_gt_dir, predictions_dir)
- shutil.rmtree(tmpdir)
-
- path_scores = os.path.join(results_path, str(i))
- os.makedirs(path_scores, exist_ok=True)
- for file_name in ['td.log', 'toc.log', 'td_report.csv', 'toc_report.csv']:
- shutil.move(file_name, os.path.join(path_scores, file_name))
- f1, inex_f1 = get_values_from_csv(path_scores)
- result_scores["td_scores"].append(f1)
- result_scores["toc_scores"].append(inex_f1)
- print(f"it {i}:\ntd {result_scores['td_scores'][-1]}\ntoc {result_scores['toc_scores'][-1]}")
- result_scores["td_mean"] = mean(result_scores["td_scores"])
- result_scores["toc_mean"] = mean(result_scores["toc_scores"])
- return result_scores
-
- def __get_features_names(self, features_df: pd.DataFrame) -> list:
- features_names = [col for col in features_df.columns if col not in ("text", "label", "group", "uid")]
- return features_names
-
-
-def train_classifier(train_dir: str) -> None:
- clf_params = {
- "en_binary": dict(random_state=42, learning_rate=0.25, max_depth=5, n_estimators=400,
- colsample_bynode=0.8, colsample_bytree=0.5, tree_method="hist"),
- "fr_binary": dict(random_state=42, learning_rate=0.1, max_depth=5, n_estimators=800,
- colsample_bynode=0.5, colsample_bytree=0.8, tree_method="approx"),
- "sp_binary": dict(random_state=42, learning_rate=0.25, max_depth=4, n_estimators=600,
- colsample_bynode=0.5, colsample_bytree=0.5, tree_method="approx"),
- "en_target": dict(random_state=42, learning_rate=0.07, max_depth=4, n_estimators=800,
- colsample_bynode=1, colsample_bytree=1, tree_method="hist"),
- "fr_target": dict(random_state=42, learning_rate=0.4, max_depth=5, n_estimators=800,
- colsample_bynode=1, colsample_bytree=0.5, tree_method="exact"),
- "sp_target": dict(random_state=42, learning_rate=0.25, max_depth=3, n_estimators=600,
- colsample_bynode=0.5, colsample_bytree=1, tree_method="hist")
- }
- for lang in ("en", "fr", "sp"):
- pandas_path = os.path.join(train_dir, "pandas", f"lines_{lang}_txt_layer_df.csv.gz")
- cls = FintocClassifier(binary_classifier_params=clf_params[f"{lang}_binary"],
- target_classifier_params=clf_params[f"{lang}_target"],
- tocs_path=os.path.join(train_dir, "toc", f"{lang}_toc.json"),
- save_path="resources",
- load_trained=False,
- lang=lang)
- features_df = pd.read_csv(pandas_path, index_col=False)
- cls.fit(data=features_df,
- cross_val=True,
- save=True,
- gt_dir=os.path.join(train_dir, "data", lang, "annots"))
-
-
-def get_results(test_dir: str) -> None:
- for lang in ("en", "fr", "sp"):
- pandas_path = os.path.join(test_dir, "pandas", f"lines_{lang}_txt_layer_df.csv.gz")
- cls = FintocClassifier(tocs_path=os.path.join(test_dir, "toc", f"{lang}_toc.json"),
- save_path="resources",
- load_trained=True,
- lang=lang)
- features_df = pd.read_csv(pandas_path, index_col=False)
- result_dict = cls.predict(features_df)
- results_dir = os.path.join(test_dir, "results", lang)
- os.makedirs(results_dir, exist_ok=True)
- for doc_name, result in result_dict.items():
- json_doc_name = doc_name + ".pdf.fintoc4.json"
- with open(os.path.join(results_dir, json_doc_name), "w") as json_file:
- json.dump(result, json_file, indent=2)
-
-
-if __name__ == "__main__":
- train = False
- fintoc_dir = "/home/nasty/fintoc2022"
- if train:
- train_classifier(os.path.join(fintoc_dir, "train"))
- else:
- get_results(os.path.join(fintoc_dir, "test"))
+ def predict(self, features: pd.DataFrame) -> List[int]:
+ """
+ Two-staged classification: title/not title and depth classification for titles.
+        For non-title lines, the classifier returns -1; for title lines, it returns their depth (from 1 to 6).
+ """
+ binary_predictions = self.binary_classifier.predict(features)
+        # binary_predictions is a boolean array like [True, False, ...]; target depths are predicted only for the lines marked True
+ target_predictions = self.target_classifier.predict(features[binary_predictions])
+ result = np.ones_like(binary_predictions) * -1
+ result[binary_predictions] = target_predictions
+        # the returned list looks like [1, 2, 3, -1, -1, ...], where positive values are header depths and -1 marks non-header lines
+ return list(result)
+
+    def fit(self, binary_classifier_parameters: dict, target_classifier_parameters: dict, features: pd.DataFrame, features_names: List[str]) -> None:
+ self.classifiers["binary"] = XGBClassifier(**binary_classifier_parameters)
+ self.classifiers["target"] = XGBClassifier(**target_classifier_parameters)
+ self.binary_classifier.fit(features[features_names], features.label != -1)
+ self.target_classifier.fit(features[features_names][features.label != -1], features.label[features.label != -1])
+
+ def save(self, classifiers_dir_path: str, features_importances_dir_path: str, logger: logging.Logger, features_names: List[str]) -> None:
+ os.makedirs(classifiers_dir_path, exist_ok=True)
+ for classifier_type in ("binary", "target"):
+ with gzip.open(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}.pkg.gz"), "wb") as output_file:
+ pickle.dump(self.classifiers[classifier_type], output_file)
+ logger.info(f"Classifiers were saved in {classifiers_dir_path} directory")
+
+ os.makedirs(features_importances_dir_path, exist_ok=True)
+ for classifier_type in ("binary", "target"):
+ xgbfir.saveXgbFI(self.classifiers[classifier_type], feature_names=features_names,
+ OutputXlsxFile=os.path.join(features_importances_dir_path, f"feature_importances_{classifier_type}_{self.language}.xlsx"))
+ logger.info(f"Features importances were saved in {features_importances_dir_path} directory")
+
+ @property
+ def binary_classifier(self) -> XGBClassifier:
+ return self.__lazy_load_weights("binary")
+
+ @property
+ def target_classifier(self) -> XGBClassifier:
+ return self.__lazy_load_weights("target")
+
+ def __lazy_load_weights(self, classifier_type: str) -> XGBClassifier:
+ if self.classifiers[classifier_type] is None:
+ with gzip.open(os.path.join(self.weights_dir_path, f"{classifier_type}_classifier_{self.language}.pkg.gz"), "rb") as input_file:
+ self.classifiers[classifier_type] = pickle.load(file=input_file)
+
+ return self.classifiers[classifier_type]
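A hypothetical usage sketch of the refactored classifier (the paths and the feature matrix are assumptions, not part of the patch); the weights are loaded lazily on the first access to binary_classifier or target_classifier:

import pandas as pd

from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier

clf = FintocClassifier(language="en", weights_dir_path="/path/to/classifiers")
features = pd.read_csv("/path/to/features_en.csv")  # feature matrix built by FintocFeatureExtractor
depths = clf.predict(features)  # e.g. [1, -1, -1, 2, ...]: -1 marks non-title lines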
diff --git a/requirements.txt b/requirements.txt
index 30469034..10af796e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ olefile~=0.46
opencv-python>=4.5.5.64,<4.6.0
orjson>=3.8.11,<=3.9.5
pandas>=1.4.1,<=1.9.0
+pdf.tocgen>=1.3.0,<=1.3.4
pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
pdfminer.six==20211012
piexif==1.1.3
diff --git a/scripts/benchmark_pdf_performance.py b/scripts/benchmark_pdf_performance.py
index c3fa48af..c5701034 100644
--- a/scripts/benchmark_pdf_performance.py
+++ b/scripts/benchmark_pdf_performance.py
@@ -95,7 +95,7 @@ def main() -> None:
assert args.loops > 0, "The number of repetitions of testing one file must be positive"
- print(f'Run pdf performance benchmark with next pdf options: {", ".join(args.pdf_options)}')
+ print(f'Run pdf performance benchmark with next pdf options: {", ".join(args.pdf_options)}') # noqa
configs = [{}]
if args.parameters:
diff --git a/scripts/fintoc2022/benchmark_fintoc.py b/scripts/fintoc2022/benchmark_fintoc.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/scripts/fintoc2022/dataset_loader.py b/scripts/fintoc2022/dataset_loader.py
index 1a271b41..f2beb730 100755
--- a/scripts/fintoc2022/dataset_loader.py
+++ b/scripts/fintoc2022/dataset_loader.py
@@ -7,14 +7,13 @@
import tempfile
import zipfile
from collections import Counter, defaultdict
-from typing import List
+from typing import Dict, List
import wget
from Levenshtein._levenshtein import ratio
from dedoc.config import get_config
from dedoc.readers import PdfTabbyReader, PdfTxtlayerReader
-from dedoc.utils.utils import flatten
from train_dataset.data_structures.line_with_label import LineWithLabel
@@ -22,11 +21,9 @@ class FintocLineWithLabelExtractor:
"""
Create LineWithLabel from documents and their annotations
"""
- def __init__(self):
- self.readers = {
- "tabby": PdfTabbyReader(),
- "txt_layer": PdfTxtlayerReader()
- }
+
+ def __init__(self) -> None:
+ self.readers = {"tabby": PdfTabbyReader(), "txt_layer": PdfTxtlayerReader()}
def get_lines(self, file_name: str, file_path: str, annotation_path: str, reader_name: str) -> List[LineWithLabel]:
"""
@@ -51,9 +48,9 @@ def get_lines(self, file_name: str, file_path: str, annotation_path: str, reader
annotations_page = [(ratio(line.line, annotation["text"]), annotation) for annotation in annotations[line.metadata.page_id]]
best_annotation = max(annotations_page, key=lambda t: t[0], default=(0, {}))
depth = best_annotation[1]["depth"] if len(annotations_page) > 0 and best_annotation[0] > 0.8 else "-1"
- result.append(LineWithLabel(line=line.line, metadata=line.metadata, annotations=line.annotations, label=depth, group=file_name, uid=line.uid))
+ result.append(LineWithLabel(line=line.line, metadata=line.metadata, annotations=line.annotations, label=str(depth), group=file_name, uid=line.uid))
- return sorted(result, key=lambda l: (l.metadata.page_id, l.metadata.line_id))
+ return sorted(result, key=lambda x: (x.metadata.page_id, x.metadata.line_id))
class FintocDatasetLoader:
@@ -72,14 +69,14 @@ def __init__(self, dataset_dir: str, logger: logging.Logger) -> None:
self.data_url = "https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download"
self.line_extractor = FintocLineWithLabelExtractor()
- def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> List[List[LineWithLabel]]:
+ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> Dict[str, List[LineWithLabel]]:
"""
Download data from a cloud at `self.data_url` and sort document lines.
:param language: ("en", "fr", "sp") - language group
:param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF
:param use_cache: whether to use cached data (if dataset is already downloaded) or download it anyway
- :return: list of documents, which are lists of lines with labels of the training dataset
+ :return: dict of documents {document path: document}, where document is a list of lines with labels of the training dataset
"""
archive_path = os.path.join(self.dataset_dir, "dataset.zip")
if not os.path.isfile(archive_path):
@@ -102,11 +99,13 @@ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> L
pdf_dir = os.path.join(data_dir, "pdf")
annotations_dir = os.path.join(data_dir, "annots")
pdf_files = {pdf_file[:-len(".pdf")]: os.path.join(pdf_dir, pdf_file) for pdf_file in os.listdir(pdf_dir) if pdf_file.endswith(".pdf")}
- annotations_files = {ann_file[:-len(".pdf.fintoc4.json")]: os.path.join(annotations_dir, ann_file)
- for ann_file in os.listdir(annotations_dir) if ann_file.endswith(".json")}
+ annotations_files = {
+ ann_file[:-len(".pdf.fintoc4.json")]: os.path.join(annotations_dir, ann_file)
+ for ann_file in os.listdir(annotations_dir) if ann_file.endswith(".json")
+ }
assert set(pdf_files) == set(annotations_files)
- result = []
+ result = {}
with tempfile.TemporaryDirectory() as tmp_dir:
for file_name in pdf_files:
pdf_tmp_path = os.path.join(tmp_dir, file_name) + ".pdf"
@@ -118,11 +117,11 @@ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> L
annotation_path=annotations_files[file_name],
reader_name=reader_name
)
- result.append(document)
+ result[pdf_files[file_name]] = document
except Exception as e:
self.logger.warning(f"Failed to read {file_name} by {reader_name}, error: {e}")
with gzip.open(pkl_path, "wb") as out:
pickle.dump(obj=result, file=out)
- self.logger.info(Counter([line.label for line in flatten(result)]))
+ self.logger.info(Counter([line.label for document in result.values() for line in document]))
return result
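A short usage sketch of the loader after this change (the dataset directory is an assumption); get_data now returns a dict keyed by PDF path instead of a flat list:

import logging

from scripts.fintoc2022.dataset_loader import FintocDatasetLoader

loader = FintocDatasetLoader(dataset_dir="/tmp/fintoc/dataset", logger=logging.getLogger())
data = loader.get_data(language="en", reader_name="tabby", use_cache=True)
for pdf_path, lines in data.items():
    print(pdf_path, len(lines))  # document path -> number of labeled lines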
diff --git a/scripts/fintoc2022/toc_extractor.py b/scripts/fintoc2022/toc_extractor.py
deleted file mode 100755
index 6c7eb826..00000000
--- a/scripts/fintoc2022/toc_extractor.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import json
-import os
-import re
-import shutil
-import subprocess
-import tempfile
-import zipfile
-from typing import List
-
-import wget
-from PyPDF2 import PdfFileReader, PdfFileWriter
-from dedoc.data_structures.line_with_meta import LineWithMeta
-from tqdm import tqdm
-
-from config import _config as config
-from doc_reader.line_type_classifier.feature_extractor.toc_feature_extractor import TOCFeatureExtractor
-from doc_reader.readers.scanned_reader.pdftxtlayer_reader.pdf_with_text_reader import PdfWithTextReader
-
-toc_item = re.compile(r'"([^"]+)" (\d+)')
-reader = PdfWithTextReader(config=config)
-
-
-def get_one_columns_lines(path: str) -> List[LineWithMeta]:
- if path.startswith("file://"):
- path = path[len("file://"):]
- with tempfile.TemporaryDirectory() as tmpdir:
- path_tmp = os.path.join(tmpdir, os.path.basename(path))
- pdf_reader = PdfFileReader(path)
- writer = PdfFileWriter()
- for page_id in range(0, min(9, pdf_reader.getNumPages())):
- writer.addPage(pdf_reader.getPage(page_id))
- with open(path_tmp, 'wb') as write_file:
- writer.write(write_file)
- return reader.read(path=path_tmp, document_type=None, parameters={"is_one_column_document": True,
- "need_header_footer_analysis": "True"}).lines
-
-
-def get_automatic_toc(path: str) -> List[dict]:
- result = []
- cmd = "pdftocio -p {}".format(path)
- with os.popen(cmd) as out:
- toc = out.readlines()
- if toc:
- for line in toc:
- match = toc_item.match(line.strip())
- if match:
- result.append({"text": match.group(1), "page": match.group(2)})
- return result
-
-
-def main(dir_out: str):
- toc_extractor = TOCFeatureExtractor()
- os.makedirs(dir_out, exist_ok=True)
- data_url = "https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download"
- print("use 'pip install -U pdf.tocgen' to install tool for automatic toc extraction")
- subprocess.run("pip install -U pdf.tocgen".split(" "))
-
- root = "/tmp/.fintoc/"
- if os.path.isdir(root):
- shutil.rmtree(root)
- os.makedirs(root)
- archive = os.path.join(root, "dataset.zip")
- wget.download(data_url, archive)
- with zipfile.ZipFile(archive, 'r') as zip_ref:
- zip_ref.extractall(root)
- data_dir = os.path.join(root, "data")
-
- for lang in ("en", "fr", "sp"):
- pdf_dir = os.path.join(data_dir, lang, "pdf")
- lang_dir_out = os.path.join(dir_out, lang)
- if os.path.isdir(lang_dir_out):
- shutil.rmtree(lang_dir_out)
- os.makedirs(lang_dir_out)
-
- tocs = {}
- for file in tqdm(os.listdir(pdf_dir)):
- if not file.endswith(".pdf"):
- continue
- path = os.path.join(pdf_dir, file)
- toc = get_automatic_toc(path)
- if len(toc) == 0:
- lines = get_one_columns_lines(path)
- toc = toc_extractor.get_toc(lines)
- doc_name = file[: -len(".pdf")]
- tocs[doc_name] = toc
- with open(os.path.join(lang_dir_out, f"{doc_name}_toc.json"), "w") as out:
- json.dump(obj=toc, fp=out, indent=4, ensure_ascii=False)
-
- with open(os.path.join(dir_out, f"{lang}_toc.json"), "w") as out:
- json.dump(tocs, out)
-
-
-main(dir_out="/home/nasty/fintoc2022/toc")
diff --git a/scripts/fintoc2022/train_fintoc_classifier.py b/scripts/fintoc2022/train_fintoc_classifier.py
index e69de29b..8758f570 100644
--- a/scripts/fintoc2022/train_fintoc_classifier.py
+++ b/scripts/fintoc2022/train_fintoc_classifier.py
@@ -0,0 +1,45 @@
+"""
+Training script for the FinTOC 2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/).
+The code is a modification of the winner's solution (ISP RAS team).
+"""
+import argparse
+import logging
+import os
+
+from scripts.fintoc2022.trainer import FintocTrainer
+
+clf_params = {
+ "en_binary": dict(random_state=42, learning_rate=0.25, max_depth=5, n_estimators=400, colsample_bynode=0.8, colsample_bytree=0.5, tree_method="hist"),
+ "fr_binary": dict(random_state=42, learning_rate=0.1, max_depth=5, n_estimators=800, colsample_bynode=0.5, colsample_bytree=0.8, tree_method="approx"),
+ "sp_binary": dict(random_state=42, learning_rate=0.25, max_depth=4, n_estimators=600, colsample_bynode=0.5, colsample_bytree=0.5, tree_method="approx"),
+ "en_target": dict(random_state=42, learning_rate=0.07, max_depth=4, n_estimators=800, colsample_bynode=1, colsample_bytree=1, tree_method="hist"),
+ "fr_target": dict(random_state=42, learning_rate=0.4, max_depth=5, n_estimators=800, colsample_bynode=1, colsample_bytree=0.5, tree_method="exact"),
+ "sp_target": dict(random_state=42, learning_rate=0.25, max_depth=3, n_estimators=600, colsample_bynode=0.5, colsample_bytree=1, tree_method="hist")
+}
+
+
+if __name__ == "__main__":
+ base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "fintoc2022"))
+ os.makedirs(base_dir, exist_ok=True)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--language", choices=["en", "fr", "sp"], help="Language of training data", default="en")
+ parser.add_argument("--reader", choices=["tabby", "txt_layer"], help="Type of PDF reader used for lines extraction", default="tabby")
+ parser.add_argument("--cross_val", type=bool, help="Whether to do a cross-validation", default=True)
+ parser.add_argument("--n_splits", type=int, help="Number of splits for cross-validation", default=3)
+ args = parser.parse_args()
+
+ trainer = FintocTrainer(
+ data_url="https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download",
+ logger=logging.getLogger(),
+ language=args.language,
+ n_splits=args.n_splits,
+ classifiers_dir_path=os.path.join(base_dir, "classifiers"),
+ scores_dir_path=os.path.join(base_dir, "scores"),
+ features_importances_dir_path=os.path.join(base_dir, "features_importances"),
+ tmp_dir="/tmp/fintoc/", # path where dataset and predicted jsons will be saved
+ binary_classifier_parameters=clf_params[f"{args.language}_binary"],
+ target_classifier_parameters=clf_params[f"{args.language}_target"]
+ )
+
+ trainer.fit(reader_name=args.reader, cross_val=args.cross_val)
diff --git a/scripts/fintoc2022/trainer.py b/scripts/fintoc2022/trainer.py
new file mode 100644
index 00000000..662c8374
--- /dev/null
+++ b/scripts/fintoc2022/trainer.py
@@ -0,0 +1,173 @@
+import hashlib
+import json
+import logging
+import os
+import shutil
+from statistics import mean
+from typing import List, Optional
+
+import pandas as pd
+from sklearn.model_selection import GroupKFold
+from tqdm import tqdm
+
+from dedoc.structure_extractors.concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor
+from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
+from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier
+from dedoc.utils.utils import flatten
+from scripts.fintoc2022.dataset_loader import FintocDatasetLoader
+from scripts.fintoc2022.metric import score
+from scripts.fintoc2022.utils import create_json_result, get_values_from_csv
+
+
+class FintocTrainer:
+ """
+ Class to train and evaluate classifiers for the FinTOC 2022 Shared task (https://wp.lancs.ac.uk/cfie/fintoc2022/).
+ The code is a modification of the winner's solution (ISP RAS team).
+ """
+ def __init__(self,
+ data_url: str,
+ logger: logging.Logger,
+ language: str,
+ classifiers_dir_path: str,
+ scores_dir_path: str,
+ features_importances_dir_path: str,
+ tmp_dir: str,
+                 binary_classifier_parameters: Optional[dict] = None,
+                 target_classifier_parameters: Optional[dict] = None,
+ n_splits: int = 3) -> None:
+ """
+ :param data_url: url to download training data for FintocDatasetLoader
+ :param logger: logger for logging details of classifier training
+ :param language: language of data ("en", "fr", "sp")
+ :param classifiers_dir_path: path to the directory where to save trained classifiers
+ :param scores_dir_path: path to the directory where to save final scores during cross-validation
+ :param features_importances_dir_path: path to the directory where to save XLSX files with information about most important features for classifiers
+ :param tmp_dir: path to temporary directory for saving the dataset and output json files with predictions
+ :param binary_classifier_parameters: parameters to pass to xgboost.XGBClassifier for classification header/non-header
+ :param target_classifier_parameters: parameters to pass to xgboost.XGBClassifier for lines depth classification
+ :param n_splits: number of splits for cross-validation
+ """
+ self.logger = logger
+ self.language = language
+ self.feature_extractor = FintocFeatureExtractor()
+ self.structure_extractor = FintocStructureExtractor()
+
+ self.binary_classifier_parameters = {} if binary_classifier_parameters is None else binary_classifier_parameters
+ self.target_classifier_parameters = {} if target_classifier_parameters is None else target_classifier_parameters
+ self.classifier = FintocClassifier(language=self.language)
+
+ self.tmp_dir = tmp_dir
+ os.makedirs(self.tmp_dir, exist_ok=True)
+ self.scores_dir_path = scores_dir_path
+ self.features_importances_dir_path = features_importances_dir_path
+ self.classifiers_dir_path = classifiers_dir_path
+
+ self.data_url = data_url
+ url_hash = hashlib.md5(self.data_url.encode()).hexdigest()
+ self.dataset_dir = os.path.join(self.tmp_dir, f"dataset_{url_hash}")
+ self.data_loader = FintocDatasetLoader(dataset_dir=self.dataset_dir, logger=logger)
+
+ self.n_splits = n_splits
+ self.additional_features_fields = ("line", "label", "group", "uid")
+
+ def fit(self, reader_name: str, cross_val: bool = True, use_cache: bool = True) -> None:
+ """
+        1 - Load data from `self.data_url` if needed, extract lines from PDF with the reader chosen by `reader_name` if needed (FintocDatasetLoader).
+ 2 - Extract a feature matrix for extracted document lines (FintocFeatureExtractor).
+ 3 - Do a cross-validation if needed.
+ 4 - Train resulting classifiers (binary, target) and save them to `self.classifiers_dir_path` (FintocClassifier).
+
+ :param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF
+ :param cross_val: whether to do cross-validation or not
+ :param use_cache: whether to use cached extracted lines as training data
+ """
+ # obtain training data
+ self.logger.info("Get data for training and evaluation")
+ data = self.data_loader.get_data(language=self.language, reader_name=reader_name, use_cache=use_cache)
+
+ # create feature matrix
+ self.logger.info("Create a feature matrix")
+ features, documents = self.structure_extractor.get_features(documents_dict=data)
+ self.logger.info(f"Features shape: {features.shape}")
+ for feature_field in self.additional_features_fields:
+ features[feature_field] = [getattr(line, feature_field) for line in flatten(documents)]
+ features["label"] = features["label"].astype(int)
+ features_names = self.__get_features_names(features)
+
+ # cross-validation using fintoc metric
+ gt_dir = os.path.join(self.dataset_dir, "data", self.language, "annots")
+ scores = self.__cross_validate(features=features, gt_dir=gt_dir) if cross_val else None
+
+ # train resulting classifiers on all data
+ self.logger.info("Train resulting classifiers")
+ self.classifier.fit(self.binary_classifier_parameters, self.target_classifier_parameters, features=features, features_names=features_names)
+ self.__save(features_names=features_names, scores=scores)
+
+ def __get_features_names(self, features_df: pd.DataFrame) -> list:
+ features_names = [col for col in features_df.columns if col not in self.additional_features_fields]
+ return features_names
+
+ def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> dict:
+ self.logger.info("Start cross-validation")
+ features_names = self.__get_features_names(features)
+ results_path = os.path.join(self.scores_dir_path, "cross_val_results", self.language)
+ os.makedirs(results_path, exist_ok=True)
+
+ kf = GroupKFold(n_splits=self.n_splits)
+ json_results_dir = os.path.join(self.tmp_dir, "json_results", self.language)
+
+ result_scores = {"td_scores": [], "toc_scores": []}
+ for i, (train_index, val_index) in tqdm(enumerate(kf.split(features, groups=features.group)), total=self.n_splits):
+ df_train = features.loc[train_index]
+ df_val = features.loc[val_index]
+ self.classifier.fit(self.binary_classifier_parameters, self.target_classifier_parameters, features=df_train, features_names=features_names)
+ predicted_classes = self.classifier.predict(df_val[features_names])
+ result_dict = create_json_result(df_val, predicted_classes)
+
+ if os.path.isdir(json_results_dir):
+ shutil.rmtree(json_results_dir)
+ os.makedirs(json_results_dir)
+
+            tmp_gt_dir, predictions_dir = os.path.join(json_results_dir, "groundtruth"), os.path.join(json_results_dir, "predictions")  # both inside json_results_dir, which is recreated each fold
+ os.makedirs(tmp_gt_dir)
+ os.makedirs(predictions_dir)
+
+ for doc_name, result in result_dict.items():
+ gt_doc_name = doc_name + ".pdf.fintoc4.json"
+ if gt_doc_name not in os.listdir(gt_dir):
+ self.logger.warning(f"{gt_doc_name} is not found in groundtruth")
+ continue
+ with open(os.path.join(predictions_dir, gt_doc_name), "w") as json_file:
+ json.dump(result, json_file, indent=2)
+ shutil.copy(os.path.join(gt_dir, gt_doc_name), os.path.join(tmp_gt_dir, gt_doc_name))
+ score(tmp_gt_dir, predictions_dir)
+
+ path_scores = os.path.join(results_path, str(i))
+ os.makedirs(path_scores, exist_ok=True)
+ for file_name in ["td.log", "toc.log", "td_report.csv", "toc_report.csv"]:
+ shutil.move(file_name, os.path.join(path_scores, file_name))
+
+ f1, inex_f1 = get_values_from_csv(path_scores)
+ result_scores["td_scores"].append(f1)
+ result_scores["toc_scores"].append(inex_f1)
+ self.logger.info(f'Iteration {i}:\ntd={result_scores["td_scores"][-1]}\ntoc={result_scores["toc_scores"][-1]}')
+
+ result_scores["td_mean"] = mean(result_scores["td_scores"])
+ result_scores["toc_mean"] = mean(result_scores["toc_scores"])
+ return result_scores
+
+    def __save(self, features_names: List[str], scores: Optional[dict]) -> None:
+ if scores is not None:
+ os.makedirs(self.scores_dir_path, exist_ok=True)
+ scores_path = os.path.join(self.scores_dir_path, f"scores_{self.language}.json")
+ with open(scores_path, "w") as f:
+ json.dump(scores, f)
+ self.logger.info(f"Scores were saved in {scores_path}")
+
+ self.classifier.save(
+ classifiers_dir_path=self.classifiers_dir_path,
+ features_importances_dir_path=self.features_importances_dir_path,
+ features_names=features_names,
+ logger=self.logger
+ )
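The cross-validation above groups folds by the "group" column, so lines of one document never end up in both the training and the validation split. A toy, self-contained illustration of this behaviour:

import pandas as pd
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({"x": range(6), "group": ["doc1", "doc1", "doc2", "doc2", "doc3", "doc3"]})
for train_idx, val_idx in GroupKFold(n_splits=3).split(df, groups=df.group):
    print(sorted(df.group.iloc[val_idx].unique()))  # one whole document per validation fold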
diff --git a/scripts/fintoc2022/utils.py b/scripts/fintoc2022/utils.py
index 9a4ab457..e5a9f018 100755
--- a/scripts/fintoc2022/utils.py
+++ b/scripts/fintoc2022/utils.py
@@ -18,18 +18,21 @@ def create_json_result(data: pd.DataFrame, predictions: List[int]) -> dict:
}
]
"""
- uid2line = {item[1].uid: {"text": item[1].text.strip() if isinstance(item[1].text, str) else "",
- "page": item[1].page_id + 1,
- "group": item[1].group} for item in data.iterrows()}
+ uid2line = {
+ item[1].uid: {
+ "text": item[1].line.strip() if isinstance(item[1].line, str) else "",
+ "page": item[1].page_id + 1,
+ "group": item[1].group
+ } for item in data.iterrows()
+ }
result = defaultdict(list)
- assert(data.shape[0] == len(predictions))
+ assert data.shape[0] == len(predictions)
for i, (line_uid, prediction) in enumerate(zip(data.uid, predictions)):
line = uid2line[line_uid]
if line["text"] == "":
continue
# TODO crop text lines containing colon
- result[line["group"]].append({"id": i, "depth": str(prediction),
- "text": line["text"], "page": line["page"]})
+ result[line["group"]].append({"id": i, "depth": str(prediction), "text": line["text"], "page": line["page"]})
return result
From 3d472143374d24330d0016b1c2a17901f9b95bb5 Mon Sep 17 00:00:00 2001
From: Nasty
Date: Wed, 24 Apr 2024 16:09:47 +0300
Subject: [PATCH 3/8] Fix some bugs and write documentation
---
dedoc/api/api_args.py | 2 +-
dedoc/api/web/index.html | 1 +
dedoc/manager_config.py | 9 +++--
dedoc/structure_extractors/__init__.py | 5 ++-
.../fintoc_structure_extractor.py | 39 ++++++++++++++-----
.../fintoc_classifier.py | 6 +--
docs/source/dedoc_api_usage/api.rst | 3 +-
docs/source/index.rst | 2 +
docs/source/modules/structure_extractors.rst | 6 +++
docs/source/parameters/structure_type.rst | 12 +++---
docs/source/structure_types/fintoc.rst | 5 +++
scripts/fintoc2022/metric.py | 2 +-
scripts/fintoc2022/train_fintoc_classifier.py | 3 +-
scripts/fintoc2022/trainer.py | 21 +++++-----
scripts/fintoc2022/utils.py | 4 +-
15 files changed, 82 insertions(+), 38 deletions(-)
create mode 100644 docs/source/structure_types/fintoc.rst
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index 1c260b37..f85da673 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -7,7 +7,7 @@
@dataclass
class QueryParameters:
# type of document structure parsing
- document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
+ document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index 5ca05cec..e8b15f68 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -38,6 +38,7 @@
Type of document structure parsing
+
document_type
diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py
index 679db954..35815ecf 100644
--- a/dedoc/manager_config.py
+++ b/dedoc/manager_config.py
@@ -1,7 +1,5 @@
from typing import Optional
-from dedoc.readers.article_reader.article_reader import ArticleReader
-
def _get_manager_config(config: dict) -> dict:
"""
@@ -23,6 +21,7 @@ def _get_manager_config(config: dict) -> dict:
from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
from dedoc.readers.archive_reader.archive_reader import ArchiveReader
+ from dedoc.readers.article_reader.article_reader import ArticleReader
from dedoc.readers.csv_reader.csv_reader import CSVReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from dedoc.readers.email_reader.email_reader import EmailReader
@@ -41,9 +40,11 @@ def _get_manager_config(config: dict) -> dict:
from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
+ from dedoc.structure_extractors.concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor
+ from dedoc.structure_extractors.concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor
@@ -93,7 +94,9 @@ def _get_manager_config(config: dict) -> dict:
DefaultStructureExtractor.document_type: DefaultStructureExtractor(config=config),
DiplomaStructureExtractor.document_type: DiplomaStructureExtractor(config=config),
TzStructureExtractor.document_type: TzStructureExtractor(config=config),
- ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config)
+ ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config),
+ ArticleStructureExtractor.document_type: ArticleStructureExtractor(config=config),
+ FintocStructureExtractor.document_type: FintocStructureExtractor(config=config)
}
return dict(
diff --git a/dedoc/structure_extractors/__init__.py b/dedoc/structure_extractors/__init__.py
index 404d915c..20f6d350 100644
--- a/dedoc/structure_extractors/__init__.py
+++ b/dedoc/structure_extractors/__init__.py
@@ -4,11 +4,12 @@
from .concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor
from .concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor
from .concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor
+from .concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor
from .concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
from .concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
from .concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor
from .structure_extractor_composition import StructureExtractorComposition
__all__ = ['AbstractStructureExtractor', 'AbstractLawStructureExtractor', 'ArticleStructureExtractor', 'ClassifyingLawStructureExtractor',
- 'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor', 'TzStructureExtractor',
- 'StructureExtractorComposition']
+ 'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FintocStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor',
+ 'TzStructureExtractor', 'StructureExtractorComposition']
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
index 3d69ccad..e0d0c30e 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
@@ -7,7 +7,6 @@
from PyPDF2 import PdfFileReader, PdfFileWriter
from dedoc.data_structures import HierarchyLevel, LineWithMeta, UnstructuredDocument
-from dedoc.readers import PdfTxtlayerReader
from dedoc.structure_extractors import AbstractStructureExtractor
from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
@@ -16,15 +15,18 @@
class FintocStructureExtractor(AbstractStructureExtractor):
"""
- This class is an implementation of the TOC extractor for the `FinTOC 2022 Shared task`_.
+ This class is an implementation of the TOC extractor for the `FinTOC 2022 Shared task <https://wp.lancs.ac.uk/cfie/fintoc2022/>`_.
The code is a modification of the winner's solution (ISP RAS team).
- You can find the description of this type of structure in the section :ref:`fintoc_structure`.
+ This structure extractor is used for English, French and Spanish financial prospectuses in PDF format (with a textual layer).
+ It is recommended to use :class:`~dedoc.readers.PdfTxtlayerReader` to obtain document lines.
+ You can find a more detailed description of this type of structure in the section :ref:`fintoc_structure`.
"""
document_type = "fintoc"
def __init__(self, *, config: Optional[dict] = None) -> None:
super().__init__(config=config)
+ from dedoc.readers import PdfTxtlayerReader # to exclude circular imports
self.pdf_reader = PdfTxtlayerReader(config=self.config)
self.toc_extractor = TOCFeatureExtractor()
self.features_extractor = FintocFeatureExtractor()
@@ -35,9 +37,22 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None, file_path: Optional[str] = None) -> UnstructuredDocument:
"""
-
- To get the information about the method's parameters look at the documentation of the class \
- :class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
+ According to the `FinTOC 2022 <https://wp.lancs.ac.uk/cfie/fintoc2022/>`_ title detection task, lines are classified as titles and non-titles.
+ The information about titles is saved in ``line.metadata.hierarchy_level`` (:class:`~dedoc.data_structures.HierarchyLevel` class):
+
+ - Title lines have ``HierarchyLevel.header`` type, and their depth (``HierarchyLevel.level_2``) matches \
+ the depth of the corresponding TOC item from the FinTOC 2022 TOC generation task.
+ - Non-title lines have ``HierarchyLevel.raw_text`` type, and their depth is not determined.
+
+ :param document: document content that has been received from some of the readers (:class:`~dedoc.readers.PdfTxtlayerReader` is recommended).
+ :param parameters: for this structure extractor, the "language" parameter is used to set the document's language, e.g. ``parameters={"language": "en"}``. \
+ The following options are supported:
+
+ * "en" - English (default);
+ * "fr" - French;
+ * "sp" - Spanish.
+ :param file_path: path to the file on disk.
+ :return: document content with additional information about title/non-title lines and hierarchy levels of titles.
"""
parameters = {} if parameters is None else parameters
language = parameters.get("language", "en")
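
Taken together with the recommendation above, a hedged end-to-end sketch of driving the extractor manually; the file name and empty configs are placeholders, and the reader/extractor signatures are assumed from this patch:

from dedoc.data_structures import HierarchyLevel
from dedoc.readers import PdfTxtlayerReader
from dedoc.structure_extractors import FintocStructureExtractor

reader = PdfTxtlayerReader(config={})
extractor = FintocStructureExtractor(config={})

# Lines come from the PDF's textual layer, then titles and depths are filled in
document = reader.read(file_path="prospectus.pdf")
document = extractor.extract(document, parameters={"language": "en"}, file_path="prospectus.pdf")

for line in document.lines:
    if line.metadata.hierarchy_level.line_type == HierarchyLevel.header:
        print(line.metadata.hierarchy_level.level_2, line.line.strip())
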
@@ -76,12 +91,16 @@ def __filter_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
return lines
- def __get_toc(self, file_path: Optional[str]) -> Optional[List[Dict[str, Union[LineWithMeta, str]]]]:
- if file_path is None:
- return
+ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMeta, str]]]:
+ """
+ Try to get the TOC from the PDF outline automatically; if that fails, extract it using regular expressions.
+ """
+ if file_path is None or not file_path.lower().endswith(".pdf"):
+ return []
toc = self.__get_automatic_toc(path=file_path)
if len(toc) > 0:
+ self.logger.info(f"Got automatic TOC from {os.path.basename(file_path)}")
return toc
pdf_reader = PdfFileReader(file_path)
@@ -99,7 +118,7 @@ def __get_toc(self, file_path: Optional[str]) -> Optional[List[Dict[str, Union[L
def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]:
result = []
- with os.popen(f"pdftocio -p {path}") as out:
+ with os.popen(f'pdftocio -p "{path}"') as out:
toc = out.readlines()
if len(toc) == 0:
return result
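
Quoting the path guards against spaces, but os.popen still routes through a shell. A subprocess-based sketch of the same call (not part of the patch) sidesteps shell quoting entirely:

import subprocess

def pdftocio_toc_lines(path: str) -> list:
    # Same as os.popen(f'pdftocio -p "{path}"').readlines(), without invoking a shell
    completed = subprocess.run(["pdftocio", "-p", path], capture_output=True, text=True)
    return completed.stdout.splitlines()
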
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
index 0c761ee0..cb5f0d5b 100755
--- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -51,17 +51,17 @@ def fit(self, binary_classifier_parameters: dict, target_classifier_parameters:
self.binary_classifier.fit(features[features_names], features.label != -1)
self.target_classifier.fit(features[features_names][features.label != -1], features.label[features.label != -1])
- def save(self, classifiers_dir_path: str, features_importances_dir_path: str, logger: logging.Logger, features_names: List[str]) -> None:
+ def save(self, classifiers_dir_path: str, features_importances_dir_path: str, logger: logging.Logger, features_names: List[str], reader: str) -> None:
os.makedirs(classifiers_dir_path, exist_ok=True)
for classifier_type in ("binary", "target"):
- with gzip.open(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}.pkg.gz"), "wb") as output_file:
+ with gzip.open(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}_{reader}.pkg.gz"), "wb") as output_file:
pickle.dump(self.classifiers[classifier_type], output_file)
logger.info(f"Classifiers were saved in {classifiers_dir_path} directory")
os.makedirs(features_importances_dir_path, exist_ok=True)
for classifier_type in ("binary", "target"):
xgbfir.saveXgbFI(self.classifiers[classifier_type], feature_names=features_names,
- OutputXlsxFile=os.path.join(features_importances_dir_path, f"feature_importances_{classifier_type}_{self.language}.xlsx"))
+ OutputXlsxFile=os.path.join(features_importances_dir_path, f"feature_importances_{classifier_type}_{self.language}_{reader}.xlsx"))
logger.info(f"Features importances were saved in {features_importances_dir_path} directory")
@property
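
For context, the two classifiers referenced in save are trained in a cascade: the binary model separates titles (label != -1) from body text, and the target model assigns a depth to title lines only. A toy reproduction of that split on synthetic data (feature names and values are made up):

import numpy as np
import pandas as pd
from xgboost import XGBClassifier

rng = np.random.default_rng(42)
features = pd.DataFrame({"font_size": rng.normal(10, 2, 200), "indentation": rng.random(200)})
label = rng.choice([-1, 0, 1, 2], size=200)  # -1 = non-title, 0..2 = title depth

binary_classifier = XGBClassifier(n_estimators=10).fit(features, label != -1)
target_classifier = XGBClassifier(n_estimators=10).fit(features[label != -1], label[label != -1])
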
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index b06b345d..8db61804 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -80,7 +80,8 @@ Api parameters description
* **law** -- Russian laws (:ref:`law_structure`);
* **tz** -- Russian technical specifications (:ref:`tz_structure`);
* **diploma** -- Russian thesis (:ref:`diploma_structure`);
- * **article** -- scientific article (:ref:`article_structure`).
+ * **article** -- scientific article (:ref:`article_structure`);
+ * **fintoc** -- English, French and Spanish financial prospectuses (:ref:`fintoc_structure`).
This type is used for choosing a specific structure extractor (and, in some cases, a specific reader).
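
Through the HTTP API this maps onto the document_type form field; a hedged example with requests (host, port and file name are placeholders for a locally running dedoc):

import requests

with open("prospectus.pdf", "rb") as pdf_file:
    response = requests.post(
        "http://localhost:1231/upload",
        files={"file": pdf_file},
        data={"document_type": "fintoc", "language": "eng", "pdf_with_text_layer": "true"},
    )
print(response.status_code)
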
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 92582bec..779a6adb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -209,6 +209,7 @@ Currently the following domains can be handled:
* Russian technical specifications (:ref:`structure description <tz_structure>`).
* Russian thesis for bachelor or master degree (:ref:`structure description <diploma_structure>`).
* English scientific articles (:ref:`structure description <article_structure>`).
+ * English, French and Spanish financial prospectuses (:ref:`structure description <fintoc_structure>`).
For a document of unknown or unsupported domain there is an option to use default structure extractor
(``document_type=other`` at :ref:`api_parameters`), the default document structure is described :ref:`here <other_structure>`.
@@ -257,6 +258,7 @@ For a document of unknown or unsupported domain there is an option to use defaul
structure_types/tz
structure_types/diploma
structure_types/article
+ structure_types/fintoc
.. toctree::
diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst
index 08655f06..86ded2c3 100644
--- a/docs/source/modules/structure_extractors.rst
+++ b/docs/source/modules/structure_extractors.rst
@@ -58,3 +58,9 @@ dedoc.structure_extractors
:members:
.. autoattribute:: document_type
+
+.. autoclass:: dedoc.structure_extractors.FintocStructureExtractor
+ :show-inheritance:
+ :members:
+
+ .. autoattribute:: document_type
diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst
index 842f6afc..dfd09803 100644
--- a/docs/source/parameters/structure_type.rst
+++ b/docs/source/parameters/structure_type.rst
@@ -22,16 +22,18 @@ Structure type configuring
- Type of the document structure according to specific domain.
If you use default manager config for :class:`~dedoc.DedocManager`, then the following options are available:
- * **other** -- structure for document of any domain (:ref:`other_structure`)
+ * **other** -- structure for document of any domain (:ref:`other_structure`).
In this case, :class:`~dedoc.structure_extractors.DefaultStructureExtractor` is used.
- * **law** -- Russian laws (:ref:`law_structure`)
+ * **law** -- Russian laws (:ref:`law_structure`).
In this case, :class:`~dedoc.structure_extractors.ClassifyingLawStructureExtractor` is used.
- * **tz** -- Russian technical specifications (:ref:`tz_structure`)
+ * **tz** -- Russian technical specifications (:ref:`tz_structure`).
In this case, :class:`~dedoc.structure_extractors.TzStructureExtractor` is used.
- * **diploma** -- Russian thesis (:ref:`diploma_structure`)
+ * **diploma** -- Russian thesis (:ref:`diploma_structure`).
In this case, :class:`~dedoc.structure_extractors.DiplomaStructureExtractor` is used.
- * **article** -- scientific article (:ref:`article_structure`)
+ * **article** -- scientific article (:ref:`article_structure`).
In this case, :class:`~dedoc.readers.ArticleReader` and :class:`~dedoc.structure_extractors.ArticleStructureExtractor` are used.
+ * **fintoc** -- English, French and Spanish financial prospectuses (:ref:`fintoc_structure`).
+ In this case, :class:`~dedoc.structure_extractors.FintocStructureExtractor` is used.
If you use your custom configuration, refer to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition`
diff --git a/docs/source/structure_types/fintoc.rst b/docs/source/structure_types/fintoc.rst
new file mode 100644
index 00000000..6c9e6e17
--- /dev/null
+++ b/docs/source/structure_types/fintoc.rst
@@ -0,0 +1,5 @@
+.. _fintoc_structure:
+
+FinTOC structure type
+=====================
+
diff --git a/scripts/fintoc2022/metric.py b/scripts/fintoc2022/metric.py
index eb82fd4a..3ab4da15 100755
--- a/scripts/fintoc2022/metric.py
+++ b/scripts/fintoc2022/metric.py
@@ -42,7 +42,7 @@
import numpy as np
JSON_EXTENSION = ".fintoc4.json"
-VERBOSE = True
+VERBOSE = False
STRING_THRESHOLD = 0.85
diff --git a/scripts/fintoc2022/train_fintoc_classifier.py b/scripts/fintoc2022/train_fintoc_classifier.py
index 8758f570..ddee19da 100644
--- a/scripts/fintoc2022/train_fintoc_classifier.py
+++ b/scripts/fintoc2022/train_fintoc_classifier.py
@@ -33,6 +33,7 @@
data_url="https://at.ispras.ru/owncloud/index.php/s/EZfm71WimN2h7rC/download",
logger=logging.getLogger(),
language=args.language,
+ reader_name=args.reader,
n_splits=args.n_splits,
classifiers_dir_path=os.path.join(base_dir, "classifiers"),
scores_dir_path=os.path.join(base_dir, "scores"),
@@ -42,4 +43,4 @@
target_classifier_parameters=clf_params[f"{args.language}_target"]
)
- trainer.fit(reader_name=args.reader, cross_val=args.cross_val)
+ trainer.fit(cross_val=args.cross_val)
diff --git a/scripts/fintoc2022/trainer.py b/scripts/fintoc2022/trainer.py
index 662c8374..0b0b311b 100644
--- a/scripts/fintoc2022/trainer.py
+++ b/scripts/fintoc2022/trainer.py
@@ -28,6 +28,7 @@ def __init__(self,
data_url: str,
logger: logging.Logger,
language: str,
+ reader_name: str,
classifiers_dir_path: str,
scores_dir_path: str,
features_importances_dir_path: str,
@@ -38,7 +39,8 @@ def __init__(self,
"""
:param data_url: url to download training data for FintocDatasetLoader
:param logger: logger for logging details of classifier training
- :param language: language of data ("en", "fr", "sp")
+ :param language: ("en", "fr", "sp") - language of data
+ :param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF
:param classifiers_dir_path: path to the directory where to save trained classifiers
:param scores_dir_path: path to the directory where to save final scores during cross-validation
:param features_importances_dir_path: path to the directory where to save XLSX files with information about most important features for classifiers
@@ -49,6 +51,7 @@ def __init__(self,
"""
self.logger = logger
self.language = language
+ self.reader_name = reader_name
self.feature_extractor = FintocFeatureExtractor()
self.structure_extractor = FintocStructureExtractor()
@@ -70,20 +73,19 @@ def __init__(self,
self.n_splits = n_splits
self.additional_features_fields = ("line", "label", "group", "uid")
- def fit(self, reader_name: str, cross_val: bool = True, use_cache: bool = True) -> None:
+ def fit(self, cross_val: bool = True, use_cache: bool = True) -> None:
"""
1 - Load data from `self.data_url` if needed, extract lines from PDF with the reader chosen by `self.reader_name` if needed (FintocDatasetLoader).
2 - Extract a feature matrix for extracted document lines (FintocFeatureExtractor).
3 - Do a cross-validation if needed.
4 - Train resulting classifiers (binary, target) and save them to `self.classifiers_dir_path` (FintocClassifier).
- :param reader_name: ("tabby", "txt_layer") - type of reader for lines extraction from PDF
:param cross_val: whether to do cross-validation or not
:param use_cache: whether to use cached extracted lines as training data
"""
# obtain training data
self.logger.info("Get data for training and evaluation")
- data = self.data_loader.get_data(language=self.language, reader_name=reader_name, use_cache=use_cache)
+ data = self.data_loader.get_data(language=self.language, reader_name=self.reader_name, use_cache=use_cache)
# create feature matrix
self.logger.info("Create a feature matrix")
@@ -110,11 +112,11 @@ def __get_features_names(self, features_df: pd.DataFrame) -> list:
def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> dict:
self.logger.info("Start cross-validation")
features_names = self.__get_features_names(features)
- results_path = os.path.join(self.scores_dir_path, "cross_val_results", self.language)
+ results_path = os.path.join(self.scores_dir_path, f"cross_val_results_{self.language}_{self.reader_name}")
os.makedirs(results_path, exist_ok=True)
kf = GroupKFold(n_splits=self.n_splits)
- json_results_dir = os.path.join(self.tmp_dir, "json_results", self.language)
+ json_results_dir = os.path.join(self.tmp_dir, f"json_results_{self.language}_{self.reader_name}")
result_scores = {"td_scores": [], "toc_scores": []}
for i, (train_index, val_index) in tqdm(enumerate(kf.split(features, groups=features.group)), total=self.n_splits):
@@ -128,7 +130,7 @@ def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> dict:
shutil.rmtree(json_results_dir)
os.makedirs(json_results_dir)
- tmp_gt_dir, predictions_dir = os.path.join(json_results_dir, "groundtruth"), os.path.join(self.tmp_dir, "predictions")
+ tmp_gt_dir, predictions_dir = os.path.join(json_results_dir, "groundtruth"), os.path.join(json_results_dir, "predictions")
os.makedirs(tmp_gt_dir)
os.makedirs(predictions_dir)
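
The choice of GroupKFold (rather than plain KFold) matters here: grouping by features.group keeps every line of a document in the same fold, so validation documents are never partially seen during training. A minimal sketch of that guarantee:

import pandas as pd
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({"x": range(6), "group": ["doc_a", "doc_a", "doc_b", "doc_b", "doc_c", "doc_c"]})
for train_idx, val_idx in GroupKFold(n_splits=3).split(df, groups=df.group):
    # No document ever contributes lines to both sides of a split
    assert set(df.group.iloc[train_idx]).isdisjoint(set(df.group.iloc[val_idx]))
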
@@ -160,7 +162,7 @@ def __save(self, features_names: list[str], scores: Optional[dict]) -> None:
if scores is not None:
os.makedirs(self.scores_dir_path, exist_ok=True)
- scores_path = os.path.join(self.scores_dir_path, f"scores_{self.language}.json")
+ scores_path = os.path.join(self.scores_dir_path, f"scores_{self.language}_{self.reader_name}.json")
with open(scores_path, "w") as f:
json.dump(scores, f)
self.logger.info(f"Scores were saved in {scores_path}")
@@ -169,5 +171,6 @@ def __save(self, features_names: list[str], scores: Optional[dict]) -> None:
classifiers_dir_path=self.classifiers_dir_path,
features_importances_dir_path=self.features_importances_dir_path,
features_names=features_names,
- logger=self.logger
+ logger=self.logger,
+ reader=self.reader_name
)
diff --git a/scripts/fintoc2022/utils.py b/scripts/fintoc2022/utils.py
index e5a9f018..187f5158 100755
--- a/scripts/fintoc2022/utils.py
+++ b/scripts/fintoc2022/utils.py
@@ -21,7 +21,7 @@ def create_json_result(data: pd.DataFrame, predictions: List[int]) -> dict:
uid2line = {
item[1].uid: {
"text": item[1].line.strip() if isinstance(item[1].line, str) else "",
- "page": item[1].page_id + 1,
+ "page": int(item[1].page_id + 1),
"group": item[1].group
} for item in data.iterrows()
}
@@ -29,7 +29,7 @@ def create_json_result(data: pd.DataFrame, predictions: List[int]) -> dict:
assert data.shape[0] == len(predictions)
for i, (line_uid, prediction) in enumerate(zip(data.uid, predictions)):
line = uid2line[line_uid]
- if line["text"] == "":
+ if line["text"] == "" or prediction == -1:
continue
# TODO crop text lines containing colon
result[line["group"]].append({"id": i, "depth": str(prediction), "text": line["text"], "page": line["page"]})
From 4e3822f8008bb042ffb61929745273f4ffd90a6e Mon Sep 17 00:00:00 2001
From: Nasty
Date: Wed, 24 Apr 2024 17:50:57 +0300
Subject: [PATCH 4/8] Add classifiers downloading, small fixes
---
dedoc/api/api_args.py | 2 +-
dedoc/api/web/index.html | 4 ++-
dedoc/download_models.py | 11 ++++++-
.../fintoc_structure_extractor.py | 30 ++++++++++++++-----
.../fintoc_classifier.py | 13 +++++++-
scripts/fintoc2022/metric.py | 4 +--
scripts/fintoc2022/trainer.py | 2 +-
7 files changed, 52 insertions(+), 14 deletions(-)
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index f85da673..f139733f 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -29,7 +29,7 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
- language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
+ language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
description='One or multiple column document, "auto" - predict number of page columns automatically')
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index e8b15f68..055ef58b 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -38,7 +38,7 @@
Type of document structure parsing
-
+
document_type
@@ -138,6 +138,8 @@
PDF handling
+ <option value="fra">fra</option>
+ <option value="spa">spa</option>
language
diff --git a/dedoc/download_models.py b/dedoc/download_models.py
index 643cf30e..04bce97f 100644
--- a/dedoc/download_models.py
+++ b/dedoc/download_models.py
@@ -15,7 +15,8 @@
scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
- line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013"
+ line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
+ fintoc_classifiers="ec08837809f8e8da6010777b07272af705df9e1b"
)
@@ -42,6 +43,14 @@ def download(resources_path: str) -> None:
repo_name="line_type_classifiers",
hub_name=f"{classifier_type}.pkl.gz")
+ fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
+ for language in ("en",): # TODO ("en", "fr", "sp"):
+ for classifier_type in ("target", "binary"):
+ download_from_hub(out_dir=fintoc_classifiers_resources_path,
+ out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
+ repo_name="fintoc_classifiers",
+ hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz")
+
if __name__ == "__main__":
resources_path = get_config()["resources_path"]
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
index e0d0c30e..591f8cc3 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/fintoc_structure_extractor.py
@@ -6,6 +6,7 @@
import pandas as pd
from PyPDF2 import PdfFileReader, PdfFileWriter
+from dedoc.config import get_config
from dedoc.data_structures import HierarchyLevel, LineWithMeta, UnstructuredDocument
from dedoc.structure_extractors import AbstractStructureExtractor
from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
@@ -31,7 +32,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
self.toc_extractor = TOCFeatureExtractor()
self.features_extractor = FintocFeatureExtractor()
self.languages = ("en", "fr", "sp")
- self.classifiers = {language: FintocClassifier(language=language) for language in self.languages}
+ path = os.path.join(get_config()["resources_path"], "fintoc_classifiers")
+ self.classifiers = {language: FintocClassifier(language=language, weights_dir_path=path) for language in self.languages}
self.toc_item_regexp = re.compile(r'"([^"]+)" (\d+)')
self.empty_string_regexp = re.compile(r"^\s*\n$")
@@ -48,16 +50,14 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
:param parameters: for this structure extractor, the "language" parameter is used to set the document's language, e.g. ``parameters={"language": "en"}``. \
The following options are supported:
- * "en" - English (default);
- * "fr" - French;
- * "sp" - Spanish.
+ * "en", "eng" - English (default);
+ * "fr", "fra" - French;
+ * "sp", "spa" - Spanish.
:param file_path: path to the file on disk.
:return: document content with additional information about title/non-title lines and hierarchy levels of titles.
"""
parameters = {} if parameters is None else parameters
- language = parameters.get("language", "en")
- if language not in self.languages:
- raise ValueError(f"Language {language} is not supported by this extractor. Supported languages: {self.languages}")
+ language = self.__get_param_language(parameters=parameters)
features, documents = self.get_features(documents_dict={file_path: document.lines})
predictions = self.classifiers[language].predict(features)
@@ -73,6 +73,22 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
return document
+ def __get_param_language(self, parameters: dict) -> str:
+ language = parameters.get("language", "en")
+
+ if language in ("en", "eng", "rus+eng"):
+ return "en"
+
+ if language in ("fr", "fra"):
+ return "fr"
+
+ if language in ("sp", "spa"):
+ return "sp"
+
+ if language not in self.languages:
+ self.logger.warning(f"Language {language} is not supported by this extractor. Use default language (en)")
+ return "en"
+
def get_features(self, documents_dict: Dict[str, List[LineWithMeta]]) -> Tuple[pd.DataFrame, List[List[LineWithMeta]]]:
toc_lines, documents = [], []
for file_path, document_lines in documents_dict.items():
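
The alias handling in __get_param_language could equally be written as a table lookup; a functionally equivalent sketch (not part of the patch):

LANGUAGE_ALIASES = {
    "en": "en", "eng": "en", "rus+eng": "en",
    "fr": "fr", "fra": "fr",
    "sp": "sp", "spa": "sp",
}

def normalize_language(language: str) -> str:
    # Unknown values fall back to English, mirroring the warning branch above
    return LANGUAGE_ALIASES.get(language, "en")
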
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
index cb5f0d5b..79a7ed72 100755
--- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -9,6 +9,8 @@
import xgbfir
from xgboost import XGBClassifier
+from dedoc.download_models import download_from_hub
+
class FintocClassifier:
"""
@@ -73,8 +75,17 @@ def target_classifier(self) -> XGBClassifier:
return self.__lazy_load_weights("target")
def __lazy_load_weights(self, classifier_type: str) -> XGBClassifier:
+ assert self.weights_dir_path is not None
if self.classifiers[classifier_type] is None:
- with gzip.open(os.path.join(self.weights_dir_path, f"{classifier_type}_classifier_{self.language}.pkg.gz"), "rb") as input_file:
+ file_name = f"{classifier_type}_classifier_{self.language}.pkg.gz"
+ classifier_path = os.path.join(self.weights_dir_path, file_name)
+ if not os.path.isfile(classifier_path):
+ download_from_hub(out_dir=self.weights_dir_path,
+ out_name=file_name,
+ repo_name="fintoc_classifiers",
+ hub_name=f"{classifier_type}_classifier_{self.language}_txt_layer.pkg.gz")
+
+ with gzip.open(classifier_path, "rb") as input_file:
self.classifiers[classifier_type] = pickle.load(file=input_file)
return self.classifiers[classifier_type]
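
The effect of this hunk is that weights are materialized on first property access and fetched from the hub on a cache miss; afterwards the unpickled model is served from memory. A stripped-down version of the pattern (class and method names are placeholders):

import gzip
import os
import pickle

class LazyWeights:
    def __init__(self, weights_dir_path: str) -> None:
        self.weights_dir_path = weights_dir_path
        self._cache = {}

    def load(self, file_name: str):
        if file_name not in self._cache:
            path = os.path.join(self.weights_dir_path, file_name)
            # In FintocClassifier, download_from_hub would be called here on a miss
            with gzip.open(path, "rb") as weights_file:
                self._cache[file_name] = pickle.load(weights_file)
        return self._cache[file_name]
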
diff --git a/scripts/fintoc2022/metric.py b/scripts/fintoc2022/metric.py
index 3ab4da15..ad6695b7 100755
--- a/scripts/fintoc2022/metric.py
+++ b/scripts/fintoc2022/metric.py
@@ -42,7 +42,7 @@
import numpy as np
JSON_EXTENSION = ".fintoc4.json"
-VERBOSE = False
+VERBOSE = True
STRING_THRESHOLD = 0.85
@@ -593,7 +593,7 @@ def get_std_row(td_prec, td_reca, td_f1):
]
-def get_logger(name, path_to_log, level=logging.INFO):
+def get_logger(name, path_to_log, level=logging.ERROR):
handler = logging.FileHandler(path_to_log, mode="w")
formatter = logging.Formatter("%(message)s")
handler.setFormatter(formatter)
diff --git a/scripts/fintoc2022/trainer.py b/scripts/fintoc2022/trainer.py
index 0b0b311b..88339138 100644
--- a/scripts/fintoc2022/trainer.py
+++ b/scripts/fintoc2022/trainer.py
@@ -164,7 +164,7 @@ def __save(self, features_names: list[str], scores: Optional[dict]) -> None:
os.makedirs(self.scores_dir_path, exist_ok=True)
scores_path = os.path.join(self.scores_dir_path, f"scores_{self.language}_{self.reader_name}.json")
with open(scores_path, "w") as f:
- json.dump(scores, f)
+ json.dump(scores, f, indent=2)
self.logger.info(f"Scores were saved in {scores_path}")
self.classifier.save(
From 751e0b572145202ca3d471b5f6880df8df8cee08 Mon Sep 17 00:00:00 2001
From: Nasty
Date: Thu, 25 Apr 2024 12:58:53 +0300
Subject: [PATCH 5/8] FinTOC benchmarks added and all classifiers uploaded to
huggingface
---
dedoc/download_models.py | 4 +-
.../fintoc_classifier.py | 2 +-
resources/benchmarks/fintoc_scores.html | 83 +++++++++++++++++++
scripts/fintoc2022/benchmark_fintoc.py | 43 ++++++++++
4 files changed, 129 insertions(+), 3 deletions(-)
create mode 100644 resources/benchmarks/fintoc_scores.html
create mode 100644 scripts/fintoc2022/benchmark_fintoc.py
diff --git a/dedoc/download_models.py b/dedoc/download_models.py
index 04bce97f..b520a7df 100644
--- a/dedoc/download_models.py
+++ b/dedoc/download_models.py
@@ -16,7 +16,7 @@
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
- fintoc_classifiers="ec08837809f8e8da6010777b07272af705df9e1b"
+ fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263"
)
@@ -44,7 +44,7 @@ def download(resources_path: str) -> None:
hub_name=f"{classifier_type}.pkl.gz")
fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
- for language in ("en",): # TODO ("en", "fr", "sp"):
+ for language in ("en", "fr", "sp"):
for classifier_type in ("target", "binary"):
download_from_hub(out_dir=fintoc_classifiers_resources_path,
out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
index 79a7ed72..15516c75 100755
--- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -75,8 +75,8 @@ def target_classifier(self) -> XGBClassifier:
return self.__lazy_load_weights("target")
def __lazy_load_weights(self, classifier_type: str) -> XGBClassifier:
- assert self.weights_dir_path is not None
if self.classifiers[classifier_type] is None:
+ assert self.weights_dir_path is not None
file_name = f"{classifier_type}_classifier_{self.language}.pkg.gz"
classifier_path = os.path.join(self.weights_dir_path, file_name)
if not os.path.isfile(classifier_path):
diff --git a/resources/benchmarks/fintoc_scores.html b/resources/benchmarks/fintoc_scores.html
new file mode 100644
index 00000000..50624b7d
--- /dev/null
+++ b/resources/benchmarks/fintoc_scores.html
@@ -0,0 +1,83 @@
+