Review fixes

NastyBoget committed Apr 27, 2024
1 parent c693de1 commit 970e7aa
Showing 14 changed files with 92 additions and 62 deletions.
@@ -119,6 +119,17 @@ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMeta, str]]]:
self.logger.info(f"Got automatic TOC from {os.path.basename(file_path)}")
return toc

lines = self.__read_one_column_lines(file_path)
return self.toc_extractor.get_toc(lines)

def __read_one_column_lines(self, file_path: str) -> List[LineWithMeta]:
"""
As a rule, the TOC is one-columned even in two-column documents, so we handle TOC line extraction separately:
1. save first 10 pages of the document to a temporary directory;
2. read lines from these pages in one-column mode without headers and footers.
Later these lines are analysed to extract TOC lines.
"""
pdf_reader = PdfFileReader(file_path)
writer = PdfFileWriter()

@@ -130,14 +141,12 @@ def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMeta, str]]]:
writer.write(write_file)
lines = self.pdf_reader.read(file_path=tmp_path, parameters={"is_one_column_document": "True", "need_header_footer_analysis": "True"}).lines

return self.toc_extractor.get_toc(lines)
return lines
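
For context, a minimal sketch of the page-copying step hidden by the collapsed hunk above, assuming the PyPDF2 PdfFileReader/PdfFileWriter API used in this method; the helper name save_first_pages is illustrative, and the 10-page limit mirrors the docstring:

import os

from PyPDF2 import PdfFileReader, PdfFileWriter

def save_first_pages(file_path: str, tmp_dir: str, page_count: int = 10) -> str:
    # copy at most the first `page_count` pages into a temporary PDF
    pdf_reader = PdfFileReader(file_path)
    writer = PdfFileWriter()
    for page_id in range(min(page_count, pdf_reader.getNumPages())):
        writer.addPage(pdf_reader.getPage(page_id))
    tmp_path = os.path.join(tmp_dir, os.path.basename(file_path))
    with open(tmp_path, "wb") as write_file:
        writer.write(write_file)
    return tmp_path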

def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]:
result = []
with os.popen(f'pdftocio -p "{path}"') as out:
toc = out.readlines()
if len(toc) == 0:
return result

for line in toc:
match = self.toc_item_regexp.match(line.strip())
@@ -85,14 +85,13 @@ def __look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[
for line_id, _ in enumerate(document):
if line_id >= n:
prev_line = document[line_id - n]
is_prev_line_ends = prev_line.line.endswith((".", ";"))
res["prev_line_ends"].append(1 if is_prev_line_ends else 0)
res["prev_line_ends"].append(prev_line.line.endswith((".", ";")))
res["prev_ends_with_colon"].append(prev_line.line.endswith(":"))
res["prev_is_space"].append(prev_line.line.lower().isspace())
else:
res["prev_line_ends"].append(0)
res["prev_ends_with_colon"].append(0)
res["prev_is_space"].append(0)
res["prev_line_ends"].append(False)
res["prev_ends_with_colon"].append(False)
res["prev_is_space"].append(False)
return res

def __get_line_relative_length(self, lines: List[LineWithMeta]) -> List[float]:
@@ -13,6 +13,22 @@


class PairedFeatureExtractor(AbstractFeatureExtractor):
"""
This class is used as an auxiliary feature extractor to the main extractor.
It allows adding "raw" features related to the importance of lines.
Based on a single line property (size, indentation), it computes the line's raw depth inside the document tree.
Example:
For lines
line1 (size=16)
line2 (size=14)
line3 (size=12)
line4 (size=12)
line5 (size=14)
line6 (size=12)
We will obtain a feature vector (raw_depth_size)
[0, 1, 2, 2, 1, 2]
"""

def parameters(self) -> dict:
return {}
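
A sketch of how such a raw depth can be computed from a single line property (here, font size) with a stack of open levels; the function name is illustrative, not the extractor's actual code:

from typing import List

def raw_depth_by_size(sizes: List[float]) -> List[int]:
    stack: List[float] = []  # sizes of the currently open levels, strictly decreasing
    depths = []
    for size in sizes:
        while stack and stack[-1] <= size:
            stack.pop()  # a line with a larger or equal size closes deeper levels
        depths.append(len(stack))
        stack.append(size)
    return depths

# reproduces the docstring example
assert raw_depth_by_size([16, 14, 12, 12, 14, 12]) == [0, 1, 2, 2, 1, 2]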
Expand Up @@ -2,7 +2,7 @@
import logging
import os
import pickle
from typing import List, Optional
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
@@ -47,7 +47,11 @@ def predict(self, features: pd.DataFrame) -> List[int]:
# return list [1, 2, 3, -1, -1, ...], where positive values mean header depth and -1 means a non-header line
return list(result)

def fit(self, binary_classifier_parameters: dict, target_classifier_parameters: dict, features: pd.DataFrame, features_names: list[str]) -> None:
def fit(self,
binary_classifier_parameters: Dict[str, Union[int, float, str]],
target_classifier_parameters: Dict[str, Union[int, float, str]],
features: pd.DataFrame,
features_names: List[str]) -> None:
self.classifiers["binary"] = XGBClassifier(**binary_classifier_parameters)
self.classifiers["target"] = XGBClassifier(**target_classifier_parameters)
self.binary_classifier.fit(features[features_names], features.label != -1)
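
fit() above trains the two stages separately; a sketch of how predict() presumably combines them, with the classifier roles taken from the snippet and everything else illustrative:

import numpy as np
import pandas as pd

def predict_two_stage(binary_clf, target_clf, features: pd.DataFrame, features_names: list) -> list:
    x = features[features_names]
    is_header = binary_clf.predict(x).astype(bool)  # stage 1: header vs. non-header
    depths = target_clf.predict(x)                  # stage 2: depth for each line
    return list(np.where(is_header, depths, -1))    # keep depth only for header lines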
8 changes: 5 additions & 3 deletions docs/source/dedoc_api_usage/api.rst
@@ -70,7 +70,7 @@ Api parameters description
* - :cspan:`3` **Type of document structure parsing**

* - document_type
- other, law, tz, diploma
- other, law, tz, diploma, article, fintoc
- other
- Type of the document structure according to specific domain.

@@ -216,13 +216,15 @@ Api parameters description
It is highly recommended to use this option value for any PDF document parsing.

* - language
- rus, eng, rus+eng
- rus, eng, rus+eng, fra, spa
- rus+eng
- Language of the parsed PDF document without a textual layer. The following values are available:

* **rus** -- Russian;
* **eng** -- English;
* **rus+eng** -- both Russian and English.
* **rus+eng** -- both Russian and English;
* **fra** -- French (for fintoc structure type);
* **spa** -- Spanish (for fintoc structure type).

* - pages
- :, start:, :end, start:end
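
To illustrate the new values, a request sketch for the fintoc document type; the endpoint and port assume a local dedoc instance with default settings:

import requests

data = {"document_type": "fintoc", "pdf_with_text_layer": "true", "language": "fra"}
with open("prospectus_fr.pdf", "rb") as pdf_file:
    response = requests.post("http://localhost:1231/upload", files={"file": pdf_file}, data=data)
result = response.json()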
7 changes: 5 additions & 2 deletions docs/source/parameters/pdf_handling.rst
@@ -49,16 +49,19 @@ PDF and images handling
It is highly recommended to use this option value for any PDF document parsing.

* - language
- rus, eng, rus+eng
- rus, eng, rus+eng, fra, spa
- rus+eng
- * :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
* :meth:`dedoc.readers.ReaderComposition.read`
* :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract`
- Language of the document without a textual layer. The following values are available:

* **rus** -- Russian;
* **eng** -- English;
* **rus+eng** -- both Russian and English.
* **rus+eng** -- both Russian and English;
* **fra** -- French (for :class:`~dedoc.structure_extractors.FintocStructureExtractor`);
* **spa** -- Spanish (for :class:`~dedoc.structure_extractors.FintocStructureExtractor`).

* - pages
- :, start:, :end, start:end
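
The same parameters can be passed programmatically; a sketch assuming the DedocManager.parse interface referenced in the table above:

from dedoc import DedocManager

manager = DedocManager()
parsed = manager.parse(
    file_path="prospectus_fr.pdf",
    parameters={"document_type": "fintoc", "pdf_with_text_layer": "true", "language": "fra"},
)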
2 changes: 1 addition & 1 deletion docs/source/parameters/structure_type.rst
@@ -15,7 +15,7 @@ Structure type configuring
- Description

* - document_type
- other, law, tz, diploma
- other, law, tz, diploma, article, fintoc
- other
- * :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract`
57 changes: 27 additions & 30 deletions scripts/fintoc2022/dataset_loader.py
@@ -25,29 +25,29 @@ class FintocLineWithLabelExtractor:
def __init__(self) -> None:
self.readers = {"tabby": PdfTabbyReader(), "txt_layer": PdfTxtlayerReader()}

def get_lines(self, file_name: str, file_path: str, annotation_path: str, reader_name: str) -> List[LineWithLabel]:
def get_lines(self, file_name: str, file_path: str, gt_path: str, reader_name: str) -> List[LineWithLabel]:
"""
Extract lines from PDF document, create labels for lines from annotation file given by FinTOC.
Annotations are matched to lines using Levenshtein distance (threshold=0.8).
Extract lines from the PDF document and create labels for them from the ground-truth file given by FinTOC.
Labeled lines are matched to the lines extracted by dedoc using Levenshtein distance (threshold=0.8).
:param file_name: name of the file (PDF, json)
:param file_path: path to the PDF document
:param annotation_path: path to the JSON file with annotations
:param gt_path: path to the ground-truth JSON file with labels
:param reader_name: ("tabby", "txt_layer") - type of PDF reader used for lines extraction
:return: document in the form of a list of lines with labels
"""
document = self.readers[reader_name].read(file_path, parameters={"need_header_footer_analysis": "True"})

annotations = defaultdict(list)
with open(annotation_path) as annotations_file:
for annotation in json.load(annotations_file):
annotations[annotation["page"] - 1].append(annotation)
labeled_lines = defaultdict(list)
with open(gt_path) as gt_file:
for labeled_line in json.load(gt_file):
labeled_lines[labeled_line["page"] - 1].append(labeled_line)

result = []
for line in document.lines:
annotations_page = [(ratio(line.line, annotation["text"]), annotation) for annotation in annotations[line.metadata.page_id]]
best_annotation = max(annotations_page, key=lambda t: t[0], default=(0, {}))
depth = best_annotation[1]["depth"] if len(annotations_page) > 0 and best_annotation[0] > 0.8 else "-1"
page_candidates = [(ratio(line.line, labeled_line["text"]), labeled_line) for labeled_line in labeled_lines[line.metadata.page_id]]
best_line = max(page_candidates, key=lambda t: t[0], default=(0, {}))
depth = best_line[1]["depth"] if len(page_candidates) > 0 and best_line[0] > 0.8 else "-1"
result.append(LineWithLabel(line=line.line, metadata=line.metadata, annotations=line.annotations, label=str(depth), group=file_name, uid=line.uid))

return sorted(result, key=lambda x: (x.metadata.page_id, x.metadata.line_id))
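
As a usage note on the matching above, Levenshtein's ratio returns a normalized similarity in [0, 1], so the 0.8 threshold tolerates small OCR and whitespace differences:

from Levenshtein import ratio

ratio("TABLE OF CONTENTS", "TABLE OF CONTENT")  # ~0.97, passes the 0.8 threshold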
@@ -89,39 +89,36 @@ def get_data(self, language: str, reader_name: str, use_cache: bool = True) -> Dict[str, List[LineWithLabel]]:

if os.path.isfile(pkl_path) and use_cache:
with gzip.open(pkl_path) as input_file:
lines = pickle.load(input_file)
self.logger.info("Data were loaded from the local disk")
return lines
parsed_files = pickle.load(input_file)
self.logger.info(f"Data were loaded from the local disk: {len(parsed_files)} files")
return parsed_files

result = self.__read_pdf_lines(archive_path, language, reader_name)

with gzip.open(pkl_path, "wb") as out:
pickle.dump(obj=result, file=out)
self.logger.info(Counter([line.label for document in result.values() for line in document]))
return result

def __read_pdf_lines(self, archive_path: str, language: str, reader_name: str) -> Dict[str, List[LineWithLabel]]:
with zipfile.ZipFile(archive_path, "r") as zip_ref:
zip_ref.extractall(self.dataset_dir)

data_dir = os.path.join(self.dataset_dir, "data", language)
pdf_dir = os.path.join(data_dir, "pdf")
annotations_dir = os.path.join(data_dir, "annots")
gt_dir = os.path.join(data_dir, "annots")
pdf_files = {pdf_file[:-len(".pdf")]: os.path.join(pdf_dir, pdf_file) for pdf_file in os.listdir(pdf_dir) if pdf_file.endswith(".pdf")}
annotations_files = {
ann_file[:-len(".pdf.fintoc4.json")]: os.path.join(annotations_dir, ann_file)
for ann_file in os.listdir(annotations_dir) if ann_file.endswith(".json")
}
assert set(pdf_files) == set(annotations_files)
gt_files = {gt_file[:-len(".pdf.fintoc4.json")]: os.path.join(gt_dir, gt_file) for gt_file in os.listdir(gt_dir) if gt_file.endswith(".json")}
assert set(pdf_files) == set(gt_files)

result = {}
with tempfile.TemporaryDirectory() as tmp_dir:
for file_name in pdf_files:
pdf_tmp_path = os.path.join(tmp_dir, file_name) + ".pdf"
shutil.copy(pdf_files[file_name], pdf_tmp_path)
try:
document = self.line_extractor.get_lines(
file_name=file_name,
file_path=pdf_tmp_path,
annotation_path=annotations_files[file_name],
reader_name=reader_name
)
document = self.line_extractor.get_lines(file_name=file_name, file_path=pdf_tmp_path, gt_path=gt_files[file_name], reader_name=reader_name)
result[pdf_files[file_name]] = document
except Exception as e:
self.logger.warning(f"Failed to read {file_name} by {reader_name}, error: {e}")

with gzip.open(pkl_path, "wb") as out:
pickle.dump(obj=result, file=out)
self.logger.info(Counter([line.label for document in result.values() for line in document]))
return result
10 changes: 5 additions & 5 deletions scripts/fintoc2022/trainer.py
@@ -4,7 +4,7 @@
import os
import shutil
from statistics import mean
from typing import Optional
from typing import Dict, List, Optional, Union

import pandas as pd
from sklearn.model_selection import GroupKFold
@@ -101,15 +101,15 @@ def fit(self, cross_val: bool = True, use_cache: bool = True) -> None:
scores = self.__cross_validate(features=features, gt_dir=gt_dir) if cross_val else None

# train resulting classifiers on all data
self.logger.info("Train resulting classifiers")
self.logger.info("Start training resulting classifiers on all data")
self.classifier.fit(self.binary_classifier_parameters, self.target_classifier_parameters, features=features, features_names=features_names)
self.__save(features_names=features_names, scores=scores)

def __get_features_names(self, features_df: pd.DataFrame) -> list:
def __get_features_names(self, features_df: pd.DataFrame) -> List[str]:
features_names = [col for col in features_df.columns if col not in self.additional_features_fields]
return features_names

def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> dict:
def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> Dict[str, Union[List[float], float]]:
self.logger.info("Start cross-validation")
features_names = self.__get_features_names(features)
results_path = os.path.join(self.scores_dir_path, f"cross_val_results_{self.language}_{self.reader_name}")
@@ -158,7 +158,7 @@ def __cross_validate(self, features: pd.DataFrame, gt_dir: str) -> dict:
result_scores["toc_mean"] = mean(result_scores["toc_scores"])
return result_scores

def __save(self, features_names: list[str], scores: Optional[dict]) -> None:
def __save(self, features_names: List[str], scores: Optional[Dict[str, Union[List[float], float]]]) -> None:

if scores is not None:
os.makedirs(self.scores_dir_path, exist_ok=True)
4 changes: 2 additions & 2 deletions scripts/fintoc2022/utils.py
@@ -1,11 +1,11 @@
import os
from collections import defaultdict
from typing import List, Tuple
from typing import Dict, List, Tuple, Union

import pandas as pd


def create_json_result(data: pd.DataFrame, predictions: List[int]) -> dict:
def create_json_result(data: pd.DataFrame, predictions: List[int]) -> Dict[str, List[Dict[str, Union[str, int]]]]:
"""
Creates a dictionary with a TOC for each document: {"doc_name": TOC}.
TOC is a list of dictionaries of the following form:
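
The entry keys are cut off by the collapsed diff; judging from the ground-truth files read in dataset_loader.py, each TOC entry presumably looks roughly like this (illustrative values):

toc_example = {
    "prospectus_fr.pdf": [
        {"text": "PURPOSE", "page": 1, "depth": 1},
        {"text": "Key Information Document (KID)", "page": 2, "depth": 2},
    ]
}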
18 changes: 9 additions & 9 deletions tests/api_tests/test_api_doctype_fintoc.py
@@ -4,27 +4,27 @@
class TestApiFintoc(AbstractTestApiDocReader):

def test_article_en(self) -> None:
file_name = "pdf_with_text_layer/prospectus.pdf"
file_name = "fintoc/prospectus_en.pdf"
result = self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true"))

tree = result["content"]["structure"]
self._check_tree_sanity(tree)

# headers
node = self._get_by_tree_path(tree, "0.1")
node = self._get_by_tree_path(tree, "0.0")
self.assertEqual("header", node["metadata"]["paragraph_type"])
self.assertEqual("Prospectus", node["text"].strip())
node = self._get_by_tree_path(tree, "0.1.2")
self.assertEqual("Key Information Document (KID)", node["text"].strip())
node = self._get_by_tree_path(tree, "0.0.0")
self.assertEqual("header", node["metadata"]["paragraph_type"])
self.assertEqual("TABLE OF CONTENTS", node["text"].strip())
self.assertEqual("PURPOSE", node["text"].strip())

# raw text
node = self._get_by_tree_path(tree, "0.1.2.0")
node = self._get_by_tree_path(tree, "0.0.0.0")
self.assertEqual("raw_text", node["metadata"]["paragraph_type"])
self.assertTrue(node["text"].startswith("PART I - GENERAL INFORMATION"))
self.assertTrue(node["text"].startswith("This document provides"))

def test_article_fr(self) -> None:
file_name = "pdf_with_text_layer/prospectus_fr.pdf"
file_name = "fintoc/prospectus_fr.pdf"
result = self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true", language="fr"))

tree = result["content"]["structure"]
@@ -44,7 +44,7 @@ def test_article_fr(self) -> None:
self.assertEqual("OPCVM relevant de la directive européenne 2009/65/CE", node["text"].strip())

def test_article_sp(self) -> None:
file_name = "pdf_with_text_layer/prospectus_sp.pdf"
file_name = "fintoc/prospectus_sp.pdf"
result = self._send_request(file_name, dict(document_type="fintoc", pdf_with_text_layer="true", language="sp"))

tree = result["content"]["structure"]
Binary file added tests/data/fintoc/prospectus_en.pdf
File renamed without changes.
File renamed without changes.
