Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-642 FinTOC benchmarks #426

Merged
merged 8 commits into from
Apr 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@ exclude =
resources,
venv,
build,
dedoc.egg-info
docs/_build
dedoc.egg-info,
docs/_build,
scripts/fintoc2022/metric.py

# ANN101 - type annotations for self
# T201 - prints found
# JS101 - Multi-line container not broken after opening character
ignore =
ANN101
per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101,T201
scripts/benchmark_pdf_performance*:JS101
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
4 changes: 2 additions & 2 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
@dataclass
class QueryParameters:
# type of document structure parsing
document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")
Expand All @@ -29,7 +29,7 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
description='One or multiple column document, "auto" - predict number of page columns automatically')
Expand Down
3 changes: 3 additions & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ <h4>Type of document structure parsing</h4>
<option value="tz">tz</option>
<option value="diploma">diploma</option>
<option value="article">article</option>
<option value="fintoc">fintoc</option>
</select> document_type
</label>
</p>
Expand Down Expand Up @@ -137,6 +138,8 @@ <h4>PDF handling</h4>
<option value="rus">rus</option>
<option value="eng">eng</option>
<option value="rus+eng" selected>rus+eng</option>
<option value="fra">fra</option>
<option value="spa">spa</option>
</select> language
</label>
</p>
Expand Down
11 changes: 10 additions & 1 deletion dedoc/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013"
line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263"
)


Expand All @@ -42,6 +43,14 @@ def download(resources_path: str) -> None:
repo_name="line_type_classifiers",
hub_name=f"{classifier_type}.pkl.gz")

fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
for language in ("en", "fr", "sp"):
for classifier_type in ("target", "binary"):
download_from_hub(out_dir=fintoc_classifiers_resources_path,
out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
repo_name="fintoc_classifiers",
hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz")


if __name__ == "__main__":
resources_path = get_config()["resources_path"]
Expand Down
9 changes: 6 additions & 3 deletions dedoc/manager_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from typing import Optional

from dedoc.readers.article_reader.article_reader import ArticleReader


def _get_manager_config(config: dict) -> dict:
"""
Expand All @@ -23,6 +21,7 @@ def _get_manager_config(config: dict) -> dict:
from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
from dedoc.readers.archive_reader.archive_reader import ArchiveReader
from dedoc.readers.article_reader.article_reader import ArticleReader
from dedoc.readers.csv_reader.csv_reader import CSVReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from dedoc.readers.email_reader.email_reader import EmailReader
Expand All @@ -41,9 +40,11 @@ def _get_manager_config(config: dict) -> dict:
from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
from dedoc.structure_extractors.concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
from dedoc.structure_extractors.concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor
Expand Down Expand Up @@ -93,7 +94,9 @@ def _get_manager_config(config: dict) -> dict:
DefaultStructureExtractor.document_type: DefaultStructureExtractor(config=config),
DiplomaStructureExtractor.document_type: DiplomaStructureExtractor(config=config),
TzStructureExtractor.document_type: TzStructureExtractor(config=config),
ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config)
ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config),
ArticleStructureExtractor.document_type: ArticleStructureExtractor(config=config),
FintocStructureExtractor.document_type: FintocStructureExtractor(config=config)
}

return dict(
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class PdfBaseReader(BaseReader):

def __init__(self, *, config: Optional[dict] = None) -> None:
super().__init__(config=config)
self.config["n_jobs"] = config.get("n_jobs", 1)
self.config["n_jobs"] = self.config.get("n_jobs", 1)
self.table_recognizer = TableRecognizer(config=self.config)
self.metadata_extractor = LineMetadataExtractor(config=self.config)
self.attachment_extractor = PDFAttachmentsExtractor(config=self.config)
Expand Down
5 changes: 3 additions & 2 deletions dedoc/structure_extractors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from .concrete_structure_extractors.article_structure_extractor import ArticleStructureExtractor
from .concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor
from .concrete_structure_extractors.diploma_structure_extractor import DiplomaStructureExtractor
from .concrete_structure_extractors.fintoc_structure_extractor import FintocStructureExtractor
from .concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
from .concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
from .concrete_structure_extractors.tz_structure_extractor import TzStructureExtractor
from .structure_extractor_composition import StructureExtractorComposition

__all__ = ['AbstractStructureExtractor', 'AbstractLawStructureExtractor', 'ArticleStructureExtractor', 'ClassifyingLawStructureExtractor',
'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor', 'TzStructureExtractor',
'StructureExtractorComposition']
'DefaultStructureExtractor', 'DiplomaStructureExtractor', 'FintocStructureExtractor', 'FoivLawStructureExtractor', 'LawStructureExtractor',
'TzStructureExtractor', 'StructureExtractorComposition']
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import os
import re
import subprocess
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd

from dedoc.config import get_config
from dedoc.data_structures import HierarchyLevel, LineWithMeta, UnstructuredDocument
from dedoc.structure_extractors import AbstractStructureExtractor
from dedoc.structure_extractors.feature_extractors.fintoc_feature_extractor import FintocFeatureExtractor
from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
from dedoc.structure_extractors.line_type_classifiers.fintoc_classifier import FintocClassifier


class FintocStructureExtractor(AbstractStructureExtractor):
    """
    This class is an implementation of the TOC extractor for the `FinTOC 2022 Shared task <https://wp.lancs.ac.uk/cfie/fintoc2022/>`_.
    The code is a modification of the winner's solution (ISP RAS team).

    This structure extractor is used for English, French and Spanish financial prospectuses in PDF format (with a textual layer).
    It is recommended to use :class:`~dedoc.readers.PdfTxtlayerReader` to obtain document lines.
    You can find the more detailed description of this type of structure in the section :ref:`fintoc_structure`.
    """
    document_type = "fintoc"

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Create the helper objects used by the extractor: a PDF reader, TOC and feature extractors,
        one classifier per supported language and the compiled regular expressions.

        :param config: configuration dictionary passed to the base class and to the PDF reader.
        """
        super().__init__(config=config)
        from dedoc.readers import PdfTxtlayerReader  # to exclude circular imports
        self.pdf_reader = PdfTxtlayerReader(config=self.config)
        self.toc_extractor = TOCFeatureExtractor()
        self.features_extractor = FintocFeatureExtractor()
        self.languages = ("en", "fr", "sp")
        path = os.path.join(get_config()["resources_path"], "fintoc_classifiers")
        self.classifiers = {language: FintocClassifier(language=language, weights_dir_path=path) for language in self.languages}
        # one TOC item in the pdftocio output: a quoted title followed by a page number
        self.toc_item_regexp = re.compile(r'"([^"]+)" (\d+)')
        # a line that consists of whitespace only and ends with a line break
        self.empty_string_regexp = re.compile(r"^\s*\n$")

    def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None, file_path: Optional[str] = None) -> UnstructuredDocument:
        """
        According to the `FinTOC 2022 <https://wp.lancs.ac.uk/cfie/fintoc2022/>`_ title detection task, lines are classified as titles and non-titles.
        The information about titles is saved in ``line.metadata.hierarchy_level`` (:class:`~dedoc.data_structures.HierarchyLevel` class):

            - Title lines have ``HierarchyLevel.header`` type, and their depth (``HierarchyLevel.level_2``) is similar to \
            the depth of TOC item from the FinTOC 2022 TOC generation task.
            - Non-title lines have ``HierarchyLevel.raw_text`` type, and their depth isn't obtained.

        :param document: document content that has been received from some of the readers (:class:`~dedoc.readers.PdfTxtlayerReader` is recommended).
        :param parameters: for this structure extractor, "language" parameter is used for setting document's language, e.g. ``parameters={"language": "en"}``. \
        The following options are supported:

            * "en", "eng" - English (default);
            * "fr", "fra" - French;
            * "sp", "spa" - Spanish.
        :param file_path: path to the file on disk.
        :return: document content with added additional information about title/non-title lines and hierarchy levels of titles.
        """
        parameters = {} if parameters is None else parameters
        language = self.__get_param_language(parameters=parameters)

        features, documents = self.get_features(documents_dict={file_path: document.lines})
        predictions = self.classifiers[language].predict(features)
        # a single document was passed, so its filtered lines are the first (and only) item
        lines: List[LineWithMeta] = documents[0]
        assert len(lines) == len(predictions)

        for line, prediction in zip(lines, predictions):
            if prediction > 0:
                # a positive prediction marks a title, its value is used as the title depth
                line.metadata.hierarchy_level = HierarchyLevel(level_1=1, level_2=prediction, line_type=HierarchyLevel.header, can_be_multiline=True)
            else:
                line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
        document.lines = lines

        return document

    def __get_param_language(self, parameters: dict) -> str:
        """
        Map the "language" parameter to one of the language codes used by the classifiers ("en", "fr", "sp").

        :param parameters: parameters dictionary, the "language" key is read ("en" if it is absent).
        :return: "en", "fr" or "sp"; unknown values lead to a warning and the default "en".
        """
        language = parameters.get("language", "en")

        if language in ("en", "eng", "rus+eng"):
            return "en"

        if language in ("fr", "fra"):
            return "fr"

        if language in ("sp", "spa"):
            return "sp"

        # every supported alias has been handled above, so the language is unsupported here
        self.logger.warning(f"Language {language} is not supported by this extractor. Use default language (en)")
        return "en"

    def get_features(self, documents_dict: Dict[str, List[LineWithMeta]]) -> Tuple[pd.DataFrame, List[List[LineWithMeta]]]:
        """
        Build the features for the given documents.

        :param documents_dict: mapping from a file path to the lines of the corresponding document.
        :return: the result of the feature extractor and the list of filtered lines for each document (in the iteration order of ``documents_dict``).
        """
        toc_lines, documents = [], []
        for file_path, document_lines in documents_dict.items():
            toc_lines.append(self.__get_toc(file_path=file_path))
            documents.append(self.__filter_lines(document_lines))
        features = self.features_extractor.transform(documents=documents, toc_lines=toc_lines)
        return features, documents

    def __filter_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
        """
        Drop whitespace-only lines and remove special unicode symbols from the text of the remaining lines (in place).

        :param lines: document lines.
        :return: the kept lines with the cleaned text.
        """
        special_unicode_symbols = [u"\uf0b7", u"\uf0d8", u"\uf084", u"\uf0a7", u"\uf0f0", u"\x83"]
        # deleting all symbols in one pass gives the same text as removing them one by one
        removal_table = str.maketrans("", "", "".join(special_unicode_symbols))

        lines = [line for line in lines if not self.empty_string_regexp.match(line.line)]
        for line in lines:
            line.set_line(line.line.translate(removal_table))

        return lines

    def __get_toc(self, file_path: Optional[str]) -> List[Dict[str, Union[LineWithMeta, str]]]:
        """
        Try to get TOC from PDF automatically. If TOC wasn't extracted automatically, it is extracted using regular expressions.

        :param file_path: path to the file on disk, an empty TOC is returned if it is missing or doesn't end with ".pdf".
        :return: list of TOC items.
        """
        if file_path is None or not file_path.lower().endswith(".pdf"):
            return []

        toc = self.__get_automatic_toc(path=file_path)
        if toc:
            self.logger.info(f"Got automatic TOC from {os.path.basename(file_path)}")
            return toc

        # fall back to reading the first pages of the document and searching for the TOC in their lines
        parameters = {"is_one_column_document": "True", "need_header_footer_analysis": "True", "pages": ":10"}
        lines = self.pdf_reader.read(file_path=file_path, parameters=parameters).lines
        return self.toc_extractor.get_toc(lines)

    def __get_automatic_toc(self, path: str) -> List[Dict[str, Union[LineWithMeta, str]]]:
        """
        Get TOC items from the output of the ``pdftocio`` command-line utility.

        :param path: path to the PDF file.
        :return: list of dictionaries with the "line" (TOC item text) and "page" (page number as a string) keys, \
        the list is empty if the utility cannot be started or prints no TOC items.
        """
        try:
            # the arguments are passed as a list (no shell), so the file path cannot be interpreted as a shell command
            completed = subprocess.run(["pdftocio", "-p", path], stdout=subprocess.PIPE, universal_newlines=True, check=False)
        except OSError as error:
            # e.g. the utility is not installed: behave as if the document has no automatic TOC
            self.logger.warning(f"Cannot run pdftocio: {error}")
            return []

        result = []
        for line in completed.stdout.splitlines():
            match = self.toc_item_regexp.match(line.strip())
            if match:
                result.append({"line": LineWithMeta(match.group(1)), "page": match.group(2)})

        return result
Loading
Loading