Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-590 fix code style in scripts directory #400

Merged
merged 3 commits into from
Feb 1, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@ exclude =
.github,
*__init__.py,
resources,
scripts,
venv,
build,
dedoc.egg-info
@@ -23,3 +22,5 @@ exclude =
# ANN101 - type annotations for self
ignore =
ANN101
per-file-ignores =
scripts/*:T201
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
args:
- "--config=.flake8"
additional_dependencies: [
8 changes: 4 additions & 4 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
@@ -33,18 +33,18 @@ def get_cpu_performance() -> float:


cpu_performance = get_cpu_performance()
print('"cpu_performance" = {}'.format(cpu_performance))
print(f'"cpu_performance" = {cpu_performance}')

with TemporaryDirectory() as path_base:
path_out = os.path.join(path_base, "dataset.zip")
wget.download(data_url, path_out)
with zipfile.ZipFile(path_out, 'r') as zip_ref:
with zipfile.ZipFile(path_out, "r") as zip_ref:
zip_ref.extractall(path_base)
print(path_base)

failed = []
result = OrderedDict()
result["version"] = requests.get("{}/version".format(host)).text
result["version"] = requests.get(f"{host}/version").text
result["cpu_performance"] = cpu_performance
tasks = [
Task("images", "images", {}),
@@ -90,5 +90,5 @@ def get_cpu_performance() -> float:

with open(path_result, "w") as file_out:
json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
print("save result in" + path_result)
print(f"save result in {path_result}")
print(failed)
6 changes: 3 additions & 3 deletions scripts/benchmark_pdf_miner.py
Original file line number Diff line number Diff line change
@@ -24,10 +24,10 @@
wget.download(URL, pdfs_zip_path)
wget.download(URL_GT, pdfs_zip_gt_path)

with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(pdfs_zip_path)
with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref:
with zipfile.ZipFile(pdfs_zip_gt_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(pdfs_zip_gt_path)

@@ -53,7 +53,7 @@
accuracy_path = Path(tmpdir) / "accuracy.txt"
if accuracy_path.exists():
accuracy_path.unlink()
command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}"
command = f'{accuracy_script_path} "{gt_path}" {tmp_ocr_path} >> {accuracy_path}'
os.system(command)

with open(accuracy_path, "r") as f:
25 changes: 11 additions & 14 deletions scripts/benchmark_table/benchmark_table.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import zipfile
from pathlib import Path
import json
import pprint
from typing import Optional, List
import zipfile
from pathlib import Path
from typing import List, Optional

import numpy as np
import wget

@@ -63,7 +64,7 @@ def download_dataset(data_dir: Path, name_zip: str, url: str) -> None:
pdfs_zip_path = data_dir / name_zip
wget.download(url, str(data_dir))

with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
pdfs_zip_path.unlink()

@@ -83,19 +84,17 @@ def benchmark_on_our_data() -> dict:
path_images = data_dir / "images"
path_gt = data_dir / "gt.json"
path_pred = data_dir / "pred.json"
download_dataset(data_dir,
name_zip="benchmark_table_data.zip",
url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")
download_dataset(data_dir, name_zip="benchmark_table_data.zip", url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")

mode_metric_structure_only = False

with open(path_gt, "r") as fp:
gt_json = json.load(fp)
'''
"""
Creating base html (based on method predictions for future labeling)
path_images = data_dir / "images_tmp"
pred_json = prediction("gt_tmp.json", path_images)
'''
"""
pred_json = prediction(path_pred, path_images)
scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only)

@@ -113,7 +112,7 @@ def benchmark_on_generated_table() -> dict:
Article generation information https://arxiv.org/pdf/1905.13391.pdf
Note: generate the 1st table tape category
Note: don't use the header table tag <th>; replace it with the <td> tag
Note: all generated data (four categories) you can download from
Note: all generated data (four categories) you can download from
TODO: some tables have low quality. We should trace the reason.
All generated data (all categories) we can download from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU
"""
@@ -129,7 +128,7 @@ def benchmark_on_generated_table() -> dict:
# make common ground-truth file
common_gt_json = {}
for pathname in Path.iterdir(path_gt):
image_name = pathname.name.split(".")[0] + '.png'
image_name = pathname.name.split(".")[0] + ".png"
with open(pathname, "r") as fp:
table_html = fp.read()
# exclude header tags
@@ -146,9 +145,7 @@ def benchmark_on_generated_table() -> dict:
path_pred = data_dir / "pred.json"

pred_json = prediction(path_pred, path_images)
scores = call_metric(pred_json=pred_json, true_json=common_gt_json,
structure_only=mode_metric_structure_only,
ignore_nodes=['span', 'style', 'head', 'h4'])
scores = call_metric(pred_json=pred_json, true_json=common_gt_json, structure_only=mode_metric_structure_only, ignore_nodes=["span", "style", "head", "h4"])

result = dict()
result["mode_metric_structure_only"] = mode_metric_structure_only
57 changes: 33 additions & 24 deletions scripts/benchmark_table/metric.py
Original file line number Diff line number Diff line change
@@ -11,28 +11,30 @@

# Source: https://github.com/ibm-aur-nlp/PubTabNet

from collections import deque
from typing import Optional

import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import etree, html
from collections import deque

from tqdm import tqdm


class TableTree(Tree):
def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children):
def __init__(self, tag: str, colspan=None, rowspan=None, content=None, visible=None, *children): # noqa
self.tag = tag
self.colspan = colspan
self.rowspan = rowspan
self.content = content
self.visible = visible
self.children = list(children)

def bracket(self):
"""Show tree using brackets notation
def bracket(self) -> str:
"""
if self.tag == "td" or self.tag == 'th':
Show tree using brackets notation
"""
if self.tag == "td" or self.tag == "th":
result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
else:
result = f'"tag": {self.tag}'
@@ -43,18 +45,22 @@ def bracket(self):

class CustomConfig(Config):
@staticmethod
def maximum(*sequences):
"""Get maximum possible value
def maximum(*sequences): # noqa
"""
Get maximum possible value
"""
return max(map(len, sequences))

def normalized_distance(self, *sequences) -> float:
"""Get distance from 0 to 1
def normalized_distance(self, *sequences) -> float: # noqa
"""
Get distance from 0 to 1
"""
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

def rename(self, node1: TableTree, node2: TableTree) -> float:
"""Compares attributes of trees"""
"""
Compares attributes of trees
"""
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
return 1.
if node1.tag == "td":
@@ -66,18 +72,20 @@ def rename(self, node1: TableTree, node2: TableTree) -> float:


class TEDS(object):
""" Tree Edit Distance based Similarity
"""
Tree Edit Distance based Similarity
"""

def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
def __init__(self, structure_only: bool = False, n_jobs: int = 1, ignore_nodes: Optional[list] = None) -> None:
assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1"
self.structure_only = structure_only
self.n_jobs = n_jobs
self.ignore_nodes = ignore_nodes
self.__tokens__ = []

def tokenize(self, node):
""" Tokenizes table cells
def tokenize(self, node: TableTree) -> None:
"""
Tokenizes table cells
"""
self.__tokens__.append(f"<{node.tag}>")
if node.text is not None:
@@ -89,11 +97,11 @@ def tokenize(self, node):
if node.tag != "td" and node.tail is not None:
self.__tokens__ += list(node.tail)

def get_span(self, node, name_span: str) -> int:
def get_span(self, node: TableTree, name_span: str) -> int:
value = int(node.attrib.get(name_span, "1"))
return 1 if value <= 0 else value

def load_html_tree(self, node, parent=None):
def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> TableTree:
""" Converts HTML tree to the format required by apted
"""
if node.tag == "td":
@@ -109,7 +117,7 @@ def load_html_tree(self, node, parent=None):
colspan=self.get_span(node, "colspan"),
rowspan=self.get_span(node, "rowspan"),
content=cell,
visible=False if node.attrib.get("style") == "display: none" else True, *deque())
visible=node.attrib.get("style") != "display: none", *deque()) # noqa
except Exception as ex:
print(f"Bad html file. HTML parse exception. Exception's msg: {ex}")
raise ex
@@ -148,12 +156,13 @@ def evaluate(self, pred: str, true: str) -> float:
else:
return 0.0

def batch_evaluate(self, pred_json, true_json):
""" Computes TEDS score between the prediction and the ground truth of
a batch of samples
@params pred_json: {'FILENAME': 'HTML CODE', ...}
@params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
@output: {'FILENAME': 'TEDS SCORE', ...}
def batch_evaluate(self, pred_json: dict, true_json: dict) -> dict:
"""
Computes TEDS score between the prediction and the ground truth of a batch of samples

:param pred_json: {'FILENAME': 'HTML CODE', ...}
:param true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
:return: {'FILENAME': 'TEDS SCORE', ...}
"""
samples = true_json.keys()
scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)]
10 changes: 5 additions & 5 deletions scripts/benchmark_tl_correctness.py
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@
path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")

host = "http://localhost:1231"
param_dist_errors = namedtuple('Param', ('total_file_size', 'total_incorrect_files', 'failed'))
param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))


def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
@@ -49,7 +49,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
if not os.path.isdir(benchmark_data_dir):
path_out = os.path.join(data_dir, "data_with_text_layer.zip")
wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
with zipfile.ZipFile(path_out, 'r') as zip_ref:
with zipfile.ZipFile(path_out, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(path_out)
print(f"Benchmark data downloaded to {benchmark_data_dir}")
@@ -63,15 +63,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
parameters = dict(pdf_with_text_layer="auto", pages="1:1")
result_item = OrderedDict()

incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' incorrect ', 'data_correct_text_layer', parameters)
incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed

correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' correct ', 'data_incorrect_text_layer', parameters)
correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
result["guessing_the_correctness_of_the_text"] = result_item

with open(path_result, "w") as file_out:
json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
print("Save result in" + path_result)
print(f"Save result in {path_result}")
36 changes: 20 additions & 16 deletions scripts/create_txtlayer_dataset.py
Original file line number Diff line number Diff line change
@@ -17,9 +17,9 @@

class CorrectTextGenerator:
def __init__(self) -> None:
self.citation = re.compile(r'\[\d+]')
self.meta = re.compile(r'\[править \| править код]')
self.symbols = re.compile(r'[→←↑]')
self.citation = re.compile(r"\[\d+]")
self.meta = re.compile(r"\[править \| править код]")
self.symbols = re.compile(r"[→←↑]")

self.title_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=query&format=json&list=random&rnlimit=1&rnnamespace=0"
self.article_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=parse&format=json&page={title}&prop=text"
@@ -37,15 +37,15 @@ def get_random_text(self, lang: str) -> str:
# 2 - Get text the article
article_result = requests.post(self.article_url.format(lang=lang, title=title))
article_result_dict = article_result.json()
article = article_result_dict["parse"]["text"]['*']
bs = BeautifulSoup(article, 'html.parser')
article = article_result_dict["parse"]["text"]["*"]
bs = BeautifulSoup(article, "html.parser")
article_text = bs.get_text()

# 3 - Clear text of the article from unused symbols
article_text_fixed = re.sub(self.citation, '', article_text)
article_text_fixed = re.sub(self.citation, "", article_text)
article_text_fixed = re.sub(self.meta, "", article_text_fixed)
article_text_fixed = re.sub(self.symbols, "", article_text_fixed)
article_text_fixed = re.sub(r'\n+', "\n", article_text_fixed)
article_text_fixed = re.sub(r"\n+", "\n", article_text_fixed)
except: # noqa
article_text_fixed = ""

@@ -62,18 +62,22 @@ class EncodingCorruptor(Corruptor):
def __init__(self) -> None:
self.encodings = {
"en": {
"input": ['cp1026'],
"output": ['cp1256', 'cp437', 'cp775', 'cp852', 'cp855', 'cp857', 'cp860', 'cp861', 'cp862', 'cp863', 'cp866', 'gb18030', 'hp_roman8',
'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'koi8_r',
'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman']
"input": ["cp1026"],
"output": [
"cp1256", "cp437", "cp775", "cp852", "cp855", "cp857", "cp860", "cp861", "cp862", "cp863", "cp866", "gb18030", "hp_roman8",
"iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "koi8_r",
"mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman"
]

},
"ru": {
"input": ['cp855', 'cp866', 'gb18030', 'iso8859_5', 'koi8_r', 'mac_cyrillic', 'utf_8'],
"output": ['cp1026', 'cp1256', 'cp437', 'cp775', 'cp850', 'cp852', 'cp863', 'cp866', 'hp_roman8', 'iso8859_10', 'iso8859_11',
'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'iso8859_9', 'koi8_r',
'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman', 'cp1140', 'cp273', 'cp855', 'cp860', 'cp861', 'cp857', 'cp500',
'cp862', 'gb18030']
"input": ["cp855", "cp866", "gb18030", "iso8859_5", "koi8_r", "mac_cyrillic", "utf_8"],
"output": [
"cp1026", "cp1256", "cp437", "cp775", "cp850", "cp852", "cp863", "cp866", "hp_roman8", "iso8859_10", "iso8859_11",
"iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "iso8859_9", "koi8_r",
"mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman", "cp1140", "cp273", "cp855", "cp860", "cp861", "cp857", "cp500",
"cp862", "gb18030"
]

}
}
Loading