Skip to content

TLDR-590 fix code style in scripts directory #400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ exclude =
.github,
*__init__.py,
resources,
scripts,
venv,
build,
dedoc.egg-info
Expand All @@ -23,3 +22,5 @@ exclude =
# ANN101 - type annotations for self
ignore =
ANN101
per-file-ignores =
scripts/*:T201
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
8 changes: 4 additions & 4 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,18 @@ def get_cpu_performance() -> float:


cpu_performance = get_cpu_performance()
print('"cpu_performance" = {}'.format(cpu_performance))
print(f'"cpu_performance" = {cpu_performance}')

with TemporaryDirectory() as path_base:
path_out = os.path.join(path_base, "dataset.zip")
wget.download(data_url, path_out)
with zipfile.ZipFile(path_out, 'r') as zip_ref:
with zipfile.ZipFile(path_out, "r") as zip_ref:
zip_ref.extractall(path_base)
print(path_base)

failed = []
result = OrderedDict()
result["version"] = requests.get("{}/version".format(host)).text
result["version"] = requests.get(f"{host}/version").text
result["cpu_performance"] = cpu_performance
tasks = [
Task("images", "images", {}),
Expand Down Expand Up @@ -90,5 +90,5 @@ def get_cpu_performance() -> float:

with open(path_result, "w") as file_out:
json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
print("save result in" + path_result)
print(f"save result in {path_result}")
print(failed)
6 changes: 3 additions & 3 deletions scripts/benchmark_pdf_miner.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
wget.download(URL, pdfs_zip_path)
wget.download(URL_GT, pdfs_zip_gt_path)

with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(pdfs_zip_path)
with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref:
with zipfile.ZipFile(pdfs_zip_gt_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(pdfs_zip_gt_path)

Expand All @@ -53,7 +53,7 @@
accuracy_path = Path(tmpdir) / "accuracy.txt"
if accuracy_path.exists():
accuracy_path.unlink()
command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}"
command = f'{accuracy_script_path} "{gt_path}" {tmp_ocr_path} >> {accuracy_path}'
os.system(command)

with open(accuracy_path, "r") as f:
Expand Down
25 changes: 11 additions & 14 deletions scripts/benchmark_table/benchmark_table.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import zipfile
from pathlib import Path
import json
import pprint
from typing import Optional, List
import zipfile
from pathlib import Path
from typing import List, Optional

import numpy as np
import wget

Expand Down Expand Up @@ -63,7 +64,7 @@ def download_dataset(data_dir: Path, name_zip: str, url: str) -> None:
pdfs_zip_path = data_dir / name_zip
wget.download(url, str(data_dir))

with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
zip_ref.extractall(data_dir)
pdfs_zip_path.unlink()

Expand All @@ -83,19 +84,17 @@ def benchmark_on_our_data() -> dict:
path_images = data_dir / "images"
path_gt = data_dir / "gt.json"
path_pred = data_dir / "pred.json"
download_dataset(data_dir,
name_zip="benchmark_table_data.zip",
url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")
download_dataset(data_dir, name_zip="benchmark_table_data.zip", url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")

mode_metric_structure_only = False

with open(path_gt, "r") as fp:
gt_json = json.load(fp)
'''
"""
Creating base html (based on method predictions for future labeling)
path_images = data_dir / "images_tmp"
pred_json = prediction("gt_tmp.json", path_images)
'''
"""
pred_json = prediction(path_pred, path_images)
scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only)

Expand All @@ -113,7 +112,7 @@ def benchmark_on_generated_table() -> dict:
Article generation information https://arxiv.org/pdf/1905.13391.pdf
Note: generate the 1st table tape category
Note: don't use header table tag <th>, replacing on <td> tag
Note: all generated data (four categories) you can download from
Note: all generated data (four categories) you can download from
TODO: some tables have low quality. We should trace the reason.
All generated data (all categories) we can download from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU
"""
Expand All @@ -129,7 +128,7 @@ def benchmark_on_generated_table() -> dict:
# make common ground-truth file
common_gt_json = {}
for pathname in Path.iterdir(path_gt):
image_name = pathname.name.split(".")[0] + '.png'
image_name = pathname.name.split(".")[0] + ".png"
with open(pathname, "r") as fp:
table_html = fp.read()
# exclude header tags
Expand All @@ -146,9 +145,7 @@ def benchmark_on_generated_table() -> dict:
path_pred = data_dir / "pred.json"

pred_json = prediction(path_pred, path_images)
scores = call_metric(pred_json=pred_json, true_json=common_gt_json,
structure_only=mode_metric_structure_only,
ignore_nodes=['span', 'style', 'head', 'h4'])
scores = call_metric(pred_json=pred_json, true_json=common_gt_json, structure_only=mode_metric_structure_only, ignore_nodes=["span", "style", "head", "h4"])

result = dict()
result["mode_metric_structure_only"] = mode_metric_structure_only
Expand Down
57 changes: 33 additions & 24 deletions scripts/benchmark_table/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,30 @@

# Source: https://github.com/ibm-aur-nlp/PubTabNet

from collections import deque
from typing import Optional

import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import etree, html
from collections import deque

from tqdm import tqdm


class TableTree(Tree):
def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children):
def __init__(self, tag: str, colspan=None, rowspan=None, content=None, visible=None, *children): # noqa
self.tag = tag
self.colspan = colspan
self.rowspan = rowspan
self.content = content
self.visible = visible
self.children = list(children)

def bracket(self):
"""Show tree using brackets notation
def bracket(self) -> str:
"""
if self.tag == "td" or self.tag == 'th':
Show tree using brackets notation
"""
if self.tag == "td" or self.tag == "th":
result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
else:
result = f'"tag": {self.tag}'
Expand All @@ -43,18 +45,22 @@ def bracket(self):

class CustomConfig(Config):
@staticmethod
def maximum(*sequences):
"""Get maximum possible value
def maximum(*sequences): # noqa
"""
Get maximum possible value
"""
return max(map(len, sequences))

def normalized_distance(self, *sequences) -> float:
"""Get distance from 0 to 1
def normalized_distance(self, *sequences) -> float: # noqa
"""
Get distance from 0 to 1
"""
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

def rename(self, node1: TableTree, node2: TableTree) -> float:
"""Compares attributes of trees"""
"""
Compares attributes of trees
"""
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
return 1.
if node1.tag == "td":
Expand All @@ -66,18 +72,20 @@ def rename(self, node1: TableTree, node2: TableTree) -> float:


class TEDS(object):
""" Tree Edit Distance based Similarity
"""
Tree Edit Distance based Similarity
"""

def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
def __init__(self, structure_only: bool = False, n_jobs: int = 1, ignore_nodes: Optional[list] = None) -> None:
assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1"
self.structure_only = structure_only
self.n_jobs = n_jobs
self.ignore_nodes = ignore_nodes
self.__tokens__ = []

def tokenize(self, node):
""" Tokenizes table cells
def tokenize(self, node: TableTree) -> None:
"""
Tokenizes table cells
"""
self.__tokens__.append(f"<{node.tag}>")
if node.text is not None:
Expand All @@ -89,11 +97,11 @@ def tokenize(self, node):
if node.tag != "td" and node.tail is not None:
self.__tokens__ += list(node.tail)

def get_span(self, node, name_span: str) -> int:
def get_span(self, node: TableTree, name_span: str) -> int:
value = int(node.attrib.get(name_span, "1"))
return 1 if value <= 0 else value

def load_html_tree(self, node, parent=None):
def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> TableTree:
""" Converts HTML tree to the format required by apted
"""
if node.tag == "td":
Expand All @@ -109,7 +117,7 @@ def load_html_tree(self, node, parent=None):
colspan=self.get_span(node, "colspan"),
rowspan=self.get_span(node, "rowspan"),
content=cell,
visible=False if node.attrib.get("style") == "display: none" else True, *deque())
visible=node.attrib.get("style") != "display: none", *deque()) # noqa
except Exception as ex:
print(f"Bad html file. HTML parse exception. Exception's msg: {ex}")
raise ex
Expand Down Expand Up @@ -148,12 +156,13 @@ def evaluate(self, pred: str, true: str) -> float:
else:
return 0.0

def batch_evaluate(self, pred_json, true_json):
""" Computes TEDS score between the prediction and the ground truth of
a batch of samples
@params pred_json: {'FILENAME': 'HTML CODE', ...}
@params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
@output: {'FILENAME': 'TEDS SCORE', ...}
def batch_evaluate(self, pred_json: dict, true_json: dict) -> dict:
"""
Computes TEDS score between the prediction and the ground truth of a batch of samples

:param pred_json: {'FILENAME': 'HTML CODE', ...}
:param true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
:return: {'FILENAME': 'TEDS SCORE', ...}
"""
samples = true_json.keys()
scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)]
Expand Down
10 changes: 5 additions & 5 deletions scripts/benchmark_tl_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")

host = "http://localhost:1231"
param_dist_errors = namedtuple('Param', ('total_file_size', 'total_incorrect_files', 'failed'))
param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))


def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
Expand Down Expand Up @@ -49,7 +49,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
if not os.path.isdir(benchmark_data_dir):
path_out = os.path.join(data_dir, "data_with_text_layer.zip")
wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
with zipfile.ZipFile(path_out, 'r') as zip_ref:
with zipfile.ZipFile(path_out, "r") as zip_ref:
zip_ref.extractall(data_dir)
os.remove(path_out)
print(f"Benchmark data downloaded to {benchmark_data_dir}")
Expand All @@ -63,15 +63,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
parameters = dict(pdf_with_text_layer="auto", pages="1:1")
result_item = OrderedDict()

incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' incorrect ', 'data_correct_text_layer', parameters)
incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed

correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' correct ', 'data_incorrect_text_layer', parameters)
correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
result["guessing_the_correctness_of_the_text"] = result_item

with open(path_result, "w") as file_out:
json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
print("Save result in" + path_result)
print(f"Save result in {path_result}")
36 changes: 20 additions & 16 deletions scripts/create_txtlayer_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@

class CorrectTextGenerator:
def __init__(self) -> None:
self.citation = re.compile(r'\[\d+]')
self.meta = re.compile(r'\[править \| править код]')
self.symbols = re.compile(r'[→←↑]')
self.citation = re.compile(r"\[\d+]")
self.meta = re.compile(r"\[править \| править код]")
self.symbols = re.compile(r"[→←↑]")

self.title_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=query&format=json&list=random&rnlimit=1&rnnamespace=0"
self.article_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=parse&format=json&page={title}&prop=text"
Expand All @@ -37,15 +37,15 @@ def get_random_text(self, lang: str) -> str:
# 2 - Get text the article
article_result = requests.post(self.article_url.format(lang=lang, title=title))
article_result_dict = article_result.json()
article = article_result_dict["parse"]["text"]['*']
bs = BeautifulSoup(article, 'html.parser')
article = article_result_dict["parse"]["text"]["*"]
bs = BeautifulSoup(article, "html.parser")
article_text = bs.get_text()

# 3 - Clear text of the article from unused symbols
article_text_fixed = re.sub(self.citation, '', article_text)
article_text_fixed = re.sub(self.citation, "", article_text)
article_text_fixed = re.sub(self.meta, "", article_text_fixed)
article_text_fixed = re.sub(self.symbols, "", article_text_fixed)
article_text_fixed = re.sub(r'\n+', "\n", article_text_fixed)
article_text_fixed = re.sub(r"\n+", "\n", article_text_fixed)
except: # noqa
article_text_fixed = ""

Expand All @@ -62,18 +62,22 @@ class EncodingCorruptor(Corruptor):
def __init__(self) -> None:
self.encodings = {
"en": {
"input": ['cp1026'],
"output": ['cp1256', 'cp437', 'cp775', 'cp852', 'cp855', 'cp857', 'cp860', 'cp861', 'cp862', 'cp863', 'cp866', 'gb18030', 'hp_roman8',
'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'koi8_r',
'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman']
"input": ["cp1026"],
"output": [
"cp1256", "cp437", "cp775", "cp852", "cp855", "cp857", "cp860", "cp861", "cp862", "cp863", "cp866", "gb18030", "hp_roman8",
"iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "koi8_r",
"mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman"
]

},
"ru": {
"input": ['cp855', 'cp866', 'gb18030', 'iso8859_5', 'koi8_r', 'mac_cyrillic', 'utf_8'],
"output": ['cp1026', 'cp1256', 'cp437', 'cp775', 'cp850', 'cp852', 'cp863', 'cp866', 'hp_roman8', 'iso8859_10', 'iso8859_11',
'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'iso8859_9', 'koi8_r',
'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman', 'cp1140', 'cp273', 'cp855', 'cp860', 'cp861', 'cp857', 'cp500',
'cp862', 'gb18030']
"input": ["cp855", "cp866", "gb18030", "iso8859_5", "koi8_r", "mac_cyrillic", "utf_8"],
"output": [
"cp1026", "cp1256", "cp437", "cp775", "cp850", "cp852", "cp863", "cp866", "hp_roman8", "iso8859_10", "iso8859_11",
"iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "iso8859_9", "koi8_r",
"mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman", "cp1140", "cp273", "cp855", "cp860", "cp861", "cp857", "cp500",
"cp862", "gb18030"
]

}
}
Expand Down
Loading