ispras · dronperminov · Feb 1, 2024 · Feb 1, 2024 · Feb 1, 2024 · Feb 1, 2024
diff --git a/.flake8 b/.flake8
@@ -14,7 +14,6 @@ exclude =
     .github,
     *__init__.py,
     resources,
-    scripts,
     venv,
     build,
     dedoc.egg-info
@@ -23,3 +22,5 @@ exclude =
 # ANN101 - type annotations for self
 ignore =
     ANN101
+per-file-ignores =
+    scripts/*:T201
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
-        exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info
+        exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
         args:
             - "--config=.flake8"
         additional_dependencies: [

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -33,18 +33,18 @@ def get_cpu_performance() -> float:
 
 
 cpu_performance = get_cpu_performance()
-print('"cpu_performance" = {}'.format(cpu_performance))
+print(f'"cpu_performance" = {cpu_performance}')
 
 with TemporaryDirectory() as path_base:
     path_out = os.path.join(path_base, "dataset.zip")
     wget.download(data_url, path_out)
-    with zipfile.ZipFile(path_out, 'r') as zip_ref:
+    with zipfile.ZipFile(path_out, "r") as zip_ref:
         zip_ref.extractall(path_base)
     print(path_base)
 
     failed = []
     result = OrderedDict()
-    result["version"] = requests.get("{}/version".format(host)).text
+    result["version"] = requests.get(f"{host}/version").text
     result["cpu_performance"] = cpu_performance
     tasks = [
         Task("images", "images", {}),
@@ -90,5 +90,5 @@ def get_cpu_performance() -> float:
 
     with open(path_result, "w") as file_out:
         json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
-    print("save result in" + path_result)
+    print(f"save result in {path_result}")
     print(failed)
diff --git a/scripts/benchmark_pdf_miner.py b/scripts/benchmark_pdf_miner.py
@@ -24,10 +24,10 @@
         wget.download(URL, pdfs_zip_path)
         wget.download(URL_GT, pdfs_zip_gt_path)
 
-        with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
+        with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
             zip_ref.extractall(data_dir)
         os.remove(pdfs_zip_path)
-        with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref:
+        with zipfile.ZipFile(pdfs_zip_gt_path, "r") as zip_ref:
             zip_ref.extractall(data_dir)
         os.remove(pdfs_zip_gt_path)
 
@@ -53,7 +53,7 @@
             accuracy_path = Path(tmpdir) / "accuracy.txt"
             if accuracy_path.exists():
                 accuracy_path.unlink()
-            command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}"
+            command = f'{accuracy_script_path} "{gt_path}" {tmp_ocr_path} >> {accuracy_path}'
             os.system(command)
 
             with open(accuracy_path, "r") as f:

diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py
@@ -1,8 +1,9 @@
-import zipfile
-from pathlib import Path
 import json
 import pprint
-from typing import Optional, List
+import zipfile
+from pathlib import Path
+from typing import List, Optional
+
 import numpy as np
 import wget
 
@@ -63,7 +64,7 @@ def download_dataset(data_dir: Path, name_zip: str, url: str) -> None:
     pdfs_zip_path = data_dir / name_zip
     wget.download(url, str(data_dir))
 
-    with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
+    with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref:
         zip_ref.extractall(data_dir)
     pdfs_zip_path.unlink()
 
@@ -83,19 +84,17 @@ def benchmark_on_our_data() -> dict:
     path_images = data_dir / "images"
     path_gt = data_dir / "gt.json"
     path_pred = data_dir / "pred.json"
-    download_dataset(data_dir,
-                     name_zip="benchmark_table_data.zip",
-                     url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")
+    download_dataset(data_dir, name_zip="benchmark_table_data.zip", url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download")
 
     mode_metric_structure_only = False
 
     with open(path_gt, "r") as fp:
         gt_json = json.load(fp)
-    '''
+    """
     Creating base html (based on method predictions for future labeling)
     path_images = data_dir / "images_tmp"
     pred_json = prediction("gt_tmp.json", path_images)
-    '''
+    """
     pred_json = prediction(path_pred, path_images)
     scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only)
 
@@ -113,7 +112,7 @@ def benchmark_on_generated_table() -> dict:
     Article generation information https://arxiv.org/pdf/1905.13391.pdf
     Note: generate the 1st table tape category
     Note: don't use header table tag <th>, replacing on <td> tag
-    Note: all generated data (four categories) you can download from 
+    Note: all generated data (four categories) can be downloaded from the link at the end of this docstring
     TODO: some tables have a low quality. Should to trace the reason.
     All generated data (all categories) we can download from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU
     """
@@ -129,7 +128,7 @@ def benchmark_on_generated_table() -> dict:
     # make common ground-truth file
     common_gt_json = {}
     for pathname in Path.iterdir(path_gt):
-        image_name = pathname.name.split(".")[0] + '.png'
+        image_name = pathname.name.split(".")[0] + ".png"
         with open(pathname, "r") as fp:
             table_html = fp.read()
             # exclude header tags
@@ -146,9 +145,7 @@ def benchmark_on_generated_table() -> dict:
     path_pred = data_dir / "pred.json"
 
     pred_json = prediction(path_pred, path_images)
-    scores = call_metric(pred_json=pred_json, true_json=common_gt_json,
-                         structure_only=mode_metric_structure_only,
-                         ignore_nodes=['span', 'style', 'head', 'h4'])
+    scores = call_metric(pred_json=pred_json, true_json=common_gt_json, structure_only=mode_metric_structure_only, ignore_nodes=["span", "style", "head", "h4"])
 
     result = dict()
     result["mode_metric_structure_only"] = mode_metric_structure_only

diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py
@@ -11,28 +11,30 @@
 
 # Source: https://github.com/ibm-aur-nlp/PubTabNet
 
+from collections import deque
+from typing import Optional
+
 import distance
 from apted import APTED, Config
 from apted.helpers import Tree
 from lxml import etree, html
-from collections import deque
-
 from tqdm import tqdm
 
 
 class TableTree(Tree):
-    def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children):
+    def __init__(self, tag: str, colspan=None, rowspan=None, content=None, visible=None, *children):  # noqa
         self.tag = tag
         self.colspan = colspan
         self.rowspan = rowspan
         self.content = content
         self.visible = visible
         self.children = list(children)
 
-    def bracket(self):
-        """Show tree using brackets notation
+    def bracket(self) -> str:
         """
-        if self.tag == "td" or self.tag == 'th':
+        Show tree using brackets notation
+        """
+        if self.tag == "td" or self.tag == "th":
             result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}'
         else:
             result = f'"tag": {self.tag}'
@@ -43,18 +45,22 @@ def bracket(self):
 
 class CustomConfig(Config):
     @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
+    def maximum(*sequences):  # noqa
+        """
+        Get maximum possible value
         """
         return max(map(len, sequences))
 
-    def normalized_distance(self, *sequences) -> float:
-        """Get distance from 0 to 1
+    def normalized_distance(self, *sequences) -> float:  # noqa
+        """
+        Get distance from 0 to 1
         """
         return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
 
     def rename(self, node1: TableTree, node2: TableTree) -> float:
-        """Compares attributes of trees"""
+        """
+        Compares attributes of trees
+        """
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
             return 1.
         if node1.tag == "td":
@@ -66,18 +72,20 @@ def rename(self, node1: TableTree, node2: TableTree) -> float:
 
 
 class TEDS(object):
-    """ Tree Edit Distance based Similarity
+    """
+    Tree Edit Distance based Similarity
     """
 
-    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
+    def __init__(self, structure_only: bool = False, n_jobs: int = 1, ignore_nodes: Optional[list] = None) -> None:
         assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1"
         self.structure_only = structure_only
         self.n_jobs = n_jobs
         self.ignore_nodes = ignore_nodes
         self.__tokens__ = []
 
-    def tokenize(self, node):
-        """ Tokenizes table cells
+    def tokenize(self, node: etree._Element) -> None:  # node is a parsed HTML element (.text/.tail are read), not a TableTree
+        """
+        Appends the tokens of a table cell element to self.__tokens__
         """
         self.__tokens__.append(f"<{node.tag}>")
         if node.text is not None:
@@ -89,11 +97,11 @@ def tokenize(self, node):
         if node.tag != "td" and node.tail is not None:
             self.__tokens__ += list(node.tail)
 
-    def get_span(self, node, name_span: str) -> int:
+    def get_span(self, node: etree._Element, name_span: str) -> int:  # reads the span attribute of an HTML element; non-positive values become 1
         value = int(node.attrib.get(name_span, "1"))
         return 1 if value <= 0 else value
 
-    def load_html_tree(self, node, parent=None):
+    def load_html_tree(self, node: etree._Element, parent: Optional[TableTree] = None) -> TableTree:  # node is the HTML element being converted (its .attrib is read)
         """ Converts HTML tree to the format required by apted
         """
         if node.tag == "td":
@@ -109,7 +117,7 @@ def load_html_tree(self, node, parent=None):
                                      colspan=self.get_span(node, "colspan"),
                                      rowspan=self.get_span(node, "rowspan"),
                                      content=cell,
-                                     visible=False if node.attrib.get("style") == "display: none" else True, *deque())
+                                     visible=node.attrib.get("style") != "display: none", *deque())  # noqa
             except Exception as ex:
                 print(f"Bad html file. HTML parse exception. Exception's msg: {ex}")
                 raise ex
@@ -148,12 +156,13 @@ def evaluate(self, pred: str, true: str) -> float:
         else:
             return 0.0
 
-    def batch_evaluate(self, pred_json, true_json):
-        """ Computes TEDS score between the prediction and the ground truth of
-            a batch of samples
-            @params pred_json: {'FILENAME': 'HTML CODE', ...}
-            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
-            @output: {'FILENAME': 'TEDS SCORE', ...}
+    def batch_evaluate(self, pred_json: dict, true_json: dict) -> dict:
+        """
+        Computes TEDS score between the prediction and the ground truth of a batch of samples
+
+        :param pred_json: {'FILENAME': 'HTML CODE', ...}
+        :param true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
+        :return: {'FILENAME': 'TEDS SCORE', ...}
         """
         samples = true_json.keys()
         scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)]

diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py
@@ -15,7 +15,7 @@
 path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")
 
 host = "http://localhost:1231"
-param_dist_errors = namedtuple('Param', ('total_file_size', 'total_incorrect_files', 'failed'))
+param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))
 
 
 def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
@@ -49,7 +49,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
     if not os.path.isdir(benchmark_data_dir):
         path_out = os.path.join(data_dir, "data_with_text_layer.zip")
         wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
-        with zipfile.ZipFile(path_out, 'r') as zip_ref:
+        with zipfile.ZipFile(path_out, "r") as zip_ref:
             zip_ref.extractall(data_dir)
         os.remove(path_out)
         print(f"Benchmark data downloaded to {benchmark_data_dir}")
@@ -63,15 +63,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
     parameters = dict(pdf_with_text_layer="auto", pages="1:1")
     result_item = OrderedDict()
 
-    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' incorrect ', 'data_correct_text_layer', parameters)
+    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
     result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
     result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed
 
-    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' correct ', 'data_incorrect_text_layer', parameters)
+    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
     result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
     result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
     result["guessing_the_correctness_of_the_text"] = result_item
 
     with open(path_result, "w") as file_out:
         json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
-    print("Save result in" + path_result)
+    print(f"Save result in {path_result}")
diff --git a/scripts/create_txtlayer_dataset.py b/scripts/create_txtlayer_dataset.py
@@ -17,9 +17,9 @@
 
 class CorrectTextGenerator:
     def __init__(self) -> None:
-        self.citation = re.compile(r'\[\d+]')
-        self.meta = re.compile(r'\[править \| править код]')
-        self.symbols = re.compile(r'[→←↑]')
+        self.citation = re.compile(r"\[\d+]")
+        self.meta = re.compile(r"\[править \| править код]")
+        self.symbols = re.compile(r"[→←↑]")
 
         self.title_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=query&format=json&list=random&rnlimit=1&rnnamespace=0"
         self.article_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=parse&format=json&page={title}&prop=text"
@@ -37,15 +37,15 @@ def get_random_text(self, lang: str) -> str:
                 # 2 - Get text the article
                 article_result = requests.post(self.article_url.format(lang=lang, title=title))
                 article_result_dict = article_result.json()
-                article = article_result_dict["parse"]["text"]['*']
-                bs = BeautifulSoup(article, 'html.parser')
+                article = article_result_dict["parse"]["text"]["*"]
+                bs = BeautifulSoup(article, "html.parser")
                 article_text = bs.get_text()
 
                 # 3 - Clear text of the article from unused symbols
-                article_text_fixed = re.sub(self.citation, '', article_text)
+                article_text_fixed = re.sub(self.citation, "", article_text)
                 article_text_fixed = re.sub(self.meta, "", article_text_fixed)
                 article_text_fixed = re.sub(self.symbols, "", article_text_fixed)
-                article_text_fixed = re.sub(r'\n+', "\n", article_text_fixed)
+                article_text_fixed = re.sub(r"\n+", "\n", article_text_fixed)
             except:  # noqa
                 article_text_fixed = ""
 
@@ -62,18 +62,22 @@ class EncodingCorruptor(Corruptor):
     def __init__(self) -> None:
         self.encodings = {
             "en": {
-                "input": ['cp1026'],
-                "output": ['cp1256', 'cp437', 'cp775', 'cp852', 'cp855', 'cp857', 'cp860', 'cp861', 'cp862', 'cp863', 'cp866', 'gb18030', 'hp_roman8',
-                           'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'koi8_r',
-                           'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman']
+                "input": ["cp1026"],
+                "output": [
+                    "cp1256", "cp437", "cp775", "cp852", "cp855", "cp857", "cp860", "cp861", "cp862", "cp863", "cp866", "gb18030", "hp_roman8",
+                    "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "koi8_r",
+                    "mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman"
+                ]
 
             },
             "ru": {
-                "input": ['cp855', 'cp866', 'gb18030', 'iso8859_5', 'koi8_r', 'mac_cyrillic', 'utf_8'],
-                "output": ['cp1026', 'cp1256', 'cp437', 'cp775', 'cp850', 'cp852', 'cp863', 'cp866', 'hp_roman8', 'iso8859_10', 'iso8859_11',
-                           'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'iso8859_9', 'koi8_r',
-                           'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman', 'cp1140', 'cp273', 'cp855', 'cp860', 'cp861', 'cp857', 'cp500',
-                           'cp862', 'gb18030']
+                "input": ["cp855", "cp866", "gb18030", "iso8859_5", "koi8_r", "mac_cyrillic", "utf_8"],
+                "output": [
+                    "cp1026", "cp1256", "cp437", "cp775", "cp850", "cp852", "cp863", "cp866", "hp_roman8", "iso8859_10", "iso8859_11",
+                    "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "iso8859_9", "koi8_r",
+                    "mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman", "cp1140", "cp273", "cp855", "cp860", "cp861", "cp857", "cp500",
+                    "cp862", "gb18030"
+                ]
 
             }
         }