From 18463e802b7c0688c93a61025588bd2934a3f790 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 1 Feb 2024 13:09:29 +0300 Subject: [PATCH 1/3] TLDR-590 fix code style in scripts directory --- .flake8 | 1 - .pre-commit-config.yaml | 2 +- scripts/benchmark.py | 14 ++-- scripts/benchmark_pdf_attachments.py | 16 ++-- scripts/benchmark_pdf_miner.py | 12 +-- scripts/benchmark_table/benchmark_table.py | 31 ++++---- scripts/benchmark_table/metric.py | 59 ++++++++------- scripts/benchmark_tl_correctness.py | 16 ++-- scripts/create_txtlayer_dataset.py | 42 ++++++----- .../calc_tesseract_benchmarks.py | 52 +++++++------ scripts/tesseract_benchmark/ocr_correction.py | 12 ++- .../text_blob_correction.py | 2 +- scripts/test_words_bbox_extraction.py | 60 +++++++++------ .../train/train_acc_orientation_classifier.py | 75 +++++++------------ .../train/train_diploma_line_classifier.py | 12 ++- scripts/train/train_law_line_classifier.py | 10 +-- scripts/train/train_mle_language_model.py | 18 ++--- scripts/train/train_nn_line_classifier_law.py | 14 ++-- scripts/train/train_paragraph_classifier.py | 10 +-- scripts/train/train_txtlayer_classifier.py | 22 +++--- scripts/train/train_tz_line_classifier.py | 10 +-- 21 files changed, 242 insertions(+), 248 deletions(-) diff --git a/.flake8 b/.flake8 index 8d7e241f..4c24c4a3 100644 --- a/.flake8 +++ b/.flake8 @@ -14,7 +14,6 @@ exclude = .github, *__init__.py, resources, - scripts, venv, build, dedoc.egg-info diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7c5d542c..76ee04b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info + exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info args: - "--config=.flake8" additional_dependencies: [ diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 4492af03..69276d70 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -33,18 +33,18 @@ def get_cpu_performance() -> float: cpu_performance = get_cpu_performance() -print('"cpu_performance" = {}'.format(cpu_performance)) +print(f'"cpu_performance" = {cpu_performance}') # noqa with TemporaryDirectory() as path_base: path_out = os.path.join(path_base, "dataset.zip") wget.download(data_url, path_out) - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(path_base) - print(path_base) + print(path_base) # noqa failed = [] result = OrderedDict() - result["version"] = requests.get("{}/version".format(host)).text + result["version"] = requests.get(f"{host}/version").text result["cpu_performance"] = cpu_performance tasks = [ Task("images", "images", {}), @@ -60,7 +60,7 @@ def get_cpu_performance() -> float: Task("pdf", "pdf", {"pdf_with_text_layer": "false"}), Task("pdf_tables", "pdf_tables", {}) ] - print(tasks) + print(tasks) # noqa for directory, name, parameters in tasks: total_size = 0 total_time = 0 @@ -90,5 +90,5 @@ def get_cpu_performance() -> float: with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print("save result in" + path_result) - print(failed) + print(f"save result in {path_result}") # noqa + print(failed) # noqa diff --git a/scripts/benchmark_pdf_attachments.py b/scripts/benchmark_pdf_attachments.py index 411f1275..f069685c 100644 --- a/scripts/benchmark_pdf_attachments.py +++ b/scripts/benchmark_pdf_attachments.py @@ -39,7 +39,7 @@ def 
get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") + print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") # noqa result_dict[file_name] = sorted(attachment_names) return result_dict @@ -70,7 +70,7 @@ def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_d shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") + print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") # noqa result_dict[file_name] = sorted(attachment_names) return result_dict @@ -99,9 +99,9 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i zip_ref.extractall(data_dir) os.remove(archive_path) - print(f"Benchmark data downloaded to {data_dir}") + print(f"Benchmark data downloaded to {data_dir}") # noqa else: - print(f"Use cached benchmark data from {data_dir}") + print(f"Use cached benchmark data from {data_dir}") # noqa in_dir = os.path.join(data_dir, "with_attachments") out_dir = os.path.join(in_dir, "extracted_attachments") @@ -112,17 +112,17 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i benchmarks_dict = {} - print("Get tabby attachments") + print("Get tabby attachments") # noqa tabby_reader = PdfTabbyReader(config={}) tabby_out_dir = os.path.join(out_dir, "tabby") benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir) - print("Get pdfminer attachments") + print("Get pdfminer attachments") # noqa pdfminer_reader = PdfTxtlayerReader(config={}) pdfminer_out_dir = os.path.join(out_dir, "pdfminer") benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir) - print("Get common attachments") + print("Get common attachments") # noqa common_out_dir = os.path.join(out_dir, "common") pdf_attachments_extractor = PDFAttachmentsExtractor(config={}) benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir) @@ -131,4 +131,4 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f: json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2) - print(f"Attachments were extracted to {out_dir}") + print(f"Attachments were extracted to {out_dir}") # noqa diff --git a/scripts/benchmark_pdf_miner.py b/scripts/benchmark_pdf_miner.py index b8870ed5..b7c5d785 100644 --- a/scripts/benchmark_pdf_miner.py +++ b/scripts/benchmark_pdf_miner.py @@ -24,16 +24,16 @@ wget.download(URL, pdfs_zip_path) wget.download(URL_GT, pdfs_zip_gt_path) - with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(pdfs_zip_path) - with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref: + with zipfile.ZipFile(pdfs_zip_gt_path, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(pdfs_zip_gt_path) - print(f"Benchmark data downloaded to {data_dir}") + print(f"Benchmark data 
downloaded to {data_dir}") # noqa else: - print(f"Use cached benchmark data from {data_dir}") + print(f"Use cached benchmark data from {data_dir}") # noqa pdfs_path = data_dir / "PdfMiner Params" pdfs_gt_path = data_dir / "PdfMiner Params GT" @@ -53,7 +53,7 @@ accuracy_path = Path(tmpdir) / "accuracy.txt" if accuracy_path.exists(): accuracy_path.unlink() - command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}" + command = f'{accuracy_script_path} "{gt_path}" {tmp_ocr_path} >> {accuracy_path}' os.system(command) with open(accuracy_path, "r") as f: @@ -68,4 +68,4 @@ with (Path(output_dir) / "benchmark_pdf_miner.json").open("w") as f: json.dump(info, f, ensure_ascii=False, indent=2) - print(f"save result in {output_dir}") + print(f"save result in {output_dir}") # noqa diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index c6cbd7cb..8d5b7a81 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -1,8 +1,9 @@ -import zipfile -from pathlib import Path import json import pprint -from typing import Optional, List +import zipfile +from pathlib import Path +from typing import List, Optional + import numpy as np import wget @@ -47,7 +48,7 @@ def get_tables(image_path: Path) -> str: def make_predict_json(data_path: Path) -> dict: predict_json = {} for pathname in Path.iterdir(data_path): - print(pathname) + print(pathname) # noqa predict_json[pathname.name] = {"html": "<html><body>" + get_tables(pathname) + "</body></html>"} @@ -56,18 +57,18 @@ def make_predict_json(data_path: Path) -> dict: def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: if Path.exists(data_dir): - print(f"Use cached benchmark data from {data_dir}") + print(f"Use cached benchmark data from {data_dir}") # noqa return data_dir.mkdir(parents=True, exist_ok=True) pdfs_zip_path = data_dir / name_zip wget.download(url, str(data_dir)) - with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref: zip_ref.extractall(data_dir) pdfs_zip_path.unlink() - print(f"Benchmark data downloaded to {data_dir}") + print(f"Benchmark data downloaded to {data_dir}") # noqa def prediction(path_pred: Path, path_images: Path) -> dict: @@ -83,19 +84,17 @@ def benchmark_on_our_data() -> dict: path_images = data_dir / "images" path_gt = data_dir / "gt.json" path_pred = data_dir / "pred.json" - download_dataset(data_dir, - name_zip="benchmark_table_data.zip", - url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download") + download_dataset(data_dir, name_zip="benchmark_table_data.zip", url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download") mode_metric_structure_only = False with open(path_gt, "r") as fp: gt_json = json.load(fp) - ''' + """ Creating base html (based on method predictions for future labeling) path_images = data_dir / "images_tmp" pred_json = prediction("gt_tmp.json", path_images) - ''' + """ pred_json = prediction(path_pred, path_images) scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only) @@ -113,7 +112,7 @@ def benchmark_on_generated_table() -> dict: Article generation information https://arxiv.org/pdf/1905.13391.pdf Note: generate the 1st table tape category Note: don't use header table tag <th>, replacing on tag <td> - Note: all generated data (four categories) you can download from + TODO: some tables have a low quality.
Should to trace the reason. All generated data (all categories) we can download from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU """ @@ -129,7 +128,7 @@ def benchmark_on_generated_table() -> dict: # make common ground-truth file common_gt_json = {} for pathname in Path.iterdir(path_gt): - image_name = pathname.name.split(".")[0] + '.png' + image_name = pathname.name.split(".")[0] + ".png" with open(pathname, "r") as fp: table_html = fp.read() # exclude header tags @@ -146,9 +145,7 @@ def benchmark_on_generated_table() -> dict: path_pred = data_dir / "pred.json" pred_json = prediction(path_pred, path_images) - scores = call_metric(pred_json=pred_json, true_json=common_gt_json, - structure_only=mode_metric_structure_only, - ignore_nodes=['span', 'style', 'head', 'h4']) + scores = call_metric(pred_json=pred_json, true_json=common_gt_json, structure_only=mode_metric_structure_only, ignore_nodes=["span", "style", "head", "h4"]) result = dict() result["mode_metric_structure_only"] = mode_metric_structure_only diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index ff84a4a7..d306247d 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -11,17 +11,18 @@ # Source: https://github.com/ibm-aur-nlp/PubTabNet +from collections import deque +from typing import Optional + import distance from apted import APTED, Config from apted.helpers import Tree from lxml import etree, html -from collections import deque - from tqdm import tqdm class TableTree(Tree): - def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children): + def __init__(self, tag: str, colspan=None, rowspan=None, content=None, visible=None, *children): # noqa self.tag = tag self.colspan = colspan self.rowspan = rowspan @@ -29,10 +30,11 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, self.visible = visible self.children = list(children) - def bracket(self): - """Show tree using brackets notation + def bracket(self) -> str: """ - if self.tag == "td" or self.tag == 'th': + Show tree using brackets notation + """ + if self.tag == "td" or self.tag == "th": result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}' else: result = f'"tag": {self.tag}' @@ -43,18 +45,22 @@ def bracket(self): class CustomConfig(Config): @staticmethod - def maximum(*sequences): - """Get maximum possible value + def maximum(*sequences): # noqa + """ + Get maximum possible value """ return max(map(len, sequences)) - def normalized_distance(self, *sequences) -> float: - """Get distance from 0 to 1 + def normalized_distance(self, *sequences) -> float: # noqa + """ + Get distance from 0 to 1 """ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) def rename(self, node1: TableTree, node2: TableTree) -> float: - """Compares attributes of trees""" + """ + Compares attributes of trees + """ if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): return 1. 
if node1.tag == "td": @@ -66,18 +72,20 @@ def rename(self, node1: TableTree, node2: TableTree) -> float: class TEDS(object): - """ Tree Edit Distance based Similarity + """ + Tree Edit Distance based Similarity """ - def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): + def __init__(self, structure_only: bool = False, n_jobs: int = 1, ignore_nodes: Optional[list] = None) -> None: assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1" self.structure_only = structure_only self.n_jobs = n_jobs self.ignore_nodes = ignore_nodes self.__tokens__ = [] - def tokenize(self, node): - """ Tokenizes table cells + def tokenize(self, node: TableTree) -> None: + """ + Tokenizes table cells """ self.__tokens__.append(f"<{node.tag}>") if node.text is not None: @@ -89,11 +97,11 @@ def tokenize(self, node): if node.tag != "td" and node.tail is not None: self.__tokens__ += list(node.tail) - def get_span(self, node, name_span: str) -> int: + def get_span(self, node: TableTree, name_span: str) -> int: value = int(node.attrib.get(name_span, "1")) return 1 if value <= 0 else value - def load_html_tree(self, node, parent=None): + def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> TableTree: """ Converts HTML tree to the format required by apted """ if node.tag == "td": @@ -109,9 +117,9 @@ def load_html_tree(self, node, parent=None): colspan=self.get_span(node, "colspan"), rowspan=self.get_span(node, "rowspan"), content=cell, - visible=False if node.attrib.get("style") == "display: none" else True, *deque()) + visible=False if node.attrib.get("style") == "display: none" else True, *deque()) # noqa except Exception as ex: - print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") + print(f"Bad html file. HTML parse exception. 
Exception's msg: {ex}") # noqa raise ex else: new_node = TableTree(node.tag, None, None, None, True, *deque()) @@ -148,12 +156,13 @@ def evaluate(self, pred: str, true: str) -> float: else: return 0.0 - def batch_evaluate(self, pred_json, true_json): - """ Computes TEDS score between the prediction and the ground truth of - a batch of samples - @params pred_json: {'FILENAME': 'HTML CODE', ...} - @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} - @output: {'FILENAME': 'TEDS SCORE', ...} + def batch_evaluate(self, pred_json: dict, true_json: dict) -> dict: + """ + Computes TEDS score between the prediction and the ground truth of a batch of samples + + :param pred_json: {'FILENAME': 'HTML CODE', ...} + :param true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} + :return: {'FILENAME': 'TEDS SCORE', ...} """ samples = true_json.keys() scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)] diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index d237e608..50dec7ae 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -15,7 +15,7 @@ path_result = os.path.join(path_result, "benchmarks_tl_correctness.json") host = "http://localhost:1231" -param_dist_errors = namedtuple('Param', ('total_file_size', 'total_incorrect_files', 'failed')) +param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed")) def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple: @@ -24,7 +24,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para directory = os.path.join(path_base, tl_path) files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")] total_file_size = len(files_list) - print(f"Files: {files_list}\nFiles number: {total_file_size}") + print(f"Files: {files_list}\nFiles number: {total_file_size}") # noqa for file in tqdm(files_list): file_path = os.path.join(directory, file) r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) @@ -49,12 +49,12 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para if not os.path.isdir(benchmark_data_dir): path_out = os.path.join(data_dir, "data_with_text_layer.zip") wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out) - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Benchmark data downloaded to {benchmark_data_dir}") + print(f"Benchmark data downloaded to {benchmark_data_dir}") # noqa else: - print(f"Use cached benchmark data from {benchmark_data_dir}") + print(f"Use cached benchmark data from {benchmark_data_dir}") # noqa assert os.path.isdir(benchmark_data_dir) @@ -63,15 +63,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para parameters = dict(pdf_with_text_layer="auto", pages="1:1") result_item = OrderedDict() - incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' incorrect ', 'data_correct_text_layer', parameters) + incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters) result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size result_item["list_of_file_with_incorrect_tl"] = 
incorrect_tl_result.failed - correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' correct ', 'data_incorrect_text_layer', parameters) + correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters) result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed result["guessing_the_correctness_of_the_text"] = result_item with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print("Save result in" + path_result) + print(f"Save result in {path_result}") # noqa diff --git a/scripts/create_txtlayer_dataset.py b/scripts/create_txtlayer_dataset.py index 2e1db6ff..a26c7995 100644 --- a/scripts/create_txtlayer_dataset.py +++ b/scripts/create_txtlayer_dataset.py @@ -17,9 +17,9 @@ class CorrectTextGenerator: def __init__(self) -> None: - self.citation = re.compile(r'\[\d+]') - self.meta = re.compile(r'\[править \| править код]') - self.symbols = re.compile(r'[→←↑]') + self.citation = re.compile(r"\[\d+]") + self.meta = re.compile(r"\[править \| править код]") + self.symbols = re.compile(r"[→←↑]") self.title_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=query&format=json&list=random&rnlimit=1&rnnamespace=0" self.article_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=parse&format=json&page={title}&prop=text" @@ -37,15 +37,15 @@ def get_random_text(self, lang: str) -> str: # 2 - Get text the article article_result = requests.post(self.article_url.format(lang=lang, title=title)) article_result_dict = article_result.json() - article = article_result_dict["parse"]["text"]['*'] - bs = BeautifulSoup(article, 'html.parser') + article = article_result_dict["parse"]["text"]["*"] + bs = BeautifulSoup(article, "html.parser") article_text = bs.get_text() # 3 - Clear text of the article from unused symbols - article_text_fixed = re.sub(self.citation, '', article_text) + article_text_fixed = re.sub(self.citation, "", article_text) article_text_fixed = re.sub(self.meta, "", article_text_fixed) article_text_fixed = re.sub(self.symbols, "", article_text_fixed) - article_text_fixed = re.sub(r'\n+', "\n", article_text_fixed) + article_text_fixed = re.sub(r"\n+", "\n", article_text_fixed) except: # noqa article_text_fixed = "" @@ -62,18 +62,22 @@ class EncodingCorruptor(Corruptor): def __init__(self) -> None: self.encodings = { "en": { - "input": ['cp1026'], - "output": ['cp1256', 'cp437', 'cp775', 'cp852', 'cp855', 'cp857', 'cp860', 'cp861', 'cp862', 'cp863', 'cp866', 'gb18030', 'hp_roman8', - 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'koi8_r', - 'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman'] + "input": ["cp1026"], + "output": [ + "cp1256", "cp437", "cp775", "cp852", "cp855", "cp857", "cp860", "cp861", "cp862", "cp863", "cp866", "gb18030", "hp_roman8", + "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "koi8_r", + "mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman" + ] }, "ru": { - "input": ['cp855', 'cp866', 'gb18030', 'iso8859_5', 'koi8_r', 'mac_cyrillic', 'utf_8'], - "output": ['cp1026', 'cp1256', 'cp437', 'cp775', 'cp850', 'cp852', 'cp863', 'cp866', 'hp_roman8', 'iso8859_10', 'iso8859_11', - 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 
'iso8859_9', 'koi8_r', - 'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman', 'cp1140', 'cp273', 'cp855', 'cp860', 'cp861', 'cp857', 'cp500', - 'cp862', 'gb18030'] + "input": ["cp855", "cp866", "gb18030", "iso8859_5", "koi8_r", "mac_cyrillic", "utf_8"], + "output": [ + "cp1026", "cp1256", "cp437", "cp775", "cp850", "cp852", "cp863", "cp866", "hp_roman8", "iso8859_10", "iso8859_11", + "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "iso8859_9", "koi8_r", + "mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman", "cp1140", "cp273", "cp855", "cp860", "cp861", "cp857", "cp500", + "cp862", "gb18030" + ] } } @@ -196,7 +200,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: os.makedirs(os.path.join(args.out_dir, args.incorrect_dir), exist_ok=True) i = args.start_number - print("Generating incorrect texts") + print("Generating incorrect texts") # noqa for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): text = "" @@ -207,7 +211,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: corruptor = random.choice(corruptor_list) text = corruptor.corrupt(text, lang=language) except Exception as e: - print(e) + print(e) # noqa text = "" with open(os.path.join(args.out_dir, args.incorrect_dir, f"{i:08d}_{language}.txt"), "w") as f: @@ -215,7 +219,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: i += 1 i = args.start_number - print("Generating correct texts") + print("Generating correct texts") # noqa for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): diff --git a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py index b32f3f08..ce77a128 100644 --- a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py +++ b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py @@ -77,14 +77,16 @@ def _get_avg(array: List) -> float: def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: - return [_get_avg(statistics[dataset]["ASCII_Spacing_Characters"]), - _get_avg(statistics[dataset]["ASCII_Special_Symbols"]), - _get_avg(statistics[dataset]["ASCII_Digits"]), - _get_avg(statistics[dataset]["ASCII_Uppercase_Letters"]), - _get_avg(statistics[dataset]["Latin1_Special_Symbols"]), - _get_avg(statistics[dataset]["Cyrillic"]), - sum(statistics[dataset]["Amount of words"]), - _get_avg(statistics[dataset]["Accuracy"])] + return [ + _get_avg(statistics[dataset]["ASCII_Spacing_Characters"]), + _get_avg(statistics[dataset]["ASCII_Special_Symbols"]), + _get_avg(statistics[dataset]["ASCII_Digits"]), + _get_avg(statistics[dataset]["ASCII_Uppercase_Letters"]), + _get_avg(statistics[dataset]["Latin1_Special_Symbols"]), + _get_avg(statistics[dataset]["Cyrillic"]), + sum(statistics[dataset]["Amount of words"]), + _get_avg(statistics[dataset]["Accuracy"]) + ] def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: @@ -106,7 +108,7 @@ def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: def __parse_ocr_errors(lines: List[str]) -> List: ocr_errors = [] matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] - for num, line in enumerate(lines[matched_errors[0] + 1:]): + for line in lines[matched_errors[0] + 1:]: # example line: " 2 0 { 6}-{б}" errors = re.findall(r"(\d+)", line)[0] chars = re.findall(r"{(.*)}-{(.*)}", line)[0] @@ -158,8 +160,12 @@ def __get_summary_symbol_error(path_reports: str) -> Texttable: def 
__create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] - accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", - "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]] + accs_common = [ + [ + "Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", + "Amount of words", "AVG Accuracy" + ] + ] table_accuracy_per_image = Texttable() accs.extend(accuracy_values) @@ -258,10 +264,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: - print(ex) - print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") + print(ex) # noqa + print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") # noqa - print(f"Time mean correction ocr = {np.array(correction_times).mean()}") + print(f"Time mean correction ocr = {np.array(correction_times).mean()}") # noqa table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) return table_common, table_accuracy_per_image @@ -277,9 +283,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") + print(f"Benchmark data downloaded to {benchmark_data_path}") # noqa else: - print(f"Use cached benchmark data from {benchmark_data_path}") + print(f"Use cached benchmark data from {benchmark_data_path}") # noqa assert os.path.isfile(benchmark_data_path) table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) @@ -289,14 +295,14 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c with open(os.path.join(output_dir, f"tesseract_benchmark{USE_CORRECTION_OCR}.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\n") res_file.write(f"Correction step: {USE_CORRECTION_OCR}\n") - res_file.write(f"\nTable 1 - Accuracy for each file\n") + res_file.write("\nTable 1 - Accuracy for each file\n") res_file.write(table_accuracy_per_image.draw()) - res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") + res_file.write("\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) - res_file.write(f"\n\nTable 3 -OCR error by symbol:\n") + res_file.write("\n\nTable 3 -OCR error by symbol:\n") res_file.write(table_errors.draw()) - print(f"Tesseract version is {pytesseract.get_tesseract_version()}") - print(table_accuracy_per_image.draw()) - print(table_common.draw()) - print(table_errors.draw()) + print(f"Tesseract version is {pytesseract.get_tesseract_version()}") # noqa + print(table_accuracy_per_image.draw()) # noqa + print(table_common.draw()) # noqa + print(table_errors.draw()) # noqa diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py index ada563e2..82160731 100644 --- 
a/scripts/tesseract_benchmark/ocr_correction.py +++ b/scripts/tesseract_benchmark/ocr_correction.py @@ -2,11 +2,11 @@ from typing import Tuple import torch -from sage.spelling_correction.corrector import Corrector from sage.spelling_correction import AvailableCorrectors from sage.spelling_correction import RuM2M100ModelForSpellingCorrection +from sage.spelling_correction.corrector import Corrector -''' +""" Install sage library (for ocr correction step): git clone https://github.com/ai-forever/sage.git cd sage @@ -14,7 +14,7 @@ pip install -r requirements.txt Note: sage use 5.2 Gb GPU ...... -''' +""" USE_GPU = True @@ -35,9 +35,7 @@ def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) if torch.cuda.is_available() and USE_GPU: corrector.model.to(torch.device("cuda:0")) - print("use CUDA") + print("use CUDA") # noqa else: - print("use CPU") + print("use CPU") # noqa return corrector, corrected_path - - diff --git a/scripts/tesseract_benchmark/text_blob_correction.py b/scripts/tesseract_benchmark/text_blob_correction.py index 8ecf8be6..73e8d70e 100644 --- a/scripts/tesseract_benchmark/text_blob_correction.py +++ b/scripts/tesseract_benchmark/text_blob_correction.py @@ -2,7 +2,7 @@ class TextBlobCorrector: - def __init__(self): + def __init__(self) -> None: return def correct(self, text: str) -> str: diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 888c3273..9f65e590 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -28,11 +28,16 @@ def __extract_conf_annotation(self, anns_conf: List[dict], ann_bbox: dict, text: interval = e - b if interval > 0: confs.append(ann_conf["value"]) - debug.append({f"{ann_conf['value']}[{b}:{e}]": [ - interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}"]}) + debug.append( + { + f"{ann_conf['value']}[{b}:{e}]": [ + interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}" + ] + } + ) if DETAILED_DEBUG: - print(debug) + print(debug) # noqa return confs @@ -44,10 +49,15 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t interval = e - b if interval > 0: text_type = ann_type["value"] - debug.append({f"{ann_type['value']}:{b}:{e}": [ - interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}"]}) + debug.append( + { + f"{ann_type['value']}:{b}:{e}": [ + interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}" + ] + } + ) if DETAILED_DEBUG: - print(debug) + print(debug) # noqa return text_type @@ -67,8 +77,9 @@ def __get_words_annotation(self, structure: dict) -> List[BboxWithConfsType]: confs = self.__extract_conf_annotation(anns_conf, ann_bbox, node["text"]) text_type = self.__extract_texttype_annotation(anns_type, ann_bbox, node["text"]) - words_annotation.append(BboxWithConfsType(start=ann_bbox["start"], end=ann_bbox["end"], bbox=ann_bbox["value"], confs=confs, - text_type=text_type)) + words_annotation.append( + BboxWithConfsType(start=ann_bbox["start"], end=ann_bbox["end"], bbox=ann_bbox["value"], confs=confs, text_type=text_type) + ) stack.extend(node["subparagraphs"]) @@ -91,13 +102,13 @@ def __get_words_annotation_from_cell(self, table: dict) -> List[BboxWithConfsTyp return words_annotation def 
__normalize_font_thickness(self, image: np.ndarray) -> Tuple[float, int]: - FONT_SCALE = 6e-4 - THICKNESS_SCALE = 1e-3 + font_scale = 6e-4 + thickness_scale = 1e-3 height, width, _ = image.shape - font_scale = min(width, height) * FONT_SCALE - thickness = math.ceil(min(width, height) * THICKNESS_SCALE) + font = min(width, height) * font_scale + thickness = math.ceil(min(width, height) * thickness_scale) - return font_scale, thickness + return font, thickness def __rotate_coordinate(self, x: int, y: int, xc: float, yc: float, angle: float) -> Tuple[int, int]: rad = angle * math.pi / 180 @@ -123,8 +134,10 @@ def __draw_word_annotations(self, image: np.ndarray, word_annotations: List[Bbox cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0)) text = ",".join(ann.confs) if ann.confs != [] else "None" - cv2.putText(image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])), - cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 255), thickness) + cv2.putText( + image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])), cv2.FONT_HERSHEY_SIMPLEX, + font_scale, (0, 0, 255), thickness + ) return image def __draw_tables_words(self, tables: List[dict], image: np.ndarray) -> np.ndarray: @@ -135,9 +148,9 @@ def __draw_tables_words(self, tables: List[dict], image: np.ndarray) -> np.ndarr image = self.__draw_word_annotations(image, word_annotations, angle=table_angle) return image - def test_pdf_documents(self): + def test_pdf_documents(self) -> None: filename_parameters_outputdir = [ - ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="true"), "pdfminer_reader"], + ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="true"), "pdfminer_reader"], ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="tabby"), "tabby_reader"] ] @@ -158,12 +171,13 @@ def test_pdf_documents(self): image = self.__draw_tables_words(tables, image) cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image) - def test_table_word_extraction(self): - output_path = os.path.join(self.output_path, 'tables') + def test_table_word_extraction(self) -> None: + output_path = os.path.join(self.output_path, "tables") os.makedirs(output_path, exist_ok=True) - file_names = ["tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg", - "tables/example_with_table6.png", "tables/example_with_table_horizontal_union.jpg", - "scanned/orient_1.png", "tables/rotated_table.png"] + file_names = [ + "tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg", "tables/example_with_table6.png", + "tables/example_with_table_horizontal_union.jpg", "scanned/orient_1.png", "tables/rotated_table.png" + ] for file_name in file_names: result = self._send_request(file_name, data=dict()) @@ -182,7 +196,7 @@ def test_table_word_extraction(self): if len(tables) > 0: image = self.__draw_tables_words(tables, image) - cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image) + cv2.imwrite(os.path.join(output_path, file_name.split("/")[-1]), image) def test_document_table_split_last_column(self) -> None: filename_to_parameters = { diff --git a/scripts/train/train_acc_orientation_classifier.py b/scripts/train/train_acc_orientation_classifier.py index 980ec884..497a5183 100644 --- a/scripts/train/train_acc_orientation_classifier.py +++ b/scripts/train/train_acc_orientation_classifier.py @@ 
-8,24 +8,20 @@ from torch import optim from torch.utils.data import DataLoader +from dedoc.config import get_config from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.dataset_executor import DataLoaderImageOrient parser = argparse.ArgumentParser() -checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), - "../../resources/efficient_net_b0_fixed.pth")) -checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), - "../../resources/efficient_net_b0_fixed.pth")) +checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) +checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) checkpoint_path = "../../resources" parser.add_argument("-t", "--train", type=bool, help="run for train model", default=False) -parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", - default=checkpoint_path_save) -parser.add_argument("-l", "--checkpoint_load", help="Path to checkpoint for load", - default=checkpoint_path_load) +parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", default=checkpoint_path_save) +parser.add_argument("-l", "--checkpoint_load", help="Path to checkpoint for load", default=checkpoint_path_load) parser.add_argument("-f", "--from_checkpoint", type=bool, help="run for train model", default=True) -parser.add_argument("-d", "--input_data_folder", help="Path to data with folders train or test", - default="/home/nasty/data/columns_orientation") +parser.add_argument("-d", "--input_data_folder", help="Path to data with folders train or test") args = parser.parse_args() BATCH_SIZE = 1 @@ -41,23 +37,20 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri """ net_executor.net.eval() testloader = data_executor.load_dataset( - csv_path=os.path.join(args.input_data_folder, 'test/labels.csv'), + csv_path=os.path.join(args.input_data_folder, "test/labels.csv"), image_path=args.input_data_folder, batch_size=BATCH_SIZE ) dataiter = iter(testloader) sample = dataiter.__next__() - _, orientation, columns = sample['image'], sample['orientation'], sample['columns'] + _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] - print('GroundTruth: orientation {}, columns {}'.format(orientation, columns)) + print(f"GroundTruth: orientation {orientation}, columns {columns}") # noqa calc_accuracy_by_classes(testloader, data_executor.classes, net_executor, batch_size=1) -def calc_accuracy_by_classes(testloader: DataLoader, - classes: List, - classifier: ColumnsOrientationClassifier, - batch_size: int = 1) -> None: +def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: ColumnsOrientationClassifier, batch_size: int = 1) -> None: """ Function calculates accuracy ba each class :param testloader: DataLoader @@ -66,13 +59,13 @@ def calc_accuracy_by_classes(testloader: DataLoader, :param batch_size: size of batch :return: """ - class_correct = list(0. for i in range(len(classes))) - class_total = list(0. for i in range(len(classes))) + class_correct = list(0. for _ in range(len(classes))) + class_total = list(0. 
for _ in range(len(classes))) time_predict = 0 cnt_predict = 0 with torch.no_grad(): for data in testloader: - images, orientation, columns = data['image'], data['orientation'], data['columns'] + images, orientation, columns = data["image"], data["orientation"], data["columns"] time_begin = time() outputs = classifier.net(images.float().to(classifier.device)) @@ -97,21 +90,16 @@ def calc_accuracy_by_classes(testloader: DataLoader, class_correct[columns_i] += orientation_bool_predict class_total[columns_i] += 1 if not orientation_bool_predict or not columns_bool_predict: - print('{} predict as \norientation: {} \ncolumns: {}'.format(data['image_name'][i], - classes[2 + orientation_predicted[i]], - classes[columns_predicted[i]])) + print( # noqa + f'{data["image_name"][i]} predict as \norientation: {classes[2 + orientation_predicted[i]]} \ncolumns: {classes[columns_predicted[i]]}' + ) for i in range(len(classes)): - print('Accuracy of %5s : %2d %%' % ( - classes[i], 100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0)) - print('=== AVG Time predict {}'.format(time_predict / cnt_predict)) + print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") # noqa + print(f"=== AVG Time predict {time_predict / cnt_predict}") # noqa -def train_model(trainloader: DataLoader, - checkpoint_path_save: str, - classifier: ColumnsOrientationClassifier, - epoch_cnt: int = 7, - save_step: int = 500) -> None: +def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: ColumnsOrientationClassifier, epoch_cnt: int = 7, save_step: int = 500) -> None: """ Function for train orientation classifier :param trainloader: DataLoader @@ -128,7 +116,7 @@ def train_model(trainloader: DataLoader, running_loss = 0.0 for i, data in enumerate(trainloader, 0): # get the inputs; data is a list of [inputs, labels] - inputs, orientation, columns = data['image'], data['orientation'], data['columns'] + inputs, orientation, columns = data["image"], data["orientation"], data["columns"] # zero the parameter gradients optimizer.zero_grad() @@ -136,17 +124,14 @@ def train_model(trainloader: DataLoader, # forward + backward + optimize outputs = classifier.net(inputs.float().to(classifier.device)) - loss = criterion(outputs[:, :2], - columns.to(classifier.device)) + criterion(outputs[:, 2:], - orientation.to(classifier.device)) + loss = criterion(outputs[:, :2], columns.to(classifier.device)) + criterion(outputs[:, 2:], orientation.to(classifier.device)) loss.backward() optimizer.step() running_loss += loss.item() # print statistics if i % 100 == 99: - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 100)) + print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}") # noqa running_loss = 0.0 # save checkpoint @@ -154,14 +139,14 @@ def train_model(trainloader: DataLoader, classifier.save_weights(checkpoint_path_save) classifier.save_weights(checkpoint_path_save) - print('Finished Training') + print("Finished Training") # noqa def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientationClassifier) -> None: classifier.net.train() # Part 1 - load datas trainloader = data_executor.load_dataset( - csv_path=os.path.join(args.input_data_folder, 'train/labels.csv'), + csv_path=os.path.join(args.input_data_folder, "train/labels.csv"), image_path=args.input_data_folder, batch_size=BATCH_SIZE ) @@ -169,22 +154,20 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat # get some 
random training images dataiter = iter(trainloader) sample = dataiter.__next__() - _, orientation, columns = sample['image'], sample['orientation'], sample['columns'] + _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] # print labels - print(' '.join('%5s' % data_executor.classes[orientation[j]] for j in range(BATCH_SIZE))) - print(' '.join('%5s' % data_executor.classes[columns[j]] for j in range(BATCH_SIZE))) + print(" ".join(f"{data_executor.classes[orientation[j]]:5s}" for j in range(BATCH_SIZE))) # noqa + print(" ".join(f"{data_executor.classes[columns[j]]:5s}" for j in range(BATCH_SIZE))) # noqa # Part 2 - train model train_model(trainloader, args.checkpoint_save, classifier) if __name__ == "__main__": - from dedoc.config import _config as config + config = get_config() data_executor = DataLoaderImageOrient() - net = ColumnsOrientationClassifier(on_gpu=True, - checkpoint_path=checkpoint_path if not args.train else '', - config=config) + net = ColumnsOrientationClassifier(on_gpu=True, checkpoint_path=checkpoint_path if not args.train else "", config=config) if args.train: train_step(data_executor, net) else: diff --git a/scripts/train/train_diploma_line_classifier.py b/scripts/train/train_diploma_line_classifier.py index 812c1681..34643c3a 100644 --- a/scripts/train/train_diploma_line_classifier.py +++ b/scripts/train/train_diploma_line_classifier.py @@ -17,16 +17,14 @@ def skip_labels(label: str) -> Optional[str]: classifier_name = "diploma_classifier" -clf_resources_path = os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources", "line_type_classifiers") +clf_resources_path = os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources", "line_type_classifiers") os.makedirs(clf_resources_path, exist_ok=True) resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(clf_resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", "{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(clf_resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = DiplomaFeatureExtractor() classifier_parameters = dict(learning_rate=0.5, @@ -52,4 +50,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False, no_cache=False) -print("successfully train diploma classifier") +print("successfully train diploma classifier") # noqa diff --git a/scripts/train/train_law_line_classifier.py b/scripts/train/train_law_line_classifier.py index 5ccb0715..93780d36 100644 --- a/scripts/train/train_law_line_classifier.py +++ b/scripts/train/train_law_line_classifier.py @@ -32,11 +32,9 @@ def transform_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) classifier_name = "law_txt_classifier" if txt_classifier else "law_classifier" -path_out = os.path.join(resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", 
"{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = LawTextFeatures(text_features_only=txt_classifier) classifier_parameters = dict(learning_rate=0.8, @@ -75,4 +73,4 @@ def get_sample_weight(line: LineWithLabel) -> int: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") +print("successfully train law classifier") # noqa diff --git a/scripts/train/train_mle_language_model.py b/scripts/train/train_mle_language_model.py index 1c64ec4b..41c9882c 100644 --- a/scripts/train/train_mle_language_model.py +++ b/scripts/train/train_mle_language_model.py @@ -16,11 +16,11 @@ def tokenize_doc(text_layer: str) -> str: # converting to lowercase text_layer = text_layer.lower() # remove all the special characters - document = re.sub(r'\W', ' ', text_layer) + document = re.sub(r"\W", " ", text_layer) # remove all single characters - document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) + document = re.sub(r"\^[a-zA-Z]\s+", " ", document) # substituting multiple spaces with single space - document = re.sub(r'\s+', ' ', document, flags=re.I) + document = re.sub(r"\s+", " ", document, flags=re.I) return document @@ -39,27 +39,27 @@ def main() -> None: documents = [] for files in files_path_big_data: file = os.listdir(path_big_data + files) - print(files) + print(files) # noqa for writer in file: try: - with open(path_big_data + files + '/' + writer) as f: + with open(path_big_data + files + "/" + writer) as f: text = f.read() document = tokenize_doc(text) documents.append(document) - print(writer) + print(writer) # noqa break except Exception: - print(Exception) + print(Exception) # noqa pass documents = " ".join(documents) bigram_list = create_ngramm_list(documents, 2) train, vocab = padded_everygram_pipeline(2, [bigram_list]) - print(bigram_list) + print(bigram_list) # noqa language_model_mle = MLE(2) language_model_mle.fit(train, vocab) with open("n-gram_lang_model.pkl", "wb") as f: pickle.dump(language_model_mle, f) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/train/train_nn_line_classifier_law.py b/scripts/train/train_nn_line_classifier_law.py index dbdb58b7..84893653 100644 --- a/scripts/train/train_nn_line_classifier_law.py +++ b/scripts/train/train_nn_line_classifier_law.py @@ -31,13 +31,11 @@ def transform_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) classifier_name = "law_txt_classifier" if txt_classifier else "law_classifier" -path_out = os.path.join(resources_path, "{}_nn.pkl.gz".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances_nn.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}_nn.pkl.gz") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances_nn.xlsx") feature_extractor = LawTextFeatures(text_features_only=txt_classifier) -path_scores = os.path.join(resources_path, 
"benchmarks", "{}_nn_scores.json".format(classifier_name)) +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_nn_scores.json") classifier_parameters = dict(learning_rate=0.8, n_estimators=300, booster="gbtree", @@ -51,9 +49,7 @@ def get_sample_weight(line: LineWithLabel) -> float: text = line.line.lower().strip() regexps = LawTextFeatures.named_regexp application_regexp = LawTextFeatures.regexp_application_begin - regexp_weight = (50 - if any([regexp.match(text) for regexp in regexps]) or application_regexp.match(text.lower()) - else 1) + regexp_weight = (50 if any([regexp.match(text) for regexp in regexps]) or application_regexp.match(text.lower()) else 1) return regexp_weight * class_weight @@ -75,4 +71,4 @@ def get_sample_weight(line: LineWithLabel) -> float: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") +print("successfully train law classifier") # noqa diff --git a/scripts/train/train_paragraph_classifier.py b/scripts/train/train_paragraph_classifier.py index 64cbc331..658e5ff3 100644 --- a/scripts/train/train_paragraph_classifier.py +++ b/scripts/train/train_paragraph_classifier.py @@ -17,11 +17,9 @@ def skip_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", "{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = ParagraphFeatureExtractor(config=config) @@ -48,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train {} classifier".format(classifier_name)) +print(f"successfully train {classifier_name} classifier") # noqa diff --git a/scripts/train/train_txtlayer_classifier.py b/scripts/train/train_txtlayer_classifier.py index b203e9c9..12d6cdba 100644 --- a/scripts/train/train_txtlayer_classifier.py +++ b/scripts/train/train_txtlayer_classifier.py @@ -45,11 +45,11 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: with open(path, mode="r") as f: text = f.read() except Exception as e: - print(f'Bad file {str(e)}: {path}') + print(f"Bad file {str(e)}: {path}") # noqa continue if len(text.strip()) == 0: - print(f'Empty file: {path}') + print(f"Empty file: {path}") # noqa continue texts.append(text) @@ -66,12 +66,12 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: if not os.path.isdir(txtlayer_classifier_dataset_dir): path_out = os.path.join(data_dir, "data.zip") wget.download("https://at.ispras.ru/owncloud/index.php/s/z9WLFiKKFo2WMgW/download", path_out) - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") + print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") # noqa else: - print(f"Use cached dataset from 
{txtlayer_classifier_dataset_dir}") + print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}") # noqa assert os.path.isdir(txtlayer_classifier_dataset_dir) @@ -85,20 +85,16 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: stages_data[stage] = dict(features=features, labels=labels) clf = XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=600, booster="gbtree", tree_method="hist", max_depth=3) - clf.fit( - X=stages_data["train"]["features"], - y=stages_data["train"]["labels"], - eval_set=[(stages_data["val"]["features"], stages_data["val"]["labels"])], - ) + clf.fit(X=stages_data["train"]["features"], y=stages_data["train"]["labels"], eval_set=[(stages_data["val"]["features"], stages_data["val"]["labels"])]) test_preds = clf.predict(stages_data["test"]["features"]) score = f1_score(stages_data["test"]["labels"], test_preds) - print(f"F1 score = {score}") + print(f"F1 score = {score}") # noqa resources_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "..", "resources") - with gzip.open(os.path.join(resources_dir, 'txtlayer_classifier.pkl.gz'), 'wb') as file: + with gzip.open(os.path.join(resources_dir, "txtlayer_classifier.pkl.gz"), "wb") as file: pickle.dump(clf, file) xgbfir.saveXgbFI(clf, feature_names=features.columns, - OutputXlsxFile=os.path.join(resources_dir, "feature_importances", 'txtlayer_classifier_feature_importances.xlsx')) + OutputXlsxFile=os.path.join(resources_dir, "feature_importances", "txtlayer_classifier_feature_importances.xlsx")) diff --git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py index b35446e7..e4677c2e 100644 --- a/scripts/train/train_tz_line_classifier.py +++ b/scripts/train/train_tz_line_classifier.py @@ -18,11 +18,9 @@ def skip_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", "{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = TzTextFeatures(text_features_only=txt_classifier) classifier_parameters = dict(learning_rate=0.5, @@ -48,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train tz classifier") +print("successfully train tz classifier") # noqa From 410b0ca51cb9cfe356d1c4f5d99ca05b35d8dc0a Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 1 Feb 2024 13:42:23 +0300 Subject: [PATCH 2/3] Review fixes --- .flake8 | 2 ++ scripts/benchmark.py | 10 +++++----- scripts/benchmark_pdf_attachments.py | 16 +++++++-------- scripts/benchmark_pdf_miner.py | 6 +++--- scripts/benchmark_table/benchmark_table.py | 6 +++--- scripts/benchmark_table/metric.py | 2 +- scripts/benchmark_tl_correctness.py | 8 ++++---- scripts/create_txtlayer_dataset.py | 6 +++--- .../calc_tesseract_benchmarks.py | 18 ++++++++--------- scripts/tesseract_benchmark/ocr_correction.py | 4 ++-- scripts/test_words_bbox_extraction.py 
| 4 ++-- .../train/train_acc_orientation_classifier.py | 14 ++++++------- .../train/train_diploma_line_classifier.py | 2 +- scripts/train/train_law_line_classifier.py | 2 +- scripts/train/train_mle_language_model.py | 8 ++++---- scripts/train/train_nn_line_classifier_law.py | 2 +- scripts/train/train_paragraph_classifier.py | 2 +- scripts/train/train_txtlayer_classifier.py | 10 +++++----- scripts/train/train_tz_line_classifier.py | 2 +- .../trainers/base_sklearn_line_classifier.py | 2 +- scripts/train/trainers/data_loader.py | 2 +- scripts/train/trainers/errors_saver.py | 4 ++-- .../trainers/line_lstm_classifier_trainer.py | 20 +++++++++---------- 23 files changed, 77 insertions(+), 75 deletions(-) diff --git a/.flake8 b/.flake8 index 4c24c4a3..96a1bcff 100644 --- a/.flake8 +++ b/.flake8 @@ -22,3 +22,5 @@ exclude = # ANN101 - type annotations for self ignore = ANN101 +per-file-ignores = + scripts/*:T201 diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 69276d70..12c3f104 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -33,14 +33,14 @@ def get_cpu_performance() -> float: cpu_performance = get_cpu_performance() -print(f'"cpu_performance" = {cpu_performance}') # noqa +print(f'"cpu_performance" = {cpu_performance}') with TemporaryDirectory() as path_base: path_out = os.path.join(path_base, "dataset.zip") wget.download(data_url, path_out) with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(path_base) - print(path_base) # noqa + print(path_base) failed = [] result = OrderedDict() @@ -60,7 +60,7 @@ def get_cpu_performance() -> float: Task("pdf", "pdf", {"pdf_with_text_layer": "false"}), Task("pdf_tables", "pdf_tables", {}) ] - print(tasks) # noqa + print(tasks) for directory, name, parameters in tasks: total_size = 0 total_time = 0 @@ -90,5 +90,5 @@ def get_cpu_performance() -> float: with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print(f"save result in {path_result}") # noqa - print(failed) # noqa + print(f"save result in {path_result}") + print(failed) diff --git a/scripts/benchmark_pdf_attachments.py b/scripts/benchmark_pdf_attachments.py index f069685c..411f1275 100644 --- a/scripts/benchmark_pdf_attachments.py +++ b/scripts/benchmark_pdf_attachments.py @@ -39,7 +39,7 @@ def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") # noqa + print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") result_dict[file_name] = sorted(attachment_names) return result_dict @@ -70,7 +70,7 @@ def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_d shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") # noqa + print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") result_dict[file_name] = sorted(attachment_names) return result_dict @@ -99,9 +99,9 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i zip_ref.extractall(data_dir) os.remove(archive_path) - print(f"Benchmark data downloaded to {data_dir}") # noqa + print(f"Benchmark data downloaded to {data_dir}") 
else: - print(f"Use cached benchmark data from {data_dir}") # noqa + print(f"Use cached benchmark data from {data_dir}") in_dir = os.path.join(data_dir, "with_attachments") out_dir = os.path.join(in_dir, "extracted_attachments") @@ -112,17 +112,17 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i benchmarks_dict = {} - print("Get tabby attachments") # noqa + print("Get tabby attachments") tabby_reader = PdfTabbyReader(config={}) tabby_out_dir = os.path.join(out_dir, "tabby") benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir) - print("Get pdfminer attachments") # noqa + print("Get pdfminer attachments") pdfminer_reader = PdfTxtlayerReader(config={}) pdfminer_out_dir = os.path.join(out_dir, "pdfminer") benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir) - print("Get common attachments") # noqa + print("Get common attachments") common_out_dir = os.path.join(out_dir, "common") pdf_attachments_extractor = PDFAttachmentsExtractor(config={}) benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir) @@ -131,4 +131,4 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f: json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2) - print(f"Attachments were extracted to {out_dir}") # noqa + print(f"Attachments were extracted to {out_dir}") diff --git a/scripts/benchmark_pdf_miner.py b/scripts/benchmark_pdf_miner.py index b7c5d785..7161541b 100644 --- a/scripts/benchmark_pdf_miner.py +++ b/scripts/benchmark_pdf_miner.py @@ -31,9 +31,9 @@ zip_ref.extractall(data_dir) os.remove(pdfs_zip_gt_path) - print(f"Benchmark data downloaded to {data_dir}") # noqa + print(f"Benchmark data downloaded to {data_dir}") else: - print(f"Use cached benchmark data from {data_dir}") # noqa + print(f"Use cached benchmark data from {data_dir}") pdfs_path = data_dir / "PdfMiner Params" pdfs_gt_path = data_dir / "PdfMiner Params GT" @@ -68,4 +68,4 @@ with (Path(output_dir) / "benchmark_pdf_miner.json").open("w") as f: json.dump(info, f, ensure_ascii=False, indent=2) - print(f"save result in {output_dir}") # noqa + print(f"save result in {output_dir}") diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index 8d5b7a81..8a52048d 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -48,7 +48,7 @@ def get_tables(image_path: Path) -> str: def make_predict_json(data_path: Path) -> dict: predict_json = {} for pathname in Path.iterdir(data_path): - print(pathname) # noqa + print(pathname) predict_json[pathname.name] = {"html": "" + get_tables(pathname) + ""} @@ -57,7 +57,7 @@ def make_predict_json(data_path: Path) -> dict: def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: if Path.exists(data_dir): - print(f"Use cached benchmark data from {data_dir}") # noqa + print(f"Use cached benchmark data from {data_dir}") return data_dir.mkdir(parents=True, exist_ok=True) @@ -68,7 +68,7 @@ def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: zip_ref.extractall(data_dir) pdfs_zip_path.unlink() - print(f"Benchmark data downloaded to {data_dir}") # noqa + print(f"Benchmark data downloaded to {data_dir}") def prediction(path_pred: Path, 
path_images: Path) -> dict: diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index d306247d..ac0b90b5 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -119,7 +119,7 @@ def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> content=cell, visible=False if node.attrib.get("style") == "display: none" else True, *deque()) # noqa except Exception as ex: - print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") # noqa + print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") raise ex else: new_node = TableTree(node.tag, None, None, None, True, *deque()) diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index 50dec7ae..d959a1f4 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -24,7 +24,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para directory = os.path.join(path_base, tl_path) files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")] total_file_size = len(files_list) - print(f"Files: {files_list}\nFiles number: {total_file_size}") # noqa + print(f"Files: {files_list}\nFiles number: {total_file_size}") for file in tqdm(files_list): file_path = os.path.join(directory, file) r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) @@ -52,9 +52,9 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Benchmark data downloaded to {benchmark_data_dir}") # noqa + print(f"Benchmark data downloaded to {benchmark_data_dir}") else: - print(f"Use cached benchmark data from {benchmark_data_dir}") # noqa + print(f"Use cached benchmark data from {benchmark_data_dir}") assert os.path.isdir(benchmark_data_dir) @@ -74,4 +74,4 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print(f"Save result in {path_result}") # noqa + print(f"Save result in {path_result}") diff --git a/scripts/create_txtlayer_dataset.py b/scripts/create_txtlayer_dataset.py index a26c7995..ca2e196c 100644 --- a/scripts/create_txtlayer_dataset.py +++ b/scripts/create_txtlayer_dataset.py @@ -200,7 +200,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: os.makedirs(os.path.join(args.out_dir, args.incorrect_dir), exist_ok=True) i = args.start_number - print("Generating incorrect texts") # noqa + print("Generating incorrect texts") for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): text = "" @@ -211,7 +211,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: corruptor = random.choice(corruptor_list) text = corruptor.corrupt(text, lang=language) except Exception as e: - print(e) # noqa + print(e) text = "" with open(os.path.join(args.out_dir, args.incorrect_dir, f"{i:08d}_{language}.txt"), "w") as f: @@ -219,7 +219,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: i += 1 i = args.start_number - print("Generating correct texts") # noqa + print("Generating correct texts") for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): diff --git a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py index ce77a128..07895d0d 
100644 --- a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py +++ b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py @@ -264,10 +264,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: - print(ex) # noqa - print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") # noqa + print(ex) + print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") - print(f"Time mean correction ocr = {np.array(correction_times).mean()}") # noqa + print(f"Time mean correction ocr = {np.array(correction_times).mean()}") table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) return table_common, table_accuracy_per_image @@ -283,9 +283,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") # noqa + print(f"Benchmark data downloaded to {benchmark_data_path}") else: - print(f"Use cached benchmark data from {benchmark_data_path}") # noqa + print(f"Use cached benchmark data from {benchmark_data_path}") assert os.path.isfile(benchmark_data_path) table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) @@ -302,7 +302,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c res_file.write("\n\nTable 3 -OCR error by symbol:\n") res_file.write(table_errors.draw()) - print(f"Tesseract version is {pytesseract.get_tesseract_version()}") # noqa - print(table_accuracy_per_image.draw()) # noqa - print(table_common.draw()) # noqa - print(table_errors.draw()) # noqa + print(f"Tesseract version is {pytesseract.get_tesseract_version()}") + print(table_accuracy_per_image.draw()) + print(table_common.draw()) + print(table_errors.draw()) diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py index 82160731..89fb87a1 100644 --- a/scripts/tesseract_benchmark/ocr_correction.py +++ b/scripts/tesseract_benchmark/ocr_correction.py @@ -35,7 +35,7 @@ def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) if torch.cuda.is_available() and USE_GPU: corrector.model.to(torch.device("cuda:0")) - print("use CUDA") # noqa + print("use CUDA") else: - print("use CPU") # noqa + print("use CPU") return corrector, corrected_path diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 9f65e590..9dde8702 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -37,7 +37,7 @@ def __extract_conf_annotation(self, anns_conf: List[dict], ann_bbox: dict, text: ) if DETAILED_DEBUG: - print(debug) # noqa + print(debug) return confs @@ -57,7 +57,7 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t } ) if DETAILED_DEBUG: - print(debug) # noqa + print(debug) return text_type diff --git a/scripts/train/train_acc_orientation_classifier.py 
b/scripts/train/train_acc_orientation_classifier.py index 497a5183..05f36083 100644 --- a/scripts/train/train_acc_orientation_classifier.py +++ b/scripts/train/train_acc_orientation_classifier.py @@ -45,7 +45,7 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri sample = dataiter.__next__() _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] - print(f"GroundTruth: orientation {orientation}, columns {columns}") # noqa + print(f"GroundTruth: orientation {orientation}, columns {columns}") calc_accuracy_by_classes(testloader, data_executor.classes, net_executor, batch_size=1) @@ -95,8 +95,8 @@ def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: ) for i in range(len(classes)): - print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") # noqa - print(f"=== AVG Time predict {time_predict / cnt_predict}") # noqa + print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") + print(f"=== AVG Time predict {time_predict / cnt_predict}") def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: ColumnsOrientationClassifier, epoch_cnt: int = 7, save_step: int = 500) -> None: @@ -131,7 +131,7 @@ def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: # print statistics if i % 100 == 99: - print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}") # noqa + print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}") running_loss = 0.0 # save checkpoint @@ -139,7 +139,7 @@ def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: classifier.save_weights(checkpoint_path_save) classifier.save_weights(checkpoint_path_save) - print("Finished Training") # noqa + print("Finished Training") def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientationClassifier) -> None: @@ -157,8 +157,8 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] # print labels - print(" ".join(f"{data_executor.classes[orientation[j]]:5s}" for j in range(BATCH_SIZE))) # noqa - print(" ".join(f"{data_executor.classes[columns[j]]:5s}" for j in range(BATCH_SIZE))) # noqa + print(" ".join(f"{data_executor.classes[orientation[j]]:5s}" for j in range(BATCH_SIZE))) + print(" ".join(f"{data_executor.classes[columns[j]]:5s}" for j in range(BATCH_SIZE))) # Part 2 - train model train_model(trainloader, args.checkpoint_save, classifier) diff --git a/scripts/train/train_diploma_line_classifier.py b/scripts/train/train_diploma_line_classifier.py index 34643c3a..71a4c900 100644 --- a/scripts/train/train_diploma_line_classifier.py +++ b/scripts/train/train_diploma_line_classifier.py @@ -50,4 +50,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False, no_cache=False) -print("successfully train diploma classifier") # noqa +print("successfully train diploma classifier") diff --git a/scripts/train/train_law_line_classifier.py b/scripts/train/train_law_line_classifier.py index 93780d36..7b6dd416 100644 --- a/scripts/train/train_law_line_classifier.py +++ b/scripts/train/train_law_line_classifier.py @@ -73,4 +73,4 @@ def get_sample_weight(line: LineWithLabel) -> int: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") # noqa 
+print("successfully train law classifier") diff --git a/scripts/train/train_mle_language_model.py b/scripts/train/train_mle_language_model.py index 41c9882c..8488f41a 100644 --- a/scripts/train/train_mle_language_model.py +++ b/scripts/train/train_mle_language_model.py @@ -39,22 +39,22 @@ def main() -> None: documents = [] for files in files_path_big_data: file = os.listdir(path_big_data + files) - print(files) # noqa + print(files) for writer in file: try: with open(path_big_data + files + "/" + writer) as f: text = f.read() document = tokenize_doc(text) documents.append(document) - print(writer) # noqa + print(writer) break except Exception: - print(Exception) # noqa + print(Exception) pass documents = " ".join(documents) bigram_list = create_ngramm_list(documents, 2) train, vocab = padded_everygram_pipeline(2, [bigram_list]) - print(bigram_list) # noqa + print(bigram_list) language_model_mle = MLE(2) language_model_mle.fit(train, vocab) with open("n-gram_lang_model.pkl", "wb") as f: diff --git a/scripts/train/train_nn_line_classifier_law.py b/scripts/train/train_nn_line_classifier_law.py index 84893653..2aa6e0d5 100644 --- a/scripts/train/train_nn_line_classifier_law.py +++ b/scripts/train/train_nn_line_classifier_law.py @@ -71,4 +71,4 @@ def get_sample_weight(line: LineWithLabel) -> float: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") # noqa +print("successfully train law classifier") diff --git a/scripts/train/train_paragraph_classifier.py b/scripts/train/train_paragraph_classifier.py index 658e5ff3..0f2a6ba5 100644 --- a/scripts/train/train_paragraph_classifier.py +++ b/scripts/train/train_paragraph_classifier.py @@ -46,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print(f"successfully train {classifier_name} classifier") # noqa +print(f"successfully train {classifier_name} classifier") diff --git a/scripts/train/train_txtlayer_classifier.py b/scripts/train/train_txtlayer_classifier.py index 12d6cdba..ed3deaa4 100644 --- a/scripts/train/train_txtlayer_classifier.py +++ b/scripts/train/train_txtlayer_classifier.py @@ -45,11 +45,11 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: with open(path, mode="r") as f: text = f.read() except Exception as e: - print(f"Bad file {str(e)}: {path}") # noqa + print(f"Bad file {str(e)}: {path}") continue if len(text.strip()) == 0: - print(f"Empty file: {path}") # noqa + print(f"Empty file: {path}") continue texts.append(text) @@ -69,9 +69,9 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") # noqa + print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") else: - print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}") # noqa + print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}") assert os.path.isdir(txtlayer_classifier_dataset_dir) @@ -89,7 +89,7 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: test_preds = clf.predict(stages_data["test"]["features"]) score = f1_score(stages_data["test"]["labels"], test_preds) - print(f"F1 score = {score}") # noqa + print(f"F1 score = {score}") resources_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "..", "resources") with gzip.open(os.path.join(resources_dir, "txtlayer_classifier.pkl.gz"), "wb") as file: diff 
--git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py index e4677c2e..da62ca7b 100644 --- a/scripts/train/train_tz_line_classifier.py +++ b/scripts/train/train_tz_line_classifier.py @@ -46,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train tz classifier") # noqa +print("successfully train tz classifier") diff --git a/scripts/train/trainers/base_sklearn_line_classifier.py b/scripts/train/trainers/base_sklearn_line_classifier.py index b807d6a0..9ed905c8 100644 --- a/scripts/train/trainers/base_sklearn_line_classifier.py +++ b/scripts/train/trainers/base_sklearn_line_classifier.py @@ -97,7 +97,7 @@ def fit(self, no_cache: bool = False, cross_val_only: bool = False, save: bool = predicted = cls.predict(features_test) accuracy = accuracy_score(labels_test, predicted, sample_weight=sample_weight[-n:]) - print("Final Accuracy = {}".format(accuracy)) # noqa + print(f"Final Accuracy = {accuracy}") scores["final_accuracy"] = accuracy if not os.path.isdir(os.path.dirname(self.path_out)): diff --git a/scripts/train/trainers/data_loader.py b/scripts/train/trainers/data_loader.py index e16e37cb..35620522 100644 --- a/scripts/train/trainers/data_loader.py +++ b/scripts/train/trainers/data_loader.py @@ -36,7 +36,7 @@ def get_data(self, no_cache: bool = False) -> List[List[LineWithLabel]]: if os.path.isfile(pkl_path) and not no_cache: with gzip.open(pkl_path) as input_file: result = pickle.load(input_file) - print("func get_data(): Data were loaded from the local disk") # noqa + print("func get_data(): Data were loaded from the local disk") return self.__sort_data(result) os.makedirs(self.dataset_dir, exist_ok=True) path_out = os.path.join(self.dataset_dir, "dataset.zip") diff --git a/scripts/train/trainers/errors_saver.py b/scripts/train/trainers/errors_saver.py index 0ba53cc0..98ee7269 100644 --- a/scripts/train/trainers/errors_saver.py +++ b/scripts/train/trainers/errors_saver.py @@ -32,10 +32,10 @@ def save_errors(self, error_cnt: Counter, errors_uids: List[str], csv_path: str, assert len(set(errors_uids)) == len(errors_uids) self.logger.info(f"save errors in {self.errors_path}") errors_total_num = sum(error_cnt.values()) - print(f"{'true':16s} -> {'predicted':16s} {'cnt':6s} {'(percent)':16s}") # noqa + print(f"{'true':16s} -> {'predicted':16s} {'cnt':6s} {'(percent)':16s}") for error, cnt in error_cnt.most_common(): y_true, y_pred = error - print(f"{y_true:16s} -> {y_pred:16s} {cnt:06,} ({100 * cnt / errors_total_num:02.2f}%)") # noqa + print(f"{y_true:16s} -> {y_pred:16s} {cnt:06,} ({100 * cnt / errors_total_num:02.2f}%)") if save_errors_images: self.__save_images(errors_uids, csv_path) diff --git a/scripts/train/trainers/line_lstm_classifier_trainer.py b/scripts/train/trainers/line_lstm_classifier_trainer.py index 5f9a9225..e3b49829 100644 --- a/scripts/train/trainers/line_lstm_classifier_trainer.py +++ b/scripts/train/trainers/line_lstm_classifier_trainer.py @@ -151,10 +151,10 @@ def __init__(self, self.class_dict = class_dict self.num_classes = len(class_dict) if torch.cuda.is_available() and on_gpu: - print("Device is cuda") # noqa + print("Device is cuda") self.device = torch.device("cuda:0") else: - print("Device is cpu") # noqa + print("Device is cpu") self.device = torch.device("cpu") def __get_labels(self, data: List[List[LineWithLabel]]) -> List[str]: @@ -189,14 +189,14 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti 
time_epoch = 0.0 for epoch in range(self.num_epochs): - print("\n\t Epoch: {}".format(epoch)) # noqa + print(f"\n\t Epoch: {epoch}") # The Dataloader class handles all the shuffles for you loader_iter = iter(LineEpsDataSet(features_train, labels_train, self.class_dict)) time_begin = time.time() train_loss, train_acc = self.train(lstm_model, loader_iter, len(labels_train), optimizer, criteria, batch_size=self.batch_size) time_epoch += time.time() - time_begin - print(f"\n\t \x1b\33[33mTrain: epoch: {epoch}| Train loss: {train_loss} | Train acc: {train_acc}\x1b[0m") # noqa + print(f"\n\t \x1b\33[33mTrain: epoch: {epoch}| Train loss: {train_loss} | Train acc: {train_acc}\x1b[0m") if file_log: file_log.write(f"\t Train: epoch: {epoch}| Train loss: {epoch} | Train acc: {train_loss}\n") @@ -204,7 +204,7 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti if with_eval: loader_iter = iter(LineEpsDataSet(features_test, labels_test, self.class_dict)) test_loss, test_acc = self.evaluate(lstm_model, loader_iter, len(labels_test), criteria, batch_size=self.batch_size) - print(f"\n\t \x1b\33[92mEvaluation: Test loss: {test_loss} | Test acc: {test_acc}\x1b[0m") # noqa + print(f"\n\t \x1b\33[92mEvaluation: Test loss: {test_loss} | Test acc: {test_acc}\x1b[0m") if file_log: file_log.write(f"\t Eval: epoch: {epoch}| Test loss: {test_loss} | Test acc: {test_acc}\n") curr_loss = test_loss @@ -219,7 +219,7 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti if with_save and curr_loss < best_loss: best_loss = curr_loss torch.save(lstm_model.state_dict(), self.path_out) - print(f"Model has been saved into {self.path_out}") # noqa + print(f"Model has been saved into {self.path_out}") return res_loss / self.num_epochs, res_acc / self.num_epochs, time_epoch / self.num_epochs @@ -237,7 +237,7 @@ def fit(self, with_cross_val: bool = True) -> None: data = np.array(data, dtype=object) if with_cross_val: - print("\n\x1b\33[95m---------Evaluation process (cross-validation) starts-------\x1b[0m\n") # noqa + print("\n\x1b\33[95m---------Evaluation process (cross-validation) starts-------\x1b[0m\n") kf = KFold(n_splits=self.n_splits) scores = [] epoch_time = [] @@ -267,7 +267,7 @@ def fit(self, with_cross_val: bool = True) -> None: scores_dict["scores"] = scores logfile_kfold_tmp.close() - print("\n\x1b\33[95m-------------------Train process starts------------------\x1b[0m\n") # noqa + print("\n\x1b\33[95m-------------------Train process starts------------------\x1b[0m\n") features_train, labels_train = self.get_features(data) lstm_model = LSTM(input_dim=features_train.shape[1], hidden_dim=features_train.shape[1], hidden_dim_2=lstm_hidden_dim, num_classes=self.num_classes, lstm_layers=lstm_layers, @@ -278,7 +278,7 @@ def fit(self, with_cross_val: bool = True) -> None: labels_test=None, file_log=None, with_save=True, with_eval=False) - print("\x1b\33[92mFinal Accuracy from training = {}\x1b[0m".format(acc)) # noqa + print(f"\x1b\33[92mFinal Accuracy from training = {acc}\x1b[0m") scores_dict["final_accuracy"] = acc if self.path_scores is not None: @@ -338,7 +338,7 @@ def train(self, model: nn.Module, iterator: Iterator, cnt_data: int, optimizer: epoch_acc += accuracy cnt += 1 if log_per_cnt != 0 and batch_num % log_per_cnt == 0: - print(f"\t\tbatch_num: {batch_num}, loss={epoch_loss / cnt}, acc={epoch_acc / cnt}") # noqa + print(f"\t\tbatch_num: {batch_num}, loss={epoch_loss / cnt}, acc={epoch_acc / cnt}") return epoch_loss / cnt, epoch_acc / cnt From 
83edd01ec7d02c24852bf72799b78e7b35b5a2ee Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 1 Feb 2024 14:26:37 +0300 Subject: [PATCH 3/3] Review fix --- scripts/benchmark_table/metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index ac0b90b5..28c46b5b 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -117,7 +117,7 @@ def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> colspan=self.get_span(node, "colspan"), rowspan=self.get_span(node, "rowspan"), content=cell, - visible=False if node.attrib.get("style") == "display: none" else True, *deque()) # noqa + visible=node.attrib.get("style") != "display: none", *deque()) # noqa except Exception as ex: print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") raise ex