From 18463e802b7c0688c93a61025588bd2934a3f790 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 1 Feb 2024 13:09:29 +0300 Subject: [PATCH 1/3] TLDR-590 fix code style in scripts directory --- .flake8 | 1 - .pre-commit-config.yaml | 2 +- scripts/benchmark.py | 14 ++-- scripts/benchmark_pdf_attachments.py | 16 ++-- scripts/benchmark_pdf_miner.py | 12 +-- scripts/benchmark_table/benchmark_table.py | 31 ++++---- scripts/benchmark_table/metric.py | 59 ++++++++------- scripts/benchmark_tl_correctness.py | 16 ++-- scripts/create_txtlayer_dataset.py | 42 ++++++----- .../calc_tesseract_benchmarks.py | 52 +++++++------ scripts/tesseract_benchmark/ocr_correction.py | 12 ++- .../text_blob_correction.py | 2 +- scripts/test_words_bbox_extraction.py | 60 +++++++++------ .../train/train_acc_orientation_classifier.py | 75 +++++++------------ .../train/train_diploma_line_classifier.py | 12 ++- scripts/train/train_law_line_classifier.py | 10 +-- scripts/train/train_mle_language_model.py | 18 ++--- scripts/train/train_nn_line_classifier_law.py | 14 ++-- scripts/train/train_paragraph_classifier.py | 10 +-- scripts/train/train_txtlayer_classifier.py | 22 +++--- scripts/train/train_tz_line_classifier.py | 10 +-- 21 files changed, 242 insertions(+), 248 deletions(-) diff --git a/.flake8 b/.flake8 index 8d7e241f..4c24c4a3 100644 --- a/.flake8 +++ b/.flake8 @@ -14,7 +14,6 @@ exclude = .github, *__init__.py, resources, - scripts, venv, build, dedoc.egg-info diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7c5d542c..76ee04b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info + exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info args: - "--config=.flake8" additional_dependencies: [ diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 4492af03..69276d70 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -33,18 +33,18 @@ def get_cpu_performance() -> float: cpu_performance = get_cpu_performance() -print('"cpu_performance" = {}'.format(cpu_performance)) +print(f'"cpu_performance" = {cpu_performance}') # noqa with TemporaryDirectory() as path_base: path_out = os.path.join(path_base, "dataset.zip") wget.download(data_url, path_out) - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(path_base) - print(path_base) + print(path_base) # noqa failed = [] result = OrderedDict() - result["version"] = requests.get("{}/version".format(host)).text + result["version"] = requests.get(f"{host}/version").text result["cpu_performance"] = cpu_performance tasks = [ Task("images", "images", {}), @@ -60,7 +60,7 @@ def get_cpu_performance() -> float: Task("pdf", "pdf", {"pdf_with_text_layer": "false"}), Task("pdf_tables", "pdf_tables", {}) ] - print(tasks) + print(tasks) # noqa for directory, name, parameters in tasks: total_size = 0 total_time = 0 @@ -90,5 +90,5 @@ def get_cpu_performance() -> float: with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print("save result in" + path_result) - print(failed) + print(f"save result in {path_result}") # noqa + print(failed) # noqa diff --git a/scripts/benchmark_pdf_attachments.py b/scripts/benchmark_pdf_attachments.py index 411f1275..f069685c 100644 --- a/scripts/benchmark_pdf_attachments.py +++ b/scripts/benchmark_pdf_attachments.py @@ -39,7 +39,7 @@ def 
get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") + print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") # noqa result_dict[file_name] = sorted(attachment_names) return result_dict @@ -70,7 +70,7 @@ def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_d shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") + print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") # noqa result_dict[file_name] = sorted(attachment_names) return result_dict @@ -99,9 +99,9 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i zip_ref.extractall(data_dir) os.remove(archive_path) - print(f"Benchmark data downloaded to {data_dir}") + print(f"Benchmark data downloaded to {data_dir}") # noqa else: - print(f"Use cached benchmark data from {data_dir}") + print(f"Use cached benchmark data from {data_dir}") # noqa in_dir = os.path.join(data_dir, "with_attachments") out_dir = os.path.join(in_dir, "extracted_attachments") @@ -112,17 +112,17 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i benchmarks_dict = {} - print("Get tabby attachments") + print("Get tabby attachments") # noqa tabby_reader = PdfTabbyReader(config={}) tabby_out_dir = os.path.join(out_dir, "tabby") benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir) - print("Get pdfminer attachments") + print("Get pdfminer attachments") # noqa pdfminer_reader = PdfTxtlayerReader(config={}) pdfminer_out_dir = os.path.join(out_dir, "pdfminer") benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir) - print("Get common attachments") + print("Get common attachments") # noqa common_out_dir = os.path.join(out_dir, "common") pdf_attachments_extractor = PDFAttachmentsExtractor(config={}) benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir) @@ -131,4 +131,4 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f: json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2) - print(f"Attachments were extracted to {out_dir}") + print(f"Attachments were extracted to {out_dir}") # noqa diff --git a/scripts/benchmark_pdf_miner.py b/scripts/benchmark_pdf_miner.py index b8870ed5..b7c5d785 100644 --- a/scripts/benchmark_pdf_miner.py +++ b/scripts/benchmark_pdf_miner.py @@ -24,16 +24,16 @@ wget.download(URL, pdfs_zip_path) wget.download(URL_GT, pdfs_zip_gt_path) - with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(pdfs_zip_path) - with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref: + with zipfile.ZipFile(pdfs_zip_gt_path, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(pdfs_zip_gt_path) - print(f"Benchmark data downloaded to {data_dir}") + print(f"Benchmark data 
downloaded to {data_dir}") # noqa else: - print(f"Use cached benchmark data from {data_dir}") + print(f"Use cached benchmark data from {data_dir}") # noqa pdfs_path = data_dir / "PdfMiner Params" pdfs_gt_path = data_dir / "PdfMiner Params GT" @@ -53,7 +53,7 @@ accuracy_path = Path(tmpdir) / "accuracy.txt" if accuracy_path.exists(): accuracy_path.unlink() - command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}" + command = f'{accuracy_script_path} "{gt_path}" {tmp_ocr_path} >> {accuracy_path}' os.system(command) with open(accuracy_path, "r") as f: @@ -68,4 +68,4 @@ with (Path(output_dir) / "benchmark_pdf_miner.json").open("w") as f: json.dump(info, f, ensure_ascii=False, indent=2) - print(f"save result in {output_dir}") + print(f"save result in {output_dir}") # noqa diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index c6cbd7cb..8d5b7a81 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -1,8 +1,9 @@ -import zipfile -from pathlib import Path import json import pprint -from typing import Optional, List +import zipfile +from pathlib import Path +from typing import List, Optional + import numpy as np import wget @@ -47,7 +48,7 @@ def get_tables(image_path: Path) -> str: def make_predict_json(data_path: Path) -> dict: predict_json = {} for pathname in Path.iterdir(data_path): - print(pathname) + print(pathname) # noqa predict_json[pathname.name] = {"html": "<html><body>" + get_tables(pathname) + "</body></html>"} @@ -56,18 +57,18 @@ def make_predict_json(data_path: Path) -> dict: def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: if Path.exists(data_dir): - print(f"Use cached benchmark data from {data_dir}") + print(f"Use cached benchmark data from {data_dir}") # noqa return data_dir.mkdir(parents=True, exist_ok=True) pdfs_zip_path = data_dir / name_zip wget.download(url, str(data_dir)) - with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref: + with zipfile.ZipFile(pdfs_zip_path, "r") as zip_ref: zip_ref.extractall(data_dir) pdfs_zip_path.unlink() - print(f"Benchmark data downloaded to {data_dir}") + print(f"Benchmark data downloaded to {data_dir}") # noqa def prediction(path_pred: Path, path_images: Path) -> dict: @@ -83,19 +84,17 @@ def benchmark_on_our_data() -> dict: path_images = data_dir / "images" path_gt = data_dir / "gt.json" path_pred = data_dir / "pred.json" - download_dataset(data_dir, - name_zip="benchmark_table_data.zip", - url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download") + download_dataset(data_dir, name_zip="benchmark_table_data.zip", url="https://at.ispras.ru/owncloud/index.php/s/Xaf4OyHj6xN2RHH/download") mode_metric_structure_only = False with open(path_gt, "r") as fp: gt_json = json.load(fp) - ''' + """ Creating base html (based on method predictions for future labeling) path_images = data_dir / "images_tmp" pred_json = prediction("gt_tmp.json", path_images) - ''' + """ pred_json = prediction(path_pred, path_images) scores = call_metric(pred_json=pred_json, true_json=gt_json, structure_only=mode_metric_structure_only) @@ -113,7 +112,7 @@ def benchmark_on_generated_table() -> dict: Article generation information https://arxiv.org/pdf/1905.13391.pdf Note: generate the 1st table tape category Note: don't use header table tag <th>, replacing on tag <td> - Note: all generated data (four categories) you can download from + TODO: some tables have a low quality.
Should to trace the reason. All generated data (all categories) we can download from https://at.ispras.ru/owncloud/index.php/s/cjpCIR7I0G4JzZU """ @@ -129,7 +128,7 @@ def benchmark_on_generated_table() -> dict: # make common ground-truth file common_gt_json = {} for pathname in Path.iterdir(path_gt): - image_name = pathname.name.split(".")[0] + '.png' + image_name = pathname.name.split(".")[0] + ".png" with open(pathname, "r") as fp: table_html = fp.read() # exclude header tags @@ -146,9 +145,7 @@ def benchmark_on_generated_table() -> dict: path_pred = data_dir / "pred.json" pred_json = prediction(path_pred, path_images) - scores = call_metric(pred_json=pred_json, true_json=common_gt_json, - structure_only=mode_metric_structure_only, - ignore_nodes=['span', 'style', 'head', 'h4']) + scores = call_metric(pred_json=pred_json, true_json=common_gt_json, structure_only=mode_metric_structure_only, ignore_nodes=["span", "style", "head", "h4"]) result = dict() result["mode_metric_structure_only"] = mode_metric_structure_only diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index ff84a4a7..d306247d 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -11,17 +11,18 @@ # Source: https://github.com/ibm-aur-nlp/PubTabNet +from collections import deque +from typing import Optional + import distance from apted import APTED, Config from apted.helpers import Tree from lxml import etree, html -from collections import deque - from tqdm import tqdm class TableTree(Tree): - def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, *children): + def __init__(self, tag: str, colspan=None, rowspan=None, content=None, visible=None, *children): # noqa self.tag = tag self.colspan = colspan self.rowspan = rowspan @@ -29,10 +30,11 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, visible=None, self.visible = visible self.children = list(children) - def bracket(self): - """Show tree using brackets notation + def bracket(self) -> str: """ - if self.tag == "td" or self.tag == 'th': + Show tree using brackets notation + """ + if self.tag == "td" or self.tag == "th": result = f'"tag": {self.tag}, "colspan": {self.colspan}, "rowspan": {self.rowspan}, "text": {self.content}' else: result = f'"tag": {self.tag}' @@ -43,18 +45,22 @@ def bracket(self): class CustomConfig(Config): @staticmethod - def maximum(*sequences): - """Get maximum possible value + def maximum(*sequences): # noqa + """ + Get maximum possible value """ return max(map(len, sequences)) - def normalized_distance(self, *sequences) -> float: - """Get distance from 0 to 1 + def normalized_distance(self, *sequences) -> float: # noqa + """ + Get distance from 0 to 1 """ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) def rename(self, node1: TableTree, node2: TableTree) -> float: - """Compares attributes of trees""" + """ + Compares attributes of trees + """ if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): return 1. 
if node1.tag == "td": @@ -66,18 +72,20 @@ def rename(self, node1: TableTree, node2: TableTree) -> float: class TEDS(object): - """ Tree Edit Distance based Similarity + """ + Tree Edit Distance based Similarity """ - def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): + def __init__(self, structure_only: bool = False, n_jobs: int = 1, ignore_nodes: Optional[list] = None) -> None: assert isinstance(n_jobs, int) and (n_jobs >= 1), "n_jobs must be an integer greather than 1" self.structure_only = structure_only self.n_jobs = n_jobs self.ignore_nodes = ignore_nodes self.__tokens__ = [] - def tokenize(self, node): - """ Tokenizes table cells + def tokenize(self, node: TableTree) -> None: + """ + Tokenizes table cells """ self.__tokens__.append(f"<{node.tag}>") if node.text is not None: @@ -89,11 +97,11 @@ def tokenize(self, node): if node.tag != "td" and node.tail is not None: self.__tokens__ += list(node.tail) - def get_span(self, node, name_span: str) -> int: + def get_span(self, node: TableTree, name_span: str) -> int: value = int(node.attrib.get(name_span, "1")) return 1 if value <= 0 else value - def load_html_tree(self, node, parent=None): + def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> TableTree: """ Converts HTML tree to the format required by apted """ if node.tag == "td": @@ -109,9 +117,9 @@ def load_html_tree(self, node, parent=None): colspan=self.get_span(node, "colspan"), rowspan=self.get_span(node, "rowspan"), content=cell, - visible=False if node.attrib.get("style") == "display: none" else True, *deque()) + visible=False if node.attrib.get("style") == "display: none" else True, *deque()) # noqa except Exception as ex: - print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") + print(f"Bad html file. HTML parse exception. 
Exception's msg: {ex}") # noqa raise ex else: new_node = TableTree(node.tag, None, None, None, True, *deque()) @@ -148,12 +156,13 @@ def evaluate(self, pred: str, true: str) -> float: else: return 0.0 - def batch_evaluate(self, pred_json, true_json): - """ Computes TEDS score between the prediction and the ground truth of - a batch of samples - @params pred_json: {'FILENAME': 'HTML CODE', ...} - @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} - @output: {'FILENAME': 'TEDS SCORE', ...} + def batch_evaluate(self, pred_json: dict, true_json: dict) -> dict: + """ + Computes TEDS score between the prediction and the ground truth of a batch of samples + + :param pred_json: {'FILENAME': 'HTML CODE', ...} + :param true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} + :return: {'FILENAME': 'TEDS SCORE', ...} """ samples = true_json.keys() scores = [self.evaluate(pred_json.get(filename, "")["html"], true_json[filename]["html"]) for filename in tqdm(samples)] diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index d237e608..50dec7ae 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -15,7 +15,7 @@ path_result = os.path.join(path_result, "benchmarks_tl_correctness.json") host = "http://localhost:1231" -param_dist_errors = namedtuple('Param', ('total_file_size', 'total_incorrect_files', 'failed')) +param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed")) def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple: @@ -24,7 +24,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para directory = os.path.join(path_base, tl_path) files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")] total_file_size = len(files_list) - print(f"Files: {files_list}\nFiles number: {total_file_size}") + print(f"Files: {files_list}\nFiles number: {total_file_size}") # noqa for file in tqdm(files_list): file_path = os.path.join(directory, file) r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) @@ -49,12 +49,12 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para if not os.path.isdir(benchmark_data_dir): path_out = os.path.join(data_dir, "data_with_text_layer.zip") wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out) - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Benchmark data downloaded to {benchmark_data_dir}") + print(f"Benchmark data downloaded to {benchmark_data_dir}") # noqa else: - print(f"Use cached benchmark data from {benchmark_data_dir}") + print(f"Use cached benchmark data from {benchmark_data_dir}") # noqa assert os.path.isdir(benchmark_data_dir) @@ -63,15 +63,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para parameters = dict(pdf_with_text_layer="auto", pages="1:1") result_item = OrderedDict() - incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' incorrect ', 'data_correct_text_layer', parameters) + incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters) result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size result_item["list_of_file_with_incorrect_tl"] = 
incorrect_tl_result.failed - correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' correct ', 'data_incorrect_text_layer', parameters) + correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters) result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed result["guessing_the_correctness_of_the_text"] = result_item with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print("Save result in" + path_result) + print(f"Save result in {path_result}") # noqa diff --git a/scripts/create_txtlayer_dataset.py b/scripts/create_txtlayer_dataset.py index 2e1db6ff..a26c7995 100644 --- a/scripts/create_txtlayer_dataset.py +++ b/scripts/create_txtlayer_dataset.py @@ -17,9 +17,9 @@ class CorrectTextGenerator: def __init__(self) -> None: - self.citation = re.compile(r'\[\d+]') - self.meta = re.compile(r'\[править \| править код]') - self.symbols = re.compile(r'[→←↑]') + self.citation = re.compile(r"\[\d+]") + self.meta = re.compile(r"\[править \| править код]") + self.symbols = re.compile(r"[→←↑]") self.title_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=query&format=json&list=random&rnlimit=1&rnnamespace=0" self.article_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=parse&format=json&page={title}&prop=text" @@ -37,15 +37,15 @@ def get_random_text(self, lang: str) -> str: # 2 - Get text the article article_result = requests.post(self.article_url.format(lang=lang, title=title)) article_result_dict = article_result.json() - article = article_result_dict["parse"]["text"]['*'] - bs = BeautifulSoup(article, 'html.parser') + article = article_result_dict["parse"]["text"]["*"] + bs = BeautifulSoup(article, "html.parser") article_text = bs.get_text() # 3 - Clear text of the article from unused symbols - article_text_fixed = re.sub(self.citation, '', article_text) + article_text_fixed = re.sub(self.citation, "", article_text) article_text_fixed = re.sub(self.meta, "", article_text_fixed) article_text_fixed = re.sub(self.symbols, "", article_text_fixed) - article_text_fixed = re.sub(r'\n+', "\n", article_text_fixed) + article_text_fixed = re.sub(r"\n+", "\n", article_text_fixed) except: # noqa article_text_fixed = "" @@ -62,18 +62,22 @@ class EncodingCorruptor(Corruptor): def __init__(self) -> None: self.encodings = { "en": { - "input": ['cp1026'], - "output": ['cp1256', 'cp437', 'cp775', 'cp852', 'cp855', 'cp857', 'cp860', 'cp861', 'cp862', 'cp863', 'cp866', 'gb18030', 'hp_roman8', - 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'koi8_r', - 'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman'] + "input": ["cp1026"], + "output": [ + "cp1256", "cp437", "cp775", "cp852", "cp855", "cp857", "cp860", "cp861", "cp862", "cp863", "cp866", "gb18030", "hp_roman8", + "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "koi8_r", + "mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman" + ] }, "ru": { - "input": ['cp855', 'cp866', 'gb18030', 'iso8859_5', 'koi8_r', 'mac_cyrillic', 'utf_8'], - "output": ['cp1026', 'cp1256', 'cp437', 'cp775', 'cp850', 'cp852', 'cp863', 'cp866', 'hp_roman8', 'iso8859_10', 'iso8859_11', - 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 
'iso8859_9', 'koi8_r', - 'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman', 'cp1140', 'cp273', 'cp855', 'cp860', 'cp861', 'cp857', 'cp500', - 'cp862', 'gb18030'] + "input": ["cp855", "cp866", "gb18030", "iso8859_5", "koi8_r", "mac_cyrillic", "utf_8"], + "output": [ + "cp1026", "cp1256", "cp437", "cp775", "cp850", "cp852", "cp863", "cp866", "hp_roman8", "iso8859_10", "iso8859_11", + "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_4", "iso8859_5", "iso8859_9", "koi8_r", + "mac_cyrillic", "mac_greek", "mac_latin2", "mac_roman", "cp1140", "cp273", "cp855", "cp860", "cp861", "cp857", "cp500", + "cp862", "gb18030" + ] } } @@ -196,7 +200,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: os.makedirs(os.path.join(args.out_dir, args.incorrect_dir), exist_ok=True) i = args.start_number - print("Generating incorrect texts") + print("Generating incorrect texts") # noqa for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): text = "" @@ -207,7 +211,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: corruptor = random.choice(corruptor_list) text = corruptor.corrupt(text, lang=language) except Exception as e: - print(e) + print(e) # noqa text = "" with open(os.path.join(args.out_dir, args.incorrect_dir, f"{i:08d}_{language}.txt"), "w") as f: @@ -215,7 +219,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: i += 1 i = args.start_number - print("Generating correct texts") + print("Generating correct texts") # noqa for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): diff --git a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py index b32f3f08..ce77a128 100644 --- a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py +++ b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py @@ -77,14 +77,16 @@ def _get_avg(array: List) -> float: def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: - return [_get_avg(statistics[dataset]["ASCII_Spacing_Characters"]), - _get_avg(statistics[dataset]["ASCII_Special_Symbols"]), - _get_avg(statistics[dataset]["ASCII_Digits"]), - _get_avg(statistics[dataset]["ASCII_Uppercase_Letters"]), - _get_avg(statistics[dataset]["Latin1_Special_Symbols"]), - _get_avg(statistics[dataset]["Cyrillic"]), - sum(statistics[dataset]["Amount of words"]), - _get_avg(statistics[dataset]["Accuracy"])] + return [ + _get_avg(statistics[dataset]["ASCII_Spacing_Characters"]), + _get_avg(statistics[dataset]["ASCII_Special_Symbols"]), + _get_avg(statistics[dataset]["ASCII_Digits"]), + _get_avg(statistics[dataset]["ASCII_Uppercase_Letters"]), + _get_avg(statistics[dataset]["Latin1_Special_Symbols"]), + _get_avg(statistics[dataset]["Cyrillic"]), + sum(statistics[dataset]["Amount of words"]), + _get_avg(statistics[dataset]["Accuracy"]) + ] def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: @@ -106,7 +108,7 @@ def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: def __parse_ocr_errors(lines: List[str]) -> List: ocr_errors = [] matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] - for num, line in enumerate(lines[matched_errors[0] + 1:]): + for line in lines[matched_errors[0] + 1:]: # example line: " 2 0 { 6}-{б}" errors = re.findall(r"(\d+)", line)[0] chars = re.findall(r"{(.*)}-{(.*)}", line)[0] @@ -158,8 +160,12 @@ def __get_summary_symbol_error(path_reports: str) -> Texttable: def 
__create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] - accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", - "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]] + accs_common = [ + [ + "Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", + "Amount of words", "AVG Accuracy" + ] + ] table_accuracy_per_image = Texttable() accs.extend(accuracy_values) @@ -258,10 +264,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: - print(ex) - print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") + print(ex) # noqa + print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") # noqa - print(f"Time mean correction ocr = {np.array(correction_times).mean()}") + print(f"Time mean correction ocr = {np.array(correction_times).mean()}") # noqa table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) return table_common, table_accuracy_per_image @@ -277,9 +283,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") + print(f"Benchmark data downloaded to {benchmark_data_path}") # noqa else: - print(f"Use cached benchmark data from {benchmark_data_path}") + print(f"Use cached benchmark data from {benchmark_data_path}") # noqa assert os.path.isfile(benchmark_data_path) table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) @@ -289,14 +295,14 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c with open(os.path.join(output_dir, f"tesseract_benchmark{USE_CORRECTION_OCR}.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\n") res_file.write(f"Correction step: {USE_CORRECTION_OCR}\n") - res_file.write(f"\nTable 1 - Accuracy for each file\n") + res_file.write("\nTable 1 - Accuracy for each file\n") res_file.write(table_accuracy_per_image.draw()) - res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") + res_file.write("\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) - res_file.write(f"\n\nTable 3 -OCR error by symbol:\n") + res_file.write("\n\nTable 3 -OCR error by symbol:\n") res_file.write(table_errors.draw()) - print(f"Tesseract version is {pytesseract.get_tesseract_version()}") - print(table_accuracy_per_image.draw()) - print(table_common.draw()) - print(table_errors.draw()) + print(f"Tesseract version is {pytesseract.get_tesseract_version()}") # noqa + print(table_accuracy_per_image.draw()) # noqa + print(table_common.draw()) # noqa + print(table_errors.draw()) # noqa diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py index ada563e2..82160731 100644 --- 
a/scripts/tesseract_benchmark/ocr_correction.py +++ b/scripts/tesseract_benchmark/ocr_correction.py @@ -2,11 +2,11 @@ from typing import Tuple import torch -from sage.spelling_correction.corrector import Corrector from sage.spelling_correction import AvailableCorrectors from sage.spelling_correction import RuM2M100ModelForSpellingCorrection +from sage.spelling_correction.corrector import Corrector -''' +""" Install sage library (for ocr correction step): git clone https://github.com/ai-forever/sage.git cd sage @@ -14,7 +14,7 @@ pip install -r requirements.txt Note: sage use 5.2 Gb GPU ...... -''' +""" USE_GPU = True @@ -35,9 +35,7 @@ def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) if torch.cuda.is_available() and USE_GPU: corrector.model.to(torch.device("cuda:0")) - print("use CUDA") + print("use CUDA") # noqa else: - print("use CPU") + print("use CPU") # noqa return corrector, corrected_path - - diff --git a/scripts/tesseract_benchmark/text_blob_correction.py b/scripts/tesseract_benchmark/text_blob_correction.py index 8ecf8be6..73e8d70e 100644 --- a/scripts/tesseract_benchmark/text_blob_correction.py +++ b/scripts/tesseract_benchmark/text_blob_correction.py @@ -2,7 +2,7 @@ class TextBlobCorrector: - def __init__(self): + def __init__(self) -> None: return def correct(self, text: str) -> str: diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 888c3273..9f65e590 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -28,11 +28,16 @@ def __extract_conf_annotation(self, anns_conf: List[dict], ann_bbox: dict, text: interval = e - b if interval > 0: confs.append(ann_conf["value"]) - debug.append({f"{ann_conf['value']}[{b}:{e}]": [ - interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}"]}) + debug.append( + { + f"{ann_conf['value']}[{b}:{e}]": [ + interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}" + ] + } + ) if DETAILED_DEBUG: - print(debug) + print(debug) # noqa return confs @@ -44,10 +49,15 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t interval = e - b if interval > 0: text_type = ann_type["value"] - debug.append({f"{ann_type['value']}:{b}:{e}": [ - interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}"]}) + debug.append( + { + f"{ann_type['value']}:{b}:{e}": [ + interval, f"bbox:[{ann_bbox['start']}:{ann_bbox['end']}], {text[ann_bbox['start']:ann_bbox['end']]}" + ] + } + ) if DETAILED_DEBUG: - print(debug) + print(debug) # noqa return text_type @@ -67,8 +77,9 @@ def __get_words_annotation(self, structure: dict) -> List[BboxWithConfsType]: confs = self.__extract_conf_annotation(anns_conf, ann_bbox, node["text"]) text_type = self.__extract_texttype_annotation(anns_type, ann_bbox, node["text"]) - words_annotation.append(BboxWithConfsType(start=ann_bbox["start"], end=ann_bbox["end"], bbox=ann_bbox["value"], confs=confs, - text_type=text_type)) + words_annotation.append( + BboxWithConfsType(start=ann_bbox["start"], end=ann_bbox["end"], bbox=ann_bbox["value"], confs=confs, text_type=text_type) + ) stack.extend(node["subparagraphs"]) @@ -91,13 +102,13 @@ def __get_words_annotation_from_cell(self, table: dict) -> List[BboxWithConfsTyp return words_annotation def 
__normalize_font_thickness(self, image: np.ndarray) -> Tuple[float, int]: - FONT_SCALE = 6e-4 - THICKNESS_SCALE = 1e-3 + font_scale = 6e-4 + thickness_scale = 1e-3 height, width, _ = image.shape - font_scale = min(width, height) * FONT_SCALE - thickness = math.ceil(min(width, height) * THICKNESS_SCALE) + font = min(width, height) * font_scale + thickness = math.ceil(min(width, height) * thickness_scale) - return font_scale, thickness + return font, thickness def __rotate_coordinate(self, x: int, y: int, xc: float, yc: float, angle: float) -> Tuple[int, int]: rad = angle * math.pi / 180 @@ -123,8 +134,10 @@ def __draw_word_annotations(self, image: np.ndarray, word_annotations: List[Bbox cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0)) text = ",".join(ann.confs) if ann.confs != [] else "None" - cv2.putText(image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])), - cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 255), thickness) + cv2.putText( + image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])), cv2.FONT_HERSHEY_SIMPLEX, + font_scale, (0, 0, 255), thickness + ) return image def __draw_tables_words(self, tables: List[dict], image: np.ndarray) -> np.ndarray: @@ -135,9 +148,9 @@ def __draw_tables_words(self, tables: List[dict], image: np.ndarray) -> np.ndarr image = self.__draw_word_annotations(image, word_annotations, angle=table_angle) return image - def test_pdf_documents(self): + def test_pdf_documents(self) -> None: filename_parameters_outputdir = [ - ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="true"), "pdfminer_reader"], + ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="true"), "pdfminer_reader"], ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="tabby"), "tabby_reader"] ] @@ -158,12 +171,13 @@ def test_pdf_documents(self): image = self.__draw_tables_words(tables, image) cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image) - def test_table_word_extraction(self): - output_path = os.path.join(self.output_path, 'tables') + def test_table_word_extraction(self) -> None: + output_path = os.path.join(self.output_path, "tables") os.makedirs(output_path, exist_ok=True) - file_names = ["tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg", - "tables/example_with_table6.png", "tables/example_with_table_horizontal_union.jpg", - "scanned/orient_1.png", "tables/rotated_table.png"] + file_names = [ + "tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg", "tables/example_with_table6.png", + "tables/example_with_table_horizontal_union.jpg", "scanned/orient_1.png", "tables/rotated_table.png" + ] for file_name in file_names: result = self._send_request(file_name, data=dict()) @@ -182,7 +196,7 @@ def test_table_word_extraction(self): if len(tables) > 0: image = self.__draw_tables_words(tables, image) - cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image) + cv2.imwrite(os.path.join(output_path, file_name.split("/")[-1]), image) def test_document_table_split_last_column(self) -> None: filename_to_parameters = { diff --git a/scripts/train/train_acc_orientation_classifier.py b/scripts/train/train_acc_orientation_classifier.py index 980ec884..497a5183 100644 --- a/scripts/train/train_acc_orientation_classifier.py +++ b/scripts/train/train_acc_orientation_classifier.py @@ 
-8,24 +8,20 @@ from torch import optim from torch.utils.data import DataLoader +from dedoc.config import get_config from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.dataset_executor import DataLoaderImageOrient parser = argparse.ArgumentParser() -checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), - "../../resources/efficient_net_b0_fixed.pth")) -checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), - "../../resources/efficient_net_b0_fixed.pth")) +checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) +checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) checkpoint_path = "../../resources" parser.add_argument("-t", "--train", type=bool, help="run for train model", default=False) -parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", - default=checkpoint_path_save) -parser.add_argument("-l", "--checkpoint_load", help="Path to checkpoint for load", - default=checkpoint_path_load) +parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", default=checkpoint_path_save) +parser.add_argument("-l", "--checkpoint_load", help="Path to checkpoint for load", default=checkpoint_path_load) parser.add_argument("-f", "--from_checkpoint", type=bool, help="run for train model", default=True) -parser.add_argument("-d", "--input_data_folder", help="Path to data with folders train or test", - default="/home/nasty/data/columns_orientation") +parser.add_argument("-d", "--input_data_folder", help="Path to data with folders train or test") args = parser.parse_args() BATCH_SIZE = 1 @@ -41,23 +37,20 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri """ net_executor.net.eval() testloader = data_executor.load_dataset( - csv_path=os.path.join(args.input_data_folder, 'test/labels.csv'), + csv_path=os.path.join(args.input_data_folder, "test/labels.csv"), image_path=args.input_data_folder, batch_size=BATCH_SIZE ) dataiter = iter(testloader) sample = dataiter.__next__() - _, orientation, columns = sample['image'], sample['orientation'], sample['columns'] + _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] - print('GroundTruth: orientation {}, columns {}'.format(orientation, columns)) + print(f"GroundTruth: orientation {orientation}, columns {columns}") # noqa calc_accuracy_by_classes(testloader, data_executor.classes, net_executor, batch_size=1) -def calc_accuracy_by_classes(testloader: DataLoader, - classes: List, - classifier: ColumnsOrientationClassifier, - batch_size: int = 1) -> None: +def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: ColumnsOrientationClassifier, batch_size: int = 1) -> None: """ Function calculates accuracy ba each class :param testloader: DataLoader @@ -66,13 +59,13 @@ def calc_accuracy_by_classes(testloader: DataLoader, :param batch_size: size of batch :return: """ - class_correct = list(0. for i in range(len(classes))) - class_total = list(0. for i in range(len(classes))) + class_correct = list(0. for _ in range(len(classes))) + class_total = list(0. 
for _ in range(len(classes))) time_predict = 0 cnt_predict = 0 with torch.no_grad(): for data in testloader: - images, orientation, columns = data['image'], data['orientation'], data['columns'] + images, orientation, columns = data["image"], data["orientation"], data["columns"] time_begin = time() outputs = classifier.net(images.float().to(classifier.device)) @@ -97,21 +90,16 @@ def calc_accuracy_by_classes(testloader: DataLoader, class_correct[columns_i] += orientation_bool_predict class_total[columns_i] += 1 if not orientation_bool_predict or not columns_bool_predict: - print('{} predict as \norientation: {} \ncolumns: {}'.format(data['image_name'][i], - classes[2 + orientation_predicted[i]], - classes[columns_predicted[i]])) + print( # noqa + f'{data["image_name"][i]} predict as \norientation: {classes[2 + orientation_predicted[i]]} \ncolumns: {classes[columns_predicted[i]]}' + ) for i in range(len(classes)): - print('Accuracy of %5s : %2d %%' % ( - classes[i], 100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0)) - print('=== AVG Time predict {}'.format(time_predict / cnt_predict)) + print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") # noqa + print(f"=== AVG Time predict {time_predict / cnt_predict}") # noqa -def train_model(trainloader: DataLoader, - checkpoint_path_save: str, - classifier: ColumnsOrientationClassifier, - epoch_cnt: int = 7, - save_step: int = 500) -> None: +def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: ColumnsOrientationClassifier, epoch_cnt: int = 7, save_step: int = 500) -> None: """ Function for train orientation classifier :param trainloader: DataLoader @@ -128,7 +116,7 @@ def train_model(trainloader: DataLoader, running_loss = 0.0 for i, data in enumerate(trainloader, 0): # get the inputs; data is a list of [inputs, labels] - inputs, orientation, columns = data['image'], data['orientation'], data['columns'] + inputs, orientation, columns = data["image"], data["orientation"], data["columns"] # zero the parameter gradients optimizer.zero_grad() @@ -136,17 +124,14 @@ def train_model(trainloader: DataLoader, # forward + backward + optimize outputs = classifier.net(inputs.float().to(classifier.device)) - loss = criterion(outputs[:, :2], - columns.to(classifier.device)) + criterion(outputs[:, 2:], - orientation.to(classifier.device)) + loss = criterion(outputs[:, :2], columns.to(classifier.device)) + criterion(outputs[:, 2:], orientation.to(classifier.device)) loss.backward() optimizer.step() running_loss += loss.item() # print statistics if i % 100 == 99: - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 100)) + print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}") # noqa running_loss = 0.0 # save checkpoint @@ -154,14 +139,14 @@ def train_model(trainloader: DataLoader, classifier.save_weights(checkpoint_path_save) classifier.save_weights(checkpoint_path_save) - print('Finished Training') + print("Finished Training") # noqa def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientationClassifier) -> None: classifier.net.train() # Part 1 - load datas trainloader = data_executor.load_dataset( - csv_path=os.path.join(args.input_data_folder, 'train/labels.csv'), + csv_path=os.path.join(args.input_data_folder, "train/labels.csv"), image_path=args.input_data_folder, batch_size=BATCH_SIZE ) @@ -169,22 +154,20 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat # get some 
random training images dataiter = iter(trainloader) sample = dataiter.__next__() - _, orientation, columns = sample['image'], sample['orientation'], sample['columns'] + _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] # print labels - print(' '.join('%5s' % data_executor.classes[orientation[j]] for j in range(BATCH_SIZE))) - print(' '.join('%5s' % data_executor.classes[columns[j]] for j in range(BATCH_SIZE))) + print(" ".join(f"{data_executor.classes[orientation[j]]:5s}" for j in range(BATCH_SIZE))) # noqa + print(" ".join(f"{data_executor.classes[columns[j]]:5s}" for j in range(BATCH_SIZE))) # noqa # Part 2 - train model train_model(trainloader, args.checkpoint_save, classifier) if __name__ == "__main__": - from dedoc.config import _config as config + config = get_config() data_executor = DataLoaderImageOrient() - net = ColumnsOrientationClassifier(on_gpu=True, - checkpoint_path=checkpoint_path if not args.train else '', - config=config) + net = ColumnsOrientationClassifier(on_gpu=True, checkpoint_path=checkpoint_path if not args.train else "", config=config) if args.train: train_step(data_executor, net) else: diff --git a/scripts/train/train_diploma_line_classifier.py b/scripts/train/train_diploma_line_classifier.py index 812c1681..34643c3a 100644 --- a/scripts/train/train_diploma_line_classifier.py +++ b/scripts/train/train_diploma_line_classifier.py @@ -17,16 +17,14 @@ def skip_labels(label: str) -> Optional[str]: classifier_name = "diploma_classifier" -clf_resources_path = os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources", "line_type_classifiers") +clf_resources_path = os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources", "line_type_classifiers") os.makedirs(clf_resources_path, exist_ok=True) resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(clf_resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", "{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(clf_resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = DiplomaFeatureExtractor() classifier_parameters = dict(learning_rate=0.5, @@ -52,4 +50,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False, no_cache=False) -print("successfully train diploma classifier") +print("successfully train diploma classifier") # noqa diff --git a/scripts/train/train_law_line_classifier.py b/scripts/train/train_law_line_classifier.py index 5ccb0715..93780d36 100644 --- a/scripts/train/train_law_line_classifier.py +++ b/scripts/train/train_law_line_classifier.py @@ -32,11 +32,9 @@ def transform_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) classifier_name = "law_txt_classifier" if txt_classifier else "law_classifier" -path_out = os.path.join(resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", 
"{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = LawTextFeatures(text_features_only=txt_classifier) classifier_parameters = dict(learning_rate=0.8, @@ -75,4 +73,4 @@ def get_sample_weight(line: LineWithLabel) -> int: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") +print("successfully train law classifier") # noqa diff --git a/scripts/train/train_mle_language_model.py b/scripts/train/train_mle_language_model.py index 1c64ec4b..41c9882c 100644 --- a/scripts/train/train_mle_language_model.py +++ b/scripts/train/train_mle_language_model.py @@ -16,11 +16,11 @@ def tokenize_doc(text_layer: str) -> str: # converting to lowercase text_layer = text_layer.lower() # remove all the special characters - document = re.sub(r'\W', ' ', text_layer) + document = re.sub(r"\W", " ", text_layer) # remove all single characters - document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) + document = re.sub(r"\^[a-zA-Z]\s+", " ", document) # substituting multiple spaces with single space - document = re.sub(r'\s+', ' ', document, flags=re.I) + document = re.sub(r"\s+", " ", document, flags=re.I) return document @@ -39,27 +39,27 @@ def main() -> None: documents = [] for files in files_path_big_data: file = os.listdir(path_big_data + files) - print(files) + print(files) # noqa for writer in file: try: - with open(path_big_data + files + '/' + writer) as f: + with open(path_big_data + files + "/" + writer) as f: text = f.read() document = tokenize_doc(text) documents.append(document) - print(writer) + print(writer) # noqa break except Exception: - print(Exception) + print(Exception) # noqa pass documents = " ".join(documents) bigram_list = create_ngramm_list(documents, 2) train, vocab = padded_everygram_pipeline(2, [bigram_list]) - print(bigram_list) + print(bigram_list) # noqa language_model_mle = MLE(2) language_model_mle.fit(train, vocab) with open("n-gram_lang_model.pkl", "wb") as f: pickle.dump(language_model_mle, f) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/train/train_nn_line_classifier_law.py b/scripts/train/train_nn_line_classifier_law.py index dbdb58b7..84893653 100644 --- a/scripts/train/train_nn_line_classifier_law.py +++ b/scripts/train/train_nn_line_classifier_law.py @@ -31,13 +31,11 @@ def transform_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) classifier_name = "law_txt_classifier" if txt_classifier else "law_classifier" -path_out = os.path.join(resources_path, "{}_nn.pkl.gz".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances_nn.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}_nn.pkl.gz") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances_nn.xlsx") feature_extractor = LawTextFeatures(text_features_only=txt_classifier) -path_scores = os.path.join(resources_path, 
"benchmarks", "{}_nn_scores.json".format(classifier_name)) +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_nn_scores.json") classifier_parameters = dict(learning_rate=0.8, n_estimators=300, booster="gbtree", @@ -51,9 +49,7 @@ def get_sample_weight(line: LineWithLabel) -> float: text = line.line.lower().strip() regexps = LawTextFeatures.named_regexp application_regexp = LawTextFeatures.regexp_application_begin - regexp_weight = (50 - if any([regexp.match(text) for regexp in regexps]) or application_regexp.match(text.lower()) - else 1) + regexp_weight = (50 if any([regexp.match(text) for regexp in regexps]) or application_regexp.match(text.lower()) else 1) return regexp_weight * class_weight @@ -75,4 +71,4 @@ def get_sample_weight(line: LineWithLabel) -> float: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") +print("successfully train law classifier") # noqa diff --git a/scripts/train/train_paragraph_classifier.py b/scripts/train/train_paragraph_classifier.py index 64cbc331..658e5ff3 100644 --- a/scripts/train/train_paragraph_classifier.py +++ b/scripts/train/train_paragraph_classifier.py @@ -17,11 +17,9 @@ def skip_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", "{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = ParagraphFeatureExtractor(config=config) @@ -48,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train {} classifier".format(classifier_name)) +print(f"successfully train {classifier_name} classifier") # noqa diff --git a/scripts/train/train_txtlayer_classifier.py b/scripts/train/train_txtlayer_classifier.py index b203e9c9..12d6cdba 100644 --- a/scripts/train/train_txtlayer_classifier.py +++ b/scripts/train/train_txtlayer_classifier.py @@ -45,11 +45,11 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: with open(path, mode="r") as f: text = f.read() except Exception as e: - print(f'Bad file {str(e)}: {path}') + print(f"Bad file {str(e)}: {path}") # noqa continue if len(text.strip()) == 0: - print(f'Empty file: {path}') + print(f"Empty file: {path}") # noqa continue texts.append(text) @@ -66,12 +66,12 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: if not os.path.isdir(txtlayer_classifier_dataset_dir): path_out = os.path.join(data_dir, "data.zip") wget.download("https://at.ispras.ru/owncloud/index.php/s/z9WLFiKKFo2WMgW/download", path_out) - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") + print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") # noqa else: - print(f"Use cached dataset from 
{txtlayer_classifier_dataset_dir}") + print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}") # noqa assert os.path.isdir(txtlayer_classifier_dataset_dir) @@ -85,20 +85,16 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: stages_data[stage] = dict(features=features, labels=labels) clf = XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=600, booster="gbtree", tree_method="hist", max_depth=3) - clf.fit( - X=stages_data["train"]["features"], - y=stages_data["train"]["labels"], - eval_set=[(stages_data["val"]["features"], stages_data["val"]["labels"])], - ) + clf.fit(X=stages_data["train"]["features"], y=stages_data["train"]["labels"], eval_set=[(stages_data["val"]["features"], stages_data["val"]["labels"])]) test_preds = clf.predict(stages_data["test"]["features"]) score = f1_score(stages_data["test"]["labels"], test_preds) - print(f"F1 score = {score}") + print(f"F1 score = {score}") # noqa resources_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "..", "resources") - with gzip.open(os.path.join(resources_dir, 'txtlayer_classifier.pkl.gz'), 'wb') as file: + with gzip.open(os.path.join(resources_dir, "txtlayer_classifier.pkl.gz"), "wb") as file: pickle.dump(clf, file) xgbfir.saveXgbFI(clf, feature_names=features.columns, - OutputXlsxFile=os.path.join(resources_dir, "feature_importances", 'txtlayer_classifier_feature_importances.xlsx')) + OutputXlsxFile=os.path.join(resources_dir, "feature_importances", "txtlayer_classifier_feature_importances.xlsx")) diff --git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py index b35446e7..e4677c2e 100644 --- a/scripts/train/train_tz_line_classifier.py +++ b/scripts/train/train_tz_line_classifier.py @@ -18,11 +18,9 @@ def skip_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(resources_path, "{}.pkl.gz".format(classifier_name)) -path_scores = os.path.join(resources_path, "benchmarks", "{}_scores.json".format(classifier_name)) -path_feature_importances = os.path.join(resources_path, - "feature_importances", - "{}_feature_importances.xlsx".format(classifier_name)) +path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") +path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") feature_extractor = TzTextFeatures(text_features_only=txt_classifier) classifier_parameters = dict(learning_rate=0.5, @@ -48,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train tz classifier") +print("successfully train tz classifier") # noqa From 410b0ca51cb9cfe356d1c4f5d99ca05b35d8dc0a Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 1 Feb 2024 13:42:23 +0300 Subject: [PATCH 2/3] Review fixes --- .flake8 | 2 ++ scripts/benchmark.py | 10 +++++----- scripts/benchmark_pdf_attachments.py | 16 +++++++-------- scripts/benchmark_pdf_miner.py | 6 +++--- scripts/benchmark_table/benchmark_table.py | 6 +++--- scripts/benchmark_table/metric.py | 2 +- scripts/benchmark_tl_correctness.py | 8 ++++---- scripts/create_txtlayer_dataset.py | 6 +++--- .../calc_tesseract_benchmarks.py | 18 ++++++++--------- scripts/tesseract_benchmark/ocr_correction.py | 4 ++-- scripts/test_words_bbox_extraction.py 
| 4 ++-- .../train/train_acc_orientation_classifier.py | 14 ++++++------- .../train/train_diploma_line_classifier.py | 2 +- scripts/train/train_law_line_classifier.py | 2 +- scripts/train/train_mle_language_model.py | 8 ++++---- scripts/train/train_nn_line_classifier_law.py | 2 +- scripts/train/train_paragraph_classifier.py | 2 +- scripts/train/train_txtlayer_classifier.py | 10 +++++----- scripts/train/train_tz_line_classifier.py | 2 +- .../trainers/base_sklearn_line_classifier.py | 2 +- scripts/train/trainers/data_loader.py | 2 +- scripts/train/trainers/errors_saver.py | 4 ++-- .../trainers/line_lstm_classifier_trainer.py | 20 +++++++++---------- 23 files changed, 77 insertions(+), 75 deletions(-) diff --git a/.flake8 b/.flake8 index 4c24c4a3..96a1bcff 100644 --- a/.flake8 +++ b/.flake8 @@ -22,3 +22,5 @@ exclude = # ANN101 - type annotations for self ignore = ANN101 +per-file-ignores = + scripts/*:T201 diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 69276d70..12c3f104 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -33,14 +33,14 @@ def get_cpu_performance() -> float: cpu_performance = get_cpu_performance() -print(f'"cpu_performance" = {cpu_performance}') # noqa +print(f'"cpu_performance" = {cpu_performance}') with TemporaryDirectory() as path_base: path_out = os.path.join(path_base, "dataset.zip") wget.download(data_url, path_out) with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(path_base) - print(path_base) # noqa + print(path_base) failed = [] result = OrderedDict() @@ -60,7 +60,7 @@ def get_cpu_performance() -> float: Task("pdf", "pdf", {"pdf_with_text_layer": "false"}), Task("pdf_tables", "pdf_tables", {}) ] - print(tasks) # noqa + print(tasks) for directory, name, parameters in tasks: total_size = 0 total_time = 0 @@ -90,5 +90,5 @@ def get_cpu_performance() -> float: with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print(f"save result in {path_result}") # noqa - print(failed) # noqa + print(f"save result in {path_result}") + print(failed) diff --git a/scripts/benchmark_pdf_attachments.py b/scripts/benchmark_pdf_attachments.py index f069685c..411f1275 100644 --- a/scripts/benchmark_pdf_attachments.py +++ b/scripts/benchmark_pdf_attachments.py @@ -39,7 +39,7 @@ def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") # noqa + print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result") result_dict[file_name] = sorted(attachment_names) return result_dict @@ -70,7 +70,7 @@ def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_d shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name)) attachment_names.append(attachment_name) - print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") # noqa + print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result") result_dict[file_name] = sorted(attachment_names) return result_dict @@ -99,9 +99,9 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i zip_ref.extractall(data_dir) os.remove(archive_path) - print(f"Benchmark data downloaded to {data_dir}") # noqa + print(f"Benchmark data downloaded to {data_dir}") 
else: - print(f"Use cached benchmark data from {data_dir}") # noqa + print(f"Use cached benchmark data from {data_dir}") in_dir = os.path.join(data_dir, "with_attachments") out_dir = os.path.join(in_dir, "extracted_attachments") @@ -112,17 +112,17 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i benchmarks_dict = {} - print("Get tabby attachments") # noqa + print("Get tabby attachments") tabby_reader = PdfTabbyReader(config={}) tabby_out_dir = os.path.join(out_dir, "tabby") benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir) - print("Get pdfminer attachments") # noqa + print("Get pdfminer attachments") pdfminer_reader = PdfTxtlayerReader(config={}) pdfminer_out_dir = os.path.join(out_dir, "pdfminer") benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir) - print("Get common attachments") # noqa + print("Get common attachments") common_out_dir = os.path.join(out_dir, "common") pdf_attachments_extractor = PDFAttachmentsExtractor(config={}) benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir) @@ -131,4 +131,4 @@ def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: i with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f: json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2) - print(f"Attachments were extracted to {out_dir}") # noqa + print(f"Attachments were extracted to {out_dir}") diff --git a/scripts/benchmark_pdf_miner.py b/scripts/benchmark_pdf_miner.py index b7c5d785..7161541b 100644 --- a/scripts/benchmark_pdf_miner.py +++ b/scripts/benchmark_pdf_miner.py @@ -31,9 +31,9 @@ zip_ref.extractall(data_dir) os.remove(pdfs_zip_gt_path) - print(f"Benchmark data downloaded to {data_dir}") # noqa + print(f"Benchmark data downloaded to {data_dir}") else: - print(f"Use cached benchmark data from {data_dir}") # noqa + print(f"Use cached benchmark data from {data_dir}") pdfs_path = data_dir / "PdfMiner Params" pdfs_gt_path = data_dir / "PdfMiner Params GT" @@ -68,4 +68,4 @@ with (Path(output_dir) / "benchmark_pdf_miner.json").open("w") as f: json.dump(info, f, ensure_ascii=False, indent=2) - print(f"save result in {output_dir}") # noqa + print(f"save result in {output_dir}") diff --git a/scripts/benchmark_table/benchmark_table.py b/scripts/benchmark_table/benchmark_table.py index 8d5b7a81..8a52048d 100644 --- a/scripts/benchmark_table/benchmark_table.py +++ b/scripts/benchmark_table/benchmark_table.py @@ -48,7 +48,7 @@ def get_tables(image_path: Path) -> str: def make_predict_json(data_path: Path) -> dict: predict_json = {} for pathname in Path.iterdir(data_path): - print(pathname) # noqa + print(pathname) predict_json[pathname.name] = {"html": "" + get_tables(pathname) + ""} @@ -57,7 +57,7 @@ def make_predict_json(data_path: Path) -> dict: def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: if Path.exists(data_dir): - print(f"Use cached benchmark data from {data_dir}") # noqa + print(f"Use cached benchmark data from {data_dir}") return data_dir.mkdir(parents=True, exist_ok=True) @@ -68,7 +68,7 @@ def download_dataset(data_dir: Path, name_zip: str, url: str) -> None: zip_ref.extractall(data_dir) pdfs_zip_path.unlink() - print(f"Benchmark data downloaded to {data_dir}") # noqa + print(f"Benchmark data downloaded to {data_dir}") def prediction(path_pred: Path, 
path_images: Path) -> dict: diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index d306247d..ac0b90b5 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -119,7 +119,7 @@ def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> content=cell, visible=False if node.attrib.get("style") == "display: none" else True, *deque()) # noqa except Exception as ex: - print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") # noqa + print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") raise ex else: new_node = TableTree(node.tag, None, None, None, True, *deque()) diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index 50dec7ae..d959a1f4 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -24,7 +24,7 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para directory = os.path.join(path_base, tl_path) files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")] total_file_size = len(files_list) - print(f"Files: {files_list}\nFiles number: {total_file_size}") # noqa + print(f"Files: {files_list}\nFiles number: {total_file_size}") for file in tqdm(files_list): file_path = os.path.join(directory, file) r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) @@ -52,9 +52,9 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Benchmark data downloaded to {benchmark_data_dir}") # noqa + print(f"Benchmark data downloaded to {benchmark_data_dir}") else: - print(f"Use cached benchmark data from {benchmark_data_dir}") # noqa + print(f"Use cached benchmark data from {benchmark_data_dir}") assert os.path.isdir(benchmark_data_dir) @@ -74,4 +74,4 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print(f"Save result in {path_result}") # noqa + print(f"Save result in {path_result}") diff --git a/scripts/create_txtlayer_dataset.py b/scripts/create_txtlayer_dataset.py index a26c7995..ca2e196c 100644 --- a/scripts/create_txtlayer_dataset.py +++ b/scripts/create_txtlayer_dataset.py @@ -200,7 +200,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: os.makedirs(os.path.join(args.out_dir, args.incorrect_dir), exist_ok=True) i = args.start_number - print("Generating incorrect texts") # noqa + print("Generating incorrect texts") for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): text = "" @@ -211,7 +211,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: corruptor = random.choice(corruptor_list) text = corruptor.corrupt(text, lang=language) except Exception as e: - print(e) # noqa + print(e) text = "" with open(os.path.join(args.out_dir, args.incorrect_dir, f"{i:08d}_{language}.txt"), "w") as f: @@ -219,7 +219,7 @@ def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]: i += 1 i = args.start_number - print("Generating correct texts") # noqa + print("Generating correct texts") for _ in tqdm(range(args.dataset_size)): for language in ("ru", "en"): diff --git a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py index ce77a128..07895d0d 
100644 --- a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py +++ b/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py @@ -264,10 +264,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: - print(ex) # noqa - print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") # noqa + print(ex) + print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") - print(f"Time mean correction ocr = {np.array(correction_times).mean()}") # noqa + print(f"Time mean correction ocr = {np.array(correction_times).mean()}") table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) return table_common, table_accuracy_per_image @@ -283,9 +283,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") # noqa + print(f"Benchmark data downloaded to {benchmark_data_path}") else: - print(f"Use cached benchmark data from {benchmark_data_path}") # noqa + print(f"Use cached benchmark data from {benchmark_data_path}") assert os.path.isfile(benchmark_data_path) table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) @@ -302,7 +302,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c res_file.write("\n\nTable 3 -OCR error by symbol:\n") res_file.write(table_errors.draw()) - print(f"Tesseract version is {pytesseract.get_tesseract_version()}") # noqa - print(table_accuracy_per_image.draw()) # noqa - print(table_common.draw()) # noqa - print(table_errors.draw()) # noqa + print(f"Tesseract version is {pytesseract.get_tesseract_version()}") + print(table_accuracy_per_image.draw()) + print(table_common.draw()) + print(table_errors.draw()) diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py index 82160731..89fb87a1 100644 --- a/scripts/tesseract_benchmark/ocr_correction.py +++ b/scripts/tesseract_benchmark/ocr_correction.py @@ -35,7 +35,7 @@ def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) if torch.cuda.is_available() and USE_GPU: corrector.model.to(torch.device("cuda:0")) - print("use CUDA") # noqa + print("use CUDA") else: - print("use CPU") # noqa + print("use CPU") return corrector, corrected_path diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 9f65e590..9dde8702 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -37,7 +37,7 @@ def __extract_conf_annotation(self, anns_conf: List[dict], ann_bbox: dict, text: ) if DETAILED_DEBUG: - print(debug) # noqa + print(debug) return confs @@ -57,7 +57,7 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t } ) if DETAILED_DEBUG: - print(debug) # noqa + print(debug) return text_type diff --git a/scripts/train/train_acc_orientation_classifier.py 
b/scripts/train/train_acc_orientation_classifier.py index 497a5183..05f36083 100644 --- a/scripts/train/train_acc_orientation_classifier.py +++ b/scripts/train/train_acc_orientation_classifier.py @@ -45,7 +45,7 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri sample = dataiter.__next__() _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] - print(f"GroundTruth: orientation {orientation}, columns {columns}") # noqa + print(f"GroundTruth: orientation {orientation}, columns {columns}") calc_accuracy_by_classes(testloader, data_executor.classes, net_executor, batch_size=1) @@ -95,8 +95,8 @@ def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: ) for i in range(len(classes)): - print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") # noqa - print(f"=== AVG Time predict {time_predict / cnt_predict}") # noqa + print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") + print(f"=== AVG Time predict {time_predict / cnt_predict}") def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: ColumnsOrientationClassifier, epoch_cnt: int = 7, save_step: int = 500) -> None: @@ -131,7 +131,7 @@ def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: # print statistics if i % 100 == 99: - print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}") # noqa + print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}") running_loss = 0.0 # save checkpoint @@ -139,7 +139,7 @@ def train_model(trainloader: DataLoader, checkpoint_path_save: str, classifier: classifier.save_weights(checkpoint_path_save) classifier.save_weights(checkpoint_path_save) - print("Finished Training") # noqa + print("Finished Training") def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientationClassifier) -> None: @@ -157,8 +157,8 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat _, orientation, columns = sample["image"], sample["orientation"], sample["columns"] # print labels - print(" ".join(f"{data_executor.classes[orientation[j]]:5s}" for j in range(BATCH_SIZE))) # noqa - print(" ".join(f"{data_executor.classes[columns[j]]:5s}" for j in range(BATCH_SIZE))) # noqa + print(" ".join(f"{data_executor.classes[orientation[j]]:5s}" for j in range(BATCH_SIZE))) + print(" ".join(f"{data_executor.classes[columns[j]]:5s}" for j in range(BATCH_SIZE))) # Part 2 - train model train_model(trainloader, args.checkpoint_save, classifier) diff --git a/scripts/train/train_diploma_line_classifier.py b/scripts/train/train_diploma_line_classifier.py index 34643c3a..71a4c900 100644 --- a/scripts/train/train_diploma_line_classifier.py +++ b/scripts/train/train_diploma_line_classifier.py @@ -50,4 +50,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False, no_cache=False) -print("successfully train diploma classifier") # noqa +print("successfully train diploma classifier") diff --git a/scripts/train/train_law_line_classifier.py b/scripts/train/train_law_line_classifier.py index 93780d36..7b6dd416 100644 --- a/scripts/train/train_law_line_classifier.py +++ b/scripts/train/train_law_line_classifier.py @@ -73,4 +73,4 @@ def get_sample_weight(line: LineWithLabel) -> int: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") # noqa 
+print("successfully train law classifier") diff --git a/scripts/train/train_mle_language_model.py b/scripts/train/train_mle_language_model.py index 41c9882c..8488f41a 100644 --- a/scripts/train/train_mle_language_model.py +++ b/scripts/train/train_mle_language_model.py @@ -39,22 +39,22 @@ def main() -> None: documents = [] for files in files_path_big_data: file = os.listdir(path_big_data + files) - print(files) # noqa + print(files) for writer in file: try: with open(path_big_data + files + "/" + writer) as f: text = f.read() document = tokenize_doc(text) documents.append(document) - print(writer) # noqa + print(writer) break except Exception: - print(Exception) # noqa + print(Exception) pass documents = " ".join(documents) bigram_list = create_ngramm_list(documents, 2) train, vocab = padded_everygram_pipeline(2, [bigram_list]) - print(bigram_list) # noqa + print(bigram_list) language_model_mle = MLE(2) language_model_mle.fit(train, vocab) with open("n-gram_lang_model.pkl", "wb") as f: diff --git a/scripts/train/train_nn_line_classifier_law.py b/scripts/train/train_nn_line_classifier_law.py index 84893653..2aa6e0d5 100644 --- a/scripts/train/train_nn_line_classifier_law.py +++ b/scripts/train/train_nn_line_classifier_law.py @@ -71,4 +71,4 @@ def get_sample_weight(line: LineWithLabel) -> float: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train law classifier") # noqa +print("successfully train law classifier") diff --git a/scripts/train/train_paragraph_classifier.py b/scripts/train/train_paragraph_classifier.py index 658e5ff3..0f2a6ba5 100644 --- a/scripts/train/train_paragraph_classifier.py +++ b/scripts/train/train_paragraph_classifier.py @@ -46,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print(f"successfully train {classifier_name} classifier") # noqa +print(f"successfully train {classifier_name} classifier") diff --git a/scripts/train/train_txtlayer_classifier.py b/scripts/train/train_txtlayer_classifier.py index 12d6cdba..ed3deaa4 100644 --- a/scripts/train/train_txtlayer_classifier.py +++ b/scripts/train/train_txtlayer_classifier.py @@ -45,11 +45,11 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: with open(path, mode="r") as f: text = f.read() except Exception as e: - print(f"Bad file {str(e)}: {path}") # noqa + print(f"Bad file {str(e)}: {path}") continue if len(text.strip()) == 0: - print(f"Empty file: {path}") # noqa + print(f"Empty file: {path}") continue texts.append(text) @@ -69,9 +69,9 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(data_dir) os.remove(path_out) - print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") # noqa + print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}") else: - print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}") # noqa + print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}") assert os.path.isdir(txtlayer_classifier_dataset_dir) @@ -89,7 +89,7 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: test_preds = clf.predict(stages_data["test"]["features"]) score = f1_score(stages_data["test"]["labels"], test_preds) - print(f"F1 score = {score}") # noqa + print(f"F1 score = {score}") resources_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "..", "resources") with gzip.open(os.path.join(resources_dir, "txtlayer_classifier.pkl.gz"), "wb") as file: diff 
--git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py index e4677c2e..da62ca7b 100644 --- a/scripts/train/train_tz_line_classifier.py +++ b/scripts/train/train_tz_line_classifier.py @@ -46,4 +46,4 @@ def skip_labels(label: str) -> Optional[str]: ) trainer.fit(cross_val_only=False, save_errors_images=False) -print("successfully train tz classifier") # noqa +print("successfully train tz classifier") diff --git a/scripts/train/trainers/base_sklearn_line_classifier.py b/scripts/train/trainers/base_sklearn_line_classifier.py index b807d6a0..9ed905c8 100644 --- a/scripts/train/trainers/base_sklearn_line_classifier.py +++ b/scripts/train/trainers/base_sklearn_line_classifier.py @@ -97,7 +97,7 @@ def fit(self, no_cache: bool = False, cross_val_only: bool = False, save: bool = predicted = cls.predict(features_test) accuracy = accuracy_score(labels_test, predicted, sample_weight=sample_weight[-n:]) - print("Final Accuracy = {}".format(accuracy)) # noqa + print(f"Final Accuracy = {accuracy}") scores["final_accuracy"] = accuracy if not os.path.isdir(os.path.dirname(self.path_out)): diff --git a/scripts/train/trainers/data_loader.py b/scripts/train/trainers/data_loader.py index e16e37cb..35620522 100644 --- a/scripts/train/trainers/data_loader.py +++ b/scripts/train/trainers/data_loader.py @@ -36,7 +36,7 @@ def get_data(self, no_cache: bool = False) -> List[List[LineWithLabel]]: if os.path.isfile(pkl_path) and not no_cache: with gzip.open(pkl_path) as input_file: result = pickle.load(input_file) - print("func get_data(): Data were loaded from the local disk") # noqa + print("func get_data(): Data were loaded from the local disk") return self.__sort_data(result) os.makedirs(self.dataset_dir, exist_ok=True) path_out = os.path.join(self.dataset_dir, "dataset.zip") diff --git a/scripts/train/trainers/errors_saver.py b/scripts/train/trainers/errors_saver.py index 0ba53cc0..98ee7269 100644 --- a/scripts/train/trainers/errors_saver.py +++ b/scripts/train/trainers/errors_saver.py @@ -32,10 +32,10 @@ def save_errors(self, error_cnt: Counter, errors_uids: List[str], csv_path: str, assert len(set(errors_uids)) == len(errors_uids) self.logger.info(f"save errors in {self.errors_path}") errors_total_num = sum(error_cnt.values()) - print(f"{'true':16s} -> {'predicted':16s} {'cnt':6s} {'(percent)':16s}") # noqa + print(f"{'true':16s} -> {'predicted':16s} {'cnt':6s} {'(percent)':16s}") for error, cnt in error_cnt.most_common(): y_true, y_pred = error - print(f"{y_true:16s} -> {y_pred:16s} {cnt:06,} ({100 * cnt / errors_total_num:02.2f}%)") # noqa + print(f"{y_true:16s} -> {y_pred:16s} {cnt:06,} ({100 * cnt / errors_total_num:02.2f}%)") if save_errors_images: self.__save_images(errors_uids, csv_path) diff --git a/scripts/train/trainers/line_lstm_classifier_trainer.py b/scripts/train/trainers/line_lstm_classifier_trainer.py index 5f9a9225..e3b49829 100644 --- a/scripts/train/trainers/line_lstm_classifier_trainer.py +++ b/scripts/train/trainers/line_lstm_classifier_trainer.py @@ -151,10 +151,10 @@ def __init__(self, self.class_dict = class_dict self.num_classes = len(class_dict) if torch.cuda.is_available() and on_gpu: - print("Device is cuda") # noqa + print("Device is cuda") self.device = torch.device("cuda:0") else: - print("Device is cpu") # noqa + print("Device is cpu") self.device = torch.device("cpu") def __get_labels(self, data: List[List[LineWithLabel]]) -> List[str]: @@ -189,14 +189,14 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti 
time_epoch = 0.0 for epoch in range(self.num_epochs): - print("\n\t Epoch: {}".format(epoch)) # noqa + print(f"\n\t Epoch: {epoch}") # The Dataloader class handles all the shuffles for you loader_iter = iter(LineEpsDataSet(features_train, labels_train, self.class_dict)) time_begin = time.time() train_loss, train_acc = self.train(lstm_model, loader_iter, len(labels_train), optimizer, criteria, batch_size=self.batch_size) time_epoch += time.time() - time_begin - print(f"\n\t \x1b\33[33mTrain: epoch: {epoch}| Train loss: {train_loss} | Train acc: {train_acc}\x1b[0m") # noqa + print(f"\n\t \x1b\33[33mTrain: epoch: {epoch}| Train loss: {train_loss} | Train acc: {train_acc}\x1b[0m") if file_log: file_log.write(f"\t Train: epoch: {epoch}| Train loss: {epoch} | Train acc: {train_loss}\n") @@ -204,7 +204,7 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti if with_eval: loader_iter = iter(LineEpsDataSet(features_test, labels_test, self.class_dict)) test_loss, test_acc = self.evaluate(lstm_model, loader_iter, len(labels_test), criteria, batch_size=self.batch_size) - print(f"\n\t \x1b\33[92mEvaluation: Test loss: {test_loss} | Test acc: {test_acc}\x1b[0m") # noqa + print(f"\n\t \x1b\33[92mEvaluation: Test loss: {test_loss} | Test acc: {test_acc}\x1b[0m") if file_log: file_log.write(f"\t Eval: epoch: {epoch}| Test loss: {test_loss} | Test acc: {test_acc}\n") curr_loss = test_loss @@ -219,7 +219,7 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti if with_save and curr_loss < best_loss: best_loss = curr_loss torch.save(lstm_model.state_dict(), self.path_out) - print(f"Model has been saved into {self.path_out}") # noqa + print(f"Model has been saved into {self.path_out}") return res_loss / self.num_epochs, res_acc / self.num_epochs, time_epoch / self.num_epochs @@ -237,7 +237,7 @@ def fit(self, with_cross_val: bool = True) -> None: data = np.array(data, dtype=object) if with_cross_val: - print("\n\x1b\33[95m---------Evaluation process (cross-validation) starts-------\x1b[0m\n") # noqa + print("\n\x1b\33[95m---------Evaluation process (cross-validation) starts-------\x1b[0m\n") kf = KFold(n_splits=self.n_splits) scores = [] epoch_time = [] @@ -267,7 +267,7 @@ def fit(self, with_cross_val: bool = True) -> None: scores_dict["scores"] = scores logfile_kfold_tmp.close() - print("\n\x1b\33[95m-------------------Train process starts------------------\x1b[0m\n") # noqa + print("\n\x1b\33[95m-------------------Train process starts------------------\x1b[0m\n") features_train, labels_train = self.get_features(data) lstm_model = LSTM(input_dim=features_train.shape[1], hidden_dim=features_train.shape[1], hidden_dim_2=lstm_hidden_dim, num_classes=self.num_classes, lstm_layers=lstm_layers, @@ -278,7 +278,7 @@ def fit(self, with_cross_val: bool = True) -> None: labels_test=None, file_log=None, with_save=True, with_eval=False) - print("\x1b\33[92mFinal Accuracy from training = {}\x1b[0m".format(acc)) # noqa + print(f"\x1b\33[92mFinal Accuracy from training = {acc}\x1b[0m") scores_dict["final_accuracy"] = acc if self.path_scores is not None: @@ -338,7 +338,7 @@ def train(self, model: nn.Module, iterator: Iterator, cnt_data: int, optimizer: epoch_acc += accuracy cnt += 1 if log_per_cnt != 0 and batch_num % log_per_cnt == 0: - print(f"\t\tbatch_num: {batch_num}, loss={epoch_loss / cnt}, acc={epoch_acc / cnt}") # noqa + print(f"\t\tbatch_num: {batch_num}, loss={epoch_loss / cnt}, acc={epoch_acc / cnt}") return epoch_loss / cnt, epoch_acc / cnt From 
83edd01ec7d02c24852bf72799b78e7b35b5a2ee Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 1 Feb 2024 14:26:37 +0300 Subject: [PATCH 3/3] Review fix --- scripts/benchmark_table/metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmark_table/metric.py b/scripts/benchmark_table/metric.py index ac0b90b5..28c46b5b 100644 --- a/scripts/benchmark_table/metric.py +++ b/scripts/benchmark_table/metric.py @@ -117,7 +117,7 @@ def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> colspan=self.get_span(node, "colspan"), rowspan=self.get_span(node, "rowspan"), content=cell, - visible=False if node.attrib.get("style") == "display: none" else True, *deque()) # noqa + visible=node.attrib.get("style") != "display: none", *deque()) # noqa except Exception as ex: print(f"Bad html file. HTML parse exception. Exception's msg: {ex}") raise ex