diff --git a/dedoc/scripts/accsum b/dedoc/scripts/accsum new file mode 100755 index 00000000..1efd2a47 Binary files /dev/null and b/dedoc/scripts/accsum differ diff --git a/dedoc/scripts/calc_tesseract_benchmarks.py b/dedoc/scripts/calc_tesseract_benchmarks.py index 69f569c7..47d58a8e 100644 --- a/dedoc/scripts/calc_tesseract_benchmarks.py +++ b/dedoc/scripts/calc_tesseract_benchmarks.py @@ -2,7 +2,7 @@ import re import zipfile from tempfile import TemporaryDirectory -from typing import Dict, List +from typing import Dict, List, Tuple import cv2 import numpy as np @@ -79,24 +79,99 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: _get_avg(statistics[dataset]["Accuracy"])] -if __name__ == "__main__": - base_zip = "data_tesseract_benchmarks" - output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) - cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") - os.makedirs(cache_dir, exist_ok=True) - benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") +def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: + symbols_info = [] + matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] + start_block_line = matched_symbols[0] - if not os.path.isfile(benchmark_data_path): - wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") - else: - print(f"Use cached benchmark data from {benchmark_data_path}") - assert os.path.isfile(benchmark_data_path) + for line in lines[start_block_line + 1:]: + # example line: "1187 11 99.07 {<\n>}" + row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] + row_values[-1] = row_values[-1][1:-1] # get symbol value + symbols_info.append(row_values) + # Sort errors + symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), reverse=True) # by missed + + return symbols_info, start_block_line + + +def __parse_ocr_errors(lines: List[str]) -> List: + ocr_errors = [] + matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] + for num, line in enumerate(lines[matched_errors[0] + 1:]): + # example line: " 2 0 { 6}-{б}" + errors = re.findall(r"(\d+)", line)[0] + chars = re.findall(r"{(.*)}-{(.*)}", line)[0] + ocr_errors.append([errors, chars[0], chars[1]]) + + return ocr_errors + +def __get_summary_symbol_error(path_reports: str) -> Texttable: + # 1 - call accsum for get summary of all reports + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accsum")) + + if os.path.exists(f"{path_reports}/../accsum_report.txt"): + os.remove(f"{path_reports}/../accsum_report.txt") + + file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) + + command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" + os.system(command) + accsum_report_path = os.path.join(path_reports, "../accsum_report.txt") + + # 2 - parse report info + with open(accsum_report_path, "r") as f: + lines = f.readlines() + + symbols_info, start_symbol_block_line = __parse_symbol_info(lines) + ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1]) + + # 3 - calculate ocr errors according to a symbol + ocr_errors_by_symbol = {} + for symbol_info in symbols_info: + ocr_errors_by_symbol[symbol_info[-1]] = [] + for ocr_err in ocr_errors: + if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text + continue + if symbol_info[-1] in ocr_err[-2]: + ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") + + # 4 - create table with OCR errors + ocr_err_by_symbol_table = Texttable() + title = [["Symbol", "Cnt Errors & Correct-Generated"]] + ocr_err_by_symbol_table.add_rows(title) + for symbol, value in ocr_errors_by_symbol.items(): + if len(value) != 0: + ocr_err_by_symbol_table.add_row([symbol, value]) + + return ocr_err_by_symbol_table + + +def __create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]] + + table_accuracy_per_image = Texttable() + accs.extend(accuracy_values) + table_accuracy_per_image.add_rows(accs) + + # calculating average accuracy for each data set + table_common = Texttable() + + for dataset_name in sorted(statistics.keys()): + row = [dataset_name] + row.extend(_get_avg_by_dataset(statistics, dataset_name)) + accs_common.append(row) + table_common.add_rows(accs_common) + + return table_common, table_accuracy_per_image + + +def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str) -> Tuple[Texttable, Texttable]: statistics = {} + accuracy_values = [] with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -115,7 +190,7 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: gt_path = os.path.join(base_zip, dataset_name, "gts", f"{base_name}.txt") imgs_path = os.path.join(base_zip, dataset_name, "imgs", img_name) - accuracy_path = os.path.join(cache_dir, f"{dataset_name}_{base_name}_accuracy.txt") + accuracy_path = os.path.join(cache_dir_accuracy, f"{dataset_name}_{base_name}_accuracy.txt") with TemporaryDirectory() as tmpdir: tmp_gt_path = os.path.join(tmpdir, "tmp_gt.txt") @@ -145,30 +220,45 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: os.system(command) statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) - accs.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) + accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: print(ex) print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") - table_aacuracy_per_image = Texttable() - table_aacuracy_per_image.add_rows(accs) + table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) + return table_common, table_accuracy_per_image - # calculating average accuracy for each data set - table_common = Texttable() - for dataset_name in sorted(statistics.keys()): - row = [dataset_name] - row.extend(_get_avg_by_dataset(statistics, dataset_name)) - accs_common.append(row) - table_common.add_rows(accs_common) +if __name__ == "__main__": + base_zip = "data_tesseract_benchmarks" + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) + cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") + os.makedirs(cache_dir, exist_ok=True) + cache_dir_accuracy = os.path.join(cache_dir, "accuracy") + os.makedirs(cache_dir_accuracy, exist_ok=True) + + benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") + if not os.path.isfile(benchmark_data_path): + wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) + print(f"Benchmark data downloaded to {benchmark_data_path}") + else: + print(f"Use cached benchmark data from {benchmark_data_path}") + assert os.path.isfile(benchmark_data_path) + + table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path) + + table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy) with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\nTable 1 - Accuracy for each file\n") - res_file.write(table_aacuracy_per_image.draw()) + res_file.write(table_accuracy_per_image.draw()) res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) + res_file.write(f"\n\nTable 3 -OCR error by symbol:\n") + res_file.write(table_errors.draw()) print(f"Tesseract version is {pytesseract.get_tesseract_version()}") - print(table_aacuracy_per_image.draw()) + print(table_accuracy_per_image.draw()) print(table_common.draw()) + print(table_errors.draw()) diff --git a/dedoc/train_dataset/trainer/errors_saver.py b/dedoc/train_dataset/trainer/errors_saver.py index ae7fd26e..1d591a96 100644 --- a/dedoc/train_dataset/trainer/errors_saver.py +++ b/dedoc/train_dataset/trainer/errors_saver.py @@ -46,7 +46,7 @@ def save_errors(self, error_cnt: Counter, errors_uids: List[str], csv_path: str, with open(path_file) as file: lines = file.readlines() lines_cnt = Counter(lines) - lines.sort(key=lambda l: (-lines_cnt[l], l)) + lines.sort(key=lambda value: (-lines_cnt[value], value)) path_out = os.path.join(self.errors_path, f"{int(1000 * len(lines) / errors_total_num):04d}_{file_name}") with open(path_out, "w") as file_out: diff --git a/resources/benchmarks/tesseract_benchmark.txt b/resources/benchmarks/tesseract_benchmark.txt index 6a59d51a..fd980a45 100644 --- a/resources/benchmarks/tesseract_benchmark.txt +++ b/resources/benchmarks/tesseract_benchmark.txt @@ -1,4 +1,5 @@ Tesseract version is 5.0.0 +Table 1 - Accuracy for each file +---------------+---------------------+-------+-----------------+--------------+ | Dataset | Image name | --psm | Amount of words | Accuracy OCR | +===============+=====================+=======+=================+==============+ @@ -18,7 +19,7 @@ Tesseract version is 5.0.0 | others | Zaklyuchenie_nevrol | 4 | 241 | 88.800 | | | oga_01 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 85.500 | +| others | napalm_doc_2_2_6 | 4 | 124 | 86.100 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | 1.620e+14 | 4 | 695 | 99.800 | +---------------+---------------------+-------+-----------------+--------------+ @@ -74,6 +75,8 @@ Tesseract version is 5.0.0 +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ТЗ_09 | 4 | 154 | 97.500 | +---------------+---------------------+-------+-----------------+--------------+ + +Table 2 - AVG by each type of symbols: +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | | t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | @@ -84,9 +87,170 @@ Tesseract version is 5.0.0 | h- | | | | | | | | 0 | | words | | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.967 | 79.867 | 89.533 | 0 | 0 | 86.133 | 890 | 86.03 | +| others | 90.967 | 77.400 | 89.533 | 0 | 0 | 86.433 | 890 | 86.23 | | | | | | | | | | 3 | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | tz-npa | 99.268 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.39 | | | | | | | | | | 6 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ \ No newline at end of file ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['3 & -> ', '2 & < 6> -> <б>', '2 & < > -> <__>', "2 & | +| | <1 > -> <'>", '2 & <и > -> <н>'] | ++--------+---------------------------------------------------------------------+ +| . | ['5 & <.> -> <,>', '3 & <3.> -> < De>', '3 & -> ', '2 & | +| | <6.> -> ', '2 & <г.> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| , | ['66 & <,> -> <.>', '3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['6 & <1> -> <|>', '4 & <1С> -> ', "3 & <1> -> <'>", '3 & <№1> | +| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | +| | ', '2 & <1C> -> <С>', '2 & <1> -> ', '1 & <1> -> <Г>', '1 & | +| | <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| е | ['6 & <е> -> <с>', '2 & <не> -> ', '2 & <ре> -> <с>', '1 & <е> | +| | -> <а>'] | ++--------+---------------------------------------------------------------------+ +| н | ['2 & <н> -> <и>', '2 & <не> -> ', '1 & <н> -> <й>', '1 & <н> | +| | -> <п>'] | ++--------+---------------------------------------------------------------------+ +| и | ['3 & <ти> -> < TH>', '3 & <тип> -> ', '2 & <и > -> <н>', '2 & | +| | <ис> -> <не>'] | ++--------+---------------------------------------------------------------------+ +| а | ['3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| о | ['2 & <то> -> ', '1 & <о> -> <0>'] | ++--------+---------------------------------------------------------------------+ +| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ти> -> < TH>', '3 & | +| | <тип> -> ', '2 & <то> -> '] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> ', '2 & <28> -> <ИР>', '2 & <28> -> <Я >'] | ++--------+---------------------------------------------------------------------+ +| л | ['2 & <л> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '4 & <ОС> -> ', '3 & | +| | <С> -> ', '2 & <СА> -> ', '1 & <С> -> <—>'] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.> -> < De>', '1 & <3> -> '] | ++--------+---------------------------------------------------------------------+ +| г | ['2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & <г> -> <т>', '2 & <гр> | +| | -> ', '2 & <гр> -> <тв>'] | ++--------+---------------------------------------------------------------------+ +| N | ['22 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <ва,> -> <нь>', '1 & <в> -> <В>', '1 & <в> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| р | ['2 & <гр> -> ', '2 & <гр> -> <тв>', '2 & <ре> -> <с>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['6 & <Н> -> <* П>', '6 & <Н> -> <° >', '3 & <Н> -> <¢ П>', '2 & | +| | <ЕН> -> <ек>', '2 & <Н> -> <. >', '2 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| с | ['2 & <ис> -> <не>', '1 & <с> -> ', '1 & <с> -> <©>', '1 & <с> | +| | -> <е>'] | ++--------+---------------------------------------------------------------------+ +| А | ['2 & <СА> -> '] | ++--------+---------------------------------------------------------------------+ +| И | ['3 & <И> -> ', '1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> | +| | -> <П>'] | ++--------+---------------------------------------------------------------------+ +| д | ['3 & <д> -> <л>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕН> -> <ек>'] | ++--------+---------------------------------------------------------------------+ +| О | ['4 & <ОС> -> ', '2 & <ВО> -> <Ю>', '2 & <Об> -> <06>', '1 & | +| | <О> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| П | ['1 & <П> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['4 & <Т> -> <Г>', '3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| п | ['3 & <тип> -> ', '2 & <п> -> <и>', '2 & <п> -> <н>'] | ++--------+---------------------------------------------------------------------+ +| В | ['6 & <СЗВ> -> ', '2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| - | ['3 & <-> -> <=>', '1 & <-> -> <|>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['3 & <608> -> ', '2 & < 6> -> <б>', '2 & <6.> -> '] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> ', '3 & -> <Ш>', '3 & -> <УП>', '1 | +| | & -> <|>'] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МРТ> -> '] | ++--------+---------------------------------------------------------------------+ +| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <Об> -> <06>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <#2>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['8 & <;> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| ь | ['2 & <ь> -> < Ь>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | +| | <28> -> <Я >'] | ++--------+---------------------------------------------------------------------+ +| E | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | +| | '2 & <ВЗ> -> <Ръ>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <#2>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['1 & <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['1 & <ч> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['3 & <БЗ> -> <653>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| й | ['1 & <й> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| P | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| R | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '1 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| m | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| № | ['3 & <№1> -> ', '3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1О>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> < >'] | ++--------+---------------------------------------------------------------------+ +| c | ['1 & -> <с>'] | ++--------+---------------------------------------------------------------------+ +| d | ['1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| o | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| y | ['1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| щ | ['1 & <щ> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & <‚> -> <_,>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index 72128afc..8b81cf93 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -41,7 +41,7 @@ def test_dict_with_list(self) -> None: result = self._send_request(file_name)["content"]["structure"] first_list_items = result["subparagraphs"][0]["subparagraphs"][0]["subparagraphs"] second_list_items = result["subparagraphs"][1]["subparagraphs"][0]["subparagraphs"] - first_list_items, second_list_items = sorted([first_list_items, second_list_items], key=lambda l: -len(l)) + first_list_items, second_list_items = sorted([first_list_items, second_list_items], key=lambda value: -len(value)) nodes = result["subparagraphs"][1]["subparagraphs"] self.assertEqual("list", nodes[0]["metadata"]["paragraph_type"])