Skip to content

Commit

Permalink
TLDR-538 added low quality imgs; fixed after review
Browse files: browse the repository at this point in the history
  • Loading branch information
oksidgy committed Dec 25, 2023
1 parent aa5f2c7 commit 3a2dca8
Show file tree
Hide file tree
Showing 4 changed files with 549 additions and 72 deletions.
12 changes: 6 additions & 6 deletions dedoc/scripts/calc_tesseract_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
from dedoc.scripts.language_tool_correction import LanguageToolCorrector
from dedoc.scripts.ocr_correction import init_correction_step, correction

WITHOUT_CORRECTION = 0
SAGE_CORRECTION = 1
LANGUAGE_TOOL_CORRECTION = 2
WITHOUT_CORRECTION = ""
SAGE_CORRECTION = "_sage-correction"
LANGUAGE_TOOL_CORRECTION = "_languagetool-correction"

USE_CORRECTION_OCR = LANGUAGE_TOOL_CORRECTION
USE_CORRECTION_OCR = SAGE_CORRECTION


def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str:
Expand Down Expand Up @@ -287,7 +287,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c

benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip")
if not os.path.isfile(benchmark_data_path):
wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path)
wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT", benchmark_data_path)
print(f"Benchmark data downloaded to {benchmark_data_path}")
else:
print(f"Use cached benchmark data from {benchmark_data_path}")
Expand All @@ -297,7 +297,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c

table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy)

with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file:
with open(os.path.join(output_dir, f"tesseract_benchmark{USE_CORRECTION_OCR}.txt"), "w") as res_file:
res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\n")
res_file.write(f"Correction step: {USE_CORRECTION_OCR}\n")
res_file.write(f"\nTable 1 - Accuracy for each file\n")
Expand Down
4 changes: 4 additions & 0 deletions dedoc/scripts/language_tool_correction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import language_tool_python

"""
pip install language-tool-python==2.7.1
"""


class LanguageToolCorrector:
def __init__(self):
Expand Down
Loading

0 comments on commit 3a2dca8

Please sign in to comment.