Skip to content

Commit 6739ae0

Browse files
committed
Add language tool corrector
1 parent d29f21d commit 6739ae0

File tree

2 files changed

+28
-3
lines changed

2 files changed

+28
-3
lines changed

dedoc/scripts/calc_tesseract_benchmarks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,14 @@
1111
from texttable import Texttable
1212

1313
from dedoc.config import get_config
14+
from dedoc.scripts.language_tool_correction import LanguageToolCorrector
1415
from dedoc.scripts.ocr_correction import init_correction_step, correction
1516

16-
USE_CORRECTION_OCR = False
17+
# Correction modes that USE_CORRECTION_OCR is compared against in __calculate_ocr_reports.
# WITHOUT_CORRECTION: no corrector is created; accuracy is calculated on the raw OCR text file.
WITHOUT_CORRECTION = 0
18+
# SAGE_CORRECTION: corrector comes from init_correction_step() and text is fixed with correction()
# (both imported from dedoc.scripts.ocr_correction).
SAGE_CORRECTION = 1
19+
# LANGUAGE_TOOL_CORRECTION: corrector is a LanguageToolCorrector and text is fixed with its correct() method.
LANGUAGE_TOOL_CORRECTION = 2
20+
21+
# Correction mode used by the benchmark script; any value other than the two correction
# modes above falls through to the uncorrected branch.
USE_CORRECTION_OCR = LANGUAGE_TOOL_CORRECTION
1722

1823

1924
def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str:
@@ -188,8 +193,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c
188193
os.makedirs(result_dir, exist_ok=True)
189194

190195
corrector, corrected_path = None, None
191-
if USE_CORRECTION_OCR:
196+
if USE_CORRECTION_OCR == SAGE_CORRECTION:
192197
corrector, corrected_path = init_correction_step(cache_dir)
198+
elif USE_CORRECTION_OCR == LANGUAGE_TOOL_CORRECTION:
199+
corrector = LanguageToolCorrector()
193200

194201
with zipfile.ZipFile(benchmark_data_path, "r") as arch_file:
195202
names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0]
@@ -235,14 +242,23 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c
235242

236243
# call correction step
237244
time_b = time.time()
238-
if USE_CORRECTION_OCR:
245+
if USE_CORRECTION_OCR == SAGE_CORRECTION:
239246
tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt")
240247
corrected_text = correction(corrector, text)
241248
correction_times.append(time.time() - time_b)
242249
with open(tmp_corrected_path, "w") as tmp_corrected_file:
243250
tmp_corrected_file.write(corrected_text)
244251
tmp_corrected_file.close()
245252

253+
calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path)
254+
elif USE_CORRECTION_OCR == LANGUAGE_TOOL_CORRECTION:
255+
tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt")
256+
corrected_text = corrector.correct(text)
257+
correction_times.append(time.time() - time_b)
258+
with open(tmp_corrected_path, "w") as tmp_corrected_file:
259+
tmp_corrected_file.write(corrected_text)
260+
tmp_corrected_file.close()
261+
246262
calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path)
247263
else:
248264
calculate_accuracy_script(tmp_gt_path, tmp_ocr_path, accuracy_path)
dedoc/scripts/language_tool_correction.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import language_tool_python
2+
3+
4+
class LanguageToolCorrector:
    """Text corrector that delegates to ``language_tool_python.LanguageToolPublicAPI``.

    The wrapped client is stored in ``self.tool`` and is created once per
    corrector instance, so a single corrector can be reused for many texts.
    """

    def __init__(self, language=None):
        """Create the underlying LanguageTool client.

        :param language: optional language code (for example ``"en-US"`` or
            ``"ru-RU"``) forwarded to ``LanguageToolPublicAPI``. When omitted,
            the client is constructed without arguments, exactly as before,
            and the library chooses the language itself.
        """
        if language is None:
            self.tool = language_tool_python.LanguageToolPublicAPI()
        else:
            self.tool = language_tool_python.LanguageToolPublicAPI(language)

    def correct(self, text: str) -> str:
        """Return ``text`` with the tool's suggested corrections applied.

        :param text: text to correct (e.g. raw OCR output).
        :return: corrected text; empty input is returned unchanged.
        """
        # Nothing to correct in an empty OCR result: skip the call to the tool.
        if not text:
            return text
        return self.tool.correct(text)

0 commit comments

Comments
 (0)