|
11 | 11 | from texttable import Texttable
|
12 | 12 |
|
13 | 13 | from dedoc.config import get_config
|
| 14 | +from dedoc.scripts.language_tool_correction import LanguageToolCorrector |
14 | 15 | from dedoc.scripts.ocr_correction import init_correction_step, correction
|
15 | 16 |
|
16 |
| -USE_CORRECTION_OCR = False |
| 17 | +WITHOUT_CORRECTION = 0 |
| 18 | +SAGE_CORRECTION = 1 |
| 19 | +LANGUAGE_TOOL_CORRECTION = 2 |
| 20 | + |
| 21 | +USE_CORRECTION_OCR = LANGUAGE_TOOL_CORRECTION |
17 | 22 |
|
18 | 23 |
|
19 | 24 | def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str:
|
@@ -188,8 +193,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c
|
188 | 193 | os.makedirs(result_dir, exist_ok=True)
|
189 | 194 |
|
190 | 195 | corrector, corrected_path = None, None
|
191 |
| - if USE_CORRECTION_OCR: |
| 196 | + if USE_CORRECTION_OCR == SAGE_CORRECTION: |
192 | 197 | corrector, corrected_path = init_correction_step(cache_dir)
|
| 198 | + elif USE_CORRECTION_OCR == LANGUAGE_TOOL_CORRECTION: |
| 199 | + corrector = LanguageToolCorrector() |
193 | 200 |
|
194 | 201 | with zipfile.ZipFile(benchmark_data_path, "r") as arch_file:
|
195 | 202 | names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0]
|
@@ -235,14 +242,23 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c
|
235 | 242 |
|
236 | 243 | # call correction step
|
237 | 244 | time_b = time.time()
|
238 |
| - if USE_CORRECTION_OCR: |
| 245 | + if USE_CORRECTION_OCR == SAGE_CORRECTION: |
239 | 246 | tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt")
|
240 | 247 | corrected_text = correction(corrector, text)
|
241 | 248 | correction_times.append(time.time() - time_b)
|
242 | 249 | with open(tmp_corrected_path, "w") as tmp_corrected_file:
|
243 | 250 | tmp_corrected_file.write(corrected_text)
|
244 | 251 | tmp_corrected_file.close()
|
245 | 252 |
|
| 253 | + calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path) |
| 254 | + elif USE_CORRECTION_OCR == LANGUAGE_TOOL_CORRECTION: |
| 255 | + tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt") |
| 256 | + corrected_text = corrector.correct(text) |
| 257 | + correction_times.append(time.time() - time_b) |
| 258 | + with open(tmp_corrected_path, "w") as tmp_corrected_file: |
| 259 | + tmp_corrected_file.write(corrected_text) |
| 260 | + tmp_corrected_file.close() |
| 261 | + |
246 | 262 | calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path)
|
247 | 263 | else:
|
248 | 264 | calculate_accuracy_script(tmp_gt_path, tmp_ocr_path, accuracy_path)
|
|
0 commit comments