From 1403898b23b8fe942cd49d6e14e1253c466f1485 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Fri, 15 Dec 2023 14:08:46 +0300 Subject: [PATCH] TLDR-538 ocr correction scripts --- dedoc/scripts/language_tool_correction.py | 0 dedoc/scripts/text_blob_correction.py | 9 + requirements.txt | 3 +- .../tesseract_benchmark_sage-correction.txt | 359 ++++++++++++++++++ .../tesseract_benchmark_with_correction.txt | 259 +++++++++++++ scripts/calc_tesseract_benchmarks.py | 130 +++++-- scripts/language_tool_correction.py | 0 scripts/ocr_correction.py | 43 +++ scripts/text_blob_correction.py | 9 + 9 files changed, 771 insertions(+), 41 deletions(-) create mode 100644 dedoc/scripts/language_tool_correction.py create mode 100644 dedoc/scripts/text_blob_correction.py create mode 100644 resources/benchmarks/tesseract_benchmark_sage-correction.txt create mode 100644 resources/benchmarks/tesseract_benchmark_with_correction.txt create mode 100644 scripts/language_tool_correction.py create mode 100644 scripts/ocr_correction.py create mode 100644 scripts/text_blob_correction.py diff --git a/dedoc/scripts/language_tool_correction.py b/dedoc/scripts/language_tool_correction.py new file mode 100644 index 00000000..e69de29b diff --git a/dedoc/scripts/text_blob_correction.py b/dedoc/scripts/text_blob_correction.py new file mode 100644 index 00000000..8ecf8be6 --- /dev/null +++ b/dedoc/scripts/text_blob_correction.py @@ -0,0 +1,9 @@ +from textblob import TextBlob + + +class TextBlobCorrector: + def __init__(self): + return + + def correct(self, text: str) -> str: + return str(TextBlob(text).correct()) diff --git a/requirements.txt b/requirements.txt index 1ddb72e8..41f0c60c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,5 @@ uvicorn>=0.18.0,<=0.23.2 wget==3.2 xgbfir==0.3.1 xgboost>=1.1.1,<1.2.0 -xlrd==1.2.0 \ No newline at end of file +xlrd==1.2.0 +textblob==0.17.1 \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_sage-correction.txt b/resources/benchmarks/tesseract_benchmark_sage-correction.txt new file mode 100644 index 00000000..f75ea71e --- /dev/null +++ b/resources/benchmarks/tesseract_benchmark_sage-correction.txt @@ -0,0 +1,359 @@ +Tesseract version is 5.0.0 +Correction step: _sage-correction + +Table 1 - Accuracy for each file ++---------------+---------------------+-------+-----------------+--------------+ +| Dataset | Image name | --psm | Amount of words | Accuracy OCR | ++===============+=====================+=======+=================+==============+ +| english-words | Kaspersky | 6 | 111 | 99.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | USB | 6 | 4 | 80.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | words1 | 6 | 19 | 100 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | words2 | 6 | 9 | 100 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | words3 | 6 | 9 | 100 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 0 | 4 | 315 | 94.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 1 | 4 | 308 | 94.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 2 | 4 | 238 | 96.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 3 | 4 | 313 | 96.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 4 | 4 | 218 | 94.100 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 5 | 4 | 291 | 94 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 6 | 4 | 268 | 95.200 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 7 | 4 | 390 | 95.100 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 8 | 4 | 117 | 94 | ++---------------+---------------------+-------+-----------------+--------------+ +| low_quality | 9 | 4 | 294 | 97.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| others | Zaklyuchenie_nevrol | 4 | 525 | 83 | +| | oga_00 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| others | Zaklyuchenie_nevrol | 4 | 241 | 87 | +| | oga_01 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| others | napalm_doc_2_2_6 | 4 | 124 | 85 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | 1.620e+14 | 4 | 695 | 99.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | 1.620e+14 | 4 | 696 | 99.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | 1.620e+14 | 4 | 699 | 99.400 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | article_multiline | 4 | 471 | 99.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | fstek17_00 | 4 | 192 | 92.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | fstek17_01 | 4 | 332 | 99.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | law_image | 4 | 182 | 99.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | napalm_doc_13_2 | 4 | 243 | 96.700 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 98.800 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ukodeksrf_01 | 4 | 340 | 99.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 146 | 94.400 | +| | 0 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 276 | 98.800 | +| | 1 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 165 | 98.500 | +| | 2 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 90 | 99.400 | +| | 3 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_00 | 4 | 78 | 97.400 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_01 | 4 | 296 | 98 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_02 | 4 | 309 | 98.800 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_03 | 4 | 337 | 98.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_04 | 4 | 257 | 96.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_05 | 4 | 238 | 97.800 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_06 | 4 | 219 | 93.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_07 | 4 | 233 | 98.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_08 | 4 | 284 | 95.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_09 | 4 | 154 | 97.600 | ++---------------+---------------------+-------+-----------------+--------------+ + +Table 2 - AVG by each type of symbols: ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | +| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | +| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | +| | s | ols | | ars | bols | | | | ++========+========+========+========+========+========+========+=======+=======+ +| englis | 94.820 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 96.04 | +| h- | | | | | | | | 0 | +| words | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| low_qu | 99.190 | 75.340 | 94.544 | 0 | 0 | 97.640 | 2752 | 95.29 | +| ality | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| others | 89.767 | 77.100 | 89.533 | 0 | 0 | 86.433 | 890 | 85 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| tz-npa | 98.956 | 90.920 | 92.104 | 0 | 0 | 99.488 | 7483 | 97.92 | +| | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['3 & <. №> -> < No>', '2 & < 2> -> ', '2 & < г> -> <К>', '2 & < | +| | ‚> -> <,>', "2 & <1 > -> <'>", '2 & <и > -> <н>', '2 & <№ > -> | +| | '] | ++--------+---------------------------------------------------------------------+ +| . | ['54 & <.> -> <,>', '3 & <. №> -> < No>', '3 & <3.> -> < De>', '3 & | +| | <В.В> -> ', '2 & <Г.> -> <С>', '2 & <г.> -> <ГТ>', '2 & <п.> -> | +| | <,>'] | ++--------+---------------------------------------------------------------------+ +| , | ['80 & <,> -> <.>', '3 & <ва,> -> <нь>', '1 & <,> -> <»>'] | ++--------+---------------------------------------------------------------------+ +| е | ['6 & <не> -> ', '4 & <е> -> <ё>', '3 & <все> -> <Ко>', '3 & | +| | <ге> -> <Кри>', '3 & <е-> -> <бов>', '3 & <е> -> <а>', '3 & <цев> | +| | -> ', '3 & <че-> -> <и»>', '2 & <е> -> <и>', '2 & <е> -> | +| | <ми>', '2 & <е> -> <с>', '2 & <ее> -> ', '2 & <ле> -> <У>', '1 | +| | & <е> -> <Е>', '1 & <е> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| о | ['6 & <то> -> ', '3 & <По> -> ', '3 & <Про> -> <Ис>', '3 & | +| | <но> -> ', '3 & <она> -> ', '3 & <под> -> ', '3 & | +| | <фок> -> <М>', '2 & <во> -> <за>', '2 & <до> -> ', '2 & <до> -> | +| | ', '2 & <о> -> <ак>', '2 & <о> -> <у>', '2 & <об> -> <бы>', '2 | +| | & <по> -> <10>', '2 & <то> -> ', '1 & <о> -> <в>', '1 & <о> -> | +| | <я>'] | ++--------+---------------------------------------------------------------------+ +| а | ['5 & <а> -> <о>', '4 & <на> -> ', '3 & <Нам> -> ', '3 & | +| | <а> -> <ёту>', '3 & <ва,> -> <нь>', '3 & <на> -> <под>', '3 & <она> | +| | -> ', '3 & <рак> -> <Ли>', '3 & <сан> -> <еви>', '3 & <так> -> | +| | ', '2 & <Ла> -> <А>', '2 & <а> -> <ся>', '2 & <ва> -> <к>', '2 | +| | & <на> -> ', '1 & <а> -> <Б>', '1 & <а> -> <е>', '1 & <а> -> | +| | <у>', '1 & <а> -> <ы>', '1 & <а> -> <ь>'] | ++--------+---------------------------------------------------------------------+ +| н | ['6 & <не> -> ', '4 & <на> -> ', '3 & <на> -> <под>', '3 & | +| | <но> -> ', '3 & <она> -> ', '3 & <сан> -> <еви>', '2 & | +| | <йн> -> <ем>', '2 & <н> -> <п>', '2 & <на> -> ', '2 & <нк> -> | +| | <х>', '2 & <ны> -> <им>', '1 & <н> -> <Н>', '1 & <н> -> <и>', '1 & | +| | <н> -> <й>', '1 & <н> -> <л>', '1 & <н> -> <м>', '1 & <н> -> <ф>'] | ++--------+---------------------------------------------------------------------+ +| и | ['4 & <и> -> <е>', '3 & <ив> -> <ьюж>', '3 & <тип> -> ', '3 & | +| | <ции> -> <узы>', '2 & <и > -> <н>', '2 & <и> -> <10>', '2 & <и> -> | +| | <ей>', '2 & <и> -> <мм>', '2 & <ис> -> <не>', '2 & <их> -> ', | +| | '2 & <их> -> ', '2 & <си> -> <ен>', '1 & <и> -> <В>', '1 & <и> | +| | -> <а>', '1 & <и> -> <с>', '1 & <и> -> <ь>'] | ++--------+---------------------------------------------------------------------+ +| - | ['8 & <-> -> <но>', '6 & <-> -> <ния>', '5 & <-> -> <в>', '3 & <-> | +| | -> <жья>', '3 & <-> -> <ков>', '3 & <-> -> <нил>', '3 & <-> -> | +| | <щим>', '3 & <е-> -> <бов>', '3 & <че-> -> <и»>', '2 & <-> -> | +| | <ве>', '2 & <-> -> <да>', '2 & <-> -> <ие>', '2 & <-> -> <ко>', '2 | +| | & <-> -> <ли>', '2 & <-> -> <м">', '2 & <-> -> <м>', '2 & <-> -> | +| | <мо>', '2 & <-> -> <ны>', '2 & <-> -> <ры>', '2 & <-> -> <ых>', '2 | +| | & <-> -> <“>', '2 & <у-> -> <ем>', '2 & <ы-> -> <им>', '2 & <ы-> -> | +| | <ём>', '1 & <-> -> <">', '1 & <-> -> <»>', '1 & <-> -> <д>', '1 & | +| | <-> -> <л>', '1 & <-> -> <н>', '1 & <-> -> <ы>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ["4 & <1> -> <'>", '4 & <1С> -> ', '3 & <1> -> <3>', '3 & <№1> | +| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | +| | ', '2 & <1C> -> <С>', '2 & <1> -> <2>', '2 & <1> -> ', '1 & | +| | <1> -> ', '1 & <1> -> <5>', '1 & <1> -> <Г>', '1 & <1> -> <С>', | +| | '1 & <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| № | ['94 & <№> -> ', '6 & <№> -> ', '3 & <. №> -> < No>', '3 & | +| | <№1> -> ', '3 & <№1»> -> ', '2 & <№ > -> '] | ++--------+---------------------------------------------------------------------+ +| в | ['4 & <в> -> <6>', '3 & <ва,> -> <нь>', '3 & <все> -> <Ко>', '3 & | +| | <ив> -> <ьюж>', '3 & <ств> -> <У н>', '3 & <цев> -> ', '2 & | +| | <в> -> <«В>', '2 & <в> -> <зм>', '2 & <в> -> <м>', '2 & <в> -> | +| | <по>', '2 & <ва> -> <к>', '2 & <во> -> <за>', '1 & <в> -> ', '1 | +| | & <в> -> <В>', '1 & <в> -> <г>', '1 & <в> -> <н>'] | ++--------+---------------------------------------------------------------------+ +| с | ['3 & <все> -> <Ко>', '3 & <сан> -> <еви>', '3 & <ств> -> <У н>', | +| | '2 & <ис> -> <не>', '2 & <с> -> <Не>', '2 & <с> -> <От>', '2 & <си> | +| | -> <ен>', '1 & <с> -> ', '1 & <с> -> <б>', '1 & <с> -> <н>'] | ++--------+---------------------------------------------------------------------+ +| т | ['6 & <то> -> ', '3 & <ств> -> <У н>', '3 & <так> -> ', '3 | +| | & <тип> -> ', '2 & <т> -> <г>', '2 & <то> -> ', '1 & <т> | +| | -> <Д>', '1 & <т> -> <Т>', '1 & <т> -> <м>'] | ++--------+---------------------------------------------------------------------+ +| л | ['2 & <зл> -> <им>', '2 & <ле> -> <У>', '1 & <л> -> ', '1 & <л> | +| | -> <Л>', '1 & <л> -> <д>', '1 & <л> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| р | ['3 & <Про> -> <Ис>', '3 & <гр> -> <тав>', '3 & <рак> -> <Ли>', '2 | +| | & <гр> -> ', '2 & <р> -> <ал>'] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & < 2> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | +| | <28> -> <Я>'] | ++--------+---------------------------------------------------------------------+ +| д | ['3 & <д> -> <Пен>', '3 & <под> -> ', '2 & <до> -> ', '2 & | +| | <до> -> ', '1 & <д> -> <Т>', '1 & <д> -> <Ц>'] | ++--------+---------------------------------------------------------------------+ +| г | ['3 & <ге> -> <Кри>', '3 & <гр> -> <тав>', '2 & < г> -> <К>', '2 & | +| | <г.> -> <ГТ>', '2 & <г> -> <т>', '2 & <гр> -> '] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.> -> < De>', '1 & <3> -> <">', '1 & <3> -> '] | ++--------+---------------------------------------------------------------------+ +| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '3 & <ОС> -> ', '3 | +| | & <С> -> ', '2 & <ОС> -> '] | ++--------+---------------------------------------------------------------------+ +| N | ['22 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| А | ['2 & <А> -> ', '2 & <А> -> <Ли>'] | ++--------+---------------------------------------------------------------------+ +| И | ['2 & <И> -> <АН>', '1 & <И> -> <В>', '1 & <И> -> <Й>'] | ++--------+---------------------------------------------------------------------+ +| п | ['3 & <под> -> ', '3 & <тип> -> ', '2 & <п.> -> <,>', '2 | +| | & <п> -> <и >', '2 & <п> -> <л>', '2 & <по> -> <10>', '1 & <п> -> | +| | <П>'] | ++--------+---------------------------------------------------------------------+ +| к | ['3 & <рак> -> <Ли>', '3 & <так> -> ', '3 & <фок> -> <М>', '2 | +| | & <нк> -> <х>'] | ++--------+---------------------------------------------------------------------+ +| у | ['3 & <у> -> <ы>', '2 & <у-> -> <ем>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['3 & <Нам> -> ', '2 & <Н> -> <ЕМ>', '1 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕМ> -> <Ш>'] | ++--------+---------------------------------------------------------------------+ +| О | ['3 & <ОС> -> ', '2 & <ОС> -> ', '2 & <Об> -> <06>', '1 & | +| | <О> -> ', '1 & <О> -> <Ю>', '1 & <О> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| П | ['3 & <По> -> ', '3 & <Про> -> <Ис>', '2 & <П> -> <И>', '1 & | +| | <П> -> <К>', '1 & <П> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| б | ['3 & <"б"> -> <“8”>', '2 & <Об> -> <06>', '2 & <б> -> <«Л>', '2 & | +| | <об> -> <бы>'] | ++--------+---------------------------------------------------------------------+ +| ы | ['2 & <ны> -> <им>', '2 & <ы-> -> <им>', '2 & <ы-> -> <ём>', '1 & | +| | <ы> -> <б>', '1 & <ы> -> <е>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['9 & <;> -> <:>', '1 & <;> -> <,>', '1 & <;> -> <.>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['3 & <МРТ> -> ', '3 & <Т> -> <Г>', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| м | ['3 & <Нам> -> '] | ++--------+---------------------------------------------------------------------+ +| В | ['6 & <СЗВ> -> ', '3 & <В.В> -> ', '2 & <ВЗ> -> <РИ>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> <Ш>', '3 & -> <УП>', '1 & -> '] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МРТ> -> ', '2 & <ЕМ> -> <Ш>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| ц | ['3 & <цев> -> ', '3 & <ции> -> <узы>', '2 & <ц> -> <С>', '1 & | +| | <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| Л | ['2 & <Ла> -> <А>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <2>'] | ++--------+---------------------------------------------------------------------+ +| з | ['2 & <зл> -> <им>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | +| | <28> -> <Я>'] | ++--------+---------------------------------------------------------------------+ +| й | ['2 & <й> -> <е:>', '2 & <йн> -> <ем>'] | ++--------+---------------------------------------------------------------------+ +| " | ['3 & <"б"> -> <“8”>', '2 & <"> -> <“>', '1 & <"> -> <”>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <2>'] | ++--------+---------------------------------------------------------------------+ +| E | ['3 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | +| | '2 & <ВЗ> -> <РИ>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['3 & <че-> -> <и»>'] | ++--------+---------------------------------------------------------------------+ +| : | ['2 & <:> -> '] | ++--------+---------------------------------------------------------------------+ +| [ | ['2 & <[> -> <(>'] | ++--------+---------------------------------------------------------------------+ +| ] | ['2 & <]> -> <)>'] | ++--------+---------------------------------------------------------------------+ +| 4 | ['1 & <4> -> <“>'] | ++--------+---------------------------------------------------------------------+ +| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['3 & <БЗ> -> <653>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| ш | ['2 & <ш> -> <«Ч>', '1 & <ш> -> <ч>'] | ++--------+---------------------------------------------------------------------+ +| P | ['3 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| R | ['3 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '1 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| х | ['2 & <их> -> ', '2 & <их> -> '] | ++--------+---------------------------------------------------------------------+ +| — | ['1 & <—> -> <->'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>', '2 & -> <Из>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| m | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| ф | ['3 & <фок> -> <М>', '1 & <ф> -> <Ф>'] | ++--------+---------------------------------------------------------------------+ +| ю | ['1 & <ю> -> <у>'] | ++--------+---------------------------------------------------------------------+ +| c | ['2 & -> <со>', '1 & -> <с>'] | ++--------+---------------------------------------------------------------------+ +| o | ['2 & -> <со>', '2 & -> '] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1 >'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & < ‚> -> <,>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> <Х>'] | ++--------+---------------------------------------------------------------------+ +| d | ['1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| e | ['2 & -> <Из>'] | ++--------+---------------------------------------------------------------------+ +| x | ['1 & -> <х>'] | ++--------+---------------------------------------------------------------------+ +| y | ['1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| Г | ['2 & <Г.> -> <С>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_with_correction.txt b/resources/benchmarks/tesseract_benchmark_with_correction.txt new file mode 100644 index 00000000..2d8f79ad --- /dev/null +++ b/resources/benchmarks/tesseract_benchmark_with_correction.txt @@ -0,0 +1,259 @@ +Tesseract version is 5.0.0 +Correction step: True +Time mean correction ocr = 14.493645176743016 +Table 1 - Accuracy for each file ++---------------+---------------------+-------+-----------------+--------------+ +| Dataset | Image name | --psm | Amount of words | Accuracy OCR | ++===============+=====================+=======+=================+==============+ +| english-words | Kaspersky | 6 | 111 | 99.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | USB | 6 | 4 | 80.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | words1 | 6 | 19 | 100 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | words2 | 6 | 9 | 100 | ++---------------+---------------------+-------+-----------------+--------------+ +| english-words | words3 | 6 | 9 | 100 | ++---------------+---------------------+-------+-----------------+--------------+ +| others | Zaklyuchenie_nevrol | 4 | 525 | 83 | +| | oga_00 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| others | Zaklyuchenie_nevrol | 4 | 241 | 87 | +| | oga_01 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| others | napalm_doc_2_2_6 | 4 | 124 | 85 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | 1.620e+14 | 4 | 695 | 99.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | 1.620e+14 | 4 | 696 | 99.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | 1.620e+14 | 4 | 699 | 99.400 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | article_multiline | 4 | 471 | 99.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | fstek17_00 | 4 | 192 | 92.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | fstek17_01 | 4 | 332 | 99.600 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | law_image | 4 | 182 | 99.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | napalm_doc_13_2 | 4 | 243 | 96.700 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 98.800 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ukodeksrf_01 | 4 | 340 | 99.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 146 | 94.400 | +| | 0 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 276 | 98.800 | +| | 1 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 165 | 98.500 | +| | 2 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | with_applications_0 | 4 | 90 | 99.400 | +| | 3 | | | | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_00 | 4 | 78 | 97.400 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_01 | 4 | 296 | 98 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_02 | 4 | 309 | 98.800 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_03 | 4 | 337 | 98.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_04 | 4 | 257 | 96.300 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_05 | 4 | 238 | 97.800 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_06 | 4 | 219 | 93.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_07 | 4 | 233 | 98.500 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_08 | 4 | 284 | 95.900 | ++---------------+---------------------+-------+-----------------+--------------+ +| tz-npa | ТЗ_09 | 4 | 154 | 97.600 | ++---------------+---------------------+-------+-----------------+--------------+ + +Table 2 - AVG by each type of symbols: ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | +| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | +| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | +| | s | ols | | ars | bols | | | | ++========+========+========+========+========+========+========+=======+=======+ +| englis | 94.820 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 96.04 | +| h- | | | | | | | | 0 | +| words | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| others | 89.767 | 77.100 | 89.533 | 0 | 0 | 86.433 | 890 | 85 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| tz-npa | 98.956 | 90.920 | 92.104 | 0 | 0 | 99.488 | 7483 | 97.92 | +| | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['3 & <. №> -> < No>', '2 & < г> -> <К>', '2 & < ‚> -> <,>', "2 & | +| | <1 > -> <'>", '2 & <и > -> <н>', '2 & <№ > -> '] | ++--------+---------------------------------------------------------------------+ +| . | ['4 & <.> -> <,>', '3 & <. №> -> < No>', '3 & <3.> -> < De>', '2 & | +| | <г.> -> <ГТ>'] | ++--------+---------------------------------------------------------------------+ +| , | ['66 & <,> -> <.>', '3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['4 & <1С> -> ', "3 & <1> -> <'>", '3 & <1> -> <3>', '3 & <№1> | +| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | +| | ', '2 & <1C> -> <С>', '2 & <1> -> <2>', '2 & <1> -> ', '1 & | +| | <1> -> <5>', '1 & <1> -> <Г>', '1 & <1> -> <С>', '1 & <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| е | ['2 & <е> -> <и>', '2 & <е> -> <ё>', '2 & <ле> -> <У>', '2 & <не> | +| | -> ', '1 & <е> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| и | ['4 & <и> -> <е>', '3 & <тип> -> ', '2 & <и > -> <н>', '2 & | +| | <ис> -> <не>', '2 & <си> -> <ен>', '1 & <и> -> <В>', '1 & <и> -> | +| | <а>', '1 & <и> -> <ь>'] | ++--------+---------------------------------------------------------------------+ +| а | ['4 & <а> -> <о>', '3 & <ва,> -> <нь>', '3 & <на> -> <под>', '1 & | +| | <а> -> <Б>', '1 & <а> -> <е>', '1 & <а> -> <у>'] | ++--------+---------------------------------------------------------------------+ +| № | ['80 & <№> -> ', '6 & <№> -> ', '3 & <. №> -> < No>', '3 & | +| | <№1> -> ', '3 & <№1»> -> ', '2 & <№ > -> '] | ++--------+---------------------------------------------------------------------+ +| н | ['3 & <на> -> <под>', '2 & <не> -> ', '1 & <н> -> <й>'] | ++--------+---------------------------------------------------------------------+ +| о | ['3 & <фок> -> <М>', '2 & <о> -> <у>', '2 & <об> -> <бы>', '2 & | +| | <то> -> ', '1 & <о> -> <в>', '1 & <о> -> <я>'] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> ', '2 & <28> -> <ИР>', '2 & <28> -> <Я>'] | ++--------+---------------------------------------------------------------------+ +| л | ['2 & <ле> -> <У>', '1 & <л> -> <Л>', '1 & <л> -> <д>', '1 & <л> -> | +| | <т>'] | ++--------+---------------------------------------------------------------------+ +| т | ['3 & <тип> -> ', '2 & <т> -> <г>', '2 & <то> -> '] | ++--------+---------------------------------------------------------------------+ +| р | ['3 & <гр> -> <тав>', '2 & <гр> -> '] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.> -> < De>', '1 & <3> -> '] | ++--------+---------------------------------------------------------------------+ +| N | ['22 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '3 & <ОС> -> ', '3 | +| | & <С> -> ', '2 & <ОС> -> '] | ++--------+---------------------------------------------------------------------+ +| г | ['3 & <гр> -> <тав>', '2 & < г> -> <К>', '2 & <г.> -> <ГТ>', '2 & | +| | <г> -> <т>', '2 & <гр> -> '] | ++--------+---------------------------------------------------------------------+ +| с | ['2 & <ис> -> <не>', '2 & <си> -> <ен>', '1 & <с> -> '] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <ва,> -> <нь>', '2 & <в> -> <м>', '2 & <в> -> <по>', '1 & <в> | +| | -> <В>'] | ++--------+---------------------------------------------------------------------+ +| И | ['2 & <И> -> <АН>', '1 & <И> -> <В>', '1 & <И> -> <Й>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕМ> -> <Ш>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['2 & <Н> -> <ЕМ>', '1 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| О | ['3 & <ОС> -> ', '2 & <ОС> -> ', '2 & <Об> -> <06>', '1 & | +| | <О> -> <Ю>', '1 & <О> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['3 & <МРТ> -> ', '3 & <Т> -> <Г>', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| П | ['1 & <П> -> <И>', '1 & <П> -> <К>', '1 & <П> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| - | ['2 & <-> -> <ры>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| у | ['2 & <у> -> <ы>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> <Ш>', '3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| В | ['6 & <СЗВ> -> ', '2 & <ВЗ> -> <РИ>'] | ++--------+---------------------------------------------------------------------+ +| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| п | ['3 & <тип> -> '] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МРТ> -> ', '2 & <ЕМ> -> <Ш>'] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <Об> -> <06>', '2 & <об> -> <бы>'] | ++--------+---------------------------------------------------------------------+ +| к | ['3 & <фок> -> <М>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <2>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['8 & <;> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| ы | ['1 & <ы> -> <б>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | +| | <28> -> <Я>'] | ++--------+---------------------------------------------------------------------+ +| E | ['3 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | +| | '2 & <ВЗ> -> <РИ>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['1 & <ц> -> <С>', '1 & <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <2>'] | ++--------+---------------------------------------------------------------------+ +| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['3 & <БЗ> -> <653>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| й | ['2 & <й> -> <е:>'] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| P | ['3 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| R | ['3 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '1 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>', '2 & -> <Из>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| m | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1 >'] | ++--------+---------------------------------------------------------------------+ +| ф | ['3 & <фок> -> <М>'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & < ‚> -> <,>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> <Х>'] | ++--------+---------------------------------------------------------------------+ +| c | ['1 & -> <с>'] | ++--------+---------------------------------------------------------------------+ +| d | ['1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| e | ['2 & -> <Из>'] | ++--------+---------------------------------------------------------------------+ +| o | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| y | ['1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| ю | ['1 & <ю> -> <у>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/scripts/calc_tesseract_benchmarks.py b/scripts/calc_tesseract_benchmarks.py index 69229b55..ca259d87 100644 --- a/scripts/calc_tesseract_benchmarks.py +++ b/scripts/calc_tesseract_benchmarks.py @@ -1,5 +1,6 @@ import os import re +import time import zipfile from tempfile import TemporaryDirectory from typing import Dict, List, Tuple @@ -11,6 +12,14 @@ from texttable import Texttable from dedoc.config import get_config +from dedoc.scripts.text_blob_correction import TextBlobCorrector +from scripts.ocr_correction import correction, init_correction_step + +WITHOUT_CORRECTION = "" +SAGE_CORRECTION = "_sage-correction" +TEXT_BLOB_CORRECTION = "_textblob-correction" + +USE_CORRECTION_OCR = TEXT_BLOB_CORRECTION def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str: @@ -169,9 +178,28 @@ def __create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[ return table_common, table_accuracy_per_image -def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str) -> Tuple[Texttable, Texttable]: +def calculate_accuracy_script(tmp_gt_path: str, tmp_prediction_path: str, accuracy_path: str) -> None: + # calculation accuracy build for Ubuntu from source https://github.com/eddieantonio/ocreval + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy")) + command = f"{accuracy_script_path} {tmp_gt_path} {tmp_prediction_path} >> {accuracy_path}" + os.system(command) + + +def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, cache_dir: str) -> Tuple[Texttable, Texttable]: statistics = {} accuracy_values = [] + correction_times = [] + + result_dir = os.path.join(cache_dir, "result_ocr") + os.makedirs(result_dir, exist_ok=True) + + corrector, corrected_path = None, None + if USE_CORRECTION_OCR == SAGE_CORRECTION: + corrector, corrected_path = init_correction_step(cache_dir) + elif USE_CORRECTION_OCR == TEXT_BLOB_CORRECTION: + corrector = TextBlobCorrector() + corrected_path = os.path.join(cache_dir, "result_corrected") + os.makedirs(corrected_path, exist_ok=True) with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -191,41 +219,61 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str) - gt_path = os.path.join(base_zip, dataset_name, "gts", f"{base_name}.txt") imgs_path = os.path.join(base_zip, dataset_name, "imgs", img_name) accuracy_path = os.path.join(cache_dir_accuracy, f"{dataset_name}_{base_name}_accuracy.txt") - - with TemporaryDirectory() as tmpdir: - tmp_gt_path = os.path.join(tmpdir, "tmp_gt.txt") - tmp_ocr_path = os.path.join(tmpdir, "tmp_ocr.txt") - - try: - with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(tmp_ocr_path, "w") as tmp_ocr_file: - - gt_text = gt_file.read().decode("utf-8") - word_cnt = len(gt_text.split()) - - tmp_gt_file.write(gt_text.encode()) # extraction gt from zip - tmp_gt_file.flush() - - arch_file.extract(imgs_path, tmpdir) - image = cv2.imread(tmpdir + "/" + imgs_path) - - # call ocr - psm = 6 if dataset_name == "english-words" else 4 - text = _call_tesseract(image, "rus+eng", psm=psm) - tmp_ocr_file.write(text) - tmp_ocr_file.flush() - - # calculation accuracy build for Ubuntu from source https://github.com/eddieantonio/ocreval - accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy")) - command = f"{accuracy_script_path} {tmp_gt_path} {tmp_ocr_path} >> {accuracy_path}" - os.system(command) - - statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) - accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) - - except Exception as ex: - print(ex) - print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") - + if os.path.exists(accuracy_path): + os.remove(accuracy_path) + + tmp_gt_path = os.path.join(result_dir, f"{img_name}_gt.txt") + tmp_ocr_path = os.path.join(result_dir, f"{img_name}_ocr.txt") + + try: + with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(tmp_ocr_path, "w") as tmp_ocr_file: + + gt_text = gt_file.read().decode("utf-8") + word_cnt = len(gt_text.split()) + + tmp_gt_file.write(gt_text.encode()) # extraction gt from zip + tmp_gt_file.close() + + arch_file.extract(imgs_path, result_dir) + image = cv2.imread(result_dir + "/" + imgs_path) + + # call ocr + psm = 6 if dataset_name == "english-words" else 4 + text = _call_tesseract(image, "rus+eng", psm=psm) + tmp_ocr_file.write(text) + tmp_ocr_file.close() + + # call correction step + time_b = time.time() + if USE_CORRECTION_OCR == SAGE_CORRECTION: + tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt") + corrected_text = correction(corrector, text) + correction_times.append(time.time() - time_b) + with open(tmp_corrected_path, "w") as tmp_corrected_file: + tmp_corrected_file.write(corrected_text) + tmp_corrected_file.close() + + calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path) + elif USE_CORRECTION_OCR == TEXT_BLOB_CORRECTION: + tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt") + corrected_text = corrector.correct(text) + correction_times.append(time.time() - time_b) + with open(tmp_corrected_path, "w") as tmp_corrected_file: + tmp_corrected_file.write(corrected_text) + tmp_corrected_file.close() + + calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path) + else: + calculate_accuracy_script(tmp_gt_path, tmp_ocr_path, accuracy_path) + + statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) + accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) + + except Exception as ex: + print(ex) + print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") + + print(f"Time mean correction ocr = {np.array(correction_times).mean()}") table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) return table_common, table_accuracy_per_image @@ -240,18 +288,20 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str) - benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): - wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) + wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT", benchmark_data_path) print(f"Benchmark data downloaded to {benchmark_data_path}") else: print(f"Use cached benchmark data from {benchmark_data_path}") assert os.path.isfile(benchmark_data_path) - table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path) + table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy) - with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file: - res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\nTable 1 - Accuracy for each file\n") + with open(os.path.join(output_dir, f"tesseract_benchmark{USE_CORRECTION_OCR}.txt"), "w") as res_file: + res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\n") + res_file.write(f"Correction step: {USE_CORRECTION_OCR}\n") + res_file.write(f"\nTable 1 - Accuracy for each file\n") res_file.write(table_accuracy_per_image.draw()) res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) diff --git a/scripts/language_tool_correction.py b/scripts/language_tool_correction.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/ocr_correction.py b/scripts/ocr_correction.py new file mode 100644 index 00000000..ada563e2 --- /dev/null +++ b/scripts/ocr_correction.py @@ -0,0 +1,43 @@ +import os +from typing import Tuple + +import torch +from sage.spelling_correction.corrector import Corrector +from sage.spelling_correction import AvailableCorrectors +from sage.spelling_correction import RuM2M100ModelForSpellingCorrection + +''' +Install sage library (for ocr correction step): +git clone https://github.com/ai-forever/sage.git +cd sage +pip install . +pip install -r requirements.txt + +Note: sage use 5.2 Gb GPU ...... +''' +USE_GPU = True + + +def correction(model: Corrector, ocr_text: str) -> str: + + corrected_lines = [] + for line in ocr_text.split("\n"): + corrected_lines.append(model.correct(line)[0]) + corrected_text = "\n".join(corrected_lines) + + return corrected_text + + +def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: + + corrected_path = os.path.join(cache_dir, "result_corrected") + os.makedirs(corrected_path, exist_ok=True) + corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) + if torch.cuda.is_available() and USE_GPU: + corrector.model.to(torch.device("cuda:0")) + print("use CUDA") + else: + print("use CPU") + return corrector, corrected_path + + diff --git a/scripts/text_blob_correction.py b/scripts/text_blob_correction.py new file mode 100644 index 00000000..8ecf8be6 --- /dev/null +++ b/scripts/text_blob_correction.py @@ -0,0 +1,9 @@ +from textblob import TextBlob + + +class TextBlobCorrector: + def __init__(self): + return + + def correct(self, text: str) -> str: + return str(TextBlob(text).correct())