Bump surya version to 0.8.1

xiaoyao9184 · Dec 22, 2024 · 1c161f3 · 1c161f3
1 parent 3178995
commit 1c161f3
Show file tree

Hide file tree

Showing 16 changed files with 109 additions and 24 deletions.
diff --git a/.github/workflows/docker-image-tag-version.yml b/.github/workflows/docker-image-tag-version.yml
@@ -13,11 +13,11 @@ on:
       surya_version:
         description: surya version of pypi
         required: true
-        default: 0.8.0
+        default: 0.8.1
       streamlit_version:
         description: streamlit version of pypi
         required: true
-        default: 1.41.0
+        default: 1.41.1
 
 jobs:
   build-and-push-docker-image:
@@ -29,9 +29,9 @@ jobs:
     strategy:
       matrix:
         SURYA_VERSION:
-          - ${{ github.event.inputs.surya_version || '0.8.0' }}
+          - ${{ github.event.inputs.surya_version || '0.8.1' }}
         STREAMLIT_VERSION:
-          - ${{ github.event.inputs.streamlit_version || '1.41.0' }}
+          - ${{ github.event.inputs.streamlit_version || '1.41.1' }}
         platform:
           - linux/amd64
 

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -37,7 +37,8 @@
                 "DETECTOR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95",
                 "RECOGNITION_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41",
                 "LAYOUT_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957",
-                "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14"
+                "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14",
+                "OCR_ERROR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414"
             },
             "justMyCode": false
         },
@@ -75,7 +76,8 @@
                 "DETECTOR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95",
                 "RECOGNITION_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41",
                 "LAYOUT_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957",
-                "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14"
+                "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14",
+                "OCR_ERROR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414"
             },
             "justMyCode": false
         },

diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -23,9 +23,9 @@
             "command": "docker build -t xiaoyao9184/surya:master -f ./docker/build@source/dockerfile .",
         },
         {
-            "label": "docker: build 0.8.0",
+            "label": "docker: build 0.8.1",
             "type": "shell",
-            "command": "docker build -t xiaoyao9184/surya:0.8.0 -f ./docker/build@pypi/dockerfile .",
+            "command": "docker build -t xiaoyao9184/surya:0.8.1 -f ./docker/build@pypi/dockerfile .",
         },
         {
             "label": "huggingface-cli: download models",
@@ -43,6 +43,7 @@
                 "&& huggingface-cli download vikp/surya_rec2 --revision main --cache-dir ./cache/huggingface/hub",
                 "&& huggingface-cli download vikp/surya_tablerec --revision main --cache-dir ./cache/huggingface/hub",
                 "&& huggingface-cli download datalab-to/surya_layout0 --revision main --cache-dir ./cache/huggingface/hub",
+                "&& huggingface-cli download datalab-to/ocr_error_detection --revision main --cache-dir ./cache/huggingface/hub",
             ]
         },
         {

diff --git a/cache/README.md b/cache/README.md
@@ -41,7 +41,7 @@ and `./cache/huggingface/hub/models--vikp--surya_rec2` like this
 │   ├── 31bdd446acbf8a47ea46d7d0a4998f145f0cc75a
 │   ├── 5497e8690cfe93cbedec7efaf91f6ac734496ac8
 │   ├── 93c190b5690dd55aac16723222a9909e2be0faec
-│   ├── 9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
+│   ├── 9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
 │   ├── a6344aac8c09253b3b630fb776ae94478aa0275b
 │   ├── a83ef0a8114bd50cc650e08a9738c0f6345f5186
 │   ├── dd34282c30833587a799d334d44a637694d41c8e
@@ -54,7 +54,7 @@ and `./cache/huggingface/hub/models--vikp--surya_rec2` like this
         ├── config.json -> ../../blobs/5497e8690cfe93cbedec7efaf91f6ac734496ac8
         ├── generation_config.json -> ../../blobs/e237701f4293e736f74d2c968582935590107034
         ├── .gitattributes -> ../../blobs/a6344aac8c09253b3b630fb776ae94478aa0275b
-        ├── model.safetensors -> ../../blobs/9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
+        ├── model.safetensors -> ../../blobs/9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
         ├── preprocessor_config.json -> ../../blobs/dd34282c30833587a799d334d44a637694d41c8e
         ├── README.md -> ../../blobs/a83ef0a8114bd50cc650e08a9738c0f6345f5186
         ├── special_tokens_map.json -> ../../blobs/2f525ec0be1f2e8cb257a7b3e01de3bd003f0e81
@@ -117,11 +117,41 @@ and `./cache/huggingface/hubmodels--datalab-to--surya_layout0` like this
 4 directories, 10 files
 ```
 
+and `./cache/huggingface/hub/models--datalab-to--ocr_error_detection` like this
+
+
+```
+.
+├── blobs
+│   ├── 21f54a4b56685f29358f3a8de1f5b8d827357d07
+│   ├── 9856c52ab99c8f7435bef6bf6e4c8a86a2594187
+│   ├── 9bbecc17cabbcbd3112c14d6982b51403b264bfa
+│   ├── a6344aac8c09253b3b630fb776ae94478aa0275b
+│   ├── c305af17d2fcaf52c00b125a2dfabfbe16e71454
+│   ├── cd3c57f2e967aad6a020decd1c1c41be-10
+│   ├── e837bab60a5d204e29622d127c2dafe508aa0731
+│   └── f4a46fa248690b0b2adc680e845ec8fd491eb24c
+├── refs
+│   └── main
+└── snapshots
+    └── c1cbda3757670fd520553eaa5197656d331de414
+        ├── config.json -> ../../blobs/9856c52ab99c8f7435bef6bf6e4c8a86a2594187
+        ├── model.safetensors -> ../../blobs/cd3c57f2e967aad6a020decd1c1c41be-10
+        ├── README.md -> ../../blobs/c305af17d2fcaf52c00b125a2dfabfbe16e71454
+        ├── special_tokens_map.json -> ../../blobs/9bbecc17cabbcbd3112c14d6982b51403b264bfa
+        ├── tokenizer_config.json -> ../../blobs/f4a46fa248690b0b2adc680e845ec8fd491eb24c
+        ├── tokenizer.json -> ../../blobs/21f54a4b56685f29358f3a8de1f5b8d827357d07
+        └── vocab.txt -> ../../blobs/e837bab60a5d204e29622d127c2dafe508aa0731
+
+5 directories, 16 files
+```
+
 It will use
 - `./cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95`
 - `./cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41`
 - `./cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14`
 - `./cache/huggingface/hub/hubmodels--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957`
+- `./cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414`
 
 For more details, refer to [up@cpu-offline/docker-compose.yml](./../docker/up@cpu-offline/docker-compose.yml).
 
@@ -143,4 +173,5 @@ huggingface-cli download vikp/surya_det3 --repo-type model --revision main --cac
 huggingface-cli download vikp/surya_rec2 --repo-type model --revision main --cache-dir ./cache/huggingface/hub
 huggingface-cli download vikp/surya_tablerec --repo-type model --revision main --cache-dir ./cache/huggingface/hub
 huggingface-cli download datalab-to/surya_layout0 --revision main --cache-dir ./cache/huggingface/hub
+huggingface-cli download datalab-to/ocr_error_detection --revision main --cache-dir ./cache/huggingface/hub
 ```
diff --git a/docker/build-without-fonts@pypi/dockerfile b/docker/build-without-fonts@pypi/dockerfile
@@ -1,5 +1,5 @@
-ARG SURYA_VERSION=0.8.0
-ARG STREAMLIT_VERSION=1.41.0
+ARG SURYA_VERSION=0.8.1
+ARG STREAMLIT_VERSION=1.41.1
 ARG GRADIO_VERSION=5.8.0
 
 FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime

diff --git a/docker/build-without-fonts@source/dockerfile b/docker/build-without-fonts@source/dockerfile
@@ -16,7 +16,7 @@ RUN pip3 install --upgrade pip
 COPY ./surya /app
 RUN pip3 install --no-cache-dir \
     -e . \
-    streamlit==1.41.0 \
+    streamlit==1.41.1 \
     gradio==5.8.0
 # The image is too large, exceeding 30GB.
 # RUN pip3 install poetry

diff --git a/docker/build@pypi/dockerfile b/docker/build@pypi/dockerfile
@@ -1,5 +1,5 @@
-ARG SURYA_VERSION=0.8.0
-ARG STREAMLIT_VERSION=1.41.0
+ARG SURYA_VERSION=0.8.1
+ARG STREAMLIT_VERSION=1.41.1
 ARG GRADIO_VERSION=5.8.0
 
 

diff --git a/docker/build@source/dockerfile b/docker/build@source/dockerfile
@@ -25,7 +25,7 @@ RUN pip3 install --upgrade pip
 COPY ./surya /app
 RUN pip3 install --no-cache-dir \
     -e . \
-    streamlit==1.41.0 \
+    streamlit==1.41.1 \
     gradio==5.8.0
 # The image is too large, with layers exceeding 10GB.
 # RUN pip3 install poetry

diff --git a/docker/up.gradio@cpu-offline/docker-compose.yml b/docker/up.gradio@cpu-offline/docker-compose.yml
@@ -2,7 +2,7 @@
 
 services:
   surya_gradio:
-    image: xiaoyao9184/surya:0.8.0
+    image: xiaoyao9184/surya:0.8.1
     container_name: surya_ocr_gradio
     working_dir: /workspace/gradio
     command: gradio gradio_app.py
@@ -13,6 +13,7 @@ services:
       - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
       - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
       - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
+      - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
     ports:
       - "7860:7860"
     volumes:

diff --git a/docker/up.gradio@gpu-offline/docker-compose.yml b/docker/up.gradio@gpu-offline/docker-compose.yml
@@ -2,7 +2,7 @@
 
 services:
   surya_gradio:
-    image: xiaoyao9184/surya:0.8.0
+    image: xiaoyao9184/surya:0.8.1
     container_name: surya_ocr_gradio
     working_dir: /workspace/gradio
     command: gradio gradio_app.py
@@ -13,6 +13,7 @@ services:
       - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
       - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
       - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
+      - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
     ports:
       - "7860:7860"
     volumes:

diff --git a/docker/up@cpu-offline/docker-compose.yml b/docker/up@cpu-offline/docker-compose.yml
@@ -2,7 +2,7 @@
 
 services:
   surya_app:
-    image: xiaoyao9184/surya:0.8.0
+    image: xiaoyao9184/surya:0.8.1
     container_name: surya_ocr_app
     environment:
       - TORCH_DEVICE=cpu
@@ -11,6 +11,7 @@ services:
       - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
       - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
       - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
+      - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
       # - FONT_DIR=/fonts # not work
     ports:
       - "8501:8501"

diff --git a/docker/up@gpu-offline/docker-compose.yml b/docker/up@gpu-offline/docker-compose.yml
@@ -2,7 +2,7 @@
 
 services:
   surya_app:
-    image: xiaoyao9184/surya:0.8.0
+    image: xiaoyao9184/surya:0.8.1
     container_name: surya_ocr_app
     environment:
       - TORCH_DEVICE=cuda
@@ -11,6 +11,7 @@ services:
       - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
       - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
       - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
+      - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
       # - FONT_DIR=/fonts # not work
     ports:
       - "8501:8501"

diff --git a/environment.yml b/environment.yml
@@ -11,7 +11,7 @@ dependencies:
   - pytorch==2.5.1
   - pytorch-cuda==12.4
 
-  - streamlit==1.41.0
+  - streamlit==1.41.1
 
   - pip:
     - -e ./surya

diff --git a/gradio/gradio_app.py b/gradio/gradio_app.py
@@ -22,6 +22,7 @@
 from surya.model.recognition.processor import load_processor as load_rec_processor
 from surya.model.table_rec.model import load_model as load_table_model
 from surya.model.table_rec.processor import load_processor as load_table_processor
+from surya.model.ocr_error.model import load_model as load_ocr_error_model, load_tokenizer as load_ocr_error_processor
 from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
 from surya.ocr import run_ocr
 from surya.postprocessing.text import draw_text_on_image
@@ -31,7 +32,9 @@
 from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
 from surya.settings import settings
 from surya.tables import batch_table_recognition
-from surya.postprocessing.util import rescale_bboxes, rescale_bbox
+from surya.postprocessing.util import rescale_bbox
+from pdftext.extraction import plain_text_output
+from surya.ocr_error import batch_ocr_error_detection
 
 
 def load_det_cached():
@@ -46,6 +49,34 @@ def load_layout_cached():
 def load_table_cached():
     return load_table_model(), load_table_processor()
 
+def load_ocr_error_cached():
+    """Load the OCR error detection model together with its tokenizer.
+
+    Returns:
+        A ``(model, processor)`` tuple produced by ``load_ocr_error_model()``
+        and ``load_ocr_error_processor()``, in that order.
+    """
+    model = load_ocr_error_model()
+    processor = load_ocr_error_processor()
+    return model, processor
+
+
+def run_ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15):
+    """Sample text from the middle of a PDF and flag garbled OCR text.
+
+    Args:
+        pdf_file: PDF handed to ``plain_text_output`` for text extraction.
+        page_count: Total number of pages in the PDF.
+        sample_len: Length, in characters, of each text sample.
+        max_samples: Upper bound on the number of samples sent to the model.
+        max_pages: Number of pages taken on each side of the middle page.
+
+    Returns:
+        A ``(label, labels)`` tuple: ``label`` is a human-readable verdict and
+        ``labels`` is the per-sample label list from the model, or
+        ``["no text"]`` when there is too little text to sample.
+    """
+    # Sample the text from the middle of the PDF
+    page_middle = page_count // 2
+    first_page = max(page_middle - max_pages, 0)
+    last_page = min(page_middle + max_pages, page_count)
+    text = plain_text_output(pdf_file, page_range=range(first_page, last_page))
+
+    sample_gap = len(text) // max_samples
+    if not text or sample_gap == 0:
+        return "This PDF has no text or very little text", ["no text"]
+
+    # Keep consecutive samples from overlapping
+    sample_gap = max(sample_gap, sample_len)
+
+    # Split the text into samples for the model. Integer division can leave a
+    # short tail that would produce one sample too many, so cap the number of
+    # start offsets at max_samples.
+    starts = range(0, len(text), sample_gap)[:max_samples]
+    samples = [text[start:start + sample_len] for start in starts]
+
+    results = batch_ocr_error_detection(samples, ocr_error_model, ocr_error_processor)
+    labels = results.labels
+    label = "This PDF has good text."
+    # More than 20% "bad" samples marks the whole document as suspicious
+    if labels.count("bad") / len(labels) > .2:
+        label = "This PDF may have garbled or bad OCR text."
+    return label, labels
+
 
 def text_detection(img) -> (Image.Image, TextDetectionResult):
     pred = batch_text_detection([img], det_model, det_processor)[0]
@@ -148,6 +179,7 @@ def ocr(img, highres_img, langs: List[str]) -> (Image.Image, OCRResult):
 rec_model, rec_processor = load_rec_cached()
 layout_model, layout_processor = load_layout_cached()
 table_model, table_processor = load_table_cached()
+ocr_error_model, ocr_error_processor = load_ocr_error_cached()
 
 with gr.Blocks(title="Surya") as demo:
     gr.Markdown("""
@@ -179,6 +211,8 @@ def ocr(img, highres_img, langs: List[str]) -> (Image.Image, OCRResult):
             use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.")
             skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.")
             table_rec_btn = gr.Button("Run Table Rec")
+
+            ocr_errors_btn = gr.Button("Run bad PDF text detection")
         with gr.Column():
             result_img = gr.Image(label="Result image")
             result_json = gr.JSON(label="Result json")
@@ -250,5 +284,18 @@ def table_rec_img(pil_image, in_file, page_number, use_pdf_boxes, skip_table_det
             inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
             outputs=[result_img, result_json]
         )
+        # Run bad PDF text detection
+        def ocr_errors_pdf(file, page_count, sample_len=512, max_samples=10, max_pages=15):
+            """Run bad-text detection on the uploaded PDF and update the JSON panel.
+
+            Args:
+                file: Path of the uploaded file; must end with ``.pdf``.
+                page_count: Value of the second wired input. It is not used:
+                    the real page count is recomputed with ``count_pdf(file)``.
+                sample_len: Forwarded to ``run_ocr_errors``.
+                max_samples: Forwarded to ``run_ocr_errors``.
+                max_pages: Forwarded to ``run_ocr_errors``.
+
+            Returns:
+                A ``gr.update`` whose label carries the verdict and whose value
+                is the per-sample label list.
+
+            Raises:
+                gr.Error: If no file is given or the file is not a PDF.
+            """
+            # Reject a missing upload (presumably None) as well as non-PDF
+            # files, instead of failing with an AttributeError on `.endswith`.
+            if not file or not file.endswith('.pdf'):
+                raise gr.Error("This feature only works with PDFs.", duration=5)
+            count = count_pdf(file)
+            label, results = run_ocr_errors(
+                file,
+                count,
+                sample_len=sample_len,
+                max_samples=max_samples,
+                max_pages=max_pages,
+            )
+            return gr.update(label="Result json:" + label, value=results)
+        ocr_errors_btn.click(
+            fn=ocr_errors_pdf,
+            # Inputs are bound positionally: passing the two table checkboxes
+            # here would feed booleans into sample_len and max_samples.
+            inputs=[in_file, in_num],
+            outputs=[result_json]
+        )
 
 demo.launch()
diff --git a/gradio/requirements.txt b/gradio/requirements.txt
@@ -1,4 +1,4 @@
 torch==2.5.1
-surya-ocr==0.8.0
+surya-ocr==0.8.1
 gradio==5.8.0
 huggingface-hub==0.26.3
diff --git a/surya b/surya
+0 −6		.github/workflows/benchmarks.yml
+26 −0		.github/workflows/ci.yml
+1 −0		README.md
+0 −2		detect_layout.py
+55 −4		ocr_app.py
+406 −356		poetry.lock
+2 −1		pyproject.toml
+7 −0		pytest.ini
+8 −0		signatures/version1/cla.json
+3 −2		surya/benchmark/tesseract.py
+0 −2		surya/detection.py
+30 −20		surya/layout.py
+66 −0		surya/model/ocr_error/config.py
+797 −0		surya/model/ocr_error/encoder.py
+27 −0		surya/model/ocr_error/model.py
+642 −0		surya/model/ocr_error/tokenizer.py
+48 −0		surya/ocr_error.py
+9 −3		surya/schema.py
+10 −1		surya/settings.py
+22 −0		tests/conftest.py
+26 −0		tests/test_layout.py
+18 −0		tests/test_ocr_errors.py