From b30668eb10f842f02c03a4528573290bbabd021f Mon Sep 17 00:00:00 2001 From: xiaoyao9184 <6614349+xiaoyao9184@users.noreply.github.com> Date: Sun, 22 Dec 2024 17:54:51 +0800 Subject: [PATCH] Bump surya version to 0.8.1 --- .../workflows/docker-image-tag-version.yml | 8 +-- .vscode/launch.json | 6 ++- .vscode/tasks.json | 5 +- cache/README.md | 5 +- docker/build-without-fonts@pypi/dockerfile | 4 +- docker/build-without-fonts@source/dockerfile | 2 +- docker/build@pypi/dockerfile | 4 +- docker/build@source/dockerfile | 2 +- .../up.gradio@cpu-offline/docker-compose.yml | 3 +- .../up.gradio@gpu-offline/docker-compose.yml | 3 +- docker/up@cpu-offline/docker-compose.yml | 3 +- docker/up@gpu-offline/docker-compose.yml | 3 +- environment.yml | 2 +- gradio/gradio_app.py | 49 ++++++++++++++++++- gradio/requirements.txt | 2 +- surya | 2 +- 16 files changed, 79 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docker-image-tag-version.yml b/.github/workflows/docker-image-tag-version.yml index e73b792..8af25bf 100644 --- a/.github/workflows/docker-image-tag-version.yml +++ b/.github/workflows/docker-image-tag-version.yml @@ -13,11 +13,11 @@ on: surya_version: description: surya version of pypi required: true - default: 0.8.0 + default: 0.8.1 streamlit_version: description: streamlit version of pypi required: true - default: 1.41.0 + default: 1.41.1 jobs: build-and-push-docker-image: @@ -29,9 +29,9 @@ jobs: strategy: matrix: SURYA_VERSION: - - ${{ github.event.inputs.surya_version || '0.8.0' }} + - ${{ github.event.inputs.surya_version || '0.8.1' }} STREAMLIT_VERSION: - - ${{ github.event.inputs.streamlit_version || '1.41.0' }} + - ${{ github.event.inputs.streamlit_version || '1.41.1' }} platform: - linux/amd64 diff --git a/.vscode/launch.json b/.vscode/launch.json index 055370c..9872b44 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -37,7 +37,8 @@ "DETECTOR_MODEL_CHECKPOINT": 
"${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95", "RECOGNITION_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41", "LAYOUT_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957", - "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14" + "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14", + "OCR_ERROR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414" }, "justMyCode": false }, @@ -75,7 +76,8 @@ "DETECTOR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95", "RECOGNITION_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41", "LAYOUT_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957", - "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14" + "TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14", + "OCR_ERROR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414" }, "justMyCode": false }, diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 73ef0c6..1b14f31 100644 --- 
a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -23,9 +23,9 @@ "command": "docker build -t xiaoyao9184/surya:master -f ./docker/build@source/dockerfile .", }, { - "label": "docker: build 0.8.0", + "label": "docker: build 0.8.1", "type": "shell", - "command": "docker build -t xiaoyao9184/surya:0.8.0 -f ./docker/build@pypi/dockerfile .", + "command": "docker build -t xiaoyao9184/surya:0.8.1 -f ./docker/build@pypi/dockerfile .", }, { "label": "huggingface-cli: download models", @@ -43,6 +43,7 @@ "&& huggingface-cli download vikp/surya_rec2 --revision main --cache-dir ./cache/huggingface/hub", "&& huggingface-cli download vikp/surya_tablerec --revision main --cache-dir ./cache/huggingface/hub", "&& huggingface-cli download datalab-to/surya_layout0 --revision main --cache-dir ./cache/huggingface/hub", + "&& huggingface-cli download datalab-to/ocr_error_detection --revision main --cache-dir ./cache/huggingface/hub", ] }, { diff --git a/cache/README.md b/cache/README.md index 9742098..f4c33e2 100644 --- a/cache/README.md +++ b/cache/README.md @@ -41,7 +41,7 @@ and `./cache/huggingface/hub/models--vikp--surya_rec2` like this │ ├── 31bdd446acbf8a47ea46d7d0a4998f145f0cc75a │ ├── 5497e8690cfe93cbedec7efaf91f6ac734496ac8 │ ├── 93c190b5690dd55aac16723222a9909e2be0faec -│ ├── 9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1 +│ ├── 9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1 │ ├── a6344aac8c09253b3b630fb776ae94478aa0275b │ ├── a83ef0a8114bd50cc650e08a9738c0f6345f5186 │ ├── dd34282c30833587a799d334d44a637694d41c8e @@ -54,7 +54,7 @@ and `./cache/huggingface/hub/models--vikp--surya_rec2` like this ├── config.json -> ../../blobs/5497e8690cfe93cbedec7efaf91f6ac734496ac8 ├── generation_config.json -> ../../blobs/e237701f4293e736f74d2c968582935590107034 ├── .gitattributes -> ../../blobs/a6344aac8c09253b3b630fb776ae94478aa0275b - ├── model.safetensors -> ../../blobs/9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1 + ├── 
model.safetensors -> ../../blobs/9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1 ├── preprocessor_config.json -> ../../blobs/dd34282c30833587a799d334d44a637694d41c8e ├── README.md -> ../../blobs/a83ef0a8114bd50cc650e08a9738c0f6345f5186 ├── special_tokens_map.json -> ../../blobs/2f525ec0be1f2e8cb257a7b3e01de3bd003f0e81 @@ -122,6 +122,7 @@ It will use - `./cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41` - `./cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14` - `./cache/huggingface/hub/hubmodels--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957` +- `./cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414` For more details, refer to [up@cpu-offline/docker-compose.yml](./../docker/up@cpu-offline/docker-compose.yml). diff --git a/docker/build-without-fonts@pypi/dockerfile b/docker/build-without-fonts@pypi/dockerfile index 5c1bd2a..0aebf71 100644 --- a/docker/build-without-fonts@pypi/dockerfile +++ b/docker/build-without-fonts@pypi/dockerfile @@ -1,5 +1,5 @@ -ARG SURYA_VERSION=0.8.0 -ARG STREAMLIT_VERSION=1.41.0 +ARG SURYA_VERSION=0.8.1 +ARG STREAMLIT_VERSION=1.41.1 ARG GRADIO_VERSION=5.8.0 FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-runtime diff --git a/docker/build-without-fonts@source/dockerfile b/docker/build-without-fonts@source/dockerfile index 2bd1715..1c9cfb0 100644 --- a/docker/build-without-fonts@source/dockerfile +++ b/docker/build-without-fonts@source/dockerfile @@ -16,7 +16,7 @@ RUN pip3 install --upgrade pip COPY ./surya /app RUN pip3 install --no-cache-dir \ -e . \ - streamlit==1.41.0 \ + streamlit==1.41.1 \ gradio==5.8.0 # The image is too large, exceeding 30GB. 
# RUN pip3 install poetry diff --git a/docker/build@pypi/dockerfile b/docker/build@pypi/dockerfile index 6410f4e..535b175 100644 --- a/docker/build@pypi/dockerfile +++ b/docker/build@pypi/dockerfile @@ -1,5 +1,5 @@ -ARG SURYA_VERSION=0.8.0 -ARG STREAMLIT_VERSION=1.41.0 +ARG SURYA_VERSION=0.8.1 +ARG STREAMLIT_VERSION=1.41.1 ARG GRADIO_VERSION=5.8.0 diff --git a/docker/build@source/dockerfile b/docker/build@source/dockerfile index cf12d50..b17e68b 100644 --- a/docker/build@source/dockerfile +++ b/docker/build@source/dockerfile @@ -25,7 +25,7 @@ RUN pip3 install --upgrade pip COPY ./surya /app RUN pip3 install --no-cache-dir \ -e . \ - streamlit==1.41.0 \ + streamlit==1.41.1 \ gradio==5.8.0 # The image is too large, with layers exceeding 10GB. # RUN pip3 install poetry diff --git a/docker/up.gradio@cpu-offline/docker-compose.yml b/docker/up.gradio@cpu-offline/docker-compose.yml index 6e9ee03..568b6cf 100644 --- a/docker/up.gradio@cpu-offline/docker-compose.yml +++ b/docker/up.gradio@cpu-offline/docker-compose.yml @@ -2,7 +2,7 @@ services: surya_gradio: - image: xiaoyao9184/surya:0.8.0 + image: xiaoyao9184/surya:0.8.1 container_name: surya_ocr_gradio working_dir: /workspace/gradio command: gradio gradio_app.py @@ -13,6 +13,7 @@ services: - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957 - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41 - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14 + - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414 ports: - "7860:7860" volumes: diff --git a/docker/up.gradio@gpu-offline/docker-compose.yml b/docker/up.gradio@gpu-offline/docker-compose.yml index ac0ab91..41e2ccf 100644 --- 
a/docker/up.gradio@gpu-offline/docker-compose.yml +++ b/docker/up.gradio@gpu-offline/docker-compose.yml @@ -2,7 +2,7 @@ services: surya_gradio: - image: xiaoyao9184/surya:0.8.0 + image: xiaoyao9184/surya:0.8.1 container_name: surya_ocr_gradio working_dir: /workspace/gradio command: gradio gradio_app.py @@ -13,6 +13,7 @@ services: - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957 - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41 - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14 + - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414 ports: - "7860:7860" volumes: diff --git a/docker/up@cpu-offline/docker-compose.yml b/docker/up@cpu-offline/docker-compose.yml index 282c4a3..f7f6e65 100644 --- a/docker/up@cpu-offline/docker-compose.yml +++ b/docker/up@cpu-offline/docker-compose.yml @@ -2,7 +2,7 @@ services: surya_app: - image: xiaoyao9184/surya:0.8.0 + image: xiaoyao9184/surya:0.8.1 container_name: surya_ocr_app environment: - TORCH_DEVICE=cpu @@ -11,6 +11,7 @@ services: - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957 - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41 - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14 + - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414 # - FONT_DIR=/fonts # not work ports: - "8501:8501" diff --git 
a/docker/up@gpu-offline/docker-compose.yml b/docker/up@gpu-offline/docker-compose.yml index ba958f3..4676a16 100644 --- a/docker/up@gpu-offline/docker-compose.yml +++ b/docker/up@gpu-offline/docker-compose.yml @@ -2,7 +2,7 @@ services: surya_app: - image: xiaoyao9184/surya:0.8.0 + image: xiaoyao9184/surya:0.8.1 container_name: surya_ocr_app environment: - TORCH_DEVICE=cuda @@ -11,6 +11,7 @@ services: - LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957 - RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41 - TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14 + - OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414 # - FONT_DIR=/fonts # not work ports: - "8501:8501" diff --git a/environment.yml b/environment.yml index 5adf8ab..8c7f65c 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,7 @@ dependencies: - pytorch==2.4.1 - pytorch-cuda==12.4 - - streamlit==1.41.0 + - streamlit==1.41.1 - pip: - -e ./surya diff --git a/gradio/gradio_app.py b/gradio/gradio_app.py index 48ba143..31ef78e 100644 --- a/gradio/gradio_app.py +++ b/gradio/gradio_app.py @@ -22,6 +22,7 @@ from surya.model.recognition.processor import load_processor as load_rec_processor from surya.model.table_rec.model import load_model as load_table_model from surya.model.table_rec.processor import load_processor as load_table_processor +from surya.model.ocr_error.model import load_model as load_ocr_error_model, load_tokenizer as load_ocr_error_processor from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image from surya.ocr import run_ocr from surya.postprocessing.text import draw_text_on_image @@ -31,7 +32,9 @@ from surya.schema 
import OCRResult, TextDetectionResult, LayoutResult, TableResult from surya.settings import settings from surya.tables import batch_table_recognition -from surya.postprocessing.util import rescale_bboxes, rescale_bbox +from surya.postprocessing.util import rescale_bbox +from pdftext.extraction import plain_text_output +from surya.ocr_error import batch_ocr_error_detection def load_det_cached(): @@ -46,6 +49,34 @@ def load_layout_cached(): def load_table_cached(): return load_table_model(), load_table_processor() +def load_ocr_error_cached(): + return load_ocr_error_model(), load_ocr_error_processor() + + +def run_ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15): + # Sample the text from the middle of the PDF + page_middle = page_count // 2 + page_range = range(max(page_middle - max_pages, 0), min(page_middle + max_pages, page_count)) + text = plain_text_output(pdf_file, page_range=page_range) + + sample_gap = len(text) // max_samples + if len(text) == 0 or sample_gap == 0: + return "This PDF has no text or very little text", ["no text"] + + if sample_gap < sample_len: + sample_gap = sample_len + + # Split the text into samples for the model + samples = [] + for i in range(0, len(text), sample_gap): + samples.append(text[i:i + sample_len]) + + results = batch_ocr_error_detection(samples, ocr_error_model, ocr_error_processor) + label = "This PDF has good text." + if results.labels.count("bad") / len(results.labels) > .2: + label = "This PDF may have garbled or bad OCR text." 
+ return label, results.labels + def text_detection(img) -> (Image.Image, TextDetectionResult): pred = batch_text_detection([img], det_model, det_processor)[0] @@ -148,6 +179,7 @@ def ocr(img, highres_img, langs: List[str]) -> (Image.Image, OCRResult): rec_model, rec_processor = load_rec_cached() layout_model, layout_processor = load_layout_cached() table_model, table_processor = load_table_cached() +ocr_error_model, ocr_error_processor = load_ocr_error_cached() with gr.Blocks(title="Surya") as demo: gr.Markdown(""" @@ -179,6 +211,8 @@ def ocr(img, highres_img, langs: List[str]) -> (Image.Image, OCRResult): use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.") skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.") table_rec_btn = gr.Button("Run Table Rec") + + ocr_errors_btn = gr.Button("Run bad PDF text detection") with gr.Column(): result_img = gr.Image(label="Result image") result_json = gr.JSON(label="Result json") @@ -250,5 +284,18 @@ def table_rec_img(pil_image, in_file, page_number, use_pdf_boxes, skip_table_det inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb], outputs=[result_img, result_json] ) + # Run bad PDF text detection + def ocr_errors_pdf(file, page_count, sample_len=512, max_samples=10, max_pages=15): + if file.endswith('.pdf'): + count = count_pdf(file) + else: + raise gr.Error("This feature only works with PDFs.", duration=5) + label, results = run_ocr_errors(file, count) + return gr.update(label="Result json:" + label, value=results) + ocr_errors_btn.click( + fn=ocr_errors_pdf, + inputs=[in_file, in_num], + outputs=[result_json] + ) demo.launch() diff --git a/gradio/requirements.txt b/gradio/requirements.txt index d628929..90fd8dd 100644 
--- a/gradio/requirements.txt +++ b/gradio/requirements.txt @@ -1,4 +1,4 @@ torch==2.5.1 -surya-ocr==0.8.0 +surya-ocr==0.8.1 gradio==5.8.0 huggingface-hub==0.26.3 \ No newline at end of file diff --git a/surya b/surya index b46d5ce..0a82cc7 160000 --- a/surya +++ b/surya @@ -1 +1 @@ -Subproject commit b46d5ce2f692cfc59233ae1ec10401b8e98368fd +Subproject commit 0a82cc76b96703aaa5500f6320e8897a0d8e7812