Skip to content

Commit

Permalink
Bump surya version to 0.8.1
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoyao9184 committed Dec 22, 2024
1 parent 3178995 commit 1c161f3
Show file tree
Hide file tree
Showing 16 changed files with 109 additions and 24 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/docker-image-tag-version.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ on:
surya_version:
description: surya version of pypi
required: true
default: 0.8.0
default: 0.8.1
streamlit_version:
description: streamlit version of pypi
required: true
default: 1.41.0
default: 1.41.1

jobs:
build-and-push-docker-image:
Expand All @@ -29,9 +29,9 @@ jobs:
strategy:
matrix:
SURYA_VERSION:
- ${{ github.event.inputs.surya_version || '0.8.0' }}
- ${{ github.event.inputs.surya_version || '0.8.1' }}
STREAMLIT_VERSION:
- ${{ github.event.inputs.streamlit_version || '1.41.0' }}
- ${{ github.event.inputs.streamlit_version || '1.41.1' }}
platform:
- linux/amd64

Expand Down
6 changes: 4 additions & 2 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"DETECTOR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95",
"RECOGNITION_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41",
"LAYOUT_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957",
"TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14"
"TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14",
"OCR_ERROR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414"
},
"justMyCode": false
},
Expand Down Expand Up @@ -75,7 +76,8 @@
"DETECTOR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95",
"RECOGNITION_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41",
"LAYOUT_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957",
"TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14"
"TABLE_REC_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14",
"OCR_ERROR_MODEL_CHECKPOINT": "${workspaceFolder}/cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414"
},
"justMyCode": false
},
Expand Down
5 changes: 3 additions & 2 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
"command": "docker build -t xiaoyao9184/surya:master -f ./docker/build@source/dockerfile .",
},
{
"label": "docker: build 0.8.0",
"label": "docker: build 0.8.1",
"type": "shell",
"command": "docker build -t xiaoyao9184/surya:0.8.0 -f ./docker/build@pypi/dockerfile .",
"command": "docker build -t xiaoyao9184/surya:0.8.1 -f ./docker/build@pypi/dockerfile .",
},
{
"label": "huggingface-cli: download models",
Expand All @@ -43,6 +43,7 @@
"&& huggingface-cli download vikp/surya_rec2 --revision main --cache-dir ./cache/huggingface/hub",
"&& huggingface-cli download vikp/surya_tablerec --revision main --cache-dir ./cache/huggingface/hub",
"&& huggingface-cli download datalab-to/surya_layout0 --revision main --cache-dir ./cache/huggingface/hub",
"&& huggingface-cli download datalab-to/ocr_error_detection --revision main --cache-dir ./cache/huggingface/hub",
]
},
{
Expand Down
35 changes: 33 additions & 2 deletions cache/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ and `./cache/huggingface/hub/models--vikp--surya_rec2` like this
│ ├── 31bdd446acbf8a47ea46d7d0a4998f145f0cc75a
│ ├── 5497e8690cfe93cbedec7efaf91f6ac734496ac8
│ ├── 93c190b5690dd55aac16723222a9909e2be0faec
│ ├── 9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
│ ├── 9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
│ ├── a6344aac8c09253b3b630fb776ae94478aa0275b
│ ├── a83ef0a8114bd50cc650e08a9738c0f6345f5186
│ ├── dd34282c30833587a799d334d44a637694d41c8e
Expand All @@ -54,7 +54,7 @@ and `./cache/huggingface/hub/models--vikp--surya_rec2` like this
├── config.json -> ../../blobs/5497e8690cfe93cbedec7efaf91f6ac734496ac8
├── generation_config.json -> ../../blobs/e237701f4293e736f74d2c968582935590107034
├── .gitattributes -> ../../blobs/a6344aac8c09253b3b630fb776ae94478aa0275b
├── model.safetensors -> ../../blobs/9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
├── model.safetensors -> ../../blobs/9a75b64cbeaed06820559bcda4e12c1235de62b5bce787d57cf56a9c3a7123d1
├── preprocessor_config.json -> ../../blobs/dd34282c30833587a799d334d44a637694d41c8e
├── README.md -> ../../blobs/a83ef0a8114bd50cc650e08a9738c0f6345f5186
├── special_tokens_map.json -> ../../blobs/2f525ec0be1f2e8cb257a7b3e01de3bd003f0e81
Expand Down Expand Up @@ -117,11 +117,41 @@ and `./cache/huggingface/hubmodels--datalab-to--surya_layout0` like this
4 directories, 10 files
```

and `./cache/huggingface/hub/models--datalab-to--ocr_error_detection` like this


```
.
├── blobs
│ ├── 21f54a4b56685f29358f3a8de1f5b8d827357d07
│ ├── 9856c52ab99c8f7435bef6bf6e4c8a86a2594187
│ ├── 9bbecc17cabbcbd3112c14d6982b51403b264bfa
│ ├── a6344aac8c09253b3b630fb776ae94478aa0275b
│ ├── c305af17d2fcaf52c00b125a2dfabfbe16e71454
│ ├── cd3c57f2e967aad6a020decd1c1c41be-10
│ ├── e837bab60a5d204e29622d127c2dafe508aa0731
│ └── f4a46fa248690b0b2adc680e845ec8fd491eb24c
├── refs
│ └── main
└── snapshots
└── c1cbda3757670fd520553eaa5197656d331de414
├── config.json -> ../../blobs/9856c52ab99c8f7435bef6bf6e4c8a86a2594187
├── model.safetensors -> ../../blobs/cd3c57f2e967aad6a020decd1c1c41be-10
├── README.md -> ../../blobs/c305af17d2fcaf52c00b125a2dfabfbe16e71454
├── special_tokens_map.json -> ../../blobs/9bbecc17cabbcbd3112c14d6982b51403b264bfa
├── tokenizer_config.json -> ../../blobs/f4a46fa248690b0b2adc680e845ec8fd491eb24c
├── tokenizer.json -> ../../blobs/21f54a4b56685f29358f3a8de1f5b8d827357d07
└── vocab.txt -> ../../blobs/e837bab60a5d204e29622d127c2dafe508aa0731
5 directories, 16 files
```

It will use
- `./cache/huggingface/hub/models--vikp--surya_det3/snapshots/467ee9ec33e6e6c5f73e57dbc1415b14032f5b95`
- `./cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41`
- `./cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14`
- `./cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957`
- `./cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414`

For more details, refer to [up@cpu-offline/docker-compose.yml](./../docker/up@cpu-offline/docker-compose.yml).

Expand All @@ -143,4 +173,5 @@ huggingface-cli download vikp/surya_det3 --repo-type model --revision main --cac
huggingface-cli download vikp/surya_rec2 --repo-type model --revision main --cache-dir ./cache/huggingface/hub
huggingface-cli download vikp/surya_tablerec --repo-type model --revision main --cache-dir ./cache/huggingface/hub
huggingface-cli download datalab-to/surya_layout0 --revision main --cache-dir ./cache/huggingface/hub
huggingface-cli download datalab-to/ocr_error_detection --revision main --cache-dir ./cache/huggingface/hub
```
4 changes: 2 additions & 2 deletions docker/build-without-fonts@pypi/dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG SURYA_VERSION=0.8.0
ARG STREAMLIT_VERSION=1.41.0
ARG SURYA_VERSION=0.8.1
ARG STREAMLIT_VERSION=1.41.1
ARG GRADIO_VERSION=5.8.0

FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
Expand Down
2 changes: 1 addition & 1 deletion docker/build-without-fonts@source/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ RUN pip3 install --upgrade pip
COPY ./surya /app
RUN pip3 install --no-cache-dir \
-e . \
streamlit==1.41.0 \
streamlit==1.41.1 \
gradio==5.8.0
# The image is too large, exceeding 30GB.
# RUN pip3 install poetry
Expand Down
4 changes: 2 additions & 2 deletions docker/build@pypi/dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG SURYA_VERSION=0.8.0
ARG STREAMLIT_VERSION=1.41.0
ARG SURYA_VERSION=0.8.1
ARG STREAMLIT_VERSION=1.41.1
ARG GRADIO_VERSION=5.8.0


Expand Down
2 changes: 1 addition & 1 deletion docker/build@source/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ RUN pip3 install --upgrade pip
COPY ./surya /app
RUN pip3 install --no-cache-dir \
-e . \
streamlit==1.41.0 \
streamlit==1.41.1 \
gradio==5.8.0
# The image is too large, with layers exceeding 10GB.
# RUN pip3 install poetry
Expand Down
3 changes: 2 additions & 1 deletion docker/up.gradio@cpu-offline/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

services:
surya_gradio:
image: xiaoyao9184/surya:0.8.0
image: xiaoyao9184/surya:0.8.1
container_name: surya_ocr_gradio
working_dir: /workspace/gradio
command: gradio gradio_app.py
Expand All @@ -13,6 +13,7 @@ services:
- LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
- RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
- TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
- OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
ports:
- "7860:7860"
volumes:
Expand Down
3 changes: 2 additions & 1 deletion docker/up.gradio@gpu-offline/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

services:
surya_gradio:
image: xiaoyao9184/surya:0.8.0
image: xiaoyao9184/surya:0.8.1
container_name: surya_ocr_gradio
working_dir: /workspace/gradio
command: gradio gradio_app.py
Expand All @@ -13,6 +13,7 @@ services:
- LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
- RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
- TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
- OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
ports:
- "7860:7860"
volumes:
Expand Down
3 changes: 2 additions & 1 deletion docker/up@cpu-offline/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

services:
surya_app:
image: xiaoyao9184/surya:0.8.0
image: xiaoyao9184/surya:0.8.1
container_name: surya_ocr_app
environment:
- TORCH_DEVICE=cpu
Expand All @@ -11,6 +11,7 @@ services:
- LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
- RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
- TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
- OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
# - FONT_DIR=/fonts # not work
ports:
- "8501:8501"
Expand Down
3 changes: 2 additions & 1 deletion docker/up@gpu-offline/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

services:
surya_app:
image: xiaoyao9184/surya:0.8.0
image: xiaoyao9184/surya:0.8.1
container_name: surya_ocr_app
environment:
- TORCH_DEVICE=cuda
Expand All @@ -11,6 +11,7 @@ services:
- LAYOUT_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--surya_layout0/snapshots/421ac206a400227ea714d47a405e53ce74374957
- RECOGNITION_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_rec2/snapshots/6611509b2c3a32c141703ce19adc899d9d0abf41
- TABLE_REC_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--vikp--surya_tablerec/snapshots/8bca165f81e9cee5fb382413eb23175079917d14
- OCR_ERROR_MODEL_CHECKPOINT=/root/.cache/huggingface/hub/models--datalab-to--ocr_error_detection/snapshots/c1cbda3757670fd520553eaa5197656d331de414
# - FONT_DIR=/fonts # not work
ports:
- "8501:8501"
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
- pytorch==2.5.1
- pytorch-cuda==12.4

- streamlit==1.41.0
- streamlit==1.41.1

- pip:
- -e ./surya
Expand Down
49 changes: 48 additions & 1 deletion gradio/gradio_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from surya.model.recognition.processor import load_processor as load_rec_processor
from surya.model.table_rec.model import load_model as load_table_model
from surya.model.table_rec.processor import load_processor as load_table_processor
from surya.model.ocr_error.model import load_model as load_ocr_error_model, load_tokenizer as load_ocr_error_processor
from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
from surya.ocr import run_ocr
from surya.postprocessing.text import draw_text_on_image
Expand All @@ -31,7 +32,9 @@
from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
from surya.settings import settings
from surya.tables import batch_table_recognition
from surya.postprocessing.util import rescale_bboxes, rescale_bbox
from surya.postprocessing.util import rescale_bbox
from pdftext.extraction import plain_text_output
from surya.ocr_error import batch_ocr_error_detection


def load_det_cached():
Expand All @@ -46,6 +49,34 @@ def load_layout_cached():
def load_table_cached():
    """Load the table-recognition model and its processor.

    Returns a ``(model, processor)`` tuple; the model is loaded first.
    """
    table_rec_model = load_table_model()
    table_rec_processor = load_table_processor()
    return table_rec_model, table_rec_processor

def load_ocr_error_cached():
    """Load the OCR-error detection model and its tokenizer.

    Returns a ``(model, processor)`` tuple; the model is loaded first.
    """
    error_model = load_ocr_error_model()
    error_processor = load_ocr_error_processor()
    return error_model, error_processor


def run_ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15):
    """Judge whether the text embedded in a PDF looks garbled.

    Text is extracted from a window of pages centred on the middle of the
    document, cut into fixed-length samples, and passed to the OCR-error
    detection model.

    Args:
        pdf_file: PDF handed to ``plain_text_output``.
        page_count: Total number of pages in the PDF.
        sample_len: Length, in characters, of each text sample.
        max_samples: Divisor used to derive the spacing between samples.
        max_pages: Number of pages taken on each side of the middle page.

    Returns:
        A ``(summary, labels)`` tuple: a human-readable verdict and the
        per-sample labels (``["no text"]`` when there is too little text).
    """
    # Centre the page window on the middle page, clamped to the document bounds.
    mid_page = page_count // 2
    first_page = max(mid_page - max_pages, 0)
    last_page = min(mid_page + max_pages, page_count)
    text = plain_text_output(pdf_file, page_range=range(first_page, last_page))

    text_len = len(text)
    stride = text_len // max_samples
    if text_len == 0 or stride == 0:
        return "This PDF has no text or very little text", ["no text"]

    # Never step by less than one sample length, so samples do not overlap.
    stride = max(stride, sample_len)

    # Cut the text into samples for the model.
    samples = [text[start:start + sample_len] for start in range(0, text_len, stride)]

    results = batch_ocr_error_detection(samples, ocr_error_model, ocr_error_processor)
    # More than 20% of samples labelled "bad" flags the whole document.
    if results.labels.count("bad") / len(results.labels) > .2:
        return "This PDF may have garbled or bad OCR text.", results.labels
    return "This PDF has good text.", results.labels


def text_detection(img) -> (Image.Image, TextDetectionResult):
pred = batch_text_detection([img], det_model, det_processor)[0]
Expand Down Expand Up @@ -148,6 +179,7 @@ def ocr(img, highres_img, langs: List[str]) -> (Image.Image, OCRResult):
rec_model, rec_processor = load_rec_cached()
layout_model, layout_processor = load_layout_cached()
table_model, table_processor = load_table_cached()
ocr_error_model, ocr_error_processor = load_ocr_error_cached()

with gr.Blocks(title="Surya") as demo:
gr.Markdown("""
Expand Down Expand Up @@ -179,6 +211,8 @@ def ocr(img, highres_img, langs: List[str]) -> (Image.Image, OCRResult):
use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.")
skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.")
table_rec_btn = gr.Button("Run Table Rec")

ocr_errors_btn = gr.Button("Run bad PDF text detection")
with gr.Column():
result_img = gr.Image(label="Result image")
result_json = gr.JSON(label="Result json")
Expand Down Expand Up @@ -250,5 +284,18 @@ def table_rec_img(pil_image, in_file, page_number, use_pdf_boxes, skip_table_det
inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
outputs=[result_img, result_json]
)
# Run bad PDF text detection
def ocr_errors_pdf(file, page_count, sample_len=512, max_samples=10, max_pages=15):
    """Gradio callback: run bad-text detection on the uploaded PDF.

    Args:
        file: Path of the uploaded file, or ``None`` when nothing is uploaded.
        page_count: Value of the page-number input. Unused here; the real
            page count is recomputed from the file with ``count_pdf``.
        sample_len: Forwarded to ``run_ocr_errors``.
        max_samples: Forwarded to ``run_ocr_errors``.
        max_pages: Forwarded to ``run_ocr_errors``.

    Returns:
        A ``gr.update`` for the JSON result component, carrying the verdict
        in its label and the per-sample labels as its value.

    Raises:
        gr.Error: If no file is uploaded or the file is not a PDF.
    """
    # Guard against a missing upload (file is None) before touching str
    # methods, and accept upper-case extensions such as ".PDF".
    if not file or not file.lower().endswith('.pdf'):
        raise gr.Error("This feature only works with PDFs.", duration=5)
    count = count_pdf(file)
    label, results = run_ocr_errors(
        file,
        count,
        sample_len=sample_len,
        max_samples=max_samples,
        max_pages=max_pages,
    )
    return gr.update(label="Result json:" + label, value=results)
# Only the file and page number are wired in. The table-recognition
# checkboxes must not be passed: Gradio binds inputs positionally, so their
# booleans would land in sample_len / max_samples.
ocr_errors_btn.click(
    fn=ocr_errors_pdf,
    inputs=[in_file, in_num],
    outputs=[result_json]
)

demo.launch()
2 changes: 1 addition & 1 deletion gradio/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torch==2.5.1
surya-ocr==0.8.0
surya-ocr==0.8.1
gradio==5.8.0
huggingface-hub==0.26.3

0 comments on commit 1c161f3

Please sign in to comment.