Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Character confidence threshold #3860

Merged
merged 19 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
## 0.16.13-dev0
## 0.16.13-dev1

### Enhancements
- **Add character-level filtering for tesseract output**. It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.

### Features

### Fixes

- **Fix NLTK Download** to use nltk assets in docker image
- removed the ability to automatically download nltk package if missing

## 0.16.12

### Enhancements
Expand Down
86 changes: 82 additions & 4 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pytest
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
Expand Down Expand Up @@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):

def test_get_ocr_layout_from_image_tesseract(monkeypatch):
monkeypatch.setattr(
unstructured_pytesseract,
"image_to_data",
OCRAgentTesseract,
"image_to_data_with_character_confidence_filter",
lambda *args, **kwargs: pd.DataFrame(
{
"left": [10, 20, 30, 0],
Expand Down Expand Up @@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
monkeypatch.setattr(
unstructured_pytesseract,
"image_to_data",
OCRAgentTesseract,
"image_to_data_with_character_confidence_filter",
lambda *args, **kwargs: pd.DataFrame(
{
"left": [10, 20, 30, 0],
Expand Down Expand Up @@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
# Check if the final layout contains both original elements and OCR-derived elements
assert all(element in final_layout for element in mock_out_layout)
assert any(element in final_layout for element in ocr_elements)


def _create_hocr_word_span(
    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
) -> Tag:
    """Build a synthetic hOCR ``ocrx_word`` span to use as a test fixture.

    Args:
        characters: ``(char, x_conf)`` pairs. Each pair becomes one nested
            ``ocrx_cinfo`` span whose title carries ``x_conf`` verbatim.
        word_bbox: the four integers written into the word span's ``bbox`` title field.

    Returns:
        The word ``Tag`` with one character span appended per entry of ``characters``,
        in the order given.
    """
    # The word-level confidence (x_wconf) is hard-coded; only the bbox varies.
    word_span = BeautifulSoup(
        f"<span class='ocrx_word' title='"
        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
        f"; x_wconf 64'></span>",
        "html.parser",
    ).span
    for char, x_conf in characters:
        # Every character gets the same zeroed x_bboxes; only the text and x_conf differ.
        char_span = BeautifulSoup(
            f"""
<span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
""", # noqa : E501
            "html.parser",
        ).span
        word_span.append(char_span)
    return word_span


def test_extract_word_from_hocr():
    """Raising the threshold progressively strips the lower-confidence characters."""
    word_span = _create_hocr_word_span(
        [
            ("w", "99.0"),
            ("o", "98.5"),
            ("r", "97.5"),
            ("d", "96.0"),
            ("!", "50.0"),
            ("@", "45.0"),
        ],
        (10, 9, 70, 22),
    )

    # Thresholds are fractions, while the fixture's x_conf values are percentages.
    expected_text_by_threshold = [
        (0.0, "word!@"),
        (0.960, "word"),
        (0.990, "w"),
        (0.999, ""),
    ]
    for threshold, expected_text in expected_text_by_threshold:
        text = OCRAgentTesseract.extract_word_from_hocr(word_span, threshold)
        assert text == expected_text


def test_hocr_to_dataframe():
    """One hOCR word yields one row with its bbox geometry and the filtered text."""
    word_span = _create_hocr_word_span(
        [
            ("w", "99.0"),
            ("o", "98.5"),
            ("r", "97.5"),
            ("d", "96.0"),
            ("!", "50.0"),
            ("@", "45.0"),
        ],
        (10, 9, 70, 22),
    )

    df = OCRAgentTesseract().hocr_to_dataframe(
        hocr=str(word_span), character_confidence_threshold=0.960
    )

    assert df.shape == (1, 5)
    # width/height are derived from the (left, top, right, bottom) word bbox.
    expected_first_row = {
        "left": 10,
        "top": 9,
        "width": 60,
        "height": 13,
        "text": "word",
    }
    for column, expected_value in expected_first_row.items():
        assert df[column].iloc[0] == expected_value


def test_hocr_to_dataframe_when_no_prediction_empty_df():
    """Empty hOCR input yields an empty frame that still exposes all five output columns."""
    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")

    assert df.shape == (0, 5)
    # Check each expected column exactly once, including "height".
    for column in ("left", "top", "width", "height", "text"):
        assert column in df.columns
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
[
(
PartitionStrategy.HI_RES,
"unstructured_pytesseract.image_to_data",
"unstructured_pytesseract.image_to_pdf_or_hocr",
),
(
PartitionStrategy.OCR_ONLY,
"unstructured_pytesseract.image_to_data",
"unstructured_pytesseract.image_to_pdf_or_hocr",
),
(
PartitionStrategy.OCR_ONLY,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.13-dev0" # pragma: no cover
__version__ = "0.16.13-dev1" # pragma: no cover
5 changes: 5 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)

@property
def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
    """Tesseract character predictions with confidence below this threshold are ignored.

    The value is looked up under the ``TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD`` key
    and falls back to ``0.0`` when it is not set.
    """
    return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder, maybe we'd like to have some really low default threshold, e.g. 0.1, just to filter out complete garbage chars?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am ok with 0; the default behavior is no filter at all so this PR should just keep that for now. We can use followups to change this value.


@property
def GOOGLEVISION_API_ENDPOINT(self) -> str:
"""API endpoint to use for Google Vision"""
Expand Down
90 changes: 84 additions & 6 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

import os
import re
from typing import TYPE_CHECKING, List

import cv2
import numpy as np
import pandas as pd
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from PIL import Image as PILImage
from unstructured_pytesseract import Output

from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:

trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
np.array(image),
lang=self.language,
output_type=Output.DATAFRAME,
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()

Expand All @@ -76,17 +77,94 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom,
)
ocr_df = unstructured_pytesseract.image_to_data(
ocr_df = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()

ocr_regions = self.parse_data(ocr_df, zoom=zoom)

return ocr_regions

def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.0,
) -> pd.DataFrame:
    """OCR ``image`` through tesseract's hOCR output and return it as a word DataFrame.

    Args:
        image: image array handed to tesseract.
        lang: tesseract language code.
        config: extra tesseract options, appended after the character-box option.
        character_confidence_threshold: forwarded to ``hocr_to_dataframe``.

    Returns:
        The DataFrame produced by ``hocr_to_dataframe`` for the hOCR output.
    """
    # Character boxes are requested so the hOCR carries per-character spans;
    # caller-supplied options follow and are passed through unchanged.
    tesseract_config = "-c hocr_char_boxes=1 " + config
    hocr = unstructured_pytesseract.image_to_pdf_or_hocr(
        image, lang=lang, config=tesseract_config, extension="hocr"
    )
    return self.hocr_to_dataframe(hocr, character_confidence_threshold)

def hocr_to_dataframe(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the compute performance with this code? We essentially were relying on tesseract internal cpp code to parse results but here we do it in python.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have not analyzed this. We simply iterate over ~300 words, I am not sure there is any risk of significant slowdowns. What do you think?

    self, hocr: str, character_confidence_threshold: float = 0.0
) -> pd.DataFrame:
    """Convert hOCR markup into a DataFrame with one row per recognized word.

    Args:
        hocr: hOCR markup, parsed with BeautifulSoup's ``html.parser``.
        character_confidence_threshold: forwarded to ``extract_word_from_hocr`` when
            building each word's text.

    Returns:
        DataFrame with columns ``left``, ``top``, ``text``, ``width`` and ``height``.
        A word is omitted when its title has no ``bbox`` match or its extracted text
        is empty.
    """
    soup = BeautifulSoup(hocr, "html.parser")
    word_spans = soup.find_all("span", class_="ocrx_word")

    df_entries = []
    for word_span in word_spans:
        word_title = word_span.get("title", "")
        bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)

        # Note: word bbox is used instead of combining characters together due to tesseract
        # bug that causes the character bboxes to be outside the word bbox, and they have 0
        # height or width when text is horizontal
        text = self.extract_word_from_hocr(
            word=word_span, character_confidence_threshold=character_confidence_threshold
        )
        if text and bbox_match:
            word_bbox = list(map(int, bbox_match.groups()))
            left, top, right, bottom = word_bbox
            df_entries.append(
                {
                    "left": left,
                    "top": top,
                    "right": right,
                    "bottom": bottom,
                    "text": text,
                }
            )
    # Passing explicit columns keeps the frame well-formed when no word was kept.
    ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "right", "bottom", "text"])

    ocr_df["width"] = ocr_df["right"] - ocr_df["left"]
    ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"]

    # right/bottom were only needed to derive width/height.
    ocr_df = ocr_df.drop(columns=["right", "bottom"])
    return ocr_df

@staticmethod
def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
    """Extract a word from an hOCR word tag, filtering out characters with low confidence.

    Args:
        word: hOCR word tag whose ``ocrx_cinfo`` child spans each hold one character,
            with the confidence stored as ``x_conf <percent>`` in the span's ``title``.
        character_confidence_threshold: minimum confidence on a 0-1 scale; the
            ``x_conf`` percentage is divided by 100 before the comparison.

    Returns:
        The kept characters concatenated in document order. The result is ``""`` when
        the word has no character spans or no character reaches the threshold.
    """
    kept_characters = []
    for character_span in word.find_all("span", class_="ocrx_cinfo"):
        char = character_span.text

        # The fractional part (and an exponent) is optional: a confidence serialized
        # as a whole number, e.g. "x_conf 99", must not cause the character to be
        # dropped as if it had no confidence at all.
        conf_match = re.search(
            r"x_conf (\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)",
            character_span.get("title", ""),
        )

        # Skip spans that carry no text or no parsable confidence.
        if not (char and conf_match):
            continue

        character_probability = float(conf_match.group(1)) / 100
        if character_probability >= character_confidence_threshold:
            kept_characters.append(char)

    return "".join(kept_characters)

@requires_dependencies("unstructured_inference")
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
from unstructured.partition.pdf_image.inference_utils import (
Expand Down
Loading