-
Notifications
You must be signed in to change notification settings - Fork 818
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Character confidence threshold #3860
Changes from 15 commits
a8dd7b8
9e31ebc
c0f2768
052ae50
4b54d8a
6fcd3f4
c157a66
137678f
c25039f
3bff8ae
c1e9b8e
0e44926
2d9054d
a61aa85
1611a61
cee5440
b2dd3fe
c5b6570
013a351
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "0.16.12" # pragma: no cover | ||
__version__ = "0.16.13-dev0" # pragma: no cover |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
from __future__ import annotations | ||
|
||
import os | ||
import re | ||
from typing import TYPE_CHECKING, List | ||
|
||
import cv2 | ||
import numpy as np | ||
import pandas as pd | ||
import unstructured_pytesseract | ||
from bs4 import BeautifulSoup, Tag | ||
from PIL import Image as PILImage | ||
from unstructured_pytesseract import Output | ||
|
||
from unstructured.logger import trace_logger | ||
from unstructured.partition.utils.config import env_config | ||
|
@@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: | |
|
||
trace_logger.detail("Processing entire page OCR with tesseract...") | ||
zoom = 1 | ||
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( | ||
ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter( | ||
np.array(image), | ||
lang=self.language, | ||
output_type=Output.DATAFRAME, | ||
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, | ||
) | ||
ocr_df = ocr_df.dropna() | ||
|
||
|
@@ -76,17 +77,89 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: | |
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1), | ||
max_zoom, | ||
) | ||
ocr_df = unstructured_pytesseract.image_to_data( | ||
ocr_df = self.image_to_data_with_character_confidence_filter( | ||
np.array(zoom_image(image, zoom)), | ||
lang=self.language, | ||
output_type=Output.DATAFRAME, | ||
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, | ||
) | ||
ocr_df = ocr_df.dropna() | ||
|
||
ocr_regions = self.parse_data(ocr_df, zoom=zoom) | ||
|
||
return ocr_regions | ||
|
||
def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.5,
) -> pd.DataFrame:
    """Run tesseract on `image` and return its words as a DataFrame.

    The image is rendered to hOCR with the `hocr_char_boxes=1` option prepended to any
    caller-supplied `config`, and the resulting markup is handed to `hocr_to_dataframe`
    together with `character_confidence_threshold`.

    Args:
        image: Page image as a numpy array.
        lang: Language string forwarded to tesseract.
        config: Extra tesseract configuration appended after the char-box option.
        character_confidence_threshold: Forwarded unchanged to `hocr_to_dataframe`.

    Returns:
        The DataFrame produced by `hocr_to_dataframe`.
    """
    # NOTE(review): reviewers asked for this default to live in config rather than being
    # hard-coded here; the sibling hOCR helpers default to 0.0 (no filtering), not 0.5.
    char_box_config = f"-c hocr_char_boxes=1 {config}"
    hocr_markup: str = unstructured_pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        config=char_box_config,
        extension="hocr",
    )
    return self.hocr_to_dataframe(hocr_markup, character_confidence_threshold)
|
||
def hocr_to_dataframe(
    self, hocr: str, character_confidence_threshold: float = 0.0
) -> pd.DataFrame:
    """Convert hOCR markup into a DataFrame with one row per recognised word.

    Every `span` of class `ocrx_word` is inspected: its bounding box is read from the
    `title` attribute and its text is rebuilt by `extract_word_from_hocr`. Words with
    no surviving text or no parsable bbox are left out.

    Args:
        hocr: hOCR markup to parse.
        character_confidence_threshold: Forwarded to `extract_word_from_hocr`.

    Returns:
        DataFrame with columns `left`, `top`, `width`, `height`, `text`.
    """
    # NOTE(review): the cost of parsing hOCR in Python (instead of relying on tesseract's
    # own tabular output) was raised in review and has not been measured.
    column_names = ["left", "top", "width", "height", "text"]
    parsed = BeautifulSoup(hocr, "html.parser")

    rows = []
    for span in parsed.find_all("span", class_="ocrx_word"):
        bbox = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", span.get("title", ""))

        # Note: word bbox is used instead of combining characters together due to tesseract
        # bug that causes the character bboxes to be outside the word bbox, and they have 0
        # height or width when text is horizontal
        word = self.extract_word_from_hocr(
            word=span, character_confidence_threshold=character_confidence_threshold
        )
        if not (word and bbox):
            continue

        left, top, right, bottom = (int(value) for value in bbox.groups())
        # NOTE(review): a reviewer suggested building the frame from raw bboxes and
        # deriving width/height with vectorised column operations instead.
        rows.append((left, top, right - left, bottom - top, word))

    return pd.DataFrame(rows, columns=column_names)
|
||
@staticmethod
def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
    """Extracts a word from an hOCR word tag, filtering out characters with low confidence.

    Each `span` of class `ocrx_cinfo` inside `word` contributes its text when the
    `x_conf` value in its `title` attribute, divided by 100, is at least
    `character_confidence_threshold`. Spans with empty text or without an `x_conf`
    entry are skipped.

    Args:
        word: hOCR word tag whose character spans are inspected.
        character_confidence_threshold: Minimum confidence on a 0-1 scale; with 0.0
            every character that carries an `x_conf` value is kept.

    Returns:
        The concatenation of the kept characters, or "" when there are none.
    """
    # The fractional part is optional so that a confidence written as a whole number
    # (e.g. "x_conf 99") is parsed instead of silently dropping the character.
    confidence_pattern = re.compile(r"x_conf (\d+(?:\.\d+)?)")

    kept_chars = []
    for character_span in word.find_all("span", class_="ocrx_cinfo"):
        char = character_span.text
        conf_match = confidence_pattern.search(character_span.get("title", ""))

        if not (char and conf_match):
            continue

        # x_conf is divided by 100 to compare against the 0-1 threshold.
        character_probability = float(conf_match.group(1)) / 100
        if character_probability >= character_confidence_threshold:
            kept_chars.append(char)

    return "".join(kept_chars)
|
||
@requires_dependencies("unstructured_inference") | ||
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]: | ||
from unstructured.partition.pdf_image.inference_utils import ( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder, maybe we'd like to have some really low default threshold, e.g. 0.1, just to filter out complete garbage chars?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am ok with 0; the default behavior is no filter at all so this PR should just keep that for now. We can use followups to change this value.