Skip to content

Commit

Permalink
Use word bboxes instead of character bboxes
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Jan 8, 2025
1 parent 3bff8ae commit c1e9b8e
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 52 deletions.
81 changes: 53 additions & 28 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,42 +487,67 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
assert any(element in final_layout for element in ocr_elements)


def test_extract_word_from_hocr():
def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag:
word_span = BeautifulSoup("<span class='ocrx_word'></span>", "html.parser").span
for char, x_conf, bbox in characters:
char_span = BeautifulSoup(
f"""
<span class='ocrx_cinfo' title='x_bboxes {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}; x_conf {x_conf}'>{char}</span>
""", # noqa : E501
"html.parser",
).span
word_span.append(char_span)
return word_span
def _create_hocr_word_span(
characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
) -> Tag:
word_span = BeautifulSoup(
f"<span class='ocrx_word' title='"
f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
f"; x_wconf 64'></span>",
"html.parser",
).span
for char, x_conf in characters:
char_span = BeautifulSoup(
f"""
<span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
""", # noqa : E501
"html.parser",
).span
word_span.append(char_span)
return word_span


def test_extract_word_from_hocr():
characters = [
("w", "99.0", [10, 10, 20, 20]),
("o", "98.5", [21, 9, 29, 20]),
("r", "97.5", [31, 10, 40, 21]),
("d", "96.0", [41, 11, 50, 22]),
("!", "50.0", [51, 10, 60, 20]),
("@", "45.0", [61, 10, 70, 20]),
("w", "99.0"),
("o", "98.5"),
("r", "97.5"),
("d", "96.0"),
("!", "50.0"),
("@", "45.0"),
]
word_bbox = (10, 9, 70, 22)
word_span = _create_hocr_word_span(characters, word_bbox)

word_span = _create_hocr_word_span(characters)

text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
assert text == "word!@"
assert bbox == [10, 9, 70, 22]

text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
assert text == "word"
assert bbox == [10, 9, 50, 22]

text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
assert text == "w"
assert bbox == [10, 10, 20, 20]

text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
assert text == ""
assert bbox is None


def test_hocr_to_dataframe():
characters = [
("w", "99.0"),
("o", "98.5"),
("r", "97.5"),
("d", "96.0"),
("!", "50.0"),
("@", "45.0"),
]
word_bbox = (10, 9, 70, 22)
hocr = str(_create_hocr_word_span(characters, word_bbox))
df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)

assert df.shape == (1, 5)
assert df["left"].iloc[0] == 10
assert df["top"].iloc[0] == 9
assert df["width"].iloc[0] == 60
assert df["height"].iloc[0] == 13
assert df["text"].iloc[0] == "word"
42 changes: 18 additions & 24 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,22 @@ def hocr_to_dataframe(
self, hocr: str, character_confidence_threshold: float = 0.0
) -> pd.DataFrame:
soup = BeautifulSoup(hocr, "html.parser")
words = soup.find_all("span", class_="ocrx_word")
word_spans = soup.find_all("span", class_="ocrx_word")

df_entries = []
for word in words:
text, bbox = self.extract_word_from_hocr(
word=word, character_confidence_threshold=character_confidence_threshold
for word_span in word_spans:
word_title = word_span.get("title", "")
bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)

# Note: the word bbox is used instead of merging the character bboxes because of a
# Tesseract bug: character bboxes can fall outside the word bbox, and they have zero
# height or width when the text is horizontal
text = self.extract_word_from_hocr(
word=word_span, character_confidence_threshold=character_confidence_threshold
)
if text and bbox:
left, top, right, bottom = bbox
if text and bbox_match:
word_bbox = list(map(int, bbox_match.groups()))
left, top, right, bottom = word_bbox
df_entries.append(
{
"left": left,
Expand All @@ -131,42 +138,29 @@ def hocr_to_dataframe(
@staticmethod
def extract_word_from_hocr(
word: Tag, character_confidence_threshold: float = 0.0
) -> tuple[str, list[int] | None]:
) -> str | None:
"""Extracts a word from an hOCR word tag, filtering out characters with low confidence."""

character_spans = word.find_all("span", class_="ocrx_cinfo")
if len(character_spans) == 0:
return "", None
return None

word_text = ""
word_bbox = None

for character_span in character_spans:
char = character_span.text

char_title = character_span.get("title", "")
conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title)

if not (char and conf_match and bbox_match):
if not (char and conf_match):
continue

character_probability = float(conf_match.group(1)) / 100
character_bbox = list(map(int, bbox_match.groups()))

if character_probability >= character_confidence_threshold:
word_text += char
if word_bbox is None:
word_bbox = character_bbox
else:
word_bbox = [
min(word_bbox[0], character_bbox[0]), # x1 - starts from 0
min(word_bbox[1], character_bbox[1]), # y1 - starts from 0
max(word_bbox[2], character_bbox[2]),
max(word_bbox[3], character_bbox[3]),
]

return word_text, word_bbox

return word_text

@requires_dependencies("unstructured_inference")
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
Expand Down

0 comments on commit c1e9b8e

Please sign in to comment.