Use word bboxes instead of character bboxes

Unstructured-IO · Jan 8, 2025 · c1e9b8e · c1e9b8e
1 parent 3bff8ae
commit c1e9b8e
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 52 deletions.
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -487,42 +487,67 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     assert any(element in final_layout for element in ocr_elements)
 
 
-def test_extract_word_from_hocr():
-    def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag:
-        word_span = BeautifulSoup("<span class='ocrx_word'></span>", "html.parser").span
-        for char, x_conf, bbox in characters:
-            char_span = BeautifulSoup(
-                f"""
-                <span class='ocrx_cinfo' title='x_bboxes {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}; x_conf {x_conf}'>{char}</span>
-                """,  # noqa : E501
-                "html.parser",
-            ).span
-            word_span.append(char_span)
-        return word_span
+def _create_hocr_word_span(
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='"
+        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
+        f"; x_wconf 64'></span>",
+        "html.parser",
+    ).span
+    for char, x_conf in characters:
+        char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
+            """,  # noqa : E501
+            "html.parser",
+        ).span
+        word_span.append(char_span)
+    return word_span
+
 
+def test_extract_word_from_hocr():
     characters = [
-        ("w", "99.0", [10, 10, 20, 20]),
-        ("o", "98.5", [21, 9, 29, 20]),
-        ("r", "97.5", [31, 10, 40, 21]),
-        ("d", "96.0", [41, 11, 50, 22]),
-        ("!", "50.0", [51, 10, 60, 20]),
-        ("@", "45.0", [61, 10, 70, 20]),
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
     ]
+    word_bbox = (10, 9, 70, 22)
+    word_span = _create_hocr_word_span(characters, word_bbox)
 
-    word_span = _create_hocr_word_span(characters)
-
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
     assert text == "word!@"
-    assert bbox == [10, 9, 70, 22]
 
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
     assert text == "word"
-    assert bbox == [10, 9, 50, 22]
 
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
     assert text == "w"
-    assert bbox == [10, 10, 20, 20]
 
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
     assert text == ""
-    assert bbox is None
+
+
+def test_hocr_to_dataframe():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    hocr = str(_create_hocr_word_span(characters, word_bbox))
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+    assert df.shape == (1, 5)
+    assert df["left"].iloc[0] == 10
+    assert df["top"].iloc[0] == 9
+    assert df["width"].iloc[0] == 60
+    assert df["height"].iloc[0] == 13
+    assert df["text"].iloc[0] == "word"
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -107,15 +107,22 @@ def hocr_to_dataframe(
         self, hocr: str, character_confidence_threshold: float = 0.0
     ) -> pd.DataFrame:
         soup = BeautifulSoup(hocr, "html.parser")
-        words = soup.find_all("span", class_="ocrx_word")
+        word_spans = soup.find_all("span", class_="ocrx_word")
 
         df_entries = []
-        for word in words:
-            text, bbox = self.extract_word_from_hocr(
-                word=word, character_confidence_threshold=character_confidence_threshold
+        for word_span in word_spans:
+            word_title = word_span.get("title", "")
+            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
+
+            # Note: word bbox is used instead of combining characters together due to tesseract
+            # bug that causes the character bboxes to be outside the word bbox, and they have 0
+            # height or width when text is horizontal
+            text = self.extract_word_from_hocr(
+                word=word_span, character_confidence_threshold=character_confidence_threshold
             )
-            if text and bbox:
-                left, top, right, bottom = bbox
+            if text and bbox_match:
+                word_bbox = list(map(int, bbox_match.groups()))
+                left, top, right, bottom = word_bbox
                 df_entries.append(
                     {
                         "left": left,
@@ -131,42 +138,29 @@ def hocr_to_dataframe(
     @staticmethod
     def extract_word_from_hocr(
         word: Tag, character_confidence_threshold: float = 0.0
-    ) -> tuple[str, list[int] | None]:
+    ) -> str | None:
         """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
 
         character_spans = word.find_all("span", class_="ocrx_cinfo")
         if len(character_spans) == 0:
-            return "", None
+            return None
 
         word_text = ""
-        word_bbox = None
-
         for character_span in character_spans:
             char = character_span.text
 
             char_title = character_span.get("title", "")
             conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
-            bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title)
 
-            if not (char and conf_match and bbox_match):
+            if not (char and conf_match):
                 continue
 
             character_probability = float(conf_match.group(1)) / 100
-            character_bbox = list(map(int, bbox_match.groups()))
 
             if character_probability >= character_confidence_threshold:
                 word_text += char
-                if word_bbox is None:
-                    word_bbox = character_bbox
-                else:
-                    word_bbox = [
-                        min(word_bbox[0], character_bbox[0]),  # x1 - starts from 0
-                        min(word_bbox[1], character_bbox[1]),  # y1 - starts from 0
-                        max(word_bbox[2], character_bbox[2]),
-                        max(word_bbox[3], character_bbox[3]),
-                    ]
-
-        return word_text, word_bbox
+
+        return word_text
 
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]: