Skip to content

Commit

Permalink
Do not return None
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Jan 8, 2025
1 parent c1e9b8e commit 0e44926
Showing 1 changed file with 2 additions and 4 deletions.
6 changes: 2 additions & 4 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,12 @@ def hocr_to_dataframe(
return ocr_df

@staticmethod
- def extract_word_from_hocr(
- word: Tag, character_confidence_threshold: float = 0.0
- ) -> str | None:
+ def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
  """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""

  character_spans = word.find_all("span", class_="ocrx_cinfo")
  if len(character_spans) == 0:
- return None
+ return ""

word_text = ""
for character_span in character_spans:
Expand Down

0 comments on commit 0e44926

Please sign in to comment.