diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 64ba58e073..17589df06d 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -136,14 +136,12 @@ def hocr_to_dataframe( return ocr_df @staticmethod - def extract_word_from_hocr( - word: Tag, character_confidence_threshold: float = 0.0 - ) -> str | None: + def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str: """Extracts a word from an hOCR word tag, filtering out characters with low confidence.""" character_spans = word.find_all("span", class_="ocrx_cinfo") if len(character_spans) == 0: - return None + return "" word_text = "" for character_span in character_spans: