diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 444a0a6623..e72826ffce 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -98,7 +98,9 @@ def image_to_data_with_character_confidence_filter(
         hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr(
             image,
             lang=lang,
-            config="-c hocr_char_boxes=1 " + config,
+            # `--psm 12` is a CLI flag, not a `-c` variable; a bare `psm=12` is not applied.
+            # Keep the trailing space so caller-supplied `config` stays a separate token.
+            config="-c hocr_char_boxes=1 --psm 12 " + config,
             extension="hocr",
         )
         soup = BeautifulSoup(hocr, "html.parser")