From 7e351b340c2669ebbc537c3ce8f37400f038893f Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 12:10:58 -0700
Subject: [PATCH 1/4] Add script that uses pdf2image and pytesseract to extract text from PDFs

---

Review notes (text in this position is not part of the commit):
    Adds a flat script that renders each page of one PDF to an image with
    pdf2image and prints the OCR text of every page.
    - pdf_path is the empty string in this revision; PATCH 2/4 replaces it
      with a directory taken from sys.argv.
    - TesseractError is imported here but not used until PATCH 2/4.

 src/convert_pdf_to_text.py | 22 ++++++++++++++++++++++
 test_pdfs/.gitignore       |  4 ++++
 2 files changed, 26 insertions(+)
 create mode 100644 src/convert_pdf_to_text.py
 create mode 100644 test_pdfs/.gitignore

diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
new file mode 100644
index 0000000..ca5651d
--- /dev/null
+++ b/src/convert_pdf_to_text.py
@@ -0,0 +1,22 @@
+# brew install poppler
+# pip install pdf2image
+# brew install tesseract
+# pip install pytesseract
+
+import pdf2image
+import pytesseract
+from pytesseract import Output, TesseractError
+
+pdf_path = ''
+print(f"converting {pdf_path} to images")
+images = pdf2image.convert_from_path(pdf_path)
+
+# pil_im = images[0] # assuming that we're interested in the first page only
+for i, image in enumerate(images):
+    # ocr_dict now holds all the OCR info including text and location on the image
+    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+    text = " ".join(ocr_dict['text'])
+
+    print(f"Page {i+1}")
+    print(text)
+    # print(ocr_dict)
diff --git a/test_pdfs/.gitignore b/test_pdfs/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/test_pdfs/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore

From 6169b74554ec915cafc6cb46da882521333469e7 Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:29:47 -0700
Subject: [PATCH 2/4] Update script to take directory as input and process all pdfs in directory into text files

---

Review notes (text in this position is not part of the commit):
    The script now takes a directory as sys.argv[1], OCRs every entry whose
    name ends with ".pdf", and writes one "<stem>_ocr.txt" per PDF into a
    "data" directory that sits beside the script's own directory.
    - The usage message names no argument although sys.argv[1] is required.
    - The ".pdf" suffix test is case-sensitive, so "X.PDF" is reported as
      "not a PDF file".
    - The loop index from enumerate(images) is no longer read after the
      per-page print is removed.
    - open(output_text_file, 'w') passes no encoding, so the output encoding
      follows the platform default.

 {test_pdfs => source_pdf_files}/.gitignore |  0
 src/convert_pdf_to_text.py                 | 61 ++++++++++++++++++----
 2 files changed, 50 insertions(+), 11 deletions(-)
 rename {test_pdfs => source_pdf_files}/.gitignore (100%)

diff --git a/test_pdfs/.gitignore b/source_pdf_files/.gitignore
similarity index 100%
rename from test_pdfs/.gitignore
rename to source_pdf_files/.gitignore
diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
index ca5651d..7b7a3ca 100644
--- a/src/convert_pdf_to_text.py
+++ b/src/convert_pdf_to_text.py
@@ -3,20 +3,59 @@
 # brew install tesseract
 # pip install pytesseract
 
+
+
+import os
+import sys
 import pdf2image
 import pytesseract
 from pytesseract import Output, TesseractError
 
-pdf_path = ''
-print(f"converting {pdf_path} to images")
-images = pdf2image.convert_from_path(pdf_path)
+# Ensure the user provides the directory with PDF files as an argument
+if len(sys.argv) < 2:
+    print("Usage: python script_name.py ")
+    sys.exit(1)
+
+# Get the directory with PDFs from the command line argument
+pdf_dir = sys.argv[1]
+
+# Ensure the provided path is a directory
+if not os.path.isdir(pdf_dir):
+    print(f"{pdf_dir} is not a valid directory")
+    sys.exit(1)
+
+# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
+script_dir = os.path.dirname(os.path.abspath(__file__))
+data_dir = os.path.join(os.path.dirname(script_dir), 'data')
+
+# Create the output directory if it doesn't exist
+if not os.path.exists(data_dir):
+    os.makedirs(data_dir)
+
+# Iterate over all the PDF files in the provided directory
+for file in os.listdir(pdf_dir):
+    if file.endswith(".pdf"):
+        pdf_path = os.path.join(pdf_dir, file)
+        print(f"Converting {pdf_path} to images")
+
+        try:
+            # Convert the PDF to images
+            images = pdf2image.convert_from_path(pdf_path)
+            print(f"pdf contains {len(images)} pages")
+            # Perform OCR on each image and save the result to a file
+            output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
+            with open(output_text_file, 'w') as f_out:
+                for i, image in enumerate(images):
+                    # Perform OCR on the image
+                    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+                    text = " ".join(ocr_dict['text'])
+                    f_out.write(f"{text}\n")
 
-# pil_im = images[0] # assuming that we're interested in the first page only
-for i, image in enumerate(images):
-    # ocr_dict now holds all the OCR info including text and location on the image
-    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
-    text = " ".join(ocr_dict['text'])
+            print(f"Finished processing {file}, output saved to {output_text_file}")
 
-    print(f"Page {i+1}")
-    print(text)
-    # print(ocr_dict)
+        except TesseractError as e:
+            print(f"Error processing {file}: {e}")
+        except Exception as e:
+            print(f"An error occurred with {file}: {e}")
+    else:
+        print(f"Skipping {file}, not a PDF file")

From 6a66fe8fee468bcc6dacc59bf4455360e800c808 Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:35:26 -0700
Subject: [PATCH 3/4] Add pip packages to requirements_dev

---

Review notes (text in this position is not part of the commit):
    Pins pdf2image and pytesseract in requirements_dev.txt and drops the
    install comments from the top of the script.
    - The removed comments were the only mention in this series of the
      "brew install poppler" and "brew install tesseract" steps;
      requirements_dev.txt covers the pip packages only.

 requirements_dev.txt       | 4 ++--
 src/convert_pdf_to_text.py | 7 -------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 474bd7f..e94309c 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -7,5 +7,5 @@ coverage==4.5.4
 Sphinx==7.2.6
 twine==5.0.0
 ruff==0.3.5
-
-
+pdf2image==1.17.0
+pytesseract==0.3.13
diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
index 7b7a3ca..57dcd73 100644
--- a/src/convert_pdf_to_text.py
+++ b/src/convert_pdf_to_text.py
@@ -1,10 +1,3 @@
-# brew install poppler
-# pip install pdf2image
-# brew install tesseract
-# pip install pytesseract
-
-
-
 import os
 import sys
 import pdf2image

From 3e3efae7362775d4a04cb25d4e87f4ed958ed1f2 Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 15:00:42 -0700
Subject: [PATCH 4/4] Rename ocr script, add script that uses pdftotext to extract text from PDFs

---

Review notes (text in this position is not part of the commit):
    Replaces src/convert_pdf_to_text.py with src/ocr_pdf_to_text.py (same OCR
    flow, with the non-PDF branch turned into an early "continue") and adds
    src/pdf_to_text.py, which writes the pages yielded by pdftotext.PDF to
    "<stem>.txt" in the same "data" directory.
    - src/pdf_to_text.py imports pdftotext, but no patch in this series adds
      it to requirements_dev.txt.
    - src/pdf_to_text.py has no try/except around per-file work, unlike
      src/ocr_pdf_to_text.py; an exception raised for one file propagates
      and ends the loop.
    - Non-PDF entries are now skipped without the "Skipping ..." message
      that PATCH 2/4 printed.
    - The points noted on PATCH 2/4 (usage message, case-sensitive ".pdf"
      test, unused enumerate index, default output encoding) carry over
      unchanged into the new files.

 src/convert_pdf_to_text.py | 54 --------------------------------------
 src/ocr_pdf_to_text.py     | 53 +++++++++++++++++++++++++++++++++++++
 src/pdf_to_text.py         | 42 +++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 54 deletions(-)
 delete mode 100644 src/convert_pdf_to_text.py
 create mode 100644 src/ocr_pdf_to_text.py
 create mode 100755 src/pdf_to_text.py

diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
deleted file mode 100644
index 57dcd73..0000000
--- a/src/convert_pdf_to_text.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-import sys
-import pdf2image
-import pytesseract
-from pytesseract import Output, TesseractError
-
-# Ensure the user provides the directory with PDF files as an argument
-if len(sys.argv) < 2:
-    print("Usage: python script_name.py ")
-    sys.exit(1)
-
-# Get the directory with PDFs from the command line argument
-pdf_dir = sys.argv[1]
-
-# Ensure the provided path is a directory
-if not os.path.isdir(pdf_dir):
-    print(f"{pdf_dir} is not a valid directory")
-    sys.exit(1)
-
-# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
-script_dir = os.path.dirname(os.path.abspath(__file__))
-data_dir = os.path.join(os.path.dirname(script_dir), 'data')
-
-# Create the output directory if it doesn't exist
-if not os.path.exists(data_dir):
-    os.makedirs(data_dir)
-
-# Iterate over all the PDF files in the provided directory
-for file in os.listdir(pdf_dir):
-    if file.endswith(".pdf"):
-        pdf_path = os.path.join(pdf_dir, file)
-        print(f"Converting {pdf_path} to images")
-
-        try:
-            # Convert the PDF to images
-            images = pdf2image.convert_from_path(pdf_path)
-            print(f"pdf contains {len(images)} pages")
-            # Perform OCR on each image and save the result to a file
-            output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
-            with open(output_text_file, 'w') as f_out:
-                for i, image in enumerate(images):
-                    # Perform OCR on the image
-                    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
-                    text = " ".join(ocr_dict['text'])
-                    f_out.write(f"{text}\n")
-
-            print(f"Finished processing {file}, output saved to {output_text_file}")
-
-        except TesseractError as e:
-            print(f"Error processing {file}: {e}")
-        except Exception as e:
-            print(f"An error occurred with {file}: {e}")
-    else:
-        print(f"Skipping {file}, not a PDF file")
diff --git a/src/ocr_pdf_to_text.py b/src/ocr_pdf_to_text.py
new file mode 100644
index 0000000..f3c5ba6
--- /dev/null
+++ b/src/ocr_pdf_to_text.py
@@ -0,0 +1,53 @@
+import os
+import sys
+import pdf2image
+import pytesseract
+from pytesseract import Output, TesseractError
+
+# Ensure the user provides the directory with PDF files as an argument
+if len(sys.argv) < 2:
+    print("Usage: python script_name.py ")
+    sys.exit(1)
+
+# Get the directory with PDFs from the command line argument
+pdf_dir = sys.argv[1]
+
+# Ensure the provided path is a directory
+if not os.path.isdir(pdf_dir):
+    print(f"{pdf_dir} is not a valid directory")
+    sys.exit(1)
+
+# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
+script_dir = os.path.dirname(os.path.abspath(__file__))
+data_dir = os.path.join(os.path.dirname(script_dir), 'data')
+
+# Create the output directory if it doesn't exist
+if not os.path.exists(data_dir):
+    os.makedirs(data_dir)
+
+# Iterate over all the PDF files in the provided directory
+for file in os.listdir(pdf_dir):
+    if not file.endswith(".pdf"):
+        continue
+    pdf_path = os.path.join(pdf_dir, file)
+    print(f"Converting {pdf_path} to images")
+
+    try:
+        # Convert the PDF to images
+        images = pdf2image.convert_from_path(pdf_path)
+        print(f"pdf contains {len(images)} pages")
+        # Perform OCR on each image and save the result to a file
+        output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
+        with open(output_text_file, 'w') as f_out:
+            for i, image in enumerate(images):
+                # Perform OCR on the image
+                ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+                text = " ".join(ocr_dict['text'])
+                f_out.write(f"{text}\n")
+
+        print(f"Finished processing {file}, output saved to {output_text_file}")
+
+    except TesseractError as e:
+        print(f"Error processing {file}: {e}")
+    except Exception as e:
+        print(f"An error occurred with {file}: {e}")
diff --git a/src/pdf_to_text.py b/src/pdf_to_text.py
new file mode 100755
index 0000000..786e067
--- /dev/null
+++ b/src/pdf_to_text.py
@@ -0,0 +1,42 @@
+import os
+import sys
+import pdftotext
+
+# Ensure the user provides the directory with PDF files as an argument
+if len(sys.argv) < 2:
+    print("Usage: python script_name.py ")
+    sys.exit(1)
+
+# Get the directory with PDFs from the command line argument
+pdf_dir = sys.argv[1]
+
+# Ensure the provided path is a directory
+if not os.path.isdir(pdf_dir):
+    print(f"{pdf_dir} is not a valid directory")
+    sys.exit(1)
+
+# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
+script_dir = os.path.dirname(os.path.abspath(__file__))
+data_dir = os.path.join(os.path.dirname(script_dir), 'data')
+
+# Create the output directory if it doesn't exist
+if not os.path.exists(data_dir):
+    os.makedirs(data_dir)
+
+# Iterate over all the PDF files in the provided directory
+for file in os.listdir(pdf_dir):
+    if not file.endswith(".pdf"):
+        continue
+
+    pdf_path = os.path.join(pdf_dir, file)
+    print(f"Converting {pdf_path} to text")
+
+    with open(pdf_path, "rb") as f:
+        pdf = pdftotext.PDF(f)
+
+    output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}.txt")
+    with open(output_text_file, 'w') as f_out:
+        for page in pdf:
+            f_out.write(f"{page}\n")
+
+    print(f"Finished processing {file}, output saved to {output_text_file}")