Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script that uses pdf2image and pytesseract to extract text from PDFs #10

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ coverage==4.5.4
Sphinx==7.2.6
twine==5.0.0
ruff==0.3.5


pdf2image==1.17.0
pytesseract==0.3.13
4 changes: 4 additions & 0 deletions source_pdf_files/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
53 changes: 53 additions & 0 deletions src/ocr_pdf_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import sys
import pdf2image
import pytesseract
from pytesseract import Output, TesseractError

# Script body: OCR every PDF found in the directory passed as the first
# command-line argument and write one "<name>_ocr.txt" file per PDF into a
# "data" directory located alongside the directory that contains this script.

# Ensure the user provides the directory with PDF files as an argument
if len(sys.argv) < 2:
    print("Usage: python script_name.py <pdf_directory>")
    sys.exit(1)

# Get the directory with PDFs from the command line argument
pdf_dir = sys.argv[1]

# Ensure the provided path is a directory
if not os.path.isdir(pdf_dir):
    print(f"{pdf_dir} is not a valid directory")
    sys.exit(1)

# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(os.path.dirname(script_dir), 'data')

# Create the output directory if it doesn't exist. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

# Iterate over all the PDF files in the provided directory
for file in os.listdir(pdf_dir):
    # Compare the extension case-insensitively so "SCAN.PDF" is not skipped.
    if not file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_dir, file)
    print(f"Converting {pdf_path} to images")

    # Errors are handled per file so one bad PDF does not abort the batch.
    try:
        # Convert the PDF to images (one image per page)
        images = pdf2image.convert_from_path(pdf_path)
        print(f"pdf contains {len(images)} pages")
        # Perform OCR on each image and save the result to a file
        output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
        # Write UTF-8 explicitly: the platform default encoding may not be
        # able to represent every character the OCR engine emits.
        with open(output_text_file, 'w', encoding='utf-8') as f_out:
            for image in images:
                # Perform OCR on the image
                ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
                # Join the recognised text entries of the page with spaces
                # and emit them followed by a newline.
                text = " ".join(ocr_dict['text'])
                f_out.write(f"{text}\n")

        print(f"Finished processing {file}, output saved to {output_text_file}")

    except TesseractError as e:
        print(f"Error processing {file}: {e}")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch job, so report the problem and move on to the next PDF.
        print(f"An error occurred with {file}: {e}")
42 changes: 42 additions & 0 deletions src/pdf_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import sys
import pdftotext

# Script body: extract the embedded text of every PDF found in the directory
# passed as the first command-line argument and write one "<name>.txt" file
# per PDF into a "data" directory located alongside the directory that
# contains this script.

# Ensure the user provides the directory with PDF files as an argument
if len(sys.argv) < 2:
    print("Usage: python script_name.py <pdf_directory>")
    sys.exit(1)

# Get the directory with PDFs from the command line argument
pdf_dir = sys.argv[1]

# Ensure the provided path is a directory
if not os.path.isdir(pdf_dir):
    print(f"{pdf_dir} is not a valid directory")
    sys.exit(1)

# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(os.path.dirname(script_dir), 'data')

# Create the output directory if it doesn't exist. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

# Iterate over all the PDF files in the provided directory
for file in os.listdir(pdf_dir):
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, file)
    print(f"Converting {pdf_path} to text")

    # Errors are handled per file so one unreadable or corrupt PDF does not
    # abort the whole batch.
    try:
        with open(pdf_path, "rb") as f:
            pdf = pdftotext.PDF(f)

        output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}.txt")
        # Write UTF-8 explicitly: the platform default encoding may not be
        # able to represent every character present in the PDF text.
        with open(output_text_file, 'w', encoding='utf-8') as f_out:
            # Each page's text is written followed by a newline.
            for page in pdf:
                f_out.write(f"{page}\n")

        print(f"Finished processing {file}, output saved to {output_text_file}")

    except pdftotext.Error as e:
        print(f"Error processing {file}: {e}")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch job, so report the problem and move on to the next PDF.
        print(f"An error occurred with {file}: {e}")