From 7e351b340c2669ebbc537c3ce8f37400f038893f Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 12:10:58 -0700
Subject: [PATCH 1/4] Add script that uses pdf2image and pytesseract to extract
 text from PDFs

---
 src/convert_pdf_to_text.py | 22 ++++++++++++++++++++++
 test_pdfs/.gitignore       |  4 ++++
 2 files changed, 26 insertions(+)
 create mode 100644 src/convert_pdf_to_text.py
 create mode 100644 test_pdfs/.gitignore

diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
new file mode 100644
index 0000000..ca5651d
--- /dev/null
+++ b/src/convert_pdf_to_text.py
@@ -0,0 +1,22 @@
+# brew install poppler
+# pip install pdf2image
+# brew install tesseract
+# pip install pytesseract
+
+import pdf2image
+import pytesseract
+from pytesseract import Output, TesseractError
+
+pdf_path = '<FILL_THIS_IN>'
+print(f"converting {pdf_path} to images")
+images = pdf2image.convert_from_path(pdf_path)
+
+# pil_im = images[0] # assuming that we're interested in the first page only
+for i, image in enumerate(images):
+    # ocr_dict now holds all the OCR info including text and location on the image
+    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+    text = " ".join(ocr_dict['text'])
+
+    print(f"Page {i+1}")
+    print(text)
+    # print(ocr_dict)
diff --git a/test_pdfs/.gitignore b/test_pdfs/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/test_pdfs/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore

From 6169b74554ec915cafc6cb46da882521333469e7 Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:29:47 -0700
Subject: [PATCH 2/4] Update script to take directory as input and process all
 pdfs in directory into text files

---
 {test_pdfs => source_pdf_files}/.gitignore |  0
 src/convert_pdf_to_text.py                 | 61 ++++++++++++++++++----
 2 files changed, 50 insertions(+), 11 deletions(-)
 rename {test_pdfs => source_pdf_files}/.gitignore (100%)

diff --git a/test_pdfs/.gitignore b/source_pdf_files/.gitignore
similarity index 100%
rename from test_pdfs/.gitignore
rename to source_pdf_files/.gitignore
diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
index ca5651d..7b7a3ca 100644
--- a/src/convert_pdf_to_text.py
+++ b/src/convert_pdf_to_text.py
@@ -3,20 +3,59 @@
 # brew install tesseract
 # pip install pytesseract
 
+
+
+import os
+import sys
 import pdf2image
 import pytesseract
 from pytesseract import Output, TesseractError
 
-pdf_path = '<FILL_THIS_IN>'
-print(f"converting {pdf_path} to images")
-images = pdf2image.convert_from_path(pdf_path)
+# Ensure the user provides the directory with PDF files as an argument
+if len(sys.argv) < 2:
+    print("Usage: python script_name.py <pdf_directory>")
+    sys.exit(1)
+
+# Get the directory with PDFs from the command line argument
+pdf_dir = sys.argv[1]
+
+# Ensure the provided path is a directory
+if not os.path.isdir(pdf_dir):
+    print(f"{pdf_dir} is not a valid directory")
+    sys.exit(1)
+
+# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
+script_dir = os.path.dirname(os.path.abspath(__file__))
+data_dir = os.path.join(os.path.dirname(script_dir), 'data')
+
+# Create the output directory if it doesn't exist
+if not os.path.exists(data_dir):
+    os.makedirs(data_dir)
+
+# Iterate over all the PDF files in the provided directory
+for file in os.listdir(pdf_dir):
+    if file.endswith(".pdf"):
+        pdf_path = os.path.join(pdf_dir, file)
+        print(f"Converting {pdf_path} to images")
+
+        try:
+            # Convert the PDF to images
+            images = pdf2image.convert_from_path(pdf_path)
+            print(f"pdf contains {len(images)} pages")
+            # Perform OCR on each image and save the result to a file
+            output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
+            with open(output_text_file, 'w') as f_out:
+                for i, image in enumerate(images):
+                    # Perform OCR on the image
+                    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+                    text = " ".join(ocr_dict['text'])
+                    f_out.write(f"{text}\n")
 
-# pil_im = images[0] # assuming that we're interested in the first page only
-for i, image in enumerate(images):
-    # ocr_dict now holds all the OCR info including text and location on the image
-    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
-    text = " ".join(ocr_dict['text'])
+            print(f"Finished processing {file}, output saved to {output_text_file}")
 
-    print(f"Page {i+1}")
-    print(text)
-    # print(ocr_dict)
+        except TesseractError as e:
+            print(f"Error processing {file}: {e}")
+        except Exception as e:
+            print(f"An error occurred with {file}: {e}")
+    else:
+        print(f"Skipping {file}, not a PDF file")

From 6a66fe8fee468bcc6dacc59bf4455360e800c808 Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:35:26 -0700
Subject: [PATCH 3/4] Add pip packages to requirements_dev

---
 requirements_dev.txt       | 4 ++--
 src/convert_pdf_to_text.py | 7 -------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 474bd7f..e94309c 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -7,5 +7,5 @@ coverage==4.5.4
 Sphinx==7.2.6
 twine==5.0.0
 ruff==0.3.5
-
-
+pdf2image==1.17.0
+pytesseract==0.3.13
diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
index 7b7a3ca..57dcd73 100644
--- a/src/convert_pdf_to_text.py
+++ b/src/convert_pdf_to_text.py
@@ -1,10 +1,3 @@
-# brew install poppler
-# pip install pdf2image
-# brew install tesseract
-# pip install pytesseract
-
-
-
 import os
 import sys
 import pdf2image

From 3e3efae7362775d4a04cb25d4e87f4ed958ed1f2 Mon Sep 17 00:00:00 2001
From: Jerry Fu <2072627+jfoo1984@users.noreply.github.com>
Date: Thu, 19 Sep 2024 15:00:42 -0700
Subject: [PATCH 4/4] Rename ocr script, add script that uses pdftotext to
 extract text from PDFs

---
 src/convert_pdf_to_text.py | 54 --------------------------------------
 src/ocr_pdf_to_text.py     | 53 +++++++++++++++++++++++++++++++++++++
 src/pdf_to_text.py         | 42 +++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 54 deletions(-)
 delete mode 100644 src/convert_pdf_to_text.py
 create mode 100644 src/ocr_pdf_to_text.py
 create mode 100755 src/pdf_to_text.py

diff --git a/src/convert_pdf_to_text.py b/src/convert_pdf_to_text.py
deleted file mode 100644
index 57dcd73..0000000
--- a/src/convert_pdf_to_text.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-import sys
-import pdf2image
-import pytesseract
-from pytesseract import Output, TesseractError
-
-# Ensure the user provides the directory with PDF files as an argument
-if len(sys.argv) < 2:
-    print("Usage: python script_name.py <pdf_directory>")
-    sys.exit(1)
-
-# Get the directory with PDFs from the command line argument
-pdf_dir = sys.argv[1]
-
-# Ensure the provided path is a directory
-if not os.path.isdir(pdf_dir):
-    print(f"{pdf_dir} is not a valid directory")
-    sys.exit(1)
-
-# Set up the output directory (assuming the script is in the src folder and we want data at the same level)
-script_dir = os.path.dirname(os.path.abspath(__file__))
-data_dir = os.path.join(os.path.dirname(script_dir), 'data')
-
-# Create the output directory if it doesn't exist
-if not os.path.exists(data_dir):
-    os.makedirs(data_dir)
-
-# Iterate over all the PDF files in the provided directory
-for file in os.listdir(pdf_dir):
-    if file.endswith(".pdf"):
-        pdf_path = os.path.join(pdf_dir, file)
-        print(f"Converting {pdf_path} to images")
-
-        try:
-            # Convert the PDF to images
-            images = pdf2image.convert_from_path(pdf_path)
-            print(f"pdf contains {len(images)} pages")
-            # Perform OCR on each image and save the result to a file
-            output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
-            with open(output_text_file, 'w') as f_out:
-                for i, image in enumerate(images):
-                    # Perform OCR on the image
-                    ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
-                    text = " ".join(ocr_dict['text'])
-                    f_out.write(f"{text}\n")
-
-            print(f"Finished processing {file}, output saved to {output_text_file}")
-
-        except TesseractError as e:
-            print(f"Error processing {file}: {e}")
-        except Exception as e:
-            print(f"An error occurred with {file}: {e}")
-    else:
-        print(f"Skipping {file}, not a PDF file")
diff --git a/src/ocr_pdf_to_text.py b/src/ocr_pdf_to_text.py
new file mode 100644
index 0000000..f3c5ba6
--- /dev/null
+++ b/src/ocr_pdf_to_text.py
@@ -0,0 +1,71 @@
+"""OCR every PDF in a directory and write the recognized text to ../data.
+
+Usage: python ocr_pdf_to_text.py <pdf_directory>
+
+Each ``<name>.pdf`` is rasterized with pdf2image (needs the poppler
+utilities) and read with pytesseract (needs the tesseract binary); the
+text is saved as ``<name>_ocr.txt``, one line per page.
+"""
+
+import os
+import sys
+import pdf2image
+import pytesseract
+from pytesseract import Output, TesseractError
+
+# Ensure the user provides the directory with PDF files as an argument
+if len(sys.argv) < 2:
+    print(f"Usage: python {os.path.basename(__file__)} <pdf_directory>")
+    sys.exit(1)
+
+# Get the directory with PDFs from the command line argument
+pdf_dir = sys.argv[1]
+
+# Ensure the provided path is a directory
+if not os.path.isdir(pdf_dir):
+    print(f"{pdf_dir} is not a valid directory")
+    sys.exit(1)
+
+# Output goes to the "data" directory next to the directory holding this script
+script_dir = os.path.dirname(os.path.abspath(__file__))
+data_dir = os.path.join(os.path.dirname(script_dir), 'data')
+
+# exist_ok avoids the check-then-create race of a separate os.path.exists() test
+os.makedirs(data_dir, exist_ok=True)
+
+# Iterate over the PDF files in a stable order; match the extension
+# case-insensitively so "SCAN.PDF" is not silently skipped
+for file in sorted(os.listdir(pdf_dir)):
+    if not file.lower().endswith(".pdf"):
+        continue
+    pdf_path = os.path.join(pdf_dir, file)
+    print(f"Converting {pdf_path} to images")
+
+    try:
+        # Convert the PDF to images (one image per page)
+        images = pdf2image.convert_from_path(pdf_path)
+        print(f"pdf contains {len(images)} pages")
+
+        # OCR every page before opening the output file, so a failure
+        # part-way through does not leave a truncated text file behind
+        page_texts = []
+        for image in images:
+            ocr_dict = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+            # image_to_data also emits rows for page/block/paragraph/line
+            # levels whose text is empty; keep only the actual words
+            words = [word for word in ocr_dict['text'] if word.strip()]
+            page_texts.append(" ".join(words))
+
+        output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}_ocr.txt")
+        # Explicit UTF-8: OCR output can contain non-ASCII characters that the
+        # platform default encoding may be unable to represent
+        with open(output_text_file, 'w', encoding='utf-8') as f_out:
+            for text in page_texts:
+                f_out.write(f"{text}\n")
+
+        print(f"Finished processing {file}, output saved to {output_text_file}")
+
+    except TesseractError as e:
+        print(f"Error processing {file}: {e}")
+    except Exception as e:
+        print(f"An error occurred with {file}: {e}")
diff --git a/src/pdf_to_text.py b/src/pdf_to_text.py
new file mode 100755
index 0000000..786e067
--- /dev/null
+++ b/src/pdf_to_text.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""Extract the embedded text of every PDF in a directory into ../data.
+
+Usage: python pdf_to_text.py <pdf_directory>
+
+Each ``<name>.pdf`` is read with pdftotext and saved as ``<name>.txt``,
+with a newline written after each page. No OCR is performed; see
+ocr_pdf_to_text.py for scanned documents.
+"""
+
+import os
+import sys
+import pdftotext
+
+# Ensure the user provides the directory with PDF files as an argument
+if len(sys.argv) < 2:
+    print(f"Usage: python {os.path.basename(__file__)} <pdf_directory>")
+    sys.exit(1)
+
+# Get the directory with PDFs from the command line argument
+pdf_dir = sys.argv[1]
+
+# Ensure the provided path is a directory
+if not os.path.isdir(pdf_dir):
+    print(f"{pdf_dir} is not a valid directory")
+    sys.exit(1)
+
+# Output goes to the "data" directory next to the directory holding this script
+script_dir = os.path.dirname(os.path.abspath(__file__))
+data_dir = os.path.join(os.path.dirname(script_dir), 'data')
+
+# exist_ok avoids the check-then-create race of a separate os.path.exists() test
+os.makedirs(data_dir, exist_ok=True)
+
+# Iterate over the PDF files in a stable order; match the extension
+# case-insensitively so "REPORT.PDF" is not silently skipped
+for file in sorted(os.listdir(pdf_dir)):
+    if not file.lower().endswith(".pdf"):
+        continue
+
+    pdf_path = os.path.join(pdf_dir, file)
+    print(f"Converting {pdf_path} to text")
+
+    try:
+        with open(pdf_path, "rb") as f:
+            pdf = pdftotext.PDF(f)
+
+        # Extract every page before opening the output file, so a failure
+        # part-way through does not leave a truncated text file behind
+        pages = list(pdf)
+
+        output_text_file = os.path.join(data_dir, f"{os.path.splitext(file)[0]}.txt")
+        # Explicit UTF-8: PDF text routinely contains non-ASCII characters that
+        # the platform default encoding may be unable to represent
+        with open(output_text_file, 'w', encoding='utf-8') as f_out:
+            for page in pages:
+                f_out.write(f"{page}\n")
+
+        print(f"Finished processing {file}, output saved to {output_text_file}")
+
+    except pdftotext.Error as e:
+        print(f"Error processing {file}: {e}")
+    except Exception as e:
+        # Keep going so one unreadable PDF does not abort the whole batch
+        print(f"An error occurred with {file}: {e}")