-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path10_extract_1996.py
119 lines (104 loc) · 3.69 KB
/
10_extract_1996.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -----------------------------------------------
# 10. Catalog 1996
#
# Extract course data from the scanned
# 1996 MIT course catalog. After extracting
# the text, create a data model and save the
# processed data. This task emphasizes
# working with raw, scanned documents
# and aims to teach you how to extract
# information from non-digitized sources.
# -----------------------------------------------
# 10_extract_1996.py
"""
Script to extract course data from the scanned 1996 MIT course catalog PDF.
"""
import os
import json
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import re
from tqdm import tqdm
# Input and output locations. Both are relative paths, so they resolve
# against the directory the script is run from.
PDF_FILE = 'mit_course_catalog_1996.pdf'  # Scanned catalog PDF; download it before running
OUTPUT_JSON = '10_mit_1996.json'  # File the extracted course records are written to
# Optional: point pytesseract at the tesseract executable when it is not on PATH.
# For example, on Windows:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def pdf_to_images(pdf_path):
    """Convert every page of a PDF into an image.

    Args:
        pdf_path: Path to the PDF file to convert.

    Returns:
        The list of page images returned by ``convert_from_path``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not name an existing file.
    """
    # Fail fast with an explicit message rather than letting the PDF
    # backend report a missing input in its own, less direct way.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    print("Converting PDF pages to images...")
    images = convert_from_path(pdf_path)
    print(f"Converted {len(images)} pages into images.")
    return images
def ocr_images(images):
    """Perform OCR on each image and collect the recognised text.

    Args:
        images: Iterable of page images to pass to
            ``pytesseract.image_to_string``.

    Returns:
        A list of strings, one per input image, in input order.
    """
    print("Performing OCR on images...")
    # tqdm only wraps the iterable to show a progress bar; the items it
    # yields are the images themselves.
    return [
        pytesseract.image_to_string(image)
        for image in tqdm(images, desc="OCR Processing")
    ]
def _collect_description(lines, start, course_pattern):
    """Gather the description lines that follow a course header.

    Scans ``lines`` from index ``start`` until a blank line, the next
    course header, or the end of the page is reached.

    Returns:
        A ``(description, next_index)`` tuple: the collected lines joined
        with single spaces, and the index of the line that ended the scan
        (``len(lines)`` when the page ran out).
    """
    collected = []
    index = start
    while index < len(lines):
        candidate = lines[index].strip()
        # A blank line or another course header terminates the description.
        # The terminating line is NOT consumed, so the caller re-examines it
        # and a following header is still picked up as its own course.
        if not candidate or course_pattern.match(candidate):
            break
        collected.append(candidate)
        index += 1
    return ' '.join(collected).strip(), index


def extract_courses(text_pages):
    """Extract course records from OCR'd page text.

    A course starts on a line that begins with a course number (one to
    three digits, a dot, then one to three alphanumerics with an optional
    trailing ``J``) followed by whitespace and a title. The lines after it,
    up to the next blank line or course header, form its description.
    Descriptions are collected per page and never continue onto the
    following page.

    Args:
        text_pages: Sequence of strings, one per page, in page order.

    Returns:
        A list of dicts with the keys ``course_code``, ``course_title``,
        ``course_description`` and ``page_number`` (1-indexed).
    """
    print("Extracting courses from text...")
    # Applied with .match() to single, already-stripped lines, so no
    # MULTILINE flag is needed. Only numbers that start with a digit are
    # recognised.
    course_pattern = re.compile(r'^(\d{1,3}\.[A-Za-z0-9]{1,3}[J]?)\s+(.+)')
    courses = []
    for page_number, text in enumerate(text_pages, start=1):
        lines = text.split('\n')
        i = 0
        while i < len(lines):
            match = course_pattern.match(lines[i].strip())
            if match is None:
                i += 1  # Not a course header; move to the next line
                continue
            description, i = _collect_description(lines, i + 1, course_pattern)
            courses.append({
                'course_code': match.group(1),
                'course_title': match.group(2).strip(),
                'course_description': description,
                'page_number': page_number,
            })
    print(f"Extracted {len(courses)} courses.")
    return courses
def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as indented, UTF-8 encoded JSON.

    Args:
        data: Any JSON-serialisable object.
        output_file: Path of the file to create or overwrite.
    """
    print(f"Saving data to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters from the OCR text
        # readable in the file instead of escaping them as \uXXXX; the
        # file is opened as UTF-8, so they are encoded correctly.
        json.dump(data, f, indent=2, ensure_ascii=False)
    print("Data saved successfully.")
def main():
    """Run the pipeline: PDF -> page images -> OCR text -> courses -> JSON."""
    # Render the catalog, then OCR each rendered page.
    page_images = pdf_to_images(PDF_FILE)
    page_texts = ocr_images(page_images)
    # Parse the recognised text into course records and persist them.
    course_records = extract_courses(page_texts)
    save_to_json(course_records, OUTPUT_JSON)


if __name__ == '__main__':
    main()