-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_pull.py
74 lines (64 loc) · 2.52 KB
/
01_pull.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -----------------------------------------------
# 1. Data Acquisition:
#
# Objective: Download all the public course
# catalog data in raw HTML format from a
# university website.
#
# Tools/Resources: Extract all the course
# catalog data from one of the following
# three universities:
# Harvard: https://courses.my.harvard.edu
# BU: https://www.bu.edu/academics/cas/courses
# NE: https://catalog.northeastern.edu/course-descriptions
# -----------------------------------------------
import json
import logging
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Configure the root logger: INFO and above, one "time - level - message" line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def download_page(url, filename):
    """Fetch *url* and save the response body to *filename* as UTF-8 text.

    Args:
        url: Address of the page to download.
        filename: Path of the file the page text is written to.

    Returns:
        The decoded page text on success, or ``None`` if either the HTTP
        request or the file write failed (the failure is logged).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # treat 4xx/5xx responses as failures
        html = response.text
    except requests.RequestException as e:
        logging.error(f"Failed to download {url}: {e}")
        return None

    # Writing can fail independently of the request (bad path, missing
    # directory, full disk). Report it like any other failure instead of
    # letting the OSError abort the caller's whole download loop.
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
    except OSError as e:
        logging.error(f"Failed to save {url} to {filename}: {e}")
        return None

    logging.info(f"Downloaded: {filename}")
    return html
def download_northeastern():
    """Download every department page linked from the Northeastern course index.

    Fetches the course-descriptions index page, collects each ``<a>`` element
    with CSS class ``levelthree``, and saves the linked page into
    ``raw_html/northeastern`` via ``download_page``. The file name is the link
    text with spaces replaced by underscores. After each successful download
    the function sleeps for one second.

    Returns:
        None. A failure to fetch the index page is logged rather than raised;
        a failure on an individual department page is logged and skipped.
    """
    base_url = "https://catalog.northeastern.edu"
    catalog_url = f"{base_url}/course-descriptions/"
    output_dir = "raw_html/northeastern"
    # Same browser-like User-Agent that download_page sends, so the index
    # request is treated the same way as the department requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    os.makedirs(output_dir, exist_ok=True)

    try:
        # A timeout keeps an unresponsive server from hanging the script forever.
        response = requests.get(catalog_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"An error occurred while downloading Northeastern courses: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a', class_='levelthree'):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be downloaded; skip it.
            logging.warning(f"Skipping link without href: {link.text.strip()}")
            continue

        # Resolve against the index page URL so root-relative, relative and
        # absolute hrefs all produce a valid address.
        department_url = urljoin(catalog_url, href)

        # Path separators in the link text would otherwise point the file
        # outside output_dir (or at a directory that does not exist).
        department_name = (
            link.text.strip()
            .replace(' ', '_')
            .replace('/', '-')
            .replace('\\', '-')
        )
        filename = f"{output_dir}/{department_name}.html"

        content = download_page(department_url, filename)
        if content:
            time.sleep(1)  # Be polite to the server
        else:
            logging.warning(f"Failed to download {department_name}")

    logging.info("Northeastern download complete!")
def main():
    """Script entry point: log a start message, run the Northeastern download, log completion."""
    start_msg = "Starting Northeastern course catalog download..."
    done_msg = "Download complete!"

    logging.info(start_msg)
    download_northeastern()
    logging.info(done_msg)


if __name__ == "__main__":
    # Run the download only when executed as a script, not when imported.
    main()