-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml_course_scraper.py
54 lines (44 loc) · 2.13 KB
/
ml_course_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Scrape the links from the course schedule given in this website: https://mahdi-roozbahani.github.io/CS46417641-summer2024/docs/course-info/course-schedule/
import PyPDF2
import requests
import urllib.request
from bs4 import BeautifulSoup
import tqdm
import os
from urllib.parse import urljoin, urlsplit

# Course schedule page whose <a> tags are scraped.
url = "https://mahdi-roozbahani.github.io/CS46417641-summer2024/docs/course-info/course-schedule/"
# Fallback base tried when a PDF cannot be fetched relative to the schedule page.
other_url = "https://mahdi-roozbahani.github.io/CS46417641-summer2024/other/"

# Seconds to wait on any single HTTP request, so a stalled server cannot hang the run.
REQUEST_TIMEOUT = 30


def absolute_url(base, href):
    """Resolve *href* against *base* using standard URL-joining rules.

    Absolute links (``http://``, ``https://``, ``mailto:`` ...) are returned
    unchanged; relative and root-relative links are resolved against *base*.
    """
    return urljoin(base, href)


def is_pdf_link(href):
    """Return True when the path component of *href* ends in ``.pdf``.

    The check ignores letter case and any query string or fragment.
    """
    return urlsplit(href).path.lower().endswith(".pdf")


def pdf_filename(href):
    """Return the last path segment of *href*, without query or fragment."""
    return urlsplit(href).path.rsplit("/", 1)[-1]


def candidate_urls(href):
    """Return the URLs to try for *href*, in order and without duplicates.

    The schedule page is tried first, then ``other_url``. An absolute href
    resolves to the same URL against both bases and is therefore tried once.
    """
    candidates = []
    for base in (url, other_url):
        candidate = absolute_url(base, href)
        if candidate not in candidates:
            candidates.append(candidate)
    return candidates


def download_pdf(href):
    """Download the PDF referenced by *href*.

    Returns the response body as bytes, or None (after printing a warning)
    when every candidate URL fails.
    """
    last_error = None
    for candidate in candidate_urls(href):
        try:
            pdf_response = requests.get(candidate, timeout=REQUEST_TIMEOUT)
            pdf_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Covers HTTP error statuses as well as connection errors and
            # timeouts; remember the error and move on to the next candidate.
            last_error = e
        else:
            return pdf_response.content
    print(f"Warning: could not download {href}. Error: {last_error}")
    return None


def save_pdf(pdf_name, content):
    """Write *content* to ``Notes/<pdf_name>`` and warn if it is not a readable PDF.

    The file is kept on disk even when PyPDF2 cannot parse it; only a warning
    is printed.
    """
    pdf_path = os.path.join("Notes", pdf_name)
    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(content)
    try:
        with open(pdf_path, "rb") as pdf_file_read:
            PyPDF2.PdfReader(pdf_file_read)
    except PyPDF2.errors.PdfReadError as e:
        print(f"Warning: {pdf_name} is not a valid PDF. Error: {e}")


def main():
    """Scrape the schedule page, saving PDFs to Notes/ and other links to Misc/misc.txt.

    Raises:
        requests.exceptions.RequestException: if the schedule page itself
            cannot be fetched.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    links = soup.find_all("a")

    os.makedirs("Misc", exist_ok=True)
    os.makedirs("Notes", exist_ok=True)
    with open("Misc/misc.txt", "w", encoding="utf-8") as f:
        for link in tqdm.tqdm(links):
            link_url = link.get("href")
            if not link_url:
                # <a> tags without an href have nothing to record or download.
                continue
            if is_pdf_link(link_url):
                content = download_pdf(link_url)
                # Nothing is written for a failed download, so Notes/ never
                # holds an HTML error page saved under a .pdf name.
                if content is not None:
                    save_pdf(pdf_filename(link_url), content)
            else:
                f.write(absolute_url(url, link_url) + "\n")


# Run the scrape only when executed as a script, so importing this module
# performs no network or file I/O.
if __name__ == "__main__":
    main()