refactor: rework class crawling logic
Josh-Cena committed May 4, 2024
1 parent 4810f66 commit e6eafde
Showing 6 changed files with 348 additions and 684 deletions.
55 changes: 54 additions & 1 deletion ferry/crawler/classes/__init__.py
@@ -1 +1,54 @@
from .fetch import fetch_classes
from pathlib import Path
from typing import Any
from httpx import AsyncClient
from tqdm import tqdm

from .fetch import fetch_season_course_list, fetch_all_season_courses_details
from .parse import parse_courses


async def crawl_classes(
seasons: list[str],
data_dir: Path,
client: AsyncClient,
use_cache: bool = True,
) -> dict[str, list[dict[str, Any]]]:
# Concurrency with async at the season level overloads the CPU
# futures = [ fetch_class(season, data_dir=data_dir, client=client) for season in seasons ]
# classes = await tqdm_asyncio.gather(*futures, desc="Season Progress")

print(f"Fetching course info for seasons: {seasons}...")

classes: dict[str, list[dict[str, Any]]] = {}
for season in tqdm(seasons, desc="Season Progress", leave=False):
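        # fetch the season's full course list, plus the first-year-seminar subset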
season_courses = await fetch_season_course_list(
season, data_dir=data_dir, client=client, use_cache=use_cache
)
season_fysem_courses = await fetch_season_course_list(
season,
data_dir=data_dir,
client=client,
fysem=True,
use_cache=use_cache,
)

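        # fetch detailed JSON for every course in the season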
aggregate_season_json = await fetch_all_season_courses_details(
season,
season_courses,
data_dir=data_dir,
client=client,
use_cache=use_cache,
)

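        # parse the detailed JSON, tagging first-year seminars by CRN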
classes[season] = parse_courses(
season,
aggregate_season_json,
            {x["crn"] for x in season_fysem_courses},
data_dir=data_dir,
use_cache=use_cache,
)

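    # move the cursor up one line so the checkmark line overwrites the
    # progress message printed above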
print("\033[F", end="")
print(f"Fetching course info for seasons: {seasons}... ✔")

return classes
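
For context, a minimal sketch of how crawl_classes might be driven. The season codes, data directory, and client setup below are illustrative assumptions; ferry's real entry point configures these (including any CAS authentication on the client) elsewhere.

import asyncio
from pathlib import Path

from httpx import AsyncClient

from ferry.crawler.classes import crawl_classes


async def main() -> None:
    # hypothetical season codes and data dir, for illustration only
    async with AsyncClient(timeout=None) as client:
        classes = await crawl_classes(
            seasons=["202401", "202403"],
            data_dir=Path("data"),
            client=client,
            use_cache=True,
        )
    # one parsed-course list per season
    print({season: len(courses) for season, courses in classes.items()})


asyncio.run(main())
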
218 changes: 92 additions & 126 deletions ferry/crawler/classes/fetch.py
@@ -1,104 +1,21 @@
"""
Fetches the following information from the Yale Courses API:
(1) A list of all courses for each season
(/api_output/season_courses/)
(2) Detailed information for each course, for each season
(/api_output/course_json_cache/)
"""

from httpx import AsyncClient
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
from pathlib import Path
import ujson
from typing import Any

from .parse import extract_course_info
from .process import (
fetch_course_json,
fetch_season_courses_util,
)
from ferry.crawler.cache import load_cache_json, save_cache_json

# -----------------------------------------
# Retrieve courses from unofficial Yale API
# -----------------------------------------


async def fetch_classes(
seasons: list[str],
data_dir: Path,
client: AsyncClient,
use_cache: bool = True,
) -> dict[str, list[dict[str, Any]]]:
# Concurrency with async at the season level overloads the CPU
# futures = [ fetch_class(season, data_dir=data_dir, client=client) for season in seasons ]
# classes = await tqdm_asyncio.gather(*futures, desc="Season Progress")

print(f"Fetching course info for seasons: {seasons}...")

classes = {}
for season in tqdm(seasons, desc="Season Progress", leave=False):
classes[season] = await fetch_class(
season, data_dir=data_dir, client=client, use_cache=use_cache
)

print("\033[F", end="")
print(f"Fetching course info for seasons: {seasons}... ✔")

return classes


# -----------------------------------------
# Fetch Utility Functions
# -----------------------------------------


async def fetch_class(
season: str, data_dir: Path, client: AsyncClient, use_cache: bool = True
):
# fetch season classes
season_courses = await fetch_season_courses(
season, data_dir=data_dir, client=client, use_cache=use_cache
)
season_fysem_courses = await fetch_season_courses(
season,
data_dir=data_dir,
client=client,
fysem=True,
use_cache=use_cache,
)

# fetch detailed info for all classes in each season
aggregate_season_json = await fetch_aggregate_season_json(
season,
season_courses,
data_dir=data_dir,
client=client,
use_cache=use_cache,
)

# parse courses
parsed_courses = parse_courses(
season,
aggregate_season_json,
season_fysem_courses,
data_dir=data_dir,
use_cache=use_cache,
)

return parsed_courses
from ferry.crawler.cas_request import request


# fetch overview info for all classes in each season
async def fetch_season_courses(
async def fetch_season_course_list(
season: str,
data_dir: Path,
client: AsyncClient,
fysem: bool = False,
use_cache: bool = True,
):
) -> list[dict[str, Any]]:
if fysem:
criteria = [{"field": "fsem_attrs", "value": "Y"}]
f_suffix = "_fysem"
@@ -118,27 +35,108 @@ async def fetch_season_courses(
):
return cache_load

season_courses = await fetch_season_courses_util(
season=season, criteria=criteria, client=client
url = "https://courses.yale.edu/api/?page=fose&route=search"

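    # FOSE search payload: "srcdb" selects the term, "criteria" filters the
    # results (e.g. fsem_attrs == "Y" restricts to first-year seminars)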
payload = {"other": {"srcdb": season}, "criteria": criteria}

req = await request(
method="POST", client=client, url=url, data=ujson.dumps(payload)
)
req.encoding = "utf-8"

# Unsuccessful response
if req.status_code != 200:
raise FetchClassesError(f"Unsuccessful response: code {req.status_code}")
r_json = ujson.loads(req.text)

if "fatal" in r_json.keys():
raise FetchClassesError(f'Unsuccessful response: {r_json["fatal"]}')

if "results" not in r_json.keys():
raise FetchClassesError("Unsuccessful response: no results")

season_courses = r_json["results"]
save_cache_json(
data_dir / "season_courses" / f"{season}{f_suffix}.json", season_courses
)

return season_courses


class FetchClassesError(Exception):
    """
    Raised when a request to the Yale Courses API fails.
    """


async def fetch_course_details(
code: str, crn: str, srcdb: str, client: AsyncClient
) -> dict[str, Any]:
"""
Fetch information for a course from the API
Parameters
----------
code: string
the course code
crn: string
the course registration number
srcdb: string
season the course is in
Returns
-------
course_json: dict
JSON-formatted course information
"""

url = "https://courses.yale.edu/api/?page=fose&route=details"

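    # FOSE details payload: "group" and "key" identify the course by code and
    # CRN within the season database given by "srcdb"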
    payload = {
        "group": f"code:{code}",
        "key": f"crn:{crn}",
        "srcdb": srcdb,
        "matched": f"crn:{crn}",
    }

# retry up to 10 times
req = await request(
method="POST",
client=client,
url=url,
data=ujson.dumps(payload),
attempts=10,
)

req.encoding = "utf-8"

# Unsuccessful response
if req.status_code != 200:
raise FetchClassesError("Unsuccessful response: code {req.status_code}")
course_json = ujson.loads(req.text)

# exclude Yale's last-updated field (we use our own later on)
if "last_updated" in course_json:
del course_json["last_updated"]

if "fatal" in course_json.keys():
raise FetchClassesError(f"Unsuccessful response: {course_json['fatal']}")

return course_json


# fetch detailed info for all classes in each season
async def fetch_aggregate_season_json(
async def fetch_all_season_courses_details(
season: str,
season_courses: list[dict[str, Any]],
data_dir: Path,
client: AsyncClient,
use_cache: bool = True,
):
    # load from cache if it exists
    if (
use_cache
and (
@@ -152,7 +150,9 @@ async def fetch_aggregate_season_json(

# merge all the JSON results per season
futures = [
fetch_course_json(course["code"], course["crn"], course["srcdb"], client=client)
fetch_course_details(
course["code"], course["crn"], course["srcdb"], client=client
)
for course in season_courses
]
aggregate_season_json = await tqdm_asyncio.gather(
@@ -164,37 +164,3 @@
)

return aggregate_season_json


# combine regular and fysem courses in each season
def parse_courses(
season: str,
aggregate_season_json,
fysem_courses,
data_dir: Path,
use_cache: bool = True,
) -> list[dict[str, Any]]:
# load from cache if it exists
if (
use_cache
and (
cache_load := load_cache_json(
data_dir / "parsed_courses" / f"{season}.json"
)
)
is not None
):
return cache_load

# parse course JSON in season
parsed_course_info: list[dict[str, Any]] = []
# not worth parallelizing, already pretty quick
for x in tqdm(aggregate_season_json, leave=False, desc=f"Parsing season {season}"):
try:
parsed_course_info.append(extract_course_info(x, season, fysem_courses))
except Exception as e:
print(f"Error parsing course {x['code']} in season {season}: {e}")

save_cache_json(data_dir / "parsed_courses" / f"{season}.json", parsed_course_info)

return parsed_course_info
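
The fetch functions above go through the shared request helper from ferry.crawler.cas_request, with attempts=10 on the details route. That helper is outside this diff; the following is only a rough sketch of the retry pattern its call sites suggest, with an assumed linear backoff that may not match ferry's actual implementation.

import asyncio

from httpx import AsyncClient, Response


async def request(
    method: str,
    client: AsyncClient,
    url: str,
    data: str | None = None,
    attempts: int = 1,
) -> Response:
    # retry transient failures, sleeping a little longer after each one
    for attempt in range(attempts):
        try:
            return await client.request(method, url, content=data)
        except Exception:  # assumption: retry on any transport error
            if attempt == attempts - 1:
                raise
            await asyncio.sleep(attempt + 1)
    raise ValueError("attempts must be at least 1")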
