From d586f64aaf9f3774011fd9a47e92f99138618eeb Mon Sep 17 00:00:00 2001 From: Joshua Chen Date: Sat, 19 Oct 2024 00:05:18 -0400 Subject: [PATCH] refactor: move invariant checking to transformation instead of staging --- ferry/database/__init__.py | 3 +- ferry/database/deploy.py | 104 ---------------------------------- ferry/transform/__init__.py | 3 + ferry/transform/invariants.py | 51 +++++++++++++++++ 4 files changed, 55 insertions(+), 106 deletions(-) create mode 100644 ferry/transform/invariants.py diff --git a/ferry/database/__init__.py b/ferry/database/__init__.py index 37eec554b..30ba88b33 100644 --- a/ferry/database/__init__.py +++ b/ferry/database/__init__.py @@ -18,5 +18,4 @@ course_flags, course_professors, ) -from .stage import stage -from .deploy import deploy +from .sync_db import sync_db diff --git a/ferry/database/deploy.py b/ferry/database/deploy.py index 420303bd2..93eb67d8d 100644 --- a/ferry/database/deploy.py +++ b/ferry/database/deploy.py @@ -9,95 +9,11 @@ queries_dir = Path(__file__).parent / "queries" -def listing_invariants(session: sqlalchemy.orm.session.Session): - """ - Check listing invariants. - - Check invariant: - listing.season_code == course.season_code if listing.course_id == course.course_id. - """ - - for ( - listing_id, - course_id, - listing_season_code, - course_season_code, - ) in session.query( - database.Listing.listing_id, - database.Listing.course_id, - database.Listing.season_code, - database.Course.season_code, - ).filter( - database.Listing.course_id == database.Course.course_id - ): - if listing_season_code != course_season_code: - raise database.InvariantError( - f"listing {listing_id} has mismatched season_code with course {course_id}" - ) - - -def question_invariants(session: sqlalchemy.orm.session.Session): - """ - Check question invariants. - - Check invariant: - evaluation_questions.options is null iff evaluation_questions.is_narrative = True - """ - - for question in session.query(database.EvaluationQuestion): - narrative = question.is_narrative - options = bool(question.options) - if narrative and options: - raise database.InvariantError(f"narrative question {question} has options") - if not narrative and not options: - raise database.InvariantError(f"ratings question {question} lacks options") - - -def course_invariants(session: sqlalchemy.orm.session.Session): - """ - Check course invariants. - - Check invariant: - every course should have at least one listing. - """ - - courses_no_listings = ( - session.query(database.Listing) - .filter( - ~database.Listing.course_id.in_(session.query(database.Course.course_id)) - ) - .all() - ) - - if courses_no_listings: - - no_listing_courses = [str(course) for course in courses_no_listings] - - raise database.InvariantError( - f"the following courses have no listings: {', '.join(no_listing_courses)}" - ) - - def deploy(db: database.Database): """ Deploy staged tables to main ones and regenerate database. - - - Checks the database invariants on staged tables. - - If invariants pass, promotes staged tables to actual ones, - updates indexes, and recomputes search views - - If any invariant fails, exits with no changes to tables. """ - # ------------------------------------ - # Specify invariant checking functions - # ------------------------------------ - - all_items = [ - listing_invariants, - course_invariants, - question_invariants, - ] - # -------------------------------------- # Check if all staged tables are present # -------------------------------------- @@ -117,26 +33,6 @@ def deploy(db: database.Database): "Not all staged tables are present. Run stage.py again?" ) - # ------------------------------ - # Check invariants on new tables - # ------------------------------ - - print("\nChecking table invariants...") - - items = all_items - - for fn in items: - if fn.__doc__: - logging.debug(f"{fn.__doc__.strip()}") - else: - logging.debug(f"Running: {fn.__name__}") - - with database.session_scope(db.Session) as db_session: - fn(db_session) - - print("\033[F", end="") - print("Checking table invariants... ✔") - # ------------------------------------- # Upgrade staged tables to primary ones # ------------------------------------- diff --git a/ferry/transform/__init__.py b/ferry/transform/__init__.py index e0d5c7baf..7210b0fba 100644 --- a/ferry/transform/__init__.py +++ b/ferry/transform/__init__.py @@ -14,6 +14,7 @@ ) from .import_courses import import_courses from .import_evaluations import import_evaluations +from .invariants import check_invariants def write_csvs(tables: dict[str, pd.DataFrame], data_dir: Path): @@ -90,6 +91,8 @@ def transform(data_dir: Path) -> dict[str, pd.DataFrame]: all_tables = {"seasons": seasons_table, **course_tables, **eval_tables} + check_invariants(all_tables) + # Remove intermediate columns to match DB schema for table_name, db_table in [ ("seasons", database.Season.__table__), diff --git a/ferry/transform/invariants.py b/ferry/transform/invariants.py new file mode 100644 index 000000000..9bdf21ade --- /dev/null +++ b/ferry/transform/invariants.py @@ -0,0 +1,51 @@ +import pandas as pd + + +class InvariantError(Exception): + pass + + +def check_invariants(tables: dict[str, pd.DataFrame]): + """ + Check invariant: + - listing.season_code == course.season_code if listing.course_id == course.course_id. + - evaluation_questions.options is null iff evaluation_questions.is_narrative = True + - every course should have at least one listing. + """ + + listing_with_course = tables["listings"][["course_id", "season_code"]].merge( + tables["courses"][["course_id", "season_code"]], + on="course_id", + suffixes=("_listing", "_course"), + ) + diff_season_code = listing_with_course[ + listing_with_course["season_code_listing"] + != listing_with_course["season_code_course"] + ] + if not diff_season_code.empty: + raise InvariantError( + f"listing.season_code != course.season_code for {diff_season_code}" + ) + + courses_no_listing = tables["courses"]["course_id"].isin( + tables["listings"]["course_id"] + ) + if not courses_no_listing.empty: + raise InvariantError( + f"courses with no listing {tables['courses']['course_id'][courses_no_listing]}" + ) + + narrative_with_options = tables["evaluation_questions"][ + (tables["evaluation_questions"]["is_narrative"] == True) + & (tables["evaluation_questions"]["options"].notnull()) + ] + non_narrative_without_options = tables["evaluation_questions"][ + (tables["evaluation_questions"]["is_narrative"] == False) + & (tables["evaluation_questions"]["options"].isnull()) + ] + if not narrative_with_options.empty: + raise InvariantError(f"narrative with options {narrative_with_options}") + if not non_narrative_without_options.empty: + raise InvariantError( + f"non-narrative without options {non_narrative_without_options}" + )