Skip to content

Commit

Permalink
refactor: move invariant checking to transformation instead of staging
Browse files Browse the repository at this point in the history
  • Loading branch information
Josh-Cena committed Oct 19, 2024
1 parent da25013 commit 93efaa9
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 104 deletions.
104 changes: 0 additions & 104 deletions ferry/database/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,95 +9,11 @@
queries_dir = Path(__file__).parent / "queries"


def listing_invariants(session: sqlalchemy.orm.session.Session):
"""
Check listing invariants.
Check invariant:
listing.season_code == course.season_code if listing.course_id == course.course_id.
"""

for (
listing_id,
course_id,
listing_season_code,
course_season_code,
) in session.query(
database.Listing.listing_id,
database.Listing.course_id,
database.Listing.season_code,
database.Course.season_code,
).filter(
database.Listing.course_id == database.Course.course_id
):
if listing_season_code != course_season_code:
raise database.InvariantError(
f"listing {listing_id} has mismatched season_code with course {course_id}"
)


def question_invariants(session: sqlalchemy.orm.session.Session):
"""
Check question invariants.
Check invariant:
evaluation_questions.options is null iff evaluation_questions.is_narrative = True
"""

for question in session.query(database.EvaluationQuestion):
narrative = question.is_narrative
options = bool(question.options)
if narrative and options:
raise database.InvariantError(f"narrative question {question} has options")
if not narrative and not options:
raise database.InvariantError(f"ratings question {question} lacks options")


def course_invariants(session: sqlalchemy.orm.session.Session):
"""
Check course invariants.
Check invariant:
every course should have at least one listing.
"""

courses_no_listings = (
session.query(database.Listing)
.filter(
~database.Listing.course_id.in_(session.query(database.Course.course_id))
)
.all()
)

if courses_no_listings:

no_listing_courses = [str(course) for course in courses_no_listings]

raise database.InvariantError(
f"the following courses have no listings: {', '.join(no_listing_courses)}"
)


def deploy(db: database.Database):
"""
Deploy staged tables to main ones and regenerate database.
- Checks the database invariants on staged tables.
- If invariants pass, promotes staged tables to actual ones,
updates indexes, and recomputes search views
- If any invariant fails, exits with no changes to tables.
"""

# ------------------------------------
# Specify invariant checking functions
# ------------------------------------

all_items = [
listing_invariants,
course_invariants,
question_invariants,
]

# --------------------------------------
# Check if all staged tables are present
# --------------------------------------
Expand All @@ -117,26 +33,6 @@ def deploy(db: database.Database):
"Not all staged tables are present. Run stage.py again?"
)

# ------------------------------
# Check invariants on new tables
# ------------------------------

print("\nChecking table invariants...")

items = all_items

for fn in items:
if fn.__doc__:
logging.debug(f"{fn.__doc__.strip()}")
else:
logging.debug(f"Running: {fn.__name__}")

with database.session_scope(db.Session) as db_session:
fn(db_session)

print("\033[F", end="")
print("Checking table invariants... ✔")

# -------------------------------------
# Upgrade staged tables to primary ones
# -------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions ferry/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from .import_courses import import_courses
from .import_evaluations import import_evaluations
from .invariants import check_invariants


def write_csvs(tables: dict[str, pd.DataFrame], data_dir: Path):
Expand Down Expand Up @@ -90,6 +91,8 @@ def transform(data_dir: Path) -> dict[str, pd.DataFrame]:

all_tables = {"seasons": seasons_table, **course_tables, **eval_tables}

check_invariants(all_tables)

# Remove intermediate columns to match DB schema
for table_name, db_table in [
("seasons", database.Season.__table__),
Expand Down
51 changes: 51 additions & 0 deletions ferry/transform/invariants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pandas as pd


class InvariantError(Exception):
pass


def check_invariants(tables: dict[str, pd.DataFrame]):
"""
Check invariant:
- listing.season_code == course.season_code if listing.course_id == course.course_id.
- evaluation_questions.options is null iff evaluation_questions.is_narrative = True
- every course should have at least one listing.
"""

listing_with_course = tables["listings"][["course_id", "season_code"]].merge(
tables["courses"][["course_id", "season_code"]],
on="course_id",
suffixes=("_listing", "_course"),
)
diff_season_code = listing_with_course[
listing_with_course["season_code_listing"]
!= listing_with_course["season_code_course"]
]
if not diff_season_code.empty:
raise InvariantError(
f"listing.season_code != course.season_code for {diff_season_code}"
)

courses_no_listing = tables["courses"]["course_id"].isin(
tables["listings"]["course_id"]
)
if not courses_no_listing.empty:
raise InvariantError(
f"courses with no listing {tables['courses']['course_id'][courses_no_listing]}"
)

narrative_with_options = tables["evaluation_questions"][
(tables["evaluation_questions"]["is_narrative"] == True)
& (tables["evaluation_questions"]["options"].notnull())
]
non_narrative_without_options = tables["evaluation_questions"][
(tables["evaluation_questions"]["is_narrative"] == False)
& (tables["evaluation_questions"]["options"].isnull())
]
if not narrative_with_options.empty:
raise InvariantError(f"narrative with options {narrative_with_options}")
if not non_narrative_without_options.empty:
raise InvariantError(
f"non-narrative without options {non_narrative_without_options}"
)

0 comments on commit 93efaa9

Please sign in to comment.