Skip to content

Commit

Permalink
feat: more efficient method for converting calendar to calendar_dates
Browse files Browse the repository at this point in the history
  • Loading branch information
CBROWN-ONS committed Jan 23, 2024
1 parent 2a5c028 commit 09b680d
Showing 1 changed file with 34 additions and 39 deletions.
73 changes: 34 additions & 39 deletions src/transport_performance/gtfs/calendar.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Cleaners & utilities specific to the calendar table."""
import calendar
from copy import deepcopy

import numpy as np
import pandas as pd

from transport_performance.utils.defence import (
Expand Down Expand Up @@ -42,43 +42,38 @@ def create_calendar_from_dates(calendar_dates: pd.DataFrame) -> pd.DataFrame:
exp_cols = ["service_id", "date", "exception_type"]
for nm in exp_cols:
_check_column_in_df(calendar_dates, nm)

# create an empty calendar df
days = [day.lower() for day in calendar.day_name]
new_calendar = pd.DataFrame()
new_calendar["service_id"] = calendar_dates["service_id"].unique()
day_df = pd.DataFrame(
{i: np.zeros(len(new_calendar), dtype="int8") for i in days},
index=list(range(0, len(new_calendar))),
)
new_calendar = pd.concat([new_calendar, day_df], axis=1)
# create the start and end_date columns
new_calendar["start_date"] = ""
new_calendar["end_date"] = ""
new_calendar.set_index(new_calendar["service_id"], inplace=True)
# update this empty calendar with values from calendar_dates
for i, r in calendar_dates.iterrows():
# only update if calendar_dates exception_type is 1 (adding a service)
# Type 2 removes a service and will override the calendar
if r["exception_type"] == 1:
date_affected = r["date"]
day_affected = pd.to_datetime(date_affected).weekday()
# update weekday column entry to show the service runs on that day
new_calendar.loc[
r["service_id"], new_calendar.columns[day_affected + 1]
] = 1
# update the start & end date columns
s_date = new_calendar.loc[r["service_id"], "start_date"]
if s_date == "":
new_calendar.loc[r["service_id"], "start_date"] = date_affected
else:
s_date = min(s_date, date_affected)
new_calendar.loc[r["service_id"], "start_date"] = s_date
e_date = new_calendar.loc[r["service_id"], "end_date"]
if e_date == "":
new_calendar.loc[r["service_id"], "end_date"] = date_affected
else:
e_date = max(e_date, date_affected)
new_calendar.loc[r["service_id"], "end_date"] = e_date
# clean calendar_dates
cal1 = calendar_dates[calendar_dates.exception_type == 1].copy()
cal1.drop("exception_type", axis=1, inplace=True)
# get list of dates and convert to days of the week
grouped = cal1.groupby("service_id").agg({"date": lambda x: list(set(x))})

return new_calendar.reset_index(drop=True)
def _get_day_name(date: str) -> str:
    """Map a date onto its entry in the enclosing ``days`` list.

    The date is parsed with ``pd.to_datetime`` and its ``weekday()`` number
    (Monday is 0) is used as the position to read from ``days``.
    """
    return days[pd.to_datetime(date).weekday()]

grouped["days"] = grouped["date"].apply(
lambda x: list(set([_get_day_name(d) for d in x]))
)
# start and end date
grouped["start_date"] = grouped["date"].apply(lambda x: min(x))
grouped["end_date"] = grouped["date"].apply(lambda x: max(x))
# clean up unused data
grouped.drop("date", axis=1, inplace=True)
grouped.reset_index(inplace=True)
# add a column for each day
for day in days:
grouped[day] = (
grouped["days"]
.apply(lambda x: 1 if day in x else 0)
.astype("int8")
)
grouped.drop("days", axis=1)
# re-order index
order = deepcopy(days)
order.insert(0, "service_id")
order.append("start_date")
order.append("end_date")
return grouped.loc[:, order]

0 comments on commit 09b680d

Please sign in to comment.