feat: more efficient method for converting calendar_dates to calendar

datasciencecampus · Jan 23, 2024 · 09b680d · 09b680d
1 parent 2a5c028
commit 09b680d
Showing 1 changed file with 34 additions and 39 deletions.
diff --git a/src/transport_performance/gtfs/calendar.py b/src/transport_performance/gtfs/calendar.py
@@ -1,7 +1,7 @@
 """Cleaners & utilities specific to the calendar table."""
 import calendar
+from copy import deepcopy
 
-import numpy as np
 import pandas as pd
 
 from transport_performance.utils.defence import (
@@ -42,43 +42,38 @@ def create_calendar_from_dates(calendar_dates: pd.DataFrame) -> pd.DataFrame:
     exp_cols = ["service_id", "date", "exception_type"]
     for nm in exp_cols:
         _check_column_in_df(calendar_dates, nm)
-
-    # create an empty calendar df
     days = [day.lower() for day in calendar.day_name]
-    new_calendar = pd.DataFrame()
-    new_calendar["service_id"] = calendar_dates["service_id"].unique()
-    day_df = pd.DataFrame(
-        {i: np.zeros(len(new_calendar), dtype="int8") for i in days},
-        index=list(range(0, len(new_calendar))),
-    )
-    new_calendar = pd.concat([new_calendar, day_df], axis=1)
-    # create the start and end_date columns
-    new_calendar["start_date"] = ""
-    new_calendar["end_date"] = ""
-    new_calendar.set_index(new_calendar["service_id"], inplace=True)
-    # update this empty calendar with values from calendar_dates
-    for i, r in calendar_dates.iterrows():
-        # only update if calendar_dates exception_type is 1 (adding a service)
-        # Type 2 removes a service and will override the calendar
-        if r["exception_type"] == 1:
-            date_affected = r["date"]
-            day_affected = pd.to_datetime(date_affected).weekday()
-            # update weekday column entry to show the service runs on that day
-            new_calendar.loc[
-                r["service_id"], new_calendar.columns[day_affected + 1]
-            ] = 1
-            # update the start & end date columns
-            s_date = new_calendar.loc[r["service_id"], "start_date"]
-            if s_date == "":
-                new_calendar.loc[r["service_id"], "start_date"] = date_affected
-            else:
-                s_date = min(s_date, date_affected)
-                new_calendar.loc[r["service_id"], "start_date"] = s_date
-            e_date = new_calendar.loc[r["service_id"], "end_date"]
-            if e_date == "":
-                new_calendar.loc[r["service_id"], "end_date"] = date_affected
-            else:
-                e_date = max(e_date, date_affected)
-                new_calendar.loc[r["service_id"], "end_date"] = e_date
+    # clean calendar_dates
+    cal1 = calendar_dates[calendar_dates.exception_type == 1].copy()
+    cal1.drop("exception_type", axis=1, inplace=True)
+    # get list of dates and convert to days of the week
+    grouped = cal1.groupby("service_id").agg({"date": lambda x: list(set(x))})
 
-    return new_calendar.reset_index(drop=True)
+    def _get_day_name(date: str) -> str:
+        """Small helper function to get the named day of the week."""
+        day_index = pd.to_datetime(date).weekday()
+        return days[day_index]
+
+    grouped["days"] = grouped["date"].apply(
+        lambda x: list(set([_get_day_name(d) for d in x]))
+    )
+    # start and end date
+    grouped["start_date"] = grouped["date"].apply(lambda x: min(x))
+    grouped["end_date"] = grouped["date"].apply(lambda x: max(x))
+    # clean up unused data
+    grouped.drop("date", axis=1, inplace=True)
+    grouped.reset_index(inplace=True)
+    # add a column for each day
+    for day in days:
+        grouped[day] = (
+            grouped["days"]
+            .apply(lambda x: 1 if day in x else 0)
+            .astype("int8")
+        )
+    grouped.drop("days", axis=1)
+    # re-order columns
+    order = deepcopy(days)
+    order.insert(0, "service_id")
+    order.append("start_date")
+    order.append("end_date")
+    return grouped.loc[:, order]